blob: 71ed843d101f8863c2247b2a68569f7076d05843 [file] [log] [blame]
Bram Moolenaare19defe2005-03-21 08:23:33 +00001/* vi:set ts=8 sts=4 sw=4:
2 *
3 * VIM - Vi IMproved by Bram Moolenaar
4 *
5 * Do ":help uganda" in Vim to read copying and usage conditions.
6 * Do ":help credits" in Vim to see a list of people who contributed.
7 * See README.txt for an overview of the Vim source code.
8 */
9
10/*
11 * spell.c: code for spell checking
Bram Moolenaarfc735152005-03-22 22:54:12 +000012 *
Bram Moolenaar51485f02005-06-04 21:55:20 +000013 * The spell checking mechanism uses a tree (aka trie). Each node in the tree
14 * has a list of bytes that can appear (siblings). For each byte there is a
15 * pointer to the node with the byte that follows in the word (child).
Bram Moolenaar9f30f502005-06-14 22:01:04 +000016 *
17 * A NUL byte is used where the word may end. The bytes are sorted, so that
18 * binary searching can be used and the NUL bytes are at the start. The
19 * number of possible bytes is stored before the list of bytes.
20 *
21 * The tree uses two arrays: "byts" stores the characters, "idxs" stores
22 * either the next index or flags. The tree starts at index 0. For example,
23 * to lookup "vi" this sequence is followed:
24 * i = 0
25 * len = byts[i]
26 * n = where "v" appears in byts[i + 1] to byts[i + len]
27 * i = idxs[n]
28 * len = byts[i]
29 * n = where "i" appears in byts[i + 1] to byts[i + len]
30 * i = idxs[n]
31 * len = byts[i]
32 * find that byts[i + 1] is 0, idxs[i + 1] has flags for "vi".
Bram Moolenaar51485f02005-06-04 21:55:20 +000033 *
Bram Moolenaar1d73c882005-06-19 22:48:47 +000034 * There are two word trees: one with case-folded words and one with words in
Bram Moolenaar51485f02005-06-04 21:55:20 +000035 * original case. The second one is only used for keep-case words and is
36 * usually small.
37 *
Bram Moolenaar1d73c882005-06-19 22:48:47 +000038 * There is one additional tree for when prefixes are not applied when
39 * generating the .spl file. This tree stores all the possible prefixes, as
40 * if they were words. At each word (prefix) end the prefix nr is stored, the
41 * following word must support this prefix nr. And the condition nr is
42 * stored, used to lookup the condition that the word must match with.
43 *
Bram Moolenaar51485f02005-06-04 21:55:20 +000044 * Thanks to Olaf Seibert for providing an example implementation of this tree
45 * and the compression mechanism.
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +000046 *
47 * Matching involves checking the caps type: Onecap ALLCAP KeepCap.
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +000048 *
Bram Moolenaar402d2fe2005-04-15 21:00:38 +000049 * Why doesn't Vim use aspell/ispell/myspell/etc.?
50 * See ":help develop-spell".
51 */
52
/*
 * Use this to adjust the score after finding suggestions, based on the
 * suggested word sounding like the bad word.  This is much faster than doing
 * it for every possible suggestion.
 * Disadvantage: When "the" is typed as "hte" it sounds different and goes
 * down in the list.
 * Used when 'spellsuggest' is set to "best".
 *
 * The result is a weighted average: three parts word score, one part sound
 * score.  Both arguments are parenthesized in the expansion so that an
 * expression argument such as "a + b" is weighted as a whole.
 */
#define RESCORE(word_score, sound_score) \
    ((3 * (word_score) + (sound_score)) / 4)
62
63/*
64 * The double scoring mechanism is based on the principle that there are two
65 * kinds of spelling mistakes:
66 * 1. You know how to spell the word, but mistype something. This results in
67 * a small editing distance (character swapped/omitted/inserted) and
68 * possibly a word that sounds completely different.
69 * 2. You don't know how to spell the word and type something that sounds
70 * right. The edit distance can be big but the word is similar after
71 * sound-folding.
72 * Since scores for these two mistakes will be very different we use a list
73 * for each.
74 * The sound-folding is slow, only do double scoring when 'spellsuggest' is
75 * "double".
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +000076 */
77
78/*
Bram Moolenaar1d73c882005-06-19 22:48:47 +000079 * Vim spell file format: <HEADER>
80 * <SUGGEST>
81 * <LWORDTREE>
82 * <KWORDTREE>
83 * <PREFIXTREE>
Bram Moolenaar51485f02005-06-04 21:55:20 +000084 *
Bram Moolenaar1d73c882005-06-19 22:48:47 +000085 * <HEADER>: <fileID>
86 * <regioncnt> <regionname> ...
87 * <charflagslen> <charflags>
88 * <fcharslen> <fchars>
Bram Moolenaarcf6bf392005-06-27 22:27:46 +000089 * <midwordlen> <midword>
Bram Moolenaar1d73c882005-06-19 22:48:47 +000090 * <prefcondcnt> <prefcond> ...
Bram Moolenaar51485f02005-06-04 21:55:20 +000091 *
Bram Moolenaarcf6bf392005-06-27 22:27:46 +000092 * <fileID> 10 bytes "VIMspell08"
Bram Moolenaar51485f02005-06-04 21:55:20 +000093 * <regioncnt> 1 byte number of regions following (8 supported)
Bram Moolenaarcfc6c432005-06-06 21:50:35 +000094 * <regionname> 2 bytes Region name: ca, au, etc. Lower case.
Bram Moolenaar51485f02005-06-04 21:55:20 +000095 * First <regionname> is region 1.
96 *
97 * <charflagslen> 1 byte Number of bytes in <charflags> (should be 128).
98 * <charflags> N bytes List of flags (first one is for character 128):
Bram Moolenaar9f30f502005-06-14 22:01:04 +000099 * 0x01 word character CF_WORD
100 * 0x02 upper-case character CF_UPPER
Bram Moolenaar51485f02005-06-04 21:55:20 +0000101 * <fcharslen> 2 bytes Number of bytes in <fchars>.
102 * <fchars> N bytes Folded characters, first one is for character 128.
103 *
Bram Moolenaarcf6bf392005-06-27 22:27:46 +0000104 * <midwordlen> 2 bytes Number of bytes in <midword>.
105 * <midword> N bytes Characters that are word characters only when used
106 * in the middle of a word.
107 *
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000108 * <prefcondcnt> 2 bytes Number of <prefcond> items following.
109 *
110 * <prefcond> : <condlen> <condstr>
111 *
112 * <condlen> 1 byte Length of <condstr>.
113 *
114 * <condstr> N bytes Condition for the prefix.
115 *
Bram Moolenaar51485f02005-06-04 21:55:20 +0000116 *
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000117 * <SUGGEST> : <repcount> <rep> ...
118 * <salflags> <salcount> <sal> ...
119 * <maplen> <mapstr>
Bram Moolenaar51485f02005-06-04 21:55:20 +0000120 *
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000121 * <repcount> 2 bytes number of <rep> items, MSB first.
122 *
123 * <rep> : <repfromlen> <repfrom> <reptolen> <repto>
124 *
125 * <repfromlen> 1 byte length of <repfrom>
126 *
127 * <repfrom> N bytes "from" part of replacement
128 *
129 * <reptolen> 1 byte length of <repto>
130 *
131 * <repto> N bytes "to" part of replacement
132 *
133 * <salflags> 1 byte flags for soundsalike conversion:
134 * SAL_F0LLOWUP
135 * SAL_COLLAPSE
136 * SAL_REM_ACCENTS
137 *
138 * <sal> : <salfromlen> <salfrom> <saltolen> <salto>
139 *
140 * <salfromlen> 1 byte length of <salfrom>
141 *
142 * <salfrom> N bytes "from" part of soundsalike
143 *
144 * <saltolen> 1 byte length of <salto>
145 *
146 * <salto> N bytes "to" part of soundsalike
147 *
148 * <maplen> 2 bytes length of <mapstr>, MSB first
149 *
150 * <mapstr> N bytes String with sequences of similar characters,
151 * separated by slashes.
Bram Moolenaar51485f02005-06-04 21:55:20 +0000152 *
153 *
154 * <LWORDTREE>: <wordtree>
155 *
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000156 * <KWORDTREE>: <wordtree>
157 *
158 * <PREFIXTREE>: <wordtree>
159 *
160 *
Bram Moolenaar51485f02005-06-04 21:55:20 +0000161 * <wordtree>: <nodecount> <nodedata> ...
162 *
163 * <nodecount> 4 bytes Number of nodes following. MSB first.
164 *
165 * <nodedata>: <siblingcount> <sibling> ...
166 *
167 * <siblingcount> 1 byte Number of siblings in this node. The siblings
168 * follow in sorted order.
169 *
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000170 * <sibling>: <byte> [ <nodeidx> <xbyte>
171 * | <flags> [<region>] [<prefixID>]
172 * | <prefixID> <prefcondnr> ]
Bram Moolenaar51485f02005-06-04 21:55:20 +0000173 *
174 * <byte> 1 byte Byte value of the sibling. Special cases:
175 * BY_NOFLAGS: End of word without flags and for all
176 * regions.
Bram Moolenaarcf6bf392005-06-27 22:27:46 +0000177 * For PREFIXTREE <prefixID> and
178 * <prefcondnr> follow.
179 * BY_FLAGS: End of word, <flags> follow.
180 * For PREFIXTREE <prefixID> and
181 * <prefcondnr> follow for rare prefix.
Bram Moolenaar51485f02005-06-04 21:55:20 +0000182 * BY_INDEX: Child of sibling is shared, <nodeidx>
183 * and <xbyte> follow.
184 *
185 * <nodeidx> 3 bytes Index of child for this sibling, MSB first.
186 *
187 * <xbyte> 1 byte byte value of the sibling.
188 *
189 * <flags> 1 byte bitmask of:
190 * WF_ALLCAP word must have only capitals
191 * WF_ONECAP first char of word must be capital
192 * WF_RARE rare word
193 * WF_REGION <region> follows
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000194 * WF_PFX <prefixID> follows
Bram Moolenaar51485f02005-06-04 21:55:20 +0000195 *
196 * <region> 1 byte Bitmask for regions in which word is valid. When
197 * omitted it's valid in all regions.
198 * Lowest bit is for region 1.
199 *
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000200 * <prefixID> 1 byte ID of prefix that can be used with this word. For
201 * PREFIXTREE used for the required prefix ID.
202 *
203 * <prefcondnr> 2 bytes Prefix condition number, index in <prefcond> list
204 * from HEADER.
Bram Moolenaar51485f02005-06-04 21:55:20 +0000205 *
Bram Moolenaar51485f02005-06-04 21:55:20 +0000206 * All text characters are in 'encoding', but stored as single bytes.
Bram Moolenaar51485f02005-06-04 21:55:20 +0000207 */
208
Bram Moolenaare19defe2005-03-21 08:23:33 +0000209#if defined(MSDOS) || defined(WIN16) || defined(WIN32) || defined(_WIN64)
210# include <io.h> /* for lseek(), must be before vim.h */
211#endif
212
213#include "vim.h"
214
215#if defined(FEAT_SYN_HL) || defined(PROTO)
216
217#ifdef HAVE_FCNTL_H
218# include <fcntl.h>
219#endif
220
#define MAXWLEN 250             /* Assume max. word len is this many bytes.
                                   Some places assume a word length fits in a
                                   byte, thus it can't be above 255. */
Bram Moolenaarfc735152005-03-22 22:54:12 +0000224
/* The type used for indexes in the word tree needs to be at least 3 bytes.
 * If int is 8 bytes we could use something smaller, but what? */
#if SIZEOF_INT > 2
typedef int idx_T;
#else
typedef long idx_T;
#endif
232
/* Flags used for a word.  Only the lowest byte can be used, the region byte
 * comes above it. */
#define WF_REGION   0x01        /* region byte follows */
#define WF_ONECAP   0x02        /* word with one capital (or all capitals) */
#define WF_ALLCAP   0x04        /* word must be all capitals */
#define WF_RARE     0x08        /* rare word */
#define WF_BANNED   0x10        /* bad word */
#define WF_PFX      0x20        /* prefix ID list follows */
#define WF_KEEPCAP  0x80        /* keep-case word */

/* All flags that describe the capitalisation of a word. */
#define WF_CAPMASK  (WF_ONECAP | WF_ALLCAP | WF_KEEPCAP)

#define WF_RAREPFX  0x1000000   /* in sl_pidxs: flag for rare postponed
                                   prefix; must be above prefixID (one byte)
                                   and prefcondnr (two bytes) */

/* Special byte values in a tree node. */
#define BY_NOFLAGS  0           /* end of word without flags or region */
#define BY_FLAGS    1           /* end of word, flag byte follows */
#define BY_INDEX    2           /* child is shared, index follows */
#define BY_SPECIAL  BY_INDEX    /* highest special byte value */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000253
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000254/* Info from "REP" and "SAL" entries in ".aff" file used in si_rep, sl_rep,
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000255 * and si_sal. Not for sl_sal!
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000256 * One replacement: from "ft_from" to "ft_to". */
257typedef struct fromto_S
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000258{
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000259 char_u *ft_from;
260 char_u *ft_to;
261} fromto_T;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000262
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000263/* Info from "SAL" entries in ".aff" file used in sl_sal.
264 * The info is split for quick processing by spell_soundfold().
265 * Note that "sm_oneof" and "sm_rules" point into sm_lead. */
266typedef struct salitem_S
267{
268 char_u *sm_lead; /* leading letters */
269 int sm_leadlen; /* length of "sm_lead" */
270 char_u *sm_oneoff; /* letters from () or NULL */
271 char_u *sm_rules; /* rules like ^, $, priority */
272 char_u *sm_to; /* replacement. */
273} salitem_T;
274
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000275/*
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000276 * Structure used to store words and other info for one language, loaded from
277 * a .spl file.
Bram Moolenaar51485f02005-06-04 21:55:20 +0000278 * The main access is through the tree in "sl_fbyts/sl_fidxs", storing the
279 * case-folded words. "sl_kbyts/sl_kidxs" is for keep-case words.
280 *
281 * The "byts" array stores the possible bytes in each tree node, preceded by
282 * the number of possible bytes, sorted on byte value:
283 * <len> <byte1> <byte2> ...
284 * The "idxs" array stores the index of the child node corresponding to the
285 * byte in "byts".
286 * Exception: when the byte is zero, the word may end here and "idxs" holds
287 * the flags and region for the word. There may be several zeros in sequence
288 * for alternative flag/region combinations.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000289 */
290typedef struct slang_S slang_T;
291struct slang_S
292{
293 slang_T *sl_next; /* next language */
294 char_u *sl_name; /* language name "en", "en.rare", "nl", etc. */
Bram Moolenaarb765d632005-06-07 21:00:02 +0000295 char_u *sl_fname; /* name of .spl file */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000296 int sl_add; /* TRUE if it's a .add file. */
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000297
Bram Moolenaar51485f02005-06-04 21:55:20 +0000298 char_u *sl_fbyts; /* case-folded word bytes */
Bram Moolenaar9f30f502005-06-14 22:01:04 +0000299 idx_T *sl_fidxs; /* case-folded word indexes */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000300 char_u *sl_kbyts; /* keep-case word bytes */
Bram Moolenaar9f30f502005-06-14 22:01:04 +0000301 idx_T *sl_kidxs; /* keep-case word indexes */
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000302 char_u *sl_pbyts; /* prefix tree word bytes */
303 idx_T *sl_pidxs; /* prefix tree word indexes */
304
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000305 char_u sl_regions[17]; /* table with up to 8 region names plus NUL */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000306
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000307 int sl_prefixcnt; /* number of items in "sl_prefprog" */
308 regprog_T **sl_prefprog; /* table with regprogs for prefixes */
309
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000310 garray_T sl_rep; /* list of fromto_T entries from REP lines */
311 short sl_rep_first[256]; /* indexes where byte first appears, -1 if
312 there is none */
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000313 garray_T sl_sal; /* list of salitem_T entries from SAL lines */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000314 short sl_sal_first[256]; /* indexes where byte first appears, -1 if
315 there is none */
316 int sl_followup; /* SAL followup */
317 int sl_collapse; /* SAL collapse_result */
318 int sl_rem_accents; /* SAL remove_accents */
Bram Moolenaarea424162005-06-16 21:51:00 +0000319 int sl_has_map; /* TRUE if there is a MAP line */
320#ifdef FEAT_MBYTE
321 hashtab_T sl_map_hash; /* MAP for multi-byte chars */
322 int sl_map_array[256]; /* MAP for first 256 chars */
323#else
324 char_u sl_map_array[256]; /* MAP for first 256 chars */
325#endif
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000326};
327
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000328/* First language that is loaded, start of the linked list of loaded
329 * languages. */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000330static slang_T *first_lang = NULL;
331
/* Flags used in .spl file for soundsalike flags (the <salflags> byte).
 * NOTE: "SAL_F0LLOWUP" is spelled with a zero; the name is kept as-is because
 * other code may refer to it. */
#define SAL_F0LLOWUP        1
#define SAL_COLLAPSE        2
#define SAL_REM_ACCENTS     4
336
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000337/*
338 * Structure used in "b_langp", filled from 'spelllang'.
339 */
340typedef struct langp_S
341{
342 slang_T *lp_slang; /* info for this language (NULL for last one) */
343 int lp_region; /* bitmask for region or REGION_ALL */
344} langp_T;
345
346#define LANGP_ENTRY(ga, i) (((langp_T *)(ga).ga_data) + (i))
347
#define REGION_ALL 0xff         /* word valid in all regions */

/* Result values.  Lower number is accepted over higher one. */
#define SP_BANNED   (-1)        /* parenthesized: negative constant */
#define SP_OK       0
#define SP_RARE     1
#define SP_LOCAL    2
#define SP_BAD      3
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000356
#define VIMSPELLMAGIC  "VIMspell08"  /* string at start of Vim spell file */
#define VIMSPELLMAGICL 10            /* length of VIMSPELLMAGIC, without NUL */
359
360/*
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000361 * Information used when looking for suggestions.
362 */
363typedef struct suginfo_S
364{
365 garray_T su_ga; /* suggestions, contains "suggest_T" */
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000366 int su_maxcount; /* max. number of suggestions displayed */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000367 int su_maxscore; /* maximum score for adding to su_ga */
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000368 garray_T su_sga; /* like su_ga, sound-folded scoring */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000369 char_u *su_badptr; /* start of bad word in line */
370 int su_badlen; /* length of detected bad word in line */
Bram Moolenaar0c405862005-06-22 22:26:26 +0000371 int su_badflags; /* caps flags for bad word */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000372 char_u su_badword[MAXWLEN]; /* bad word truncated at su_badlen */
373 char_u su_fbadword[MAXWLEN]; /* su_badword case-folded */
374 hashtab_T su_banned; /* table with banned words */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000375} suginfo_T;
376
377/* One word suggestion. Used in "si_ga". */
378typedef struct suggest_S
379{
380 char_u *st_word; /* suggested word, allocated string */
381 int st_orglen; /* length of replaced text */
382 int st_score; /* lower is better */
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000383 int st_altscore; /* used when st_score compares equal */
384 int st_salscore; /* st_score is for soundalike */
Bram Moolenaar9f30f502005-06-14 22:01:04 +0000385 int st_had_bonus; /* bonus already included in score */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000386} suggest_T;
387
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000388#define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i])
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000389
/* Number of suggestions kept when cleaning up.  When rescore_suggestions() is
 * called the score may change, thus we need to keep more than what is
 * displayed.  This is su_maxcount, but never less than 50. */
#define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 50 ? 50 : (su)->su_maxcount)

/* Threshold for sorting and cleaning up suggestions.  Don't want to keep lots
 * of suggestions that are not going to be displayed. */
#define SUG_MAX_COUNT(su)   ((su)->su_maxcount + 50)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000398
/* score for various changes */
#define SCORE_SPLIT     149     /* split bad word */
#define SCORE_ICASE     52      /* slightly different case */
#define SCORE_REGION    70      /* word is for different region */
#define SCORE_RARE      180     /* rare word */
#define SCORE_SWAP      90      /* swap two characters */
#define SCORE_SWAP3     110     /* swap two characters in three */
#define SCORE_REP       87      /* REP replacement */
#define SCORE_SUBST     93      /* substitute a character */
#define SCORE_SIMILAR   33      /* substitute a similar character */
#define SCORE_DEL       94      /* delete a character */
#define SCORE_DELDUP    64      /* delete a duplicated character */
#define SCORE_INS       96      /* insert a character */
#define SCORE_INSDUP    66      /* insert a duplicate character */
#define SCORE_NONWORD   103     /* change non-word to word char */

#define SCORE_MAXINIT   350     /* Initial maximum score: higher == slower.
                                 * 350 allows for about three changes. */

/* The expansion is parenthesized so that SCORE_BIG behaves as one value inside
 * a larger expression (e.g. "x / SCORE_BIG"). */
#define SCORE_BIG       (SCORE_INS * 3)     /* big difference */
#define SCORE_MAXMAX    999999  /* accept any score */
420
421/*
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000422 * Structure to store info for word matching.
423 */
424typedef struct matchinf_S
425{
426 langp_T *mi_lp; /* info for language and region */
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000427
428 /* pointers to original text to be checked */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000429 char_u *mi_word; /* start of word being checked */
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000430 char_u *mi_end; /* end of matching word so far */
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000431 char_u *mi_fend; /* next char to be added to mi_fword */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000432 char_u *mi_cend; /* char after what was used for
433 mi_capflags */
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000434
435 /* case-folded text */
436 char_u mi_fword[MAXWLEN + 1]; /* mi_word case-folded */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000437 int mi_fwordlen; /* nr of valid bytes in mi_fword */
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000438
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000439 /* for when checking word after a prefix */
440 int mi_prefarridx; /* index in sl_pidxs with list of
441 prefixID/condition */
442 int mi_prefcnt; /* number of entries at mi_prefarridx */
443 int mi_prefixlen; /* byte length of prefix */
444
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000445 /* others */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000446 int mi_result; /* result so far: SP_BAD, SP_OK, etc. */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000447 int mi_capflags; /* WF_ONECAP WF_ALLCAP WF_KEEPCAP */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000448} matchinf_T;
449
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000450/*
451 * The tables used for recognizing word characters according to spelling.
452 * These are only used for the first 256 characters of 'encoding'.
453 */
454typedef struct spelltab_S
455{
456 char_u st_isw[256]; /* flags: is word char */
457 char_u st_isu[256]; /* flags: is uppercase char */
458 char_u st_fold[256]; /* chars: folded case */
Bram Moolenaar9f30f502005-06-14 22:01:04 +0000459 char_u st_upper[256]; /* chars: upper case */
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000460} spelltab_T;
461
462static spelltab_T spelltab;
463static int did_set_spelltab;
Bram Moolenaarcf6bf392005-06-27 22:27:46 +0000464static char_u spell_ismw[256]; /* flags: is midword char */
465#ifdef FEAT_MBYTE
466static char_u *spell_ismw_mb = NULL; /* multi-byte midword chars */
467#endif
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000468
Bram Moolenaar9f30f502005-06-14 22:01:04 +0000469#define CF_WORD 0x01
470#define CF_UPPER 0x02
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000471
472static void clear_spell_chartab __ARGS((spelltab_T *sp));
473static int set_spell_finish __ARGS((spelltab_T *new_st));
Bram Moolenaarea408852005-06-25 22:49:46 +0000474static int spell_iswordp __ARGS((char_u *p));
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000475static void write_spell_prefcond __ARGS((FILE *fd, garray_T *gap));
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000476
/*
 * Return TRUE if "p" points to a word character.  Like spell_iswordp() but
 * without the special handling of a single quote.
 * Checking for a word character is done very often, avoid the function call
 * overhead.
 * With FEAT_MBYTE a multi-byte character is a word character when its class
 * from mb_get_class() is 2 or higher; otherwise the "st_isw" table is used.
 * Note: with FEAT_MBYTE "p" is evaluated more than once, don't pass an
 * expression with side effects.
 */
#ifdef FEAT_MBYTE
# define SPELL_ISWORDP(p) ((has_mbyte && MB_BYTE2LEN(*(p)) > 1) \
                ? (mb_get_class(p) >= 2) : spelltab.st_isw[*(p)])
#else
# define SPELL_ISWORDP(p) (spelltab.st_isw[*(p)])
#endif
489
/*
 * For finding suggestions: At each node in the tree these states are tried:
 */
typedef enum
{
    STATE_START = 0,    /* At start of node check for NUL bytes (goodword
                         * ends); if badword ends there is a match, otherwise
                         * try splitting word. */
    STATE_SPLITUNDO,    /* Undo splitting. */
    STATE_ENDNUL,       /* Past NUL bytes at start of the node. */
    STATE_PLAIN,        /* Use each byte of the node. */
    STATE_DEL,          /* Delete a byte from the bad word. */
    STATE_INS,          /* Insert a byte in the bad word. */
    STATE_SWAP,         /* Swap two bytes. */
    STATE_UNSWAP,       /* Undo swap two characters. */
    STATE_SWAP3,        /* Swap two characters over three. */
    STATE_UNSWAP3,      /* Undo Swap two characters over three. */
    STATE_UNROT3L,      /* Undo rotate three characters left */
    STATE_UNROT3R,      /* Undo rotate three characters right */
    STATE_REP_INI,      /* Prepare for using REP items. */
    STATE_REP,          /* Use matching REP items from the .aff file. */
    STATE_REP_UNDO,     /* Undo a REP item replacement. */
    STATE_FINAL         /* End of this node. */
} state_T;
514
515/*
Bram Moolenaar0c405862005-06-22 22:26:26 +0000516 * Struct to keep the state at each level in suggest_try_change().
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000517 */
518typedef struct trystate_S
519{
Bram Moolenaarea424162005-06-16 21:51:00 +0000520 state_T ts_state; /* state at this level, STATE_ */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000521 int ts_score; /* score */
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000522 idx_T ts_arridx; /* index in tree array, start of node */
Bram Moolenaarea424162005-06-16 21:51:00 +0000523 short ts_curi; /* index in list of child nodes */
524 char_u ts_fidx; /* index in fword[], case-folded bad word */
525 char_u ts_fidxtry; /* ts_fidx at which bytes may be changed */
526 char_u ts_twordlen; /* valid length of tword[] */
527#ifdef FEAT_MBYTE
528 char_u ts_tcharlen; /* number of bytes in tword character */
529 char_u ts_tcharidx; /* current byte index in tword character */
530 char_u ts_isdiff; /* DIFF_ values */
531 char_u ts_fcharstart; /* index in fword where badword char started */
532#endif
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000533 char_u ts_save_prewordlen; /* saved "prewordlen" */
Bram Moolenaarea424162005-06-16 21:51:00 +0000534 char_u ts_save_splitoff; /* su_splitoff saved here */
Bram Moolenaar0c405862005-06-22 22:26:26 +0000535 char_u ts_save_badflags; /* su_badflags saved here */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000536} trystate_T;
537
/* values for ts_isdiff */
#define DIFF_NONE       0       /* no different byte (yet) */
#define DIFF_YES        1       /* different byte found */
#define DIFF_INSERT     2       /* inserting character */

/* mode values for find_word */
#define FIND_FOLDWORD   0       /* find word case-folded */
#define FIND_KEEPWORD   1       /* find keep-case word */
#define FIND_PREFIX     2       /* find word after prefix */
547
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000548static slang_T *slang_alloc __ARGS((char_u *lang));
549static void slang_free __ARGS((slang_T *lp));
Bram Moolenaarb765d632005-06-07 21:00:02 +0000550static void slang_clear __ARGS((slang_T *lp));
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000551static void find_word __ARGS((matchinf_T *mip, int mode));
Bram Moolenaarf417f2b2005-06-23 22:29:21 +0000552static int valid_word_prefix __ARGS((int totprefcnt, int arridx, int prefid, char_u *word, slang_T *slang));
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000553static void find_prefix __ARGS((matchinf_T *mip));
554static int fold_more __ARGS((matchinf_T *mip));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000555static int spell_valid_case __ARGS((int origflags, int treeflags));
Bram Moolenaarf417f2b2005-06-23 22:29:21 +0000556static int no_spell_checking __ARGS((void));
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000557static void spell_load_lang __ARGS((char_u *lang));
Bram Moolenaarb765d632005-06-07 21:00:02 +0000558static char_u *spell_enc __ARGS((void));
559static void spell_load_cb __ARGS((char_u *fname, void *cookie));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000560static slang_T *spell_load_file __ARGS((char_u *fname, char_u *lang, slang_T *old_lp, int silent));
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000561static idx_T read_tree __ARGS((FILE *fd, char_u *byts, idx_T *idxs, int maxidx, int startidx, int prefixtree, int maxprefcondnr));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000562static int find_region __ARGS((char_u *rp, char_u *region));
563static int captype __ARGS((char_u *word, char_u *end));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000564static void spell_reload_one __ARGS((char_u *fname, int added_word));
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000565static int set_spell_charflags __ARGS((char_u *flags, int cnt, char_u *upp));
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000566static int set_spell_chartab __ARGS((char_u *fol, char_u *low, char_u *upp));
567static void write_spell_chartab __ARGS((FILE *fd));
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000568static int spell_casefold __ARGS((char_u *p, int len, char_u *buf, int buflen));
Bram Moolenaarea408852005-06-25 22:49:46 +0000569static void spell_find_suggest __ARGS((char_u *badptr, suginfo_T *su, int maxcount, int banbadword));
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000570static void spell_find_cleanup __ARGS((suginfo_T *su));
Bram Moolenaar9f30f502005-06-14 22:01:04 +0000571static void onecap_copy __ARGS((char_u *word, char_u *wcopy, int upper));
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000572static void allcap_copy __ARGS((char_u *word, char_u *wcopy));
Bram Moolenaar0c405862005-06-22 22:26:26 +0000573static void suggest_try_special __ARGS((suginfo_T *su));
574static void suggest_try_change __ARGS((suginfo_T *su));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000575static int try_deeper __ARGS((suginfo_T *su, trystate_T *stack, int depth, int score_add));
576static void find_keepcap_word __ARGS((slang_T *slang, char_u *fword, char_u *kword));
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000577static void score_comp_sal __ARGS((suginfo_T *su));
578static void score_combine __ARGS((suginfo_T *su));
Bram Moolenaarf417f2b2005-06-23 22:29:21 +0000579static int stp_sal_score __ARGS((suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound));
Bram Moolenaar0c405862005-06-22 22:26:26 +0000580static void suggest_try_soundalike __ARGS((suginfo_T *su));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000581static void make_case_word __ARGS((char_u *fword, char_u *cword, int flags));
Bram Moolenaarea424162005-06-16 21:51:00 +0000582static void set_map_str __ARGS((slang_T *lp, char_u *map));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000583static int similar_chars __ARGS((slang_T *slang, int c1, int c2));
Bram Moolenaarf417f2b2005-06-23 22:29:21 +0000584static void add_suggestion __ARGS((suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000585static void add_banned __ARGS((suginfo_T *su, char_u *word));
586static int was_banned __ARGS((suginfo_T *su, char_u *word));
587static void free_banned __ARGS((suginfo_T *su));
Bram Moolenaar9f30f502005-06-14 22:01:04 +0000588static void rescore_suggestions __ARGS((suginfo_T *su));
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000589static int cleanup_suggestions __ARGS((garray_T *gap, int maxscore, int keep));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000590static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, char_u *res));
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000591static int soundalike_score __ARGS((char_u *goodsound, char_u *badsound));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000592static int spell_edit_score __ARGS((char_u *badword, char_u *goodword));
Bram Moolenaarf417f2b2005-06-23 22:29:21 +0000593static void dump_word __ARGS((char_u *word, int round, int flags, linenr_T lnum));
594static linenr_T apply_prefixes __ARGS((slang_T *slang, char_u *word, int round, int flags, linenr_T startlnum));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000595
/*
 * Use our own character-case definitions, because the current locale may
 * differ from what the .spl file uses.
 * These must not be called with a negative number!
 * Every macro evaluates its argument more than once: do not pass an
 * expression with side effects.
 */
#ifndef FEAT_MBYTE
/* Non-multi-byte implementation. */
# define SPELL_TOFOLD(c) ((c) < 256 ? spelltab.st_fold[c] : (c))
# define SPELL_TOUPPER(c) ((c) < 256 ? spelltab.st_upper[c] : (c))
# define SPELL_ISUPPER(c) ((c) < 256 ? spelltab.st_isu[c] : FALSE)
#else
/* Multi-byte implementation.  For Unicode we can call utf_*(), but don't do
 * that for ASCII, because we don't want to use 'casemap' here.  Otherwise use
 * the "w" library function for characters above 255 if available. */
# ifdef HAVE_TOWLOWER
#  define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
            : (c) < 256 ? spelltab.st_fold[c] : towlower(c))
# else
#  define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
            : (c) < 256 ? spelltab.st_fold[c] : (c))
# endif

# ifdef HAVE_TOWUPPER
#  define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
            : (c) < 256 ? spelltab.st_upper[c] : towupper(c))
# else
#  define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
            : (c) < 256 ? spelltab.st_upper[c] : (c))
# endif

# ifdef HAVE_ISWUPPER
#  define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
            : (c) < 256 ? spelltab.st_isu[c] : iswupper(c))
# else
/* Without iswupper() nothing is known about a character above 255.  The
 * result is a truth value, thus the fallback must be FALSE (as in the
 * non-multi-byte version) and not the character itself, which would make
 * every such character count as upper case. */
#  define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
            : (c) < 256 ? spelltab.st_isu[c] : (FALSE))
# endif
#endif
634
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000635
636static char *e_format = N_("E759: Format error in spell file");
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000637
638/*
639 * Main spell-checking function.
Bram Moolenaar51485f02005-06-04 21:55:20 +0000640 * "ptr" points to a character that could be the start of a word.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000641 * "*attrp" is set to the attributes for a badly spelled word. For a non-word
642 * or when it's OK it remains unchanged.
643 * This must only be called when 'spelllang' is not empty.
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000644 *
645 * "sug" is normally NULL. When looking for suggestions it points to
646 * suginfo_T. It's passed as a void pointer to keep the struct local.
647 *
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000648 * Returns the length of the word in bytes, also when it's OK, so that the
649 * caller can skip over the word.
650 */
651 int
Bram Moolenaar51485f02005-06-04 21:55:20 +0000652spell_check(wp, ptr, attrp)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000653 win_T *wp; /* current window */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000654 char_u *ptr;
655 int *attrp;
656{
657 matchinf_T mi; /* Most things are put in "mi" so that it can
658 be passed to functions quickly. */
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000659 int nrlen = 0; /* found a number first */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000660
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000661 /* A word never starts at a space or a control character. Return quickly
662 * then, skipping over the character. */
663 if (*ptr <= ' ')
664 return 1;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000665
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000666 /* A number is always OK. Also skip hexadecimal numbers 0xFF99 and
Bram Moolenaar0c405862005-06-22 22:26:26 +0000667 * 0X99FF. But when a word character follows do check spelling to find
668 * "3GPP". */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000669 if (*ptr >= '0' && *ptr <= '9')
Bram Moolenaar51485f02005-06-04 21:55:20 +0000670 {
Bram Moolenaar3982c542005-06-08 21:56:31 +0000671 if (*ptr == '0' && (ptr[1] == 'x' || ptr[1] == 'X'))
672 mi.mi_end = skiphex(ptr + 2);
Bram Moolenaar51485f02005-06-04 21:55:20 +0000673 else
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000674 {
Bram Moolenaar51485f02005-06-04 21:55:20 +0000675 mi.mi_end = skipdigits(ptr);
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000676 nrlen = mi.mi_end - ptr;
677 }
Bram Moolenaarea408852005-06-25 22:49:46 +0000678 if (!spell_iswordp(mi.mi_end))
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000679 return (int)(mi.mi_end - ptr);
Bram Moolenaar0c405862005-06-22 22:26:26 +0000680
681 /* Try including the digits in the word. */
682 mi.mi_fend = ptr + nrlen;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000683 }
Bram Moolenaar0c405862005-06-22 22:26:26 +0000684 else
685 mi.mi_fend = ptr;
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000686
Bram Moolenaar0c405862005-06-22 22:26:26 +0000687 /* Find the normal end of the word (until the next non-word character). */
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000688 mi.mi_word = ptr;
Bram Moolenaarea408852005-06-25 22:49:46 +0000689 if (spell_iswordp(mi.mi_fend))
Bram Moolenaar51485f02005-06-04 21:55:20 +0000690 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000691 do
Bram Moolenaar51485f02005-06-04 21:55:20 +0000692 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000693 mb_ptr_adv(mi.mi_fend);
Bram Moolenaarea408852005-06-25 22:49:46 +0000694 } while (*mi.mi_fend != NUL && spell_iswordp(mi.mi_fend));
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000695 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000696
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000697 /* We always use the characters up to the next non-word character,
698 * also for bad words. */
699 mi.mi_end = mi.mi_fend;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000700
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000701 /* Check caps type later. */
702 mi.mi_capflags = 0;
703 mi.mi_cend = NULL;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000704
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000705 /* Include one non-word character so that we can check for the
706 * word end. */
707 if (*mi.mi_fend != NUL)
708 mb_ptr_adv(mi.mi_fend);
709
710 (void)spell_casefold(ptr, (int)(mi.mi_fend - ptr), mi.mi_fword,
711 MAXWLEN + 1);
712 mi.mi_fwordlen = STRLEN(mi.mi_fword);
713
714 /* The word is bad unless we recognize it. */
715 mi.mi_result = SP_BAD;
716
717 /*
718 * Loop over the languages specified in 'spelllang'.
719 * We check them all, because a matching word may be longer than an
720 * already found matching word.
721 */
722 for (mi.mi_lp = LANGP_ENTRY(wp->w_buffer->b_langp, 0);
723 mi.mi_lp->lp_slang != NULL; ++mi.mi_lp)
724 {
725 /* Check for a matching word in case-folded words. */
726 find_word(&mi, FIND_FOLDWORD);
727
728 /* Check for a matching word in keep-case words. */
729 find_word(&mi, FIND_KEEPWORD);
730
731 /* Check for matching prefixes. */
732 find_prefix(&mi);
733 }
734
735 if (mi.mi_result != SP_OK)
736 {
Bram Moolenaar0c405862005-06-22 22:26:26 +0000737 /* If we found a number skip over it. Allows for "42nd". Do flag
738 * rare and local words, e.g., "3GPP". */
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000739 if (nrlen > 0)
Bram Moolenaar0c405862005-06-22 22:26:26 +0000740 {
741 if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED)
742 return nrlen;
743 }
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000744
745 /* When we are at a non-word character there is no error, just
746 * skip over the character (try looking for a word after it). */
Bram Moolenaar0c405862005-06-22 22:26:26 +0000747 else if (!SPELL_ISWORDP(ptr))
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000748 {
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000749#ifdef FEAT_MBYTE
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000750 if (has_mbyte)
751 return mb_ptr2len_check(ptr);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000752#endif
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000753 return 1;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000754 }
Bram Moolenaard857f0e2005-06-21 22:37:39 +0000755
756 if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED)
757 *attrp = highlight_attr[HLF_SPB];
758 else if (mi.mi_result == SP_RARE)
759 *attrp = highlight_attr[HLF_SPR];
760 else
761 *attrp = highlight_attr[HLF_SPL];
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000762 }
763
Bram Moolenaar51485f02005-06-04 21:55:20 +0000764 return (int)(mi.mi_end - ptr);
765}
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000766
Bram Moolenaar51485f02005-06-04 21:55:20 +0000767/*
768 * Check if the word at "mip->mi_word" is in the tree.
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000769 * When "mode" is FIND_FOLDWORD check in fold-case word tree.
770 * When "mode" is FIND_KEEPWORD check in keep-case word tree.
771 * When "mode" is FIND_PREFIX check for word after prefix in fold-case word
772 * tree.
Bram Moolenaar51485f02005-06-04 21:55:20 +0000773 *
774 * For a match mip->mi_result is updated.
775 */
776 static void
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000777find_word(mip, mode)
Bram Moolenaar51485f02005-06-04 21:55:20 +0000778 matchinf_T *mip;
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000779 int mode;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000780{
Bram Moolenaar9f30f502005-06-14 22:01:04 +0000781 idx_T arridx = 0;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000782 int endlen[MAXWLEN]; /* length at possible word endings */
Bram Moolenaar9f30f502005-06-14 22:01:04 +0000783 idx_T endidx[MAXWLEN]; /* possible word endings */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000784 int endidxcnt = 0;
785 int len;
786 int wlen = 0;
787 int flen;
788 int c;
789 char_u *ptr;
Bram Moolenaar9f30f502005-06-14 22:01:04 +0000790 idx_T lo, hi, m;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000791#ifdef FEAT_MBYTE
792 char_u *s;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000793 char_u *p;
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000794#endif
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000795 int res = SP_BAD;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000796 slang_T *slang = mip->mi_lp->lp_slang;
797 unsigned flags;
798 char_u *byts;
Bram Moolenaar9f30f502005-06-14 22:01:04 +0000799 idx_T *idxs;
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000800 int prefid;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000801
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000802 if (mode == FIND_KEEPWORD)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000803 {
Bram Moolenaar51485f02005-06-04 21:55:20 +0000804 /* Check for word with matching case in keep-case tree. */
805 ptr = mip->mi_word;
806 flen = 9999; /* no case folding, always enough bytes */
807 byts = slang->sl_kbyts;
808 idxs = slang->sl_kidxs;
809 }
810 else
811 {
812 /* Check for case-folded in case-folded tree. */
813 ptr = mip->mi_fword;
814 flen = mip->mi_fwordlen; /* available case-folded bytes */
815 byts = slang->sl_fbyts;
816 idxs = slang->sl_fidxs;
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000817
818 if (mode == FIND_PREFIX)
819 {
820 /* Skip over the prefix. */
821 wlen = mip->mi_prefixlen;
822 flen -= mip->mi_prefixlen;
823 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000824 }
825
Bram Moolenaar51485f02005-06-04 21:55:20 +0000826 if (byts == NULL)
827 return; /* array is empty */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000828
Bram Moolenaar51485f02005-06-04 21:55:20 +0000829 /*
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000830 * Repeat advancing in the tree until:
831 * - there is a byte that doesn't match,
832 * - we reach the end of the tree,
833 * - or we reach the end of the line.
Bram Moolenaar51485f02005-06-04 21:55:20 +0000834 */
835 for (;;)
836 {
Bram Moolenaar0c405862005-06-22 22:26:26 +0000837 if (flen <= 0 && *mip->mi_fend != NUL)
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000838 flen = fold_more(mip);
Bram Moolenaar51485f02005-06-04 21:55:20 +0000839
840 len = byts[arridx++];
841
842 /* If the first possible byte is a zero the word could end here.
843 * Remember this index, we first check for the longest word. */
844 if (byts[arridx] == 0)
845 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000846 if (endidxcnt == MAXWLEN)
847 {
848 /* Must be a corrupted spell file. */
849 EMSG(_(e_format));
850 return;
851 }
Bram Moolenaar51485f02005-06-04 21:55:20 +0000852 endlen[endidxcnt] = wlen;
853 endidx[endidxcnt++] = arridx++;
854 --len;
855
856 /* Skip over the zeros, there can be several flag/region
857 * combinations. */
858 while (len > 0 && byts[arridx] == 0)
859 {
860 ++arridx;
861 --len;
862 }
863 if (len == 0)
864 break; /* no children, word must end here */
865 }
866
867 /* Stop looking at end of the line. */
868 if (ptr[wlen] == NUL)
869 break;
870
871 /* Perform a binary search in the list of accepted bytes. */
872 c = ptr[wlen];
Bram Moolenaar0c405862005-06-22 22:26:26 +0000873 if (c == TAB) /* <Tab> is handled like <Space> */
874 c = ' ';
Bram Moolenaar51485f02005-06-04 21:55:20 +0000875 lo = arridx;
876 hi = arridx + len - 1;
877 while (lo < hi)
878 {
879 m = (lo + hi) / 2;
880 if (byts[m] > c)
881 hi = m - 1;
882 else if (byts[m] < c)
883 lo = m + 1;
884 else
885 {
886 lo = hi = m;
887 break;
888 }
889 }
890
891 /* Stop if there is no matching byte. */
892 if (hi < lo || byts[lo] != c)
893 break;
894
895 /* Continue at the child (if there is one). */
896 arridx = idxs[lo];
897 ++wlen;
898 --flen;
Bram Moolenaar0c405862005-06-22 22:26:26 +0000899
900 /* One space in the good word may stand for several spaces in the
901 * checked word. */
902 if (c == ' ')
903 {
904 for (;;)
905 {
906 if (flen <= 0 && *mip->mi_fend != NUL)
907 flen = fold_more(mip);
908 if (ptr[wlen] != ' ' && ptr[wlen] != TAB)
909 break;
910 ++wlen;
911 --flen;
912 }
913 }
Bram Moolenaar51485f02005-06-04 21:55:20 +0000914 }
915
916 /*
917 * Verify that one of the possible endings is valid. Try the longest
918 * first.
919 */
920 while (endidxcnt > 0)
921 {
922 --endidxcnt;
923 arridx = endidx[endidxcnt];
924 wlen = endlen[endidxcnt];
925
926#ifdef FEAT_MBYTE
927 if ((*mb_head_off)(ptr, ptr + wlen) > 0)
928 continue; /* not at first byte of character */
929#endif
Bram Moolenaarea408852005-06-25 22:49:46 +0000930 if (spell_iswordp(ptr + wlen))
Bram Moolenaar51485f02005-06-04 21:55:20 +0000931 continue; /* next char is a word character */
932
933#ifdef FEAT_MBYTE
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000934 if (mode != FIND_KEEPWORD && has_mbyte)
Bram Moolenaar51485f02005-06-04 21:55:20 +0000935 {
936 /* Compute byte length in original word, length may change
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000937 * when folding case. This can be slow, take a shortcut when the
938 * case-folded word is equal to the keep-case word. */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000939 p = mip->mi_word;
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000940 if (STRNCMP(ptr, p, wlen) != 0)
941 {
942 for (s = ptr; s < ptr + wlen; mb_ptr_adv(s))
943 mb_ptr_adv(p);
944 wlen = p - mip->mi_word;
945 }
Bram Moolenaar51485f02005-06-04 21:55:20 +0000946 }
947#endif
948
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000949 /* Check flags and region. For FIND_PREFIX check the condition and
950 * prefix ID.
951 * Repeat this if there are more flags/region alternatives until there
952 * is a match. */
953 res = SP_BAD;
954 for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0;
955 --len, ++arridx)
Bram Moolenaar51485f02005-06-04 21:55:20 +0000956 {
957 flags = idxs[arridx];
Bram Moolenaar9f30f502005-06-14 22:01:04 +0000958
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000959 /* For the fold-case tree check that the case of the checked word
960 * matches with what the word in the tree requires.
961 * For keep-case tree the case is always right. For prefixes we
962 * don't bother to check. */
963 if (mode == FIND_FOLDWORD)
Bram Moolenaar51485f02005-06-04 21:55:20 +0000964 {
Bram Moolenaar51485f02005-06-04 21:55:20 +0000965 if (mip->mi_cend != mip->mi_word + wlen)
966 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000967 /* mi_capflags was set for a different word length, need
968 * to do it again. */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000969 mip->mi_cend = mip->mi_word + wlen;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000970 mip->mi_capflags = captype(mip->mi_word, mip->mi_cend);
Bram Moolenaar51485f02005-06-04 21:55:20 +0000971 }
972
Bram Moolenaar0c405862005-06-22 22:26:26 +0000973 if (mip->mi_capflags == WF_KEEPCAP
974 || !spell_valid_case(mip->mi_capflags, flags))
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000975 continue;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000976 }
977
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000978 /* When mode is FIND_PREFIX the word must support the prefix:
979 * check the prefix ID and the condition. Do that for the list at
Bram Moolenaarcf6bf392005-06-27 22:27:46 +0000980 * mip->mi_prefarridx that find_prefix() filled. */
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000981 if (mode == FIND_PREFIX)
Bram Moolenaar51485f02005-06-04 21:55:20 +0000982 {
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000983 /* The prefix ID is stored two bytes above the flags. */
984 prefid = (unsigned)flags >> 16;
Bram Moolenaarcf6bf392005-06-27 22:27:46 +0000985 c = valid_word_prefix(mip->mi_prefcnt, mip->mi_prefarridx,
Bram Moolenaarf417f2b2005-06-23 22:29:21 +0000986 prefid, mip->mi_fword + mip->mi_prefixlen,
Bram Moolenaarcf6bf392005-06-27 22:27:46 +0000987 slang);
988 if (c == 0)
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000989 continue;
Bram Moolenaarcf6bf392005-06-27 22:27:46 +0000990
991 /* Use the WF_RARE flag for a rare prefix. */
992 if (c & WF_RAREPFX)
993 flags |= WF_RARE;
Bram Moolenaar1d73c882005-06-19 22:48:47 +0000994 }
995
996 if (flags & WF_BANNED)
997 res = SP_BANNED;
998 else if (flags & WF_REGION)
999 {
1000 /* Check region. */
1001 if ((mip->mi_lp->lp_region & (flags >> 8)) != 0)
1002 res = SP_OK;
1003 else
1004 res = SP_LOCAL;
1005 }
1006 else if (flags & WF_RARE)
1007 res = SP_RARE;
1008 else
1009 res = SP_OK;
1010
1011 /* Always use the longest match and the best result. */
1012 if (mip->mi_result > res)
1013 {
1014 mip->mi_result = res;
1015 mip->mi_end = mip->mi_word + wlen;
1016 }
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00001017 else if (mip->mi_result == res && mip->mi_end < mip->mi_word + wlen)
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001018 mip->mi_end = mip->mi_word + wlen;
1019
1020 if (res == SP_OK)
1021 break;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001022 }
1023
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001024 if (res == SP_OK)
Bram Moolenaar51485f02005-06-04 21:55:20 +00001025 break;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001026 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001027}
1028
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001029/*
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00001030 * Return non-zero if the prefix indicated by "mip->mi_prefarridx" matches
1031 * with the prefix ID "prefid" for the word "word".
1032 * The WF_RAREPFX flag is included in the return value for a rare prefix.
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00001033 */
1034 static int
1035valid_word_prefix(totprefcnt, arridx, prefid, word, slang)
1036 int totprefcnt; /* nr of prefix IDs */
1037 int arridx; /* idx in sl_pidxs[] */
1038 int prefid;
1039 char_u *word;
1040 slang_T *slang;
1041{
1042 int prefcnt;
1043 int pidx;
1044 regprog_T *rp;
1045 regmatch_T regmatch;
1046
1047 for (prefcnt = totprefcnt - 1; prefcnt >= 0; --prefcnt)
1048 {
1049 pidx = slang->sl_pidxs[arridx + prefcnt];
1050
1051 /* Check the prefix ID. */
1052 if (prefid != (pidx & 0xff))
1053 continue;
1054
1055 /* Check the condition, if there is one. The condition index is
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00001056 * stored in the two bytes above the prefix ID byte. */
1057 rp = slang->sl_prefprog[((unsigned)pidx >> 8) & 0xffff];
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00001058 if (rp != NULL)
1059 {
1060 regmatch.regprog = rp;
1061 regmatch.rm_ic = FALSE;
1062 if (!vim_regexec(&regmatch, word, 0))
1063 continue;
1064 }
1065
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00001066 /* It's a match! Return the WF_RAREPFX flag. */
1067 return pidx;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00001068 }
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00001069 return 0;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00001070}
1071
1072/*
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001073 * Check if the word at "mip->mi_word" has a matching prefix.
1074 * If it does, then check the following word.
1075 *
1076 * For a match mip->mi_result is updated.
1077 */
1078 static void
1079find_prefix(mip)
1080 matchinf_T *mip;
1081{
1082 idx_T arridx = 0;
1083 int len;
1084 int wlen = 0;
1085 int flen;
1086 int c;
1087 char_u *ptr;
1088 idx_T lo, hi, m;
1089 slang_T *slang = mip->mi_lp->lp_slang;
1090 char_u *byts;
1091 idx_T *idxs;
1092
1093 /* We use the case-folded word here, since prefixes are always
1094 * case-folded. */
1095 ptr = mip->mi_fword;
1096 flen = mip->mi_fwordlen; /* available case-folded bytes */
1097 byts = slang->sl_pbyts;
1098 idxs = slang->sl_pidxs;
1099
1100 if (byts == NULL)
1101 return; /* array is empty */
1102
1103 /*
1104 * Repeat advancing in the tree until:
1105 * - there is a byte that doesn't match,
1106 * - we reach the end of the tree,
1107 * - or we reach the end of the line.
1108 */
1109 for (;;)
1110 {
1111 if (flen == 0 && *mip->mi_fend != NUL)
1112 flen = fold_more(mip);
1113
1114 len = byts[arridx++];
1115
1116 /* If the first possible byte is a zero the prefix could end here.
1117 * Check if the following word matches and supports the prefix. */
1118 if (byts[arridx] == 0)
1119 {
1120 /* There can be several prefixes with different conditions. We
1121 * try them all, since we don't know which one will give the
1122 * longest match. The word is the same each time, pass the list
1123 * of possible prefixes to find_word(). */
1124 mip->mi_prefarridx = arridx;
1125 mip->mi_prefcnt = len;
1126 while (len > 0 && byts[arridx] == 0)
1127 {
1128 ++arridx;
1129 --len;
1130 }
1131 mip->mi_prefcnt -= len;
1132
1133 /* Find the word that comes after the prefix. */
1134 mip->mi_prefixlen = wlen;
1135 find_word(mip, FIND_PREFIX);
1136
1137
1138 if (len == 0)
1139 break; /* no children, word must end here */
1140 }
1141
1142 /* Stop looking at end of the line. */
1143 if (ptr[wlen] == NUL)
1144 break;
1145
1146 /* Perform a binary search in the list of accepted bytes. */
1147 c = ptr[wlen];
1148 lo = arridx;
1149 hi = arridx + len - 1;
1150 while (lo < hi)
1151 {
1152 m = (lo + hi) / 2;
1153 if (byts[m] > c)
1154 hi = m - 1;
1155 else if (byts[m] < c)
1156 lo = m + 1;
1157 else
1158 {
1159 lo = hi = m;
1160 break;
1161 }
1162 }
1163
1164 /* Stop if there is no matching byte. */
1165 if (hi < lo || byts[lo] != c)
1166 break;
1167
1168 /* Continue at the child (if there is one). */
1169 arridx = idxs[lo];
1170 ++wlen;
1171 --flen;
1172 }
1173}
1174
1175/*
1176 * Need to fold at least one more character. Do until next non-word character
1177 * for efficiency.
1178 * Return the length of the folded chars in bytes.
1179 */
1180 static int
1181fold_more(mip)
1182 matchinf_T *mip;
1183{
1184 int flen;
1185 char_u *p;
1186
1187 p = mip->mi_fend;
1188 do
1189 {
1190 mb_ptr_adv(mip->mi_fend);
Bram Moolenaarea408852005-06-25 22:49:46 +00001191 } while (*mip->mi_fend != NUL && spell_iswordp(mip->mi_fend));
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001192
1193 /* Include the non-word character so that we can check for the
1194 * word end. */
1195 if (*mip->mi_fend != NUL)
1196 mb_ptr_adv(mip->mi_fend);
1197
1198 (void)spell_casefold(p, (int)(mip->mi_fend - p),
1199 mip->mi_fword + mip->mi_fwordlen,
1200 MAXWLEN - mip->mi_fwordlen);
1201 flen = STRLEN(mip->mi_fword + mip->mi_fwordlen);
1202 mip->mi_fwordlen += flen;
1203 return flen;
1204}
1205
1206/*
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001207 * Check case flags for a word. Return TRUE if the word has the requested
1208 * case.
1209 */
1210 static int
1211spell_valid_case(origflags, treeflags)
1212 int origflags; /* flags for the checked word. */
1213 int treeflags; /* flags for the word in the spell tree */
1214{
1215 return (origflags == WF_ALLCAP
1216 || ((treeflags & (WF_ALLCAP | WF_KEEPCAP)) == 0
1217 && ((treeflags & WF_ONECAP) == 0 || origflags == WF_ONECAP)));
1218}
1219
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00001220/*
1221 * Return TRUE if spell checking is not enabled.
1222 */
1223 static int
1224no_spell_checking()
1225{
1226 if (!curwin->w_p_spell || *curbuf->b_p_spl == NUL)
1227 {
1228 EMSG(_("E756: Spell checking is not enabled"));
1229 return TRUE;
1230 }
1231 return FALSE;
1232}
Bram Moolenaar51485f02005-06-04 21:55:20 +00001233
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001234/*
1235 * Move to next spell error.
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001236 * "curline" is TRUE for "z?": find word under/after cursor in the same line.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001237 * Return OK if found, FAIL otherwise.
1238 */
1239 int
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001240spell_move_to(dir, allwords, curline)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001241 int dir; /* FORWARD or BACKWARD */
1242 int allwords; /* TRUE for "[s" and "]s" */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001243 int curline;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001244{
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001245 linenr_T lnum;
1246 pos_T found_pos;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001247 char_u *line;
1248 char_u *p;
Bram Moolenaar0c405862005-06-22 22:26:26 +00001249 char_u *endp;
1250 int attr;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001251 int len;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001252 int has_syntax = syntax_present(curbuf);
1253 int col;
1254 int can_spell;
Bram Moolenaar0c405862005-06-22 22:26:26 +00001255 char_u *buf = NULL;
1256 int buflen = 0;
1257 int skip = 0;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001258
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00001259 if (no_spell_checking())
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001260 return FAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001261
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001262 /*
1263 * Start looking for bad word at the start of the line, because we can't
Bram Moolenaar0c405862005-06-22 22:26:26 +00001264 * start halfway a word, we don't know where the it starts or ends.
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001265 *
1266 * When searching backwards, we continue in the line to find the last
1267 * bad word (in the cursor line: before the cursor).
Bram Moolenaar0c405862005-06-22 22:26:26 +00001268 *
1269 * We concatenate the start of the next line, so that wrapped words work
1270 * (e.g. "et<line-break>cetera"). Doesn't work when searching backwards
1271 * though...
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001272 */
1273 lnum = curwin->w_cursor.lnum;
1274 found_pos.lnum = 0;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001275
1276 while (!got_int)
1277 {
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001278 line = ml_get(lnum);
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001279
Bram Moolenaar0c405862005-06-22 22:26:26 +00001280 len = STRLEN(line);
1281 if (buflen < len + MAXWLEN + 2)
1282 {
1283 vim_free(buf);
1284 buflen = len + MAXWLEN + 2;
1285 buf = alloc(buflen);
1286 if (buf == NULL)
1287 break;
1288 }
1289
1290 /* Copy the line into "buf" and append the start of the next line if
1291 * possible. */
1292 STRCPY(buf, line);
1293 if (lnum < curbuf->b_ml.ml_line_count)
1294 spell_cat_line(buf + STRLEN(buf), ml_get(lnum + 1), MAXWLEN);
1295
1296 p = buf + skip;
1297 endp = buf + len;
1298 while (p < endp)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001299 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00001300 /* When searching backward don't search after the cursor. */
1301 if (dir == BACKWARD
1302 && lnum == curwin->w_cursor.lnum
Bram Moolenaar0c405862005-06-22 22:26:26 +00001303 && (colnr_T)(p - buf) >= curwin->w_cursor.col)
Bram Moolenaar51485f02005-06-04 21:55:20 +00001304 break;
1305
1306 /* start of word */
Bram Moolenaar0c405862005-06-22 22:26:26 +00001307 attr = 0;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001308 len = spell_check(curwin, p, &attr);
1309
1310 if (attr != 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001311 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00001312 /* We found a bad word. Check the attribute. */
Bram Moolenaar51485f02005-06-04 21:55:20 +00001313 if (allwords || attr == highlight_attr[HLF_SPB])
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001314 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00001315 /* When searching forward only accept a bad word after
1316 * the cursor. */
1317 if (dir == BACKWARD
1318 || lnum > curwin->w_cursor.lnum
1319 || (lnum == curwin->w_cursor.lnum
Bram Moolenaar0c405862005-06-22 22:26:26 +00001320 && (colnr_T)(curline ? p - buf + len
1321 : p - buf)
Bram Moolenaar51485f02005-06-04 21:55:20 +00001322 > curwin->w_cursor.col))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001323 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00001324 if (has_syntax)
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001325 {
Bram Moolenaar0c405862005-06-22 22:26:26 +00001326 col = p - buf;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001327 (void)syn_get_id(lnum, (colnr_T)col,
1328 FALSE, &can_spell);
Bram Moolenaar51485f02005-06-04 21:55:20 +00001329 }
1330 else
1331 can_spell = TRUE;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001332
Bram Moolenaar51485f02005-06-04 21:55:20 +00001333 if (can_spell)
1334 {
1335 found_pos.lnum = lnum;
Bram Moolenaar0c405862005-06-22 22:26:26 +00001336 found_pos.col = p - buf;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001337#ifdef FEAT_VIRTUALEDIT
Bram Moolenaar51485f02005-06-04 21:55:20 +00001338 found_pos.coladd = 0;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001339#endif
Bram Moolenaar51485f02005-06-04 21:55:20 +00001340 if (dir == FORWARD)
1341 {
1342 /* No need to search further. */
1343 curwin->w_cursor = found_pos;
Bram Moolenaar0c405862005-06-22 22:26:26 +00001344 vim_free(buf);
Bram Moolenaar51485f02005-06-04 21:55:20 +00001345 return OK;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001346 }
1347 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001348 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001349 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001350 }
1351
Bram Moolenaar51485f02005-06-04 21:55:20 +00001352 /* advance to character after the word */
1353 p += len;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001354 }
1355
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001356 if (curline)
Bram Moolenaar0c405862005-06-22 22:26:26 +00001357 break; /* only check cursor line */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001358
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001359 /* Advance to next line. */
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001360 if (dir == BACKWARD)
1361 {
1362 if (found_pos.lnum != 0)
1363 {
1364 /* Use the last match in the line. */
1365 curwin->w_cursor = found_pos;
Bram Moolenaar0c405862005-06-22 22:26:26 +00001366 vim_free(buf);
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001367 return OK;
1368 }
1369 if (lnum == 1)
Bram Moolenaar0c405862005-06-22 22:26:26 +00001370 break;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001371 --lnum;
1372 }
1373 else
1374 {
1375 if (lnum == curbuf->b_ml.ml_line_count)
Bram Moolenaar0c405862005-06-22 22:26:26 +00001376 break;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001377 ++lnum;
Bram Moolenaar0c405862005-06-22 22:26:26 +00001378
1379 /* Skip the characters at the start of the next line that were
1380 * included in a match crossing line boundaries. */
1381 if (attr == 0)
1382 skip = p - endp;
1383 else
1384 skip = 0;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001385 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001386
1387 line_breakcheck();
1388 }
1389
Bram Moolenaar0c405862005-06-22 22:26:26 +00001390 vim_free(buf);
1391 return FAIL;
1392}
1393
1394/*
1395 * For spell checking: concatenate the start of the following line "line" into
1396 * "buf", blanking-out special characters. Copy less then "maxlen" bytes.
1397 */
1398 void
1399spell_cat_line(buf, line, maxlen)
1400 char_u *buf;
1401 char_u *line;
1402 int maxlen;
1403{
1404 char_u *p;
1405 int n;
1406
1407 p = skipwhite(line);
1408 while (vim_strchr((char_u *)"*#/\"\t", *p) != NULL)
1409 p = skipwhite(p + 1);
1410
1411 if (*p != NUL)
1412 {
1413 *buf = ' ';
1414 vim_strncpy(buf + 1, line, maxlen - 1);
1415 n = p - line;
1416 if (n >= maxlen)
1417 n = maxlen - 1;
1418 vim_memset(buf + 1, ' ', n);
1419 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001420}
1421
1422/*
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001423 * Load word list(s) for "lang" from Vim spell file(s).
Bram Moolenaarb765d632005-06-07 21:00:02 +00001424 * "lang" must be the language without the region: e.g., "en".
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001425 */
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001426 static void
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001427spell_load_lang(lang)
1428 char_u *lang;
1429{
Bram Moolenaarb765d632005-06-07 21:00:02 +00001430 char_u fname_enc[85];
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001431 int r;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001432 char_u langcp[MAXWLEN + 1];
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001433
Bram Moolenaarb765d632005-06-07 21:00:02 +00001434 /* Copy the language name to pass it to spell_load_cb() as a cookie.
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001435 * It's truncated when an error is detected. */
1436 STRCPY(langcp, lang);
1437
Bram Moolenaarb765d632005-06-07 21:00:02 +00001438 /*
1439 * Find the first spell file for "lang" in 'runtimepath' and load it.
1440 */
1441 vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5,
1442 "spell/%s.%s.spl", lang, spell_enc());
1443 r = do_in_runtimepath(fname_enc, FALSE, spell_load_cb, &langcp);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001444
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001445 if (r == FAIL && *langcp != NUL)
1446 {
1447 /* Try loading the ASCII version. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00001448 vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5,
Bram Moolenaar9c13b352005-05-19 20:53:52 +00001449 "spell/%s.ascii.spl", lang);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001450 r = do_in_runtimepath(fname_enc, FALSE, spell_load_cb, &langcp);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001451 }
1452
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001453 if (r == FAIL)
1454 smsg((char_u *)_("Warning: Cannot find word list \"%s\""),
1455 fname_enc + 6);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001456 else if (*langcp != NUL)
1457 {
1458 /* Load all the additions. */
1459 STRCPY(fname_enc + STRLEN(fname_enc) - 3, "add.spl");
1460 do_in_runtimepath(fname_enc, TRUE, spell_load_cb, &langcp);
1461 }
1462}
1463
1464/*
1465 * Return the encoding used for spell checking: Use 'encoding', except that we
1466 * use "latin1" for "latin9". And limit to 60 characters (just in case).
1467 */
1468 static char_u *
1469spell_enc()
1470{
1471
1472#ifdef FEAT_MBYTE
1473 if (STRLEN(p_enc) < 60 && STRCMP(p_enc, "iso-8859-15") != 0)
1474 return p_enc;
1475#endif
1476 return (char_u *)"latin1";
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001477}
1478
1479/*
1480 * Allocate a new slang_T.
1481 * Caller must fill "sl_next".
1482 */
1483 static slang_T *
1484slang_alloc(lang)
1485 char_u *lang;
1486{
1487 slang_T *lp;
1488
Bram Moolenaar51485f02005-06-04 21:55:20 +00001489 lp = (slang_T *)alloc_clear(sizeof(slang_T));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001490 if (lp != NULL)
1491 {
1492 lp->sl_name = vim_strsave(lang);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001493 ga_init2(&lp->sl_rep, sizeof(fromto_T), 10);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001494 ga_init2(&lp->sl_sal, sizeof(salitem_T), 10);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001495 }
1496 return lp;
1497}
1498
1499/*
1500 * Free the contents of an slang_T and the structure itself.
1501 */
1502 static void
1503slang_free(lp)
1504 slang_T *lp;
1505{
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001506 vim_free(lp->sl_name);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001507 vim_free(lp->sl_fname);
1508 slang_clear(lp);
1509 vim_free(lp);
1510}
1511
1512/*
1513 * Clear an slang_T so that the file can be reloaded.
1514 */
1515 static void
1516slang_clear(lp)
1517 slang_T *lp;
1518{
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001519 garray_T *gap;
1520 fromto_T *ftp;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001521 salitem_T *smp;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001522 int i;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001523
Bram Moolenaar51485f02005-06-04 21:55:20 +00001524 vim_free(lp->sl_fbyts);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001525 lp->sl_fbyts = NULL;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001526 vim_free(lp->sl_kbyts);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001527 lp->sl_kbyts = NULL;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001528 vim_free(lp->sl_pbyts);
1529 lp->sl_pbyts = NULL;
1530
Bram Moolenaar51485f02005-06-04 21:55:20 +00001531 vim_free(lp->sl_fidxs);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001532 lp->sl_fidxs = NULL;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001533 vim_free(lp->sl_kidxs);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001534 lp->sl_kidxs = NULL;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001535 vim_free(lp->sl_pidxs);
1536 lp->sl_pidxs = NULL;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001537
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001538 gap = &lp->sl_rep;
1539 while (gap->ga_len > 0)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001540 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001541 ftp = &((fromto_T *)gap->ga_data)[--gap->ga_len];
1542 vim_free(ftp->ft_from);
1543 vim_free(ftp->ft_to);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001544 }
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001545 ga_clear(gap);
1546
1547 gap = &lp->sl_sal;
1548 while (gap->ga_len > 0)
1549 {
1550 smp = &((salitem_T *)gap->ga_data)[--gap->ga_len];
1551 vim_free(smp->sm_lead);
1552 vim_free(smp->sm_to);
1553 }
1554 ga_clear(gap);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001555
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001556 for (i = 0; i < lp->sl_prefixcnt; ++i)
1557 vim_free(lp->sl_prefprog[i]);
1558 vim_free(lp->sl_prefprog);
1559
Bram Moolenaarea424162005-06-16 21:51:00 +00001560#ifdef FEAT_MBYTE
1561 {
1562 int todo = lp->sl_map_hash.ht_used;
1563 hashitem_T *hi;
1564
1565 for (hi = lp->sl_map_hash.ht_array; todo > 0; ++hi)
1566 if (!HASHITEM_EMPTY(hi))
1567 {
1568 --todo;
1569 vim_free(hi->hi_key);
1570 }
1571 }
1572 hash_clear(&lp->sl_map_hash);
1573#endif
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001574}
1575
1576/*
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001577 * Load one spell file and store the info into a slang_T.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001578 * Invoked through do_in_runtimepath().
1579 */
1580 static void
Bram Moolenaarb765d632005-06-07 21:00:02 +00001581spell_load_cb(fname, cookie)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001582 char_u *fname;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001583 void *cookie; /* points to the language name */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001584{
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001585 (void)spell_load_file(fname, (char_u *)cookie, NULL, FALSE);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001586}
1587
1588/*
1589 * Load one spell file and store the info into a slang_T.
1590 *
1591 * This is invoked in two ways:
1592 * - From spell_load_cb() to load a spell file for the first time. "lang" is
1593 * the language name, "old_lp" is NULL. Will allocate an slang_T.
1594 * - To reload a spell file that was changed. "lang" is NULL and "old_lp"
1595 * points to the existing slang_T.
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001596 * Returns the slang_T the spell file was loaded into. NULL for error.
Bram Moolenaarb765d632005-06-07 21:00:02 +00001597 */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001598 static slang_T *
1599spell_load_file(fname, lang, old_lp, silent)
Bram Moolenaarb765d632005-06-07 21:00:02 +00001600 char_u *fname;
1601 char_u *lang;
1602 slang_T *old_lp;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001603 int silent; /* no error if file doesn't exist */
Bram Moolenaarb765d632005-06-07 21:00:02 +00001604{
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001605 FILE *fd;
1606 char_u buf[MAXWLEN + 1];
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001607 char_u *p;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001608 char_u *bp;
1609 idx_T *ip;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001610 int i;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001611 int n;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001612 int len;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001613 int round;
1614 char_u *save_sourcing_name = sourcing_name;
1615 linenr_T save_sourcing_lnum = sourcing_lnum;
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001616 int cnt, ccnt;
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001617 char_u *fol;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001618 slang_T *lp = NULL;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001619 garray_T *gap;
1620 fromto_T *ftp;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001621 salitem_T *smp;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001622 int rr;
1623 short *first;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00001624 idx_T idx;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001625 int c = 0;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001626
Bram Moolenaarb765d632005-06-07 21:00:02 +00001627 fd = mch_fopen((char *)fname, "r");
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001628 if (fd == NULL)
1629 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001630 if (!silent)
1631 EMSG2(_(e_notopen), fname);
1632 else if (p_verbose > 2)
1633 {
1634 verbose_enter();
1635 smsg((char_u *)e_notopen, fname);
1636 verbose_leave();
1637 }
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001638 goto endFAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001639 }
Bram Moolenaarb765d632005-06-07 21:00:02 +00001640 if (p_verbose > 2)
1641 {
1642 verbose_enter();
1643 smsg((char_u *)_("Reading spell file \"%s\""), fname);
1644 verbose_leave();
1645 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001646
Bram Moolenaarb765d632005-06-07 21:00:02 +00001647 if (old_lp == NULL)
1648 {
1649 lp = slang_alloc(lang);
1650 if (lp == NULL)
1651 goto endFAIL;
1652
1653 /* Remember the file name, used to reload the file when it's updated. */
1654 lp->sl_fname = vim_strsave(fname);
1655 if (lp->sl_fname == NULL)
1656 goto endFAIL;
1657
1658 /* Check for .add.spl. */
1659 lp->sl_add = strstr((char *)gettail(fname), ".add.") != NULL;
1660 }
1661 else
1662 lp = old_lp;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001663
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001664 /* Set sourcing_name, so that error messages mention the file name. */
1665 sourcing_name = fname;
1666 sourcing_lnum = 0;
1667
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001668 /* <HEADER>: <fileID>
1669 * <regioncnt> <regionname> ...
1670 * <charflagslen> <charflags>
1671 * <fcharslen> <fchars>
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00001672 * <midwordlen> <midword>
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001673 * <prefcondcnt> <prefcond> ...
1674 */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001675 for (i = 0; i < VIMSPELLMAGICL; ++i)
1676 buf[i] = getc(fd); /* <fileID> */
1677 if (STRNCMP(buf, VIMSPELLMAGIC, VIMSPELLMAGICL) != 0)
1678 {
1679 EMSG(_("E757: Wrong file ID in spell file"));
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001680 goto endFAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001681 }
1682
1683 cnt = getc(fd); /* <regioncnt> */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001684 if (cnt < 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001685 {
1686truncerr:
1687 EMSG(_("E758: Truncated spell file"));
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001688 goto endFAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001689 }
1690 if (cnt > 8)
1691 {
1692formerr:
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001693 EMSG(_(e_format));
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001694 goto endFAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001695 }
1696 for (i = 0; i < cnt; ++i)
1697 {
1698 lp->sl_regions[i * 2] = getc(fd); /* <regionname> */
1699 lp->sl_regions[i * 2 + 1] = getc(fd);
1700 }
1701 lp->sl_regions[cnt * 2] = NUL;
1702
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001703 cnt = getc(fd); /* <charflagslen> */
1704 if (cnt > 0)
1705 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00001706 p = alloc((unsigned)cnt);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001707 if (p == NULL)
1708 goto endFAIL;
1709 for (i = 0; i < cnt; ++i)
1710 p[i] = getc(fd); /* <charflags> */
1711
1712 ccnt = (getc(fd) << 8) + getc(fd); /* <fcharslen> */
1713 if (ccnt <= 0)
Bram Moolenaar51485f02005-06-04 21:55:20 +00001714 {
1715 vim_free(p);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001716 goto formerr;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001717 }
1718 fol = alloc((unsigned)ccnt + 1);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001719 if (fol == NULL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00001720 {
1721 vim_free(p);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001722 goto endFAIL;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001723 }
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001724 for (i = 0; i < ccnt; ++i)
1725 fol[i] = getc(fd); /* <fchars> */
1726 fol[i] = NUL;
1727
Bram Moolenaar9f30f502005-06-14 22:01:04 +00001728 /* Set the word-char flags and fill SPELL_ISUPPER() table. */
Bram Moolenaar51485f02005-06-04 21:55:20 +00001729 i = set_spell_charflags(p, cnt, fol);
1730 vim_free(p);
1731 vim_free(fol);
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00001732#if 0 /* tolerate the differences */
Bram Moolenaar51485f02005-06-04 21:55:20 +00001733 if (i == FAIL)
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001734 goto formerr;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00001735#endif
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001736 }
1737 else
1738 {
1739 /* When <charflagslen> is zero then <fcharlen> must also be zero. */
1740 cnt = (getc(fd) << 8) + getc(fd);
1741 if (cnt != 0)
1742 goto formerr;
1743 }
1744
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00001745 /* <midwordlen> <midword> */
1746 cnt = (getc(fd) << 8) + getc(fd);
1747 if (cnt < 0)
1748 goto truncerr;
1749 if (cnt > 0)
1750 {
1751 for (i = 0; i < cnt; ++i)
1752 if (i < MAXWLEN) /* truncate at reasonable length */
1753 buf[i] = getc(fd);
1754 if (i < MAXWLEN)
1755 buf[i] = NUL;
1756 else
1757 buf[MAXWLEN] = NUL;
1758
1759 /* The midword characters add up to any midword characters from other
1760 * .spel files. */
1761 for (p = buf; *p != NUL; )
1762#ifdef FEAT_MBYTE
1763 if (has_mbyte)
1764 {
1765 c = mb_ptr2char(p);
1766 i = mb_ptr2len_check(p);
1767 if (c < 256)
1768 spell_ismw[c] = TRUE;
1769 else if (spell_ismw_mb == NULL)
1770 /* First multi-byte char in "spell_ismw_mb". */
1771 spell_ismw_mb = vim_strnsave(p, i);
1772 else
1773 {
1774 /* Append multi-byte chars to "spell_ismw_mb". */
1775 n = STRLEN(spell_ismw_mb);
1776 bp = vim_strnsave(spell_ismw_mb, n + i);
1777 if (bp != NULL)
1778 {
1779 vim_free(spell_ismw_mb);
1780 spell_ismw_mb = bp;
1781 vim_strncpy(bp + n, p, i);
1782 }
1783 }
1784 p += i;
1785 }
1786 else
1787#endif
1788 spell_ismw[*p++] = TRUE;
1789 }
1790
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001791 /* <prefcondcnt> <prefcond> ... */
1792 cnt = (getc(fd) << 8) + getc(fd); /* <prefcondcnt> */
1793 if (cnt > 0)
1794 {
1795 lp->sl_prefprog = (regprog_T **)alloc_clear(
1796 (unsigned)sizeof(regprog_T *) * cnt);
1797 if (lp->sl_prefprog == NULL)
1798 goto endFAIL;
1799 lp->sl_prefixcnt = cnt;
1800
1801 for (i = 0; i < cnt; ++i)
1802 {
1803 /* <prefcond> : <condlen> <condstr> */
1804 n = getc(fd); /* <condlen> */
1805 if (n < 0)
1806 goto formerr;
1807 /* When <condlen> is zero we have an empty condition. Otherwise
1808 * compile the regexp program used to check for the condition. */
1809 if (n > 0)
1810 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001811 buf[0] = '^'; /* always match at one position only */
1812 p = buf + 1;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001813 while (n-- > 0)
1814 *p++ = getc(fd); /* <condstr> */
1815 *p = NUL;
1816 lp->sl_prefprog[i] = vim_regcomp(buf, RE_MAGIC + RE_STRING);
1817 }
1818 }
1819 }
1820
1821
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001822 /* <SUGGEST> : <repcount> <rep> ...
1823 * <salflags> <salcount> <sal> ...
1824 * <maplen> <mapstr> */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001825
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001826 cnt = (getc(fd) << 8) + getc(fd); /* <repcount> */
1827 if (cnt < 0)
1828 goto formerr;
1829
1830 gap = &lp->sl_rep;
1831 if (ga_grow(gap, cnt) == FAIL)
1832 goto endFAIL;
1833
1834 /* <rep> : <repfromlen> <repfrom> <reptolen> <repto> */
1835 for (; gap->ga_len < cnt; ++gap->ga_len)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001836 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001837 ftp = &((fromto_T *)gap->ga_data)[gap->ga_len];
1838 for (rr = 1; rr <= 2; ++rr)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001839 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001840 ccnt = getc(fd);
1841 if (ccnt < 0)
1842 {
1843 if (rr == 2)
1844 vim_free(ftp->ft_from);
1845 goto formerr;
1846 }
1847 if ((p = alloc(ccnt + 1)) == NULL)
1848 {
1849 if (rr == 2)
1850 vim_free(ftp->ft_from);
1851 goto endFAIL;
1852 }
1853 for (i = 0; i < ccnt; ++i)
1854 p[i] = getc(fd); /* <repfrom> or <repto> */
1855 p[i] = NUL;
1856 if (rr == 1)
1857 ftp->ft_from = p;
1858 else
1859 ftp->ft_to = p;
1860 }
1861 }
1862
1863 /* Fill the first-index table. */
1864 first = lp->sl_rep_first;
1865 for (i = 0; i < 256; ++i)
1866 first[i] = -1;
1867 for (i = 0; i < gap->ga_len; ++i)
1868 {
1869 ftp = &((fromto_T *)gap->ga_data)[i];
1870 if (first[*ftp->ft_from] == -1)
1871 first[*ftp->ft_from] = i;
1872 }
1873
1874 i = getc(fd); /* <salflags> */
1875 if (i & SAL_F0LLOWUP)
1876 lp->sl_followup = TRUE;
1877 if (i & SAL_COLLAPSE)
1878 lp->sl_collapse = TRUE;
1879 if (i & SAL_REM_ACCENTS)
1880 lp->sl_rem_accents = TRUE;
1881
1882 cnt = (getc(fd) << 8) + getc(fd); /* <salcount> */
1883 if (cnt < 0)
1884 goto formerr;
1885
1886 gap = &lp->sl_sal;
1887 if (ga_grow(gap, cnt) == FAIL)
1888 goto endFAIL;
1889
1890 /* <sal> : <salfromlen> <salfrom> <saltolen> <salto> */
1891 for (; gap->ga_len < cnt; ++gap->ga_len)
1892 {
1893 smp = &((salitem_T *)gap->ga_data)[gap->ga_len];
1894 ccnt = getc(fd); /* <salfromlen> */
1895 if (ccnt < 0)
1896 goto formerr;
1897 if ((p = alloc(ccnt + 2)) == NULL)
1898 goto endFAIL;
1899 smp->sm_lead = p;
1900
1901 /* Read up to the first special char into sm_lead. */
1902 for (i = 0; i < ccnt; ++i)
1903 {
1904 c = getc(fd); /* <salfrom> */
1905 if (vim_strchr((char_u *)"0123456789(-<^$", c) != NULL)
1906 break;
1907 *p++ = c;
1908 }
1909 smp->sm_leadlen = p - smp->sm_lead;
1910 *p++ = NUL;
1911
1912 /* Put optional chars in sm_oneoff, if any. */
1913 if (c == '(')
1914 {
1915 smp->sm_oneoff = p;
1916 for (++i; i < ccnt; ++i)
1917 {
1918 c = getc(fd); /* <salfrom> */
1919 if (c == ')')
1920 break;
1921 *p++ = c;
1922 }
1923 *p++ = NUL;
1924 if (++i < ccnt)
1925 c = getc(fd);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001926 }
1927 else
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001928 smp->sm_oneoff = NULL;
1929
1930 /* Any following chars go in sm_rules. */
1931 smp->sm_rules = p;
1932 if (i < ccnt)
1933 *p++ = c;
1934 for (++i; i < ccnt; ++i)
1935 *p++ = getc(fd); /* <salfrom> */
1936 *p++ = NUL;
1937
1938 ccnt = getc(fd); /* <saltolen> */
1939 if (ccnt < 0)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001940 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001941 vim_free(smp->sm_lead);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001942 goto formerr;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001943 }
1944 if ((p = alloc(ccnt + 1)) == NULL)
1945 {
1946 vim_free(smp->sm_lead);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001947 goto endFAIL;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001948 }
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001949 smp->sm_to = p;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001950
Bram Moolenaard857f0e2005-06-21 22:37:39 +00001951 for (i = 0; i < ccnt; ++i)
1952 *p++ = getc(fd); /* <salto> */
1953 *p++ = NUL;
1954 }
1955
1956 /* Fill the first-index table. */
1957 first = lp->sl_sal_first;
1958 for (i = 0; i < 256; ++i)
1959 first[i] = -1;
1960 for (i = 0; i < gap->ga_len; ++i)
1961 {
1962 smp = &((salitem_T *)gap->ga_data)[i];
1963 if (first[*smp->sm_lead] == -1)
1964 first[*smp->sm_lead] = i;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001965 }
1966
1967 cnt = (getc(fd) << 8) + getc(fd); /* <maplen> */
1968 if (cnt < 0)
1969 goto formerr;
1970 p = alloc(cnt + 1);
1971 if (p == NULL)
1972 goto endFAIL;
1973 for (i = 0; i < cnt; ++i)
1974 p[i] = getc(fd); /* <mapstr> */
1975 p[i] = NUL;
Bram Moolenaarea424162005-06-16 21:51:00 +00001976 set_map_str(lp, p);
1977 vim_free(p);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001978
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001979
Bram Moolenaar51485f02005-06-04 21:55:20 +00001980 /* round 1: <LWORDTREE>
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001981 * round 2: <KWORDTREE>
1982 * round 3: <PREFIXTREE> */
1983 for (round = 1; round <= 3; ++round)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001984 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00001985 /* The tree size was computed when writing the file, so that we can
1986 * allocate it as one long block. <nodecount> */
1987 len = (getc(fd) << 24) + (getc(fd) << 16) + (getc(fd) << 8) + getc(fd);
1988 if (len < 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001989 goto truncerr;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001990 if (len > 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001991 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00001992 /* Allocate the byte array. */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001993 bp = lalloc((long_u)len, TRUE);
1994 if (bp == NULL)
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001995 goto endFAIL;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001996 if (round == 1)
Bram Moolenaar1d73c882005-06-19 22:48:47 +00001997 lp->sl_fbyts = bp;
1998 else if (round == 2)
1999 lp->sl_kbyts = bp;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00002000 else
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002001 lp->sl_pbyts = bp;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002002
2003 /* Allocate the index array. */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002004 ip = (idx_T *)lalloc_clear((long_u)(len * sizeof(int)), TRUE);
2005 if (ip == NULL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002006 goto endFAIL;
2007 if (round == 1)
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002008 lp->sl_fidxs = ip;
2009 else if (round == 2)
2010 lp->sl_kidxs = ip;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002011 else
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002012 lp->sl_pidxs = ip;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002013
2014 /* Read the tree and store it in the array. */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002015 idx = read_tree(fd, bp, ip, len, 0, round == 3, lp->sl_prefixcnt);
Bram Moolenaar9f30f502005-06-14 22:01:04 +00002016 if (idx == -1)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002017 goto truncerr;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00002018 if (idx < 0)
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002019 goto formerr;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002020 }
2021 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00002022
Bram Moolenaarb765d632005-06-07 21:00:02 +00002023 /* For a new file link it in the list of spell files. */
2024 if (old_lp == NULL)
2025 {
2026 lp->sl_next = first_lang;
2027 first_lang = lp;
2028 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002029
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002030 goto endOK;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002031
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002032endFAIL:
Bram Moolenaarb765d632005-06-07 21:00:02 +00002033 if (lang != NULL)
2034 /* truncating the name signals the error to spell_load_lang() */
2035 *lang = NUL;
2036 if (lp != NULL && old_lp == NULL)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002037 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002038 slang_free(lp);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002039 lp = NULL;
2040 }
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002041
2042endOK:
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002043 if (fd != NULL)
2044 fclose(fd);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002045 sourcing_name = save_sourcing_name;
2046 sourcing_lnum = save_sourcing_lnum;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002047
2048 return lp;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002049}
2050
2051/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00002052 * Read one row of siblings from the spell file and store it in the byte array
2053 * "byts" and index array "idxs". Recursively read the children.
2054 *
Bram Moolenaar0c405862005-06-22 22:26:26 +00002055 * NOTE: The code here must match put_node().
Bram Moolenaar51485f02005-06-04 21:55:20 +00002056 *
2057 * Returns the index follosing the siblings.
2058 * Returns -1 if the file is shorter than expected.
2059 * Returns -2 if there is a format error.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002060 */
Bram Moolenaar9f30f502005-06-14 22:01:04 +00002061 static idx_T
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002062read_tree(fd, byts, idxs, maxidx, startidx, prefixtree, maxprefcondnr)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002063 FILE *fd;
2064 char_u *byts;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00002065 idx_T *idxs;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002066 int maxidx; /* size of arrays */
Bram Moolenaar9f30f502005-06-14 22:01:04 +00002067 idx_T startidx; /* current index in "byts" and "idxs" */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002068 int prefixtree; /* TRUE for reading PREFIXTREE */
2069 int maxprefcondnr; /* maximum for <prefcondnr> */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002070{
Bram Moolenaar51485f02005-06-04 21:55:20 +00002071 int len;
2072 int i;
2073 int n;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00002074 idx_T idx = startidx;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002075 int c;
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002076 int c2;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002077#define SHARED_MASK 0x8000000
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002078
Bram Moolenaar51485f02005-06-04 21:55:20 +00002079 len = getc(fd); /* <siblingcount> */
2080 if (len <= 0)
2081 return -1;
2082
2083 if (startidx + len >= maxidx)
2084 return -2;
2085 byts[idx++] = len;
2086
2087 /* Read the byte values, flag/region bytes and shared indexes. */
2088 for (i = 1; i <= len; ++i)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002089 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00002090 c = getc(fd); /* <byte> */
2091 if (c < 0)
2092 return -1;
2093 if (c <= BY_SPECIAL)
2094 {
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002095 if (c == BY_NOFLAGS && !prefixtree)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002096 {
2097 /* No flags, all regions. */
2098 idxs[idx] = 0;
2099 c = 0;
2100 }
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002101 else if (c == BY_FLAGS || c == BY_NOFLAGS)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002102 {
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002103 if (prefixtree)
2104 {
2105 /* Read the prefix ID and the condition nr. In idxs[]
2106 * store the prefix ID in the low byte, the condition
2107 * index shifted up 8 bits. */
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002108 c2 = getc(fd); /* <prefixID> */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002109 n = (getc(fd) << 8) + getc(fd); /* <prefcondnr> */
2110 if (n >= maxprefcondnr)
2111 return -2;
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002112 c2 += (n << 8);
2113 if (c == BY_NOFLAGS)
2114 c = c2;
2115 else
2116 c = c2 | WF_RAREPFX;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002117 }
2118 else
2119 {
2120 /* Read flags and optional region and prefix ID. In
2121 * idxs[] the flags go in the low byte, region above that
2122 * and prefix ID above the region. */
2123 c = getc(fd); /* <flags> */
2124 if (c & WF_REGION)
2125 c = (getc(fd) << 8) + c; /* <region> */
2126 if (c & WF_PFX)
2127 c = (getc(fd) << 16) + c; /* <prefixID> */
2128 }
2129
Bram Moolenaar51485f02005-06-04 21:55:20 +00002130 idxs[idx] = c;
2131 c = 0;
2132 }
2133 else /* c == BY_INDEX */
2134 {
2135 /* <nodeidx> */
2136 n = (getc(fd) << 16) + (getc(fd) << 8) + getc(fd);
2137 if (n < 0 || n >= maxidx)
2138 return -2;
2139 idxs[idx] = n + SHARED_MASK;
2140 c = getc(fd); /* <xbyte> */
2141 }
2142 }
2143 byts[idx++] = c;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002144 }
2145
Bram Moolenaar51485f02005-06-04 21:55:20 +00002146 /* Recursively read the children for non-shared siblings.
2147 * Skip the end-of-word ones (zero byte value) and the shared ones (and
2148 * remove SHARED_MASK) */
2149 for (i = 1; i <= len; ++i)
2150 if (byts[startidx + i] != 0)
2151 {
2152 if (idxs[startidx + i] & SHARED_MASK)
2153 idxs[startidx + i] &= ~SHARED_MASK;
2154 else
2155 {
2156 idxs[startidx + i] = idx;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002157 idx = read_tree(fd, byts, idxs, maxidx, idx,
2158 prefixtree, maxprefcondnr);
Bram Moolenaar51485f02005-06-04 21:55:20 +00002159 if (idx < 0)
2160 break;
2161 }
2162 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002163
Bram Moolenaar51485f02005-06-04 21:55:20 +00002164 return idx;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002165}
2166
2167/*
2168 * Parse 'spelllang' and set buf->b_langp accordingly.
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002169 * Returns NULL if it's OK, an error message otherwise.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002170 */
2171 char_u *
2172did_set_spelllang(buf)
2173 buf_T *buf;
2174{
2175 garray_T ga;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002176 char_u *splp;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002177 char_u *region;
Bram Moolenaar0a5fe212005-06-24 23:01:23 +00002178 int filename;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002179 int region_mask;
2180 slang_T *lp;
2181 int c;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002182 char_u lang[MAXWLEN + 1];
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002183 char_u spf_name[MAXPATHL];
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002184 int load_spf;
2185 int len;
2186 char_u *p;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002187
2188 ga_init2(&ga, sizeof(langp_T), 2);
2189
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002190 /* Make the name of the .spl file associated with 'spellfile'. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002191 if (*buf->b_p_spf == NUL)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002192 load_spf = FALSE;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002193 else
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002194 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002195 vim_snprintf((char *)spf_name, sizeof(spf_name), "%s.spl",
2196 buf->b_p_spf);
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002197 load_spf = TRUE;
2198 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002199
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002200 /* loop over comma separated language names. */
2201 for (splp = buf->b_p_spl; *splp != NUL; )
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002202 {
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002203 /* Get one language name. */
2204 copy_option_part(&splp, lang, MAXWLEN, ",");
2205
Bram Moolenaar5482f332005-04-17 20:18:43 +00002206 region = NULL;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002207 len = STRLEN(lang);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002208
Bram Moolenaar0a5fe212005-06-24 23:01:23 +00002209 /* If the name ends in ".spl" use it as the name of the spell file.
2210 * If there is a region name let "region" point to it and remove it
2211 * from the name. */
2212 if (len > 4 && fnamecmp(lang + len - 4, ".spl") == 0)
2213 {
2214 filename = TRUE;
2215
2216 /* Check if we loaded this language before. */
2217 for (lp = first_lang; lp != NULL; lp = lp->sl_next)
2218 if (fullpathcmp(lang, lp->sl_fname, FALSE) == FPC_SAME)
2219 break;
2220 }
2221 else
2222 {
2223 filename = FALSE;
2224 if (len > 3 && lang[len - 3] == '_')
2225 {
2226 region = lang + len - 2;
2227 len -= 3;
2228 lang[len] = NUL;
2229 }
2230
2231 /* Check if we loaded this language before. */
2232 for (lp = first_lang; lp != NULL; lp = lp->sl_next)
2233 if (STRICMP(lang, lp->sl_name) == 0)
2234 break;
2235 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002236
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002237 /* If not found try loading the language now. */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002238 if (lp == NULL)
Bram Moolenaar0a5fe212005-06-24 23:01:23 +00002239 {
2240 if (filename)
2241 (void)spell_load_file(lang, lang, NULL, FALSE);
2242 else
2243 spell_load_lang(lang);
2244 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002245
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002246 /*
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002247 * Loop over the languages, there can be several files for "lang".
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002248 */
2249 for (lp = first_lang; lp != NULL; lp = lp->sl_next)
Bram Moolenaar0a5fe212005-06-24 23:01:23 +00002250 if (filename ? fullpathcmp(lang, lp->sl_fname, FALSE) == FPC_SAME
2251 : STRICMP(lang, lp->sl_name) == 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002252 {
Bram Moolenaar3982c542005-06-08 21:56:31 +00002253 region_mask = REGION_ALL;
Bram Moolenaar0a5fe212005-06-24 23:01:23 +00002254 if (!filename && region != NULL)
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002255 {
2256 /* find region in sl_regions */
2257 c = find_region(lp->sl_regions, region);
2258 if (c == REGION_ALL)
2259 {
Bram Moolenaar3982c542005-06-08 21:56:31 +00002260 if (!lp->sl_add)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002261 smsg((char_u *)
2262 _("Warning: region %s not supported"),
2263 region);
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002264 }
2265 else
2266 region_mask = 1 << c;
2267 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002268
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002269 if (ga_grow(&ga, 1) == FAIL)
2270 {
2271 ga_clear(&ga);
2272 return e_outofmem;
2273 }
2274 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = lp;
2275 LANGP_ENTRY(ga, ga.ga_len)->lp_region = region_mask;
2276 ++ga.ga_len;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002277
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002278 /* Check if this is the spell file related to 'spellfile'. */
2279 if (load_spf && fullpathcmp(spf_name, lp->sl_fname, FALSE)
2280 == FPC_SAME)
2281 load_spf = FALSE;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002282 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002283 }
2284
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002285 /*
2286 * Make sure the 'spellfile' file is loaded. It may be in 'runtimepath',
2287 * then it's probably loaded above already. Otherwise load it here.
2288 */
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002289 if (load_spf)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002290 {
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002291 /* Check if it was loaded already. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002292 for (lp = first_lang; lp != NULL; lp = lp->sl_next)
2293 if (fullpathcmp(spf_name, lp->sl_fname, FALSE) == FPC_SAME)
2294 break;
2295 if (lp == NULL)
2296 {
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002297 /* Not loaded, try loading it now. The language name includes the
2298 * region name, the region is ignored otherwise. */
2299 vim_strncpy(lang, gettail(buf->b_p_spf), MAXWLEN);
2300 p = vim_strchr(lang, '.');
2301 if (p != NULL)
2302 *p = NUL; /* truncate at ".encoding.add" */
2303 lp = spell_load_file(spf_name, lang, NULL, TRUE);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002304 }
2305 if (lp != NULL && ga_grow(&ga, 1) == OK)
2306 {
2307 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = lp;
2308 LANGP_ENTRY(ga, ga.ga_len)->lp_region = REGION_ALL;
2309 ++ga.ga_len;
2310 }
2311 }
2312
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002313 /* Add a NULL entry to mark the end of the list. */
2314 if (ga_grow(&ga, 1) == FAIL)
2315 {
2316 ga_clear(&ga);
2317 return e_outofmem;
2318 }
2319 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = NULL;
2320 ++ga.ga_len;
2321
2322 /* Everything is fine, store the new b_langp value. */
2323 ga_clear(&buf->b_langp);
2324 buf->b_langp = ga;
2325
2326 return NULL;
2327}
2328
2329/*
2330 * Find the region "region[2]" in "rp" (points to "sl_regions").
2331 * Each region is simply stored as the two characters of it's name.
2332 * Returns the index if found, REGION_ALL if not found.
2333 */
2334 static int
2335find_region(rp, region)
2336 char_u *rp;
2337 char_u *region;
2338{
2339 int i;
2340
2341 for (i = 0; ; i += 2)
2342 {
2343 if (rp[i] == NUL)
2344 return REGION_ALL;
2345 if (rp[i] == region[0] && rp[i + 1] == region[1])
2346 break;
2347 }
2348 return i / 2;
2349}
2350
2351/*
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002352 * Return case type of word:
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002353 * w word 0
Bram Moolenaar51485f02005-06-04 21:55:20 +00002354 * Word WF_ONECAP
2355 * W WORD WF_ALLCAP
2356 * WoRd wOrd WF_KEEPCAP
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002357 */
2358 static int
2359captype(word, end)
2360 char_u *word;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002361 char_u *end; /* When NULL use up to NUL byte. */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002362{
2363 char_u *p;
2364 int c;
2365 int firstcap;
2366 int allcap;
2367 int past_second = FALSE; /* past second word char */
2368
2369 /* find first letter */
Bram Moolenaarea408852005-06-25 22:49:46 +00002370 for (p = word; !spell_iswordp(p); mb_ptr_adv(p))
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002371 if (end == NULL ? *p == NUL : p >= end)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002372 return 0; /* only non-word characters, illegal word */
2373#ifdef FEAT_MBYTE
Bram Moolenaarb765d632005-06-07 21:00:02 +00002374 if (has_mbyte)
2375 c = mb_ptr2char_adv(&p);
2376 else
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002377#endif
Bram Moolenaarb765d632005-06-07 21:00:02 +00002378 c = *p++;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00002379 firstcap = allcap = SPELL_ISUPPER(c);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002380
2381 /*
2382 * Need to check all letters to find a word with mixed upper/lower.
2383 * But a word with an upper char only at start is a ONECAP.
2384 */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002385 for ( ; end == NULL ? *p != NUL : p < end; mb_ptr_adv(p))
Bram Moolenaarea408852005-06-25 22:49:46 +00002386 if (spell_iswordp(p))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002387 {
2388#ifdef FEAT_MBYTE
2389 c = mb_ptr2char(p);
2390#else
2391 c = *p;
2392#endif
Bram Moolenaar9f30f502005-06-14 22:01:04 +00002393 if (!SPELL_ISUPPER(c))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002394 {
2395 /* UUl -> KEEPCAP */
2396 if (past_second && allcap)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002397 return WF_KEEPCAP;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002398 allcap = FALSE;
2399 }
2400 else if (!allcap)
2401 /* UlU -> KEEPCAP */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002402 return WF_KEEPCAP;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002403 past_second = TRUE;
2404 }
2405
2406 if (allcap)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002407 return WF_ALLCAP;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002408 if (firstcap)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002409 return WF_ONECAP;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002410 return 0;
2411}
2412
# if defined(FEAT_MBYTE) || defined(EXITFREE) || defined(PROTO)
/*
 * Free all languages: clear the language list of every buffer, free every
 * entry in the "first_lang" list and reset the spell character table.
 */
    void
spell_free_all()
{
    slang_T     *slang;
    buf_T       *buf;

    /* Go through all buffers and clear their list of languages. */
    for (buf = firstbuf; buf != NULL; buf = buf->b_next)
        ga_clear(&buf->b_langp);

    /* Unlink each language from the list before freeing it. */
    while ((slang = first_lang) != NULL)
    {
        first_lang = slang->sl_next;
        slang_free(slang);
    }

    init_spell_chartab();
}
# endif
2437
# if defined(FEAT_MBYTE) || defined(PROTO)
/*
 * Clear all spelling tables and reload them.
 * Used after 'encoding' is set and when ":mkspell" was used.
 */
    void
spell_reload()
{
    buf_T       *buf;
    win_T       *wp;

    /* Initialize the table for spell_iswordp(). */
    init_spell_chartab();

    /* Unload all allocated memory. */
    spell_free_all();

    /* Go through all buffers and handle 'spelllang'. */
    for (buf = firstbuf; buf != NULL; buf = buf->b_next)
    {
        /* Nothing to load when 'spelllang' is empty. */
        if (*buf->b_p_spl == NUL)
            continue;

        /* Only load the wordlists when there is a window for this buffer in
         * which 'spell' is set.  One such window is enough. */
        FOR_ALL_WINDOWS(wp)
            if (wp->w_buffer == buf && wp->w_p_spell)
            {
                (void)did_set_spelllang(buf);
# ifdef FEAT_WINDOWS
                break;
# endif
            }
    }
}
# endif
2474
Bram Moolenaarb765d632005-06-07 21:00:02 +00002475/*
2476 * Reload the spell file "fname" if it's loaded.
2477 */
2478 static void
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002479spell_reload_one(fname, added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00002480 char_u *fname;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002481 int added_word; /* invoked through "zg" */
Bram Moolenaarb765d632005-06-07 21:00:02 +00002482{
2483 slang_T *lp;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002484 int didit = FALSE;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002485
Bram Moolenaarb765d632005-06-07 21:00:02 +00002486 for (lp = first_lang; lp != NULL; lp = lp->sl_next)
2487 if (fullpathcmp(fname, lp->sl_fname, FALSE) == FPC_SAME)
2488 {
2489 slang_clear(lp);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002490 (void)spell_load_file(fname, NULL, lp, FALSE);
Bram Moolenaarb765d632005-06-07 21:00:02 +00002491 redraw_all_later(NOT_VALID);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002492 didit = TRUE;
Bram Moolenaarb765d632005-06-07 21:00:02 +00002493 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002494
2495 /* When "zg" was used and the file wasn't loaded yet, should redo
2496 * 'spelllang' to get it loaded. */
2497 if (added_word && !didit)
2498 did_set_spelllang(curbuf);
Bram Moolenaarb765d632005-06-07 21:00:02 +00002499}
2500
2501
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002502/*
2503 * Functions for ":mkspell".
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002504 */
2505
/* Maximum length in bytes of a line in a .aff and .dic file. */
#define MAXLINELEN  500
2508/*
2509 * Main structure to store the contents of a ".aff" file.
2510 */
2511typedef struct afffile_S
2512{
2513 char_u *af_enc; /* "SET", normalized, alloc'ed string or NULL */
Bram Moolenaarb765d632005-06-07 21:00:02 +00002514 int af_rar; /* RAR ID for rare word */
2515 int af_kep; /* KEP ID for keep-case word */
Bram Moolenaar0c405862005-06-22 22:26:26 +00002516 int af_bad; /* BAD ID for banned word */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002517 int af_pfxpostpone; /* postpone prefixes without chop string */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002518 hashtab_T af_pref; /* hashtable for prefixes, affheader_T */
2519 hashtab_T af_suff; /* hashtable for suffixes, affheader_T */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002520} afffile_T;
2521
2522typedef struct affentry_S affentry_T;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002523/* Affix entry from ".aff" file. Used for prefixes and suffixes. */
2524struct affentry_S
2525{
2526 affentry_T *ae_next; /* next affix with same name/number */
2527 char_u *ae_chop; /* text to chop off basic word (can be NULL) */
2528 char_u *ae_add; /* text to add to basic word (can be NULL) */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002529 char_u *ae_cond; /* condition (NULL for ".") */
2530 regprog_T *ae_prog; /* regexp program for ae_cond or NULL */
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002531 int ae_rare; /* rare affix */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002532};
2533
2534/* Affix header from ".aff" file. Used for af_pref and af_suff. */
2535typedef struct affheader_S
2536{
2537 char_u ah_key[2]; /* key for hashtable == name of affix entry */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002538 int ah_newID; /* prefix ID after renumbering */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002539 int ah_combine; /* suffix may combine with prefix */
2540 affentry_T *ah_first; /* first affix entry */
2541} affheader_T;
2542
2543#define HI2AH(hi) ((affheader_T *)(hi)->hi_key)
2544
2545/*
2546 * Structure that is used to store the items in the word tree. This avoids
2547 * the need to keep track of each allocated thing, it's freed all at once
2548 * after ":mkspell" is done.
2549 */
2550#define SBLOCKSIZE 16000 /* size of sb_data */
2551typedef struct sblock_S sblock_T;
2552struct sblock_S
2553{
2554 sblock_T *sb_next; /* next block in list */
2555 int sb_used; /* nr of bytes already in use */
2556 char_u sb_data[1]; /* data, actually longer */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002557};
2558
2559/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00002560 * A node in the tree.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002561 */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002562typedef struct wordnode_S wordnode_T;
2563struct wordnode_S
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002564{
Bram Moolenaar0c405862005-06-22 22:26:26 +00002565 union /* shared to save space */
2566 {
2567 char_u hashkey[6]; /* room for the hash key */
2568 int index; /* index in written nodes (valid after first
2569 round) */
2570 } wn_u1;
2571 union /* shared to save space */
2572 {
2573 wordnode_T *next; /* next node with same hash key */
2574 wordnode_T *wnode; /* parent node that will write this node */
2575 } wn_u2;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002576 wordnode_T *wn_child; /* child (next byte in word) */
2577 wordnode_T *wn_sibling; /* next sibling (alternate byte in word,
2578 always sorted) */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002579 char_u wn_byte; /* Byte for this node. NUL for word end */
2580 char_u wn_flags; /* when wn_byte is NUL: WF_ flags */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002581 short wn_region; /* when wn_byte is NUL: region mask; for
2582 PREFIXTREE it's the prefcondnr */
2583 char_u wn_prefixID; /* supported/required prefix ID or 0 */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002584};
2585
/* Get the wordnode_T pointer stored as the key of hashtable item "hi".
 * The expansion is fully parenthesized, like HI2AH(), so that the result can
 * be used directly in a larger expression such as HI2WN(hi)->wn_child. */
#define HI2WN(hi)   ((wordnode_T *)((hi)->hi_key))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002587
Bram Moolenaar51485f02005-06-04 21:55:20 +00002588/*
2589 * Info used while reading the spell files.
2590 */
2591typedef struct spellinfo_S
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002592{
Bram Moolenaar51485f02005-06-04 21:55:20 +00002593 wordnode_T *si_foldroot; /* tree with case-folded words */
Bram Moolenaar8db73182005-06-17 21:51:16 +00002594 long si_foldwcount; /* nr of words in si_foldroot */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002595 wordnode_T *si_keeproot; /* tree with keep-case words */
Bram Moolenaar8db73182005-06-17 21:51:16 +00002596 long si_keepwcount; /* nr of words in si_keeproot */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002597 wordnode_T *si_prefroot; /* tree with postponed prefixes */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002598 sblock_T *si_blocks; /* memory blocks used */
2599 int si_ascii; /* handling only ASCII words */
Bram Moolenaarb765d632005-06-07 21:00:02 +00002600 int si_add; /* addition file */
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00002601 int si_clear_chartab; /* when TRUE clear char tables */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002602 int si_region; /* region mask */
2603 vimconv_T si_conv; /* for conversion to 'encoding' */
Bram Moolenaar50cde822005-06-05 21:54:54 +00002604 int si_memtot; /* runtime memory used */
Bram Moolenaarb765d632005-06-07 21:00:02 +00002605 int si_verbose; /* verbose messages */
Bram Moolenaar3982c542005-06-08 21:56:31 +00002606 int si_region_count; /* number of regions supported (1 when there
2607 are no regions) */
2608 char_u si_region_name[16]; /* region names (if count > 1) */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002609
2610 garray_T si_rep; /* list of fromto_T entries from REP lines */
2611 garray_T si_sal; /* list of fromto_T entries from SAL lines */
2612 int si_followup; /* soundsalike: ? */
2613 int si_collapse; /* soundsalike: ? */
2614 int si_rem_accents; /* soundsalike: remove accents */
2615 garray_T si_map; /* MAP info concatenated */
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002616 char_u *si_midword; /* MIDWORD chars, alloc'ed string or NULL */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002617 garray_T si_prefcond; /* table with conditions for postponed
2618 * prefixes, each stored as a string */
2619 int si_newID; /* current value for ah_newID */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002620} spellinfo_T;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002621
Bram Moolenaar51485f02005-06-04 21:55:20 +00002622static afffile_T *spell_read_aff __ARGS((char_u *fname, spellinfo_T *spin));
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002623static int str_equal __ARGS((char_u *s1, char_u *s2));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002624static void add_fromto __ARGS((spellinfo_T *spin, garray_T *gap, char_u *from, char_u *to));
2625static int sal_to_bool __ARGS((char_u *s));
Bram Moolenaar5482f332005-04-17 20:18:43 +00002626static int has_non_ascii __ARGS((char_u *s));
Bram Moolenaar51485f02005-06-04 21:55:20 +00002627static void spell_free_aff __ARGS((afffile_T *aff));
2628static int spell_read_dic __ARGS((char_u *fname, spellinfo_T *spin, afffile_T *affile));
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002629static char_u *get_pfxlist __ARGS((afffile_T *affile, char_u *afflist, sblock_T **blp));
2630static int store_aff_word __ARGS((char_u *word, spellinfo_T *spin, char_u *afflist, afffile_T *affile, hashtab_T *ht, hashtab_T *xht, int comb, int flags, char_u *pfxlist));
Bram Moolenaar51485f02005-06-04 21:55:20 +00002631static int spell_read_wordfile __ARGS((char_u *fname, spellinfo_T *spin));
2632static void *getroom __ARGS((sblock_T **blp, size_t len));
2633static char_u *getroom_save __ARGS((sblock_T **blp, char_u *s));
2634static void free_blocks __ARGS((sblock_T *bl));
2635static wordnode_T *wordtree_alloc __ARGS((sblock_T **blp));
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002636static int store_word __ARGS((char_u *word, spellinfo_T *spin, int flags, int region, char_u *pfxlist));
2637static int tree_add_word __ARGS((char_u *word, wordnode_T *tree, int flags, int region, int prefixID, sblock_T **blp));
Bram Moolenaarb765d632005-06-07 21:00:02 +00002638static void wordtree_compress __ARGS((wordnode_T *root, spellinfo_T *spin));
Bram Moolenaar51485f02005-06-04 21:55:20 +00002639static int node_compress __ARGS((wordnode_T *node, hashtab_T *ht, int *tot));
2640static int node_equal __ARGS((wordnode_T *n1, wordnode_T *n2));
Bram Moolenaar3982c542005-06-08 21:56:31 +00002641static void write_vim_spell __ARGS((char_u *fname, spellinfo_T *spin));
Bram Moolenaar0c405862005-06-22 22:26:26 +00002642static void clear_node __ARGS((wordnode_T *node));
2643static int put_node __ARGS((FILE *fd, wordnode_T *node, int index, int regionmask, int prefixtree));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002644static void mkspell __ARGS((int fcount, char_u **fnames, int ascii, int overwrite, int added_word));
Bram Moolenaarb765d632005-06-07 21:00:02 +00002645static void init_spellfile __ARGS((void));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002646
2647/*
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002648 * Read the affix file "fname".
Bram Moolenaar3982c542005-06-08 21:56:31 +00002649 * Returns an afffile_T, NULL for complete failure.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002650 */
2651 static afffile_T *
Bram Moolenaar51485f02005-06-04 21:55:20 +00002652spell_read_aff(fname, spin)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002653 char_u *fname;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002654 spellinfo_T *spin;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002655{
2656 FILE *fd;
2657 afffile_T *aff;
2658 char_u rline[MAXLINELEN];
2659 char_u *line;
2660 char_u *pc = NULL;
Bram Moolenaar8db73182005-06-17 21:51:16 +00002661#define MAXITEMCNT 7
2662 char_u *(items[MAXITEMCNT]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002663 int itemcnt;
2664 char_u *p;
2665 int lnum = 0;
2666 affheader_T *cur_aff = NULL;
2667 int aff_todo = 0;
2668 hashtab_T *tp;
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002669 char_u *low = NULL;
2670 char_u *fol = NULL;
2671 char_u *upp = NULL;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002672 static char *e_affname = N_("Affix name too long in %s line %d: %s");
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002673 int do_rep;
2674 int do_sal;
2675 int do_map;
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002676 int do_midword;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002677 int found_map = FALSE;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00002678 hashitem_T *hi;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002679
Bram Moolenaar51485f02005-06-04 21:55:20 +00002680 /*
2681 * Open the file.
2682 */
Bram Moolenaarb765d632005-06-07 21:00:02 +00002683 fd = mch_fopen((char *)fname, "r");
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002684 if (fd == NULL)
2685 {
2686 EMSG2(_(e_notopen), fname);
2687 return NULL;
2688 }
2689
Bram Moolenaarb765d632005-06-07 21:00:02 +00002690 if (spin->si_verbose || p_verbose > 2)
2691 {
2692 if (!spin->si_verbose)
2693 verbose_enter();
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002694 smsg((char_u *)_("Reading affix file %s ..."), fname);
Bram Moolenaarb765d632005-06-07 21:00:02 +00002695 out_flush();
2696 if (!spin->si_verbose)
2697 verbose_leave();
2698 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002699
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002700 /* Only do REP lines when not done in another .aff file already. */
2701 do_rep = spin->si_rep.ga_len == 0;
2702
2703 /* Only do SAL lines when not done in another .aff file already. */
2704 do_sal = spin->si_sal.ga_len == 0;
2705
2706 /* Only do MAP lines when not done in another .aff file already. */
2707 do_map = spin->si_map.ga_len == 0;
2708
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002709 /* Only do MIDWORD line when not done in another .aff file already */
2710 do_midword = spin->si_midword == NULL;
2711
Bram Moolenaar51485f02005-06-04 21:55:20 +00002712 /*
2713 * Allocate and init the afffile_T structure.
2714 */
2715 aff = (afffile_T *)getroom(&spin->si_blocks, sizeof(afffile_T));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002716 if (aff == NULL)
2717 return NULL;
2718 hash_init(&aff->af_pref);
2719 hash_init(&aff->af_suff);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002720
2721 /*
2722 * Read all the lines in the file one by one.
2723 */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002724 while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002725 {
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002726 line_breakcheck();
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002727 ++lnum;
2728
2729 /* Skip comment lines. */
2730 if (*rline == '#')
2731 continue;
2732
2733 /* Convert from "SET" to 'encoding' when needed. */
2734 vim_free(pc);
Bram Moolenaarb765d632005-06-07 21:00:02 +00002735#ifdef FEAT_MBYTE
Bram Moolenaar51485f02005-06-04 21:55:20 +00002736 if (spin->si_conv.vc_type != CONV_NONE)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002737 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00002738 pc = string_convert(&spin->si_conv, rline, NULL);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002739 if (pc == NULL)
2740 {
2741 smsg((char_u *)_("Conversion failure for word in %s line %d: %s"),
2742 fname, lnum, rline);
2743 continue;
2744 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002745 line = pc;
2746 }
2747 else
Bram Moolenaarb765d632005-06-07 21:00:02 +00002748#endif
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002749 {
2750 pc = NULL;
2751 line = rline;
2752 }
2753
2754 /* Split the line up in white separated items. Put a NUL after each
2755 * item. */
2756 itemcnt = 0;
2757 for (p = line; ; )
2758 {
2759 while (*p != NUL && *p <= ' ') /* skip white space and CR/NL */
2760 ++p;
2761 if (*p == NUL)
2762 break;
Bram Moolenaar8db73182005-06-17 21:51:16 +00002763 if (itemcnt == MAXITEMCNT) /* too many items */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002764 break;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002765 items[itemcnt++] = p;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002766 while (*p > ' ') /* skip until white space or CR/NL */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002767 ++p;
2768 if (*p == NUL)
2769 break;
2770 *p++ = NUL;
2771 }
2772
2773 /* Handle non-empty lines. */
2774 if (itemcnt > 0)
2775 {
2776 if (STRCMP(items[0], "SET") == 0 && itemcnt == 2
2777 && aff->af_enc == NULL)
2778 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00002779#ifdef FEAT_MBYTE
Bram Moolenaar51485f02005-06-04 21:55:20 +00002780 /* Setup for conversion from "ENC" to 'encoding'. */
2781 aff->af_enc = enc_canonize(items[1]);
2782 if (aff->af_enc != NULL && !spin->si_ascii
2783 && convert_setup(&spin->si_conv, aff->af_enc,
2784 p_enc) == FAIL)
2785 smsg((char_u *)_("Conversion in %s not supported: from %s to %s"),
2786 fname, aff->af_enc, p_enc);
Bram Moolenaarb765d632005-06-07 21:00:02 +00002787#else
2788 smsg((char_u *)_("Conversion in %s not supported"), fname);
2789#endif
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002790 }
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002791 else if (STRCMP(items[0], "MIDWORD") == 0 && itemcnt == 2)
2792 {
2793 if (do_midword)
2794 spin->si_midword = vim_strsave(items[1]);
2795 }
Bram Moolenaar50cde822005-06-05 21:54:54 +00002796 else if (STRCMP(items[0], "NOSPLITSUGS") == 0 && itemcnt == 1)
2797 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002798 /* ignored, we always split */
Bram Moolenaar50cde822005-06-05 21:54:54 +00002799 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002800 else if (STRCMP(items[0], "TRY") == 0 && itemcnt == 2)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002801 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002802 /* ignored, we look in the tree for what chars may appear */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002803 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002804 else if (STRCMP(items[0], "RAR") == 0 && itemcnt == 2
2805 && aff->af_rar == 0)
2806 {
2807 aff->af_rar = items[1][0];
2808 if (items[1][1] != NUL)
2809 smsg((char_u *)_(e_affname), fname, lnum, items[1]);
2810 }
Bram Moolenaarb765d632005-06-07 21:00:02 +00002811 else if (STRCMP(items[0], "KEP") == 0 && itemcnt == 2
2812 && aff->af_kep == 0)
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002813 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00002814 aff->af_kep = items[1][0];
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002815 if (items[1][1] != NUL)
2816 smsg((char_u *)_(e_affname), fname, lnum, items[1]);
2817 }
Bram Moolenaar0c405862005-06-22 22:26:26 +00002818 else if (STRCMP(items[0], "BAD") == 0 && itemcnt == 2
2819 && aff->af_bad == 0)
2820 {
2821 aff->af_bad = items[1][0];
2822 if (items[1][1] != NUL)
2823 smsg((char_u *)_(e_affname), fname, lnum, items[1]);
2824 }
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002825 else if (STRCMP(items[0], "PFXPOSTPONE") == 0 && itemcnt == 1)
2826 {
2827 aff->af_pfxpostpone = TRUE;
2828 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002829 else if ((STRCMP(items[0], "PFX") == 0
2830 || STRCMP(items[0], "SFX") == 0)
2831 && aff_todo == 0
Bram Moolenaar8db73182005-06-17 21:51:16 +00002832 && itemcnt >= 4)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002833 {
Bram Moolenaar8db73182005-06-17 21:51:16 +00002834 /* Myspell allows extra text after the item, but that might
2835 * mean mistakes go unnoticed. Require a comment-starter. */
2836 if (itemcnt > 4 && *items[4] != '#')
2837 smsg((char_u *)_("Trailing text in %s line %d: %s"),
2838 fname, lnum, items[4]);
2839
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002840 /* New affix letter. */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002841 cur_aff = (affheader_T *)getroom(&spin->si_blocks,
2842 sizeof(affheader_T));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002843 if (cur_aff == NULL)
2844 break;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002845 cur_aff->ah_key[0] = *items[1]; /* TODO: multi-byte? */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002846 cur_aff->ah_key[1] = NUL;
2847 if (items[1][1] != NUL)
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002848 smsg((char_u *)_(e_affname), fname, lnum, items[1]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002849 if (*items[2] == 'Y')
2850 cur_aff->ah_combine = TRUE;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002851 else if (*items[2] != 'N')
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002852 smsg((char_u *)_("Expected Y or N in %s line %d: %s"),
2853 fname, lnum, items[2]);
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002854
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002855 if (*items[0] == 'P')
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002856 {
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002857 tp = &aff->af_pref;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002858 /* Use a new number in the .spl file later, to be able to
2859 * handle multiple .aff files. */
2860 if (aff->af_pfxpostpone)
Bram Moolenaard857f0e2005-06-21 22:37:39 +00002861 cur_aff->ah_newID = ++spin->si_newID;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002862 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002863 else
2864 tp = &aff->af_suff;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002865 aff_todo = atoi((char *)items[3]);
Bram Moolenaar9f30f502005-06-14 22:01:04 +00002866 hi = hash_find(tp, cur_aff->ah_key);
2867 if (!HASHITEM_EMPTY(hi))
Bram Moolenaar51485f02005-06-04 21:55:20 +00002868 {
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002869 smsg((char_u *)_("Duplicate affix in %s line %d: %s"),
2870 fname, lnum, items[1]);
Bram Moolenaar51485f02005-06-04 21:55:20 +00002871 aff_todo = 0;
2872 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002873 else
2874 hash_add(tp, cur_aff->ah_key);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002875 }
2876 else if ((STRCMP(items[0], "PFX") == 0
2877 || STRCMP(items[0], "SFX") == 0)
2878 && aff_todo > 0
2879 && STRCMP(cur_aff->ah_key, items[1]) == 0
Bram Moolenaar8db73182005-06-17 21:51:16 +00002880 && itemcnt >= 5)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002881 {
2882 affentry_T *aff_entry;
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002883 int rare = FALSE;
2884 int lasti = 5;
2885
2886 /* Check for "rare" after the other info. */
2887 if (itemcnt > 5 && STRICMP(items[5], "rare") == 0)
2888 {
2889 rare = TRUE;
2890 lasti = 6;
2891 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002892
Bram Moolenaar8db73182005-06-17 21:51:16 +00002893 /* Myspell allows extra text after the item, but that might
2894 * mean mistakes go unnoticed. Require a comment-starter. */
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002895 if (itemcnt > lasti && *items[lasti] != '#')
Bram Moolenaar8db73182005-06-17 21:51:16 +00002896 smsg((char_u *)_("Trailing text in %s line %d: %s"),
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002897 fname, lnum, items[lasti]);
Bram Moolenaar8db73182005-06-17 21:51:16 +00002898
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002899 /* New item for an affix letter. */
2900 --aff_todo;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002901 aff_entry = (affentry_T *)getroom(&spin->si_blocks,
2902 sizeof(affentry_T));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002903 if (aff_entry == NULL)
2904 break;
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002905 aff_entry->ae_rare = rare;
Bram Moolenaar5482f332005-04-17 20:18:43 +00002906
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002907 if (STRCMP(items[2], "0") != 0)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002908 aff_entry->ae_chop = getroom_save(&spin->si_blocks,
2909 items[2]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002910 if (STRCMP(items[3], "0") != 0)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002911 aff_entry->ae_add = getroom_save(&spin->si_blocks,
2912 items[3]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002913
Bram Moolenaar51485f02005-06-04 21:55:20 +00002914 /* Don't use an affix entry with non-ASCII characters when
2915 * "spin->si_ascii" is TRUE. */
2916 if (!spin->si_ascii || !(has_non_ascii(aff_entry->ae_chop)
Bram Moolenaar5482f332005-04-17 20:18:43 +00002917 || has_non_ascii(aff_entry->ae_add)))
2918 {
Bram Moolenaar5482f332005-04-17 20:18:43 +00002919 aff_entry->ae_next = cur_aff->ah_first;
2920 cur_aff->ah_first = aff_entry;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002921
2922 if (STRCMP(items[4], ".") != 0)
2923 {
2924 char_u buf[MAXLINELEN];
2925
2926 aff_entry->ae_cond = getroom_save(&spin->si_blocks,
2927 items[4]);
2928 if (*items[0] == 'P')
2929 sprintf((char *)buf, "^%s", items[4]);
2930 else
2931 sprintf((char *)buf, "%s$", items[4]);
2932 aff_entry->ae_prog = vim_regcomp(buf,
2933 RE_MAGIC + RE_STRING);
2934 }
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002935
2936 /* For postponed prefixes we need an entry in si_prefcond
2937 * for the condition. Use an existing one if possible. */
2938 if (*items[0] == 'P' && aff->af_pfxpostpone
2939 && aff_entry->ae_chop == NULL)
2940 {
2941 int idx;
2942 char_u **pp;
2943
2944 for (idx = spin->si_prefcond.ga_len - 1; idx >= 0;
2945 --idx)
2946 {
2947 p = ((char_u **)spin->si_prefcond.ga_data)[idx];
2948 if (str_equal(p, aff_entry->ae_cond))
2949 break;
2950 }
2951 if (idx < 0 && ga_grow(&spin->si_prefcond, 1) == OK)
2952 {
2953 /* Not found, add a new condition. */
2954 idx = spin->si_prefcond.ga_len++;
2955 pp = ((char_u **)spin->si_prefcond.ga_data) + idx;
2956 if (aff_entry->ae_cond == NULL)
2957 *pp = NULL;
2958 else
2959 *pp = getroom_save(&spin->si_blocks,
2960 aff_entry->ae_cond);
2961 }
2962
2963 /* Add the prefix to the prefix tree. */
2964 if (aff_entry->ae_add == NULL)
2965 p = (char_u *)"";
2966 else
2967 p = aff_entry->ae_add;
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00002968 tree_add_word(p, spin->si_prefroot, rare ? -2 : -1,
2969 idx, cur_aff->ah_newID, &spin->si_blocks);
Bram Moolenaar1d73c882005-06-19 22:48:47 +00002970 }
Bram Moolenaar5482f332005-04-17 20:18:43 +00002971 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002972 }
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002973 else if (STRCMP(items[0], "FOL") == 0 && itemcnt == 2)
2974 {
2975 if (fol != NULL)
2976 smsg((char_u *)_("Duplicate FOL in %s line %d"),
2977 fname, lnum);
2978 else
2979 fol = vim_strsave(items[1]);
2980 }
2981 else if (STRCMP(items[0], "LOW") == 0 && itemcnt == 2)
2982 {
2983 if (low != NULL)
2984 smsg((char_u *)_("Duplicate LOW in %s line %d"),
2985 fname, lnum);
2986 else
2987 low = vim_strsave(items[1]);
2988 }
2989 else if (STRCMP(items[0], "UPP") == 0 && itemcnt == 2)
2990 {
2991 if (upp != NULL)
2992 smsg((char_u *)_("Duplicate UPP in %s line %d"),
2993 fname, lnum);
2994 else
2995 upp = vim_strsave(items[1]);
2996 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002997 else if (STRCMP(items[0], "REP") == 0 && itemcnt == 2)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002998 {
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002999 /* Ignore REP count */;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003000 if (!isdigit(*items[1]))
3001 smsg((char_u *)_("Expected REP count in %s line %d"),
3002 fname, lnum);
3003 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003004 else if (STRCMP(items[0], "REP") == 0 && itemcnt == 3)
3005 {
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003006 /* REP item */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003007 if (do_rep)
3008 add_fromto(spin, &spin->si_rep, items[1], items[2]);
3009 }
3010 else if (STRCMP(items[0], "MAP") == 0 && itemcnt == 2)
3011 {
3012 /* MAP item or count */
3013 if (!found_map)
3014 {
3015 /* First line contains the count. */
3016 found_map = TRUE;
3017 if (!isdigit(*items[1]))
3018 smsg((char_u *)_("Expected MAP count in %s line %d"),
3019 fname, lnum);
3020 }
3021 else if (do_map)
3022 {
Bram Moolenaar0c405862005-06-22 22:26:26 +00003023 int c;
3024
3025 /* Check that every character appears only once. */
3026 for (p = items[1]; *p != NUL; )
3027 {
3028#ifdef FEAT_MBYTE
3029 c = mb_ptr2char_adv(&p);
3030#else
3031 c = *p++;
3032#endif
3033 if ((spin->si_map.ga_len > 0
3034 && vim_strchr(spin->si_map.ga_data, c)
3035 != NULL)
3036 || vim_strchr(p, c) != NULL)
3037 smsg((char_u *)_("Duplicate character in MAP in %s line %d"),
3038 fname, lnum);
3039 }
3040
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003041 /* We simply concatenate all the MAP strings, separated by
3042 * slashes. */
3043 ga_concat(&spin->si_map, items[1]);
3044 ga_append(&spin->si_map, '/');
3045 }
3046 }
3047 else if (STRCMP(items[0], "SAL") == 0 && itemcnt == 3)
3048 {
3049 if (do_sal)
3050 {
3051 /* SAL item (sounds-a-like)
3052 * Either one of the known keys or a from-to pair. */
3053 if (STRCMP(items[1], "followup") == 0)
3054 spin->si_followup = sal_to_bool(items[2]);
3055 else if (STRCMP(items[1], "collapse_result") == 0)
3056 spin->si_collapse = sal_to_bool(items[2]);
3057 else if (STRCMP(items[1], "remove_accents") == 0)
3058 spin->si_rem_accents = sal_to_bool(items[2]);
3059 else
3060 /* when "to" is "_" it means empty */
3061 add_fromto(spin, &spin->si_sal, items[1],
3062 STRCMP(items[2], "_") == 0 ? (char_u *)""
3063 : items[2]);
3064 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003065 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00003066 else
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003067 smsg((char_u *)_("Unrecognized item in %s line %d: %s"),
3068 fname, lnum, items[0]);
3069 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003070 }
3071
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003072 if (fol != NULL || low != NULL || upp != NULL)
3073 {
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00003074 if (spin->si_clear_chartab)
3075 {
3076 /* Clear the char type tables, don't want to use any of the
3077 * currently used spell properties. */
3078 init_spell_chartab();
3079 spin->si_clear_chartab = FALSE;
3080 }
3081
Bram Moolenaar3982c542005-06-08 21:56:31 +00003082 /*
3083 * Don't write a word table for an ASCII file, so that we don't check
3084 * for conflicts with a word table that matches 'encoding'.
Bram Moolenaar9f30f502005-06-14 22:01:04 +00003085 * Don't write one for utf-8 either, we use utf_*() and
Bram Moolenaar3982c542005-06-08 21:56:31 +00003086 * mb_get_class(), the list of chars in the file will be incomplete.
3087 */
3088 if (!spin->si_ascii
3089#ifdef FEAT_MBYTE
3090 && !enc_utf8
3091#endif
3092 )
Bram Moolenaar6f3058f2005-04-24 21:58:05 +00003093 {
3094 if (fol == NULL || low == NULL || upp == NULL)
3095 smsg((char_u *)_("Missing FOL/LOW/UPP line in %s"), fname);
3096 else
Bram Moolenaar3982c542005-06-08 21:56:31 +00003097 (void)set_spell_chartab(fol, low, upp);
Bram Moolenaar6f3058f2005-04-24 21:58:05 +00003098 }
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003099
3100 vim_free(fol);
3101 vim_free(low);
3102 vim_free(upp);
3103 }
3104
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003105 vim_free(pc);
3106 fclose(fd);
3107 return aff;
3108}
3109
3110/*
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003111 * Return TRUE if strings "s1" and "s2" are equal. Also consider both being
3112 * NULL as equal.
3113 */
3114 static int
3115str_equal(s1, s2)
3116 char_u *s1;
3117 char_u *s2;
3118{
3119 if (s1 == NULL || s2 == NULL)
3120 return s1 == s2;
3121 return STRCMP(s1, s2) == 0;
3122}
3123
3124/*
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003125 * Add a from-to item to "gap". Used for REP and SAL items.
3126 * They are stored case-folded.
3127 */
3128 static void
3129add_fromto(spin, gap, from, to)
3130 spellinfo_T *spin;
3131 garray_T *gap;
3132 char_u *from;
3133 char_u *to;
3134{
3135 fromto_T *ftp;
3136 char_u word[MAXWLEN];
3137
3138 if (ga_grow(gap, 1) == OK)
3139 {
3140 ftp = ((fromto_T *)gap->ga_data) + gap->ga_len;
3141 (void)spell_casefold(from, STRLEN(from), word, MAXWLEN);
3142 ftp->ft_from = getroom_save(&spin->si_blocks, word);
3143 (void)spell_casefold(to, STRLEN(to), word, MAXWLEN);
3144 ftp->ft_to = getroom_save(&spin->si_blocks, word);
3145 ++gap->ga_len;
3146 }
3147}
3148
3149/*
3150 * Convert a boolean argument in a SAL line to TRUE or FALSE;
3151 */
3152 static int
3153sal_to_bool(s)
3154 char_u *s;
3155{
3156 return STRCMP(s, "1") == 0 || STRCMP(s, "true") == 0;
3157}
3158
3159/*
Bram Moolenaar5482f332005-04-17 20:18:43 +00003160 * Return TRUE if string "s" contains a non-ASCII character (128 or higher).
3161 * When "s" is NULL FALSE is returned.
3162 */
3163 static int
3164has_non_ascii(s)
3165 char_u *s;
3166{
3167 char_u *p;
3168
3169 if (s != NULL)
3170 for (p = s; *p != NUL; ++p)
3171 if (*p >= 128)
3172 return TRUE;
3173 return FALSE;
3174}
3175
3176/*
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003177 * Free the structure filled by spell_read_aff().
3178 */
3179 static void
3180spell_free_aff(aff)
3181 afffile_T *aff;
3182{
3183 hashtab_T *ht;
3184 hashitem_T *hi;
3185 int todo;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003186 affheader_T *ah;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003187 affentry_T *ae;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003188
3189 vim_free(aff->af_enc);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003190
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003191 /* All this trouble to free the "ae_prog" items... */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003192 for (ht = &aff->af_pref; ; ht = &aff->af_suff)
3193 {
3194 todo = ht->ht_used;
3195 for (hi = ht->ht_array; todo > 0; ++hi)
3196 {
3197 if (!HASHITEM_EMPTY(hi))
3198 {
3199 --todo;
3200 ah = HI2AH(hi);
Bram Moolenaar51485f02005-06-04 21:55:20 +00003201 for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next)
3202 vim_free(ae->ae_prog);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003203 }
3204 }
3205 if (ht == &aff->af_suff)
3206 break;
3207 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00003208
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003209 hash_clear(&aff->af_pref);
3210 hash_clear(&aff->af_suff);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003211}
3212
3213/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00003214 * Read dictionary file "fname".
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003215 * Returns OK or FAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003216 */
3217 static int
Bram Moolenaar51485f02005-06-04 21:55:20 +00003218spell_read_dic(fname, spin, affile)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003219 char_u *fname;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003220 spellinfo_T *spin;
3221 afffile_T *affile;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003222{
Bram Moolenaar51485f02005-06-04 21:55:20 +00003223 hashtab_T ht;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003224 char_u line[MAXLINELEN];
Bram Moolenaar51485f02005-06-04 21:55:20 +00003225 char_u *afflist;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003226 char_u *pfxlist;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003227 char_u *dw;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003228 char_u *pc;
3229 char_u *w;
3230 int l;
3231 hash_T hash;
3232 hashitem_T *hi;
3233 FILE *fd;
3234 int lnum = 1;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003235 int non_ascii = 0;
3236 int retval = OK;
3237 char_u message[MAXLINELEN + MAXWLEN];
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003238 int flags;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003239
Bram Moolenaar51485f02005-06-04 21:55:20 +00003240 /*
3241 * Open the file.
3242 */
Bram Moolenaarb765d632005-06-07 21:00:02 +00003243 fd = mch_fopen((char *)fname, "r");
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003244 if (fd == NULL)
3245 {
3246 EMSG2(_(e_notopen), fname);
3247 return FAIL;
3248 }
3249
Bram Moolenaar51485f02005-06-04 21:55:20 +00003250 /* The hashtable is only used to detect duplicated words. */
3251 hash_init(&ht);
3252
Bram Moolenaar8db73182005-06-17 21:51:16 +00003253 spin->si_foldwcount = 0;
3254 spin->si_keepwcount = 0;
3255
Bram Moolenaarb765d632005-06-07 21:00:02 +00003256 if (spin->si_verbose || p_verbose > 2)
3257 {
3258 if (!spin->si_verbose)
3259 verbose_enter();
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00003260 smsg((char_u *)_("Reading dictionary file %s ..."), fname);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003261 out_flush();
3262 if (!spin->si_verbose)
3263 verbose_leave();
3264 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003265
3266 /* Read and ignore the first line: word count. */
3267 (void)vim_fgets(line, MAXLINELEN, fd);
Bram Moolenaar9f30f502005-06-14 22:01:04 +00003268 if (!vim_isdigit(*skipwhite(line)))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003269 EMSG2(_("E760: No word count in %s"), fname);
3270
3271 /*
3272 * Read all the lines in the file one by one.
3273 * The words are converted to 'encoding' here, before being added to
3274 * the hashtable.
3275 */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003276 while (!vim_fgets(line, MAXLINELEN, fd) && !got_int)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003277 {
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003278 line_breakcheck();
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003279 ++lnum;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00003280 if (line[0] == '#')
3281 continue; /* comment line */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003282
Bram Moolenaar51485f02005-06-04 21:55:20 +00003283 /* Remove CR, LF and white space from the end. White space halfway
3284 * the word is kept to allow e.g., "et al.". */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003285 l = STRLEN(line);
3286 while (l > 0 && line[l - 1] <= ' ')
3287 --l;
3288 if (l == 0)
3289 continue; /* empty line */
3290 line[l] = NUL;
3291
Bram Moolenaar51485f02005-06-04 21:55:20 +00003292 /* Find the optional affix names. */
3293 afflist = vim_strchr(line, '/');
3294 if (afflist != NULL)
3295 *afflist++ = NUL;
3296
3297 /* Skip non-ASCII words when "spin->si_ascii" is TRUE. */
3298 if (spin->si_ascii && has_non_ascii(line))
3299 {
3300 ++non_ascii;
Bram Moolenaar5482f332005-04-17 20:18:43 +00003301 continue;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003302 }
Bram Moolenaar5482f332005-04-17 20:18:43 +00003303
Bram Moolenaarb765d632005-06-07 21:00:02 +00003304#ifdef FEAT_MBYTE
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003305 /* Convert from "SET" to 'encoding' when needed. */
Bram Moolenaar51485f02005-06-04 21:55:20 +00003306 if (spin->si_conv.vc_type != CONV_NONE)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003307 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003308 pc = string_convert(&spin->si_conv, line, NULL);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003309 if (pc == NULL)
3310 {
3311 smsg((char_u *)_("Conversion failure for word in %s line %d: %s"),
3312 fname, lnum, line);
3313 continue;
3314 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003315 w = pc;
3316 }
3317 else
Bram Moolenaarb765d632005-06-07 21:00:02 +00003318#endif
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003319 {
3320 pc = NULL;
3321 w = line;
3322 }
3323
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003324 /* This takes time, print a message now and then. */
3325 if (spin->si_verbose && (lnum & 0x3ff) == 0)
3326 {
3327 vim_snprintf((char *)message, sizeof(message),
3328 _("line %6d, word %6d - %s"),
3329 lnum, spin->si_foldwcount + spin->si_keepwcount, w);
3330 msg_start();
3331 msg_puts_long_attr(message, 0);
3332 msg_clr_eos();
3333 msg_didout = FALSE;
3334 msg_col = 0;
3335 out_flush();
3336 }
3337
Bram Moolenaar51485f02005-06-04 21:55:20 +00003338 /* Store the word in the hashtable to be able to find duplicates. */
3339 dw = (char_u *)getroom_save(&spin->si_blocks, w);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003340 if (dw == NULL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003341 retval = FAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003342 vim_free(pc);
Bram Moolenaar51485f02005-06-04 21:55:20 +00003343 if (retval == FAIL)
3344 break;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003345
Bram Moolenaar51485f02005-06-04 21:55:20 +00003346 hash = hash_hash(dw);
3347 hi = hash_lookup(&ht, dw, hash);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003348 if (!HASHITEM_EMPTY(hi))
3349 smsg((char_u *)_("Duplicate word in %s line %d: %s"),
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003350 fname, lnum, w);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003351 else
Bram Moolenaar51485f02005-06-04 21:55:20 +00003352 hash_add_item(&ht, hi, dw, hash);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003353
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003354 flags = 0;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003355 pfxlist = NULL;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003356 if (afflist != NULL)
3357 {
3358 /* Check for affix name that stands for keep-case word and stands
3359 * for rare word (if defined). */
Bram Moolenaarb765d632005-06-07 21:00:02 +00003360 if (affile->af_kep != NUL
3361 && vim_strchr(afflist, affile->af_kep) != NULL)
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003362 flags |= WF_KEEPCAP;
3363 if (affile->af_rar != NUL
3364 && vim_strchr(afflist, affile->af_rar) != NULL)
3365 flags |= WF_RARE;
Bram Moolenaar0c405862005-06-22 22:26:26 +00003366 if (affile->af_bad != NUL
3367 && vim_strchr(afflist, affile->af_bad) != NULL)
3368 flags |= WF_BANNED;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003369
3370 if (affile->af_pfxpostpone)
3371 /* Need to store the list of prefix IDs with the word. */
3372 pfxlist = get_pfxlist(affile, afflist, &spin->si_blocks);
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003373 }
3374
Bram Moolenaar51485f02005-06-04 21:55:20 +00003375 /* Add the word to the word tree(s). */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003376 if (store_word(dw, spin, flags, spin->si_region, pfxlist) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003377 retval = FAIL;
3378
3379 if (afflist != NULL)
3380 {
3381 /* Find all matching suffixes and add the resulting words.
3382 * Additionally do matching prefixes that combine. */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003383 if (store_aff_word(dw, spin, afflist, affile,
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003384 &affile->af_suff, &affile->af_pref,
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003385 FALSE, flags, pfxlist) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003386 retval = FAIL;
3387
3388 /* Find all matching prefixes and add the resulting words. */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003389 if (store_aff_word(dw, spin, afflist, affile,
3390 &affile->af_pref, NULL,
3391 FALSE, flags, pfxlist) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003392 retval = FAIL;
3393 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003394 }
3395
Bram Moolenaar51485f02005-06-04 21:55:20 +00003396 if (spin->si_ascii && non_ascii > 0)
3397 smsg((char_u *)_("Ignored %d words with non-ASCII characters"),
3398 non_ascii);
3399 hash_clear(&ht);
3400
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003401 fclose(fd);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003402 return retval;
3403}
3404
3405/*
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003406 * Get the list of prefix IDs from the affix list "afflist".
3407 * Used for PFXPOSTPONE.
3408 * Returns a string allocated with getroom(). NULL when there are no prefixes
3409 * or when out of memory.
3410 */
3411 static char_u *
3412get_pfxlist(affile, afflist, blp)
3413 afffile_T *affile;
3414 char_u *afflist;
3415 sblock_T **blp;
3416{
3417 char_u *p;
3418 int cnt;
3419 int round;
3420 char_u *res = NULL;
3421 char_u key[2];
3422 hashitem_T *hi;
3423
3424 key[1] = NUL;
3425
3426 /* round 1: count the number of prefix IDs.
3427 * round 2: move prefix IDs to "res" */
3428 for (round = 1; round <= 2; ++round)
3429 {
3430 cnt = 0;
3431 for (p = afflist; *p != NUL; ++p)
3432 {
3433 key[0] = *p;
3434 hi = hash_find(&affile->af_pref, key);
3435 if (!HASHITEM_EMPTY(hi))
3436 {
3437 /* This is a prefix ID, use the new number. */
3438 if (round == 2)
3439 res[cnt] = HI2AH(hi)->ah_newID;
3440 ++cnt;
3441 }
3442 }
3443 if (round == 1 && cnt > 0)
3444 res = getroom(blp, cnt + 1);
3445 if (res == NULL)
3446 break;
3447 }
3448
3449 if (res != NULL)
3450 res[cnt] = NUL;
3451 return res;
3452}
3453
3454/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00003455 * Apply affixes to a word and store the resulting words.
3456 * "ht" is the hashtable with affentry_T that need to be applied, either
3457 * prefixes or suffixes.
3458 * "xht", when not NULL, is the prefix hashtable, to be used additionally on
3459 * the resulting words for combining affixes.
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003460 *
3461 * Returns FAIL when out of memory.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003462 */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003463 static int
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003464store_aff_word(word, spin, afflist, affile, ht, xht, comb, flags, pfxlist)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003465 char_u *word; /* basic word start */
3466 spellinfo_T *spin; /* spell info */
3467 char_u *afflist; /* list of names of supported affixes */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003468 afffile_T *affile;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003469 hashtab_T *ht;
3470 hashtab_T *xht;
3471 int comb; /* only use affixes that combine */
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003472 int flags; /* flags for the word */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003473 char_u *pfxlist; /* list of prefix IDs */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003474{
3475 int todo;
3476 hashitem_T *hi;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003477 affheader_T *ah;
3478 affentry_T *ae;
3479 regmatch_T regmatch;
3480 char_u newword[MAXWLEN];
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003481 int retval = OK;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003482 int i;
3483 char_u *p;
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00003484 int use_flags;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003485
Bram Moolenaar51485f02005-06-04 21:55:20 +00003486 todo = ht->ht_used;
3487 for (hi = ht->ht_array; todo > 0 && retval == OK; ++hi)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003488 {
3489 if (!HASHITEM_EMPTY(hi))
3490 {
3491 --todo;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003492 ah = HI2AH(hi);
Bram Moolenaar5482f332005-04-17 20:18:43 +00003493
Bram Moolenaar51485f02005-06-04 21:55:20 +00003494 /* Check that the affix combines, if required, and that the word
3495 * supports this affix. */
3496 if ((!comb || ah->ah_combine)
3497 && vim_strchr(afflist, *ah->ah_key) != NULL)
Bram Moolenaar5482f332005-04-17 20:18:43 +00003498 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003499 /* Loop over all affix entries with this name. */
3500 for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003501 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003502 /* Check the condition. It's not logical to match case
3503 * here, but it is required for compatibility with
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003504 * Myspell.
3505 * For prefixes, when "PFXPOSTPONE" was used, only do
3506 * prefixes with a chop string. */
Bram Moolenaar51485f02005-06-04 21:55:20 +00003507 regmatch.regprog = ae->ae_prog;
3508 regmatch.rm_ic = FALSE;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003509 if ((xht != NULL || !affile->af_pfxpostpone
3510 || ae->ae_chop != NULL)
3511 && (ae->ae_prog == NULL
3512 || vim_regexec(&regmatch, word, (colnr_T)0)))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003513 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003514 /* Match. Remove the chop and add the affix. */
3515 if (xht == NULL)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003516 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003517 /* prefix: chop/add at the start of the word */
3518 if (ae->ae_add == NULL)
3519 *newword = NUL;
3520 else
3521 STRCPY(newword, ae->ae_add);
3522 p = word;
3523 if (ae->ae_chop != NULL)
Bram Moolenaarb765d632005-06-07 21:00:02 +00003524 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003525 /* Skip chop string. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00003526#ifdef FEAT_MBYTE
3527 if (has_mbyte)
Bram Moolenaar9f30f502005-06-14 22:01:04 +00003528 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00003529 i = mb_charlen(ae->ae_chop);
Bram Moolenaar9f30f502005-06-14 22:01:04 +00003530 for ( ; i > 0; --i)
3531 mb_ptr_adv(p);
3532 }
Bram Moolenaarb765d632005-06-07 21:00:02 +00003533 else
3534#endif
Bram Moolenaar9f30f502005-06-14 22:01:04 +00003535 p += STRLEN(ae->ae_chop);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003536 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00003537 STRCAT(newword, p);
3538 }
3539 else
3540 {
3541 /* suffix: chop/add at the end of the word */
3542 STRCPY(newword, word);
3543 if (ae->ae_chop != NULL)
3544 {
3545 /* Remove chop string. */
3546 p = newword + STRLEN(newword);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003547#ifdef FEAT_MBYTE
3548 if (has_mbyte)
3549 i = mb_charlen(ae->ae_chop);
3550 else
3551#endif
3552 i = STRLEN(ae->ae_chop);
3553 for ( ; i > 0; --i)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003554 mb_ptr_back(newword, p);
3555 *p = NUL;
3556 }
3557 if (ae->ae_add != NULL)
3558 STRCAT(newword, ae->ae_add);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003559 }
3560
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00003561 /* Obey the "rare" flag of the affix. */
3562 if (ae->ae_rare)
3563 use_flags = flags | WF_RARE;
3564 else
3565 use_flags = flags;
3566
Bram Moolenaar51485f02005-06-04 21:55:20 +00003567 /* Store the modified word. */
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00003568 if (store_word(newword, spin, use_flags,
3569 spin->si_region, pfxlist) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003570 retval = FAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003571
Bram Moolenaar51485f02005-06-04 21:55:20 +00003572 /* When added a suffix and combining is allowed also
3573 * try adding prefixes additionally. */
3574 if (xht != NULL && ah->ah_combine)
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003575 if (store_aff_word(newword, spin, afflist, affile,
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00003576 xht, NULL, TRUE, use_flags, pfxlist)
3577 == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003578 retval = FAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003579 }
3580 }
3581 }
3582 }
3583 }
3584
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003585 return retval;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003586}
3587
3588/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00003589 * Read a file with a list of words.
3590 */
3591 static int
3592spell_read_wordfile(fname, spin)
3593 char_u *fname;
3594 spellinfo_T *spin;
3595{
3596 FILE *fd;
3597 long lnum = 0;
3598 char_u rline[MAXLINELEN];
3599 char_u *line;
3600 char_u *pc = NULL;
3601 int l;
3602 int retval = OK;
3603 int did_word = FALSE;
3604 int non_ascii = 0;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003605 int flags;
Bram Moolenaar3982c542005-06-08 21:56:31 +00003606 int regionmask;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003607
3608 /*
3609 * Open the file.
3610 */
Bram Moolenaarb765d632005-06-07 21:00:02 +00003611 fd = mch_fopen((char *)fname, "r");
Bram Moolenaar51485f02005-06-04 21:55:20 +00003612 if (fd == NULL)
3613 {
3614 EMSG2(_(e_notopen), fname);
3615 return FAIL;
3616 }
3617
Bram Moolenaarb765d632005-06-07 21:00:02 +00003618 if (spin->si_verbose || p_verbose > 2)
3619 {
3620 if (!spin->si_verbose)
3621 verbose_enter();
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00003622 smsg((char_u *)_("Reading word file %s ..."), fname);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003623 out_flush();
3624 if (!spin->si_verbose)
3625 verbose_leave();
3626 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00003627
3628 /*
3629 * Read all the lines in the file one by one.
3630 */
3631 while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int)
3632 {
3633 line_breakcheck();
3634 ++lnum;
3635
3636 /* Skip comment lines. */
3637 if (*rline == '#')
3638 continue;
3639
3640 /* Remove CR, LF and white space from the end. */
3641 l = STRLEN(rline);
3642 while (l > 0 && rline[l - 1] <= ' ')
3643 --l;
3644 if (l == 0)
3645 continue; /* empty or blank line */
3646 rline[l] = NUL;
3647
3648 /* Convert from "=encoding={encoding}" to 'encoding' when needed. */
3649 vim_free(pc);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003650#ifdef FEAT_MBYTE
Bram Moolenaar51485f02005-06-04 21:55:20 +00003651 if (spin->si_conv.vc_type != CONV_NONE)
3652 {
3653 pc = string_convert(&spin->si_conv, rline, NULL);
3654 if (pc == NULL)
3655 {
3656 smsg((char_u *)_("Conversion failure for word in %s line %d: %s"),
3657 fname, lnum, rline);
3658 continue;
3659 }
3660 line = pc;
3661 }
3662 else
Bram Moolenaarb765d632005-06-07 21:00:02 +00003663#endif
Bram Moolenaar51485f02005-06-04 21:55:20 +00003664 {
3665 pc = NULL;
3666 line = rline;
3667 }
3668
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003669 flags = 0;
Bram Moolenaar3982c542005-06-08 21:56:31 +00003670 regionmask = spin->si_region;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003671
3672 if (*line == '/')
Bram Moolenaar51485f02005-06-04 21:55:20 +00003673 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003674 ++line;
Bram Moolenaar3982c542005-06-08 21:56:31 +00003675
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003676 if (STRNCMP(line, "encoding=", 9) == 0)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003677 {
3678 if (spin->si_conv.vc_type != CONV_NONE)
Bram Moolenaar3982c542005-06-08 21:56:31 +00003679 smsg((char_u *)_("Duplicate /encoding= line ignored in %s line %d: %s"),
3680 fname, lnum, line - 1);
Bram Moolenaar51485f02005-06-04 21:55:20 +00003681 else if (did_word)
Bram Moolenaar3982c542005-06-08 21:56:31 +00003682 smsg((char_u *)_("/encoding= line after word ignored in %s line %d: %s"),
3683 fname, lnum, line - 1);
Bram Moolenaar51485f02005-06-04 21:55:20 +00003684 else
3685 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00003686#ifdef FEAT_MBYTE
3687 char_u *enc;
3688
Bram Moolenaar51485f02005-06-04 21:55:20 +00003689 /* Setup for conversion to 'encoding'. */
Bram Moolenaar3982c542005-06-08 21:56:31 +00003690 line += 10;
3691 enc = enc_canonize(line);
Bram Moolenaar51485f02005-06-04 21:55:20 +00003692 if (enc != NULL && !spin->si_ascii
3693 && convert_setup(&spin->si_conv, enc,
3694 p_enc) == FAIL)
3695 smsg((char_u *)_("Conversion in %s not supported: from %s to %s"),
Bram Moolenaar3982c542005-06-08 21:56:31 +00003696 fname, line, p_enc);
Bram Moolenaar51485f02005-06-04 21:55:20 +00003697 vim_free(enc);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003698#else
3699 smsg((char_u *)_("Conversion in %s not supported"), fname);
3700#endif
Bram Moolenaar51485f02005-06-04 21:55:20 +00003701 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003702 continue;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003703 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003704
Bram Moolenaar3982c542005-06-08 21:56:31 +00003705 if (STRNCMP(line, "regions=", 8) == 0)
3706 {
3707 if (spin->si_region_count > 1)
3708 smsg((char_u *)_("Duplicate /regions= line ignored in %s line %d: %s"),
3709 fname, lnum, line);
3710 else
3711 {
3712 line += 8;
3713 if (STRLEN(line) > 16)
3714 smsg((char_u *)_("Too many regions in %s line %d: %s"),
3715 fname, lnum, line);
3716 else
3717 {
3718 spin->si_region_count = STRLEN(line) / 2;
3719 STRCPY(spin->si_region_name, line);
3720 }
3721 }
3722 continue;
3723 }
3724
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003725 if (*line == '=')
3726 {
3727 /* keep-case word */
3728 flags |= WF_KEEPCAP;
3729 ++line;
3730 }
3731
3732 if (*line == '!')
3733 {
3734 /* Bad, bad, wicked word. */
3735 flags |= WF_BANNED;
3736 ++line;
3737 }
3738 else if (*line == '?')
3739 {
3740 /* Rare word. */
3741 flags |= WF_RARE;
3742 ++line;
3743 }
3744
Bram Moolenaar3982c542005-06-08 21:56:31 +00003745 if (VIM_ISDIGIT(*line))
3746 {
3747 /* region number(s) */
3748 regionmask = 0;
3749 while (VIM_ISDIGIT(*line))
3750 {
3751 l = *line - '0';
3752 if (l > spin->si_region_count)
3753 {
3754 smsg((char_u *)_("Invalid region nr in %s line %d: %s"),
3755 fname, lnum, line);
3756 break;
3757 }
3758 regionmask |= 1 << (l - 1);
3759 ++line;
3760 }
3761 flags |= WF_REGION;
3762 }
3763
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003764 if (flags == 0)
3765 {
3766 smsg((char_u *)_("/ line ignored in %s line %d: %s"),
Bram Moolenaar51485f02005-06-04 21:55:20 +00003767 fname, lnum, line);
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003768 continue;
3769 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00003770 }
3771
3772 /* Skip non-ASCII words when "spin->si_ascii" is TRUE. */
3773 if (spin->si_ascii && has_non_ascii(line))
3774 {
3775 ++non_ascii;
3776 continue;
3777 }
3778
3779 /* Normal word: store it. */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003780 if (store_word(line, spin, flags, regionmask, NULL) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003781 {
3782 retval = FAIL;
3783 break;
3784 }
3785 did_word = TRUE;
3786 }
3787
3788 vim_free(pc);
3789 fclose(fd);
3790
Bram Moolenaarb765d632005-06-07 21:00:02 +00003791 if (spin->si_ascii && non_ascii > 0 && (spin->si_verbose || p_verbose > 2))
3792 {
3793 if (p_verbose > 2)
3794 verbose_enter();
Bram Moolenaar51485f02005-06-04 21:55:20 +00003795 smsg((char_u *)_("Ignored %d words with non-ASCII characters"),
3796 non_ascii);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003797 if (p_verbose > 2)
3798 verbose_leave();
3799 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00003800 return retval;
3801}
3802
3803/*
3804 * Get part of an sblock_T, "len" bytes long.
3805 * This avoids calling free() for every little struct we use.
3806 * The memory is cleared to all zeros.
3807 * Returns NULL when out of memory.
3808 */
3809 static void *
3810getroom(blp, len)
3811 sblock_T **blp;
3812 size_t len; /* length needed */
3813{
3814 char_u *p;
3815 sblock_T *bl = *blp;
3816
3817 if (bl == NULL || bl->sb_used + len > SBLOCKSIZE)
3818 {
3819 /* Allocate a block of memory. This is not freed until much later. */
3820 bl = (sblock_T *)alloc_clear((unsigned)(sizeof(sblock_T) + SBLOCKSIZE));
3821 if (bl == NULL)
3822 return NULL;
3823 bl->sb_next = *blp;
3824 *blp = bl;
3825 bl->sb_used = 0;
3826 }
3827
3828 p = bl->sb_data + bl->sb_used;
3829 bl->sb_used += len;
3830
3831 return p;
3832}
3833
3834/*
3835 * Make a copy of a string into memory allocated with getroom().
3836 */
3837 static char_u *
3838getroom_save(blp, s)
3839 sblock_T **blp;
3840 char_u *s;
3841{
3842 char_u *sc;
3843
3844 sc = (char_u *)getroom(blp, STRLEN(s) + 1);
3845 if (sc != NULL)
3846 STRCPY(sc, s);
3847 return sc;
3848}
3849
3850
3851/*
3852 * Free the list of allocated sblock_T.
3853 */
3854 static void
3855free_blocks(bl)
3856 sblock_T *bl;
3857{
3858 sblock_T *next;
3859
3860 while (bl != NULL)
3861 {
3862 next = bl->sb_next;
3863 vim_free(bl);
3864 bl = next;
3865 }
3866}
3867
3868/*
3869 * Allocate the root of a word tree.
3870 */
3871 static wordnode_T *
3872wordtree_alloc(blp)
3873 sblock_T **blp;
3874{
3875 return (wordnode_T *)getroom(blp, sizeof(wordnode_T));
3876}
3877
3878/*
3879 * Store a word in the tree(s).
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003880 * Always store it in the case-folded tree. A keep-case word can also be used
3881 * with all caps.
Bram Moolenaar51485f02005-06-04 21:55:20 +00003882 * For a keep-case word also store it in the keep-case tree.
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003883 * When "pfxlist" is not NULL store the word for each prefix ID.
Bram Moolenaar51485f02005-06-04 21:55:20 +00003884 */
3885 static int
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003886store_word(word, spin, flags, region, pfxlist)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003887 char_u *word;
3888 spellinfo_T *spin;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003889 int flags; /* extra flags, WF_BANNED */
Bram Moolenaar3982c542005-06-08 21:56:31 +00003890 int region; /* supported region(s) */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003891 char_u *pfxlist; /* list of prefix IDs or NULL */
Bram Moolenaar51485f02005-06-04 21:55:20 +00003892{
3893 int len = STRLEN(word);
3894 int ct = captype(word, word + len);
3895 char_u foldword[MAXWLEN];
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003896 int res = OK;
3897 char_u *p;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003898
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003899 (void)spell_casefold(word, len, foldword, MAXWLEN);
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003900 for (p = pfxlist; res == OK; ++p)
3901 {
3902 res = tree_add_word(foldword, spin->si_foldroot, ct | flags,
3903 region, p == NULL ? 0 : *p, &spin->si_blocks);
3904 if (p == NULL || *p == NUL)
3905 break;
3906 }
Bram Moolenaar8db73182005-06-17 21:51:16 +00003907 ++spin->si_foldwcount;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003908
3909 if (res == OK && (ct == WF_KEEPCAP || flags & WF_KEEPCAP))
Bram Moolenaar8db73182005-06-17 21:51:16 +00003910 {
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003911 for (p = pfxlist; res == OK; ++p)
3912 {
3913 res = tree_add_word(word, spin->si_keeproot, flags,
3914 region, p == NULL ? 0 : *p, &spin->si_blocks);
3915 if (p == NULL || *p == NUL)
3916 break;
3917 }
Bram Moolenaar8db73182005-06-17 21:51:16 +00003918 ++spin->si_keepwcount;
3919 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00003920 return res;
3921}
3922
3923/*
3924 * Add word "word" to a word tree at "root".
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00003925 * When "flags" < 0 we are adding to the prefix tree where flags is used for
3926 * "rare" and "region" is the condition nr.
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003927 * Returns FAIL when out of memory.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003928 */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003929 static int
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003930tree_add_word(word, root, flags, region, prefixID, blp)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003931 char_u *word;
3932 wordnode_T *root;
3933 int flags;
3934 int region;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003935 int prefixID;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003936 sblock_T **blp;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003937{
Bram Moolenaar51485f02005-06-04 21:55:20 +00003938 wordnode_T *node = root;
3939 wordnode_T *np;
3940 wordnode_T **prev = NULL;
3941 int i;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003942
Bram Moolenaar51485f02005-06-04 21:55:20 +00003943 /* Add each byte of the word to the tree, including the NUL at the end. */
3944 for (i = 0; ; ++i)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003945 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003946 /* Look for the sibling that has the same character. They are sorted
3947 * on byte value, thus stop searching when a sibling is found with a
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003948 * higher byte value. For zero bytes (end of word) the sorting is
3949 * done on flags and then on prefixID
Bram Moolenaar51485f02005-06-04 21:55:20 +00003950 */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003951 while (node != NULL
3952 && (node->wn_byte < word[i]
3953 || (node->wn_byte == NUL
3954 && (flags < 0
3955 ? node->wn_prefixID < prefixID
3956 : node->wn_flags < (flags & 0xff)
3957 || (node->wn_flags == (flags & 0xff)
3958 && node->wn_prefixID < prefixID)))))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003959 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003960 prev = &node->wn_sibling;
3961 node = *prev;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003962 }
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003963 if (node == NULL
3964 || node->wn_byte != word[i]
3965 || (word[i] == NUL
3966 && (flags < 0
3967 || node->wn_flags != (flags & 0xff)
3968 || node->wn_prefixID != prefixID)))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003969 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003970 /* Allocate a new node. */
3971 np = (wordnode_T *)getroom(blp, sizeof(wordnode_T));
3972 if (np == NULL)
3973 return FAIL;
3974 np->wn_byte = word[i];
3975 *prev = np;
3976 np->wn_sibling = node;
3977 node = np;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003978 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003979
Bram Moolenaar51485f02005-06-04 21:55:20 +00003980 if (word[i] == NUL)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003981 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003982 node->wn_flags = flags;
3983 node->wn_region |= region;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00003984 node->wn_prefixID = prefixID;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003985 break;
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +00003986 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00003987 prev = &node->wn_child;
3988 node = *prev;
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003989 }
3990
3991 return OK;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003992}
3993
3994/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00003995 * Compress a tree: find tails that are identical and can be shared.
3996 */
3997 static void
Bram Moolenaarb765d632005-06-07 21:00:02 +00003998wordtree_compress(root, spin)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003999 wordnode_T *root;
Bram Moolenaarb765d632005-06-07 21:00:02 +00004000 spellinfo_T *spin;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004001{
4002 hashtab_T ht;
4003 int n;
4004 int tot = 0;
4005
4006 if (root != NULL)
4007 {
4008 hash_init(&ht);
4009 n = node_compress(root, &ht, &tot);
Bram Moolenaarb765d632005-06-07 21:00:02 +00004010 if (spin->si_verbose || p_verbose > 2)
4011 {
4012 if (!spin->si_verbose)
4013 verbose_enter();
4014 smsg((char_u *)_("Compressed %d of %d nodes; %d%% remaining"),
Bram Moolenaar51485f02005-06-04 21:55:20 +00004015 n, tot, (tot - n) * 100 / tot);
Bram Moolenaarb765d632005-06-07 21:00:02 +00004016 if (p_verbose > 2)
4017 verbose_leave();
4018 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00004019 hash_clear(&ht);
4020 }
4021}
4022
4023/*
4024 * Compress a node, its siblings and its children, depth first.
4025 * Returns the number of compressed nodes.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004026 */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00004027 static int
Bram Moolenaar51485f02005-06-04 21:55:20 +00004028node_compress(node, ht, tot)
4029 wordnode_T *node;
4030 hashtab_T *ht;
4031 int *tot; /* total count of nodes before compressing,
4032 incremented while going through the tree */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004033{
Bram Moolenaar51485f02005-06-04 21:55:20 +00004034 wordnode_T *np;
4035 wordnode_T *tp;
4036 wordnode_T *child;
4037 hash_T hash;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004038 hashitem_T *hi;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004039 int len = 0;
4040 unsigned nr, n;
4041 int compressed = 0;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004042
Bram Moolenaar51485f02005-06-04 21:55:20 +00004043 /*
4044 * Go through the list of siblings. Compress each child and then try
4045 * finding an identical child to replace it.
4046 * Note that with "child" we mean not just the node that is pointed to,
4047 * but the whole list of siblings, of which the node is the first.
4048 */
4049 for (np = node; np != NULL; np = np->wn_sibling)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004050 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00004051 ++len;
4052 if ((child = np->wn_child) != NULL)
4053 {
Bram Moolenaar0c405862005-06-22 22:26:26 +00004054 /* Compress the child. This fills hashkey. */
Bram Moolenaar51485f02005-06-04 21:55:20 +00004055 compressed += node_compress(child, ht, tot);
4056
4057 /* Try to find an identical child. */
Bram Moolenaar0c405862005-06-22 22:26:26 +00004058 hash = hash_hash(child->wn_u1.hashkey);
4059 hi = hash_lookup(ht, child->wn_u1.hashkey, hash);
Bram Moolenaar51485f02005-06-04 21:55:20 +00004060 tp = NULL;
4061 if (!HASHITEM_EMPTY(hi))
4062 {
4063 /* There are children with an identical hash value. Now check
4064 * if there is one that is really identical. */
Bram Moolenaar0c405862005-06-22 22:26:26 +00004065 for (tp = HI2WN(hi); tp != NULL; tp = tp->wn_u2.next)
Bram Moolenaar51485f02005-06-04 21:55:20 +00004066 if (node_equal(child, tp))
4067 {
4068 /* Found one! Now use that child in place of the
4069 * current one. This means the current child is
4070 * dropped from the tree. */
4071 np->wn_child = tp;
4072 ++compressed;
4073 break;
4074 }
4075 if (tp == NULL)
4076 {
4077 /* No other child with this hash value equals the child of
4078 * the node, add it to the linked list after the first
4079 * item. */
4080 tp = HI2WN(hi);
Bram Moolenaar0c405862005-06-22 22:26:26 +00004081 child->wn_u2.next = tp->wn_u2.next;
4082 tp->wn_u2.next = child;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004083 }
4084 }
4085 else
4086 /* No other child has this hash value, add it to the
4087 * hashtable. */
Bram Moolenaar0c405862005-06-22 22:26:26 +00004088 hash_add_item(ht, hi, child->wn_u1.hashkey, hash);
Bram Moolenaar51485f02005-06-04 21:55:20 +00004089 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004090 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00004091 *tot += len;
4092
4093 /*
4094 * Make a hash key for the node and its siblings, so that we can quickly
4095 * find a lookalike node. This must be done after compressing the sibling
4096 * list, otherwise the hash key would become invalid by the compression.
4097 */
Bram Moolenaar0c405862005-06-22 22:26:26 +00004098 node->wn_u1.hashkey[0] = len;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004099 nr = 0;
4100 for (np = node; np != NULL; np = np->wn_sibling)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004101 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00004102 if (np->wn_byte == NUL)
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004103 /* end node: use wn_flags, wn_region and wn_prefixID */
4104 n = np->wn_flags + (np->wn_region << 8) + (np->wn_prefixID << 16);
Bram Moolenaar51485f02005-06-04 21:55:20 +00004105 else
4106 /* byte node: use the byte value and the child pointer */
4107 n = np->wn_byte + ((long_u)np->wn_child << 8);
4108 nr = nr * 101 + n;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004109 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00004110
4111 /* Avoid NUL bytes, it terminates the hash key. */
4112 n = nr & 0xff;
Bram Moolenaar0c405862005-06-22 22:26:26 +00004113 node->wn_u1.hashkey[1] = n == 0 ? 1 : n;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004114 n = (nr >> 8) & 0xff;
Bram Moolenaar0c405862005-06-22 22:26:26 +00004115 node->wn_u1.hashkey[2] = n == 0 ? 1 : n;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004116 n = (nr >> 16) & 0xff;
Bram Moolenaar0c405862005-06-22 22:26:26 +00004117 node->wn_u1.hashkey[3] = n == 0 ? 1 : n;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004118 n = (nr >> 24) & 0xff;
Bram Moolenaar0c405862005-06-22 22:26:26 +00004119 node->wn_u1.hashkey[4] = n == 0 ? 1 : n;
4120 node->wn_u1.hashkey[5] = NUL;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004121
4122 return compressed;
4123}
4124
4125/*
4126 * Return TRUE when two nodes have identical siblings and children.
4127 */
4128 static int
4129node_equal(n1, n2)
4130 wordnode_T *n1;
4131 wordnode_T *n2;
4132{
4133 wordnode_T *p1;
4134 wordnode_T *p2;
4135
4136 for (p1 = n1, p2 = n2; p1 != NULL && p2 != NULL;
4137 p1 = p1->wn_sibling, p2 = p2->wn_sibling)
4138 if (p1->wn_byte != p2->wn_byte
4139 || (p1->wn_byte == NUL
4140 ? (p1->wn_flags != p2->wn_flags
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004141 || p1->wn_region != p2->wn_region
4142 || p1->wn_prefixID != p2->wn_prefixID)
Bram Moolenaar51485f02005-06-04 21:55:20 +00004143 : (p1->wn_child != p2->wn_child)))
4144 break;
4145
4146 return p1 == NULL && p2 == NULL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004147}
4148
4149/*
4150 * Write a number to file "fd", MSB first, in "len" bytes.
4151 */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00004152 void
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004153put_bytes(fd, nr, len)
4154 FILE *fd;
4155 long_u nr;
4156 int len;
4157{
4158 int i;
4159
4160 for (i = len - 1; i >= 0; --i)
4161 putc((int)(nr >> (i * 8)), fd);
4162}
4163
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004164static int
4165#ifdef __BORLANDC__
4166_RTLENTRYF
4167#endif
4168rep_compare __ARGS((const void *s1, const void *s2));
4169
4170/*
4171 * Function given to qsort() to sort the REP items on "from" string.
4172 */
4173 static int
4174#ifdef __BORLANDC__
4175_RTLENTRYF
4176#endif
4177rep_compare(s1, s2)
4178 const void *s1;
4179 const void *s2;
4180{
4181 fromto_T *p1 = (fromto_T *)s1;
4182 fromto_T *p2 = (fromto_T *)s2;
4183
4184 return STRCMP(p1->ft_from, p2->ft_from);
4185}
4186
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004187/*
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004188 * Write the Vim spell file "fname".
4189 */
4190 static void
Bram Moolenaar3982c542005-06-08 21:56:31 +00004191write_vim_spell(fname, spin)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004192 char_u *fname;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004193 spellinfo_T *spin;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004194{
Bram Moolenaar51485f02005-06-04 21:55:20 +00004195 FILE *fd;
4196 int regionmask;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004197 int round;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004198 wordnode_T *tree;
4199 int nodecount;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004200 int i;
4201 int l;
4202 garray_T *gap;
4203 fromto_T *ftp;
4204 char_u *p;
4205 int rr;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004206
Bram Moolenaarb765d632005-06-07 21:00:02 +00004207 fd = mch_fopen((char *)fname, "w");
Bram Moolenaar51485f02005-06-04 21:55:20 +00004208 if (fd == NULL)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004209 {
4210 EMSG2(_(e_notopen), fname);
4211 return;
4212 }
4213
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00004214 /* <HEADER>: <fileID> <regioncnt> <regionname> ...
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004215 * <charflagslen> <charflags>
4216 * <fcharslen> <fchars>
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00004217 * <midwordlen> <midword>
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004218 * <prefcondcnt> <prefcond> ... */
Bram Moolenaar51485f02005-06-04 21:55:20 +00004219
4220 /* <fileID> */
4221 if (fwrite(VIMSPELLMAGIC, VIMSPELLMAGICL, (size_t)1, fd) != 1)
4222 EMSG(_(e_write));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004223
4224 /* write the region names if there is more than one */
Bram Moolenaar3982c542005-06-08 21:56:31 +00004225 if (spin->si_region_count > 1)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004226 {
Bram Moolenaar3982c542005-06-08 21:56:31 +00004227 putc(spin->si_region_count, fd); /* <regioncnt> <regionname> ... */
4228 fwrite(spin->si_region_name, (size_t)(spin->si_region_count * 2),
4229 (size_t)1, fd);
4230 regionmask = (1 << spin->si_region_count) - 1;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004231 }
4232 else
4233 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00004234 putc(0, fd);
4235 regionmask = 0;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004236 }
4237
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004238 /*
4239 * Write the table with character flags and table for case folding.
Bram Moolenaar6f3058f2005-04-24 21:58:05 +00004240 * <charflagslen> <charflags> <fcharlen> <fchars>
4241 * Skip this for ASCII, the table may conflict with the one used for
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004242 * 'encoding'.
4243 * Also skip this for an .add.spl file, the main spell file must contain
4244 * the table (avoids that it conflicts). File is shorter too.
4245 */
4246 if (spin->si_ascii || spin->si_add)
Bram Moolenaar6f3058f2005-04-24 21:58:05 +00004247 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00004248 putc(0, fd);
4249 putc(0, fd);
4250 putc(0, fd);
Bram Moolenaar6f3058f2005-04-24 21:58:05 +00004251 }
4252 else
Bram Moolenaar51485f02005-06-04 21:55:20 +00004253 write_spell_chartab(fd);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00004254
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00004255
4256 if (spin->si_midword == NULL)
4257 put_bytes(fd, 0L, 2); /* <midwordlen> */
4258 else
4259 {
4260 i = STRLEN(spin->si_midword);
4261 put_bytes(fd, (long_u)i, 2); /* <midwordlen> */
4262 fwrite(spin->si_midword, (size_t)i, (size_t)1, fd); /* <midword> */
4263 }
4264
4265
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004266 /* Write the prefix conditions. */
4267 write_spell_prefcond(fd, &spin->si_prefcond);
4268
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00004269 /* <SUGGEST> : <repcount> <rep> ...
4270 * <salflags> <salcount> <sal> ...
4271 * <maplen> <mapstr> */
4272
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004273 /* Sort the REP items. */
4274 qsort(spin->si_rep.ga_data, (size_t)spin->si_rep.ga_len,
4275 sizeof(fromto_T), rep_compare);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004276
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004277 for (round = 1; round <= 2; ++round)
4278 {
4279 if (round == 1)
4280 gap = &spin->si_rep;
4281 else
4282 {
4283 gap = &spin->si_sal;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004284
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004285 i = 0;
4286 if (spin->si_followup)
4287 i |= SAL_F0LLOWUP;
4288 if (spin->si_collapse)
4289 i |= SAL_COLLAPSE;
4290 if (spin->si_rem_accents)
4291 i |= SAL_REM_ACCENTS;
4292 putc(i, fd); /* <salflags> */
4293 }
4294
4295 put_bytes(fd, (long_u)gap->ga_len, 2); /* <repcount> or <salcount> */
4296 for (i = 0; i < gap->ga_len; ++i)
4297 {
4298 /* <rep> : <repfromlen> <repfrom> <reptolen> <repto> */
4299 /* <sal> : <salfromlen> <salfrom> <saltolen> <salto> */
4300 ftp = &((fromto_T *)gap->ga_data)[i];
4301 for (rr = 1; rr <= 2; ++rr)
4302 {
4303 p = rr == 1 ? ftp->ft_from : ftp->ft_to;
4304 l = STRLEN(p);
4305 putc(l, fd);
4306 fwrite(p, l, (size_t)1, fd);
4307 }
4308 }
4309 }
4310
4311 put_bytes(fd, (long_u)spin->si_map.ga_len, 2); /* <maplen> */
4312 if (spin->si_map.ga_len > 0) /* <mapstr> */
4313 fwrite(spin->si_map.ga_data, (size_t)spin->si_map.ga_len,
4314 (size_t)1, fd);
Bram Moolenaar50cde822005-06-05 21:54:54 +00004315
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004316 /*
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004317 * <LWORDTREE> <KWORDTREE> <PREFIXTREE>
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004318 */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004319 spin->si_memtot = 0;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004320 for (round = 1; round <= 3; ++round)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004321 {
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004322 if (round == 1)
4323 tree = spin->si_foldroot;
4324 else if (round == 2)
4325 tree = spin->si_keeproot;
4326 else
4327 tree = spin->si_prefroot;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004328
Bram Moolenaar0c405862005-06-22 22:26:26 +00004329 /* Clear the index and wnode fields in the tree. */
4330 clear_node(tree);
4331
Bram Moolenaar51485f02005-06-04 21:55:20 +00004332 /* Count the number of nodes. Needed to be able to allocate the
Bram Moolenaar0c405862005-06-22 22:26:26 +00004333 * memory when reading the nodes. Also fills in index for shared
Bram Moolenaar51485f02005-06-04 21:55:20 +00004334 * nodes. */
Bram Moolenaar0c405862005-06-22 22:26:26 +00004335 nodecount = put_node(NULL, tree, 0, regionmask, round == 3);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004336
Bram Moolenaar51485f02005-06-04 21:55:20 +00004337 /* number of nodes in 4 bytes */
4338 put_bytes(fd, (long_u)nodecount, 4); /* <nodecount> */
Bram Moolenaar50cde822005-06-05 21:54:54 +00004339 spin->si_memtot += nodecount + nodecount * sizeof(int);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004340
Bram Moolenaar51485f02005-06-04 21:55:20 +00004341 /* Write the nodes. */
Bram Moolenaar0c405862005-06-22 22:26:26 +00004342 (void)put_node(fd, tree, 0, regionmask, round == 3);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004343 }
4344
Bram Moolenaar51485f02005-06-04 21:55:20 +00004345 fclose(fd);
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00004346}
4347
4348/*
Bram Moolenaar0c405862005-06-22 22:26:26 +00004349 * Clear the index and wnode fields of "node", it siblings and its
4350 * children. This is needed because they are a union with other items to save
4351 * space.
4352 */
4353 static void
4354clear_node(node)
4355 wordnode_T *node;
4356{
4357 wordnode_T *np;
4358
4359 if (node != NULL)
4360 for (np = node; np != NULL; np = np->wn_sibling)
4361 {
4362 np->wn_u1.index = 0;
4363 np->wn_u2.wnode = NULL;
4364
4365 if (np->wn_byte != NUL)
4366 clear_node(np->wn_child);
4367 }
4368}
4369
4370
4371/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00004372 * Dump a word tree at node "node".
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004373 *
Bram Moolenaar51485f02005-06-04 21:55:20 +00004374 * This first writes the list of possible bytes (siblings). Then for each
4375 * byte recursively write the children.
4376 *
4377 * NOTE: The code here must match the code in read_tree(), since assumptions
4378 * are made about the indexes (so that we don't have to write them in the
4379 * file).
4380 *
4381 * Returns the number of nodes used.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004382 */
Bram Moolenaar51485f02005-06-04 21:55:20 +00004383 static int
Bram Moolenaar0c405862005-06-22 22:26:26 +00004384put_node(fd, node, index, regionmask, prefixtree)
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004385 FILE *fd; /* NULL when only counting */
Bram Moolenaar51485f02005-06-04 21:55:20 +00004386 wordnode_T *node;
4387 int index;
4388 int regionmask;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004389 int prefixtree; /* TRUE for PREFIXTREE */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004390{
Bram Moolenaar51485f02005-06-04 21:55:20 +00004391 int newindex = index;
4392 int siblingcount = 0;
4393 wordnode_T *np;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004394 int flags;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004395
Bram Moolenaar51485f02005-06-04 21:55:20 +00004396 /* If "node" is zero the tree is empty. */
4397 if (node == NULL)
4398 return 0;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004399
Bram Moolenaar51485f02005-06-04 21:55:20 +00004400 /* Store the index where this node is written. */
Bram Moolenaar0c405862005-06-22 22:26:26 +00004401 node->wn_u1.index = index;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004402
4403 /* Count the number of siblings. */
4404 for (np = node; np != NULL; np = np->wn_sibling)
4405 ++siblingcount;
4406
4407 /* Write the sibling count. */
4408 if (fd != NULL)
4409 putc(siblingcount, fd); /* <siblingcount> */
4410
4411 /* Write each sibling byte and optionally extra info. */
4412 for (np = node; np != NULL; np = np->wn_sibling)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004413 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00004414 if (np->wn_byte == 0)
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00004415 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00004416 if (fd != NULL)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004417 {
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004418 /* For a NUL byte (end of word) write the flags etc. */
4419 if (prefixtree)
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00004420 {
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004421 /* In PREFIXTREE write the required prefixID and the
4422 * associated condition nr (stored in wn_region). */
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00004423 if (np->wn_flags == (char_u)-2)
4424 putc(BY_FLAGS, fd); /* <byte> rare */
4425 else
4426 putc(BY_NOFLAGS, fd); /* <byte> */
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004427 putc(np->wn_prefixID, fd); /* <prefixID> */
4428 put_bytes(fd, (long_u)np->wn_region, 2); /* <prefcondnr> */
Bram Moolenaar51485f02005-06-04 21:55:20 +00004429 }
4430 else
4431 {
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004432 /* For word trees we write the flag/region items. */
4433 flags = np->wn_flags;
4434 if (regionmask != 0 && np->wn_region != regionmask)
4435 flags |= WF_REGION;
4436 if (np->wn_prefixID != 0)
4437 flags |= WF_PFX;
4438 if (flags == 0)
4439 {
4440 /* word without flags or region */
4441 putc(BY_NOFLAGS, fd); /* <byte> */
4442 }
4443 else
4444 {
4445 putc(BY_FLAGS, fd); /* <byte> */
4446 putc(flags, fd); /* <flags> */
4447 if (flags & WF_REGION)
4448 putc(np->wn_region, fd); /* <region> */
4449 if (flags & WF_PFX)
4450 putc(np->wn_prefixID, fd); /* <prefixID> */
4451 }
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00004452 }
4453 }
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00004454 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00004455 else
4456 {
Bram Moolenaar0c405862005-06-22 22:26:26 +00004457 if (np->wn_child->wn_u1.index != 0
4458 && np->wn_child->wn_u2.wnode != node)
Bram Moolenaar51485f02005-06-04 21:55:20 +00004459 {
4460 /* The child is written elsewhere, write the reference. */
4461 if (fd != NULL)
4462 {
4463 putc(BY_INDEX, fd); /* <byte> */
4464 /* <nodeidx> */
Bram Moolenaar0c405862005-06-22 22:26:26 +00004465 put_bytes(fd, (long_u)np->wn_child->wn_u1.index, 3);
Bram Moolenaar51485f02005-06-04 21:55:20 +00004466 }
4467 }
Bram Moolenaar0c405862005-06-22 22:26:26 +00004468 else if (np->wn_child->wn_u2.wnode == NULL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00004469 /* We will write the child below and give it an index. */
Bram Moolenaar0c405862005-06-22 22:26:26 +00004470 np->wn_child->wn_u2.wnode = node;
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00004471
Bram Moolenaar51485f02005-06-04 21:55:20 +00004472 if (fd != NULL)
4473 if (putc(np->wn_byte, fd) == EOF) /* <byte> or <xbyte> */
4474 {
4475 EMSG(_(e_write));
4476 return 0;
4477 }
4478 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004479 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00004480
4481 /* Space used in the array when reading: one for each sibling and one for
4482 * the count. */
4483 newindex += siblingcount + 1;
4484
4485 /* Recursively dump the children of each sibling. */
4486 for (np = node; np != NULL; np = np->wn_sibling)
Bram Moolenaar0c405862005-06-22 22:26:26 +00004487 if (np->wn_byte != 0 && np->wn_child->wn_u2.wnode == node)
4488 newindex = put_node(fd, np->wn_child, newindex, regionmask,
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004489 prefixtree);
Bram Moolenaar51485f02005-06-04 21:55:20 +00004490
4491 return newindex;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004492}
4493
4494
4495/*
Bram Moolenaarb765d632005-06-07 21:00:02 +00004496 * ":mkspell [-ascii] outfile infile ..."
4497 * ":mkspell [-ascii] addfile"
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004498 */
4499 void
4500ex_mkspell(eap)
4501 exarg_T *eap;
4502{
4503 int fcount;
4504 char_u **fnames;
Bram Moolenaarb765d632005-06-07 21:00:02 +00004505 char_u *arg = eap->arg;
4506 int ascii = FALSE;
4507
4508 if (STRNCMP(arg, "-ascii", 6) == 0)
4509 {
4510 ascii = TRUE;
4511 arg = skipwhite(arg + 6);
4512 }
4513
4514 /* Expand all the remaining arguments (e.g., $VIMRUNTIME). */
4515 if (get_arglist_exp(arg, &fcount, &fnames) == OK)
4516 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004517 mkspell(fcount, fnames, ascii, eap->forceit, FALSE);
Bram Moolenaarb765d632005-06-07 21:00:02 +00004518 FreeWild(fcount, fnames);
4519 }
4520}
4521
4522/*
4523 * Create a Vim spell file from one or more word lists.
4524 * "fnames[0]" is the output file name.
4525 * "fnames[fcount - 1]" is the last input file name.
4526 * Exception: when "fnames[0]" ends in ".add" it's used as the input file name
4527 * and ".spl" is appended to make the output file name.
4528 */
4529 static void
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004530mkspell(fcount, fnames, ascii, overwrite, added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00004531 int fcount;
4532 char_u **fnames;
4533 int ascii; /* -ascii argument given */
4534 int overwrite; /* overwrite existing output file */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004535 int added_word; /* invoked through "zg" */
Bram Moolenaarb765d632005-06-07 21:00:02 +00004536{
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004537 char_u fname[MAXPATHL];
4538 char_u wfname[MAXPATHL];
Bram Moolenaarb765d632005-06-07 21:00:02 +00004539 char_u **innames;
4540 int incount;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004541 afffile_T *(afile[8]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004542 int i;
4543 int len;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004544 struct stat st;
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00004545 int error = FALSE;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004546 spellinfo_T spin;
4547
4548 vim_memset(&spin, 0, sizeof(spin));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004549 spin.si_verbose = !added_word;
Bram Moolenaarb765d632005-06-07 21:00:02 +00004550 spin.si_ascii = ascii;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004551 spin.si_followup = TRUE;
4552 spin.si_rem_accents = TRUE;
4553 ga_init2(&spin.si_rep, (int)sizeof(fromto_T), 20);
4554 ga_init2(&spin.si_sal, (int)sizeof(fromto_T), 20);
4555 ga_init2(&spin.si_map, (int)sizeof(char_u), 100);
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004556 ga_init2(&spin.si_prefcond, (int)sizeof(char_u *), 50);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004557
Bram Moolenaarb765d632005-06-07 21:00:02 +00004558 /* default: fnames[0] is output file, following are input files */
4559 innames = &fnames[1];
4560 incount = fcount - 1;
4561
4562 if (fcount >= 1)
Bram Moolenaar5482f332005-04-17 20:18:43 +00004563 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00004564 len = STRLEN(fnames[0]);
4565 if (fcount == 1 && len > 4 && STRCMP(fnames[0] + len - 4, ".add") == 0)
4566 {
4567 /* For ":mkspell path/en.latin1.add" output file is
4568 * "path/en.latin1.add.spl". */
4569 innames = &fnames[0];
4570 incount = 1;
4571 vim_snprintf((char *)wfname, sizeof(wfname), "%s.spl", fnames[0]);
4572 }
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00004573 else if (fcount == 1)
4574 {
4575 /* For ":mkspell path/vim" output file is "path/vim.latin1.spl". */
4576 innames = &fnames[0];
4577 incount = 1;
4578 vim_snprintf((char *)wfname, sizeof(wfname), "%s.%s.spl", fnames[0],
4579 spin.si_ascii ? (char_u *)"ascii" : spell_enc());
4580 }
Bram Moolenaarb765d632005-06-07 21:00:02 +00004581 else if (len > 4 && STRCMP(fnames[0] + len - 4, ".spl") == 0)
4582 {
4583 /* Name ends in ".spl", use as the file name. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004584 vim_strncpy(wfname, fnames[0], sizeof(wfname) - 1);
Bram Moolenaarb765d632005-06-07 21:00:02 +00004585 }
4586 else
4587 /* Name should be language, make the file name from it. */
4588 vim_snprintf((char *)wfname, sizeof(wfname), "%s.%s.spl", fnames[0],
4589 spin.si_ascii ? (char_u *)"ascii" : spell_enc());
4590
4591 /* Check for .ascii.spl. */
4592 if (strstr((char *)gettail(wfname), ".ascii.") != NULL)
4593 spin.si_ascii = TRUE;
4594
4595 /* Check for .add.spl. */
4596 if (strstr((char *)gettail(wfname), ".add.") != NULL)
4597 spin.si_add = TRUE;
Bram Moolenaar5482f332005-04-17 20:18:43 +00004598 }
4599
Bram Moolenaarb765d632005-06-07 21:00:02 +00004600 if (incount <= 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004601 EMSG(_(e_invarg)); /* need at least output and input names */
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00004602 else if (vim_strchr(gettail(wfname), '_') != NULL)
4603 EMSG(_("E751: Output file name must not have region name"));
Bram Moolenaarb765d632005-06-07 21:00:02 +00004604 else if (incount > 8)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004605 EMSG(_("E754: Only up to 8 regions supported"));
4606 else
4607 {
4608 /* Check for overwriting before doing things that may take a lot of
4609 * time. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00004610 if (!overwrite && mch_stat((char *)wfname, &st) >= 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004611 {
4612 EMSG(_(e_exists));
Bram Moolenaarb765d632005-06-07 21:00:02 +00004613 return;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004614 }
Bram Moolenaarb765d632005-06-07 21:00:02 +00004615 if (mch_isdir(wfname))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004616 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00004617 EMSG2(_(e_isadir2), wfname);
4618 return;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004619 }
4620
4621 /*
4622 * Init the aff and dic pointers.
4623 * Get the region names if there are more than 2 arguments.
4624 */
Bram Moolenaarb765d632005-06-07 21:00:02 +00004625 for (i = 0; i < incount; ++i)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004626 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00004627 afile[i] = NULL;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004628
Bram Moolenaar3982c542005-06-08 21:56:31 +00004629 if (incount > 1)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004630 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00004631 len = STRLEN(innames[i]);
4632 if (STRLEN(gettail(innames[i])) < 5
4633 || innames[i][len - 3] != '_')
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004634 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00004635 EMSG2(_("E755: Invalid region in %s"), innames[i]);
4636 return;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004637 }
Bram Moolenaar3982c542005-06-08 21:56:31 +00004638 spin.si_region_name[i * 2] = TOLOWER_ASC(innames[i][len - 2]);
4639 spin.si_region_name[i * 2 + 1] =
4640 TOLOWER_ASC(innames[i][len - 1]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004641 }
4642 }
Bram Moolenaar3982c542005-06-08 21:56:31 +00004643 spin.si_region_count = incount;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004644
Bram Moolenaar51485f02005-06-04 21:55:20 +00004645 spin.si_foldroot = wordtree_alloc(&spin.si_blocks);
4646 spin.si_keeproot = wordtree_alloc(&spin.si_blocks);
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004647 spin.si_prefroot = wordtree_alloc(&spin.si_blocks);
4648 if (spin.si_foldroot == NULL
4649 || spin.si_keeproot == NULL
4650 || spin.si_prefroot == NULL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00004651 {
4652 error = TRUE;
Bram Moolenaarb765d632005-06-07 21:00:02 +00004653 return;
Bram Moolenaar51485f02005-06-04 21:55:20 +00004654 }
4655
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00004656 /* When not producing a .add.spl file clear the character table when
4657 * we encounter one in the .aff file. This means we dump the current
4658 * one in the .spl file if the .aff file doesn't define one. That's
4659 * better than guessing the contents, the table will match a
4660 * previously loaded spell file. */
4661 if (!spin.si_add)
4662 spin.si_clear_chartab = TRUE;
4663
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004664 /*
4665 * Read all the .aff and .dic files.
4666 * Text is converted to 'encoding'.
Bram Moolenaar51485f02005-06-04 21:55:20 +00004667 * Words are stored in the case-folded and keep-case trees.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004668 */
Bram Moolenaarb765d632005-06-07 21:00:02 +00004669 for (i = 0; i < incount && !error; ++i)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004670 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00004671 spin.si_conv.vc_type = CONV_NONE;
Bram Moolenaarb765d632005-06-07 21:00:02 +00004672 spin.si_region = 1 << i;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004673
Bram Moolenaarb765d632005-06-07 21:00:02 +00004674 vim_snprintf((char *)fname, sizeof(fname), "%s.aff", innames[i]);
Bram Moolenaar51485f02005-06-04 21:55:20 +00004675 if (mch_stat((char *)fname, &st) >= 0)
4676 {
4677 /* Read the .aff file. Will init "spin->si_conv" based on the
4678 * "SET" line. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00004679 afile[i] = spell_read_aff(fname, &spin);
4680 if (afile[i] == NULL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00004681 error = TRUE;
4682 else
4683 {
4684 /* Read the .dic file and store the words in the trees. */
4685 vim_snprintf((char *)fname, sizeof(fname), "%s.dic",
Bram Moolenaarb765d632005-06-07 21:00:02 +00004686 innames[i]);
4687 if (spell_read_dic(fname, &spin, afile[i]) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00004688 error = TRUE;
4689 }
4690 }
4691 else
4692 {
4693 /* No .aff file, try reading the file as a word list. Store
4694 * the words in the trees. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00004695 if (spell_read_wordfile(innames[i], &spin) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00004696 error = TRUE;
4697 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004698
Bram Moolenaarb765d632005-06-07 21:00:02 +00004699#ifdef FEAT_MBYTE
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004700 /* Free any conversion stuff. */
Bram Moolenaar51485f02005-06-04 21:55:20 +00004701 convert_setup(&spin.si_conv, NULL, NULL);
Bram Moolenaarb765d632005-06-07 21:00:02 +00004702#endif
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004703 }
4704
Bram Moolenaar51485f02005-06-04 21:55:20 +00004705 if (!error)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004706 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00004707 /*
4708 * Remove the dummy NUL from the start of the tree root.
4709 */
4710 spin.si_foldroot = spin.si_foldroot->wn_sibling;
4711 spin.si_keeproot = spin.si_keeproot->wn_sibling;
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004712 spin.si_prefroot = spin.si_prefroot->wn_sibling;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004713
4714 /*
Bram Moolenaar51485f02005-06-04 21:55:20 +00004715 * Combine tails in the tree.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004716 */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004717 if (!added_word || p_verbose > 2)
Bram Moolenaarb765d632005-06-07 21:00:02 +00004718 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004719 if (added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00004720 verbose_enter();
4721 MSG(_("Compressing word tree..."));
4722 out_flush();
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004723 if (added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00004724 verbose_leave();
4725 }
4726 wordtree_compress(spin.si_foldroot, &spin);
4727 wordtree_compress(spin.si_keeproot, &spin);
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004728 wordtree_compress(spin.si_prefroot, &spin);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004729 }
4730
Bram Moolenaar51485f02005-06-04 21:55:20 +00004731 if (!error)
4732 {
4733 /*
4734 * Write the info in the spell file.
4735 */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004736 if (!added_word || p_verbose > 2)
Bram Moolenaarb765d632005-06-07 21:00:02 +00004737 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004738 if (added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00004739 verbose_enter();
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00004740 smsg((char_u *)_("Writing spell file %s ..."), wfname);
Bram Moolenaarb765d632005-06-07 21:00:02 +00004741 out_flush();
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004742 if (added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00004743 verbose_leave();
4744 }
Bram Moolenaar50cde822005-06-05 21:54:54 +00004745
Bram Moolenaar3982c542005-06-08 21:56:31 +00004746 write_vim_spell(wfname, &spin);
Bram Moolenaarb765d632005-06-07 21:00:02 +00004747
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004748 if (!added_word || p_verbose > 2)
Bram Moolenaarb765d632005-06-07 21:00:02 +00004749 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004750 if (added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00004751 verbose_enter();
4752 MSG(_("Done!"));
4753 smsg((char_u *)_("Estimated runtime memory use: %d bytes"),
Bram Moolenaar50cde822005-06-05 21:54:54 +00004754 spin.si_memtot);
Bram Moolenaarb765d632005-06-07 21:00:02 +00004755 out_flush();
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004756 if (added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00004757 verbose_leave();
4758 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00004759
Bram Moolenaarb765d632005-06-07 21:00:02 +00004760 /* If the file is loaded need to reload it. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004761 spell_reload_one(wfname, added_word);
Bram Moolenaar51485f02005-06-04 21:55:20 +00004762 }
4763
4764 /* Free the allocated memory. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004765 ga_clear(&spin.si_rep);
4766 ga_clear(&spin.si_sal);
4767 ga_clear(&spin.si_map);
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004768 ga_clear(&spin.si_prefcond);
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00004769 vim_free(spin.si_midword);
Bram Moolenaar51485f02005-06-04 21:55:20 +00004770
4771 /* Free the .aff file structures. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00004772 for (i = 0; i < incount; ++i)
4773 if (afile[i] != NULL)
4774 spell_free_aff(afile[i]);
Bram Moolenaar1d73c882005-06-19 22:48:47 +00004775
4776 /* Free all the bits and pieces at once. */
4777 free_blocks(spin.si_blocks);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004778 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004779}
4780
Bram Moolenaarb765d632005-06-07 21:00:02 +00004781
4782/*
4783 * ":spellgood {word}"
4784 * ":spellwrong {word}"
4785 */
4786 void
4787ex_spell(eap)
4788 exarg_T *eap;
4789{
4790 spell_add_word(eap->arg, STRLEN(eap->arg), eap->cmdidx == CMD_spellwrong);
4791}
4792
4793/*
4794 * Add "word[len]" to 'spellfile' as a good or bad word.
4795 */
4796 void
4797spell_add_word(word, len, bad)
4798 char_u *word;
4799 int len;
4800 int bad;
4801{
4802 FILE *fd;
4803 buf_T *buf;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00004804 int new_spf = FALSE;
4805 struct stat st;
Bram Moolenaarb765d632005-06-07 21:00:02 +00004806
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00004807 /* If 'spellfile' isn't set figure out a good default value. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00004808 if (*curbuf->b_p_spf == NUL)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00004809 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00004810 init_spellfile();
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00004811 new_spf = TRUE;
4812 }
4813
Bram Moolenaarb765d632005-06-07 21:00:02 +00004814 if (*curbuf->b_p_spf == NUL)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004815 EMSG(_("E764: 'spellfile' is not set"));
Bram Moolenaarb765d632005-06-07 21:00:02 +00004816 else
4817 {
4818 /* Check that the user isn't editing the .add file somewhere. */
4819 buf = buflist_findname_exp(curbuf->b_p_spf);
4820 if (buf != NULL && buf->b_ml.ml_mfp == NULL)
4821 buf = NULL;
4822 if (buf != NULL && bufIsChanged(buf))
4823 EMSG(_(e_bufloaded));
4824 else
4825 {
4826 fd = mch_fopen((char *)curbuf->b_p_spf, "a");
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00004827 if (fd == NULL && new_spf)
4828 {
4829 /* We just initialized the 'spellfile' option and can't open
4830 * the file. We may need to create the "spell" directory
4831 * first. We already checked the runtime directory is
4832 * writable in init_spellfile(). */
4833 STRCPY(NameBuff, curbuf->b_p_spf);
4834 *gettail_sep(NameBuff) = NUL;
4835 if (mch_stat((char *)NameBuff, &st) < 0)
4836 {
4837 /* The directory doesn't exist. Try creating it and
4838 * opening the file again. */
4839 vim_mkdir(NameBuff, 0755);
4840 fd = mch_fopen((char *)curbuf->b_p_spf, "a");
4841 }
4842 }
4843
Bram Moolenaarb765d632005-06-07 21:00:02 +00004844 if (fd == NULL)
4845 EMSG2(_(e_notopen), curbuf->b_p_spf);
4846 else
4847 {
4848 if (bad)
4849 fprintf(fd, "/!%.*s\n", len, word);
4850 else
4851 fprintf(fd, "%.*s\n", len, word);
4852 fclose(fd);
4853
4854 /* Update the .add.spl file. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004855 mkspell(1, &curbuf->b_p_spf, FALSE, TRUE, TRUE);
Bram Moolenaarb765d632005-06-07 21:00:02 +00004856
4857 /* If the .add file is edited somewhere, reload it. */
4858 if (buf != NULL)
4859 buf_reload(buf);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004860
4861 redraw_all_later(NOT_VALID);
Bram Moolenaarb765d632005-06-07 21:00:02 +00004862 }
4863 }
4864 }
4865}
4866
4867/*
4868 * Initialize 'spellfile' for the current buffer.
4869 */
4870 static void
4871init_spellfile()
4872{
4873 char_u buf[MAXPATHL];
4874 int l;
4875 slang_T *sl;
4876 char_u *rtp;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00004877 char_u *lend;
Bram Moolenaarb765d632005-06-07 21:00:02 +00004878
4879 if (*curbuf->b_p_spl != NUL && curbuf->b_langp.ga_len > 0)
4880 {
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00004881 /* Find the end of the language name. Exclude the region. */
4882 for (lend = curbuf->b_p_spl; *lend != NUL
4883 && vim_strchr((char_u *)",._", *lend) == NULL; ++lend)
4884 ;
4885
4886 /* Loop over all entries in 'runtimepath'. Use the first one where we
4887 * are allowed to write. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00004888 rtp = p_rtp;
4889 while (*rtp != NUL)
4890 {
4891 /* Copy the path from 'runtimepath' to buf[]. */
4892 copy_option_part(&rtp, buf, MAXPATHL, ",");
4893 if (filewritable(buf) == 2)
4894 {
Bram Moolenaar3982c542005-06-08 21:56:31 +00004895 /* Use the first language name from 'spelllang' and the
4896 * encoding used in the first loaded .spl file. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00004897 sl = LANGP_ENTRY(curbuf->b_langp, 0)->lp_slang;
4898 l = STRLEN(buf);
4899 vim_snprintf((char *)buf + l, MAXPATHL - l,
Bram Moolenaar3982c542005-06-08 21:56:31 +00004900 "/spell/%.*s.%s.add",
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00004901 (int)(lend - curbuf->b_p_spl), curbuf->b_p_spl,
Bram Moolenaarb765d632005-06-07 21:00:02 +00004902 strstr((char *)gettail(sl->sl_fname), ".ascii.") != NULL
4903 ? (char_u *)"ascii" : spell_enc());
4904 set_option_value((char_u *)"spellfile", 0L, buf, OPT_LOCAL);
4905 break;
4906 }
4907 }
4908 }
4909}
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00004910
Bram Moolenaar51485f02005-06-04 21:55:20 +00004911
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00004912/*
4913 * Init the chartab used for spelling for ASCII.
4914 * EBCDIC is not supported!
4915 */
4916 static void
4917clear_spell_chartab(sp)
4918 spelltab_T *sp;
4919{
Bram Moolenaar9f30f502005-06-14 22:01:04 +00004920 int i;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00004921
4922 /* Init everything to FALSE. */
4923 vim_memset(sp->st_isw, FALSE, sizeof(sp->st_isw));
4924 vim_memset(sp->st_isu, FALSE, sizeof(sp->st_isu));
4925 for (i = 0; i < 256; ++i)
Bram Moolenaar9f30f502005-06-14 22:01:04 +00004926 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00004927 sp->st_fold[i] = i;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00004928 sp->st_upper[i] = i;
4929 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00004930
4931 /* We include digits. A word shouldn't start with a digit, but handling
4932 * that is done separately. */
4933 for (i = '0'; i <= '9'; ++i)
4934 sp->st_isw[i] = TRUE;
4935 for (i = 'A'; i <= 'Z'; ++i)
4936 {
4937 sp->st_isw[i] = TRUE;
4938 sp->st_isu[i] = TRUE;
4939 sp->st_fold[i] = i + 0x20;
4940 }
4941 for (i = 'a'; i <= 'z'; ++i)
Bram Moolenaar9f30f502005-06-14 22:01:04 +00004942 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00004943 sp->st_isw[i] = TRUE;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00004944 sp->st_upper[i] = i - 0x20;
4945 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00004946}
4947
4948/*
4949 * Init the chartab used for spelling. Only depends on 'encoding'.
4950 * Called once while starting up and when 'encoding' changes.
4951 * The default is to use isalpha(), but the spell file should define the word
4952 * characters to make it possible that 'encoding' differs from the current
4953 * locale.
4954 */
4955 void
4956init_spell_chartab()
4957{
4958 int i;
4959
4960 did_set_spelltab = FALSE;
4961 clear_spell_chartab(&spelltab);
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00004962 vim_memset(spell_ismw, FALSE, sizeof(spell_ismw));
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00004963#ifdef FEAT_MBYTE
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00004964 vim_free(spell_ismw_mb);
4965 spell_ismw_mb = NULL;
4966
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00004967 if (enc_dbcs)
4968 {
4969 /* DBCS: assume double-wide characters are word characters. */
4970 for (i = 128; i <= 255; ++i)
4971 if (MB_BYTE2LEN(i) == 2)
4972 spelltab.st_isw[i] = TRUE;
4973 }
Bram Moolenaar9f30f502005-06-14 22:01:04 +00004974 else if (enc_utf8)
4975 {
4976 for (i = 128; i < 256; ++i)
4977 {
4978 spelltab.st_isu[i] = utf_isupper(i);
4979 spelltab.st_isw[i] = spelltab.st_isu[i] || utf_islower(i);
4980 spelltab.st_fold[i] = utf_fold(i);
4981 spelltab.st_upper[i] = utf_toupper(i);
4982 }
4983 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00004984 else
4985#endif
4986 {
Bram Moolenaar9f30f502005-06-14 22:01:04 +00004987 /* Rough guess: use locale-dependent library functions. */
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00004988 for (i = 128; i < 256; ++i)
4989 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00004990 if (MB_ISUPPER(i))
4991 {
Bram Moolenaar9f30f502005-06-14 22:01:04 +00004992 spelltab.st_isw[i] = TRUE;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00004993 spelltab.st_isu[i] = TRUE;
4994 spelltab.st_fold[i] = MB_TOLOWER(i);
4995 }
Bram Moolenaar9f30f502005-06-14 22:01:04 +00004996 else if (MB_ISLOWER(i))
4997 {
4998 spelltab.st_isw[i] = TRUE;
4999 spelltab.st_upper[i] = MB_TOUPPER(i);
5000 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005001 }
5002 }
5003}
5004
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005005static char *e_affform = N_("E761: Format error in affix file FOL, LOW or UPP");
5006static char *e_affrange = N_("E762: Character in FOL, LOW or UPP is out of range");
5007
5008/*
5009 * Set the spell character tables from strings in the affix file.
5010 */
5011 static int
5012set_spell_chartab(fol, low, upp)
5013 char_u *fol;
5014 char_u *low;
5015 char_u *upp;
5016{
5017 /* We build the new tables here first, so that we can compare with the
5018 * previous one. */
5019 spelltab_T new_st;
5020 char_u *pf = fol, *pl = low, *pu = upp;
5021 int f, l, u;
5022
5023 clear_spell_chartab(&new_st);
5024
5025 while (*pf != NUL)
5026 {
5027 if (*pl == NUL || *pu == NUL)
5028 {
5029 EMSG(_(e_affform));
5030 return FAIL;
5031 }
5032#ifdef FEAT_MBYTE
5033 f = mb_ptr2char_adv(&pf);
5034 l = mb_ptr2char_adv(&pl);
5035 u = mb_ptr2char_adv(&pu);
5036#else
5037 f = *pf++;
5038 l = *pl++;
5039 u = *pu++;
5040#endif
5041 /* Every character that appears is a word character. */
5042 if (f < 256)
5043 new_st.st_isw[f] = TRUE;
5044 if (l < 256)
5045 new_st.st_isw[l] = TRUE;
5046 if (u < 256)
5047 new_st.st_isw[u] = TRUE;
5048
5049 /* if "LOW" and "FOL" are not the same the "LOW" char needs
5050 * case-folding */
5051 if (l < 256 && l != f)
5052 {
5053 if (f >= 256)
5054 {
5055 EMSG(_(e_affrange));
5056 return FAIL;
5057 }
5058 new_st.st_fold[l] = f;
5059 }
5060
5061 /* if "UPP" and "FOL" are not the same the "UPP" char needs
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005062 * case-folding, it's upper case and the "UPP" is the upper case of
5063 * "FOL" . */
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005064 if (u < 256 && u != f)
5065 {
5066 if (f >= 256)
5067 {
5068 EMSG(_(e_affrange));
5069 return FAIL;
5070 }
5071 new_st.st_fold[u] = f;
5072 new_st.st_isu[u] = TRUE;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005073 new_st.st_upper[f] = u;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005074 }
5075 }
5076
5077 if (*pl != NUL || *pu != NUL)
5078 {
5079 EMSG(_(e_affform));
5080 return FAIL;
5081 }
5082
5083 return set_spell_finish(&new_st);
5084}
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005085
5086/*
5087 * Set the spell character tables from strings in the .spl file.
5088 */
5089 static int
5090set_spell_charflags(flags, cnt, upp)
5091 char_u *flags;
5092 int cnt;
5093 char_u *upp;
5094{
5095 /* We build the new tables here first, so that we can compare with the
5096 * previous one. */
5097 spelltab_T new_st;
5098 int i;
5099 char_u *p = upp;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005100 int c;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005101
5102 clear_spell_chartab(&new_st);
5103
5104 for (i = 0; i < cnt; ++i)
5105 {
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005106 new_st.st_isw[i + 128] = (flags[i] & CF_WORD) != 0;
5107 new_st.st_isu[i + 128] = (flags[i] & CF_UPPER) != 0;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005108
5109 if (*p == NUL)
5110 return FAIL;
5111#ifdef FEAT_MBYTE
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005112 c = mb_ptr2char_adv(&p);
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005113#else
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005114 c = *p++;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005115#endif
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005116 new_st.st_fold[i + 128] = c;
5117 if (i + 128 != c && new_st.st_isu[i + 128] && c < 256)
5118 new_st.st_upper[c] = i + 128;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005119 }
5120
5121 return set_spell_finish(&new_st);
5122}
5123
5124 static int
5125set_spell_finish(new_st)
5126 spelltab_T *new_st;
5127{
5128 int i;
5129
5130 if (did_set_spelltab)
5131 {
5132 /* check that it's the same table */
5133 for (i = 0; i < 256; ++i)
5134 {
5135 if (spelltab.st_isw[i] != new_st->st_isw[i]
5136 || spelltab.st_isu[i] != new_st->st_isu[i]
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005137 || spelltab.st_fold[i] != new_st->st_fold[i]
5138 || spelltab.st_upper[i] != new_st->st_upper[i])
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005139 {
5140 EMSG(_("E763: Word characters differ between spell files"));
5141 return FAIL;
5142 }
5143 }
5144 }
5145 else
5146 {
5147 /* copy the new spelltab into the one being used */
5148 spelltab = *new_st;
5149 did_set_spelltab = TRUE;
5150 }
5151
5152 return OK;
5153}
5154
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005155/*
Bram Moolenaarea408852005-06-25 22:49:46 +00005156 * Return TRUE if "p" points to a word character.
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00005157 * As a special case we see "midword" characters as word character when it is
Bram Moolenaarea408852005-06-25 22:49:46 +00005158 * followed by a word character. This finds they'there but not 'they there'.
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00005159 * Thus this only works properly when past the first character of the word.
Bram Moolenaarea408852005-06-25 22:49:46 +00005160 */
5161 static int
5162spell_iswordp(p)
5163 char_u *p;
5164{
Bram Moolenaarea408852005-06-25 22:49:46 +00005165#ifdef FEAT_MBYTE
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00005166 char_u *s;
5167 int l;
5168 int c;
5169
5170 if (has_mbyte)
5171 {
5172 l = MB_BYTE2LEN(*p);
5173 s = p;
5174 if (l == 1)
5175 {
5176 /* be quick for ASCII */
5177 if (spell_ismw[*p])
5178 {
5179 s = p + 1; /* skip a mid-word character */
5180 l = MB_BYTE2LEN(*s);
5181 }
5182 }
5183 else
5184 {
5185 c = mb_ptr2char(p);
5186 if (c < 256 ? spell_ismw[c] : (spell_ismw_mb != NULL
5187 && vim_strchr(spell_ismw_mb, c) != NULL))
5188 {
5189 s = p + l;
5190 l = MB_BYTE2LEN(*s);
5191 }
5192 }
5193
5194 if (l > 1)
5195 return mb_get_class(s) >= 2;
5196 return spelltab.st_isw[*s];
5197 }
Bram Moolenaarea408852005-06-25 22:49:46 +00005198#endif
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00005199
5200 return spelltab.st_isw[spell_ismw[*p] ? p[1] : p[0]];
Bram Moolenaarea408852005-06-25 22:49:46 +00005201}
5202
5203/*
Bram Moolenaar1d73c882005-06-19 22:48:47 +00005204 * Write the table with prefix conditions to the .spl file.
5205 */
5206 static void
5207write_spell_prefcond(fd, gap)
5208 FILE *fd;
5209 garray_T *gap;
5210{
5211 int i;
5212 char_u *p;
5213 int len;
5214
5215 put_bytes(fd, (long_u)gap->ga_len, 2); /* <prefcondcnt> */
5216
5217 for (i = 0; i < gap->ga_len; ++i)
5218 {
5219 /* <prefcond> : <condlen> <condstr> */
5220 p = ((char_u **)gap->ga_data)[i];
5221 if (p == NULL)
5222 fputc(0, fd);
5223 else
5224 {
5225 len = STRLEN(p);
5226 fputc(len, fd);
5227 fwrite(p, (size_t)len, (size_t)1, fd);
5228 }
5229 }
5230}
5231
5232/*
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005233 * Write the current tables into the .spl file.
5234 * This makes sure the same characters are recognized as word characters when
5235 * generating an when using a spell file.
5236 */
5237 static void
5238write_spell_chartab(fd)
5239 FILE *fd;
5240{
5241 char_u charbuf[256 * 4];
5242 int len = 0;
5243 int flags;
5244 int i;
5245
5246 fputc(128, fd); /* <charflagslen> */
5247 for (i = 128; i < 256; ++i)
5248 {
5249 flags = 0;
5250 if (spelltab.st_isw[i])
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005251 flags |= CF_WORD;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005252 if (spelltab.st_isu[i])
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005253 flags |= CF_UPPER;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005254 fputc(flags, fd); /* <charflags> */
5255
Bram Moolenaarb765d632005-06-07 21:00:02 +00005256#ifdef FEAT_MBYTE
5257 if (has_mbyte)
5258 len += mb_char2bytes(spelltab.st_fold[i], charbuf + len);
5259 else
5260#endif
5261 charbuf[len++] = spelltab.st_fold[i];
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005262 }
5263
5264 put_bytes(fd, (long_u)len, 2); /* <fcharlen> */
5265 fwrite(charbuf, (size_t)len, (size_t)1, fd); /* <fchars> */
5266}
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005267
5268/*
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005269 * Case-fold "str[len]" into "buf[buflen]". The result is NUL terminated.
5270 * Uses the character definitions from the .spl file.
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005271 * When using a multi-byte 'encoding' the length may change!
5272 * Returns FAIL when something wrong.
5273 */
5274 static int
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005275spell_casefold(str, len, buf, buflen)
5276 char_u *str;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005277 int len;
5278 char_u *buf;
5279 int buflen;
5280{
5281 int i;
5282
5283 if (len >= buflen)
5284 {
5285 buf[0] = NUL;
5286 return FAIL; /* result will not fit */
5287 }
5288
5289#ifdef FEAT_MBYTE
5290 if (has_mbyte)
5291 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005292 int outi = 0;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005293 char_u *p;
5294 int c;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005295
5296 /* Fold one character at a time. */
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005297 for (p = str; p < str + len; )
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005298 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005299 if (outi + MB_MAXBYTES > buflen)
5300 {
5301 buf[outi] = NUL;
5302 return FAIL;
5303 }
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005304 c = mb_ptr2char_adv(&p);
5305 outi += mb_char2bytes(SPELL_TOFOLD(c), buf + outi);
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005306 }
5307 buf[outi] = NUL;
5308 }
5309 else
5310#endif
5311 {
5312 /* Be quick for non-multibyte encodings. */
5313 for (i = 0; i < len; ++i)
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005314 buf[i] = spelltab.st_fold[str[i]];
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005315 buf[i] = NUL;
5316 }
5317
5318 return OK;
5319}
5320
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005321/*
5322 * "z?": Find badly spelled word under or after the cursor.
5323 * Give suggestions for the properly spelled word.
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005324 */
5325 void
5326spell_suggest()
5327{
5328 char_u *line;
5329 pos_T prev_cursor = curwin->w_cursor;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005330 char_u wcopy[MAXWLEN + 2];
5331 char_u *p;
5332 int i;
5333 int c;
5334 suginfo_T sug;
5335 suggest_T *stp;
5336
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005337 /* Find the start of the badly spelled word. */
Bram Moolenaar0c405862005-06-22 22:26:26 +00005338 if (spell_move_to(FORWARD, TRUE, TRUE) == FAIL
5339 || curwin->w_cursor.col > prev_cursor.col)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005340 {
Bram Moolenaar0c405862005-06-22 22:26:26 +00005341 if (!curwin->w_p_spell || *curbuf->b_p_spl == NUL)
5342 return;
5343
5344 /* No bad word or it starts after the cursor: use the word under the
5345 * cursor. */
5346 curwin->w_cursor = prev_cursor;
5347 line = ml_get_curline();
5348 p = line + curwin->w_cursor.col;
5349 /* Backup to before start of word. */
5350 while (p > line && SPELL_ISWORDP(p))
5351 mb_ptr_back(line, p);
5352 /* Forward to start of word. */
5353 while (!SPELL_ISWORDP(p))
5354 mb_ptr_adv(p);
5355
5356 if (!SPELL_ISWORDP(p)) /* No word found. */
5357 {
5358 beep_flush();
5359 return;
5360 }
5361 curwin->w_cursor.col = p - line;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005362 }
5363
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005364 /* Get the word and its length. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005365 line = ml_get_curline();
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005366
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005367 /* Get the list of suggestions */
Bram Moolenaarea408852005-06-25 22:49:46 +00005368 spell_find_suggest(line + curwin->w_cursor.col, &sug, (int)Rows - 2, TRUE);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005369
5370 if (sug.su_ga.ga_len == 0)
5371 MSG(_("Sorry, no suggestions"));
5372 else
5373 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005374 /* List the suggestions. */
5375 msg_start();
5376 vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"),
5377 sug.su_badlen, sug.su_badptr);
5378 msg_puts(IObuff);
5379 msg_clr_eos();
5380 msg_putchar('\n');
Bram Moolenaar0c405862005-06-22 22:26:26 +00005381
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005382 msg_scroll = TRUE;
5383 for (i = 0; i < sug.su_ga.ga_len; ++i)
5384 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005385 stp = &SUG(sug.su_ga, i);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005386
5387 /* The suggested word may replace only part of the bad word, add
5388 * the not replaced part. */
5389 STRCPY(wcopy, stp->st_word);
5390 if (sug.su_badlen > stp->st_orglen)
5391 vim_strncpy(wcopy + STRLEN(wcopy),
5392 sug.su_badptr + stp->st_orglen,
5393 sug.su_badlen - stp->st_orglen);
Bram Moolenaar0c405862005-06-22 22:26:26 +00005394 vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\""), i + 1, wcopy);
5395 msg_puts(IObuff);
5396
5397 /* The word may replace more than "su_badlen". */
5398 if (sug.su_badlen < stp->st_orglen)
5399 {
5400 vim_snprintf((char *)IObuff, IOSIZE, _(" < \"%.*s\""),
5401 stp->st_orglen, sug.su_badptr);
5402 msg_puts(IObuff);
5403 }
5404
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005405 if (p_verbose > 0)
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005406 {
Bram Moolenaar0c405862005-06-22 22:26:26 +00005407 /* Add the score. */
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00005408 if (sps_flags & (SPS_DOUBLE | SPS_BEST))
Bram Moolenaar0c405862005-06-22 22:26:26 +00005409 vim_snprintf((char *)IObuff, IOSIZE, _(" (%s%d - %d)"),
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005410 stp->st_salscore ? "s " : "",
5411 stp->st_score, stp->st_altscore);
5412 else
Bram Moolenaar0c405862005-06-22 22:26:26 +00005413 vim_snprintf((char *)IObuff, IOSIZE, _(" (%d)"),
5414 stp->st_score);
5415 msg_advance(30);
5416 msg_puts(IObuff);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005417 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005418 lines_left = 3; /* avoid more prompt */
5419 msg_putchar('\n');
5420 }
5421
5422 /* Ask for choice. */
5423 i = prompt_for_number();
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005424 if (i > 0 && i <= sug.su_ga.ga_len && u_save_cursor() == OK)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005425 {
5426 /* Replace the word. */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005427 stp = &SUG(sug.su_ga, i - 1);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005428 p = alloc(STRLEN(line) - stp->st_orglen + STRLEN(stp->st_word) + 1);
5429 if (p != NULL)
5430 {
5431 c = sug.su_badptr - line;
5432 mch_memmove(p, line, c);
5433 STRCPY(p + c, stp->st_word);
5434 STRCAT(p, sug.su_badptr + stp->st_orglen);
5435 ml_replace(curwin->w_cursor.lnum, p, FALSE);
5436 curwin->w_cursor.col = c;
5437 changed_bytes(curwin->w_cursor.lnum, c);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005438
5439 /* For redo we use a change-word command. */
5440 ResetRedobuff();
5441 AppendToRedobuff((char_u *)"ciw");
5442 AppendToRedobuff(stp->st_word);
5443 AppendCharToRedobuff(ESC);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005444 }
5445 }
5446 else
5447 curwin->w_cursor = prev_cursor;
5448 }
5449
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005450 spell_find_cleanup(&sug);
5451}
5452
5453/*
5454 * Find spell suggestions for "word". Return them in the growarray "*gap" as
5455 * a list of allocated strings.
5456 */
5457 void
5458spell_suggest_list(gap, word, maxcount)
5459 garray_T *gap;
5460 char_u *word;
5461 int maxcount; /* maximum nr of suggestions */
5462{
5463 suginfo_T sug;
5464 int i;
5465 suggest_T *stp;
5466 char_u *wcopy;
5467
Bram Moolenaarea408852005-06-25 22:49:46 +00005468 spell_find_suggest(word, &sug, maxcount, FALSE);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005469
5470 /* Make room in "gap". */
5471 ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1);
5472 if (ga_grow(gap, sug.su_ga.ga_len) == FAIL)
5473 return;
5474
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005475 for (i = 0; i < sug.su_ga.ga_len; ++i)
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005476 {
5477 stp = &SUG(sug.su_ga, i);
5478
5479 /* The suggested word may replace only part of "word", add the not
5480 * replaced part. */
5481 wcopy = alloc(STRLEN(stp->st_word)
5482 + STRLEN(sug.su_badptr + stp->st_orglen) + 1);
5483 if (wcopy == NULL)
5484 break;
5485 STRCPY(wcopy, stp->st_word);
5486 STRCAT(wcopy, sug.su_badptr + stp->st_orglen);
5487 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy;
5488 }
5489
5490 spell_find_cleanup(&sug);
5491}
5492
5493/*
5494 * Find spell suggestions for the word at the start of "badptr".
5495 * Return the suggestions in "su->su_ga".
5496 * The maximum number of suggestions is "maxcount".
5497 * Note: does use info for the current window.
5498 * This is based on the mechanisms of Aspell, but completely reimplemented.
5499 */
5500 static void
Bram Moolenaarea408852005-06-25 22:49:46 +00005501spell_find_suggest(badptr, su, maxcount, banbadword)
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005502 char_u *badptr;
5503 suginfo_T *su;
5504 int maxcount;
Bram Moolenaarea408852005-06-25 22:49:46 +00005505 int banbadword; /* don't include badword in suggestions */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005506{
5507 int attr;
5508
5509 /*
5510 * Set the info in "*su".
5511 */
5512 vim_memset(su, 0, sizeof(suginfo_T));
5513 ga_init2(&su->su_ga, (int)sizeof(suggest_T), 10);
5514 ga_init2(&su->su_sga, (int)sizeof(suggest_T), 10);
Bram Moolenaar0a5fe212005-06-24 23:01:23 +00005515 if (*badptr == NUL)
5516 return;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005517 hash_init(&su->su_banned);
5518
5519 su->su_badptr = badptr;
5520 su->su_badlen = spell_check(curwin, su->su_badptr, &attr);
5521 su->su_maxcount = maxcount;
5522
5523 if (su->su_badlen >= MAXWLEN)
5524 su->su_badlen = MAXWLEN - 1; /* just in case */
5525 vim_strncpy(su->su_badword, su->su_badptr, su->su_badlen);
5526 (void)spell_casefold(su->su_badptr, su->su_badlen,
5527 su->su_fbadword, MAXWLEN);
Bram Moolenaar0c405862005-06-22 22:26:26 +00005528 /* get caps flags for bad word */
5529 su->su_badflags = captype(su->su_badptr, su->su_badptr + su->su_badlen);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005530
5531 /* Ban the bad word itself. It may appear in another region. */
Bram Moolenaarea408852005-06-25 22:49:46 +00005532 if (banbadword)
5533 add_banned(su, su->su_badword);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005534
5535 /*
Bram Moolenaar0c405862005-06-22 22:26:26 +00005536 * 1. Try special cases, such as repeating a word: "the the" -> "the".
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005537 *
5538 * Set a maximum score to limit the combination of operations that is
5539 * tried.
5540 */
5541 su->su_maxscore = SCORE_MAXINIT;
Bram Moolenaar0c405862005-06-22 22:26:26 +00005542 suggest_try_special(su);
5543
5544 /*
5545 * 2. Try inserting/deleting/swapping/changing a letter, use REP entries
5546 * from the .aff file and inserting a space (split the word).
5547 */
5548 suggest_try_change(su);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005549
5550 /* For the resulting top-scorers compute the sound-a-like score. */
5551 if (sps_flags & SPS_DOUBLE)
5552 score_comp_sal(su);
5553
5554 /*
Bram Moolenaar0c405862005-06-22 22:26:26 +00005555 * 3. Try finding sound-a-like words.
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005556 *
5557 * Only do this when we don't have a lot of suggestions yet, because it's
5558 * very slow and often doesn't find new suggestions.
5559 */
5560 if ((sps_flags & SPS_DOUBLE)
5561 || (!(sps_flags & SPS_FAST)
5562 && su->su_ga.ga_len < SUG_CLEAN_COUNT(su)))
5563 {
5564 /* Allow a higher score now. */
5565 su->su_maxscore = SCORE_MAXMAX;
Bram Moolenaar0c405862005-06-22 22:26:26 +00005566 suggest_try_soundalike(su);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005567 }
5568
5569 /* When CTRL-C was hit while searching do show the results. */
5570 ui_breakcheck();
5571 if (got_int)
5572 {
5573 (void)vgetc();
5574 got_int = FALSE;
5575 }
5576
5577 if (sps_flags & SPS_DOUBLE)
5578 {
5579 /* Combine the two list of suggestions. */
5580 score_combine(su);
5581 }
5582 else if (su->su_ga.ga_len != 0)
5583 {
5584 if (sps_flags & SPS_BEST)
5585 /* Adjust the word score for how it sounds like. */
5586 rescore_suggestions(su);
5587
5588 /* Sort the suggestions and truncate at "maxcount". */
5589 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, maxcount);
5590 }
5591}
5592
5593/*
5594 * Free the info put in "*su" by spell_find_suggest().
5595 */
5596 static void
5597spell_find_cleanup(su)
5598 suginfo_T *su;
5599{
5600 int i;
5601
5602 /* Free the suggestions. */
5603 for (i = 0; i < su->su_ga.ga_len; ++i)
5604 vim_free(SUG(su->su_ga, i).st_word);
5605 ga_clear(&su->su_ga);
5606 for (i = 0; i < su->su_sga.ga_len; ++i)
5607 vim_free(SUG(su->su_sga, i).st_word);
5608 ga_clear(&su->su_sga);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005609
5610 /* Free the banned words. */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005611 free_banned(su);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005612}
5613
5614/*
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005615 * Make a copy of "word", with the first letter upper or lower cased, to
5616 * "wcopy[MAXWLEN]". "word" must not be empty.
5617 * The result is NUL terminated.
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005618 */
5619 static void
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005620onecap_copy(word, wcopy, upper)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005621 char_u *word;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005622 char_u *wcopy;
5623 int upper; /* TRUE: first letter made upper case */
5624{
5625 char_u *p;
5626 int c;
5627 int l;
5628
5629 p = word;
5630#ifdef FEAT_MBYTE
5631 if (has_mbyte)
5632 c = mb_ptr2char_adv(&p);
5633 else
5634#endif
5635 c = *p++;
5636 if (upper)
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005637 c = SPELL_TOUPPER(c);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005638 else
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005639 c = SPELL_TOFOLD(c);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005640#ifdef FEAT_MBYTE
5641 if (has_mbyte)
5642 l = mb_char2bytes(c, wcopy);
5643 else
5644#endif
5645 {
5646 l = 1;
5647 wcopy[0] = c;
5648 }
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005649 vim_strncpy(wcopy + l, p, MAXWLEN - l);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005650}
5651
5652/*
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005653 * Make a copy of "word" with all the letters upper cased into
5654 * "wcopy[MAXWLEN]". The result is NUL terminated.
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005655 */
5656 static void
5657allcap_copy(word, wcopy)
5658 char_u *word;
5659 char_u *wcopy;
5660{
5661 char_u *s;
5662 char_u *d;
5663 int c;
5664
5665 d = wcopy;
5666 for (s = word; *s != NUL; )
5667 {
5668#ifdef FEAT_MBYTE
5669 if (has_mbyte)
5670 c = mb_ptr2char_adv(&s);
5671 else
5672#endif
5673 c = *s++;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005674 c = SPELL_TOUPPER(c);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005675
5676#ifdef FEAT_MBYTE
5677 if (has_mbyte)
5678 {
5679 if (d - wcopy >= MAXWLEN - MB_MAXBYTES)
5680 break;
5681 d += mb_char2bytes(c, d);
5682 }
5683 else
5684#endif
5685 {
5686 if (d - wcopy >= MAXWLEN - 1)
5687 break;
5688 *d++ = c;
5689 }
5690 }
5691 *d = NUL;
5692}
5693
5694/*
Bram Moolenaar0c405862005-06-22 22:26:26 +00005695 * Try finding suggestions by recognizing specific situations.
5696 */
5697 static void
5698suggest_try_special(su)
5699 suginfo_T *su;
5700{
5701 char_u *p;
5702 int len;
5703 int c;
5704 char_u word[MAXWLEN];
5705
5706 /*
5707 * Recognize a word that is repeated: "the the".
5708 */
5709 p = skiptowhite(su->su_fbadword);
5710 len = p - su->su_fbadword;
5711 p = skipwhite(p);
5712 if (STRLEN(p) == len && STRNCMP(su->su_fbadword, p, len) == 0)
5713 {
5714 /* Include badflags: if the badword is onecap or allcap
5715 * use that for the goodword too: "The the" -> "The". */
5716 c = su->su_fbadword[len];
5717 su->su_fbadword[len] = NUL;
5718 make_case_word(su->su_fbadword, word, su->su_badflags);
5719 su->su_fbadword[len] = c;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00005720 add_suggestion(su, &su->su_ga, word, su->su_badlen, SCORE_DEL, 0, TRUE);
Bram Moolenaar0c405862005-06-22 22:26:26 +00005721 }
5722}
5723
5724/*
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005725 * Try finding suggestions by adding/removing/swapping letters.
Bram Moolenaarea424162005-06-16 21:51:00 +00005726 *
5727 * This uses a state machine. At each node in the tree we try various
5728 * operations. When trying if an operation work "depth" is increased and the
5729 * stack[] is used to store info. This allows combinations, thus insert one
5730 * character, replace one and delete another. The number of changes is
5731 * limited by su->su_maxscore, checked in try_deeper().
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005732 */
5733 static void
Bram Moolenaar0c405862005-06-22 22:26:26 +00005734suggest_try_change(su)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005735 suginfo_T *su;
5736{
5737 char_u fword[MAXWLEN]; /* copy of the bad word, case-folded */
5738 char_u tword[MAXWLEN]; /* good word collected so far */
5739 trystate_T stack[MAXWLEN];
5740 char_u preword[MAXWLEN * 3]; /* word found with proper case (appended
5741 * to for word split) */
5742 char_u prewordlen = 0; /* length of word in "preword" */
5743 int splitoff = 0; /* index in tword after last split */
5744 trystate_T *sp;
5745 int newscore;
5746 langp_T *lp;
5747 char_u *byts;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005748 idx_T *idxs;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005749 int depth;
Bram Moolenaarea424162005-06-16 21:51:00 +00005750 int c, c2, c3;
5751 int n = 0;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005752 int flags;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005753 garray_T *gap;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005754 idx_T arridx;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005755 int len;
5756 char_u *p;
5757 fromto_T *ftp;
Bram Moolenaarea424162005-06-16 21:51:00 +00005758 int fl = 0, tl;
Bram Moolenaar0c405862005-06-22 22:26:26 +00005759 int repextra = 0; /* extra bytes in fword[] from REP item */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005760
5761 /* We make a copy of the case-folded bad word, so that we can modify it
Bram Moolenaar0c405862005-06-22 22:26:26 +00005762 * to find matches (esp. REP items). Append some more text, changing
5763 * chars after the bad word may help. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005764 STRCPY(fword, su->su_fbadword);
Bram Moolenaar0c405862005-06-22 22:26:26 +00005765 n = STRLEN(fword);
5766 p = su->su_badptr + su->su_badlen;
5767 (void)spell_casefold(p, STRLEN(p), fword + n, MAXWLEN - n);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005768
5769 for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0);
5770 lp->lp_slang != NULL; ++lp)
5771 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005772 /*
5773 * Go through the whole case-fold tree, try changes at each node.
5774 * "tword[]" contains the word collected from nodes in the tree.
5775 * "fword[]" the word we are trying to match with (initially the bad
5776 * word).
5777 */
5778 byts = lp->lp_slang->sl_fbyts;
5779 idxs = lp->lp_slang->sl_fidxs;
5780
5781 depth = 0;
5782 stack[0].ts_state = STATE_START;
5783 stack[0].ts_score = 0;
5784 stack[0].ts_curi = 1;
5785 stack[0].ts_fidx = 0;
5786 stack[0].ts_fidxtry = 0;
5787 stack[0].ts_twordlen = 0;
5788 stack[0].ts_arridx = 0;
Bram Moolenaarea424162005-06-16 21:51:00 +00005789#ifdef FEAT_MBYTE
5790 stack[0].ts_tcharlen = 0;
5791#endif
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005792
Bram Moolenaarea424162005-06-16 21:51:00 +00005793 /*
5794 * Loop to find all suggestions. At each round we either:
5795 * - For the current state try one operation, advance "ts_curi",
5796 * increase "depth".
5797 * - When a state is done go to the next, set "ts_state".
5798 * - When all states are tried decrease "depth".
5799 */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005800 while (depth >= 0 && !got_int)
5801 {
5802 sp = &stack[depth];
5803 switch (sp->ts_state)
5804 {
5805 case STATE_START:
5806 /*
5807 * Start of node: Deal with NUL bytes, which means
5808 * tword[] may end here.
5809 */
5810 arridx = sp->ts_arridx; /* current node in the tree */
5811 len = byts[arridx]; /* bytes in this node */
5812 arridx += sp->ts_curi; /* index of current byte */
5813
Bram Moolenaar0c405862005-06-22 22:26:26 +00005814 if (sp->ts_curi > len || byts[arridx] != 0)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005815 {
5816 /* Past bytes in node and/or past NUL bytes. */
5817 sp->ts_state = STATE_ENDNUL;
5818 break;
5819 }
5820
5821 /*
5822 * End of word in tree.
5823 */
5824 ++sp->ts_curi; /* eat one NUL byte */
5825
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005826 flags = (int)idxs[arridx];
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005827
5828 /*
5829 * Form the word with proper case in preword.
5830 * If there is a word from a previous split, append.
5831 */
5832 tword[sp->ts_twordlen] = NUL;
5833 if (flags & WF_KEEPCAP)
5834 /* Must find the word in the keep-case tree. */
5835 find_keepcap_word(lp->lp_slang, tword + splitoff,
5836 preword + prewordlen);
5837 else
Bram Moolenaar0c405862005-06-22 22:26:26 +00005838 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005839 /* Include badflags: if the badword is onecap or allcap
Bram Moolenaar0c405862005-06-22 22:26:26 +00005840 * use that for the goodword too. But if the badword is
5841 * allcap and it's only one char long use onecap. */
5842 c = su->su_badflags;
5843 if ((c & WF_ALLCAP)
5844#ifdef FEAT_MBYTE
5845 && su->su_badlen == mb_ptr2len_check(su->su_badptr)
5846#else
5847 && su->su_badlen == 1
5848#endif
5849 )
5850 c = WF_ONECAP;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005851 make_case_word(tword + splitoff,
Bram Moolenaar0c405862005-06-22 22:26:26 +00005852 preword + prewordlen, flags | c);
5853 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005854
5855 /* Don't use a banned word. It may appear again as a good
5856 * word, thus remember it. */
5857 if (flags & WF_BANNED)
5858 {
5859 add_banned(su, preword + prewordlen);
5860 break;
5861 }
5862 if (was_banned(su, preword + prewordlen))
5863 break;
5864
5865 newscore = 0;
5866 if ((flags & WF_REGION)
5867 && (((unsigned)flags >> 8) & lp->lp_region) == 0)
5868 newscore += SCORE_REGION;
5869 if (flags & WF_RARE)
5870 newscore += SCORE_RARE;
5871
Bram Moolenaar0c405862005-06-22 22:26:26 +00005872 if (!spell_valid_case(su->su_badflags,
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005873 captype(preword + prewordlen, NULL)))
5874 newscore += SCORE_ICASE;
5875
Bram Moolenaar0c405862005-06-22 22:26:26 +00005876 if ((fword[sp->ts_fidx] == NUL
Bram Moolenaarea408852005-06-25 22:49:46 +00005877 || !spell_iswordp(fword + sp->ts_fidx))
Bram Moolenaar0c405862005-06-22 22:26:26 +00005878 && sp->ts_fidx >= sp->ts_fidxtry)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005879 {
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00005880 /* The badword also ends: add suggestions. Give a penalty
5881 * when changing non-word char to word char, e.g., "thes,"
5882 * -> "these". */
5883 p = fword + sp->ts_fidx;
5884#ifdef FEAT_MBYTE
5885 if (has_mbyte)
5886 mb_ptr_back(fword, p);
5887 else
5888#endif
5889 --p;
5890 if (!spell_iswordp(p))
5891 {
5892 p = preword + STRLEN(preword);
5893#ifdef FEAT_MBYTE
5894 if (has_mbyte)
5895 mb_ptr_back(preword, p);
5896 else
5897#endif
5898 --p;
5899 if (spell_iswordp(p))
5900 newscore += SCORE_NONWORD;
5901 }
5902
Bram Moolenaard857f0e2005-06-21 22:37:39 +00005903 add_suggestion(su, &su->su_ga, preword,
Bram Moolenaar0c405862005-06-22 22:26:26 +00005904 sp->ts_fidx - repextra,
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00005905 sp->ts_score + newscore, 0, FALSE);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005906 }
Bram Moolenaarea424162005-06-16 21:51:00 +00005907 else if (sp->ts_fidx >= sp->ts_fidxtry
5908#ifdef FEAT_MBYTE
5909 /* Don't split halfway a character. */
5910 && (!has_mbyte || sp->ts_tcharlen == 0)
5911#endif
5912 )
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005913 {
5914 /* The word in the tree ends but the badword
5915 * continues: try inserting a space and check that a valid
5916 * words starts at fword[sp->ts_fidx]. */
5917 if (try_deeper(su, stack, depth, newscore + SCORE_SPLIT))
5918 {
5919 /* Save things to be restored at STATE_SPLITUNDO. */
5920 sp->ts_save_prewordlen = prewordlen;
Bram Moolenaar0c405862005-06-22 22:26:26 +00005921 sp->ts_save_badflags = su->su_badflags;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005922 sp->ts_save_splitoff = splitoff;
5923
5924 /* Append a space to preword. */
5925 STRCAT(preword, " ");
5926 prewordlen = STRLEN(preword);
5927 splitoff = sp->ts_twordlen;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00005928#ifdef FEAT_MBYTE
5929 if (has_mbyte)
5930 {
5931 int i = 0;
5932
5933 /* Case-folding may change the number of bytes:
5934 * Count nr of chars in fword[sp->ts_fidx] and
5935 * advance that many chars in su->su_badptr. */
5936 for (p = fword; p < fword + sp->ts_fidx;
5937 mb_ptr_adv(p))
5938 ++i;
5939 for (p = su->su_badptr; i > 0; mb_ptr_adv(p))
5940 --i;
5941 }
5942 else
5943#endif
5944 p = su->su_badptr + sp->ts_fidx;
Bram Moolenaar0c405862005-06-22 22:26:26 +00005945 su->su_badflags = captype(p, su->su_badptr
5946 + su->su_badlen);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005947
5948 sp->ts_state = STATE_SPLITUNDO;
5949 ++depth;
5950 /* Restart at top of the tree. */
5951 stack[depth].ts_arridx = 0;
5952 }
5953 }
5954 break;
5955
5956 case STATE_SPLITUNDO:
Bram Moolenaar0c405862005-06-22 22:26:26 +00005957 /* Undo the changes done for word split. */
5958 su->su_badflags = sp->ts_save_badflags;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005959 splitoff = sp->ts_save_splitoff;
5960 prewordlen = sp->ts_save_prewordlen;
5961
5962 /* Continue looking for NUL bytes. */
5963 sp->ts_state = STATE_START;
5964 break;
5965
5966 case STATE_ENDNUL:
5967 /* Past the NUL bytes in the node. */
Bram Moolenaar0c405862005-06-22 22:26:26 +00005968 if (fword[sp->ts_fidx] == NUL)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00005969 {
5970 /* The badword ends, can't use the bytes in this node. */
5971 sp->ts_state = STATE_DEL;
5972 break;
5973 }
5974 sp->ts_state = STATE_PLAIN;
5975 /*FALLTHROUGH*/
5976
5977 case STATE_PLAIN:
5978 /*
5979 * Go over all possible bytes at this node, add each to
5980 * tword[] and use child node. "ts_curi" is the index.
5981 */
5982 arridx = sp->ts_arridx;
5983 if (sp->ts_curi > byts[arridx])
5984 {
5985 /* Done all bytes at this node, do next state. When still
5986 * at already changed bytes skip the other tricks. */
5987 if (sp->ts_fidx >= sp->ts_fidxtry)
5988 sp->ts_state = STATE_DEL;
5989 else
5990 sp->ts_state = STATE_FINAL;
5991 }
5992 else
5993 {
5994 arridx += sp->ts_curi++;
5995 c = byts[arridx];
5996
5997 /* Normal byte, go one level deeper. If it's not equal to
5998 * the byte in the bad word adjust the score. But don't
5999 * even try when the byte was already changed. */
Bram Moolenaarea424162005-06-16 21:51:00 +00006000 if (c == fword[sp->ts_fidx]
6001#ifdef FEAT_MBYTE
6002 || (sp->ts_tcharlen > 0
6003 && sp->ts_isdiff != DIFF_NONE)
Bram Moolenaar9f30f502005-06-14 22:01:04 +00006004#endif
Bram Moolenaarea424162005-06-16 21:51:00 +00006005 )
6006 newscore = 0;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006007 else
6008 newscore = SCORE_SUBST;
6009 if ((newscore == 0 || sp->ts_fidx >= sp->ts_fidxtry)
6010 && try_deeper(su, stack, depth, newscore))
6011 {
6012 ++depth;
Bram Moolenaarea424162005-06-16 21:51:00 +00006013 sp = &stack[depth];
6014 ++sp->ts_fidx;
6015 tword[sp->ts_twordlen++] = c;
6016 sp->ts_arridx = idxs[arridx];
6017#ifdef FEAT_MBYTE
6018 if (newscore == SCORE_SUBST)
6019 sp->ts_isdiff = DIFF_YES;
6020 if (has_mbyte)
6021 {
6022 /* Multi-byte characters are a bit complicated to
6023 * handle: They differ when any of the bytes
6024 * differ and then their length may also differ. */
6025 if (sp->ts_tcharlen == 0)
6026 {
6027 /* First byte. */
6028 sp->ts_tcharidx = 0;
6029 sp->ts_tcharlen = MB_BYTE2LEN(c);
6030 sp->ts_fcharstart = sp->ts_fidx - 1;
6031 sp->ts_isdiff = (newscore != 0)
6032 ? DIFF_YES : DIFF_NONE;
6033 }
6034 else if (sp->ts_isdiff == DIFF_INSERT)
6035 /* When inserting trail bytes don't advance in
6036 * the bad word. */
6037 --sp->ts_fidx;
6038 if (++sp->ts_tcharidx == sp->ts_tcharlen)
6039 {
6040 /* Last byte of character. */
6041 if (sp->ts_isdiff == DIFF_YES)
6042 {
6043 /* Correct ts_fidx for the byte length of
6044 * the character (we didn't check that
6045 * before). */
6046 sp->ts_fidx = sp->ts_fcharstart
6047 + MB_BYTE2LEN(
6048 fword[sp->ts_fcharstart]);
6049
6050 /* For a similar character adjust score
6051 * from SCORE_SUBST to SCORE_SIMILAR. */
6052 if (lp->lp_slang->sl_has_map
6053 && similar_chars(lp->lp_slang,
6054 mb_ptr2char(tword
6055 + sp->ts_twordlen
6056 - sp->ts_tcharlen),
6057 mb_ptr2char(fword
6058 + sp->ts_fcharstart)))
6059 sp->ts_score -=
6060 SCORE_SUBST - SCORE_SIMILAR;
6061 }
Bram Moolenaarea408852005-06-25 22:49:46 +00006062 else if (sp->ts_isdiff == DIFF_INSERT
6063 && sp->ts_twordlen > sp->ts_tcharlen)
6064 {
6065 /* If the previous character was the same,
6066 * thus doubling a character, give a bonus
6067 * to the score. */
6068 p = tword + sp->ts_twordlen
6069 - sp->ts_tcharlen;
6070 c = mb_ptr2char(p);
6071 mb_ptr_back(tword, p);
6072 if (c == mb_ptr2char(p))
6073 sp->ts_score -= SCORE_INS
6074 - SCORE_INSDUP;
6075 }
Bram Moolenaarea424162005-06-16 21:51:00 +00006076
6077 /* Starting a new char, reset the length. */
6078 sp->ts_tcharlen = 0;
6079 }
6080 }
6081 else
6082#endif
6083 {
6084 /* If we found a similar char adjust the score.
6085 * We do this after calling try_deeper() because
6086 * it's slow. */
6087 if (newscore != 0
6088 && lp->lp_slang->sl_has_map
6089 && similar_chars(lp->lp_slang,
6090 c, fword[sp->ts_fidx - 1]))
6091 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR;
6092 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006093 }
6094 }
6095 break;
6096
6097 case STATE_DEL:
Bram Moolenaarea424162005-06-16 21:51:00 +00006098#ifdef FEAT_MBYTE
6099 /* When past the first byte of a multi-byte char don't try
6100 * delete/insert/swap a character. */
6101 if (has_mbyte && sp->ts_tcharlen > 0)
6102 {
6103 sp->ts_state = STATE_FINAL;
6104 break;
6105 }
6106#endif
6107 /*
6108 * Try skipping one character in the bad word (delete it).
6109 */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006110 sp->ts_state = STATE_INS;
6111 sp->ts_curi = 1;
6112 if (fword[sp->ts_fidx] != NUL
6113 && try_deeper(su, stack, depth, SCORE_DEL))
6114 {
6115 ++depth;
Bram Moolenaarea408852005-06-25 22:49:46 +00006116
6117 /* Advance over the character in fword[]. Give a bonus to
6118 * the score if the same character is following "nn" ->
6119 * "n". */
Bram Moolenaarea424162005-06-16 21:51:00 +00006120#ifdef FEAT_MBYTE
6121 if (has_mbyte)
Bram Moolenaarea408852005-06-25 22:49:46 +00006122 {
6123 c = mb_ptr2char(fword + sp->ts_fidx);
Bram Moolenaarea424162005-06-16 21:51:00 +00006124 stack[depth].ts_fidx += MB_BYTE2LEN(fword[sp->ts_fidx]);
Bram Moolenaarea408852005-06-25 22:49:46 +00006125 if (c == mb_ptr2char(fword + stack[depth].ts_fidx))
6126 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
6127 }
Bram Moolenaarea424162005-06-16 21:51:00 +00006128 else
6129#endif
Bram Moolenaarea408852005-06-25 22:49:46 +00006130 {
Bram Moolenaarea424162005-06-16 21:51:00 +00006131 ++stack[depth].ts_fidx;
Bram Moolenaarea408852005-06-25 22:49:46 +00006132 if (fword[sp->ts_fidx] == fword[sp->ts_fidx + 1])
6133 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
6134 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006135 break;
6136 }
6137 /*FALLTHROUGH*/
6138
6139 case STATE_INS:
Bram Moolenaarea424162005-06-16 21:51:00 +00006140 /* Insert one byte. Do this for each possible byte at this
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006141 * node. */
6142 n = sp->ts_arridx;
6143 if (sp->ts_curi > byts[n])
6144 {
6145 /* Done all bytes at this node, do next state. */
6146 sp->ts_state = STATE_SWAP;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006147 }
6148 else
6149 {
Bram Moolenaarea424162005-06-16 21:51:00 +00006150 /* Do one more byte at this node. Skip NUL bytes. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006151 n += sp->ts_curi++;
6152 c = byts[n];
6153 if (c != 0 && try_deeper(su, stack, depth, SCORE_INS))
6154 {
6155 ++depth;
Bram Moolenaarea424162005-06-16 21:51:00 +00006156 sp = &stack[depth];
6157 tword[sp->ts_twordlen++] = c;
6158 sp->ts_arridx = idxs[n];
6159#ifdef FEAT_MBYTE
6160 if (has_mbyte)
6161 {
6162 fl = MB_BYTE2LEN(c);
6163 if (fl > 1)
6164 {
6165 /* There are following bytes for the same
6166 * character. We must find all bytes before
6167 * trying delete/insert/swap/etc. */
6168 sp->ts_tcharlen = fl;
6169 sp->ts_tcharidx = 1;
6170 sp->ts_isdiff = DIFF_INSERT;
6171 }
6172 }
Bram Moolenaarea408852005-06-25 22:49:46 +00006173 else
6174 fl = 1;
6175 if (fl == 1)
Bram Moolenaarea424162005-06-16 21:51:00 +00006176#endif
Bram Moolenaarea408852005-06-25 22:49:46 +00006177 {
6178 /* If the previous character was the same, thus
6179 * doubling a character, give a bonus to the
6180 * score. */
6181 if (sp->ts_twordlen >= 2
6182 && tword[sp->ts_twordlen - 2] == c)
6183 sp->ts_score -= SCORE_INS - SCORE_INSDUP;
6184 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006185 }
6186 }
6187 break;
6188
6189 case STATE_SWAP:
Bram Moolenaarea424162005-06-16 21:51:00 +00006190 /*
6191 * Swap two bytes in the bad word: "12" -> "21".
6192 * We change "fword" here, it's changed back afterwards.
6193 */
6194 p = fword + sp->ts_fidx;
6195 c = *p;
6196 if (c == NUL)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006197 {
Bram Moolenaarea424162005-06-16 21:51:00 +00006198 /* End of word, can't swap or replace. */
6199 sp->ts_state = STATE_FINAL;
6200 break;
6201 }
6202#ifdef FEAT_MBYTE
6203 if (has_mbyte)
6204 {
6205 n = mb_ptr2len_check(p);
6206 c = mb_ptr2char(p);
6207 c2 = mb_ptr2char(p + n);
6208 }
6209 else
6210#endif
6211 c2 = p[1];
6212 if (c == c2)
6213 {
6214 /* Characters are identical, swap won't do anything. */
6215 sp->ts_state = STATE_SWAP3;
6216 break;
6217 }
6218 if (c2 != NUL && try_deeper(su, stack, depth, SCORE_SWAP))
6219 {
6220 sp->ts_state = STATE_UNSWAP;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006221 ++depth;
Bram Moolenaarea424162005-06-16 21:51:00 +00006222#ifdef FEAT_MBYTE
6223 if (has_mbyte)
6224 {
6225 fl = mb_char2len(c2);
6226 mch_memmove(p, p + n, fl);
6227 mb_char2bytes(c, p + fl);
6228 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
6229 }
6230 else
6231#endif
6232 {
6233 p[0] = c2;
6234 p[1] = c;
6235 stack[depth].ts_fidxtry = sp->ts_fidx + 2;
6236 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006237 }
6238 else
6239 /* If this swap doesn't work then SWAP3 won't either. */
6240 sp->ts_state = STATE_REP_INI;
6241 break;
6242
Bram Moolenaarea424162005-06-16 21:51:00 +00006243 case STATE_UNSWAP:
6244 /* Undo the STATE_SWAP swap: "21" -> "12". */
6245 p = fword + sp->ts_fidx;
6246#ifdef FEAT_MBYTE
6247 if (has_mbyte)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006248 {
Bram Moolenaarea424162005-06-16 21:51:00 +00006249 n = MB_BYTE2LEN(*p);
6250 c = mb_ptr2char(p + n);
6251 mch_memmove(p + MB_BYTE2LEN(p[n]), p, n);
6252 mb_char2bytes(c, p);
6253 }
6254 else
6255#endif
6256 {
6257 c = *p;
6258 *p = p[1];
6259 p[1] = c;
6260 }
6261 /*FALLTHROUGH*/
6262
6263 case STATE_SWAP3:
6264 /* Swap two bytes, skipping one: "123" -> "321". We change
6265 * "fword" here, it's changed back afterwards. */
6266 p = fword + sp->ts_fidx;
6267#ifdef FEAT_MBYTE
6268 if (has_mbyte)
6269 {
6270 n = mb_ptr2len_check(p);
6271 c = mb_ptr2char(p);
6272 fl = mb_ptr2len_check(p + n);
6273 c2 = mb_ptr2char(p + n);
6274 c3 = mb_ptr2char(p + n + fl);
6275 }
6276 else
6277#endif
6278 {
6279 c = *p;
6280 c2 = p[1];
6281 c3 = p[2];
6282 }
6283
6284 /* When characters are identical: "121" then SWAP3 result is
6285 * identical, ROT3L result is same as SWAP: "211", ROT3L
6286 * result is same as SWAP on next char: "112". Thus skip all
6287 * swapping. Also skip when c3 is NUL. */
6288 if (c == c3 || c3 == NUL)
6289 {
6290 sp->ts_state = STATE_REP_INI;
6291 break;
6292 }
6293 if (try_deeper(su, stack, depth, SCORE_SWAP3))
6294 {
6295 sp->ts_state = STATE_UNSWAP3;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006296 ++depth;
Bram Moolenaarea424162005-06-16 21:51:00 +00006297#ifdef FEAT_MBYTE
6298 if (has_mbyte)
6299 {
6300 tl = mb_char2len(c3);
6301 mch_memmove(p, p + n + fl, tl);
6302 mb_char2bytes(c2, p + tl);
6303 mb_char2bytes(c, p + fl + tl);
6304 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl;
6305 }
6306 else
6307#endif
6308 {
6309 p[0] = p[2];
6310 p[2] = c;
6311 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
6312 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006313 }
6314 else
6315 sp->ts_state = STATE_REP_INI;
6316 break;
6317
Bram Moolenaarea424162005-06-16 21:51:00 +00006318 case STATE_UNSWAP3:
6319 /* Undo STATE_SWAP3: "321" -> "123" */
6320 p = fword + sp->ts_fidx;
6321#ifdef FEAT_MBYTE
6322 if (has_mbyte)
6323 {
6324 n = MB_BYTE2LEN(*p);
6325 c2 = mb_ptr2char(p + n);
6326 fl = MB_BYTE2LEN(p[n]);
6327 c = mb_ptr2char(p + n + fl);
6328 tl = MB_BYTE2LEN(p[n + fl]);
6329 mch_memmove(p + fl + tl, p, n);
6330 mb_char2bytes(c, p);
6331 mb_char2bytes(c2, p + tl);
6332 }
6333 else
6334#endif
6335 {
6336 c = *p;
6337 *p = p[2];
6338 p[2] = c;
6339 }
Bram Moolenaarea424162005-06-16 21:51:00 +00006340
Bram Moolenaarea424162005-06-16 21:51:00 +00006341 /* Rotate three characters left: "123" -> "231". We change
6342 * "fword" here, it's changed back afterwards. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006343 if (try_deeper(su, stack, depth, SCORE_SWAP3))
6344 {
Bram Moolenaarea424162005-06-16 21:51:00 +00006345 sp->ts_state = STATE_UNROT3L;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006346 ++depth;
Bram Moolenaarea424162005-06-16 21:51:00 +00006347 p = fword + sp->ts_fidx;
6348#ifdef FEAT_MBYTE
6349 if (has_mbyte)
6350 {
6351 n = mb_ptr2len_check(p);
6352 c = mb_ptr2char(p);
6353 fl = mb_ptr2len_check(p + n);
6354 fl += mb_ptr2len_check(p + n + fl);
6355 mch_memmove(p, p + n, fl);
6356 mb_char2bytes(c, p + fl);
6357 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
6358 }
6359 else
6360#endif
6361 {
6362 c = *p;
6363 *p = p[1];
6364 p[1] = p[2];
6365 p[2] = c;
6366 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
6367 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006368 }
6369 else
6370 sp->ts_state = STATE_REP_INI;
6371 break;
6372
Bram Moolenaarea424162005-06-16 21:51:00 +00006373 case STATE_UNROT3L:
Bram Moolenaar0c405862005-06-22 22:26:26 +00006374 /* Undo ROT3L: "231" -> "123" */
Bram Moolenaarea424162005-06-16 21:51:00 +00006375 p = fword + sp->ts_fidx;
6376#ifdef FEAT_MBYTE
6377 if (has_mbyte)
6378 {
6379 n = MB_BYTE2LEN(*p);
6380 n += MB_BYTE2LEN(p[n]);
6381 c = mb_ptr2char(p + n);
6382 tl = MB_BYTE2LEN(p[n]);
6383 mch_memmove(p + tl, p, n);
6384 mb_char2bytes(c, p);
6385 }
6386 else
6387#endif
6388 {
6389 c = p[2];
6390 p[2] = p[1];
6391 p[1] = *p;
6392 *p = c;
6393 }
Bram Moolenaarea424162005-06-16 21:51:00 +00006394
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006395 /* Rotate three bytes right: "123" -> "312". We change
Bram Moolenaarea424162005-06-16 21:51:00 +00006396 * "fword" here, it's changed back afterwards. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006397 if (try_deeper(su, stack, depth, SCORE_SWAP3))
6398 {
Bram Moolenaarea424162005-06-16 21:51:00 +00006399 sp->ts_state = STATE_UNROT3R;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006400 ++depth;
Bram Moolenaarea424162005-06-16 21:51:00 +00006401 p = fword + sp->ts_fidx;
6402#ifdef FEAT_MBYTE
6403 if (has_mbyte)
6404 {
6405 n = mb_ptr2len_check(p);
6406 n += mb_ptr2len_check(p + n);
6407 c = mb_ptr2char(p + n);
6408 tl = mb_ptr2len_check(p + n);
6409 mch_memmove(p + tl, p, n);
6410 mb_char2bytes(c, p);
6411 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl;
6412 }
6413 else
6414#endif
6415 {
6416 c = p[2];
6417 p[2] = p[1];
6418 p[1] = *p;
6419 *p = c;
6420 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
6421 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006422 }
6423 else
6424 sp->ts_state = STATE_REP_INI;
6425 break;
6426
Bram Moolenaarea424162005-06-16 21:51:00 +00006427 case STATE_UNROT3R:
Bram Moolenaar0c405862005-06-22 22:26:26 +00006428 /* Undo ROT3R: "312" -> "123" */
Bram Moolenaarea424162005-06-16 21:51:00 +00006429 p = fword + sp->ts_fidx;
6430#ifdef FEAT_MBYTE
6431 if (has_mbyte)
6432 {
6433 c = mb_ptr2char(p);
6434 tl = MB_BYTE2LEN(*p);
6435 n = MB_BYTE2LEN(p[tl]);
6436 n += MB_BYTE2LEN(p[tl + n]);
6437 mch_memmove(p, p + tl, n);
6438 mb_char2bytes(c, p + n);
6439 }
6440 else
6441#endif
6442 {
6443 c = *p;
6444 *p = p[1];
6445 p[1] = p[2];
6446 p[2] = c;
6447 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006448 /*FALLTHROUGH*/
6449
6450 case STATE_REP_INI:
6451 /* Check if matching with REP items from the .aff file would
6452 * work. Quickly skip if there are no REP items or the score
6453 * is going to be too high anyway. */
6454 gap = &lp->lp_slang->sl_rep;
6455 if (gap->ga_len == 0
6456 || sp->ts_score + SCORE_REP >= su->su_maxscore)
6457 {
6458 sp->ts_state = STATE_FINAL;
6459 break;
6460 }
6461
6462 /* Use the first byte to quickly find the first entry that
Bram Moolenaarea424162005-06-16 21:51:00 +00006463 * may match. If the index is -1 there is none. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006464 sp->ts_curi = lp->lp_slang->sl_rep_first[fword[sp->ts_fidx]];
6465 if (sp->ts_curi < 0)
6466 {
6467 sp->ts_state = STATE_FINAL;
6468 break;
6469 }
6470
6471 sp->ts_state = STATE_REP;
6472 /*FALLTHROUGH*/
6473
6474 case STATE_REP:
6475 /* Try matching with REP items from the .aff file. For each
Bram Moolenaarea424162005-06-16 21:51:00 +00006476 * match replace the characters and check if the resulting
6477 * word is valid. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006478 p = fword + sp->ts_fidx;
6479
6480 gap = &lp->lp_slang->sl_rep;
6481 while (sp->ts_curi < gap->ga_len)
6482 {
6483 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++;
6484 if (*ftp->ft_from != *p)
6485 {
6486 /* past possible matching entries */
6487 sp->ts_curi = gap->ga_len;
6488 break;
6489 }
6490 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0
6491 && try_deeper(su, stack, depth, SCORE_REP))
6492 {
6493 /* Need to undo this afterwards. */
6494 sp->ts_state = STATE_REP_UNDO;
6495
6496 /* Change the "from" to the "to" string. */
6497 ++depth;
6498 fl = STRLEN(ftp->ft_from);
6499 tl = STRLEN(ftp->ft_to);
6500 if (fl != tl)
Bram Moolenaar0c405862005-06-22 22:26:26 +00006501 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006502 mch_memmove(p + tl, p + fl, STRLEN(p + fl) + 1);
Bram Moolenaar0c405862005-06-22 22:26:26 +00006503 repextra += tl - fl;
6504 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006505 mch_memmove(p, ftp->ft_to, tl);
6506 stack[depth].ts_fidxtry = sp->ts_fidx + tl;
Bram Moolenaarea424162005-06-16 21:51:00 +00006507#ifdef FEAT_MBYTE
6508 stack[depth].ts_tcharlen = 0;
6509#endif
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006510 break;
6511 }
6512 }
6513
6514 if (sp->ts_curi >= gap->ga_len)
6515 /* No (more) matches. */
6516 sp->ts_state = STATE_FINAL;
6517
6518 break;
6519
6520 case STATE_REP_UNDO:
6521 /* Undo a REP replacement and continue with the next one. */
6522 ftp = (fromto_T *)lp->lp_slang->sl_rep.ga_data
6523 + sp->ts_curi - 1;
6524 fl = STRLEN(ftp->ft_from);
6525 tl = STRLEN(ftp->ft_to);
6526 p = fword + sp->ts_fidx;
6527 if (fl != tl)
Bram Moolenaar0c405862005-06-22 22:26:26 +00006528 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006529 mch_memmove(p + fl, p + tl, STRLEN(p + tl) + 1);
Bram Moolenaar0c405862005-06-22 22:26:26 +00006530 repextra -= tl - fl;
6531 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006532 mch_memmove(p, ftp->ft_from, fl);
6533 sp->ts_state = STATE_REP;
6534 break;
6535
6536 default:
6537 /* Did all possible states at this level, go up one level. */
6538 --depth;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006539
Bram Moolenaard857f0e2005-06-21 22:37:39 +00006540 /* Don't check for CTRL-C too often, it takes time. */
6541 line_breakcheck();
6542 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006543 }
6544 }
6545}
6546
6547/*
6548 * Try going one level deeper in the tree.
6549 */
6550 static int
6551try_deeper(su, stack, depth, score_add)
6552 suginfo_T *su;
6553 trystate_T *stack;
6554 int depth;
6555 int score_add;
6556{
6557 int newscore;
6558
6559 /* Refuse to go deeper if the scrore is getting too big. */
6560 newscore = stack[depth].ts_score + score_add;
6561 if (newscore >= su->su_maxscore)
6562 return FALSE;
6563
Bram Moolenaarea424162005-06-16 21:51:00 +00006564 stack[depth + 1] = stack[depth];
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006565 stack[depth + 1].ts_state = STATE_START;
6566 stack[depth + 1].ts_score = newscore;
6567 stack[depth + 1].ts_curi = 1; /* start just after length byte */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006568 return TRUE;
6569}
6570
6571/*
6572 * "fword" is a good word with case folded. Find the matching keep-case
6573 * words and put it in "kword".
6574 * Theoretically there could be several keep-case words that result in the
6575 * same case-folded word, but we only find one...
6576 */
6577 static void
6578find_keepcap_word(slang, fword, kword)
6579 slang_T *slang;
6580 char_u *fword;
6581 char_u *kword;
6582{
6583 char_u uword[MAXWLEN]; /* "fword" in upper-case */
6584 int depth;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00006585 idx_T tryidx;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006586
6587 /* The following arrays are used at each depth in the tree. */
Bram Moolenaar9f30f502005-06-14 22:01:04 +00006588 idx_T arridx[MAXWLEN];
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006589 int round[MAXWLEN];
6590 int fwordidx[MAXWLEN];
6591 int uwordidx[MAXWLEN];
6592 int kwordlen[MAXWLEN];
6593
6594 int flen, ulen;
6595 int l;
6596 int len;
6597 int c;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00006598 idx_T lo, hi, m;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006599 char_u *p;
6600 char_u *byts = slang->sl_kbyts; /* array with bytes of the words */
Bram Moolenaar9f30f502005-06-14 22:01:04 +00006601 idx_T *idxs = slang->sl_kidxs; /* array with indexes */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006602
6603 if (byts == NULL)
6604 {
6605 /* array is empty: "cannot happen" */
6606 *kword = NUL;
6607 return;
6608 }
6609
6610 /* Make an all-cap version of "fword". */
6611 allcap_copy(fword, uword);
6612
6613 /*
6614 * Each character needs to be tried both case-folded and upper-case.
6615 * All this gets very complicated if we keep in mind that changing case
6616 * may change the byte length of a multi-byte character...
6617 */
6618 depth = 0;
6619 arridx[0] = 0;
6620 round[0] = 0;
6621 fwordidx[0] = 0;
6622 uwordidx[0] = 0;
6623 kwordlen[0] = 0;
6624 while (depth >= 0)
6625 {
6626 if (fword[fwordidx[depth]] == NUL)
6627 {
6628 /* We are at the end of "fword". If the tree allows a word to end
6629 * here we have found a match. */
6630 if (byts[arridx[depth] + 1] == 0)
6631 {
6632 kword[kwordlen[depth]] = NUL;
6633 return;
6634 }
6635
6636 /* kword is getting too long, continue one level up */
6637 --depth;
6638 }
6639 else if (++round[depth] > 2)
6640 {
6641 /* tried both fold-case and upper-case character, continue one
6642 * level up */
6643 --depth;
6644 }
6645 else
6646 {
6647 /*
6648 * round[depth] == 1: Try using the folded-case character.
6649 * round[depth] == 2: Try using the upper-case character.
6650 */
6651#ifdef FEAT_MBYTE
6652 if (has_mbyte)
6653 {
6654 flen = mb_ptr2len_check(fword + fwordidx[depth]);
6655 ulen = mb_ptr2len_check(uword + uwordidx[depth]);
6656 }
6657 else
6658#endif
6659 ulen = flen = 1;
6660 if (round[depth] == 1)
6661 {
6662 p = fword + fwordidx[depth];
6663 l = flen;
6664 }
6665 else
6666 {
6667 p = uword + uwordidx[depth];
6668 l = ulen;
6669 }
6670
6671 for (tryidx = arridx[depth]; l > 0; --l)
6672 {
6673 /* Perform a binary search in the list of accepted bytes. */
6674 len = byts[tryidx++];
6675 c = *p++;
6676 lo = tryidx;
6677 hi = tryidx + len - 1;
6678 while (lo < hi)
6679 {
6680 m = (lo + hi) / 2;
6681 if (byts[m] > c)
6682 hi = m - 1;
6683 else if (byts[m] < c)
6684 lo = m + 1;
6685 else
6686 {
6687 lo = hi = m;
6688 break;
6689 }
6690 }
6691
6692 /* Stop if there is no matching byte. */
6693 if (hi < lo || byts[lo] != c)
6694 break;
6695
6696 /* Continue at the child (if there is one). */
6697 tryidx = idxs[lo];
6698 }
6699
6700 if (l == 0)
6701 {
6702 /*
6703 * Found the matching char. Copy it to "kword" and go a
6704 * level deeper.
6705 */
6706 if (round[depth] == 1)
6707 {
6708 STRNCPY(kword + kwordlen[depth], fword + fwordidx[depth],
6709 flen);
6710 kwordlen[depth + 1] = kwordlen[depth] + flen;
6711 }
6712 else
6713 {
6714 STRNCPY(kword + kwordlen[depth], uword + uwordidx[depth],
6715 ulen);
6716 kwordlen[depth + 1] = kwordlen[depth] + ulen;
6717 }
6718 fwordidx[depth + 1] = fwordidx[depth] + flen;
6719 uwordidx[depth + 1] = uwordidx[depth] + ulen;
6720
6721 ++depth;
6722 arridx[depth] = tryidx;
6723 round[depth] = 0;
6724 }
6725 }
6726 }
6727
6728 /* Didn't find it: "cannot happen". */
6729 *kword = NUL;
6730}
6731
6732/*
Bram Moolenaard857f0e2005-06-21 22:37:39 +00006733 * Compute the sound-a-like score for suggestions in su->su_ga and add them to
6734 * su->su_sga.
6735 */
6736 static void
6737score_comp_sal(su)
6738 suginfo_T *su;
6739{
6740 langp_T *lp;
6741 char_u badsound[MAXWLEN];
6742 int i;
6743 suggest_T *stp;
6744 suggest_T *sstp;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00006745 int score;
6746
6747 if (ga_grow(&su->su_sga, su->su_ga.ga_len) == FAIL)
6748 return;
6749
6750 /* Use the sound-folding of the first language that supports it. */
6751 for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0);
6752 lp->lp_slang != NULL; ++lp)
6753 if (lp->lp_slang->sl_sal.ga_len > 0)
6754 {
6755 /* soundfold the bad word */
6756 spell_soundfold(lp->lp_slang, su->su_fbadword, badsound);
6757
6758 for (i = 0; i < su->su_ga.ga_len; ++i)
6759 {
6760 stp = &SUG(su->su_ga, i);
6761
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00006762 /* Case-fold the suggested word, sound-fold it and compute the
6763 * sound-a-like score. */
6764 score = stp_sal_score(stp, su, lp->lp_slang, badsound);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00006765 if (score < SCORE_MAXMAX)
6766 {
6767 /* Add the suggestion. */
6768 sstp = &SUG(su->su_sga, su->su_sga.ga_len);
6769 sstp->st_word = vim_strsave(stp->st_word);
6770 if (sstp->st_word != NULL)
6771 {
6772 sstp->st_score = score;
6773 sstp->st_altscore = 0;
6774 sstp->st_orglen = stp->st_orglen;
6775 ++su->su_sga.ga_len;
6776 }
6777 }
6778 }
6779 break;
6780 }
6781}
6782
6783/*
6784 * Combine the list of suggestions in su->su_ga and su->su_sga.
6785 * They are intwined.
6786 */
6787 static void
6788score_combine(su)
6789 suginfo_T *su;
6790{
6791 int i;
6792 int j;
6793 garray_T ga;
6794 garray_T *gap;
6795 langp_T *lp;
6796 suggest_T *stp;
6797 char_u *p;
6798 char_u badsound[MAXWLEN];
Bram Moolenaard857f0e2005-06-21 22:37:39 +00006799 int round;
6800
6801 /* Add the alternate score to su_ga. */
6802 for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0);
6803 lp->lp_slang != NULL; ++lp)
6804 {
6805 if (lp->lp_slang->sl_sal.ga_len > 0)
6806 {
6807 /* soundfold the bad word */
6808 spell_soundfold(lp->lp_slang, su->su_fbadword, badsound);
6809
6810 for (i = 0; i < su->su_ga.ga_len; ++i)
6811 {
6812 stp = &SUG(su->su_ga, i);
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00006813 stp->st_altscore = stp_sal_score(stp, su, lp->lp_slang,
6814 badsound);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00006815 if (stp->st_altscore == SCORE_MAXMAX)
6816 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4;
6817 else
6818 stp->st_score = (stp->st_score * 3
6819 + stp->st_altscore) / 4;
6820 stp->st_salscore = FALSE;
6821 }
6822 break;
6823 }
6824 }
6825
6826 /* Add the alternate score to su_sga. */
6827 for (i = 0; i < su->su_sga.ga_len; ++i)
6828 {
6829 stp = &SUG(su->su_sga, i);
6830 stp->st_altscore = spell_edit_score(su->su_badword, stp->st_word);
6831 if (stp->st_score == SCORE_MAXMAX)
6832 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8;
6833 else
6834 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8;
6835 stp->st_salscore = TRUE;
6836 }
6837
6838 /* Sort the suggestions and truncate at "maxcount" for both lists. */
6839 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
6840 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount);
6841
6842 ga_init2(&ga, (int)sizeof(suginfo_T), 1);
6843 if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL)
6844 return;
6845
6846 stp = &SUG(ga, 0);
6847 for (i = 0; i < su->su_ga.ga_len || i < su->su_sga.ga_len; ++i)
6848 {
6849 /* round 1: get a suggestion from su_ga
6850 * round 2: get a suggestion from su_sga */
6851 for (round = 1; round <= 2; ++round)
6852 {
6853 gap = round == 1 ? &su->su_ga : &su->su_sga;
6854 if (i < gap->ga_len)
6855 {
6856 /* Don't add a word if it's already there. */
6857 p = SUG(*gap, i).st_word;
6858 for (j = 0; j < ga.ga_len; ++j)
6859 if (STRCMP(stp[j].st_word, p) == 0)
6860 break;
6861 if (j == ga.ga_len)
6862 stp[ga.ga_len++] = SUG(*gap, i);
6863 else
6864 vim_free(p);
6865 }
6866 }
6867 }
6868
6869 ga_clear(&su->su_ga);
6870 ga_clear(&su->su_sga);
6871
6872 /* Truncate the list to the number of suggestions that will be displayed. */
6873 if (ga.ga_len > su->su_maxcount)
6874 {
6875 for (i = su->su_maxcount; i < ga.ga_len; ++i)
6876 vim_free(stp[i].st_word);
6877 ga.ga_len = su->su_maxcount;
6878 }
6879
6880 su->su_ga = ga;
6881}
6882
6883/*
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00006884 * For the goodword in "stp" compute the soundalike score compared to the
6885 * badword.
6886 */
6887 static int
6888stp_sal_score(stp, su, slang, badsound)
6889 suggest_T *stp;
6890 suginfo_T *su;
6891 slang_T *slang;
6892 char_u *badsound; /* sound-folded badword */
6893{
6894 char_u *p;
6895 char_u badsound2[MAXWLEN];
6896 char_u fword[MAXWLEN];
6897 char_u goodsound[MAXWLEN];
6898
6899 if (stp->st_orglen <= su->su_badlen)
6900 p = badsound;
6901 else
6902 {
6903 /* soundfold the bad word with more characters following */
6904 (void)spell_casefold(su->su_badptr, stp->st_orglen, fword, MAXWLEN);
6905
6906 /* When joining two words the sound often changes a lot. E.g., "t he"
6907 * sounds like "t h" while "the" sounds like "@". Avoid that by
6908 * removing the space. Don't do it when the good word also contains a
6909 * space. */
6910 if (vim_iswhite(su->su_badptr[su->su_badlen])
6911 && *skiptowhite(stp->st_word) == NUL)
6912 for (p = fword; *(p = skiptowhite(p)) != NUL; )
6913 mch_memmove(p, p + 1, STRLEN(p));
6914
6915 spell_soundfold(slang, fword, badsound2);
6916 p = badsound2;
6917 }
6918
6919 /* Case-fold the word, sound-fold the word and compute the score for the
6920 * difference. */
6921 (void)spell_casefold(stp->st_word, STRLEN(stp->st_word), fword, MAXWLEN);
6922 spell_soundfold(slang, fword, goodsound);
6923
6924 return soundalike_score(goodsound, p);
6925}
6926
6927/*
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006928 * Find suggestions by comparing the word in a sound-a-like form.
6929 */
6930 static void
Bram Moolenaar0c405862005-06-22 22:26:26 +00006931suggest_try_soundalike(su)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006932 suginfo_T *su;
6933{
6934 char_u salword[MAXWLEN];
6935 char_u tword[MAXWLEN];
6936 char_u tfword[MAXWLEN];
6937 char_u tsalword[MAXWLEN];
Bram Moolenaar9f30f502005-06-14 22:01:04 +00006938 idx_T arridx[MAXWLEN];
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006939 int curi[MAXWLEN];
6940 langp_T *lp;
6941 char_u *byts;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00006942 idx_T *idxs;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006943 int depth;
6944 int c;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00006945 idx_T n;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006946 int round;
6947 int flags;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00006948 int sound_score;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006949
Bram Moolenaard857f0e2005-06-21 22:37:39 +00006950 /* Do this for all languages that support sound folding. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006951 for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0);
6952 lp->lp_slang != NULL; ++lp)
6953 {
6954 if (lp->lp_slang->sl_sal.ga_len > 0)
6955 {
6956 /* soundfold the bad word */
6957 spell_soundfold(lp->lp_slang, su->su_fbadword, salword);
6958
6959 /*
6960 * Go through the whole tree, soundfold each word and compare.
6961 * round 1: use the case-folded tree.
6962 * round 2: use the keep-case tree.
6963 */
6964 for (round = 1; round <= 2; ++round)
6965 {
6966 if (round == 1)
6967 {
6968 byts = lp->lp_slang->sl_fbyts;
6969 idxs = lp->lp_slang->sl_fidxs;
6970 }
6971 else
6972 {
6973 byts = lp->lp_slang->sl_kbyts;
6974 idxs = lp->lp_slang->sl_kidxs;
6975 }
6976
6977 depth = 0;
6978 arridx[0] = 0;
6979 curi[0] = 1;
6980 while (depth >= 0 && !got_int)
6981 {
6982 if (curi[depth] > byts[arridx[depth]])
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00006983 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006984 /* Done all bytes at this node, go up one level. */
6985 --depth;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00006986 line_breakcheck();
6987 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006988 else
6989 {
6990 /* Do one more byte at this node. */
6991 n = arridx[depth] + curi[depth];
6992 ++curi[depth];
6993 c = byts[n];
6994 if (c == 0)
6995 {
6996 /* End of word, deal with the word. */
Bram Moolenaar9f30f502005-06-14 22:01:04 +00006997 flags = (int)idxs[n];
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00006998 if (round == 2 || (flags & WF_KEEPCAP) == 0)
6999 {
7000 tword[depth] = NUL;
7001 if (round == 1)
7002 spell_soundfold(lp->lp_slang,
7003 tword, tsalword);
7004 else
7005 {
7006 /* In keep-case tree need to case-fold the
7007 * word. */
7008 (void)spell_casefold(tword, depth,
7009 tfword, MAXWLEN);
7010 spell_soundfold(lp->lp_slang,
7011 tfword, tsalword);
7012 }
7013
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007014 /* Compute the edit distance between the
7015 * sound-a-like words. */
7016 sound_score = soundalike_score(salword,
7017 tsalword);
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007018 if (sound_score < SCORE_MAXMAX)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007019 {
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007020 char_u cword[MAXWLEN];
7021 char_u *p;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007022 int score;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007023
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007024 if (round == 1 && (flags & WF_CAPMASK) != 0)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007025 {
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007026 /* Need to fix case according to
7027 * "flags". */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007028 make_case_word(tword, cword, flags);
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007029 p = cword;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007030 }
7031 else
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007032 p = tword;
7033
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007034 if (sps_flags & SPS_DOUBLE)
7035 add_suggestion(su, &su->su_sga, p,
Bram Moolenaar0c405862005-06-22 22:26:26 +00007036 su->su_badlen,
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007037 sound_score, 0, FALSE);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007038 else
7039 {
7040 /* Compute the score. */
7041 score = spell_edit_score(
7042 su->su_badword, p);
7043 if (sps_flags & SPS_BEST)
7044 /* give a bonus for the good word
7045 * sounding the same as the bad
7046 * word */
7047 add_suggestion(su, &su->su_ga, p,
Bram Moolenaar0c405862005-06-22 22:26:26 +00007048 su->su_badlen,
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007049 RESCORE(score, sound_score),
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007050 sound_score, TRUE);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007051 else
7052 add_suggestion(su, &su->su_ga, p,
Bram Moolenaar0c405862005-06-22 22:26:26 +00007053 su->su_badlen,
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007054 score + sound_score, 0, FALSE);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007055 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007056 }
7057 }
7058
7059 /* Skip over other NUL bytes. */
7060 while (byts[n + 1] == 0)
7061 {
7062 ++n;
7063 ++curi[depth];
7064 }
7065 }
7066 else
7067 {
7068 /* Normal char, go one level deeper. */
7069 tword[depth++] = c;
7070 arridx[depth] = idxs[n];
7071 curi[depth] = 1;
7072 }
7073 }
7074 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007075 }
7076 }
7077 }
7078}
7079
7080/*
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007081 * Copy "fword" to "cword", fixing case according to "flags".
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007082 */
7083 static void
7084make_case_word(fword, cword, flags)
7085 char_u *fword;
7086 char_u *cword;
7087 int flags;
7088{
7089 if (flags & WF_ALLCAP)
7090 /* Make it all upper-case */
7091 allcap_copy(fword, cword);
7092 else if (flags & WF_ONECAP)
7093 /* Make the first letter upper-case */
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007094 onecap_copy(fword, cword, TRUE);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007095 else
7096 /* Use goodword as-is. */
7097 STRCPY(cword, fword);
7098}
7099
Bram Moolenaarea424162005-06-16 21:51:00 +00007100/*
7101 * Use map string "map" for languages "lp".
7102 */
7103 static void
7104set_map_str(lp, map)
7105 slang_T *lp;
7106 char_u *map;
7107{
7108 char_u *p;
7109 int headc = 0;
7110 int c;
7111 int i;
7112
7113 if (*map == NUL)
7114 {
7115 lp->sl_has_map = FALSE;
7116 return;
7117 }
7118 lp->sl_has_map = TRUE;
7119
7120 /* Init the array and hash table empty. */
7121 for (i = 0; i < 256; ++i)
7122 lp->sl_map_array[i] = 0;
7123#ifdef FEAT_MBYTE
7124 hash_init(&lp->sl_map_hash);
7125#endif
7126
7127 /*
7128 * The similar characters are stored separated with slashes:
7129 * "aaa/bbb/ccc/". Fill sl_map_array[c] with the character before c and
7130 * before the same slash. For characters above 255 sl_map_hash is used.
7131 */
7132 for (p = map; *p != NUL; )
7133 {
7134#ifdef FEAT_MBYTE
7135 c = mb_ptr2char_adv(&p);
7136#else
7137 c = *p++;
7138#endif
7139 if (c == '/')
7140 headc = 0;
7141 else
7142 {
7143 if (headc == 0)
7144 headc = c;
7145
7146#ifdef FEAT_MBYTE
7147 /* Characters above 255 don't fit in sl_map_array[], put them in
7148 * the hash table. Each entry is the char, a NUL the headchar and
7149 * a NUL. */
7150 if (c >= 256)
7151 {
7152 int cl = mb_char2len(c);
7153 int headcl = mb_char2len(headc);
7154 char_u *b;
7155 hash_T hash;
7156 hashitem_T *hi;
7157
7158 b = alloc((unsigned)(cl + headcl + 2));
7159 if (b == NULL)
7160 return;
7161 mb_char2bytes(c, b);
7162 b[cl] = NUL;
7163 mb_char2bytes(headc, b + cl + 1);
7164 b[cl + 1 + headcl] = NUL;
7165 hash = hash_hash(b);
7166 hi = hash_lookup(&lp->sl_map_hash, b, hash);
7167 if (HASHITEM_EMPTY(hi))
7168 hash_add_item(&lp->sl_map_hash, hi, b, hash);
7169 else
7170 {
7171 /* This should have been checked when generating the .spl
7172 * file. */
7173 EMSG(_("E999: duplicate char in MAP entry"));
7174 vim_free(b);
7175 }
7176 }
7177 else
7178#endif
7179 lp->sl_map_array[c] = headc;
7180 }
7181 }
7182}
7183
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007184/*
7185 * Return TRUE if "c1" and "c2" are similar characters according to the MAP
7186 * lines in the .aff file.
7187 */
7188 static int
7189similar_chars(slang, c1, c2)
7190 slang_T *slang;
7191 int c1;
7192 int c2;
7193{
Bram Moolenaarea424162005-06-16 21:51:00 +00007194 int m1, m2;
7195#ifdef FEAT_MBYTE
7196 char_u buf[MB_MAXBYTES];
7197 hashitem_T *hi;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007198
Bram Moolenaarea424162005-06-16 21:51:00 +00007199 if (c1 >= 256)
7200 {
7201 buf[mb_char2bytes(c1, buf)] = 0;
7202 hi = hash_find(&slang->sl_map_hash, buf);
7203 if (HASHITEM_EMPTY(hi))
7204 m1 = 0;
7205 else
7206 m1 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1);
7207 }
7208 else
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007209#endif
Bram Moolenaarea424162005-06-16 21:51:00 +00007210 m1 = slang->sl_map_array[c1];
7211 if (m1 == 0)
7212 return FALSE;
7213
7214
7215#ifdef FEAT_MBYTE
7216 if (c2 >= 256)
7217 {
7218 buf[mb_char2bytes(c2, buf)] = 0;
7219 hi = hash_find(&slang->sl_map_hash, buf);
7220 if (HASHITEM_EMPTY(hi))
7221 m2 = 0;
7222 else
7223 m2 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1);
7224 }
7225 else
7226#endif
7227 m2 = slang->sl_map_array[c2];
7228
7229 return m1 == m2;
7230}
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007231
7232/*
7233 * Add a suggestion to the list of suggestions.
7234 * Do not add a duplicate suggestion or suggestions with a bad score.
7235 * When "use_score" is not zero it's used, otherwise the score is computed
7236 * with spell_edit_score().
7237 */
7238 static void
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007239add_suggestion(su, gap, goodword, badlen, score, altscore, had_bonus)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007240 suginfo_T *su;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007241 garray_T *gap;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007242 char_u *goodword;
Bram Moolenaar0c405862005-06-22 22:26:26 +00007243 int badlen; /* length of bad word used */
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007244 int score;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007245 int altscore;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007246 int had_bonus; /* value for st_had_bonus */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007247{
7248 suggest_T *stp;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007249 int i;
Bram Moolenaar0c405862005-06-22 22:26:26 +00007250 char_u *p = NULL;
7251 int c = 0;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007252
7253 /* Check that the word wasn't banned. */
7254 if (was_banned(su, goodword))
7255 return;
7256
Bram Moolenaar0c405862005-06-22 22:26:26 +00007257 /* If past "su_badlen" and the rest is identical stop at "su_badlen".
7258 * Remove the common part from "goodword". */
7259 i = badlen - su->su_badlen;
7260 if (i > 0)
7261 {
7262 /* This assumes there was no case folding or it didn't change the
7263 * length... */
7264 p = goodword + STRLEN(goodword) - i;
7265 if (p > goodword && STRNICMP(su->su_badptr + su->su_badlen, p, i) == 0)
7266 {
7267 badlen = su->su_badlen;
7268 c = *p;
7269 *p = NUL;
7270 }
7271 else
7272 p = NULL;
7273 }
7274
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007275 if (score <= su->su_maxscore)
7276 {
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00007277 /* Check if the word is already there. Also check the length that is
7278 * being replaced "thes," -> "these" is a different suggestion from
7279 * "thes" -> "these". */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007280 stp = &SUG(*gap, 0);
7281 for (i = gap->ga_len - 1; i >= 0; --i)
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00007282 if (STRCMP(stp[i].st_word, goodword) == 0
7283 && stp[i].st_orglen == badlen)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007284 {
7285 /* Found it. Remember the lowest score. */
7286 if (stp[i].st_score > score)
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007287 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007288 stp[i].st_score = score;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007289 stp[i].st_had_bonus = had_bonus;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007290 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007291 break;
7292 }
7293
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007294 if (i < 0 && ga_grow(gap, 1) == OK)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007295 {
7296 /* Add a suggestion. */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007297 stp = &SUG(*gap, gap->ga_len);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007298 stp->st_word = vim_strsave(goodword);
7299 if (stp->st_word != NULL)
7300 {
7301 stp->st_score = score;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007302 stp->st_altscore = altscore;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007303 stp->st_had_bonus = had_bonus;
Bram Moolenaar0c405862005-06-22 22:26:26 +00007304 stp->st_orglen = badlen;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007305 ++gap->ga_len;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007306
7307 /* If we have too many suggestions now, sort the list and keep
7308 * the best suggestions. */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007309 if (gap->ga_len > SUG_MAX_COUNT(su))
7310 su->su_maxscore = cleanup_suggestions(gap, su->su_maxscore,
7311 SUG_CLEAN_COUNT(su));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007312 }
7313 }
7314 }
Bram Moolenaar0c405862005-06-22 22:26:26 +00007315
7316 if (p != NULL)
7317 *p = c; /* restore "goodword" */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007318}
7319
7320/*
7321 * Add a word to be banned.
7322 */
7323 static void
7324add_banned(su, word)
7325 suginfo_T *su;
7326 char_u *word;
7327{
7328 char_u *s = vim_strsave(word);
7329 hash_T hash;
7330 hashitem_T *hi;
7331
7332 if (s != NULL)
7333 {
7334 hash = hash_hash(s);
7335 hi = hash_lookup(&su->su_banned, s, hash);
7336 if (HASHITEM_EMPTY(hi))
7337 hash_add_item(&su->su_banned, hi, s, hash);
Bram Moolenaar0a5fe212005-06-24 23:01:23 +00007338 else
7339 vim_free(s);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007340 }
7341}
7342
7343/*
7344 * Return TRUE if a word appears in the list of banned words.
7345 */
7346 static int
7347was_banned(su, word)
7348 suginfo_T *su;
7349 char_u *word;
7350{
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007351 hashitem_T *hi = hash_find(&su->su_banned, word);
7352
7353 return !HASHITEM_EMPTY(hi);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007354}
7355
7356/*
7357 * Free the banned words in "su".
7358 */
7359 static void
7360free_banned(su)
7361 suginfo_T *su;
7362{
7363 int todo;
7364 hashitem_T *hi;
7365
7366 todo = su->su_banned.ht_used;
7367 for (hi = su->su_banned.ht_array; todo > 0; ++hi)
7368 {
7369 if (!HASHITEM_EMPTY(hi))
7370 {
7371 vim_free(hi->hi_key);
7372 --todo;
7373 }
7374 }
7375 hash_clear(&su->su_banned);
7376}
7377
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007378/*
7379 * Recompute the score if sound-folding is possible. This is slow,
7380 * thus only done for the final results.
7381 */
7382 static void
7383rescore_suggestions(su)
7384 suginfo_T *su;
7385{
7386 langp_T *lp;
7387 suggest_T *stp;
7388 char_u sal_badword[MAXWLEN];
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007389 int i;
7390
7391 for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0);
7392 lp->lp_slang != NULL; ++lp)
7393 {
7394 if (lp->lp_slang->sl_sal.ga_len > 0)
7395 {
7396 /* soundfold the bad word */
7397 spell_soundfold(lp->lp_slang, su->su_fbadword, sal_badword);
7398
7399 for (i = 0; i < su->su_ga.ga_len; ++i)
7400 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007401 stp = &SUG(su->su_ga, i);
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007402 if (!stp->st_had_bonus)
7403 {
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007404 stp->st_altscore = stp_sal_score(stp, su,
7405 lp->lp_slang, sal_badword);
7406 if (stp->st_altscore == SCORE_MAXMAX)
7407 stp->st_altscore = SCORE_BIG;
7408 stp->st_score = RESCORE(stp->st_score, stp->st_altscore);
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007409 }
7410 }
7411 break;
7412 }
7413 }
7414}
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007415
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007416static int
7417#ifdef __BORLANDC__
7418_RTLENTRYF
7419#endif
7420sug_compare __ARGS((const void *s1, const void *s2));
7421
7422/*
7423 * Function given to qsort() to sort the suggestions on st_score.
7424 */
7425 static int
7426#ifdef __BORLANDC__
7427_RTLENTRYF
7428#endif
7429sug_compare(s1, s2)
7430 const void *s1;
7431 const void *s2;
7432{
7433 suggest_T *p1 = (suggest_T *)s1;
7434 suggest_T *p2 = (suggest_T *)s2;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007435 int n = p1->st_score - p2->st_score;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007436
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007437 if (n == 0)
7438 return p1->st_altscore - p2->st_altscore;
7439 return n;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007440}
7441
7442/*
7443 * Cleanup the suggestions:
7444 * - Sort on score.
7445 * - Remove words that won't be displayed.
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007446 * Returns the maximum score in the list or "maxscore" unmodified.
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007447 */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007448 static int
7449cleanup_suggestions(gap, maxscore, keep)
7450 garray_T *gap;
7451 int maxscore;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007452 int keep; /* nr of suggestions to keep */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007453{
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007454 suggest_T *stp = &SUG(*gap, 0);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007455 int i;
7456
7457 /* Sort the list. */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007458 qsort(gap->ga_data, (size_t)gap->ga_len, sizeof(suggest_T), sug_compare);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007459
7460 /* Truncate the list to the number of suggestions that will be displayed. */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007461 if (gap->ga_len > keep)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007462 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007463 for (i = keep; i < gap->ga_len; ++i)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007464 vim_free(stp[i].st_word);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007465 gap->ga_len = keep;
7466 return stp[keep - 1].st_score;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007467 }
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007468 return maxscore;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007469}
7470
7471/*
7472 * Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]".
7473 */
7474 static void
7475spell_soundfold(slang, inword, res)
7476 slang_T *slang;
7477 char_u *inword;
7478 char_u *res;
7479{
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007480 salitem_T *smp;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007481 char_u word[MAXWLEN];
7482#ifdef FEAT_MBYTE
7483 int l;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007484 int found_mbyte = FALSE;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007485#endif
7486 char_u *s;
7487 char_u *t;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007488 char_u *pf;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007489 int i, j, z;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007490 int reslen;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007491 int n, k = 0;
7492 int z0;
7493 int k0;
7494 int n0;
7495 int c;
7496 int pri;
7497 int p0 = -333;
7498 int c0;
7499
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007500 /* Remove accents, if wanted. We actually remove all non-word characters.
7501 * But keep white space. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007502 if (slang->sl_rem_accents)
7503 {
7504 t = word;
7505 for (s = inword; *s != NUL; )
7506 {
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007507 if (vim_iswhite(*s))
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007508 {
7509 *t++ = ' ';
7510 s = skipwhite(s);
7511 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007512#ifdef FEAT_MBYTE
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007513 else if (has_mbyte)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007514 {
7515 l = mb_ptr2len_check(s);
Bram Moolenaarea408852005-06-25 22:49:46 +00007516 if (spell_iswordp(s))
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007517 {
7518 mch_memmove(t, s, l);
7519 t += l;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007520 if (l > 1)
7521 found_mbyte = TRUE;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007522 }
7523 s += l;
7524 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007525#endif
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007526 else
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007527 {
Bram Moolenaarea408852005-06-25 22:49:46 +00007528 if (spell_iswordp(s))
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007529 *t++ = *s;
7530 ++s;
7531 }
7532 }
7533 *t = NUL;
7534 }
7535 else
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007536 {
7537#ifdef FEAT_MBYTE
7538 if (has_mbyte)
7539 for (s = inword; *s != NUL; s += l)
7540 if ((l = mb_ptr2len_check(s)) > 1)
7541 {
7542 found_mbyte = TRUE;
7543 break;
7544 }
7545#endif
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007546 STRCPY(word, inword);
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007547 }
7548
7549#ifdef FEAT_MBYTE
7550 /* If there are multi-byte characters in the word return it as-is, because
7551 * the following won't work. */
7552 if (found_mbyte)
7553 {
7554 STRCPY(res, word);
7555 return;
7556 }
7557#endif
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007558
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007559 smp = (salitem_T *)slang->sl_sal.ga_data;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007560
7561 /*
7562 * This comes from Aspell phonet.cpp. Converted from C++ to C.
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007563 * Changed to keep spaces.
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007564 * TODO: support for multi-byte chars.
7565 */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007566 i = reslen = z = 0;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007567 while ((c = word[i]) != NUL)
7568 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007569 /* Start with the first rule that has the character in the word. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007570 n = slang->sl_sal_first[c];
7571 z0 = 0;
7572
7573 if (n >= 0)
7574 {
7575 /* check all rules for the same letter */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007576 for (; (s = smp[n].sm_lead)[0] == c; ++n)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007577 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007578 /* Quickly skip entries that don't match the word. Most
7579 * entries are less then three chars, optimize for that. */
7580 k = smp[n].sm_leadlen;
7581 if (k > 1)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007582 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007583 if (word[i + 1] != s[1])
7584 continue;
7585 if (k > 2)
7586 {
7587 for (j = 2; j < k; ++j)
7588 if (word[i + j] != s[j])
7589 break;
7590 if (j < k)
7591 continue;
7592 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007593 }
7594
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007595 if ((pf = smp[n].sm_oneoff) != NULL)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007596 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007597 /* Check for match with one of the chars in "sm_oneoff". */
7598 while (*pf != NUL && *pf != word[i + k])
7599 ++pf;
7600 if (*pf == NUL)
7601 continue;
7602 ++k;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007603 }
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007604 s = smp[n].sm_rules;
7605 pri = 5; /* default priority */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007606
7607 p0 = *s;
7608 k0 = k;
7609 while (*s == '-' && k > 1)
7610 {
7611 k--;
7612 s++;
7613 }
7614 if (*s == '<')
7615 s++;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007616 if (VIM_ISDIGIT(*s))
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007617 {
7618 /* determine priority */
7619 pri = *s - '0';
7620 s++;
7621 }
7622 if (*s == '^' && *(s + 1) == '^')
7623 s++;
7624
7625 if (*s == NUL
7626 || (*s == '^'
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007627 && (i == 0 || !(word[i - 1] == ' '
Bram Moolenaarea408852005-06-25 22:49:46 +00007628 || spell_iswordp(word + i - 1)))
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007629 && (*(s + 1) != '$'
Bram Moolenaarea408852005-06-25 22:49:46 +00007630 || (!spell_iswordp(word + i + k0))))
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007631 || (*s == '$' && i > 0
Bram Moolenaarea408852005-06-25 22:49:46 +00007632 && spell_iswordp(word + i - 1)
7633 && (!spell_iswordp(word + i + k0))))
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007634 {
7635 /* search for followup rules, if: */
7636 /* followup and k > 1 and NO '-' in searchstring */
7637 c0 = word[i + k - 1];
7638 n0 = slang->sl_sal_first[c0];
7639
7640 if (slang->sl_followup && k > 1 && n0 >= 0
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007641 && p0 != '-' && word[i + k] != NUL)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007642 {
7643 /* test follow-up rule for "word[i + k]" */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007644 for ( ; (s = smp[n0].sm_lead)[0] == c0; ++n0)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007645 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007646 /* Quickly skip entries that don't match the word.
7647 * */
7648 k0 = smp[n0].sm_leadlen;
7649 if (k0 > 1)
7650 {
7651 if (word[i + k] != s[1])
7652 continue;
7653 if (k0 > 2)
7654 {
7655 pf = word + i + k + 1;
7656 for (j = 2; j < k0; ++j)
7657 if (*pf++ != s[j])
7658 break;
7659 if (j < k0)
7660 continue;
7661 }
7662 }
7663 k0 += k - 1;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007664
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007665 if ((pf = smp[n0].sm_oneoff) != NULL)
7666 {
7667 /* Check for match with one of the chars in
7668 * "sm_oneoff". */
7669 while (*pf != NUL && *pf != word[i + k0])
7670 ++pf;
7671 if (*pf == NUL)
7672 continue;
7673 ++k0;
7674 }
7675
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007676 p0 = 5;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007677 s = smp[n0].sm_rules;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007678 while (*s == '-')
7679 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007680 /* "k0" gets NOT reduced because
7681 * "if (k0 == k)" */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007682 s++;
7683 }
7684 if (*s == '<')
7685 s++;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007686 if (VIM_ISDIGIT(*s))
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007687 {
7688 p0 = *s - '0';
7689 s++;
7690 }
7691
7692 if (*s == NUL
7693 /* *s == '^' cuts */
7694 || (*s == '$'
Bram Moolenaarea408852005-06-25 22:49:46 +00007695 && !spell_iswordp(word + i + k0)))
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007696 {
7697 if (k0 == k)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007698 /* this is just a piece of the string */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007699 continue;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007700
7701 if (p0 < pri)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007702 /* priority too low */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007703 continue;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007704 /* rule fits; stop search */
7705 break;
7706 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007707 }
7708
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007709 if (p0 >= pri && smp[n0].sm_lead[0] == c0)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007710 continue;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007711 }
7712
7713 /* replace string */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007714 s = smp[n].sm_to;
7715 pf = smp[n].sm_rules;
7716 p0 = (vim_strchr(pf, '<') != NULL) ? 1 : 0;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007717 if (p0 == 1 && z == 0)
7718 {
7719 /* rule with '<' is used */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007720 if (reslen > 0 && *s != NUL && (res[reslen - 1] == c
7721 || res[reslen - 1] == *s))
7722 reslen--;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007723 z0 = 1;
7724 z = 1;
7725 k0 = 0;
7726 while (*s != NUL && word[i+k0] != NUL)
7727 {
7728 word[i + k0] = *s;
7729 k0++;
7730 s++;
7731 }
7732 if (k > k0)
7733 mch_memmove(word + i + k0, word + i + k,
7734 STRLEN(word + i + k) + 1);
7735
7736 /* new "actual letter" */
7737 c = word[i];
7738 }
7739 else
7740 {
7741 /* no '<' rule used */
7742 i += k - 1;
7743 z = 0;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007744 while (*s != NUL && s[1] != NUL && reslen < MAXWLEN)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007745 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007746 if (reslen == 0 || res[reslen - 1] != *s)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007747 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007748 res[reslen] = *s;
7749 reslen++;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007750 }
7751 s++;
7752 }
7753 /* new "actual letter" */
7754 c = *s;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007755 if (strstr((char *)pf, "^^") != NULL)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007756 {
7757 if (c != NUL)
7758 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007759 res[reslen] = c;
7760 reslen++;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007761 }
7762 mch_memmove(word, word + i + 1,
7763 STRLEN(word + i + 1) + 1);
7764 i = 0;
7765 z0 = 1;
7766 }
7767 }
7768 break;
7769 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007770 }
7771 }
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007772 else if (vim_iswhite(c))
7773 {
7774 c = ' ';
7775 k = 1;
7776 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007777
7778 if (z0 == 0)
7779 {
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007780 if (k && !p0 && reslen < MAXWLEN && c != NUL
7781 && (!slang->sl_collapse || reslen == 0
7782 || res[reslen - 1] != c))
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007783 {
7784 /* condense only double letters */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007785 res[reslen] = c;
7786 reslen++;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007787 }
7788
7789 i++;
7790 z = 0;
7791 k = 0;
7792 }
7793 }
7794
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007795 res[reslen] = NUL;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00007796}
7797
Bram Moolenaar9f30f502005-06-14 22:01:04 +00007798/*
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007799 * Compute a score for two sound-a-like words.
7800 * This permits up to two inserts/deletes/swaps/etc. to keep things fast.
7801 * Instead of a generic loop we write out the code. That keeps it fast by
7802 * avoiding checks that will not be possible.
7803 */
7804 static int
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007805soundalike_score(goodstart, badstart)
7806 char_u *goodstart; /* sound-folded good word */
7807 char_u *badstart; /* sound-folded bad word */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007808{
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007809 char_u *goodsound = goodstart;
7810 char_u *badsound = badstart;
7811 int goodlen;
7812 int badlen;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007813 int n;
7814 char_u *pl, *ps;
7815 char_u *pl2, *ps2;
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007816 int score = 0;
7817
7818 /* adding/inserting "*" at the start (word starts with vowel) shouldn't be
7819 * counted so much, vowels halfway the word aren't counted at all. */
7820 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound)
7821 {
7822 score = SCORE_DEL / 2;
7823 if (*badsound == '*')
7824 ++badsound;
7825 else
7826 ++goodsound;
7827 }
7828
7829 goodlen = STRLEN(goodsound);
7830 badlen = STRLEN(badsound);
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007831
7832 /* Return quickly if the lenghts are too different to be fixed by two
7833 * changes. */
7834 n = goodlen - badlen;
7835 if (n < -2 || n > 2)
7836 return SCORE_MAXMAX;
7837
7838 if (n > 0)
7839 {
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007840 pl = goodsound; /* goodsound is longest */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007841 ps = badsound;
7842 }
7843 else
7844 {
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007845 pl = badsound; /* badsound is longest */
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007846 ps = goodsound;
7847 }
7848
7849 /* Skip over the identical part. */
7850 while (*pl == *ps && *pl != NUL)
7851 {
7852 ++pl;
7853 ++ps;
7854 }
7855
7856 switch (n)
7857 {
7858 case -2:
7859 case 2:
7860 /*
7861 * Must delete two characters from "pl".
7862 */
7863 ++pl; /* first delete */
7864 while (*pl == *ps)
7865 {
7866 ++pl;
7867 ++ps;
7868 }
7869 /* strings must be equal after second delete */
7870 if (STRCMP(pl + 1, ps) == 0)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007871 return score + SCORE_DEL * 2;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007872
7873 /* Failed to compare. */
7874 break;
7875
7876 case -1:
7877 case 1:
7878 /*
7879 * Minimal one delete from "pl" required.
7880 */
7881
7882 /* 1: delete */
7883 pl2 = pl + 1;
7884 ps2 = ps;
7885 while (*pl2 == *ps2)
7886 {
7887 if (*pl2 == NUL) /* reached the end */
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007888 return score + SCORE_DEL;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007889 ++pl2;
7890 ++ps2;
7891 }
7892
7893 /* 2: delete then swap, then rest must be equal */
7894 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
7895 && STRCMP(pl2 + 2, ps2 + 2) == 0)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007896 return score + SCORE_DEL + SCORE_SWAP;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007897
7898 /* 3: delete then substitute, then the rest must be equal */
7899 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007900 return score + SCORE_DEL + SCORE_SUBST;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007901
7902 /* 4: first swap then delete */
7903 if (pl[0] == ps[1] && pl[1] == ps[0])
7904 {
7905 pl2 = pl + 2; /* swap, skip two chars */
7906 ps2 = ps + 2;
7907 while (*pl2 == *ps2)
7908 {
7909 ++pl2;
7910 ++ps2;
7911 }
7912 /* delete a char and then strings must be equal */
7913 if (STRCMP(pl2 + 1, ps2) == 0)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007914 return score + SCORE_SWAP + SCORE_DEL;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007915 }
7916
7917 /* 5: first substitute then delete */
7918 pl2 = pl + 1; /* substitute, skip one char */
7919 ps2 = ps + 1;
7920 while (*pl2 == *ps2)
7921 {
7922 ++pl2;
7923 ++ps2;
7924 }
7925 /* delete a char and then strings must be equal */
7926 if (STRCMP(pl2 + 1, ps2) == 0)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007927 return score + SCORE_SUBST + SCORE_DEL;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007928
7929 /* Failed to compare. */
7930 break;
7931
7932 case 0:
7933 /*
7934 * Lenghts are equal, thus changes must result in same length: An
7935 * insert is only possible in combination with a delete.
7936 * 1: check if for identical strings
7937 */
7938 if (*pl == NUL)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007939 return score;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007940
7941 /* 2: swap */
7942 if (pl[0] == ps[1] && pl[1] == ps[0])
7943 {
7944 pl2 = pl + 2; /* swap, skip two chars */
7945 ps2 = ps + 2;
7946 while (*pl2 == *ps2)
7947 {
7948 if (*pl2 == NUL) /* reached the end */
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007949 return score + SCORE_SWAP;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007950 ++pl2;
7951 ++ps2;
7952 }
7953 /* 3: swap and swap again */
7954 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
7955 && STRCMP(pl2 + 2, ps2 + 2) == 0)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007956 return score + SCORE_SWAP + SCORE_SWAP;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007957
7958 /* 4: swap and substitute */
7959 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007960 return score + SCORE_SWAP + SCORE_SUBST;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007961 }
7962
7963 /* 5: substitute */
7964 pl2 = pl + 1;
7965 ps2 = ps + 1;
7966 while (*pl2 == *ps2)
7967 {
7968 if (*pl2 == NUL) /* reached the end */
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007969 return score + SCORE_SUBST;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007970 ++pl2;
7971 ++ps2;
7972 }
7973
7974 /* 6: substitute and swap */
7975 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
7976 && STRCMP(pl2 + 2, ps2 + 2) == 0)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007977 return score + SCORE_SUBST + SCORE_SWAP;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007978
7979 /* 7: substitute and substitute */
7980 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007981 return score + SCORE_SUBST + SCORE_SUBST;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007982
7983 /* 8: insert then delete */
7984 pl2 = pl;
7985 ps2 = ps + 1;
7986 while (*pl2 == *ps2)
7987 {
7988 ++pl2;
7989 ++ps2;
7990 }
7991 if (STRCMP(pl2 + 1, ps2) == 0)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00007992 return score + SCORE_INS + SCORE_DEL;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00007993
7994 /* 9: delete then insert */
7995 pl2 = pl + 1;
7996 ps2 = ps;
7997 while (*pl2 == *ps2)
7998 {
7999 ++pl2;
8000 ++ps2;
8001 }
8002 if (STRCMP(pl2, ps2 + 1) == 0)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00008003 return score + SCORE_INS + SCORE_DEL;
Bram Moolenaard857f0e2005-06-21 22:37:39 +00008004
8005 /* Failed to compare. */
8006 break;
8007 }
8008
8009 return SCORE_MAXMAX;
8010}
Bram Moolenaar9f30f502005-06-14 22:01:04 +00008011
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00008012/*
8013 * Compute the "edit distance" to turn "badword" into "goodword". The less
Bram Moolenaard857f0e2005-06-21 22:37:39 +00008014 * deletes/inserts/substitutes/swaps are required the lower the score.
Bram Moolenaar9f30f502005-06-14 22:01:04 +00008015 *
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00008016 * The algorithm comes from Aspell editdist.cpp, edit_distance().
Bram Moolenaar9f30f502005-06-14 22:01:04 +00008017 * It has been converted from C++ to C and modified to support multi-byte
8018 * characters.
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00008019 */
8020 static int
8021spell_edit_score(badword, goodword)
8022 char_u *badword;
8023 char_u *goodword;
8024{
8025 int *cnt;
8026 int badlen, goodlen;
8027 int j, i;
8028 int t;
8029 int bc, gc;
Bram Moolenaar9f30f502005-06-14 22:01:04 +00008030 int pbc, pgc;
8031#ifdef FEAT_MBYTE
8032 char_u *p;
8033 int wbadword[MAXWLEN];
8034 int wgoodword[MAXWLEN];
8035
8036 if (has_mbyte)
8037 {
8038 /* Get the characters from the multi-byte strings and put them in an
8039 * int array for easy access. */
8040 for (p = badword, badlen = 0; *p != NUL; )
8041 wbadword[badlen++] = mb_ptr2char_adv(&p);
8042 ++badlen;
8043 for (p = goodword, goodlen = 0; *p != NUL; )
8044 wgoodword[goodlen++] = mb_ptr2char_adv(&p);
8045 ++goodlen;
8046 }
8047 else
8048#endif
8049 {
8050 badlen = STRLEN(badword) + 1;
8051 goodlen = STRLEN(goodword) + 1;
8052 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00008053
8054 /* We use "cnt" as an array: CNT(badword_idx, goodword_idx). */
8055#define CNT(a, b) cnt[(a) + (b) * (badlen + 1)]
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00008056 cnt = (int *)lalloc((long_u)(sizeof(int) * (badlen + 1) * (goodlen + 1)),
8057 TRUE);
Bram Moolenaar9f30f502005-06-14 22:01:04 +00008058 if (cnt == NULL)
8059 return 0; /* out of memory */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00008060
8061 CNT(0, 0) = 0;
8062 for (j = 1; j <= goodlen; ++j)
8063 CNT(0, j) = CNT(0, j - 1) + SCORE_DEL;
8064
8065 for (i = 1; i <= badlen; ++i)
8066 {
8067 CNT(i, 0) = CNT(i - 1, 0) + SCORE_INS;
8068 for (j = 1; j <= goodlen; ++j)
8069 {
Bram Moolenaar9f30f502005-06-14 22:01:04 +00008070#ifdef FEAT_MBYTE
8071 if (has_mbyte)
8072 {
8073 bc = wbadword[i - 1];
8074 gc = wgoodword[j - 1];
8075 }
8076 else
8077#endif
8078 {
8079 bc = badword[i - 1];
8080 gc = goodword[j - 1];
8081 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00008082 if (bc == gc)
8083 CNT(i, j) = CNT(i - 1, j - 1);
8084 else
8085 {
8086 /* Use a better score when there is only a case difference. */
Bram Moolenaar9f30f502005-06-14 22:01:04 +00008087 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00008088 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1);
8089 else
8090 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1);
8091
Bram Moolenaar9f30f502005-06-14 22:01:04 +00008092 if (i > 1 && j > 1)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00008093 {
Bram Moolenaar9f30f502005-06-14 22:01:04 +00008094#ifdef FEAT_MBYTE
8095 if (has_mbyte)
8096 {
8097 pbc = wbadword[i - 2];
8098 pgc = wgoodword[j - 2];
8099 }
8100 else
8101#endif
8102 {
8103 pbc = badword[i - 2];
8104 pgc = goodword[j - 2];
8105 }
8106 if (bc == pgc && pbc == gc)
8107 {
8108 t = SCORE_SWAP + CNT(i - 2, j - 2);
8109 if (t < CNT(i, j))
8110 CNT(i, j) = t;
8111 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00008112 }
8113 t = SCORE_DEL + CNT(i - 1, j);
8114 if (t < CNT(i, j))
8115 CNT(i, j) = t;
8116 t = SCORE_INS + CNT(i, j - 1);
8117 if (t < CNT(i, j))
8118 CNT(i, j) = t;
8119 }
8120 }
8121 }
Bram Moolenaard857f0e2005-06-21 22:37:39 +00008122
8123 i = CNT(badlen - 1, goodlen - 1);
8124 vim_free(cnt);
8125 return i;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00008126}
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00008127
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00008128/*
8129 * ":spelldump"
8130 */
8131/*ARGSUSED*/
8132 void
8133ex_spelldump(eap)
8134 exarg_T *eap;
8135{
8136 buf_T *buf = curbuf;
8137 langp_T *lp;
8138 slang_T *slang;
8139 idx_T arridx[MAXWLEN];
8140 int curi[MAXWLEN];
8141 char_u word[MAXWLEN];
8142 int c;
8143 char_u *byts;
8144 idx_T *idxs;
8145 linenr_T lnum = 0;
8146 int round;
8147 int depth;
8148 int n;
8149 int flags;
8150
8151 if (no_spell_checking())
8152 return;
8153
8154 /* Create a new empty buffer by splitting the window. */
8155 do_cmdline_cmd((char_u *)"new");
8156 if (!bufempty() || !buf_valid(buf))
8157 return;
8158
8159 for (lp = LANGP_ENTRY(buf->b_langp, 0); lp->lp_slang != NULL; ++lp)
8160 {
8161 slang = lp->lp_slang;
8162
8163 vim_snprintf((char *)IObuff, IOSIZE, "# file: %s", slang->sl_fname);
8164 ml_append(lnum++, IObuff, (colnr_T)0, FALSE);
8165
8166 /* round 1: case-folded tree
8167 * round 2: keep-case tree */
8168 for (round = 1; round <= 2; ++round)
8169 {
8170 if (round == 1)
8171 {
8172 byts = slang->sl_fbyts;
8173 idxs = slang->sl_fidxs;
8174 }
8175 else
8176 {
8177 byts = slang->sl_kbyts;
8178 idxs = slang->sl_kidxs;
8179 }
8180 if (byts == NULL)
8181 continue; /* array is empty */
8182
8183 depth = 0;
8184 arridx[0] = 0;
8185 curi[0] = 1;
8186 while (depth >= 0 && !got_int)
8187 {
8188 if (curi[depth] > byts[arridx[depth]])
8189 {
8190 /* Done all bytes at this node, go up one level. */
8191 --depth;
8192 line_breakcheck();
8193 }
8194 else
8195 {
8196 /* Do one more byte at this node. */
8197 n = arridx[depth] + curi[depth];
8198 ++curi[depth];
8199 c = byts[n];
8200 if (c == 0)
8201 {
8202 /* End of word, deal with the word.
8203 * Don't use keep-case words in the fold-case tree,
8204 * they will appear in the keep-case tree.
8205 * Only use the word when the region matches. */
8206 flags = (int)idxs[n];
8207 if ((round == 2 || (flags & WF_KEEPCAP) == 0)
8208 && ((flags & WF_REGION) == 0
8209 || (((unsigned)flags >> 8)
8210 & lp->lp_region) != 0))
8211 {
8212 word[depth] = NUL;
Bram Moolenaar0a5fe212005-06-24 23:01:23 +00008213
8214 /* Dump the basic word if there is no prefix or
8215 * when it's the first one. */
8216 c = (unsigned)flags >> 16;
8217 if (c == 0 || curi[depth] == 2)
8218 dump_word(word, round, flags, lnum++);
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00008219
8220 /* Apply the prefix, if there is one. */
Bram Moolenaar0a5fe212005-06-24 23:01:23 +00008221 if (c != 0)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00008222 lnum = apply_prefixes(slang, word, round,
8223 flags, lnum);
8224 }
8225 }
8226 else
8227 {
8228 /* Normal char, go one level deeper. */
8229 word[depth++] = c;
8230 arridx[depth] = idxs[n];
8231 curi[depth] = 1;
8232 }
8233 }
8234 }
8235 }
8236 }
8237
8238 /* Delete the empty line that we started with. */
8239 if (curbuf->b_ml.ml_line_count > 1)
8240 ml_delete(curbuf->b_ml.ml_line_count, FALSE);
8241
8242 redraw_later(NOT_VALID);
8243}
8244
8245/*
8246 * Dump one word: apply case modifications and append a line to the buffer.
8247 */
8248 static void
8249dump_word(word, round, flags, lnum)
8250 char_u *word;
8251 int round;
8252 int flags;
8253 linenr_T lnum;
8254{
8255 int keepcap = FALSE;
8256 char_u *p;
8257 char_u cword[MAXWLEN];
8258 char_u badword[MAXWLEN + 3];
8259
8260 if (round == 1 && (flags & WF_CAPMASK) != 0)
8261 {
8262 /* Need to fix case according to "flags". */
8263 make_case_word(word, cword, flags);
8264 p = cword;
8265 }
8266 else
8267 {
8268 p = word;
8269 if (round == 2 && (captype(word, NULL) & WF_KEEPCAP) == 0)
8270 keepcap = TRUE;
8271 }
8272
8273 /* Bad word is preceded by "/!" and some other
8274 * flags. */
8275 if ((flags & (WF_BANNED | WF_RARE)) || keepcap)
8276 {
8277 STRCPY(badword, "/");
8278 if (keepcap)
8279 STRCAT(badword, "=");
8280 if (flags & WF_BANNED)
8281 STRCAT(badword, "!");
8282 else if (flags & WF_RARE)
8283 STRCAT(badword, "?");
8284 STRCAT(badword, p);
8285 p = badword;
8286 }
8287
8288 ml_append(lnum, p, (colnr_T)0, FALSE);
8289}
8290
8291/*
8292 * Find matching prefixes for "word". Prepend each to "word" and append
8293 * a line to the buffer.
8294 * Return the updated line number.
8295 */
8296 static linenr_T
8297apply_prefixes(slang, word, round, flags, startlnum)
8298 slang_T *slang;
8299 char_u *word; /* case-folded word */
8300 int round;
8301 int flags; /* flags with prefix ID */
8302 linenr_T startlnum;
8303{
8304 idx_T arridx[MAXWLEN];
8305 int curi[MAXWLEN];
8306 char_u prefix[MAXWLEN];
8307 int c;
8308 char_u *byts;
8309 idx_T *idxs;
8310 linenr_T lnum = startlnum;
8311 int depth;
8312 int n;
8313 int len;
8314 int prefid = (unsigned)flags >> 16;
8315 int i;
8316
8317 byts = slang->sl_pbyts;
8318 idxs = slang->sl_pidxs;
8319 if (byts != NULL) /* array not is empty */
8320 {
8321 /*
8322 * Loop over all prefixes, building them byte-by-byte in prefix[].
8323 * When at the end of a prefix check that it supports "prefid".
8324 */
8325 depth = 0;
8326 arridx[0] = 0;
8327 curi[0] = 1;
8328 while (depth >= 0 && !got_int)
8329 {
8330 len = arridx[depth];
8331 if (curi[depth] > byts[len])
8332 {
8333 /* Done all bytes at this node, go up one level. */
8334 --depth;
8335 line_breakcheck();
8336 }
8337 else
8338 {
8339 /* Do one more byte at this node. */
8340 n = len + curi[depth];
8341 ++curi[depth];
8342 c = byts[n];
8343 if (c == 0)
8344 {
8345 /* End of prefix, find out how many IDs there are. */
8346 for (i = 1; i < len; ++i)
8347 if (byts[n + i] != 0)
8348 break;
8349 curi[depth] += i - 1;
8350
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00008351 i = valid_word_prefix(i, n, prefid, word, slang);
8352 if (i != 0)
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00008353 {
8354 vim_strncpy(prefix + depth, word, MAXWLEN - depth);
Bram Moolenaarcf6bf392005-06-27 22:27:46 +00008355 dump_word(prefix, round,
8356 (i & WF_RAREPFX) ? (flags | WF_RARE)
8357 : flags, lnum++);
Bram Moolenaarf417f2b2005-06-23 22:29:21 +00008358 }
8359 }
8360 else
8361 {
8362 /* Normal char, go one level deeper. */
8363 prefix[depth++] = c;
8364 arridx[depth] = idxs[n];
8365 curi[depth] = 1;
8366 }
8367 }
8368 }
8369 }
8370
8371 return lnum;
8372}
8373
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00008374#endif /* FEAT_SYN_HL */