blob: 2b37f632fc285aec0c2ae72b3bd2cf4029facfb9 [file] [log] [blame]
Bram Moolenaare19defe2005-03-21 08:23:33 +00001/* vi:set ts=8 sts=4 sw=4:
2 *
3 * VIM - Vi IMproved by Bram Moolenaar
4 *
5 * Do ":help uganda" in Vim to read copying and usage conditions.
6 * Do ":help credits" in Vim to see a list of people who contributed.
7 * See README.txt for an overview of the Vim source code.
8 */
9
10/*
11 * spell.c: code for spell checking
Bram Moolenaarfc735152005-03-22 22:54:12 +000012 *
Bram Moolenaar51485f02005-06-04 21:55:20 +000013 * The spell checking mechanism uses a tree (aka trie). Each node in the tree
14 * has a list of bytes that can appear (siblings). For each byte there is a
15 * pointer to the node with the byte that follows in the word (child).
16 * A NUL byte is used where the word may end.
17 *
18 * There are two trees: one with case-folded words and one with words in
19 * original case. The second one is only used for keep-case words and is
20 * usually small.
21 *
22 * Thanks to Olaf Seibert for providing an example implementation of this tree
23 * and the compression mechanism.
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +000024 *
25 * Matching involves checking the caps type: Onecap ALLCAP KeepCap.
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +000026 *
Bram Moolenaar402d2fe2005-04-15 21:00:38 +000027 * Why doesn't Vim use aspell/ispell/myspell/etc.?
28 * See ":help develop-spell".
29 */
30
/*
 * Use this to let the score depend on how much a suggestion sounds like the
 * bad word.  It's quite slow and doesn't make the sorting much better....
 * #define SOUNDFOLD_SCORE
 */
36
/*
 * Vim spell file format: <HEADER> <SUGGEST> <LWORDTREE> <KWORDTREE>
 *
 * <HEADER>: <fileID> <regioncnt> <regionname> ...
 *           <charflagslen> <charflags> <fcharslen> <fchars>
 *
 * <fileID>         10 bytes  "VIMspell06"
 * <regioncnt>      1 byte    number of regions following (8 supported)
 * <regionname>     2 bytes   Region name: ca, au, etc.  Lower case.
 *                            First <regionname> is region 1.
 *
 * <charflagslen>   1 byte    Number of bytes in <charflags> (should be 128).
 * <charflags>      N bytes   List of flags (first one is for character 128):
 *                            0x01  word character
 *                            0x02  upper-case character
 * <fcharslen>      2 bytes   Number of bytes in <fchars>.
 * <fchars>         N bytes   Folded characters, first one is for character 128.
 *
 *
 * <SUGGEST> : <repcount> <rep> ...
 *             <salflags> <salcount> <sal> ...
 *             <maplen> <mapstr>
 *
 * <repcount>       2 bytes   number of <rep> items, MSB first.
 *
 * <rep> : <repfromlen> <repfrom> <reptolen> <repto>
 *
 * <repfromlen>     1 byte    length of <repfrom>
 *
 * <repfrom>        N bytes   "from" part of replacement
 *
 * <reptolen>       1 byte    length of <repto>
 *
 * <repto>          N bytes   "to" part of replacement
 *
 * <salflags>       1 byte    flags for soundsalike conversion:
 *                            SAL_F0LLOWUP
 *                            SAL_COLLAPSE
 *                            SAL_REM_ACCENTS
 *
 * <sal> : <salfromlen> <salfrom> <saltolen> <salto>
 *
 * <salfromlen>     1 byte    length of <salfrom>
 *
 * <salfrom>        N bytes   "from" part of soundsalike
 *
 * <saltolen>       1 byte    length of <salto>
 *
 * <salto>          N bytes   "to" part of soundsalike
 *
 * <maplen>         2 bytes   length of <mapstr>, MSB first
 *
 * <mapstr>         N bytes   String with sequences of similar characters,
 *                            separated by slashes.
 *
 *
 * <LWORDTREE>: <wordtree>
 *
 * <wordtree>: <nodecount> <nodedata> ...
 *
 * <nodecount>      4 bytes   Number of nodes following.  MSB first.
 *
 * <nodedata>: <siblingcount> <sibling> ...
 *
 * <siblingcount>   1 byte    Number of siblings in this node.  The siblings
 *                            follow in sorted order.
 *
 * <sibling>: <byte> [<nodeidx> <xbyte> | <flags> [<region>]]
 *
 * <byte>           1 byte    Byte value of the sibling.  Special cases:
 *                            BY_NOFLAGS: End of word without flags and for all
 *                                        regions.
 *                            BY_FLAGS:   End of word, <flags> follow.
 *                            BY_INDEX:   Child of sibling is shared, <nodeidx>
 *                                        and <xbyte> follow.
 *
 * <nodeidx>        3 bytes   Index of child for this sibling, MSB first.
 *
 * <xbyte>          1 byte    byte value of the sibling.
 *
 * <flags>          1 byte    bitmask of:
 *                            WF_ALLCAP   word must have only capitals
 *                            WF_ONECAP   first char of word must be capital
 *                            WF_RARE     rare word
 *                            WF_REGION   <region> follows
 *
 * <region>         1 byte    Bitmask for regions in which word is valid.  When
 *                            omitted it's valid in all regions.
 *                            Lowest bit is for region 1.
 *
 * <KWORDTREE>: <wordtree>
 *
 * All text characters are in 'encoding', but stored as single bytes.
 */
131
Bram Moolenaare19defe2005-03-21 08:23:33 +0000132#if defined(MSDOS) || defined(WIN16) || defined(WIN32) || defined(_WIN64)
133# include <io.h> /* for lseek(), must be before vim.h */
134#endif
135
136#include "vim.h"
137
138#if defined(FEAT_SYN_HL) || defined(PROTO)
139
140#ifdef HAVE_FCNTL_H
141# include <fcntl.h>
142#endif
143
/* Assumed maximum length of a word, in bytes.  Some places assume a word
 * length fits in a byte, thus this can't be above 255. */
#define MAXWLEN     250

/* Flags used for a word. */
#define WF_REGION   0x01    /* region byte follows */
#define WF_ONECAP   0x02    /* word with one capital (or all capitals) */
#define WF_ALLCAP   0x04    /* word must be all capitals */
#define WF_RARE     0x08    /* rare word */
#define WF_BANNED   0x10    /* bad word */
#define WF_KEEPCAP  0x80    /* keep-case word */

/* The flags that say something about the capitals in a word. */
#define WF_CAPMASK  (WF_ONECAP | WF_ALLCAP | WF_KEEPCAP)

/* Special values for <byte> in a tree node. */
#define BY_NOFLAGS  0           /* end of word without flags or region */
#define BY_FLAGS    1           /* end of word, flag byte follows */
#define BY_INDEX    2           /* child is shared, index follows */
#define BY_SPECIAL  BY_INDEX    /* highest special byte value */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000162
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000163/* Info from "REP" and "SAL" entries in ".aff" file used in si_rep, sl_rep,
164 * si_sal and sl_sal.
165 * One replacement: from "ft_from" to "ft_to". */
166typedef struct fromto_S
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000167{
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000168 char_u *ft_from;
169 char_u *ft_to;
170} fromto_T;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000171
172/*
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000173 * Structure used to store words and other info for one language, loaded from
174 * a .spl file.
Bram Moolenaar51485f02005-06-04 21:55:20 +0000175 * The main access is through the tree in "sl_fbyts/sl_fidxs", storing the
176 * case-folded words. "sl_kbyts/sl_kidxs" is for keep-case words.
177 *
178 * The "byts" array stores the possible bytes in each tree node, preceded by
179 * the number of possible bytes, sorted on byte value:
180 * <len> <byte1> <byte2> ...
181 * The "idxs" array stores the index of the child node corresponding to the
182 * byte in "byts".
183 * Exception: when the byte is zero, the word may end here and "idxs" holds
184 * the flags and region for the word. There may be several zeros in sequence
185 * for alternative flag/region combinations.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000186 */
187typedef struct slang_S slang_T;
188struct slang_S
189{
190 slang_T *sl_next; /* next language */
191 char_u *sl_name; /* language name "en", "en.rare", "nl", etc. */
Bram Moolenaarb765d632005-06-07 21:00:02 +0000192 char_u *sl_fname; /* name of .spl file */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000193 int sl_add; /* TRUE if it's a .add file. */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000194 char_u *sl_fbyts; /* case-folded word bytes */
195 int *sl_fidxs; /* case-folded word indexes */
196 char_u *sl_kbyts; /* keep-case word bytes */
197 int *sl_kidxs; /* keep-case word indexes */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000198 char_u sl_regions[17]; /* table with up to 8 region names plus NUL */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000199
200 garray_T sl_rep; /* list of fromto_T entries from REP lines */
201 short sl_rep_first[256]; /* indexes where byte first appears, -1 if
202 there is none */
203 garray_T sl_sal; /* list of fromto_T entries from SAL lines */
204 short sl_sal_first[256]; /* indexes where byte first appears, -1 if
205 there is none */
206 int sl_followup; /* SAL followup */
207 int sl_collapse; /* SAL collapse_result */
208 int sl_rem_accents; /* SAL remove_accents */
209 char_u *sl_map; /* string with similar chars from MAP lines */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000210};
211
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000212/* First language that is loaded, start of the linked list of loaded
213 * languages. */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000214static slang_T *first_lang = NULL;
215
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000216/* Flags used in .spl file for soundsalike flags. */
217#define SAL_F0LLOWUP 1
218#define SAL_COLLAPSE 2
219#define SAL_REM_ACCENTS 4
220
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000221/*
222 * Structure used in "b_langp", filled from 'spelllang'.
223 */
224typedef struct langp_S
225{
226 slang_T *lp_slang; /* info for this language (NULL for last one) */
227 int lp_region; /* bitmask for region or REGION_ALL */
228} langp_T;
229
/* Pointer to entry "i" in growarray "ga" of langp_T items. */
#define LANGP_ENTRY(ga, i)  (((langp_T *)(ga).ga_data) + (i))

#define REGION_ALL  0xff        /* word valid in all regions */

/* Result values.  Lower number is accepted over higher one. */
#define SP_BANNED   -1
#define SP_OK       0
#define SP_RARE     1
#define SP_LOCAL    2
#define SP_BAD      3

/* String at the start of a Vim spell file, and its length. */
#define VIMSPELLMAGIC   "VIMspell06"
#define VIMSPELLMAGICL  10
243
244/*
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000245 * Information used when looking for suggestions.
246 */
247typedef struct suginfo_S
248{
249 garray_T su_ga; /* suggestions, contains "suggest_T" */
250 int su_maxscore; /* maximum score for adding to su_ga */
251 int su_icase; /* accept words with wrong case */
252 int su_icase_add; /* add matches while ignoring case */
253 char_u *su_badptr; /* start of bad word in line */
254 int su_badlen; /* length of detected bad word in line */
255 char_u su_badword[MAXWLEN]; /* bad word truncated at su_badlen */
256 char_u su_fbadword[MAXWLEN]; /* su_badword case-folded */
257 hashtab_T su_banned; /* table with banned words */
258#ifdef SOUNDFOLD_SCORE
259 slang_T *su_slang; /* currently used slang_T */
260 char_u su_salword[MAXWLEN]; /* soundfolded badword */
261#endif
262} suginfo_T;
263
264/* One word suggestion. Used in "si_ga". */
265typedef struct suggest_S
266{
267 char_u *st_word; /* suggested word, allocated string */
268 int st_orglen; /* length of replaced text */
269 int st_score; /* lower is better */
270} suggest_T;
271
272#define SUG(sup, i) (((suggest_T *)(sup)->su_ga.ga_data)[i])
273
/* Number of suggestions displayed. */
#define SUG_PROMPT_COUNT    ((int)Rows - 2)

/* Threshold for sorting and cleaning up suggestions. */
#define SUG_CLEANUP_COUNT   (SUG_PROMPT_COUNT + 50)

/* Scores for various changes. */
#define SCORE_SPLIT     99      /* split bad word */
#define SCORE_ICASE     52      /* slightly different case */
#define SCORE_ALLCAP    120     /* need all-cap case */
#define SCORE_REGION    70      /* word is for different region */
#define SCORE_RARE      180     /* rare word */

/* Scores for edit distance. */
#define SCORE_SWAP      90      /* swap two characters */
#define SCORE_SWAP3     110     /* swap two characters in three */
#define SCORE_REP       87      /* REP replacement */
#define SCORE_SUBST     93      /* substitute a character */
#define SCORE_SIMILAR   33      /* substitute a similar character */
#define SCORE_DEL       96      /* delete a character */
#define SCORE_INS       94      /* insert a character */

/* Initial maximum score: higher == slower.
 * 350 allows for about three changes. */
#define SCORE_MAXINIT   350
#define SCORE_MAXMAX    999999  /* accept any score */
299
300/*
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000301 * Structure to store info for word matching.
302 */
303typedef struct matchinf_S
304{
305 langp_T *mi_lp; /* info for language and region */
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000306
307 /* pointers to original text to be checked */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000308 char_u *mi_word; /* start of word being checked */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000309 char_u *mi_end; /* end of matching word */
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000310 char_u *mi_fend; /* next char to be added to mi_fword */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000311 char_u *mi_cend; /* char after what was used for
312 mi_capflags */
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000313
314 /* case-folded text */
315 char_u mi_fword[MAXWLEN + 1]; /* mi_word case-folded */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000316 int mi_fwordlen; /* nr of valid bytes in mi_fword */
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000317
318 /* others */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000319 int mi_result; /* result so far: SP_BAD, SP_OK, etc. */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000320 int mi_capflags; /* WF_ONECAP WF_ALLCAP WF_KEEPCAP */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000321} matchinf_T;
322
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000323/*
324 * The tables used for recognizing word characters according to spelling.
325 * These are only used for the first 256 characters of 'encoding'.
326 */
327typedef struct spelltab_S
328{
329 char_u st_isw[256]; /* flags: is word char */
330 char_u st_isu[256]; /* flags: is uppercase char */
331 char_u st_fold[256]; /* chars: folded case */
332} spelltab_T;
333
334static spelltab_T spelltab;
335static int did_set_spelltab;
336
337#define SPELL_ISWORD 1
338#define SPELL_ISUPPER 2
339
340static void clear_spell_chartab __ARGS((spelltab_T *sp));
341static int set_spell_finish __ARGS((spelltab_T *new_st));
342
343/*
344 * Return TRUE if "p" points to a word character or "c" is a word character
345 * for spelling.
346 * Checking for a word character is done very often, avoid the function call
347 * overhead.
348 */
349#ifdef FEAT_MBYTE
350# define SPELL_ISWORDP(p) ((has_mbyte && MB_BYTE2LEN(*(p)) > 1) \
351 ? (mb_get_class(p) >= 2) : spelltab.st_isw[*(p)])
352#else
353# define SPELL_ISWORDP(p) (spelltab.st_isw[*(p)])
354#endif
355
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000356/*
357 * Struct to keep the state at each level in spell_try_change().
358 */
359typedef struct trystate_S
360{
361 int ts_state; /* state at this level, STATE_ */
362 int ts_score; /* score */
363 int ts_curi; /* index in list of child nodes */
364 int ts_fidx; /* index in fword[], case-folded bad word */
365 int ts_fidxtry; /* ts_fidx at which bytes may be changed */
366 int ts_twordlen; /* valid length of tword[] */
367 int ts_arridx; /* index in tree array, start of node */
368 char_u ts_save_prewordlen; /* saved "prewordlen" */
369 int ts_save_splitoff; /* su_splitoff saved here */
370 int ts_save_badflags; /* badflags saved here */
371} trystate_T;
372
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000373static slang_T *slang_alloc __ARGS((char_u *lang));
374static void slang_free __ARGS((slang_T *lp));
Bram Moolenaarb765d632005-06-07 21:00:02 +0000375static void slang_clear __ARGS((slang_T *lp));
Bram Moolenaar51485f02005-06-04 21:55:20 +0000376static void find_word __ARGS((matchinf_T *mip, int keepcap));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000377static int spell_valid_case __ARGS((int origflags, int treeflags));
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000378static void spell_load_lang __ARGS((char_u *lang));
Bram Moolenaarb765d632005-06-07 21:00:02 +0000379static char_u *spell_enc __ARGS((void));
380static void spell_load_cb __ARGS((char_u *fname, void *cookie));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000381static slang_T *spell_load_file __ARGS((char_u *fname, char_u *lang, slang_T *old_lp, int silent));
Bram Moolenaar51485f02005-06-04 21:55:20 +0000382static int read_tree __ARGS((FILE *fd, char_u *byts, int *idxs, int maxidx, int startidx));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000383static int find_region __ARGS((char_u *rp, char_u *region));
384static int captype __ARGS((char_u *word, char_u *end));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000385static void spell_reload_one __ARGS((char_u *fname, int added_word));
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000386static int set_spell_charflags __ARGS((char_u *flags, int cnt, char_u *upp));
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000387static int set_spell_chartab __ARGS((char_u *fol, char_u *low, char_u *upp));
388static void write_spell_chartab __ARGS((FILE *fd));
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000389static int spell_isupper __ARGS((int c));
390static int spell_casefold __ARGS((char_u *p, int len, char_u *buf, int buflen));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000391static void onecap_copy __ARGS((char_u *word, int len, char_u *wcopy, int upper));
392static void spell_try_change __ARGS((suginfo_T *su));
393static int try_deeper __ARGS((suginfo_T *su, trystate_T *stack, int depth, int score_add));
394static void find_keepcap_word __ARGS((slang_T *slang, char_u *fword, char_u *kword));
395static void spell_try_soundalike __ARGS((suginfo_T *su));
396static void make_case_word __ARGS((char_u *fword, char_u *cword, int flags));
397static int similar_chars __ARGS((slang_T *slang, int c1, int c2));
398static void add_suggestion __ARGS((suginfo_T *su, char_u *goodword, int use_score));
399static void add_banned __ARGS((suginfo_T *su, char_u *word));
400static int was_banned __ARGS((suginfo_T *su, char_u *word));
401static void free_banned __ARGS((suginfo_T *su));
402static void cleanup_suggestions __ARGS((suginfo_T *su));
403static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, char_u *res));
404static int spell_edit_score __ARGS((char_u *badword, char_u *goodword));
405
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000406
407static char *e_format = N_("E759: Format error in spell file");
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000408
409/*
410 * Main spell-checking function.
Bram Moolenaar51485f02005-06-04 21:55:20 +0000411 * "ptr" points to a character that could be the start of a word.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000412 * "*attrp" is set to the attributes for a badly spelled word. For a non-word
413 * or when it's OK it remains unchanged.
414 * This must only be called when 'spelllang' is not empty.
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000415 *
416 * "sug" is normally NULL. When looking for suggestions it points to
417 * suginfo_T. It's passed as a void pointer to keep the struct local.
418 *
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000419 * Returns the length of the word in bytes, also when it's OK, so that the
420 * caller can skip over the word.
421 */
422 int
Bram Moolenaar51485f02005-06-04 21:55:20 +0000423spell_check(wp, ptr, attrp)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000424 win_T *wp; /* current window */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000425 char_u *ptr;
426 int *attrp;
427{
428 matchinf_T mi; /* Most things are put in "mi" so that it can
429 be passed to functions quickly. */
430
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000431 /* A word never starts at a space or a control character. Return quickly
432 * then, skipping over the character. */
433 if (*ptr <= ' ')
434 return 1;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000435
Bram Moolenaar51485f02005-06-04 21:55:20 +0000436 /* A word starting with a number is always OK. Also skip hexadecimal
437 * numbers 0xFF99 and 0X99FF. */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000438 if (*ptr >= '0' && *ptr <= '9')
Bram Moolenaar51485f02005-06-04 21:55:20 +0000439 {
Bram Moolenaar3982c542005-06-08 21:56:31 +0000440 if (*ptr == '0' && (ptr[1] == 'x' || ptr[1] == 'X'))
441 mi.mi_end = skiphex(ptr + 2);
Bram Moolenaar51485f02005-06-04 21:55:20 +0000442 else
443 mi.mi_end = skipdigits(ptr);
444 }
445 else
446 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000447 /* Find the end of the word. */
448 mi.mi_word = ptr;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000449 mi.mi_fend = ptr;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000450
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000451 if (SPELL_ISWORDP(mi.mi_fend))
Bram Moolenaar51485f02005-06-04 21:55:20 +0000452 {
453 /* Make case-folded copy of the characters until the next non-word
454 * character. */
455 do
456 {
457 mb_ptr_adv(mi.mi_fend);
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000458 } while (*mi.mi_fend != NUL && SPELL_ISWORDP(mi.mi_fend));
Bram Moolenaar51485f02005-06-04 21:55:20 +0000459 }
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000460
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000461 /* We always use the characters up to the next non-word character,
462 * also for bad words. */
463 mi.mi_end = mi.mi_fend;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000464
465 /* Check caps type later. */
466 mi.mi_capflags = 0;
467 mi.mi_cend = NULL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000468
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000469 /* Include one non-word character so that we can check for the
470 * word end. */
471 if (*mi.mi_fend != NUL)
472 mb_ptr_adv(mi.mi_fend);
473
474 (void)spell_casefold(ptr, (int)(mi.mi_fend - ptr), mi.mi_fword,
475 MAXWLEN + 1);
476 mi.mi_fwordlen = STRLEN(mi.mi_fword);
477
Bram Moolenaar51485f02005-06-04 21:55:20 +0000478 /* The word is bad unless we recognize it. */
479 mi.mi_result = SP_BAD;
480
481 /*
482 * Loop over the languages specified in 'spelllang'.
483 * We check them all, because a matching word may be longer than an
484 * already found matching word.
485 */
486 for (mi.mi_lp = LANGP_ENTRY(wp->w_buffer->b_langp, 0);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000487 mi.mi_lp->lp_slang != NULL; ++mi.mi_lp)
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000488 {
Bram Moolenaar51485f02005-06-04 21:55:20 +0000489 /* Check for a matching word in case-folded words. */
490 find_word(&mi, FALSE);
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000491
Bram Moolenaar51485f02005-06-04 21:55:20 +0000492 find_word(&mi, TRUE);
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000493 }
494
Bram Moolenaar51485f02005-06-04 21:55:20 +0000495 if (mi.mi_result != SP_OK)
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +0000496 {
Bram Moolenaar51485f02005-06-04 21:55:20 +0000497 /* When we are at a non-word character there is no error, just
498 * skip over the character (try looking for a word after it). */
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000499 if (!SPELL_ISWORDP(ptr))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000500 {
501#ifdef FEAT_MBYTE
502 if (has_mbyte)
Bram Moolenaar51485f02005-06-04 21:55:20 +0000503 return mb_ptr2len_check(ptr);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000504#endif
Bram Moolenaar51485f02005-06-04 21:55:20 +0000505 return 1;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000506 }
Bram Moolenaar51485f02005-06-04 21:55:20 +0000507
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000508 if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED)
Bram Moolenaar51485f02005-06-04 21:55:20 +0000509 *attrp = highlight_attr[HLF_SPB];
510 else if (mi.mi_result == SP_RARE)
511 *attrp = highlight_attr[HLF_SPR];
512 else
513 *attrp = highlight_attr[HLF_SPL];
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000514 }
515 }
516
Bram Moolenaar51485f02005-06-04 21:55:20 +0000517 return (int)(mi.mi_end - ptr);
518}
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000519
Bram Moolenaar51485f02005-06-04 21:55:20 +0000520/*
521 * Check if the word at "mip->mi_word" is in the tree.
522 * When "keepcap" is TRUE check in keep-case word tree.
523 *
524 * For a match mip->mi_result is updated.
525 */
526 static void
527find_word(mip, keepcap)
528 matchinf_T *mip;
529 int keepcap;
530{
531 int arridx = 0;
532 int endlen[MAXWLEN]; /* length at possible word endings */
533 int endidx[MAXWLEN]; /* possible word endings */
534 int endidxcnt = 0;
535 int len;
536 int wlen = 0;
537 int flen;
538 int c;
539 char_u *ptr;
540 unsigned lo, hi, m;
541#ifdef FEAT_MBYTE
542 char_u *s;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000543#endif
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000544 char_u *p;
545 int res = SP_BAD;
546 int valid;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000547 slang_T *slang = mip->mi_lp->lp_slang;
548 unsigned flags;
549 char_u *byts;
550 int *idxs;
551
552 if (keepcap)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000553 {
Bram Moolenaar51485f02005-06-04 21:55:20 +0000554 /* Check for word with matching case in keep-case tree. */
555 ptr = mip->mi_word;
556 flen = 9999; /* no case folding, always enough bytes */
557 byts = slang->sl_kbyts;
558 idxs = slang->sl_kidxs;
559 }
560 else
561 {
562 /* Check for case-folded in case-folded tree. */
563 ptr = mip->mi_fword;
564 flen = mip->mi_fwordlen; /* available case-folded bytes */
565 byts = slang->sl_fbyts;
566 idxs = slang->sl_fidxs;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000567 }
568
Bram Moolenaar51485f02005-06-04 21:55:20 +0000569 if (byts == NULL)
570 return; /* array is empty */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000571
Bram Moolenaar51485f02005-06-04 21:55:20 +0000572 /*
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000573 * Repeat advancing in the tree until:
574 * - there is a byte that doesn't match,
575 * - we reach the end of the tree,
576 * - or we reach the end of the line.
Bram Moolenaar51485f02005-06-04 21:55:20 +0000577 */
578 for (;;)
579 {
580 if (flen == 0 && *mip->mi_fend != NUL)
581 {
582 /* Need to fold at least one more character. Do until next
583 * non-word character for efficiency. */
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000584 p = mip->mi_fend;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000585 do
586 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000587 mb_ptr_adv(mip->mi_fend);
588 } while (*mip->mi_fend != NUL && SPELL_ISWORDP(mip->mi_fend));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000589
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000590 /* Include the non-word character so that we can check for the
591 * word end. */
592 if (*mip->mi_fend != NUL)
593 mb_ptr_adv(mip->mi_fend);
594
595 (void)spell_casefold(p, (int)(mip->mi_fend - p),
Bram Moolenaar51485f02005-06-04 21:55:20 +0000596 mip->mi_fword + mip->mi_fwordlen,
597 MAXWLEN - mip->mi_fwordlen);
Bram Moolenaar51485f02005-06-04 21:55:20 +0000598 flen = STRLEN(mip->mi_fword + mip->mi_fwordlen);
599 mip->mi_fwordlen += flen;
600 }
601
602 len = byts[arridx++];
603
604 /* If the first possible byte is a zero the word could end here.
605 * Remember this index, we first check for the longest word. */
606 if (byts[arridx] == 0)
607 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000608 if (endidxcnt == MAXWLEN)
609 {
610 /* Must be a corrupted spell file. */
611 EMSG(_(e_format));
612 return;
613 }
Bram Moolenaar51485f02005-06-04 21:55:20 +0000614 endlen[endidxcnt] = wlen;
615 endidx[endidxcnt++] = arridx++;
616 --len;
617
618 /* Skip over the zeros, there can be several flag/region
619 * combinations. */
620 while (len > 0 && byts[arridx] == 0)
621 {
622 ++arridx;
623 --len;
624 }
625 if (len == 0)
626 break; /* no children, word must end here */
627 }
628
629 /* Stop looking at end of the line. */
630 if (ptr[wlen] == NUL)
631 break;
632
633 /* Perform a binary search in the list of accepted bytes. */
634 c = ptr[wlen];
635 lo = arridx;
636 hi = arridx + len - 1;
637 while (lo < hi)
638 {
639 m = (lo + hi) / 2;
640 if (byts[m] > c)
641 hi = m - 1;
642 else if (byts[m] < c)
643 lo = m + 1;
644 else
645 {
646 lo = hi = m;
647 break;
648 }
649 }
650
651 /* Stop if there is no matching byte. */
652 if (hi < lo || byts[lo] != c)
653 break;
654
655 /* Continue at the child (if there is one). */
656 arridx = idxs[lo];
657 ++wlen;
658 --flen;
659 }
660
661 /*
662 * Verify that one of the possible endings is valid. Try the longest
663 * first.
664 */
665 while (endidxcnt > 0)
666 {
667 --endidxcnt;
668 arridx = endidx[endidxcnt];
669 wlen = endlen[endidxcnt];
670
671#ifdef FEAT_MBYTE
672 if ((*mb_head_off)(ptr, ptr + wlen) > 0)
673 continue; /* not at first byte of character */
674#endif
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000675 if (SPELL_ISWORDP(ptr + wlen))
Bram Moolenaar51485f02005-06-04 21:55:20 +0000676 continue; /* next char is a word character */
677
678#ifdef FEAT_MBYTE
679 if (!keepcap && has_mbyte)
680 {
681 /* Compute byte length in original word, length may change
682 * when folding case. */
683 p = mip->mi_word;
684 for (s = ptr; s < ptr + wlen; mb_ptr_adv(s))
685 mb_ptr_adv(p);
686 wlen = p - mip->mi_word;
687 }
688#endif
689
690 /* Check flags and region. Repeat this if there are more
691 * flags/region alternatives until there is a match. */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000692 for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0; --len)
693 {
694 flags = idxs[arridx];
695 if (keepcap)
696 {
697 /* For "keepcap" tree the case is always right. */
698 valid = TRUE;
699 }
700 else
701 {
702 /* Check that the word is in the required case. */
703 if (mip->mi_cend != mip->mi_word + wlen)
704 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000705 /* mi_capflags was set for a different word length, need
706 * to do it again. */
Bram Moolenaar51485f02005-06-04 21:55:20 +0000707 mip->mi_cend = mip->mi_word + wlen;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000708 mip->mi_capflags = captype(mip->mi_word, mip->mi_cend);
Bram Moolenaar51485f02005-06-04 21:55:20 +0000709 }
710
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000711 valid = spell_valid_case(mip->mi_capflags, flags);
Bram Moolenaar51485f02005-06-04 21:55:20 +0000712 }
713
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000714 if (valid)
Bram Moolenaar51485f02005-06-04 21:55:20 +0000715 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000716 if (flags & WF_BANNED)
717 res = SP_BANNED;
718 else if (flags & WF_REGION)
Bram Moolenaar51485f02005-06-04 21:55:20 +0000719 {
720 /* Check region. */
721 if ((mip->mi_lp->lp_region & (flags >> 8)) != 0)
722 res = SP_OK;
723 else
724 res = SP_LOCAL;
725 }
726 else if (flags & WF_RARE)
727 res = SP_RARE;
728 else
729 res = SP_OK;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000730
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000731 /* Always use the longest match and the best result. */
732 if (mip->mi_result > res)
733 {
734 mip->mi_result = res;
735 mip->mi_end = mip->mi_word + wlen;
736 }
737 else if (mip->mi_result == res
738 && mip->mi_end < mip->mi_word + wlen)
739 mip->mi_end = mip->mi_word + wlen;
740
741 if (res == SP_OK)
742 break;
743 }
744 else
745 res = SP_BAD;
746
Bram Moolenaar51485f02005-06-04 21:55:20 +0000747 ++arridx;
748 }
749
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000750 if (res == SP_OK)
Bram Moolenaar51485f02005-06-04 21:55:20 +0000751 break;
Bram Moolenaar51485f02005-06-04 21:55:20 +0000752 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000753}
754
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000755/*
756 * Check case flags for a word. Return TRUE if the word has the requested
757 * case.
758 */
759 static int
760spell_valid_case(origflags, treeflags)
761 int origflags; /* flags for the checked word. */
762 int treeflags; /* flags for the word in the spell tree */
763{
764 return (origflags == WF_ALLCAP
765 || ((treeflags & (WF_ALLCAP | WF_KEEPCAP)) == 0
766 && ((treeflags & WF_ONECAP) == 0 || origflags == WF_ONECAP)));
767}
768
Bram Moolenaar51485f02005-06-04 21:55:20 +0000769
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000770/*
771 * Move to next spell error.
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000772 * "curline" is TRUE for "z?": find word under/after cursor in the same line.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000773 * Return OK if found, FAIL otherwise.
774 */
775 int
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000776spell_move_to(dir, allwords, curline)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000777 int dir; /* FORWARD or BACKWARD */
778 int allwords; /* TRUE for "[s" and "]s" */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000779 int curline;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000780{
Bram Moolenaar2cf8b302005-04-20 19:37:22 +0000781 linenr_T lnum;
782 pos_T found_pos;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000783 char_u *line;
784 char_u *p;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000785 int attr = 0;
786 int len;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +0000787 int has_syntax = syntax_present(curbuf);
788 int col;
789 int can_spell;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000790
Bram Moolenaarb765d632005-06-07 21:00:02 +0000791 if (!curwin->w_p_spell || *curbuf->b_p_spl == NUL)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000792 {
793 EMSG(_("E756: Spell checking not enabled"));
794 return FAIL;
795 }
796
Bram Moolenaar2cf8b302005-04-20 19:37:22 +0000797 /*
798 * Start looking for bad word at the start of the line, because we can't
799 * start halfway a word, we don't know where it starts or ends.
800 *
801 * When searching backwards, we continue in the line to find the last
802 * bad word (in the cursor line: before the cursor).
803 */
804 lnum = curwin->w_cursor.lnum;
805 found_pos.lnum = 0;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000806
807 while (!got_int)
808 {
Bram Moolenaar2cf8b302005-04-20 19:37:22 +0000809 line = ml_get(lnum);
810 p = line;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +0000811
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000812 while (*p != NUL)
813 {
Bram Moolenaar51485f02005-06-04 21:55:20 +0000814 /* When searching backward don't search after the cursor. */
815 if (dir == BACKWARD
816 && lnum == curwin->w_cursor.lnum
817 && (colnr_T)(p - line) >= curwin->w_cursor.col)
818 break;
819
820 /* start of word */
821 len = spell_check(curwin, p, &attr);
822
823 if (attr != 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000824 {
Bram Moolenaar51485f02005-06-04 21:55:20 +0000825 /* We found a bad word. Check the attribute. */
826 /* TODO: check for syntax @Spell cluster. */
827 if (allwords || attr == highlight_attr[HLF_SPB])
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000828 {
Bram Moolenaar51485f02005-06-04 21:55:20 +0000829 /* When searching forward only accept a bad word after
830 * the cursor. */
831 if (dir == BACKWARD
832 || lnum > curwin->w_cursor.lnum
833 || (lnum == curwin->w_cursor.lnum
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000834 && (colnr_T)(curline ? p - line + len
835 : p - line)
Bram Moolenaar51485f02005-06-04 21:55:20 +0000836 > curwin->w_cursor.col))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000837 {
Bram Moolenaar51485f02005-06-04 21:55:20 +0000838 if (has_syntax)
Bram Moolenaar2cf8b302005-04-20 19:37:22 +0000839 {
Bram Moolenaar51485f02005-06-04 21:55:20 +0000840 col = p - line;
841 (void)syn_get_id(lnum, (colnr_T)col,
842 FALSE, &can_spell);
Bram Moolenaar2cf8b302005-04-20 19:37:22 +0000843
Bram Moolenaar51485f02005-06-04 21:55:20 +0000844 /* have to get the line again, a multi-line
845 * regexp may make it invalid */
846 line = ml_get(lnum);
847 p = line + col;
848 }
849 else
850 can_spell = TRUE;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +0000851
Bram Moolenaar51485f02005-06-04 21:55:20 +0000852 if (can_spell)
853 {
854 found_pos.lnum = lnum;
855 found_pos.col = p - line;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +0000856#ifdef FEAT_VIRTUALEDIT
Bram Moolenaar51485f02005-06-04 21:55:20 +0000857 found_pos.coladd = 0;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +0000858#endif
Bram Moolenaar51485f02005-06-04 21:55:20 +0000859 if (dir == FORWARD)
860 {
861 /* No need to search further. */
862 curwin->w_cursor = found_pos;
863 return OK;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +0000864 }
865 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000866 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000867 }
Bram Moolenaar51485f02005-06-04 21:55:20 +0000868 attr = 0;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000869 }
870
Bram Moolenaar51485f02005-06-04 21:55:20 +0000871 /* advance to character after the word */
872 p += len;
873 if (*p == NUL)
874 break;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000875 }
876
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000877 if (curline)
878 return FAIL; /* only check cursor line */
879
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000880 /* Advance to next line. */
Bram Moolenaar2cf8b302005-04-20 19:37:22 +0000881 if (dir == BACKWARD)
882 {
883 if (found_pos.lnum != 0)
884 {
885 /* Use the last match in the line. */
886 curwin->w_cursor = found_pos;
887 return OK;
888 }
889 if (lnum == 1)
890 return FAIL;
891 --lnum;
892 }
893 else
894 {
895 if (lnum == curbuf->b_ml.ml_line_count)
896 return FAIL;
897 ++lnum;
898 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000899
900 line_breakcheck();
901 }
902
903 return FAIL; /* interrupted */
904}
905
906/*
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000907 * Load word list(s) for "lang" from Vim spell file(s).
Bram Moolenaarb765d632005-06-07 21:00:02 +0000908 * "lang" must be the language without the region: e.g., "en".
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000909 */
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000910 static void
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000911spell_load_lang(lang)
912 char_u *lang;
913{
Bram Moolenaarb765d632005-06-07 21:00:02 +0000914 char_u fname_enc[85];
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000915 int r;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000916 char_u langcp[MAXWLEN + 1];
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000917
Bram Moolenaarb765d632005-06-07 21:00:02 +0000918 /* Copy the language name to pass it to spell_load_cb() as a cookie.
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000919 * It's truncated when an error is detected. */
920 STRCPY(langcp, lang);
921
Bram Moolenaarb765d632005-06-07 21:00:02 +0000922 /*
923 * Find the first spell file for "lang" in 'runtimepath' and load it.
924 */
925 vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5,
926 "spell/%s.%s.spl", lang, spell_enc());
927 r = do_in_runtimepath(fname_enc, FALSE, spell_load_cb, &langcp);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000928
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000929 if (r == FAIL && *langcp != NUL)
930 {
931 /* Try loading the ASCII version. */
Bram Moolenaarb765d632005-06-07 21:00:02 +0000932 vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5,
Bram Moolenaar9c13b352005-05-19 20:53:52 +0000933 "spell/%s.ascii.spl", lang);
Bram Moolenaarb765d632005-06-07 21:00:02 +0000934 r = do_in_runtimepath(fname_enc, FALSE, spell_load_cb, &langcp);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000935 }
936
Bram Moolenaarcfc6c432005-06-06 21:50:35 +0000937 if (r == FAIL)
938 smsg((char_u *)_("Warning: Cannot find word list \"%s\""),
939 fname_enc + 6);
Bram Moolenaarb765d632005-06-07 21:00:02 +0000940 else if (*langcp != NUL)
941 {
942 /* Load all the additions. */
943 STRCPY(fname_enc + STRLEN(fname_enc) - 3, "add.spl");
944 do_in_runtimepath(fname_enc, TRUE, spell_load_cb, &langcp);
945 }
946}
947
948/*
949 * Return the encoding used for spell checking: Use 'encoding', except that we
950 * use "latin1" for "latin9". And limit to 60 characters (just in case).
951 */
952 static char_u *
953spell_enc()
954{
955
956#ifdef FEAT_MBYTE
957 if (STRLEN(p_enc) < 60 && STRCMP(p_enc, "iso-8859-15") != 0)
958 return p_enc;
959#endif
960 return (char_u *)"latin1";
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000961}
962
963/*
964 * Allocate a new slang_T.
965 * Caller must fill "sl_next".
966 */
967 static slang_T *
968slang_alloc(lang)
969 char_u *lang;
970{
971 slang_T *lp;
972
Bram Moolenaar51485f02005-06-04 21:55:20 +0000973 lp = (slang_T *)alloc_clear(sizeof(slang_T));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000974 if (lp != NULL)
975 {
976 lp->sl_name = vim_strsave(lang);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +0000977 ga_init2(&lp->sl_rep, sizeof(fromto_T), 10);
978 ga_init2(&lp->sl_sal, sizeof(fromto_T), 10);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000979 }
980 return lp;
981}
982
983/*
984 * Free the contents of an slang_T and the structure itself.
985 */
986 static void
987slang_free(lp)
988 slang_T *lp;
989{
Bram Moolenaar402d2fe2005-04-15 21:00:38 +0000990 vim_free(lp->sl_name);
Bram Moolenaarb765d632005-06-07 21:00:02 +0000991 vim_free(lp->sl_fname);
992 slang_clear(lp);
993 vim_free(lp);
994}
995
996/*
997 * Clear an slang_T so that the file can be reloaded.
998 */
999 static void
1000slang_clear(lp)
1001 slang_T *lp;
1002{
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001003 garray_T *gap;
1004 fromto_T *ftp;
1005 int round;
1006
Bram Moolenaar51485f02005-06-04 21:55:20 +00001007 vim_free(lp->sl_fbyts);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001008 lp->sl_fbyts = NULL;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001009 vim_free(lp->sl_kbyts);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001010 lp->sl_kbyts = NULL;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001011 vim_free(lp->sl_fidxs);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001012 lp->sl_fidxs = NULL;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001013 vim_free(lp->sl_kidxs);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001014 lp->sl_kidxs = NULL;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001015
1016 for (round = 1; round <= 2; ++round)
1017 {
1018 gap = round == 1 ? &lp->sl_rep : &lp->sl_sal;
1019 while (gap->ga_len > 0)
1020 {
1021 ftp = &((fromto_T *)gap->ga_data)[--gap->ga_len];
1022 vim_free(ftp->ft_from);
1023 vim_free(ftp->ft_to);
1024 }
1025 ga_clear(gap);
1026 }
1027
1028 vim_free(lp->sl_map);
1029 lp->sl_map = NULL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001030}
1031
1032/*
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001033 * Load one spell file and store the info into a slang_T.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001034 * Invoked through do_in_runtimepath().
1035 */
1036 static void
Bram Moolenaarb765d632005-06-07 21:00:02 +00001037spell_load_cb(fname, cookie)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001038 char_u *fname;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001039 void *cookie; /* points to the language name */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001040{
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001041 (void)spell_load_file(fname, (char_u *)cookie, NULL, FALSE);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001042}
1043
1044/*
1045 * Load one spell file and store the info into a slang_T.
1046 *
1047 * This is invoked in two ways:
1048 * - From spell_load_cb() to load a spell file for the first time. "lang" is
1049 * the language name, "old_lp" is NULL. Will allocate an slang_T.
1050 * - To reload a spell file that was changed. "lang" is NULL and "old_lp"
1051 * points to the existing slang_T.
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001052 * Returns the slang_T the spell file was loaded into. NULL for error.
Bram Moolenaarb765d632005-06-07 21:00:02 +00001053 */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001054 static slang_T *
1055spell_load_file(fname, lang, old_lp, silent)
Bram Moolenaarb765d632005-06-07 21:00:02 +00001056 char_u *fname;
1057 char_u *lang;
1058 slang_T *old_lp;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001059 int silent; /* no error if file doesn't exist */
Bram Moolenaarb765d632005-06-07 21:00:02 +00001060{
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001061 FILE *fd;
1062 char_u buf[MAXWLEN + 1];
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001063 char_u *p;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001064 int i;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001065 int len;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001066 int round;
1067 char_u *save_sourcing_name = sourcing_name;
1068 linenr_T save_sourcing_lnum = sourcing_lnum;
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001069 int cnt, ccnt;
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001070 char_u *fol;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001071 slang_T *lp = NULL;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001072 garray_T *gap;
1073 fromto_T *ftp;
1074 int rr;
1075 short *first;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001076
Bram Moolenaarb765d632005-06-07 21:00:02 +00001077 fd = mch_fopen((char *)fname, "r");
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001078 if (fd == NULL)
1079 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001080 if (!silent)
1081 EMSG2(_(e_notopen), fname);
1082 else if (p_verbose > 2)
1083 {
1084 verbose_enter();
1085 smsg((char_u *)e_notopen, fname);
1086 verbose_leave();
1087 }
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001088 goto endFAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001089 }
Bram Moolenaarb765d632005-06-07 21:00:02 +00001090 if (p_verbose > 2)
1091 {
1092 verbose_enter();
1093 smsg((char_u *)_("Reading spell file \"%s\""), fname);
1094 verbose_leave();
1095 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001096
Bram Moolenaarb765d632005-06-07 21:00:02 +00001097 if (old_lp == NULL)
1098 {
1099 lp = slang_alloc(lang);
1100 if (lp == NULL)
1101 goto endFAIL;
1102
1103 /* Remember the file name, used to reload the file when it's updated. */
1104 lp->sl_fname = vim_strsave(fname);
1105 if (lp->sl_fname == NULL)
1106 goto endFAIL;
1107
1108 /* Check for .add.spl. */
1109 lp->sl_add = strstr((char *)gettail(fname), ".add.") != NULL;
1110 }
1111 else
1112 lp = old_lp;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001113
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001114 /* Set sourcing_name, so that error messages mention the file name. */
1115 sourcing_name = fname;
1116 sourcing_lnum = 0;
1117
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001118 /* <HEADER>: <fileID> <regioncnt> <regionname> ...
1119 * <charflagslen> <charflags> <fcharslen> <fchars> */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001120 for (i = 0; i < VIMSPELLMAGICL; ++i)
1121 buf[i] = getc(fd); /* <fileID> */
1122 if (STRNCMP(buf, VIMSPELLMAGIC, VIMSPELLMAGICL) != 0)
1123 {
1124 EMSG(_("E757: Wrong file ID in spell file"));
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001125 goto endFAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001126 }
1127
1128 cnt = getc(fd); /* <regioncnt> */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001129 if (cnt < 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001130 {
1131truncerr:
1132 EMSG(_("E758: Truncated spell file"));
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001133 goto endFAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001134 }
1135 if (cnt > 8)
1136 {
1137formerr:
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001138 EMSG(_(e_format));
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001139 goto endFAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001140 }
1141 for (i = 0; i < cnt; ++i)
1142 {
1143 lp->sl_regions[i * 2] = getc(fd); /* <regionname> */
1144 lp->sl_regions[i * 2 + 1] = getc(fd);
1145 }
1146 lp->sl_regions[cnt * 2] = NUL;
1147
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001148 cnt = getc(fd); /* <charflagslen> */
1149 if (cnt > 0)
1150 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00001151 p = alloc((unsigned)cnt);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001152 if (p == NULL)
1153 goto endFAIL;
1154 for (i = 0; i < cnt; ++i)
1155 p[i] = getc(fd); /* <charflags> */
1156
1157 ccnt = (getc(fd) << 8) + getc(fd); /* <fcharslen> */
1158 if (ccnt <= 0)
Bram Moolenaar51485f02005-06-04 21:55:20 +00001159 {
1160 vim_free(p);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001161 goto formerr;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001162 }
1163 fol = alloc((unsigned)ccnt + 1);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001164 if (fol == NULL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00001165 {
1166 vim_free(p);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001167 goto endFAIL;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001168 }
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001169 for (i = 0; i < ccnt; ++i)
1170 fol[i] = getc(fd); /* <fchars> */
1171 fol[i] = NUL;
1172
1173 /* Set the word-char flags and fill spell_isupper() table. */
Bram Moolenaar51485f02005-06-04 21:55:20 +00001174 i = set_spell_charflags(p, cnt, fol);
1175 vim_free(p);
1176 vim_free(fol);
1177 if (i == FAIL)
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001178 goto formerr;
1179 }
1180 else
1181 {
1182 /* When <charflagslen> is zero then <fcharlen> must also be zero. */
1183 cnt = (getc(fd) << 8) + getc(fd);
1184 if (cnt != 0)
1185 goto formerr;
1186 }
1187
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001188 /* <SUGGEST> : <repcount> <rep> ...
1189 * <salflags> <salcount> <sal> ...
1190 * <maplen> <mapstr> */
1191 for (round = 1; round <= 2; ++round)
1192 {
1193 if (round == 1)
1194 {
1195 gap = &lp->sl_rep;
1196 first = lp->sl_rep_first;
1197 }
1198 else
1199 {
1200 gap = &lp->sl_sal;
1201 first = lp->sl_sal_first;
1202
1203 i = getc(fd); /* <salflags> */
1204 if (i & SAL_F0LLOWUP)
1205 lp->sl_followup = TRUE;
1206 if (i & SAL_COLLAPSE)
1207 lp->sl_collapse = TRUE;
1208 if (i & SAL_REM_ACCENTS)
1209 lp->sl_rem_accents = TRUE;
1210 }
1211
1212 cnt = (getc(fd) << 8) + getc(fd); /* <repcount> or <salcount> */
1213 if (cnt < 0)
1214 goto formerr;
1215
1216 if (ga_grow(gap, cnt) == FAIL)
1217 goto endFAIL;
1218 for (; gap->ga_len < cnt; ++gap->ga_len)
1219 {
1220 /* <rep> : <repfromlen> <repfrom> <reptolen> <repto> */
1221 /* <sal> : <salfromlen> <salfrom> <saltolen> <salto> */
1222 ftp = &((fromto_T *)gap->ga_data)[gap->ga_len];
1223 for (rr = 1; rr <= 2; ++rr)
1224 {
1225 ccnt = getc(fd);
1226 if (ccnt < 0)
1227 {
1228 if (rr == 2)
1229 vim_free(ftp->ft_from);
1230 goto formerr;
1231 }
1232 if ((p = alloc(ccnt + 1)) == NULL)
1233 {
1234 if (rr == 2)
1235 vim_free(ftp->ft_from);
1236 goto endFAIL;
1237 }
1238 for (i = 0; i < ccnt; ++i)
1239 p[i] = getc(fd); /* <repfrom> or <salfrom> */
1240 p[i] = NUL;
1241 if (rr == 1)
1242 ftp->ft_from = p;
1243 else
1244 ftp->ft_to = p;
1245 }
1246 }
1247
1248 /* Fill the first-index table. */
1249 for (i = 0; i < 256; ++i)
1250 first[i] = -1;
1251 for (i = 0; i < gap->ga_len; ++i)
1252 {
1253 ftp = &((fromto_T *)gap->ga_data)[i];
1254 if (first[*ftp->ft_from] == -1)
1255 first[*ftp->ft_from] = i;
1256 }
1257 }
1258
1259 cnt = (getc(fd) << 8) + getc(fd); /* <maplen> */
1260 if (cnt < 0)
1261 goto formerr;
1262 p = alloc(cnt + 1);
1263 if (p == NULL)
1264 goto endFAIL;
1265 for (i = 0; i < cnt; ++i)
1266 p[i] = getc(fd); /* <mapstr> */
1267 p[i] = NUL;
1268 lp->sl_map = p;
1269
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001270
Bram Moolenaar51485f02005-06-04 21:55:20 +00001271 /* round 1: <LWORDTREE>
1272 * round 2: <KWORDTREE> */
1273 for (round = 1; round <= 2; ++round)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001274 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00001275 /* The tree size was computed when writing the file, so that we can
1276 * allocate it as one long block. <nodecount> */
1277 len = (getc(fd) << 24) + (getc(fd) << 16) + (getc(fd) << 8) + getc(fd);
1278 if (len < 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001279 goto truncerr;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001280 if (len > 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001281 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00001282 /* Allocate the byte array. */
1283 p = lalloc((long_u)len, TRUE);
1284 if (p == NULL)
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001285 goto endFAIL;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001286 if (round == 1)
1287 lp->sl_fbyts = p;
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00001288 else
Bram Moolenaar51485f02005-06-04 21:55:20 +00001289 lp->sl_kbyts = p;
1290
1291 /* Allocate the index array. */
1292 p = lalloc_clear((long_u)(len * sizeof(int)), TRUE);
1293 if (p == NULL)
1294 goto endFAIL;
1295 if (round == 1)
1296 lp->sl_fidxs = (int *)p;
1297 else
1298 lp->sl_kidxs = (int *)p;
1299
1300
1301 /* Read the tree and store it in the array. */
1302 i = read_tree(fd,
1303 round == 1 ? lp->sl_fbyts : lp->sl_kbyts,
1304 round == 1 ? lp->sl_fidxs : lp->sl_kidxs,
1305 len, 0);
1306 if (i == -1)
1307 goto truncerr;
1308 if (i < 0)
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001309 goto formerr;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001310 }
1311 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00001312
Bram Moolenaarb765d632005-06-07 21:00:02 +00001313 /* For a new file link it in the list of spell files. */
1314 if (old_lp == NULL)
1315 {
1316 lp->sl_next = first_lang;
1317 first_lang = lp;
1318 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001319
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001320 goto endOK;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001321
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001322endFAIL:
Bram Moolenaarb765d632005-06-07 21:00:02 +00001323 if (lang != NULL)
1324 /* truncating the name signals the error to spell_load_lang() */
1325 *lang = NUL;
1326 if (lp != NULL && old_lp == NULL)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001327 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001328 slang_free(lp);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001329 lp = NULL;
1330 }
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001331
1332endOK:
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001333 if (fd != NULL)
1334 fclose(fd);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001335 sourcing_name = save_sourcing_name;
1336 sourcing_lnum = save_sourcing_lnum;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001337
1338 return lp;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001339}
1340
1341/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00001342 * Read one row of siblings from the spell file and store it in the byte array
1343 * "byts" and index array "idxs". Recursively read the children.
1344 *
1345 * NOTE: The code here must match put_tree().
1346 *
1347 * Returns the index follosing the siblings.
1348 * Returns -1 if the file is shorter than expected.
1349 * Returns -2 if there is a format error.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001350 */
Bram Moolenaar51485f02005-06-04 21:55:20 +00001351 static int
1352read_tree(fd, byts, idxs, maxidx, startidx)
1353 FILE *fd;
1354 char_u *byts;
1355 int *idxs;
1356 int maxidx; /* size of arrays */
1357 int startidx; /* current index in "byts" and "idxs" */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001358{
Bram Moolenaar51485f02005-06-04 21:55:20 +00001359 int len;
1360 int i;
1361 int n;
1362 int idx = startidx;
1363 int c;
1364#define SHARED_MASK 0x8000000
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001365
Bram Moolenaar51485f02005-06-04 21:55:20 +00001366 len = getc(fd); /* <siblingcount> */
1367 if (len <= 0)
1368 return -1;
1369
1370 if (startidx + len >= maxidx)
1371 return -2;
1372 byts[idx++] = len;
1373
1374 /* Read the byte values, flag/region bytes and shared indexes. */
1375 for (i = 1; i <= len; ++i)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001376 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00001377 c = getc(fd); /* <byte> */
1378 if (c < 0)
1379 return -1;
1380 if (c <= BY_SPECIAL)
1381 {
1382 if (c == BY_NOFLAGS)
1383 {
1384 /* No flags, all regions. */
1385 idxs[idx] = 0;
1386 c = 0;
1387 }
1388 else if (c == BY_FLAGS)
1389 {
1390 /* Read flags and option region. */
1391 c = getc(fd); /* <flags> */
1392 if (c & WF_REGION)
1393 c = (getc(fd) << 8) + c; /* <region> */
1394 idxs[idx] = c;
1395 c = 0;
1396 }
1397 else /* c == BY_INDEX */
1398 {
1399 /* <nodeidx> */
1400 n = (getc(fd) << 16) + (getc(fd) << 8) + getc(fd);
1401 if (n < 0 || n >= maxidx)
1402 return -2;
1403 idxs[idx] = n + SHARED_MASK;
1404 c = getc(fd); /* <xbyte> */
1405 }
1406 }
1407 byts[idx++] = c;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001408 }
1409
Bram Moolenaar51485f02005-06-04 21:55:20 +00001410 /* Recursively read the children for non-shared siblings.
1411 * Skip the end-of-word ones (zero byte value) and the shared ones (and
1412 * remove SHARED_MASK) */
1413 for (i = 1; i <= len; ++i)
1414 if (byts[startidx + i] != 0)
1415 {
1416 if (idxs[startidx + i] & SHARED_MASK)
1417 idxs[startidx + i] &= ~SHARED_MASK;
1418 else
1419 {
1420 idxs[startidx + i] = idx;
1421 idx = read_tree(fd, byts, idxs, maxidx, idx);
1422 if (idx < 0)
1423 break;
1424 }
1425 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001426
Bram Moolenaar51485f02005-06-04 21:55:20 +00001427 return idx;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001428}
1429
1430/*
1431 * Parse 'spelllang' and set buf->b_langp accordingly.
1432 * Returns an error message or NULL.
1433 */
1434 char_u *
1435did_set_spelllang(buf)
1436 buf_T *buf;
1437{
1438 garray_T ga;
1439 char_u *lang;
1440 char_u *e;
1441 char_u *region;
1442 int region_mask;
1443 slang_T *lp;
1444 int c;
1445 char_u lbuf[MAXWLEN + 1];
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001446 char_u spf_name[MAXPATHL];
1447 int did_spf = FALSE;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001448
1449 ga_init2(&ga, sizeof(langp_T), 2);
1450
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001451 /* Get the name of the .spl file associated with 'spellfile'. */
1452 if (*buf->b_p_spf == NUL)
1453 did_spf = TRUE;
1454 else
1455 vim_snprintf((char *)spf_name, sizeof(spf_name), "%s.spl",
1456 buf->b_p_spf);
1457
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001458 /* loop over comma separated languages. */
1459 for (lang = buf->b_p_spl; *lang != NUL; lang = e)
1460 {
1461 e = vim_strchr(lang, ',');
1462 if (e == NULL)
1463 e = lang + STRLEN(lang);
Bram Moolenaar5482f332005-04-17 20:18:43 +00001464 region = NULL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001465 if (e > lang + 2)
1466 {
1467 if (e - lang >= MAXWLEN)
1468 {
1469 ga_clear(&ga);
1470 return e_invarg;
1471 }
1472 if (lang[2] == '_')
1473 region = lang + 3;
1474 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001475
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001476 /* Check if we loaded this language before. */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001477 for (lp = first_lang; lp != NULL; lp = lp->sl_next)
1478 if (STRNICMP(lp->sl_name, lang, 2) == 0)
1479 break;
1480
1481 if (lp == NULL)
1482 {
1483 /* Not found, load the language. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001484 vim_strncpy(lbuf, lang, e - lang);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001485 if (region != NULL)
1486 mch_memmove(lbuf + 2, lbuf + 5, e - lang - 4);
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001487 spell_load_lang(lbuf);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001488 }
1489
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001490 /*
1491 * Loop over the languages, there can be several files for each.
1492 */
1493 for (lp = first_lang; lp != NULL; lp = lp->sl_next)
1494 if (STRNICMP(lp->sl_name, lang, 2) == 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001495 {
Bram Moolenaar3982c542005-06-08 21:56:31 +00001496 region_mask = REGION_ALL;
1497 if (region != NULL)
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001498 {
1499 /* find region in sl_regions */
1500 c = find_region(lp->sl_regions, region);
1501 if (c == REGION_ALL)
1502 {
Bram Moolenaar3982c542005-06-08 21:56:31 +00001503 if (!lp->sl_add)
1504 {
1505 c = *e;
1506 *e = NUL;
1507 smsg((char_u *)_("Warning: region %s not supported"),
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001508 lang);
Bram Moolenaar3982c542005-06-08 21:56:31 +00001509 *e = c;
1510 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001511 }
1512 else
1513 region_mask = 1 << c;
1514 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001515
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001516 if (ga_grow(&ga, 1) == FAIL)
1517 {
1518 ga_clear(&ga);
1519 return e_outofmem;
1520 }
1521 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = lp;
1522 LANGP_ENTRY(ga, ga.ga_len)->lp_region = region_mask;
1523 ++ga.ga_len;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001524
1525 /* Check if this is the 'spellfile' spell file. */
1526 if (fullpathcmp(spf_name, lp->sl_fname, FALSE) == FPC_SAME)
1527 did_spf = TRUE;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001528 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001529
1530 if (*e == ',')
1531 ++e;
1532 }
1533
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001534 /*
1535 * Make sure the 'spellfile' file is loaded. It may be in 'runtimepath',
1536 * then it's probably loaded above already. Otherwise load it here.
1537 */
1538 if (!did_spf)
1539 {
1540 for (lp = first_lang; lp != NULL; lp = lp->sl_next)
1541 if (fullpathcmp(spf_name, lp->sl_fname, FALSE) == FPC_SAME)
1542 break;
1543 if (lp == NULL)
1544 {
1545 vim_strncpy(lbuf, gettail(spf_name), 2);
1546 lp = spell_load_file(spf_name, lbuf, NULL, TRUE);
1547 }
1548 if (lp != NULL && ga_grow(&ga, 1) == OK)
1549 {
1550 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = lp;
1551 LANGP_ENTRY(ga, ga.ga_len)->lp_region = REGION_ALL;
1552 ++ga.ga_len;
1553 }
1554 }
1555
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001556 /* Add a NULL entry to mark the end of the list. */
1557 if (ga_grow(&ga, 1) == FAIL)
1558 {
1559 ga_clear(&ga);
1560 return e_outofmem;
1561 }
1562 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = NULL;
1563 ++ga.ga_len;
1564
1565 /* Everything is fine, store the new b_langp value. */
1566 ga_clear(&buf->b_langp);
1567 buf->b_langp = ga;
1568
1569 return NULL;
1570}
1571
1572/*
1573 * Find the region "region[2]" in "rp" (points to "sl_regions").
1574 * Each region is simply stored as the two characters of it's name.
1575 * Returns the index if found, REGION_ALL if not found.
1576 */
1577 static int
1578find_region(rp, region)
1579 char_u *rp;
1580 char_u *region;
1581{
1582 int i;
1583
1584 for (i = 0; ; i += 2)
1585 {
1586 if (rp[i] == NUL)
1587 return REGION_ALL;
1588 if (rp[i] == region[0] && rp[i + 1] == region[1])
1589 break;
1590 }
1591 return i / 2;
1592}
1593
1594/*
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001595 * Return case type of word:
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001596 * w word 0
Bram Moolenaar51485f02005-06-04 21:55:20 +00001597 * Word WF_ONECAP
1598 * W WORD WF_ALLCAP
1599 * WoRd wOrd WF_KEEPCAP
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001600 */
1601 static int
1602captype(word, end)
1603 char_u *word;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001604 char_u *end; /* When NULL use up to NUL byte. */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001605{
1606 char_u *p;
1607 int c;
1608 int firstcap;
1609 int allcap;
1610 int past_second = FALSE; /* past second word char */
1611
1612 /* find first letter */
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001613 for (p = word; !SPELL_ISWORDP(p); mb_ptr_adv(p))
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001614 if (end == NULL ? *p == NUL : p >= end)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001615 return 0; /* only non-word characters, illegal word */
1616#ifdef FEAT_MBYTE
Bram Moolenaarb765d632005-06-07 21:00:02 +00001617 if (has_mbyte)
1618 c = mb_ptr2char_adv(&p);
1619 else
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001620#endif
Bram Moolenaarb765d632005-06-07 21:00:02 +00001621 c = *p++;
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001622 firstcap = allcap = spell_isupper(c);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001623
1624 /*
1625 * Need to check all letters to find a word with mixed upper/lower.
1626 * But a word with an upper char only at start is a ONECAP.
1627 */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001628 for ( ; end == NULL ? *p != NUL : p < end; mb_ptr_adv(p))
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001629 if (SPELL_ISWORDP(p))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001630 {
1631#ifdef FEAT_MBYTE
1632 c = mb_ptr2char(p);
1633#else
1634 c = *p;
1635#endif
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001636 if (!spell_isupper(c))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001637 {
1638 /* UUl -> KEEPCAP */
1639 if (past_second && allcap)
Bram Moolenaar51485f02005-06-04 21:55:20 +00001640 return WF_KEEPCAP;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001641 allcap = FALSE;
1642 }
1643 else if (!allcap)
1644 /* UlU -> KEEPCAP */
Bram Moolenaar51485f02005-06-04 21:55:20 +00001645 return WF_KEEPCAP;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001646 past_second = TRUE;
1647 }
1648
1649 if (allcap)
Bram Moolenaar51485f02005-06-04 21:55:20 +00001650 return WF_ALLCAP;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001651 if (firstcap)
Bram Moolenaar51485f02005-06-04 21:55:20 +00001652 return WF_ONECAP;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001653 return 0;
1654}
1655
# if defined(FEAT_MBYTE) || defined(PROTO)
/*
 * Throw away all loaded spell languages and load them again for the buffers
 * that need them.
 * Used after 'encoding' is set and when ":mkspell" was used.
 */
    void
spell_reload()
{
    buf_T       *buf;
    slang_T     *slang;
    win_T       *wp;

    /* Initialize the table for SPELL_ISWORDP(). */
    init_spell_chartab();

    /* Unlink and free every language in the list. */
    for (slang = first_lang; slang != NULL; slang = first_lang)
    {
        first_lang = slang->sl_next;
        slang_free(slang);
    }

    /* Visit each buffer and redo 'spelllang' where needed. */
    for (buf = firstbuf; buf != NULL; buf = buf->b_next)
    {
        ga_clear(&buf->b_langp);

        /* Nothing to load when 'spelllang' is empty. */
        if (*buf->b_p_spl == NUL)
            continue;

        /* Only load the wordlists when there is a window for this buffer in
         * which 'spell' is set. */
        FOR_ALL_WINDOWS(wp)
            if (wp->w_buffer == buf && wp->w_p_spell)
            {
                (void)did_set_spelllang(buf);
# ifdef FEAT_WINDOWS
                break;
# endif
            }
    }
}
# endif
1700
Bram Moolenaarb765d632005-06-07 21:00:02 +00001701/*
1702 * Reload the spell file "fname" if it's loaded.
1703 */
1704 static void
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001705spell_reload_one(fname, added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00001706 char_u *fname;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001707 int added_word; /* invoked through "zg" */
Bram Moolenaarb765d632005-06-07 21:00:02 +00001708{
1709 slang_T *lp;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001710 int didit = FALSE;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001711
Bram Moolenaarb765d632005-06-07 21:00:02 +00001712 for (lp = first_lang; lp != NULL; lp = lp->sl_next)
1713 if (fullpathcmp(fname, lp->sl_fname, FALSE) == FPC_SAME)
1714 {
1715 slang_clear(lp);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001716 (void)spell_load_file(fname, NULL, lp, FALSE);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001717 redraw_all_later(NOT_VALID);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001718 didit = TRUE;
Bram Moolenaarb765d632005-06-07 21:00:02 +00001719 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001720
1721 /* When "zg" was used and the file wasn't loaded yet, should redo
1722 * 'spelllang' to get it loaded. */
1723 if (added_word && !didit)
1724 did_set_spelllang(curbuf);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001725}
1726
1727
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001728/*
1729 * Functions for ":mkspell".
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001730 */
1731
/* Maximum length in bytes of a line in a .aff and .dic file. */
#define MAXLINELEN  500
1734/*
1735 * Main structure to store the contents of a ".aff" file.
1736 */
1737typedef struct afffile_S
1738{
1739 char_u *af_enc; /* "SET", normalized, alloc'ed string or NULL */
Bram Moolenaarb765d632005-06-07 21:00:02 +00001740 int af_rar; /* RAR ID for rare word */
1741 int af_kep; /* KEP ID for keep-case word */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001742 hashtab_T af_pref; /* hashtable for prefixes, affheader_T */
1743 hashtab_T af_suff; /* hashtable for suffixes, affheader_T */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001744} afffile_T;
1745
1746typedef struct affentry_S affentry_T;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001747/* Affix entry from ".aff" file. Used for prefixes and suffixes. */
1748struct affentry_S
1749{
1750 affentry_T *ae_next; /* next affix with same name/number */
1751 char_u *ae_chop; /* text to chop off basic word (can be NULL) */
1752 char_u *ae_add; /* text to add to basic word (can be NULL) */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001753 char_u *ae_cond; /* condition (NULL for ".") */
1754 regprog_T *ae_prog; /* regexp program for ae_cond or NULL */
Bram Moolenaar51485f02005-06-04 21:55:20 +00001755};
1756
1757/* Affix header from ".aff" file. Used for af_pref and af_suff. */
1758typedef struct affheader_S
1759{
1760 char_u ah_key[2]; /* key for hashtable == name of affix entry */
1761 int ah_combine; /* suffix may combine with prefix */
1762 affentry_T *ah_first; /* first affix entry */
1763} affheader_T;
1764
1765#define HI2AH(hi) ((affheader_T *)(hi)->hi_key)
1766
1767/*
1768 * Structure that is used to store the items in the word tree. This avoids
1769 * the need to keep track of each allocated thing, it's freed all at once
1770 * after ":mkspell" is done.
1771 */
1772#define SBLOCKSIZE 16000 /* size of sb_data */
1773typedef struct sblock_S sblock_T;
1774struct sblock_S
1775{
1776 sblock_T *sb_next; /* next block in list */
1777 int sb_used; /* nr of bytes already in use */
1778 char_u sb_data[1]; /* data, actually longer */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001779};
1780
1781/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00001782 * A node in the tree.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001783 */
Bram Moolenaar51485f02005-06-04 21:55:20 +00001784typedef struct wordnode_S wordnode_T;
1785struct wordnode_S
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001786{
Bram Moolenaar51485f02005-06-04 21:55:20 +00001787 char_u wn_hashkey[6]; /* room for the hash key */
1788 wordnode_T *wn_next; /* next node with same hash key */
1789 wordnode_T *wn_child; /* child (next byte in word) */
1790 wordnode_T *wn_sibling; /* next sibling (alternate byte in word,
1791 always sorted) */
1792 wordnode_T *wn_wnode; /* parent node that will write this node */
1793 int wn_index; /* index in written nodes (valid after first
1794 round) */
1795 char_u wn_byte; /* Byte for this node. NUL for word end */
1796 char_u wn_flags; /* when wn_byte is NUL: WF_ flags */
1797 char_u wn_region; /* when wn_byte is NUL: region mask */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001798};
1799
Bram Moolenaar51485f02005-06-04 21:55:20 +00001800#define HI2WN(hi) (wordnode_T *)((hi)->hi_key)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001801
Bram Moolenaar51485f02005-06-04 21:55:20 +00001802/*
1803 * Info used while reading the spell files.
1804 */
1805typedef struct spellinfo_S
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001806{
Bram Moolenaar51485f02005-06-04 21:55:20 +00001807 wordnode_T *si_foldroot; /* tree with case-folded words */
1808 wordnode_T *si_keeproot; /* tree with keep-case words */
1809 sblock_T *si_blocks; /* memory blocks used */
1810 int si_ascii; /* handling only ASCII words */
Bram Moolenaarb765d632005-06-07 21:00:02 +00001811 int si_add; /* addition file */
Bram Moolenaar51485f02005-06-04 21:55:20 +00001812 int si_region; /* region mask */
1813 vimconv_T si_conv; /* for conversion to 'encoding' */
Bram Moolenaar50cde822005-06-05 21:54:54 +00001814 int si_memtot; /* runtime memory used */
Bram Moolenaarb765d632005-06-07 21:00:02 +00001815 int si_verbose; /* verbose messages */
Bram Moolenaar3982c542005-06-08 21:56:31 +00001816 int si_region_count; /* number of regions supported (1 when there
1817 are no regions) */
1818 char_u si_region_name[16]; /* region names (if count > 1) */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001819
1820 garray_T si_rep; /* list of fromto_T entries from REP lines */
1821 garray_T si_sal; /* list of fromto_T entries from SAL lines */
1822 int si_followup; /* soundsalike: ? */
1823 int si_collapse; /* soundsalike: ? */
1824 int si_rem_accents; /* soundsalike: remove accents */
1825 garray_T si_map; /* MAP info concatenated */
Bram Moolenaar51485f02005-06-04 21:55:20 +00001826} spellinfo_T;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001827
Bram Moolenaar51485f02005-06-04 21:55:20 +00001828static afffile_T *spell_read_aff __ARGS((char_u *fname, spellinfo_T *spin));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001829static void add_fromto __ARGS((spellinfo_T *spin, garray_T *gap, char_u *from, char_u *to));
1830static int sal_to_bool __ARGS((char_u *s));
Bram Moolenaar5482f332005-04-17 20:18:43 +00001831static int has_non_ascii __ARGS((char_u *s));
Bram Moolenaar51485f02005-06-04 21:55:20 +00001832static void spell_free_aff __ARGS((afffile_T *aff));
1833static int spell_read_dic __ARGS((char_u *fname, spellinfo_T *spin, afffile_T *affile));
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001834static int store_aff_word __ARGS((char_u *word, spellinfo_T *spin, char_u *afflist, hashtab_T *ht, hashtab_T *xht, int comb, int flags));
Bram Moolenaar51485f02005-06-04 21:55:20 +00001835static int spell_read_wordfile __ARGS((char_u *fname, spellinfo_T *spin));
1836static void *getroom __ARGS((sblock_T **blp, size_t len));
1837static char_u *getroom_save __ARGS((sblock_T **blp, char_u *s));
1838static void free_blocks __ARGS((sblock_T *bl));
1839static wordnode_T *wordtree_alloc __ARGS((sblock_T **blp));
Bram Moolenaar3982c542005-06-08 21:56:31 +00001840static int store_word __ARGS((char_u *word, spellinfo_T *spin, int flags, int region));
Bram Moolenaar51485f02005-06-04 21:55:20 +00001841static int tree_add_word __ARGS((char_u *word, wordnode_T *tree, int flags, int region, sblock_T **blp));
Bram Moolenaarb765d632005-06-07 21:00:02 +00001842static void wordtree_compress __ARGS((wordnode_T *root, spellinfo_T *spin));
Bram Moolenaar51485f02005-06-04 21:55:20 +00001843static int node_compress __ARGS((wordnode_T *node, hashtab_T *ht, int *tot));
1844static int node_equal __ARGS((wordnode_T *n1, wordnode_T *n2));
Bram Moolenaar3982c542005-06-08 21:56:31 +00001845static void write_vim_spell __ARGS((char_u *fname, spellinfo_T *spin));
Bram Moolenaar51485f02005-06-04 21:55:20 +00001846static int put_tree __ARGS((FILE *fd, wordnode_T *node, int index, int regionmask));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001847static void mkspell __ARGS((int fcount, char_u **fnames, int ascii, int overwrite, int added_word));
Bram Moolenaarb765d632005-06-07 21:00:02 +00001848static void init_spellfile __ARGS((void));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001849
1850/*
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001851 * Read the affix file "fname".
Bram Moolenaar3982c542005-06-08 21:56:31 +00001852 * Returns an afffile_T, NULL for complete failure.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001853 */
1854 static afffile_T *
Bram Moolenaar51485f02005-06-04 21:55:20 +00001855spell_read_aff(fname, spin)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001856 char_u *fname;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001857 spellinfo_T *spin;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001858{
1859 FILE *fd;
1860 afffile_T *aff;
1861 char_u rline[MAXLINELEN];
1862 char_u *line;
1863 char_u *pc = NULL;
1864 char_u *(items[6]);
1865 int itemcnt;
1866 char_u *p;
1867 int lnum = 0;
1868 affheader_T *cur_aff = NULL;
1869 int aff_todo = 0;
1870 hashtab_T *tp;
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001871 char_u *low = NULL;
1872 char_u *fol = NULL;
1873 char_u *upp = NULL;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001874 static char *e_affname = N_("Affix name too long in %s line %d: %s");
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001875 int do_rep;
1876 int do_sal;
1877 int do_map;
1878 int found_map = FALSE;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001879
Bram Moolenaar51485f02005-06-04 21:55:20 +00001880 /*
1881 * Open the file.
1882 */
Bram Moolenaarb765d632005-06-07 21:00:02 +00001883 fd = mch_fopen((char *)fname, "r");
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001884 if (fd == NULL)
1885 {
1886 EMSG2(_(e_notopen), fname);
1887 return NULL;
1888 }
1889
Bram Moolenaarb765d632005-06-07 21:00:02 +00001890 if (spin->si_verbose || p_verbose > 2)
1891 {
1892 if (!spin->si_verbose)
1893 verbose_enter();
1894 smsg((char_u *)_("Reading affix file %s..."), fname);
1895 out_flush();
1896 if (!spin->si_verbose)
1897 verbose_leave();
1898 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001899
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001900 /* Only do REP lines when not done in another .aff file already. */
1901 do_rep = spin->si_rep.ga_len == 0;
1902
1903 /* Only do SAL lines when not done in another .aff file already. */
1904 do_sal = spin->si_sal.ga_len == 0;
1905
1906 /* Only do MAP lines when not done in another .aff file already. */
1907 do_map = spin->si_map.ga_len == 0;
1908
Bram Moolenaar51485f02005-06-04 21:55:20 +00001909 /*
1910 * Allocate and init the afffile_T structure.
1911 */
1912 aff = (afffile_T *)getroom(&spin->si_blocks, sizeof(afffile_T));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001913 if (aff == NULL)
1914 return NULL;
1915 hash_init(&aff->af_pref);
1916 hash_init(&aff->af_suff);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001917
1918 /*
1919 * Read all the lines in the file one by one.
1920 */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001921 while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001922 {
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001923 line_breakcheck();
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001924 ++lnum;
1925
1926 /* Skip comment lines. */
1927 if (*rline == '#')
1928 continue;
1929
1930 /* Convert from "SET" to 'encoding' when needed. */
1931 vim_free(pc);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001932#ifdef FEAT_MBYTE
Bram Moolenaar51485f02005-06-04 21:55:20 +00001933 if (spin->si_conv.vc_type != CONV_NONE)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001934 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00001935 pc = string_convert(&spin->si_conv, rline, NULL);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00001936 if (pc == NULL)
1937 {
1938 smsg((char_u *)_("Conversion failure for word in %s line %d: %s"),
1939 fname, lnum, rline);
1940 continue;
1941 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001942 line = pc;
1943 }
1944 else
Bram Moolenaarb765d632005-06-07 21:00:02 +00001945#endif
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001946 {
1947 pc = NULL;
1948 line = rline;
1949 }
1950
1951 /* Split the line up in white separated items. Put a NUL after each
1952 * item. */
1953 itemcnt = 0;
1954 for (p = line; ; )
1955 {
1956 while (*p != NUL && *p <= ' ') /* skip white space and CR/NL */
1957 ++p;
1958 if (*p == NUL)
1959 break;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001960 if (itemcnt == 6) /* too many items */
1961 break;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001962 items[itemcnt++] = p;
Bram Moolenaar51485f02005-06-04 21:55:20 +00001963 while (*p > ' ') /* skip until white space or CR/NL */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001964 ++p;
1965 if (*p == NUL)
1966 break;
1967 *p++ = NUL;
1968 }
1969
1970 /* Handle non-empty lines. */
1971 if (itemcnt > 0)
1972 {
1973 if (STRCMP(items[0], "SET") == 0 && itemcnt == 2
1974 && aff->af_enc == NULL)
1975 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00001976#ifdef FEAT_MBYTE
Bram Moolenaar51485f02005-06-04 21:55:20 +00001977 /* Setup for conversion from "ENC" to 'encoding'. */
1978 aff->af_enc = enc_canonize(items[1]);
1979 if (aff->af_enc != NULL && !spin->si_ascii
1980 && convert_setup(&spin->si_conv, aff->af_enc,
1981 p_enc) == FAIL)
1982 smsg((char_u *)_("Conversion in %s not supported: from %s to %s"),
1983 fname, aff->af_enc, p_enc);
Bram Moolenaarb765d632005-06-07 21:00:02 +00001984#else
1985 smsg((char_u *)_("Conversion in %s not supported"), fname);
1986#endif
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00001987 }
Bram Moolenaar50cde822005-06-05 21:54:54 +00001988 else if (STRCMP(items[0], "NOSPLITSUGS") == 0 && itemcnt == 1)
1989 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001990 /* ignored, we always split */
Bram Moolenaar50cde822005-06-05 21:54:54 +00001991 }
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001992 else if (STRCMP(items[0], "TRY") == 0 && itemcnt == 2)
Bram Moolenaar51485f02005-06-04 21:55:20 +00001993 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00001994 /* ignored, we look in the tree for what chars may appear */
Bram Moolenaar51485f02005-06-04 21:55:20 +00001995 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00001996 else if (STRCMP(items[0], "RAR") == 0 && itemcnt == 2
1997 && aff->af_rar == 0)
1998 {
1999 aff->af_rar = items[1][0];
2000 if (items[1][1] != NUL)
2001 smsg((char_u *)_(e_affname), fname, lnum, items[1]);
2002 }
Bram Moolenaarb765d632005-06-07 21:00:02 +00002003 else if (STRCMP(items[0], "KEP") == 0 && itemcnt == 2
2004 && aff->af_kep == 0)
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002005 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00002006 aff->af_kep = items[1][0];
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002007 if (items[1][1] != NUL)
2008 smsg((char_u *)_(e_affname), fname, lnum, items[1]);
2009 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002010 else if ((STRCMP(items[0], "PFX") == 0
2011 || STRCMP(items[0], "SFX") == 0)
2012 && aff_todo == 0
2013 && itemcnt == 4)
2014 {
2015 /* New affix letter. */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002016 cur_aff = (affheader_T *)getroom(&spin->si_blocks,
2017 sizeof(affheader_T));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002018 if (cur_aff == NULL)
2019 break;
2020 cur_aff->ah_key[0] = *items[1];
2021 cur_aff->ah_key[1] = NUL;
2022 if (items[1][1] != NUL)
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002023 smsg((char_u *)_(e_affname), fname, lnum, items[1]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002024 if (*items[2] == 'Y')
2025 cur_aff->ah_combine = TRUE;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002026 else if (*items[2] != 'N')
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002027 smsg((char_u *)_("Expected Y or N in %s line %d: %s"),
2028 fname, lnum, items[2]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002029 if (*items[0] == 'P')
2030 tp = &aff->af_pref;
2031 else
2032 tp = &aff->af_suff;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002033 aff_todo = atoi((char *)items[3]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002034 if (!HASHITEM_EMPTY(hash_find(tp, cur_aff->ah_key)))
Bram Moolenaar51485f02005-06-04 21:55:20 +00002035 {
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002036 smsg((char_u *)_("Duplicate affix in %s line %d: %s"),
2037 fname, lnum, items[1]);
Bram Moolenaar51485f02005-06-04 21:55:20 +00002038 aff_todo = 0;
2039 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002040 else
2041 hash_add(tp, cur_aff->ah_key);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002042 }
2043 else if ((STRCMP(items[0], "PFX") == 0
2044 || STRCMP(items[0], "SFX") == 0)
2045 && aff_todo > 0
2046 && STRCMP(cur_aff->ah_key, items[1]) == 0
2047 && itemcnt == 5)
2048 {
2049 affentry_T *aff_entry;
2050
2051 /* New item for an affix letter. */
2052 --aff_todo;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002053 aff_entry = (affentry_T *)getroom(&spin->si_blocks,
2054 sizeof(affentry_T));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002055 if (aff_entry == NULL)
2056 break;
Bram Moolenaar5482f332005-04-17 20:18:43 +00002057
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002058 if (STRCMP(items[2], "0") != 0)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002059 aff_entry->ae_chop = getroom_save(&spin->si_blocks,
2060 items[2]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002061 if (STRCMP(items[3], "0") != 0)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002062 aff_entry->ae_add = getroom_save(&spin->si_blocks,
2063 items[3]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002064
Bram Moolenaar51485f02005-06-04 21:55:20 +00002065 /* Don't use an affix entry with non-ASCII characters when
2066 * "spin->si_ascii" is TRUE. */
2067 if (!spin->si_ascii || !(has_non_ascii(aff_entry->ae_chop)
Bram Moolenaar5482f332005-04-17 20:18:43 +00002068 || has_non_ascii(aff_entry->ae_add)))
2069 {
Bram Moolenaar5482f332005-04-17 20:18:43 +00002070 aff_entry->ae_next = cur_aff->ah_first;
2071 cur_aff->ah_first = aff_entry;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002072
2073 if (STRCMP(items[4], ".") != 0)
2074 {
2075 char_u buf[MAXLINELEN];
2076
2077 aff_entry->ae_cond = getroom_save(&spin->si_blocks,
2078 items[4]);
2079 if (*items[0] == 'P')
2080 sprintf((char *)buf, "^%s", items[4]);
2081 else
2082 sprintf((char *)buf, "%s$", items[4]);
2083 aff_entry->ae_prog = vim_regcomp(buf,
2084 RE_MAGIC + RE_STRING);
2085 }
Bram Moolenaar5482f332005-04-17 20:18:43 +00002086 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002087 }
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002088 else if (STRCMP(items[0], "FOL") == 0 && itemcnt == 2)
2089 {
2090 if (fol != NULL)
2091 smsg((char_u *)_("Duplicate FOL in %s line %d"),
2092 fname, lnum);
2093 else
2094 fol = vim_strsave(items[1]);
2095 }
2096 else if (STRCMP(items[0], "LOW") == 0 && itemcnt == 2)
2097 {
2098 if (low != NULL)
2099 smsg((char_u *)_("Duplicate LOW in %s line %d"),
2100 fname, lnum);
2101 else
2102 low = vim_strsave(items[1]);
2103 }
2104 else if (STRCMP(items[0], "UPP") == 0 && itemcnt == 2)
2105 {
2106 if (upp != NULL)
2107 smsg((char_u *)_("Duplicate UPP in %s line %d"),
2108 fname, lnum);
2109 else
2110 upp = vim_strsave(items[1]);
2111 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002112 else if (STRCMP(items[0], "REP") == 0 && itemcnt == 2)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002113 {
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002114 /* Ignore REP count */;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002115 if (!isdigit(*items[1]))
2116 smsg((char_u *)_("Expected REP count in %s line %d"),
2117 fname, lnum);
2118 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002119 else if (STRCMP(items[0], "REP") == 0 && itemcnt == 3)
2120 {
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002121 /* REP item */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002122 if (do_rep)
2123 add_fromto(spin, &spin->si_rep, items[1], items[2]);
2124 }
2125 else if (STRCMP(items[0], "MAP") == 0 && itemcnt == 2)
2126 {
2127 /* MAP item or count */
2128 if (!found_map)
2129 {
2130 /* First line contains the count. */
2131 found_map = TRUE;
2132 if (!isdigit(*items[1]))
2133 smsg((char_u *)_("Expected MAP count in %s line %d"),
2134 fname, lnum);
2135 }
2136 else if (do_map)
2137 {
2138 /* We simply concatenate all the MAP strings, separated by
2139 * slashes. */
2140 ga_concat(&spin->si_map, items[1]);
2141 ga_append(&spin->si_map, '/');
2142 }
2143 }
2144 else if (STRCMP(items[0], "SAL") == 0 && itemcnt == 3)
2145 {
2146 if (do_sal)
2147 {
2148 /* SAL item (sounds-a-like)
2149 * Either one of the known keys or a from-to pair. */
2150 if (STRCMP(items[1], "followup") == 0)
2151 spin->si_followup = sal_to_bool(items[2]);
2152 else if (STRCMP(items[1], "collapse_result") == 0)
2153 spin->si_collapse = sal_to_bool(items[2]);
2154 else if (STRCMP(items[1], "remove_accents") == 0)
2155 spin->si_rem_accents = sal_to_bool(items[2]);
2156 else
2157 /* when "to" is "_" it means empty */
2158 add_fromto(spin, &spin->si_sal, items[1],
2159 STRCMP(items[2], "_") == 0 ? (char_u *)""
2160 : items[2]);
2161 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002162 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00002163 else
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002164 smsg((char_u *)_("Unrecognized item in %s line %d: %s"),
2165 fname, lnum, items[0]);
2166 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002167 }
2168
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002169 if (fol != NULL || low != NULL || upp != NULL)
2170 {
Bram Moolenaar3982c542005-06-08 21:56:31 +00002171 /*
2172 * Don't write a word table for an ASCII file, so that we don't check
2173 * for conflicts with a word table that matches 'encoding'.
2174 * Don't write one for utf-8 either, we use utf_isupper() and
2175 * mb_get_class(), the list of chars in the file will be incomplete.
2176 */
2177 if (!spin->si_ascii
2178#ifdef FEAT_MBYTE
2179 && !enc_utf8
2180#endif
2181 )
Bram Moolenaar6f3058f2005-04-24 21:58:05 +00002182 {
2183 if (fol == NULL || low == NULL || upp == NULL)
2184 smsg((char_u *)_("Missing FOL/LOW/UPP line in %s"), fname);
2185 else
Bram Moolenaar3982c542005-06-08 21:56:31 +00002186 (void)set_spell_chartab(fol, low, upp);
Bram Moolenaar6f3058f2005-04-24 21:58:05 +00002187 }
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002188
2189 vim_free(fol);
2190 vim_free(low);
2191 vim_free(upp);
2192 }
2193
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002194 vim_free(pc);
2195 fclose(fd);
2196 return aff;
2197}
2198
2199/*
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002200 * Add a from-to item to "gap". Used for REP and SAL items.
2201 * They are stored case-folded.
2202 */
2203 static void
2204add_fromto(spin, gap, from, to)
2205 spellinfo_T *spin;
2206 garray_T *gap;
2207 char_u *from;
2208 char_u *to;
2209{
2210 fromto_T *ftp;
2211 char_u word[MAXWLEN];
2212
2213 if (ga_grow(gap, 1) == OK)
2214 {
2215 ftp = ((fromto_T *)gap->ga_data) + gap->ga_len;
2216 (void)spell_casefold(from, STRLEN(from), word, MAXWLEN);
2217 ftp->ft_from = getroom_save(&spin->si_blocks, word);
2218 (void)spell_casefold(to, STRLEN(to), word, MAXWLEN);
2219 ftp->ft_to = getroom_save(&spin->si_blocks, word);
2220 ++gap->ga_len;
2221 }
2222}
2223
2224/*
2225 * Convert a boolean argument in a SAL line to TRUE or FALSE;
2226 */
2227 static int
2228sal_to_bool(s)
2229 char_u *s;
2230{
2231 return STRCMP(s, "1") == 0 || STRCMP(s, "true") == 0;
2232}
2233
2234/*
Bram Moolenaar5482f332005-04-17 20:18:43 +00002235 * Return TRUE if string "s" contains a non-ASCII character (128 or higher).
2236 * When "s" is NULL FALSE is returned.
2237 */
2238 static int
2239has_non_ascii(s)
2240 char_u *s;
2241{
2242 char_u *p;
2243
2244 if (s != NULL)
2245 for (p = s; *p != NUL; ++p)
2246 if (*p >= 128)
2247 return TRUE;
2248 return FALSE;
2249}
2250
2251/*
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002252 * Free the structure filled by spell_read_aff().
2253 */
2254 static void
2255spell_free_aff(aff)
2256 afffile_T *aff;
2257{
2258 hashtab_T *ht;
2259 hashitem_T *hi;
2260 int todo;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002261 affheader_T *ah;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002262 affentry_T *ae;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002263
2264 vim_free(aff->af_enc);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002265
Bram Moolenaar51485f02005-06-04 21:55:20 +00002266 /* All this trouble to foree the "ae_prog" items... */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002267 for (ht = &aff->af_pref; ; ht = &aff->af_suff)
2268 {
2269 todo = ht->ht_used;
2270 for (hi = ht->ht_array; todo > 0; ++hi)
2271 {
2272 if (!HASHITEM_EMPTY(hi))
2273 {
2274 --todo;
2275 ah = HI2AH(hi);
Bram Moolenaar51485f02005-06-04 21:55:20 +00002276 for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next)
2277 vim_free(ae->ae_prog);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002278 }
2279 }
2280 if (ht == &aff->af_suff)
2281 break;
2282 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00002283
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002284 hash_clear(&aff->af_pref);
2285 hash_clear(&aff->af_suff);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002286}
2287
2288/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00002289 * Read dictionary file "fname".
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002290 * Returns OK or FAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002291 */
2292 static int
Bram Moolenaar51485f02005-06-04 21:55:20 +00002293spell_read_dic(fname, spin, affile)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002294 char_u *fname;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002295 spellinfo_T *spin;
2296 afffile_T *affile;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002297{
Bram Moolenaar51485f02005-06-04 21:55:20 +00002298 hashtab_T ht;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002299 char_u line[MAXLINELEN];
Bram Moolenaar51485f02005-06-04 21:55:20 +00002300 char_u *afflist;
2301 char_u *dw;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002302 char_u *pc;
2303 char_u *w;
2304 int l;
2305 hash_T hash;
2306 hashitem_T *hi;
2307 FILE *fd;
2308 int lnum = 1;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002309 int non_ascii = 0;
2310 int retval = OK;
2311 char_u message[MAXLINELEN + MAXWLEN];
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002312 int flags;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002313
Bram Moolenaar51485f02005-06-04 21:55:20 +00002314 /*
2315 * Open the file.
2316 */
Bram Moolenaarb765d632005-06-07 21:00:02 +00002317 fd = mch_fopen((char *)fname, "r");
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002318 if (fd == NULL)
2319 {
2320 EMSG2(_(e_notopen), fname);
2321 return FAIL;
2322 }
2323
Bram Moolenaar51485f02005-06-04 21:55:20 +00002324 /* The hashtable is only used to detect duplicated words. */
2325 hash_init(&ht);
2326
Bram Moolenaarb765d632005-06-07 21:00:02 +00002327 if (spin->si_verbose || p_verbose > 2)
2328 {
2329 if (!spin->si_verbose)
2330 verbose_enter();
2331 smsg((char_u *)_("Reading dictionary file %s..."), fname);
2332 out_flush();
2333 if (!spin->si_verbose)
2334 verbose_leave();
2335 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002336
2337 /* Read and ignore the first line: word count. */
2338 (void)vim_fgets(line, MAXLINELEN, fd);
2339 if (!isdigit(*skipwhite(line)))
2340 EMSG2(_("E760: No word count in %s"), fname);
2341
2342 /*
2343 * Read all the lines in the file one by one.
2344 * The words are converted to 'encoding' here, before being added to
2345 * the hashtable.
2346 */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002347 while (!vim_fgets(line, MAXLINELEN, fd) && !got_int)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002348 {
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002349 line_breakcheck();
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002350 ++lnum;
2351
Bram Moolenaar51485f02005-06-04 21:55:20 +00002352 /* Remove CR, LF and white space from the end. White space halfway
2353 * the word is kept to allow e.g., "et al.". */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002354 l = STRLEN(line);
2355 while (l > 0 && line[l - 1] <= ' ')
2356 --l;
2357 if (l == 0)
2358 continue; /* empty line */
2359 line[l] = NUL;
2360
Bram Moolenaar51485f02005-06-04 21:55:20 +00002361 /* This takes time, print a message now and then. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00002362 if (spin->si_verbose && (lnum & 0x3ff) == 0)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002363 {
2364 vim_snprintf((char *)message, sizeof(message),
2365 _("line %6d - %s"), lnum, line);
2366 msg_start();
2367 msg_outtrans_attr(message, 0);
2368 msg_clr_eos();
2369 msg_didout = FALSE;
2370 msg_col = 0;
2371 out_flush();
2372 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002373
Bram Moolenaar51485f02005-06-04 21:55:20 +00002374 /* Find the optional affix names. */
2375 afflist = vim_strchr(line, '/');
2376 if (afflist != NULL)
2377 *afflist++ = NUL;
2378
2379 /* Skip non-ASCII words when "spin->si_ascii" is TRUE. */
2380 if (spin->si_ascii && has_non_ascii(line))
2381 {
2382 ++non_ascii;
Bram Moolenaar5482f332005-04-17 20:18:43 +00002383 continue;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002384 }
Bram Moolenaar5482f332005-04-17 20:18:43 +00002385
Bram Moolenaarb765d632005-06-07 21:00:02 +00002386#ifdef FEAT_MBYTE
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002387 /* Convert from "SET" to 'encoding' when needed. */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002388 if (spin->si_conv.vc_type != CONV_NONE)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002389 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00002390 pc = string_convert(&spin->si_conv, line, NULL);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002391 if (pc == NULL)
2392 {
2393 smsg((char_u *)_("Conversion failure for word in %s line %d: %s"),
2394 fname, lnum, line);
2395 continue;
2396 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002397 w = pc;
2398 }
2399 else
Bram Moolenaarb765d632005-06-07 21:00:02 +00002400#endif
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002401 {
2402 pc = NULL;
2403 w = line;
2404 }
2405
Bram Moolenaar51485f02005-06-04 21:55:20 +00002406 /* Store the word in the hashtable to be able to find duplicates. */
2407 dw = (char_u *)getroom_save(&spin->si_blocks, w);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002408 if (dw == NULL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002409 retval = FAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002410 vim_free(pc);
Bram Moolenaar51485f02005-06-04 21:55:20 +00002411 if (retval == FAIL)
2412 break;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002413
Bram Moolenaar51485f02005-06-04 21:55:20 +00002414 hash = hash_hash(dw);
2415 hi = hash_lookup(&ht, dw, hash);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002416 if (!HASHITEM_EMPTY(hi))
2417 smsg((char_u *)_("Duplicate word in %s line %d: %s"),
Bram Moolenaar51485f02005-06-04 21:55:20 +00002418 fname, lnum, line);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002419 else
Bram Moolenaar51485f02005-06-04 21:55:20 +00002420 hash_add_item(&ht, hi, dw, hash);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002421
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002422 flags = 0;
2423 if (afflist != NULL)
2424 {
2425 /* Check for affix name that stands for keep-case word and stands
2426 * for rare word (if defined). */
Bram Moolenaarb765d632005-06-07 21:00:02 +00002427 if (affile->af_kep != NUL
2428 && vim_strchr(afflist, affile->af_kep) != NULL)
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002429 flags |= WF_KEEPCAP;
2430 if (affile->af_rar != NUL
2431 && vim_strchr(afflist, affile->af_rar) != NULL)
2432 flags |= WF_RARE;
2433 }
2434
Bram Moolenaar51485f02005-06-04 21:55:20 +00002435 /* Add the word to the word tree(s). */
Bram Moolenaar3982c542005-06-08 21:56:31 +00002436 if (store_word(dw, spin, flags, spin->si_region) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002437 retval = FAIL;
2438
2439 if (afflist != NULL)
2440 {
2441 /* Find all matching suffixes and add the resulting words.
2442 * Additionally do matching prefixes that combine. */
2443 if (store_aff_word(dw, spin, afflist,
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002444 &affile->af_suff, &affile->af_pref,
2445 FALSE, flags) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002446 retval = FAIL;
2447
2448 /* Find all matching prefixes and add the resulting words. */
2449 if (store_aff_word(dw, spin, afflist,
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002450 &affile->af_pref, NULL, FALSE, flags) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002451 retval = FAIL;
2452 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002453 }
2454
Bram Moolenaar51485f02005-06-04 21:55:20 +00002455 if (spin->si_ascii && non_ascii > 0)
2456 smsg((char_u *)_("Ignored %d words with non-ASCII characters"),
2457 non_ascii);
2458 hash_clear(&ht);
2459
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002460 fclose(fd);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002461 return retval;
2462}
2463
2464/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00002465 * Apply affixes to a word and store the resulting words.
2466 * "ht" is the hashtable with affentry_T that need to be applied, either
2467 * prefixes or suffixes.
2468 * "xht", when not NULL, is the prefix hashtable, to be used additionally on
2469 * the resulting words for combining affixes.
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002470 *
2471 * Returns FAIL when out of memory.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002472 */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002473 static int
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002474store_aff_word(word, spin, afflist, ht, xht, comb, flags)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002475 char_u *word; /* basic word start */
2476 spellinfo_T *spin; /* spell info */
2477 char_u *afflist; /* list of names of supported affixes */
2478 hashtab_T *ht;
2479 hashtab_T *xht;
2480 int comb; /* only use affixes that combine */
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002481 int flags; /* flags for the word */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002482{
2483 int todo;
2484 hashitem_T *hi;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002485 affheader_T *ah;
2486 affentry_T *ae;
2487 regmatch_T regmatch;
2488 char_u newword[MAXWLEN];
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002489 int retval = OK;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002490 int i;
2491 char_u *p;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002492
Bram Moolenaar51485f02005-06-04 21:55:20 +00002493 todo = ht->ht_used;
2494 for (hi = ht->ht_array; todo > 0 && retval == OK; ++hi)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002495 {
2496 if (!HASHITEM_EMPTY(hi))
2497 {
2498 --todo;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002499 ah = HI2AH(hi);
Bram Moolenaar5482f332005-04-17 20:18:43 +00002500
Bram Moolenaar51485f02005-06-04 21:55:20 +00002501 /* Check that the affix combines, if required, and that the word
2502 * supports this affix. */
2503 if ((!comb || ah->ah_combine)
2504 && vim_strchr(afflist, *ah->ah_key) != NULL)
Bram Moolenaar5482f332005-04-17 20:18:43 +00002505 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00002506 /* Loop over all affix entries with this name. */
2507 for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002508 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00002509 /* Check the condition. It's not logical to match case
2510 * here, but it is required for compatibility with
2511 * Myspell. */
2512 regmatch.regprog = ae->ae_prog;
2513 regmatch.rm_ic = FALSE;
2514 if (ae->ae_prog == NULL
2515 || vim_regexec(&regmatch, word, (colnr_T)0))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002516 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00002517 /* Match. Remove the chop and add the affix. */
2518 if (xht == NULL)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002519 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00002520 /* prefix: chop/add at the start of the word */
2521 if (ae->ae_add == NULL)
2522 *newword = NUL;
2523 else
2524 STRCPY(newword, ae->ae_add);
2525 p = word;
2526 if (ae->ae_chop != NULL)
Bram Moolenaarb765d632005-06-07 21:00:02 +00002527 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00002528 /* Skip chop string. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00002529#ifdef FEAT_MBYTE
2530 if (has_mbyte)
2531 i = mb_charlen(ae->ae_chop);
2532 else
2533#endif
2534 i = STRLEN(ae->ae_chop);
2535 for ( ; i > 0; --i)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002536 mb_ptr_adv(p);
Bram Moolenaarb765d632005-06-07 21:00:02 +00002537 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00002538 STRCAT(newword, p);
2539 }
2540 else
2541 {
2542 /* suffix: chop/add at the end of the word */
2543 STRCPY(newword, word);
2544 if (ae->ae_chop != NULL)
2545 {
2546 /* Remove chop string. */
2547 p = newword + STRLEN(newword);
Bram Moolenaarb765d632005-06-07 21:00:02 +00002548#ifdef FEAT_MBYTE
2549 if (has_mbyte)
2550 i = mb_charlen(ae->ae_chop);
2551 else
2552#endif
2553 i = STRLEN(ae->ae_chop);
2554 for ( ; i > 0; --i)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002555 mb_ptr_back(newword, p);
2556 *p = NUL;
2557 }
2558 if (ae->ae_add != NULL)
2559 STRCAT(newword, ae->ae_add);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002560 }
2561
Bram Moolenaar51485f02005-06-04 21:55:20 +00002562 /* Store the modified word. */
Bram Moolenaar3982c542005-06-08 21:56:31 +00002563 if (store_word(newword, spin,
2564 flags, spin->si_region) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002565 retval = FAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002566
Bram Moolenaar51485f02005-06-04 21:55:20 +00002567 /* When added a suffix and combining is allowed also
2568 * try adding prefixes additionally. */
2569 if (xht != NULL && ah->ah_combine)
2570 if (store_aff_word(newword, spin, afflist,
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002571 xht, NULL, TRUE, flags) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002572 retval = FAIL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002573 }
2574 }
2575 }
2576 }
2577 }
2578
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002579 return retval;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002580}
2581
2582/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00002583 * Read a file with a list of words.
2584 */
2585 static int
2586spell_read_wordfile(fname, spin)
2587 char_u *fname;
2588 spellinfo_T *spin;
2589{
2590 FILE *fd;
2591 long lnum = 0;
2592 char_u rline[MAXLINELEN];
2593 char_u *line;
2594 char_u *pc = NULL;
2595 int l;
2596 int retval = OK;
2597 int did_word = FALSE;
2598 int non_ascii = 0;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002599 int flags;
Bram Moolenaar3982c542005-06-08 21:56:31 +00002600 int regionmask;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002601
2602 /*
2603 * Open the file.
2604 */
Bram Moolenaarb765d632005-06-07 21:00:02 +00002605 fd = mch_fopen((char *)fname, "r");
Bram Moolenaar51485f02005-06-04 21:55:20 +00002606 if (fd == NULL)
2607 {
2608 EMSG2(_(e_notopen), fname);
2609 return FAIL;
2610 }
2611
Bram Moolenaarb765d632005-06-07 21:00:02 +00002612 if (spin->si_verbose || p_verbose > 2)
2613 {
2614 if (!spin->si_verbose)
2615 verbose_enter();
2616 smsg((char_u *)_("Reading word file %s..."), fname);
2617 out_flush();
2618 if (!spin->si_verbose)
2619 verbose_leave();
2620 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00002621
2622 /*
2623 * Read all the lines in the file one by one.
2624 */
2625 while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int)
2626 {
2627 line_breakcheck();
2628 ++lnum;
2629
2630 /* Skip comment lines. */
2631 if (*rline == '#')
2632 continue;
2633
2634 /* Remove CR, LF and white space from the end. */
2635 l = STRLEN(rline);
2636 while (l > 0 && rline[l - 1] <= ' ')
2637 --l;
2638 if (l == 0)
2639 continue; /* empty or blank line */
2640 rline[l] = NUL;
2641
2642 /* Convert from "=encoding={encoding}" to 'encoding' when needed. */
2643 vim_free(pc);
Bram Moolenaarb765d632005-06-07 21:00:02 +00002644#ifdef FEAT_MBYTE
Bram Moolenaar51485f02005-06-04 21:55:20 +00002645 if (spin->si_conv.vc_type != CONV_NONE)
2646 {
2647 pc = string_convert(&spin->si_conv, rline, NULL);
2648 if (pc == NULL)
2649 {
2650 smsg((char_u *)_("Conversion failure for word in %s line %d: %s"),
2651 fname, lnum, rline);
2652 continue;
2653 }
2654 line = pc;
2655 }
2656 else
Bram Moolenaarb765d632005-06-07 21:00:02 +00002657#endif
Bram Moolenaar51485f02005-06-04 21:55:20 +00002658 {
2659 pc = NULL;
2660 line = rline;
2661 }
2662
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002663 flags = 0;
Bram Moolenaar3982c542005-06-08 21:56:31 +00002664 regionmask = spin->si_region;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002665
2666 if (*line == '/')
Bram Moolenaar51485f02005-06-04 21:55:20 +00002667 {
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002668 ++line;
Bram Moolenaar3982c542005-06-08 21:56:31 +00002669
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002670 if (STRNCMP(line, "encoding=", 9) == 0)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002671 {
2672 if (spin->si_conv.vc_type != CONV_NONE)
Bram Moolenaar3982c542005-06-08 21:56:31 +00002673 smsg((char_u *)_("Duplicate /encoding= line ignored in %s line %d: %s"),
2674 fname, lnum, line - 1);
Bram Moolenaar51485f02005-06-04 21:55:20 +00002675 else if (did_word)
Bram Moolenaar3982c542005-06-08 21:56:31 +00002676 smsg((char_u *)_("/encoding= line after word ignored in %s line %d: %s"),
2677 fname, lnum, line - 1);
Bram Moolenaar51485f02005-06-04 21:55:20 +00002678 else
2679 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00002680#ifdef FEAT_MBYTE
2681 char_u *enc;
2682
Bram Moolenaar51485f02005-06-04 21:55:20 +00002683 /* Setup for conversion to 'encoding'. */
Bram Moolenaar3982c542005-06-08 21:56:31 +00002684 line += 10;
2685 enc = enc_canonize(line);
Bram Moolenaar51485f02005-06-04 21:55:20 +00002686 if (enc != NULL && !spin->si_ascii
2687 && convert_setup(&spin->si_conv, enc,
2688 p_enc) == FAIL)
2689 smsg((char_u *)_("Conversion in %s not supported: from %s to %s"),
Bram Moolenaar3982c542005-06-08 21:56:31 +00002690 fname, line, p_enc);
Bram Moolenaar51485f02005-06-04 21:55:20 +00002691 vim_free(enc);
Bram Moolenaarb765d632005-06-07 21:00:02 +00002692#else
2693 smsg((char_u *)_("Conversion in %s not supported"), fname);
2694#endif
Bram Moolenaar51485f02005-06-04 21:55:20 +00002695 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002696 continue;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002697 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002698
Bram Moolenaar3982c542005-06-08 21:56:31 +00002699 if (STRNCMP(line, "regions=", 8) == 0)
2700 {
2701 if (spin->si_region_count > 1)
2702 smsg((char_u *)_("Duplicate /regions= line ignored in %s line %d: %s"),
2703 fname, lnum, line);
2704 else
2705 {
2706 line += 8;
2707 if (STRLEN(line) > 16)
2708 smsg((char_u *)_("Too many regions in %s line %d: %s"),
2709 fname, lnum, line);
2710 else
2711 {
2712 spin->si_region_count = STRLEN(line) / 2;
2713 STRCPY(spin->si_region_name, line);
2714 }
2715 }
2716 continue;
2717 }
2718
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002719 if (*line == '=')
2720 {
2721 /* keep-case word */
2722 flags |= WF_KEEPCAP;
2723 ++line;
2724 }
2725
2726 if (*line == '!')
2727 {
2728 /* Bad, bad, wicked word. */
2729 flags |= WF_BANNED;
2730 ++line;
2731 }
2732 else if (*line == '?')
2733 {
2734 /* Rare word. */
2735 flags |= WF_RARE;
2736 ++line;
2737 }
2738
Bram Moolenaar3982c542005-06-08 21:56:31 +00002739 if (VIM_ISDIGIT(*line))
2740 {
2741 /* region number(s) */
2742 regionmask = 0;
2743 while (VIM_ISDIGIT(*line))
2744 {
2745 l = *line - '0';
2746 if (l > spin->si_region_count)
2747 {
2748 smsg((char_u *)_("Invalid region nr in %s line %d: %s"),
2749 fname, lnum, line);
2750 break;
2751 }
2752 regionmask |= 1 << (l - 1);
2753 ++line;
2754 }
2755 flags |= WF_REGION;
2756 }
2757
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002758 if (flags == 0)
2759 {
2760 smsg((char_u *)_("/ line ignored in %s line %d: %s"),
Bram Moolenaar51485f02005-06-04 21:55:20 +00002761 fname, lnum, line);
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002762 continue;
2763 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00002764 }
2765
2766 /* Skip non-ASCII words when "spin->si_ascii" is TRUE. */
2767 if (spin->si_ascii && has_non_ascii(line))
2768 {
2769 ++non_ascii;
2770 continue;
2771 }
2772
2773 /* Normal word: store it. */
Bram Moolenaar3982c542005-06-08 21:56:31 +00002774 if (store_word(line, spin, flags, regionmask) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002775 {
2776 retval = FAIL;
2777 break;
2778 }
2779 did_word = TRUE;
2780 }
2781
2782 vim_free(pc);
2783 fclose(fd);
2784
Bram Moolenaarb765d632005-06-07 21:00:02 +00002785 if (spin->si_ascii && non_ascii > 0 && (spin->si_verbose || p_verbose > 2))
2786 {
2787 if (p_verbose > 2)
2788 verbose_enter();
Bram Moolenaar51485f02005-06-04 21:55:20 +00002789 smsg((char_u *)_("Ignored %d words with non-ASCII characters"),
2790 non_ascii);
Bram Moolenaarb765d632005-06-07 21:00:02 +00002791 if (p_verbose > 2)
2792 verbose_leave();
2793 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00002794 return retval;
2795}
2796
2797/*
2798 * Get part of an sblock_T, "len" bytes long.
2799 * This avoids calling free() for every little struct we use.
2800 * The memory is cleared to all zeros.
2801 * Returns NULL when out of memory.
2802 */
2803 static void *
2804getroom(blp, len)
2805 sblock_T **blp;
2806 size_t len; /* length needed */
2807{
2808 char_u *p;
2809 sblock_T *bl = *blp;
2810
2811 if (bl == NULL || bl->sb_used + len > SBLOCKSIZE)
2812 {
2813 /* Allocate a block of memory. This is not freed until much later. */
2814 bl = (sblock_T *)alloc_clear((unsigned)(sizeof(sblock_T) + SBLOCKSIZE));
2815 if (bl == NULL)
2816 return NULL;
2817 bl->sb_next = *blp;
2818 *blp = bl;
2819 bl->sb_used = 0;
2820 }
2821
2822 p = bl->sb_data + bl->sb_used;
2823 bl->sb_used += len;
2824
2825 return p;
2826}
2827
2828/*
2829 * Make a copy of a string into memory allocated with getroom().
2830 */
2831 static char_u *
2832getroom_save(blp, s)
2833 sblock_T **blp;
2834 char_u *s;
2835{
2836 char_u *sc;
2837
2838 sc = (char_u *)getroom(blp, STRLEN(s) + 1);
2839 if (sc != NULL)
2840 STRCPY(sc, s);
2841 return sc;
2842}
2843
2844
2845/*
2846 * Free the list of allocated sblock_T.
2847 */
2848 static void
2849free_blocks(bl)
2850 sblock_T *bl;
2851{
2852 sblock_T *next;
2853
2854 while (bl != NULL)
2855 {
2856 next = bl->sb_next;
2857 vim_free(bl);
2858 bl = next;
2859 }
2860}
2861
2862/*
2863 * Allocate the root of a word tree.
2864 */
2865 static wordnode_T *
2866wordtree_alloc(blp)
2867 sblock_T **blp;
2868{
2869 return (wordnode_T *)getroom(blp, sizeof(wordnode_T));
2870}
2871
2872/*
2873 * Store a word in the tree(s).
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002874 * Always store it in the case-folded tree. A keep-case word can also be used
2875 * with all caps.
Bram Moolenaar51485f02005-06-04 21:55:20 +00002876 * For a keep-case word also store it in the keep-case tree.
2877 */
2878 static int
Bram Moolenaar3982c542005-06-08 21:56:31 +00002879store_word(word, spin, flags, region)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002880 char_u *word;
2881 spellinfo_T *spin;
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002882 int flags; /* extra flags, WF_BANNED */
Bram Moolenaar3982c542005-06-08 21:56:31 +00002883 int region; /* supported region(s) */
Bram Moolenaar51485f02005-06-04 21:55:20 +00002884{
2885 int len = STRLEN(word);
2886 int ct = captype(word, word + len);
2887 char_u foldword[MAXWLEN];
2888 int res;
2889
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00002890 (void)spell_casefold(word, len, foldword, MAXWLEN);
2891 res = tree_add_word(foldword, spin->si_foldroot, ct | flags,
2892 region, &spin->si_blocks);
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002893
2894 if (res == OK && (ct == WF_KEEPCAP || flags & WF_KEEPCAP))
2895 res = tree_add_word(word, spin->si_keeproot, flags,
Bram Moolenaar3982c542005-06-08 21:56:31 +00002896 region, &spin->si_blocks);
Bram Moolenaar51485f02005-06-04 21:55:20 +00002897 return res;
2898}
2899
2900/*
2901 * Add word "word" to a word tree at "root".
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002902 * Returns FAIL when out of memory.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002903 */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002904 static int
Bram Moolenaar51485f02005-06-04 21:55:20 +00002905tree_add_word(word, root, flags, region, blp)
2906 char_u *word;
2907 wordnode_T *root;
2908 int flags;
2909 int region;
2910 sblock_T **blp;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002911{
Bram Moolenaar51485f02005-06-04 21:55:20 +00002912 wordnode_T *node = root;
2913 wordnode_T *np;
2914 wordnode_T **prev = NULL;
2915 int i;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002916
Bram Moolenaar51485f02005-06-04 21:55:20 +00002917 /* Add each byte of the word to the tree, including the NUL at the end. */
2918 for (i = 0; ; ++i)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002919 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00002920 /* Look for the sibling that has the same character. They are sorted
2921 * on byte value, thus stop searching when a sibling is found with a
2922 * higher byte value. For zero bytes (end of word) check that the
2923 * flags are equal, there is a separate zero byte for each flag value.
2924 */
2925 while (node != NULL && (node->wn_byte < word[i]
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00002926 || (node->wn_byte == 0 && node->wn_flags != (flags & 0xff))))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002927 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00002928 prev = &node->wn_sibling;
2929 node = *prev;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002930 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00002931 if (node == NULL || node->wn_byte != word[i])
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002932 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00002933 /* Allocate a new node. */
2934 np = (wordnode_T *)getroom(blp, sizeof(wordnode_T));
2935 if (np == NULL)
2936 return FAIL;
2937 np->wn_byte = word[i];
2938 *prev = np;
2939 np->wn_sibling = node;
2940 node = np;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002941 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002942
Bram Moolenaar51485f02005-06-04 21:55:20 +00002943 if (word[i] == NUL)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002944 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00002945 node->wn_flags = flags;
2946 node->wn_region |= region;
2947 break;
Bram Moolenaar63d5a1e2005-04-19 21:30:25 +00002948 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00002949 prev = &node->wn_child;
2950 node = *prev;
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002951 }
2952
2953 return OK;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002954}
2955
2956/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00002957 * Compress a tree: find tails that are identical and can be shared.
2958 */
2959 static void
Bram Moolenaarb765d632005-06-07 21:00:02 +00002960wordtree_compress(root, spin)
Bram Moolenaar51485f02005-06-04 21:55:20 +00002961 wordnode_T *root;
Bram Moolenaarb765d632005-06-07 21:00:02 +00002962 spellinfo_T *spin;
Bram Moolenaar51485f02005-06-04 21:55:20 +00002963{
2964 hashtab_T ht;
2965 int n;
2966 int tot = 0;
2967
2968 if (root != NULL)
2969 {
2970 hash_init(&ht);
2971 n = node_compress(root, &ht, &tot);
Bram Moolenaarb765d632005-06-07 21:00:02 +00002972 if (spin->si_verbose || p_verbose > 2)
2973 {
2974 if (!spin->si_verbose)
2975 verbose_enter();
2976 smsg((char_u *)_("Compressed %d of %d nodes; %d%% remaining"),
Bram Moolenaar51485f02005-06-04 21:55:20 +00002977 n, tot, (tot - n) * 100 / tot);
Bram Moolenaarb765d632005-06-07 21:00:02 +00002978 if (p_verbose > 2)
2979 verbose_leave();
2980 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00002981 hash_clear(&ht);
2982 }
2983}
2984
2985/*
2986 * Compress a node, its siblings and its children, depth first.
2987 * Returns the number of compressed nodes.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002988 */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00002989 static int
Bram Moolenaar51485f02005-06-04 21:55:20 +00002990node_compress(node, ht, tot)
2991 wordnode_T *node;
2992 hashtab_T *ht;
2993 int *tot; /* total count of nodes before compressing,
2994 incremented while going through the tree */
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00002995{
Bram Moolenaar51485f02005-06-04 21:55:20 +00002996 wordnode_T *np;
2997 wordnode_T *tp;
2998 wordnode_T *child;
2999 hash_T hash;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003000 hashitem_T *hi;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003001 int len = 0;
3002 unsigned nr, n;
3003 int compressed = 0;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003004
Bram Moolenaar51485f02005-06-04 21:55:20 +00003005 /*
3006 * Go through the list of siblings. Compress each child and then try
3007 * finding an identical child to replace it.
3008 * Note that with "child" we mean not just the node that is pointed to,
3009 * but the whole list of siblings, of which the node is the first.
3010 */
3011 for (np = node; np != NULL; np = np->wn_sibling)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003012 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003013 ++len;
3014 if ((child = np->wn_child) != NULL)
3015 {
3016 /* Compress the child. This fills wn_hashkey. */
3017 compressed += node_compress(child, ht, tot);
3018
3019 /* Try to find an identical child. */
3020 hash = hash_hash(child->wn_hashkey);
3021 hi = hash_lookup(ht, child->wn_hashkey, hash);
3022 tp = NULL;
3023 if (!HASHITEM_EMPTY(hi))
3024 {
3025 /* There are children with an identical hash value. Now check
3026 * if there is one that is really identical. */
3027 for (tp = HI2WN(hi); tp != NULL; tp = tp->wn_next)
3028 if (node_equal(child, tp))
3029 {
3030 /* Found one! Now use that child in place of the
3031 * current one. This means the current child is
3032 * dropped from the tree. */
3033 np->wn_child = tp;
3034 ++compressed;
3035 break;
3036 }
3037 if (tp == NULL)
3038 {
3039 /* No other child with this hash value equals the child of
3040 * the node, add it to the linked list after the first
3041 * item. */
3042 tp = HI2WN(hi);
3043 child->wn_next = tp->wn_next;
3044 tp->wn_next = child;
3045 }
3046 }
3047 else
3048 /* No other child has this hash value, add it to the
3049 * hashtable. */
3050 hash_add_item(ht, hi, child->wn_hashkey, hash);
3051 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003052 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00003053 *tot += len;
3054
3055 /*
3056 * Make a hash key for the node and its siblings, so that we can quickly
3057 * find a lookalike node. This must be done after compressing the sibling
3058 * list, otherwise the hash key would become invalid by the compression.
3059 */
3060 node->wn_hashkey[0] = len;
3061 nr = 0;
3062 for (np = node; np != NULL; np = np->wn_sibling)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003063 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003064 if (np->wn_byte == NUL)
3065 /* end node: only use wn_flags and wn_region */
3066 n = np->wn_flags + (np->wn_region << 8);
3067 else
3068 /* byte node: use the byte value and the child pointer */
3069 n = np->wn_byte + ((long_u)np->wn_child << 8);
3070 nr = nr * 101 + n;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003071 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00003072
3073 /* Avoid NUL bytes, it terminates the hash key. */
3074 n = nr & 0xff;
3075 node->wn_hashkey[1] = n == 0 ? 1 : n;
3076 n = (nr >> 8) & 0xff;
3077 node->wn_hashkey[2] = n == 0 ? 1 : n;
3078 n = (nr >> 16) & 0xff;
3079 node->wn_hashkey[3] = n == 0 ? 1 : n;
3080 n = (nr >> 24) & 0xff;
3081 node->wn_hashkey[4] = n == 0 ? 1 : n;
3082 node->wn_hashkey[5] = NUL;
3083
3084 return compressed;
3085}
3086
3087/*
3088 * Return TRUE when two nodes have identical siblings and children.
3089 */
3090 static int
3091node_equal(n1, n2)
3092 wordnode_T *n1;
3093 wordnode_T *n2;
3094{
3095 wordnode_T *p1;
3096 wordnode_T *p2;
3097
3098 for (p1 = n1, p2 = n2; p1 != NULL && p2 != NULL;
3099 p1 = p1->wn_sibling, p2 = p2->wn_sibling)
3100 if (p1->wn_byte != p2->wn_byte
3101 || (p1->wn_byte == NUL
3102 ? (p1->wn_flags != p2->wn_flags
3103 || p1->wn_region != p2->wn_region)
3104 : (p1->wn_child != p2->wn_child)))
3105 break;
3106
3107 return p1 == NULL && p2 == NULL;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003108}
3109
3110/*
3111 * Write a number to file "fd", MSB first, in "len" bytes.
3112 */
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003113 void
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003114put_bytes(fd, nr, len)
3115 FILE *fd;
3116 long_u nr;
3117 int len;
3118{
3119 int i;
3120
3121 for (i = len - 1; i >= 0; --i)
3122 putc((int)(nr >> (i * 8)), fd);
3123}
3124
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003125static int
3126#ifdef __BORLANDC__
3127_RTLENTRYF
3128#endif
3129rep_compare __ARGS((const void *s1, const void *s2));
3130
3131/*
3132 * Function given to qsort() to sort the REP items on "from" string.
3133 */
3134 static int
3135#ifdef __BORLANDC__
3136_RTLENTRYF
3137#endif
3138rep_compare(s1, s2)
3139 const void *s1;
3140 const void *s2;
3141{
3142 fromto_T *p1 = (fromto_T *)s1;
3143 fromto_T *p2 = (fromto_T *)s2;
3144
3145 return STRCMP(p1->ft_from, p2->ft_from);
3146}
3147
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003148/*
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003149 * Write the Vim spell file "fname".
3150 */
3151 static void
Bram Moolenaar3982c542005-06-08 21:56:31 +00003152write_vim_spell(fname, spin)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003153 char_u *fname;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003154 spellinfo_T *spin;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003155{
Bram Moolenaar51485f02005-06-04 21:55:20 +00003156 FILE *fd;
3157 int regionmask;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003158 int round;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003159 wordnode_T *tree;
3160 int nodecount;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003161 int i;
3162 int l;
3163 garray_T *gap;
3164 fromto_T *ftp;
3165 char_u *p;
3166 int rr;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003167
Bram Moolenaarb765d632005-06-07 21:00:02 +00003168 fd = mch_fopen((char *)fname, "w");
Bram Moolenaar51485f02005-06-04 21:55:20 +00003169 if (fd == NULL)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003170 {
3171 EMSG2(_(e_notopen), fname);
3172 return;
3173 }
3174
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003175 /* <HEADER>: <fileID> <regioncnt> <regionname> ...
3176 * <charflagslen> <charflags> <fcharslen> <fchars> */
Bram Moolenaar51485f02005-06-04 21:55:20 +00003177
3178 /* <fileID> */
3179 if (fwrite(VIMSPELLMAGIC, VIMSPELLMAGICL, (size_t)1, fd) != 1)
3180 EMSG(_(e_write));
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003181
3182 /* write the region names if there is more than one */
Bram Moolenaar3982c542005-06-08 21:56:31 +00003183 if (spin->si_region_count > 1)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003184 {
Bram Moolenaar3982c542005-06-08 21:56:31 +00003185 putc(spin->si_region_count, fd); /* <regioncnt> <regionname> ... */
3186 fwrite(spin->si_region_name, (size_t)(spin->si_region_count * 2),
3187 (size_t)1, fd);
3188 regionmask = (1 << spin->si_region_count) - 1;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003189 }
3190 else
3191 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003192 putc(0, fd);
3193 regionmask = 0;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003194 }
3195
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003196 /*
3197 * Write the table with character flags and table for case folding.
Bram Moolenaar6f3058f2005-04-24 21:58:05 +00003198 * <charflagslen> <charflags> <fcharlen> <fchars>
3199 * Skip this for ASCII, the table may conflict with the one used for
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003200 * 'encoding'.
3201 * Also skip this for an .add.spl file, the main spell file must contain
3202 * the table (avoids that it conflicts). File is shorter too.
3203 */
3204 if (spin->si_ascii || spin->si_add)
Bram Moolenaar6f3058f2005-04-24 21:58:05 +00003205 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003206 putc(0, fd);
3207 putc(0, fd);
3208 putc(0, fd);
Bram Moolenaar6f3058f2005-04-24 21:58:05 +00003209 }
3210 else
Bram Moolenaar51485f02005-06-04 21:55:20 +00003211 write_spell_chartab(fd);
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003212
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003213 /* Sort the REP items. */
3214 qsort(spin->si_rep.ga_data, (size_t)spin->si_rep.ga_len,
3215 sizeof(fromto_T), rep_compare);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003216
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003217 /* <SUGGEST> : <repcount> <rep> ...
3218 * <salflags> <salcount> <sal> ...
3219 * <maplen> <mapstr> */
3220 for (round = 1; round <= 2; ++round)
3221 {
3222 if (round == 1)
3223 gap = &spin->si_rep;
3224 else
3225 {
3226 gap = &spin->si_sal;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003227
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003228 i = 0;
3229 if (spin->si_followup)
3230 i |= SAL_F0LLOWUP;
3231 if (spin->si_collapse)
3232 i |= SAL_COLLAPSE;
3233 if (spin->si_rem_accents)
3234 i |= SAL_REM_ACCENTS;
3235 putc(i, fd); /* <salflags> */
3236 }
3237
3238 put_bytes(fd, (long_u)gap->ga_len, 2); /* <repcount> or <salcount> */
3239 for (i = 0; i < gap->ga_len; ++i)
3240 {
3241 /* <rep> : <repfromlen> <repfrom> <reptolen> <repto> */
3242 /* <sal> : <salfromlen> <salfrom> <saltolen> <salto> */
3243 ftp = &((fromto_T *)gap->ga_data)[i];
3244 for (rr = 1; rr <= 2; ++rr)
3245 {
3246 p = rr == 1 ? ftp->ft_from : ftp->ft_to;
3247 l = STRLEN(p);
3248 putc(l, fd);
3249 fwrite(p, l, (size_t)1, fd);
3250 }
3251 }
3252 }
3253
3254 put_bytes(fd, (long_u)spin->si_map.ga_len, 2); /* <maplen> */
3255 if (spin->si_map.ga_len > 0) /* <mapstr> */
3256 fwrite(spin->si_map.ga_data, (size_t)spin->si_map.ga_len,
3257 (size_t)1, fd);
Bram Moolenaar50cde822005-06-05 21:54:54 +00003258
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003259 /*
Bram Moolenaar51485f02005-06-04 21:55:20 +00003260 * <LWORDTREE> <KWORDTREE>
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003261 */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003262 spin->si_memtot = 0;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003263 for (round = 1; round <= 2; ++round)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003264 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003265 tree = (round == 1) ? spin->si_foldroot : spin->si_keeproot;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003266
Bram Moolenaar51485f02005-06-04 21:55:20 +00003267 /* Count the number of nodes. Needed to be able to allocate the
3268 * memory when reading the nodes. Also fills in the index for shared
3269 * nodes. */
3270 nodecount = put_tree(NULL, tree, 0, regionmask);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003271
Bram Moolenaar51485f02005-06-04 21:55:20 +00003272 /* number of nodes in 4 bytes */
3273 put_bytes(fd, (long_u)nodecount, 4); /* <nodecount> */
Bram Moolenaar50cde822005-06-05 21:54:54 +00003274 spin->si_memtot += nodecount + nodecount * sizeof(int);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003275
Bram Moolenaar51485f02005-06-04 21:55:20 +00003276 /* Write the nodes. */
3277 (void)put_tree(fd, tree, 0, regionmask);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003278 }
3279
Bram Moolenaar51485f02005-06-04 21:55:20 +00003280 fclose(fd);
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00003281}
3282
3283/*
Bram Moolenaar51485f02005-06-04 21:55:20 +00003284 * Dump a word tree at node "node".
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003285 *
Bram Moolenaar51485f02005-06-04 21:55:20 +00003286 * This first writes the list of possible bytes (siblings). Then for each
3287 * byte recursively write the children.
3288 *
3289 * NOTE: The code here must match the code in read_tree(), since assumptions
3290 * are made about the indexes (so that we don't have to write them in the
3291 * file).
3292 *
3293 * Returns the number of nodes used.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003294 */
Bram Moolenaar51485f02005-06-04 21:55:20 +00003295 static int
3296put_tree(fd, node, index, regionmask)
3297 FILE *fd; /* NULL when only counting */
3298 wordnode_T *node;
3299 int index;
3300 int regionmask;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003301{
Bram Moolenaar51485f02005-06-04 21:55:20 +00003302 int newindex = index;
3303 int siblingcount = 0;
3304 wordnode_T *np;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003305 int flags;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003306
Bram Moolenaar51485f02005-06-04 21:55:20 +00003307 /* If "node" is zero the tree is empty. */
3308 if (node == NULL)
3309 return 0;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003310
Bram Moolenaar51485f02005-06-04 21:55:20 +00003311 /* Store the index where this node is written. */
3312 node->wn_index = index;
3313
3314 /* Count the number of siblings. */
3315 for (np = node; np != NULL; np = np->wn_sibling)
3316 ++siblingcount;
3317
3318 /* Write the sibling count. */
3319 if (fd != NULL)
3320 putc(siblingcount, fd); /* <siblingcount> */
3321
3322 /* Write each sibling byte and optionally extra info. */
3323 for (np = node; np != NULL; np = np->wn_sibling)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003324 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003325 if (np->wn_byte == 0)
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00003326 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003327 if (fd != NULL)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003328 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003329 /* For a NUL byte (end of word) instead of the byte itself
3330 * we write the flag/region items. */
3331 flags = np->wn_flags;
3332 if (regionmask != 0 && np->wn_region != regionmask)
3333 flags |= WF_REGION;
3334 if (flags == 0)
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00003335 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003336 /* word without flags or region */
3337 putc(BY_NOFLAGS, fd); /* <byte> */
3338 }
3339 else
3340 {
3341 putc(BY_FLAGS, fd); /* <byte> */
3342 putc(flags, fd); /* <flags> */
3343 if (flags & WF_REGION)
3344 putc(np->wn_region, fd); /* <regionmask> */
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00003345 }
3346 }
Bram Moolenaar2cf8b302005-04-20 19:37:22 +00003347 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00003348 else
3349 {
3350 if (np->wn_child->wn_index != 0 && np->wn_child->wn_wnode != node)
3351 {
3352 /* The child is written elsewhere, write the reference. */
3353 if (fd != NULL)
3354 {
3355 putc(BY_INDEX, fd); /* <byte> */
3356 /* <nodeidx> */
3357 put_bytes(fd, (long_u)np->wn_child->wn_index, 3);
3358 }
3359 }
3360 else if (np->wn_child->wn_wnode == NULL)
3361 /* We will write the child below and give it an index. */
3362 np->wn_child->wn_wnode = node;
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003363
Bram Moolenaar51485f02005-06-04 21:55:20 +00003364 if (fd != NULL)
3365 if (putc(np->wn_byte, fd) == EOF) /* <byte> or <xbyte> */
3366 {
3367 EMSG(_(e_write));
3368 return 0;
3369 }
3370 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003371 }
Bram Moolenaar51485f02005-06-04 21:55:20 +00003372
3373 /* Space used in the array when reading: one for each sibling and one for
3374 * the count. */
3375 newindex += siblingcount + 1;
3376
3377 /* Recursively dump the children of each sibling. */
3378 for (np = node; np != NULL; np = np->wn_sibling)
3379 if (np->wn_byte != 0 && np->wn_child->wn_wnode == node)
3380 newindex = put_tree(fd, np->wn_child, newindex, regionmask);
3381
3382 return newindex;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003383}
3384
3385
3386/*
Bram Moolenaarb765d632005-06-07 21:00:02 +00003387 * ":mkspell [-ascii] outfile infile ..."
3388 * ":mkspell [-ascii] addfile"
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003389 */
3390 void
3391ex_mkspell(eap)
3392 exarg_T *eap;
3393{
3394 int fcount;
3395 char_u **fnames;
Bram Moolenaarb765d632005-06-07 21:00:02 +00003396 char_u *arg = eap->arg;
3397 int ascii = FALSE;
3398
3399 if (STRNCMP(arg, "-ascii", 6) == 0)
3400 {
3401 ascii = TRUE;
3402 arg = skipwhite(arg + 6);
3403 }
3404
3405 /* Expand all the remaining arguments (e.g., $VIMRUNTIME). */
3406 if (get_arglist_exp(arg, &fcount, &fnames) == OK)
3407 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003408 mkspell(fcount, fnames, ascii, eap->forceit, FALSE);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003409 FreeWild(fcount, fnames);
3410 }
3411}
3412
3413/*
3414 * Create a Vim spell file from one or more word lists.
3415 * "fnames[0]" is the output file name.
3416 * "fnames[fcount - 1]" is the last input file name.
3417 * Exception: when "fnames[0]" ends in ".add" it's used as the input file name
3418 * and ".spl" is appended to make the output file name.
3419 */
3420 static void
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003421mkspell(fcount, fnames, ascii, overwrite, added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00003422 int fcount;
3423 char_u **fnames;
3424 int ascii; /* -ascii argument given */
3425 int overwrite; /* overwrite existing output file */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003426 int added_word; /* invoked through "zg" */
Bram Moolenaarb765d632005-06-07 21:00:02 +00003427{
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003428 char_u fname[MAXPATHL];
3429 char_u wfname[MAXPATHL];
Bram Moolenaarb765d632005-06-07 21:00:02 +00003430 char_u **innames;
3431 int incount;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003432 afffile_T *(afile[8]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003433 int i;
3434 int len;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003435 struct stat st;
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003436 int error = FALSE;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003437 spellinfo_T spin;
3438
3439 vim_memset(&spin, 0, sizeof(spin));
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003440 spin.si_verbose = !added_word;
Bram Moolenaarb765d632005-06-07 21:00:02 +00003441 spin.si_ascii = ascii;
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003442 spin.si_followup = TRUE;
3443 spin.si_rem_accents = TRUE;
3444 ga_init2(&spin.si_rep, (int)sizeof(fromto_T), 20);
3445 ga_init2(&spin.si_sal, (int)sizeof(fromto_T), 20);
3446 ga_init2(&spin.si_map, (int)sizeof(char_u), 100);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003447
Bram Moolenaarb765d632005-06-07 21:00:02 +00003448 /* default: fnames[0] is output file, following are input files */
3449 innames = &fnames[1];
3450 incount = fcount - 1;
3451
3452 if (fcount >= 1)
Bram Moolenaar5482f332005-04-17 20:18:43 +00003453 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00003454 len = STRLEN(fnames[0]);
3455 if (fcount == 1 && len > 4 && STRCMP(fnames[0] + len - 4, ".add") == 0)
3456 {
3457 /* For ":mkspell path/en.latin1.add" output file is
3458 * "path/en.latin1.add.spl". */
3459 innames = &fnames[0];
3460 incount = 1;
3461 vim_snprintf((char *)wfname, sizeof(wfname), "%s.spl", fnames[0]);
3462 }
3463 else if (len > 4 && STRCMP(fnames[0] + len - 4, ".spl") == 0)
3464 {
3465 /* Name ends in ".spl", use as the file name. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003466 vim_strncpy(wfname, fnames[0], sizeof(wfname) - 1);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003467 }
3468 else
3469 /* Name should be language, make the file name from it. */
3470 vim_snprintf((char *)wfname, sizeof(wfname), "%s.%s.spl", fnames[0],
3471 spin.si_ascii ? (char_u *)"ascii" : spell_enc());
3472
3473 /* Check for .ascii.spl. */
3474 if (strstr((char *)gettail(wfname), ".ascii.") != NULL)
3475 spin.si_ascii = TRUE;
3476
3477 /* Check for .add.spl. */
3478 if (strstr((char *)gettail(wfname), ".add.") != NULL)
3479 spin.si_add = TRUE;
Bram Moolenaar5482f332005-04-17 20:18:43 +00003480 }
3481
Bram Moolenaarb765d632005-06-07 21:00:02 +00003482 if (incount <= 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003483 EMSG(_(e_invarg)); /* need at least output and input names */
Bram Moolenaarb765d632005-06-07 21:00:02 +00003484 else if (incount > 8)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003485 EMSG(_("E754: Only up to 8 regions supported"));
3486 else
3487 {
3488 /* Check for overwriting before doing things that may take a lot of
3489 * time. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00003490 if (!overwrite && mch_stat((char *)wfname, &st) >= 0)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003491 {
3492 EMSG(_(e_exists));
Bram Moolenaarb765d632005-06-07 21:00:02 +00003493 return;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003494 }
Bram Moolenaarb765d632005-06-07 21:00:02 +00003495 if (mch_isdir(wfname))
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003496 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00003497 EMSG2(_(e_isadir2), wfname);
3498 return;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003499 }
3500
3501 /*
3502 * Init the aff and dic pointers.
3503 * Get the region names if there are more than 2 arguments.
3504 */
Bram Moolenaarb765d632005-06-07 21:00:02 +00003505 for (i = 0; i < incount; ++i)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003506 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00003507 afile[i] = NULL;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003508
Bram Moolenaar3982c542005-06-08 21:56:31 +00003509 if (incount > 1)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003510 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00003511 len = STRLEN(innames[i]);
3512 if (STRLEN(gettail(innames[i])) < 5
3513 || innames[i][len - 3] != '_')
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003514 {
Bram Moolenaarb765d632005-06-07 21:00:02 +00003515 EMSG2(_("E755: Invalid region in %s"), innames[i]);
3516 return;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003517 }
Bram Moolenaar3982c542005-06-08 21:56:31 +00003518 spin.si_region_name[i * 2] = TOLOWER_ASC(innames[i][len - 2]);
3519 spin.si_region_name[i * 2 + 1] =
3520 TOLOWER_ASC(innames[i][len - 1]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003521 }
3522 }
Bram Moolenaar3982c542005-06-08 21:56:31 +00003523 spin.si_region_count = incount;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003524
Bram Moolenaarb765d632005-06-07 21:00:02 +00003525 if (!spin.si_add)
3526 /* Clear the char type tables, don't want to use any of the
3527 * currently used spell properties. */
3528 init_spell_chartab();
Bram Moolenaar8fef2ad2005-04-23 20:42:23 +00003529
Bram Moolenaar51485f02005-06-04 21:55:20 +00003530 spin.si_foldroot = wordtree_alloc(&spin.si_blocks);
3531 spin.si_keeproot = wordtree_alloc(&spin.si_blocks);
3532 if (spin.si_foldroot == NULL || spin.si_keeproot == NULL)
3533 {
3534 error = TRUE;
Bram Moolenaarb765d632005-06-07 21:00:02 +00003535 return;
Bram Moolenaar51485f02005-06-04 21:55:20 +00003536 }
3537
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003538 /*
3539 * Read all the .aff and .dic files.
3540 * Text is converted to 'encoding'.
Bram Moolenaar51485f02005-06-04 21:55:20 +00003541 * Words are stored in the case-folded and keep-case trees.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003542 */
Bram Moolenaarb765d632005-06-07 21:00:02 +00003543 for (i = 0; i < incount && !error; ++i)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003544 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003545 spin.si_conv.vc_type = CONV_NONE;
Bram Moolenaarb765d632005-06-07 21:00:02 +00003546 spin.si_region = 1 << i;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003547
Bram Moolenaarb765d632005-06-07 21:00:02 +00003548 vim_snprintf((char *)fname, sizeof(fname), "%s.aff", innames[i]);
Bram Moolenaar51485f02005-06-04 21:55:20 +00003549 if (mch_stat((char *)fname, &st) >= 0)
3550 {
3551 /* Read the .aff file. Will init "spin->si_conv" based on the
3552 * "SET" line. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00003553 afile[i] = spell_read_aff(fname, &spin);
3554 if (afile[i] == NULL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003555 error = TRUE;
3556 else
3557 {
3558 /* Read the .dic file and store the words in the trees. */
3559 vim_snprintf((char *)fname, sizeof(fname), "%s.dic",
Bram Moolenaarb765d632005-06-07 21:00:02 +00003560 innames[i]);
3561 if (spell_read_dic(fname, &spin, afile[i]) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003562 error = TRUE;
3563 }
3564 }
3565 else
3566 {
3567 /* No .aff file, try reading the file as a word list. Store
3568 * the words in the trees. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00003569 if (spell_read_wordfile(innames[i], &spin) == FAIL)
Bram Moolenaar51485f02005-06-04 21:55:20 +00003570 error = TRUE;
3571 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003572
Bram Moolenaarb765d632005-06-07 21:00:02 +00003573#ifdef FEAT_MBYTE
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003574 /* Free any conversion stuff. */
Bram Moolenaar51485f02005-06-04 21:55:20 +00003575 convert_setup(&spin.si_conv, NULL, NULL);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003576#endif
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003577 }
3578
Bram Moolenaar51485f02005-06-04 21:55:20 +00003579 if (!error)
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003580 {
Bram Moolenaar51485f02005-06-04 21:55:20 +00003581 /*
3582 * Remove the dummy NUL from the start of the tree root.
3583 */
3584 spin.si_foldroot = spin.si_foldroot->wn_sibling;
3585 spin.si_keeproot = spin.si_keeproot->wn_sibling;
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003586
3587 /*
Bram Moolenaar51485f02005-06-04 21:55:20 +00003588 * Combine tails in the tree.
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003589 */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003590 if (!added_word || p_verbose > 2)
Bram Moolenaarb765d632005-06-07 21:00:02 +00003591 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003592 if (added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00003593 verbose_enter();
3594 MSG(_("Compressing word tree..."));
3595 out_flush();
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003596 if (added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00003597 verbose_leave();
3598 }
3599 wordtree_compress(spin.si_foldroot, &spin);
3600 wordtree_compress(spin.si_keeproot, &spin);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003601 }
3602
Bram Moolenaar51485f02005-06-04 21:55:20 +00003603 if (!error)
3604 {
3605 /*
3606 * Write the info in the spell file.
3607 */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003608 if (!added_word || p_verbose > 2)
Bram Moolenaarb765d632005-06-07 21:00:02 +00003609 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003610 if (added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00003611 verbose_enter();
3612 smsg((char_u *)_("Writing spell file %s..."), wfname);
3613 out_flush();
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003614 if (added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00003615 verbose_leave();
3616 }
Bram Moolenaar50cde822005-06-05 21:54:54 +00003617
Bram Moolenaar3982c542005-06-08 21:56:31 +00003618 write_vim_spell(wfname, &spin);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003619
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003620 if (!added_word || p_verbose > 2)
Bram Moolenaarb765d632005-06-07 21:00:02 +00003621 {
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003622 if (added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00003623 verbose_enter();
3624 MSG(_("Done!"));
3625 smsg((char_u *)_("Estimated runtime memory use: %d bytes"),
Bram Moolenaar50cde822005-06-05 21:54:54 +00003626 spin.si_memtot);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003627 out_flush();
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003628 if (added_word)
Bram Moolenaarb765d632005-06-07 21:00:02 +00003629 verbose_leave();
3630 }
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003631
Bram Moolenaarb765d632005-06-07 21:00:02 +00003632 /* If the file is loaded need to reload it. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003633 spell_reload_one(wfname, added_word);
Bram Moolenaar51485f02005-06-04 21:55:20 +00003634 }
3635
3636 /* Free the allocated memory. */
3637 free_blocks(spin.si_blocks);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003638 ga_clear(&spin.si_rep);
3639 ga_clear(&spin.si_sal);
3640 ga_clear(&spin.si_map);
Bram Moolenaar51485f02005-06-04 21:55:20 +00003641
3642 /* Free the .aff file structures. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00003643 for (i = 0; i < incount; ++i)
3644 if (afile[i] != NULL)
3645 spell_free_aff(afile[i]);
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003646 }
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003647}
3648
Bram Moolenaarb765d632005-06-07 21:00:02 +00003649
3650/*
3651 * ":spellgood {word}"
3652 * ":spellwrong {word}"
3653 */
3654 void
3655ex_spell(eap)
3656 exarg_T *eap;
3657{
3658 spell_add_word(eap->arg, STRLEN(eap->arg), eap->cmdidx == CMD_spellwrong);
3659}
3660
3661/*
3662 * Add "word[len]" to 'spellfile' as a good or bad word.
3663 */
3664 void
3665spell_add_word(word, len, bad)
3666 char_u *word;
3667 int len;
3668 int bad;
3669{
3670 FILE *fd;
3671 buf_T *buf;
3672
3673 if (*curbuf->b_p_spf == NUL)
3674 init_spellfile();
3675 if (*curbuf->b_p_spf == NUL)
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003676 EMSG(_("E764: 'spellfile' is not set"));
Bram Moolenaarb765d632005-06-07 21:00:02 +00003677 else
3678 {
3679 /* Check that the user isn't editing the .add file somewhere. */
3680 buf = buflist_findname_exp(curbuf->b_p_spf);
3681 if (buf != NULL && buf->b_ml.ml_mfp == NULL)
3682 buf = NULL;
3683 if (buf != NULL && bufIsChanged(buf))
3684 EMSG(_(e_bufloaded));
3685 else
3686 {
3687 fd = mch_fopen((char *)curbuf->b_p_spf, "a");
3688 if (fd == NULL)
3689 EMSG2(_(e_notopen), curbuf->b_p_spf);
3690 else
3691 {
3692 if (bad)
3693 fprintf(fd, "/!%.*s\n", len, word);
3694 else
3695 fprintf(fd, "%.*s\n", len, word);
3696 fclose(fd);
3697
3698 /* Update the .add.spl file. */
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003699 mkspell(1, &curbuf->b_p_spf, FALSE, TRUE, TRUE);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003700
3701 /* If the .add file is edited somewhere, reload it. */
3702 if (buf != NULL)
3703 buf_reload(buf);
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00003704
3705 redraw_all_later(NOT_VALID);
Bram Moolenaarb765d632005-06-07 21:00:02 +00003706 }
3707 }
3708 }
3709}
3710
3711/*
3712 * Initialize 'spellfile' for the current buffer.
3713 */
3714 static void
3715init_spellfile()
3716{
3717 char_u buf[MAXPATHL];
3718 int l;
3719 slang_T *sl;
3720 char_u *rtp;
3721
3722 if (*curbuf->b_p_spl != NUL && curbuf->b_langp.ga_len > 0)
3723 {
3724 /* Loop over all entries in 'runtimepath'. */
3725 rtp = p_rtp;
3726 while (*rtp != NUL)
3727 {
3728 /* Copy the path from 'runtimepath' to buf[]. */
3729 copy_option_part(&rtp, buf, MAXPATHL, ",");
3730 if (filewritable(buf) == 2)
3731 {
Bram Moolenaar3982c542005-06-08 21:56:31 +00003732 /* Use the first language name from 'spelllang' and the
3733 * encoding used in the first loaded .spl file. */
Bram Moolenaarb765d632005-06-07 21:00:02 +00003734 sl = LANGP_ENTRY(curbuf->b_langp, 0)->lp_slang;
3735 l = STRLEN(buf);
3736 vim_snprintf((char *)buf + l, MAXPATHL - l,
Bram Moolenaar3982c542005-06-08 21:56:31 +00003737 "/spell/%.*s.%s.add",
3738 2, curbuf->b_p_spl,
Bram Moolenaarb765d632005-06-07 21:00:02 +00003739 strstr((char *)gettail(sl->sl_fname), ".ascii.") != NULL
3740 ? (char_u *)"ascii" : spell_enc());
3741 set_option_value((char_u *)"spellfile", 0L, buf, OPT_LOCAL);
3742 break;
3743 }
3744 }
3745 }
3746}
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00003747
Bram Moolenaar51485f02005-06-04 21:55:20 +00003748
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003749/*
3750 * Init the chartab used for spelling for ASCII.
3751 * EBCDIC is not supported!
3752 */
3753 static void
3754clear_spell_chartab(sp)
3755 spelltab_T *sp;
3756{
3757 int i;
3758
3759 /* Init everything to FALSE. */
3760 vim_memset(sp->st_isw, FALSE, sizeof(sp->st_isw));
3761 vim_memset(sp->st_isu, FALSE, sizeof(sp->st_isu));
3762 for (i = 0; i < 256; ++i)
3763 sp->st_fold[i] = i;
3764
3765 /* We include digits. A word shouldn't start with a digit, but handling
3766 * that is done separately. */
3767 for (i = '0'; i <= '9'; ++i)
3768 sp->st_isw[i] = TRUE;
3769 for (i = 'A'; i <= 'Z'; ++i)
3770 {
3771 sp->st_isw[i] = TRUE;
3772 sp->st_isu[i] = TRUE;
3773 sp->st_fold[i] = i + 0x20;
3774 }
3775 for (i = 'a'; i <= 'z'; ++i)
3776 sp->st_isw[i] = TRUE;
3777}
3778
3779/*
3780 * Init the chartab used for spelling. Only depends on 'encoding'.
3781 * Called once while starting up and when 'encoding' changes.
3782 * The default is to use isalpha(), but the spell file should define the word
3783 * characters to make it possible that 'encoding' differs from the current
3784 * locale.
3785 */
3786 void
3787init_spell_chartab()
3788{
3789 int i;
3790
3791 did_set_spelltab = FALSE;
3792 clear_spell_chartab(&spelltab);
3793
3794#ifdef FEAT_MBYTE
3795 if (enc_dbcs)
3796 {
3797 /* DBCS: assume double-wide characters are word characters. */
3798 for (i = 128; i <= 255; ++i)
3799 if (MB_BYTE2LEN(i) == 2)
3800 spelltab.st_isw[i] = TRUE;
3801 }
3802 else
3803#endif
3804 {
3805 /* Rough guess: use isalpha() and isupper() for characters above 128. */
3806 for (i = 128; i < 256; ++i)
3807 {
3808 spelltab.st_isw[i] = MB_ISUPPER(i) || MB_ISLOWER(i);
3809 if (MB_ISUPPER(i))
3810 {
3811 spelltab.st_isu[i] = TRUE;
3812 spelltab.st_fold[i] = MB_TOLOWER(i);
3813 }
3814 }
3815 }
3816}
3817
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003818static char *e_affform = N_("E761: Format error in affix file FOL, LOW or UPP");
3819static char *e_affrange = N_("E762: Character in FOL, LOW or UPP is out of range");
3820
3821/*
3822 * Set the spell character tables from strings in the affix file.
3823 */
3824 static int
3825set_spell_chartab(fol, low, upp)
3826 char_u *fol;
3827 char_u *low;
3828 char_u *upp;
3829{
3830 /* We build the new tables here first, so that we can compare with the
3831 * previous one. */
3832 spelltab_T new_st;
3833 char_u *pf = fol, *pl = low, *pu = upp;
3834 int f, l, u;
3835
3836 clear_spell_chartab(&new_st);
3837
3838 while (*pf != NUL)
3839 {
3840 if (*pl == NUL || *pu == NUL)
3841 {
3842 EMSG(_(e_affform));
3843 return FAIL;
3844 }
3845#ifdef FEAT_MBYTE
3846 f = mb_ptr2char_adv(&pf);
3847 l = mb_ptr2char_adv(&pl);
3848 u = mb_ptr2char_adv(&pu);
3849#else
3850 f = *pf++;
3851 l = *pl++;
3852 u = *pu++;
3853#endif
3854 /* Every character that appears is a word character. */
3855 if (f < 256)
3856 new_st.st_isw[f] = TRUE;
3857 if (l < 256)
3858 new_st.st_isw[l] = TRUE;
3859 if (u < 256)
3860 new_st.st_isw[u] = TRUE;
3861
3862 /* if "LOW" and "FOL" are not the same the "LOW" char needs
3863 * case-folding */
3864 if (l < 256 && l != f)
3865 {
3866 if (f >= 256)
3867 {
3868 EMSG(_(e_affrange));
3869 return FAIL;
3870 }
3871 new_st.st_fold[l] = f;
3872 }
3873
3874 /* if "UPP" and "FOL" are not the same the "UPP" char needs
3875 * case-folding and it's upper case. */
3876 if (u < 256 && u != f)
3877 {
3878 if (f >= 256)
3879 {
3880 EMSG(_(e_affrange));
3881 return FAIL;
3882 }
3883 new_st.st_fold[u] = f;
3884 new_st.st_isu[u] = TRUE;
3885 }
3886 }
3887
3888 if (*pl != NUL || *pu != NUL)
3889 {
3890 EMSG(_(e_affform));
3891 return FAIL;
3892 }
3893
3894 return set_spell_finish(&new_st);
3895}
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003896
3897/*
3898 * Set the spell character tables from strings in the .spl file.
3899 */
3900 static int
3901set_spell_charflags(flags, cnt, upp)
3902 char_u *flags;
3903 int cnt;
3904 char_u *upp;
3905{
3906 /* We build the new tables here first, so that we can compare with the
3907 * previous one. */
3908 spelltab_T new_st;
3909 int i;
3910 char_u *p = upp;
3911
3912 clear_spell_chartab(&new_st);
3913
3914 for (i = 0; i < cnt; ++i)
3915 {
3916 new_st.st_isw[i + 128] = (flags[i] & SPELL_ISWORD) != 0;
3917 new_st.st_isu[i + 128] = (flags[i] & SPELL_ISUPPER) != 0;
3918
3919 if (*p == NUL)
3920 return FAIL;
3921#ifdef FEAT_MBYTE
3922 new_st.st_fold[i + 128] = mb_ptr2char_adv(&p);
3923#else
3924 new_st.st_fold[i + 128] = *p++;
3925#endif
3926 }
3927
3928 return set_spell_finish(&new_st);
3929}
3930
3931 static int
3932set_spell_finish(new_st)
3933 spelltab_T *new_st;
3934{
3935 int i;
3936
3937 if (did_set_spelltab)
3938 {
3939 /* check that it's the same table */
3940 for (i = 0; i < 256; ++i)
3941 {
3942 if (spelltab.st_isw[i] != new_st->st_isw[i]
3943 || spelltab.st_isu[i] != new_st->st_isu[i]
3944 || spelltab.st_fold[i] != new_st->st_fold[i])
3945 {
3946 EMSG(_("E763: Word characters differ between spell files"));
3947 return FAIL;
3948 }
3949 }
3950 }
3951 else
3952 {
3953 /* copy the new spelltab into the one being used */
3954 spelltab = *new_st;
3955 did_set_spelltab = TRUE;
3956 }
3957
3958 return OK;
3959}
3960
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003961/*
3962 * Write the current tables into the .spl file.
3963 * This makes sure the same characters are recognized as word characters when
3964 * generating an when using a spell file.
3965 */
3966 static void
3967write_spell_chartab(fd)
3968 FILE *fd;
3969{
3970 char_u charbuf[256 * 4];
3971 int len = 0;
3972 int flags;
3973 int i;
3974
3975 fputc(128, fd); /* <charflagslen> */
3976 for (i = 128; i < 256; ++i)
3977 {
3978 flags = 0;
3979 if (spelltab.st_isw[i])
3980 flags |= SPELL_ISWORD;
3981 if (spelltab.st_isu[i])
3982 flags |= SPELL_ISUPPER;
3983 fputc(flags, fd); /* <charflags> */
3984
Bram Moolenaarb765d632005-06-07 21:00:02 +00003985#ifdef FEAT_MBYTE
3986 if (has_mbyte)
3987 len += mb_char2bytes(spelltab.st_fold[i], charbuf + len);
3988 else
3989#endif
3990 charbuf[len++] = spelltab.st_fold[i];
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003991 }
3992
3993 put_bytes(fd, (long_u)len, 2); /* <fcharlen> */
3994 fwrite(charbuf, (size_t)len, (size_t)1, fd); /* <fchars> */
3995}
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00003996
3997/*
3998 * Return TRUE if "c" is an upper-case character for spelling.
3999 */
4000 static int
4001spell_isupper(c)
4002 int c;
4003{
4004# ifdef FEAT_MBYTE
4005 if (enc_utf8)
4006 {
4007 /* For Unicode we can call utf_isupper(), but don't do that for ASCII,
4008 * because we don't want to use 'casemap' here. */
4009 if (c >= 128)
4010 return utf_isupper(c);
4011 }
4012 else if (has_mbyte && c > 256)
4013 {
4014 /* For characters above 255 we don't have something specfied.
4015 * Fall back to locale-dependent iswupper(). If not available
4016 * simply return FALSE. */
4017# ifdef HAVE_ISWUPPER
4018 return iswupper(c);
4019# else
4020 return FALSE;
4021# endif
4022 }
4023# endif
4024 return spelltab.st_isu[c];
4025}
4026
4027/*
4028 * Case-fold "p[len]" into "buf[buflen]". Used for spell checking.
4029 * When using a multi-byte 'encoding' the length may change!
4030 * Returns FAIL when something wrong.
4031 */
4032 static int
4033spell_casefold(p, len, buf, buflen)
4034 char_u *p;
4035 int len;
4036 char_u *buf;
4037 int buflen;
4038{
4039 int i;
4040
4041 if (len >= buflen)
4042 {
4043 buf[0] = NUL;
4044 return FAIL; /* result will not fit */
4045 }
4046
4047#ifdef FEAT_MBYTE
4048 if (has_mbyte)
4049 {
4050 int c;
4051 int outi = 0;
4052
4053 /* Fold one character at a time. */
4054 for (i = 0; i < len; i += mb_ptr2len_check(p + i))
4055 {
4056 c = mb_ptr2char(p + i);
4057 if (enc_utf8)
4058 /* For Unicode case folding is always the same, no need to use
4059 * the table from the spell file. */
4060 c = utf_fold(c);
4061 else if (c < 256)
4062 /* Use the table from the spell file. */
4063 c = spelltab.st_fold[c];
4064# ifdef HAVE_TOWLOWER
4065 else
4066 /* We don't know what to do, fall back to towlower(), it
4067 * depends on the current locale. */
4068 c = towlower(c);
4069# endif
4070 if (outi + MB_MAXBYTES > buflen)
4071 {
4072 buf[outi] = NUL;
4073 return FAIL;
4074 }
4075 outi += mb_char2bytes(c, buf + outi);
4076 }
4077 buf[outi] = NUL;
4078 }
4079 else
4080#endif
4081 {
4082 /* Be quick for non-multibyte encodings. */
4083 for (i = 0; i < len; ++i)
4084 buf[i] = spelltab.st_fold[p[i]];
4085 buf[i] = NUL;
4086 }
4087
4088 return OK;
4089}
4090
Bram Moolenaar9ba0eb82005-06-13 22:28:56 +00004091/*
4092 * "z?": Find badly spelled word under or after the cursor.
4093 * Give suggestions for the properly spelled word.
4094 * This is based on the mechanisms of Aspell, but completely reimplemented.
4095 */
4096 void
4097spell_suggest()
4098{
4099 char_u *line;
4100 pos_T prev_cursor = curwin->w_cursor;
4101 int attr;
4102 char_u wcopy[MAXWLEN + 2];
4103 char_u *p;
4104 int i;
4105 int c;
4106 suginfo_T sug;
4107 suggest_T *stp;
4108
4109 /*
4110 * Find the start of the badly spelled word.
4111 */
4112 if (spell_move_to(FORWARD, TRUE, TRUE) == FAIL)
4113 {
4114 beep_flush();
4115 return;
4116 }
4117
4118 /*
4119 * Set the info in "sug".
4120 */
4121 vim_memset(&sug, 0, sizeof(sug));
4122 ga_init2(&sug.su_ga, (int)sizeof(suggest_T), 10);
4123 hash_init(&sug.su_banned);
4124 line = ml_get_curline();
4125 sug.su_badptr = line + curwin->w_cursor.col;
4126 sug.su_badlen = spell_check(curwin, sug.su_badptr, &attr);
4127 if (sug.su_badlen >= MAXWLEN)
4128 sug.su_badlen = MAXWLEN - 1; /* just in case */
4129 vim_strncpy(sug.su_badword, sug.su_badptr, sug.su_badlen);
4130 (void)spell_casefold(sug.su_badptr, sug.su_badlen,
4131 sug.su_fbadword, MAXWLEN);
4132
4133 /* Ban the bad word itself. It may appear in another region. */
4134 add_banned(&sug, sug.su_badword);
4135
4136 /*
4137 * 1. Try inserting/deleting/swapping/changing a letter, use REP entries
4138 * from the .aff file and inserting a space (split the word).
4139 */
4140 /* Set a maximum score to limit the combination of operations that is
4141 * tried. */
4142 sug.su_maxscore = SCORE_MAXINIT;
4143 spell_try_change(&sug);
4144 cleanup_suggestions(&sug);
4145
4146 /*
4147 * 2. Try finding sound-a-like words.
4148 */
4149 /* Allow a higher score if we don't have many suggestions yet. */
4150 if (sug.su_maxscore == SCORE_MAXINIT)
4151 sug.su_maxscore = SCORE_MAXMAX;
4152 spell_try_soundalike(&sug);
4153
4154 /* When CTRL-C was hit while searching do show the results. */
4155 if (got_int)
4156 {
4157 (void)vgetc();
4158 got_int = FALSE;
4159 }
4160
4161 if (sug.su_ga.ga_len == 0)
4162 MSG(_("Sorry, no suggestions"));
4163 else
4164 {
4165 /* Cleanup, sort the suggestions and truncate at SUG_PROMPT_COUNT. */
4166 cleanup_suggestions(&sug);
4167
4168 /* List the suggestions. */
4169 msg_start();
4170 vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"),
4171 sug.su_badlen, sug.su_badptr);
4172 msg_puts(IObuff);
4173 msg_clr_eos();
4174 msg_putchar('\n');
4175 msg_scroll = TRUE;
4176 for (i = 0; i < sug.su_ga.ga_len; ++i)
4177 {
4178 stp = &SUG(&sug, i);
4179
4180 /* The suggested word may replace only part of the bad word, add
4181 * the not replaced part. */
4182 STRCPY(wcopy, stp->st_word);
4183 if (sug.su_badlen > stp->st_orglen)
4184 vim_strncpy(wcopy + STRLEN(wcopy),
4185 sug.su_badptr + stp->st_orglen,
4186 sug.su_badlen - stp->st_orglen);
4187 /* TODO: remove score */
4188 vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\" (%d)"),
4189 i + 1, wcopy, stp->st_score);
4190 msg_puts(IObuff);
4191 lines_left = 3; /* avoid more prompt */
4192 msg_putchar('\n');
4193 }
4194
4195 /* Ask for choice. */
4196 i = prompt_for_number();
4197 if (i > 0 && i <= sug.su_ga.ga_len && u_save_cursor())
4198 {
4199 /* Replace the word. */
4200 stp = &SUG(&sug, i - 1);
4201 p = alloc(STRLEN(line) - stp->st_orglen + STRLEN(stp->st_word) + 1);
4202 if (p != NULL)
4203 {
4204 c = sug.su_badptr - line;
4205 mch_memmove(p, line, c);
4206 STRCPY(p + c, stp->st_word);
4207 STRCAT(p, sug.su_badptr + stp->st_orglen);
4208 ml_replace(curwin->w_cursor.lnum, p, FALSE);
4209 curwin->w_cursor.col = c;
4210 changed_bytes(curwin->w_cursor.lnum, c);
4211 }
4212 }
4213 else
4214 curwin->w_cursor = prev_cursor;
4215 }
4216
4217 /* Free the suggestions. */
4218 for (i = 0; i < sug.su_ga.ga_len; ++i)
4219 vim_free(SUG(&sug, i).st_word);
4220 ga_clear(&sug.su_ga);
4221
4222 /* Free the banned words. */
4223 free_banned(&sug);
4224}
4225
4226/*
4227 * Make a copy of "word[len]", with the first letter upper or lower cased,
4228 * to "wcopy[MAXWLEN]".
4229 */
4230 static void
4231onecap_copy(word, len, wcopy, upper)
4232 char_u *word;
4233 int len;
4234 char_u *wcopy;
4235 int upper; /* TRUE: first letter made upper case */
4236{
4237 char_u *p;
4238 int c;
4239 int l;
4240
4241 p = word;
4242#ifdef FEAT_MBYTE
4243 if (has_mbyte)
4244 c = mb_ptr2char_adv(&p);
4245 else
4246#endif
4247 c = *p++;
4248 if (upper)
4249 c = MB_TOUPPER(c);
4250 else
4251 c = MB_TOLOWER(c);
4252#ifdef FEAT_MBYTE
4253 if (has_mbyte)
4254 l = mb_char2bytes(c, wcopy);
4255 else
4256#endif
4257 {
4258 l = 1;
4259 wcopy[0] = c;
4260 }
4261 vim_strncpy(wcopy + l, p, len - (p - word));
4262}
4263
4264/*
4265 * Make a copy of "word[len]" with all the letters upper cased into
4266 * "wcopy[MAXWLEN]".
4267 */
4268 static void
4269allcap_copy(word, wcopy)
4270 char_u *word;
4271 char_u *wcopy;
4272{
4273 char_u *s;
4274 char_u *d;
4275 int c;
4276
4277 d = wcopy;
4278 for (s = word; *s != NUL; )
4279 {
4280#ifdef FEAT_MBYTE
4281 if (has_mbyte)
4282 c = mb_ptr2char_adv(&s);
4283 else
4284#endif
4285 c = *s++;
4286
4287 c = MB_TOUPPER(c); /* TODO: use spell toupper */
4288
4289#ifdef FEAT_MBYTE
4290 if (has_mbyte)
4291 {
4292 if (d - wcopy >= MAXWLEN - MB_MAXBYTES)
4293 break;
4294 d += mb_char2bytes(c, d);
4295 }
4296 else
4297#endif
4298 {
4299 if (d - wcopy >= MAXWLEN - 1)
4300 break;
4301 *d++ = c;
4302 }
4303 }
4304 *d = NUL;
4305}
4306
4307/*
4308 * Try finding suggestions by adding/removing/swapping letters.
4309 */
4310 static void
4311spell_try_change(su)
4312 suginfo_T *su;
4313{
4314 char_u fword[MAXWLEN]; /* copy of the bad word, case-folded */
4315 char_u tword[MAXWLEN]; /* good word collected so far */
4316 trystate_T stack[MAXWLEN];
4317 char_u preword[MAXWLEN * 3]; /* word found with proper case (appended
4318 * to for word split) */
4319 char_u prewordlen = 0; /* length of word in "preword" */
4320 int splitoff = 0; /* index in tword after last split */
4321 trystate_T *sp;
4322 int newscore;
4323 langp_T *lp;
4324 char_u *byts;
4325 int *idxs;
4326 int depth;
4327 int c;
4328 int n;
4329 int flags;
4330 int badflags;
4331 garray_T *gap;
4332 int arridx;
4333 int len;
4334 char_u *p;
4335 fromto_T *ftp;
4336 int fl, tl;
4337
4338 /* get caps flags for bad word */
4339 badflags = captype(su->su_badptr, su->su_badptr + su->su_badlen);
4340
4341 /* We make a copy of the case-folded bad word, so that we can modify it
4342 * to find matches (esp. REP items). */
4343 STRCPY(fword, su->su_fbadword);
4344
4345 /*
4346 * At each node in the tree these states are tried:
4347 */
4348#define STATE_START 0 /* At start of node, check if word may end or
4349 * split word. */
4350#define STATE_SPLITUNDO 1 /* Undo word split. */
4351#define STATE_ENDNUL 2 /* Past NUL bytes at start of the node. */
4352#define STATE_PLAIN 3 /* Use each byte of the node. */
4353#define STATE_DEL 4 /* Delete a byte from the bad word. */
4354#define STATE_INS 5 /* Insert a byte in the bad word. */
4355#define STATE_SWAP 6 /* Swap two bytes. */
4356#define STATE_SWAP3A 7 /* Swap two bytes over three. */
4357#define STATE_ROT3L 8 /* Rotate three bytes left */
4358#define STATE_ROT3R 9 /* Rotate three bytes right */
4359#define STATE_ROT_UNDO 10 /* undo rotating */
4360#define STATE_REP_INI 11 /* Prepare for using REP items. */
4361#define STATE_REP 12 /* Use matching REP items from the .aff file. */
4362#define STATE_REP_UNDO 13 /* Undo a REP item replacement. */
4363#define STATE_FINAL 99 /* End of this node. */
4364
4365
4366 for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0);
4367 lp->lp_slang != NULL; ++lp)
4368 {
4369#ifdef SOUNDFOLD_SCORE
4370 su->su_slang = lp->lp_slang;
4371 if (lp->lp_slang->sl_sal.ga_len > 0)
4372 /* soundfold the bad word */
4373 spell_soundfold(lp->lp_slang, su->su_fbadword, su->su_salword);
4374#endif
4375
4376 /*
4377 * Go through the whole case-fold tree, try changes at each node.
4378 * "tword[]" contains the word collected from nodes in the tree.
4379 * "fword[]" the word we are trying to match with (initially the bad
4380 * word).
4381 */
4382 byts = lp->lp_slang->sl_fbyts;
4383 idxs = lp->lp_slang->sl_fidxs;
4384
4385 depth = 0;
4386 stack[0].ts_state = STATE_START;
4387 stack[0].ts_score = 0;
4388 stack[0].ts_curi = 1;
4389 stack[0].ts_fidx = 0;
4390 stack[0].ts_fidxtry = 0;
4391 stack[0].ts_twordlen = 0;
4392 stack[0].ts_arridx = 0;
4393
4394 while (depth >= 0 && !got_int)
4395 {
4396 sp = &stack[depth];
4397 switch (sp->ts_state)
4398 {
4399 case STATE_START:
4400 /*
4401 * Start of node: Deal with NUL bytes, which means
4402 * tword[] may end here.
4403 */
4404 arridx = sp->ts_arridx; /* current node in the tree */
4405 len = byts[arridx]; /* bytes in this node */
4406 arridx += sp->ts_curi; /* index of current byte */
4407
4408 if (sp->ts_curi > len || (c = byts[arridx]) != 0)
4409 {
4410 /* Past bytes in node and/or past NUL bytes. */
4411 sp->ts_state = STATE_ENDNUL;
4412 break;
4413 }
4414
4415 /*
4416 * End of word in tree.
4417 */
4418 ++sp->ts_curi; /* eat one NUL byte */
4419
4420 flags = idxs[arridx];
4421
4422 /*
4423 * Form the word with proper case in preword.
4424 * If there is a word from a previous split, append.
4425 */
4426 tword[sp->ts_twordlen] = NUL;
4427 if (flags & WF_KEEPCAP)
4428 /* Must find the word in the keep-case tree. */
4429 find_keepcap_word(lp->lp_slang, tword + splitoff,
4430 preword + prewordlen);
4431 else
4432 /* Include badflags: if the badword is onecap or allcap
4433 * use that for the goodword too. */
4434 make_case_word(tword + splitoff,
4435 preword + prewordlen, flags | badflags);
4436
4437 /* Don't use a banned word. It may appear again as a good
4438 * word, thus remember it. */
4439 if (flags & WF_BANNED)
4440 {
4441 add_banned(su, preword + prewordlen);
4442 break;
4443 }
4444 if (was_banned(su, preword + prewordlen))
4445 break;
4446
4447 newscore = 0;
4448 if ((flags & WF_REGION)
4449 && (((unsigned)flags >> 8) & lp->lp_region) == 0)
4450 newscore += SCORE_REGION;
4451 if (flags & WF_RARE)
4452 newscore += SCORE_RARE;
4453
4454 if (!spell_valid_case(badflags,
4455 captype(preword + prewordlen, NULL)))
4456 newscore += SCORE_ICASE;
4457
4458 if (fword[sp->ts_fidx] == 0)
4459 {
4460 /* The badword also ends: add suggestions, */
4461 add_suggestion(su, preword, sp->ts_score + newscore);
4462 }
4463 else if (sp->ts_fidx >= sp->ts_fidxtry)
4464 {
4465 /* The word in the tree ends but the badword
4466 * continues: try inserting a space and check that a valid
4467 * words starts at fword[sp->ts_fidx]. */
4468 if (try_deeper(su, stack, depth, newscore + SCORE_SPLIT))
4469 {
4470 /* Save things to be restored at STATE_SPLITUNDO. */
4471 sp->ts_save_prewordlen = prewordlen;
4472 sp->ts_save_badflags = badflags;
4473 sp->ts_save_splitoff = splitoff;
4474
4475 /* Append a space to preword. */
4476 STRCAT(preword, " ");
4477 prewordlen = STRLEN(preword);
4478 splitoff = sp->ts_twordlen;
4479 /* TODO: when case-folding changed the number of bytes
4480 * this doesn't work... */
4481 badflags = captype(su->su_badptr + sp->ts_fidx,
4482 su->su_badptr + su->su_badlen);
4483
4484 sp->ts_state = STATE_SPLITUNDO;
4485 ++depth;
4486 /* Restart at top of the tree. */
4487 stack[depth].ts_arridx = 0;
4488 }
4489 }
4490 break;
4491
4492 case STATE_SPLITUNDO:
4493 /* Fixup the changes done for word split. */
4494 badflags = sp->ts_save_badflags;
4495 splitoff = sp->ts_save_splitoff;
4496 prewordlen = sp->ts_save_prewordlen;
4497
4498 /* Continue looking for NUL bytes. */
4499 sp->ts_state = STATE_START;
4500 break;
4501
4502 case STATE_ENDNUL:
4503 /* Past the NUL bytes in the node. */
4504 if (fword[sp->ts_fidx] == 0)
4505 {
4506 /* The badword ends, can't use the bytes in this node. */
4507 sp->ts_state = STATE_DEL;
4508 break;
4509 }
4510 sp->ts_state = STATE_PLAIN;
4511 /*FALLTHROUGH*/
4512
4513 case STATE_PLAIN:
4514 /*
4515 * Go over all possible bytes at this node, add each to
4516 * tword[] and use child node. "ts_curi" is the index.
4517 */
4518 arridx = sp->ts_arridx;
4519 if (sp->ts_curi > byts[arridx])
4520 {
4521 /* Done all bytes at this node, do next state. When still
4522 * at already changed bytes skip the other tricks. */
4523 if (sp->ts_fidx >= sp->ts_fidxtry)
4524 sp->ts_state = STATE_DEL;
4525 else
4526 sp->ts_state = STATE_FINAL;
4527 }
4528 else
4529 {
4530 arridx += sp->ts_curi++;
4531 c = byts[arridx];
4532
4533 /* Normal byte, go one level deeper. If it's not equal to
4534 * the byte in the bad word adjust the score. But don't
4535 * even try when the byte was already changed. */
4536 if (c == fword[sp->ts_fidx])
4537 newscore = 0;
4538 /* TODO: multi-byte characters */
4539 else if (lp->lp_slang->sl_map != NULL
4540 && similar_chars(lp->lp_slang,
4541 c, fword[sp->ts_fidx]))
4542 newscore = SCORE_SIMILAR;
4543 else
4544 newscore = SCORE_SUBST;
4545 if ((newscore == 0 || sp->ts_fidx >= sp->ts_fidxtry)
4546 && try_deeper(su, stack, depth, newscore))
4547 {
4548 ++depth;
4549 ++stack[depth].ts_fidx;
4550 tword[stack[depth].ts_twordlen++] = c;
4551 stack[depth].ts_arridx = idxs[arridx];
4552 }
4553 }
4554 break;
4555
4556 case STATE_DEL:
4557 /* Try skipping one byte in the bad word (delete it). */
4558 sp->ts_state = STATE_INS;
4559 sp->ts_curi = 1;
4560 if (fword[sp->ts_fidx] != NUL
4561 && try_deeper(su, stack, depth, SCORE_DEL))
4562 {
4563 ++depth;
4564 ++stack[depth].ts_fidx;
4565 break;
4566 }
4567 /*FALLTHROUGH*/
4568
4569 case STATE_INS:
4570 /* Insert one byte. Do this for each possible bytes at this
4571 * node. */
4572 n = sp->ts_arridx;
4573 if (sp->ts_curi > byts[n])
4574 {
4575 /* Done all bytes at this node, do next state. */
4576 sp->ts_state = STATE_SWAP;
4577 sp->ts_curi = 1;
4578 }
4579 else
4580 {
4581 /* Do one more byte at this node. */
4582 n += sp->ts_curi++;
4583 c = byts[n];
4584 if (c != 0 && try_deeper(su, stack, depth, SCORE_INS))
4585 {
4586 ++depth;
4587 tword[stack[depth].ts_twordlen++] = c;
4588 stack[depth].ts_arridx = idxs[n];
4589 }
4590 }
4591 break;
4592
4593 case STATE_SWAP:
4594 /* Swap two bytes: "12" -> "21". This means looking for the
4595 * following byte at the current node and the current byte at
4596 * its child node. We change "fword" here, it's changed back
4597 * afterwards. TODO: should swap characters instead of bytes.
4598 * */
4599 c = fword[sp->ts_fidx];
4600 if (c != NUL && fword[sp->ts_fidx + 1] != NUL
4601 && try_deeper(su, stack, depth, SCORE_SWAP))
4602 {
4603 sp->ts_state = STATE_SWAP3A;
4604 ++depth;
4605 fword[sp->ts_fidx] = fword[sp->ts_fidx + 1];
4606 fword[sp->ts_fidx + 1] = c;
4607 stack[depth].ts_fidxtry = sp->ts_fidx + 2;
4608 }
4609 else
4610 /* If this swap doesn't work then SWAP3 won't either. */
4611 sp->ts_state = STATE_REP_INI;
4612 break;
4613
4614 case STATE_SWAP3A:
4615 /* First undo the STATE_SWAP swap: "21" -> "12". */
4616 c = fword[sp->ts_fidx];
4617 fword[sp->ts_fidx] = fword[sp->ts_fidx + 1];
4618 fword[sp->ts_fidx + 1] = c;
4619
4620 /* Swap two bytes, skipping one: "123" -> "321". We change
4621 * "fword" here, it's changed back afterwards. TODO: should
4622 * swap characters instead of bytes. */
4623 c = fword[sp->ts_fidx];
4624 if (c != NUL && fword[sp->ts_fidx + 1] != NUL
4625 && fword[sp->ts_fidx + 2] != NUL
4626 && try_deeper(su, stack, depth, SCORE_SWAP3))
4627 {
4628 sp->ts_state = STATE_ROT3L;
4629 ++depth;
4630 fword[sp->ts_fidx] = fword[sp->ts_fidx + 2];
4631 fword[sp->ts_fidx + 2] = c;
4632 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
4633 }
4634 else
4635 sp->ts_state = STATE_REP_INI;
4636 break;
4637
4638 case STATE_ROT3L:
4639 /* First undo STATE_SWAP3A: "321" -> "123" */
4640 c = fword[sp->ts_fidx];
4641 fword[sp->ts_fidx] = fword[sp->ts_fidx + 2];
4642 fword[sp->ts_fidx + 2] = c;
4643
4644 /* Rotate three bytes left: "123" -> "231". We change
4645 * "fword" here, it's changed back afterwards. TODO: should
4646 * swap characters instead of bytes. */
4647 if (try_deeper(su, stack, depth, SCORE_SWAP3))
4648 {
4649 sp->ts_state = STATE_ROT3R;
4650 ++depth;
4651 c = fword[sp->ts_fidx];
4652 fword[sp->ts_fidx] = fword[sp->ts_fidx + 1];
4653 fword[sp->ts_fidx + 1] = fword[sp->ts_fidx + 2];
4654 fword[sp->ts_fidx + 2] = c;
4655 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
4656 }
4657 else
4658 sp->ts_state = STATE_REP_INI;
4659 break;
4660
4661 case STATE_ROT3R:
4662 /* First undo STATE_ROT3L: "231" -> "123" */
4663 c = fword[sp->ts_fidx + 2];
4664 fword[sp->ts_fidx + 2] = fword[sp->ts_fidx + 1];
4665 fword[sp->ts_fidx + 1] = fword[sp->ts_fidx];
4666 fword[sp->ts_fidx] = c;
4667
4668 /* Rotate three bytes right: "123" -> "312". We change
4669 * "fword" here, it's changed back afterwards. TODO: should
4670 * swap characters instead of bytes. */
4671 if (try_deeper(su, stack, depth, SCORE_SWAP3))
4672 {
4673 sp->ts_state = STATE_ROT_UNDO;
4674 ++depth;
4675 c = fword[sp->ts_fidx + 2];
4676 fword[sp->ts_fidx + 2] = fword[sp->ts_fidx + 1];
4677 fword[sp->ts_fidx + 1] = fword[sp->ts_fidx];
4678 fword[sp->ts_fidx] = c;
4679 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
4680 }
4681 else
4682 sp->ts_state = STATE_REP_INI;
4683 break;
4684
4685 case STATE_ROT_UNDO:
4686 /* Undo STATE_ROT3R: "312" -> "123" */
4687 c = fword[sp->ts_fidx];
4688 fword[sp->ts_fidx] = fword[sp->ts_fidx + 1];
4689 fword[sp->ts_fidx + 1] = fword[sp->ts_fidx + 2];
4690 fword[sp->ts_fidx + 2] = c;
4691 /*FALLTHROUGH*/
4692
4693 case STATE_REP_INI:
4694 /* Check if matching with REP items from the .aff file would
4695 * work. Quickly skip if there are no REP items or the score
4696 * is going to be too high anyway. */
4697 gap = &lp->lp_slang->sl_rep;
4698 if (gap->ga_len == 0
4699 || sp->ts_score + SCORE_REP >= su->su_maxscore)
4700 {
4701 sp->ts_state = STATE_FINAL;
4702 break;
4703 }
4704
4705 /* Use the first byte to quickly find the first entry that
4706 * matches. If the index is -1 there is none. */
4707 sp->ts_curi = lp->lp_slang->sl_rep_first[fword[sp->ts_fidx]];
4708 if (sp->ts_curi < 0)
4709 {
4710 sp->ts_state = STATE_FINAL;
4711 break;
4712 }
4713
4714 sp->ts_state = STATE_REP;
4715 /*FALLTHROUGH*/
4716
4717 case STATE_REP:
4718 /* Try matching with REP items from the .aff file. For each
4719 * match replace the charactes and check if the resulting word
4720 * is valid. */
4721 p = fword + sp->ts_fidx;
4722
4723 gap = &lp->lp_slang->sl_rep;
4724 while (sp->ts_curi < gap->ga_len)
4725 {
4726 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++;
4727 if (*ftp->ft_from != *p)
4728 {
4729 /* past possible matching entries */
4730 sp->ts_curi = gap->ga_len;
4731 break;
4732 }
4733 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0
4734 && try_deeper(su, stack, depth, SCORE_REP))
4735 {
4736 /* Need to undo this afterwards. */
4737 sp->ts_state = STATE_REP_UNDO;
4738
4739 /* Change the "from" to the "to" string. */
4740 ++depth;
4741 fl = STRLEN(ftp->ft_from);
4742 tl = STRLEN(ftp->ft_to);
4743 if (fl != tl)
4744 mch_memmove(p + tl, p + fl, STRLEN(p + fl) + 1);
4745 mch_memmove(p, ftp->ft_to, tl);
4746 stack[depth].ts_fidxtry = sp->ts_fidx + tl;
4747 break;
4748 }
4749 }
4750
4751 if (sp->ts_curi >= gap->ga_len)
4752 /* No (more) matches. */
4753 sp->ts_state = STATE_FINAL;
4754
4755 break;
4756
4757 case STATE_REP_UNDO:
4758 /* Undo a REP replacement and continue with the next one. */
4759 ftp = (fromto_T *)lp->lp_slang->sl_rep.ga_data
4760 + sp->ts_curi - 1;
4761 fl = STRLEN(ftp->ft_from);
4762 tl = STRLEN(ftp->ft_to);
4763 p = fword + sp->ts_fidx;
4764 if (fl != tl)
4765 mch_memmove(p + fl, p + tl, STRLEN(p + tl) + 1);
4766 mch_memmove(p, ftp->ft_from, fl);
4767 sp->ts_state = STATE_REP;
4768 break;
4769
4770 default:
4771 /* Did all possible states at this level, go up one level. */
4772 --depth;
4773 }
4774
4775 line_breakcheck();
4776 }
4777 }
4778}
4779
4780/*
4781 * Try going one level deeper in the tree.
4782 */
4783 static int
4784try_deeper(su, stack, depth, score_add)
4785 suginfo_T *su;
4786 trystate_T *stack;
4787 int depth;
4788 int score_add;
4789{
4790 int newscore;
4791
4792 /* Refuse to go deeper if the scrore is getting too big. */
4793 newscore = stack[depth].ts_score + score_add;
4794 if (newscore >= su->su_maxscore)
4795 return FALSE;
4796
4797 stack[depth + 1].ts_state = STATE_START;
4798 stack[depth + 1].ts_score = newscore;
4799 stack[depth + 1].ts_curi = 1; /* start just after length byte */
4800 stack[depth + 1].ts_fidx = stack[depth].ts_fidx;
4801 stack[depth + 1].ts_fidxtry = stack[depth].ts_fidxtry;
4802 stack[depth + 1].ts_twordlen = stack[depth].ts_twordlen;
4803 stack[depth + 1].ts_arridx = stack[depth].ts_arridx;
4804 return TRUE;
4805}
4806
4807/*
4808 * "fword" is a good word with case folded. Find the matching keep-case
4809 * words and put it in "kword".
4810 * Theoretically there could be several keep-case words that result in the
4811 * same case-folded word, but we only find one...
4812 */
4813 static void
4814find_keepcap_word(slang, fword, kword)
4815 slang_T *slang;
4816 char_u *fword;
4817 char_u *kword;
4818{
4819 char_u uword[MAXWLEN]; /* "fword" in upper-case */
4820 int depth;
4821 int tryidx;
4822
4823 /* The following arrays are used at each depth in the tree. */
4824 int arridx[MAXWLEN];
4825 int round[MAXWLEN];
4826 int fwordidx[MAXWLEN];
4827 int uwordidx[MAXWLEN];
4828 int kwordlen[MAXWLEN];
4829
4830 int flen, ulen;
4831 int l;
4832 int len;
4833 int c;
4834 unsigned lo, hi, m;
4835 char_u *p;
4836 char_u *byts = slang->sl_kbyts; /* array with bytes of the words */
4837 int *idxs = slang->sl_kidxs; /* array with indexes */
4838
4839 if (byts == NULL)
4840 {
4841 /* array is empty: "cannot happen" */
4842 *kword = NUL;
4843 return;
4844 }
4845
4846 /* Make an all-cap version of "fword". */
4847 allcap_copy(fword, uword);
4848
4849 /*
4850 * Each character needs to be tried both case-folded and upper-case.
4851 * All this gets very complicated if we keep in mind that changing case
4852 * may change the byte length of a multi-byte character...
4853 */
4854 depth = 0;
4855 arridx[0] = 0;
4856 round[0] = 0;
4857 fwordidx[0] = 0;
4858 uwordidx[0] = 0;
4859 kwordlen[0] = 0;
4860 while (depth >= 0)
4861 {
4862 if (fword[fwordidx[depth]] == NUL)
4863 {
4864 /* We are at the end of "fword". If the tree allows a word to end
4865 * here we have found a match. */
4866 if (byts[arridx[depth] + 1] == 0)
4867 {
4868 kword[kwordlen[depth]] = NUL;
4869 return;
4870 }
4871
4872 /* kword is getting too long, continue one level up */
4873 --depth;
4874 }
4875 else if (++round[depth] > 2)
4876 {
4877 /* tried both fold-case and upper-case character, continue one
4878 * level up */
4879 --depth;
4880 }
4881 else
4882 {
4883 /*
4884 * round[depth] == 1: Try using the folded-case character.
4885 * round[depth] == 2: Try using the upper-case character.
4886 */
4887#ifdef FEAT_MBYTE
4888 if (has_mbyte)
4889 {
4890 flen = mb_ptr2len_check(fword + fwordidx[depth]);
4891 ulen = mb_ptr2len_check(uword + uwordidx[depth]);
4892 }
4893 else
4894#endif
4895 ulen = flen = 1;
4896 if (round[depth] == 1)
4897 {
4898 p = fword + fwordidx[depth];
4899 l = flen;
4900 }
4901 else
4902 {
4903 p = uword + uwordidx[depth];
4904 l = ulen;
4905 }
4906
4907 for (tryidx = arridx[depth]; l > 0; --l)
4908 {
4909 /* Perform a binary search in the list of accepted bytes. */
4910 len = byts[tryidx++];
4911 c = *p++;
4912 lo = tryidx;
4913 hi = tryidx + len - 1;
4914 while (lo < hi)
4915 {
4916 m = (lo + hi) / 2;
4917 if (byts[m] > c)
4918 hi = m - 1;
4919 else if (byts[m] < c)
4920 lo = m + 1;
4921 else
4922 {
4923 lo = hi = m;
4924 break;
4925 }
4926 }
4927
4928 /* Stop if there is no matching byte. */
4929 if (hi < lo || byts[lo] != c)
4930 break;
4931
4932 /* Continue at the child (if there is one). */
4933 tryidx = idxs[lo];
4934 }
4935
4936 if (l == 0)
4937 {
4938 /*
4939 * Found the matching char. Copy it to "kword" and go a
4940 * level deeper.
4941 */
4942 if (round[depth] == 1)
4943 {
4944 STRNCPY(kword + kwordlen[depth], fword + fwordidx[depth],
4945 flen);
4946 kwordlen[depth + 1] = kwordlen[depth] + flen;
4947 }
4948 else
4949 {
4950 STRNCPY(kword + kwordlen[depth], uword + uwordidx[depth],
4951 ulen);
4952 kwordlen[depth + 1] = kwordlen[depth] + ulen;
4953 }
4954 fwordidx[depth + 1] = fwordidx[depth] + flen;
4955 uwordidx[depth + 1] = uwordidx[depth] + ulen;
4956
4957 ++depth;
4958 arridx[depth] = tryidx;
4959 round[depth] = 0;
4960 }
4961 }
4962 }
4963
4964 /* Didn't find it: "cannot happen". */
4965 *kword = NUL;
4966}
4967
4968/*
4969 * Find suggestions by comparing the word in a sound-a-like form.
4970 */
4971 static void
4972spell_try_soundalike(su)
4973 suginfo_T *su;
4974{
4975 char_u salword[MAXWLEN];
4976 char_u tword[MAXWLEN];
4977 char_u tfword[MAXWLEN];
4978 char_u tsalword[MAXWLEN];
4979 int arridx[MAXWLEN];
4980 int curi[MAXWLEN];
4981 langp_T *lp;
4982 char_u *byts;
4983 int *idxs;
4984 int depth;
4985 int c;
4986 int n;
4987 int round;
4988 int flags;
4989
4990 for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0);
4991 lp->lp_slang != NULL; ++lp)
4992 {
4993 if (lp->lp_slang->sl_sal.ga_len > 0)
4994 {
4995 /* soundfold the bad word */
4996 spell_soundfold(lp->lp_slang, su->su_fbadword, salword);
4997
4998 /*
4999 * Go through the whole tree, soundfold each word and compare.
5000 * round 1: use the case-folded tree.
5001 * round 2: use the keep-case tree.
5002 */
5003 for (round = 1; round <= 2; ++round)
5004 {
5005 if (round == 1)
5006 {
5007 byts = lp->lp_slang->sl_fbyts;
5008 idxs = lp->lp_slang->sl_fidxs;
5009 }
5010 else
5011 {
5012 byts = lp->lp_slang->sl_kbyts;
5013 idxs = lp->lp_slang->sl_kidxs;
5014 }
5015
5016 depth = 0;
5017 arridx[0] = 0;
5018 curi[0] = 1;
5019 while (depth >= 0 && !got_int)
5020 {
5021 if (curi[depth] > byts[arridx[depth]])
5022 /* Done all bytes at this node, go up one level. */
5023 --depth;
5024 else
5025 {
5026 /* Do one more byte at this node. */
5027 n = arridx[depth] + curi[depth];
5028 ++curi[depth];
5029 c = byts[n];
5030 if (c == 0)
5031 {
5032 /* End of word, deal with the word. */
5033 flags = idxs[n];
5034 if (round == 2 || (flags & WF_KEEPCAP) == 0)
5035 {
5036 tword[depth] = NUL;
5037 if (round == 1)
5038 spell_soundfold(lp->lp_slang,
5039 tword, tsalword);
5040 else
5041 {
5042 /* In keep-case tree need to case-fold the
5043 * word. */
5044 (void)spell_casefold(tword, depth,
5045 tfword, MAXWLEN);
5046 spell_soundfold(lp->lp_slang,
5047 tfword, tsalword);
5048 }
5049
5050 /* TODO: also compare with small changes
5051 * (insert char, swap char, etc.) */
5052 if (STRCMP(salword, tsalword) == 0)
5053 {
5054 if (round == 1 && flags != 0)
5055 {
5056 char_u cword[MAXWLEN];
5057
5058 make_case_word(tword, cword, flags);
5059 add_suggestion(su, cword, 0);
5060 }
5061 else
5062 add_suggestion(su, tword, 0);
5063 }
5064 }
5065
5066 /* Skip over other NUL bytes. */
5067 while (byts[n + 1] == 0)
5068 {
5069 ++n;
5070 ++curi[depth];
5071 }
5072 }
5073 else
5074 {
5075 /* Normal char, go one level deeper. */
5076 tword[depth++] = c;
5077 arridx[depth] = idxs[n];
5078 curi[depth] = 1;
5079 }
5080 }
5081 }
5082 line_breakcheck();
5083 }
5084 }
5085 }
5086}
5087
5088/*
5089 * Copy "fword" to "cword", fixing according to "flags".
5090 */
5091 static void
5092make_case_word(fword, cword, flags)
5093 char_u *fword;
5094 char_u *cword;
5095 int flags;
5096{
5097 if (flags & WF_ALLCAP)
5098 /* Make it all upper-case */
5099 allcap_copy(fword, cword);
5100 else if (flags & WF_ONECAP)
5101 /* Make the first letter upper-case */
5102 onecap_copy(fword, STRLEN(fword), cword, TRUE);
5103 else
5104 /* Use goodword as-is. */
5105 STRCPY(cword, fword);
5106}
5107
5108/*
5109 * Return TRUE if "c1" and "c2" are similar characters according to the MAP
5110 * lines in the .aff file.
5111 */
5112 static int
5113similar_chars(slang, c1, c2)
5114 slang_T *slang;
5115 int c1;
5116 int c2;
5117{
5118 char_u *p1;
5119 char_u *p2;
5120
5121 /* The similar characters are stored separated with slashes:
5122 * "aaa/bbb/ccc/". Search for each character and if the next slash is the
5123 * same one they are in the same MAP entry. */
5124 p1 = vim_strchr(slang->sl_map, c1);
5125 if (p1 == NULL)
5126 return FALSE;
5127 p2 = vim_strchr(slang->sl_map, c2);
5128 if (p2 == NULL)
5129 return FALSE;
5130 return vim_strchr(p1, '/') == vim_strchr(p2, '/');
5131}
5132
5133/*
5134 * Add a suggestion to the list of suggestions.
5135 * Do not add a duplicate suggestion or suggestions with a bad score.
5136 * When "use_score" is not zero it's used, otherwise the score is computed
5137 * with spell_edit_score().
5138 */
5139 static void
5140add_suggestion(su, goodword, use_score)
5141 suginfo_T *su;
5142 char_u *goodword;
5143 int use_score;
5144{
5145 suggest_T *stp;
5146 int score;
5147 int i;
5148#ifdef SOUNDFOLD_SCORE
5149 char_u fword[MAXWLEN];
5150 char_u salword[MAXWLEN];
5151#endif
5152
5153 /* Check that the word wasn't banned. */
5154 if (was_banned(su, goodword))
5155 return;
5156
5157 /* Compute the score and add the suggestion if it's good enough. */
5158 if (use_score != 0)
5159 score = use_score;
5160 else
5161 score = spell_edit_score(su->su_badword, goodword);
5162
5163 if (score <= su->su_maxscore)
5164 {
5165#ifdef SOUNDFOLD_SCORE
5166 /* Add to the score when the word sounds differently.
5167 * This is slow... */
5168 if (su->su_slang->sl_sal.ga_len > 0)
5169 {
5170 (void)spell_casefold(goodword, STRLEN(goodword), fword, MAXWLEN);
5171 spell_soundfold(su->su_slang, fword, salword);
5172 score += spell_edit_score(su->su_salword, salword);
5173 }
5174#endif
5175
5176 /* Check if the word is already there. */
5177 stp = &SUG(su, 0);
5178 for (i = su->su_ga.ga_len - 1; i >= 0; --i)
5179 if (STRCMP(stp[i].st_word, goodword) == 0)
5180 {
5181 /* Found it. Remember the lowest score. */
5182 if (stp[i].st_score > score)
5183 stp[i].st_score = score;
5184 break;
5185 }
5186
5187 if (i < 0 && ga_grow(&su->su_ga, 1) == OK)
5188 {
5189 /* Add a suggestion. */
5190 stp = &SUG(su, su->su_ga.ga_len);
5191 stp->st_word = vim_strsave(goodword);
5192 if (stp->st_word != NULL)
5193 {
5194 stp->st_score = score;
5195 stp->st_orglen = su->su_badlen;
5196 ++su->su_ga.ga_len;
5197
5198 /* If we have too many suggestions now, sort the list and keep
5199 * the best suggestions. */
5200 if (su->su_ga.ga_len > SUG_CLEANUP_COUNT)
5201 cleanup_suggestions(su);
5202 }
5203 }
5204 }
5205}
5206
5207/*
5208 * Add a word to be banned.
5209 */
5210 static void
5211add_banned(su, word)
5212 suginfo_T *su;
5213 char_u *word;
5214{
5215 char_u *s = vim_strsave(word);
5216 hash_T hash;
5217 hashitem_T *hi;
5218
5219 if (s != NULL)
5220 {
5221 hash = hash_hash(s);
5222 hi = hash_lookup(&su->su_banned, s, hash);
5223 if (HASHITEM_EMPTY(hi))
5224 hash_add_item(&su->su_banned, hi, s, hash);
5225 }
5226}
5227
5228/*
5229 * Return TRUE if a word appears in the list of banned words.
5230 */
5231 static int
5232was_banned(su, word)
5233 suginfo_T *su;
5234 char_u *word;
5235{
5236 return !HASHITEM_EMPTY(hash_find(&su->su_banned, word));
5237}
5238
5239/*
5240 * Free the banned words in "su".
5241 */
5242 static void
5243free_banned(su)
5244 suginfo_T *su;
5245{
5246 int todo;
5247 hashitem_T *hi;
5248
5249 todo = su->su_banned.ht_used;
5250 for (hi = su->su_banned.ht_array; todo > 0; ++hi)
5251 {
5252 if (!HASHITEM_EMPTY(hi))
5253 {
5254 vim_free(hi->hi_key);
5255 --todo;
5256 }
5257 }
5258 hash_clear(&su->su_banned);
5259}
5260
5261static int
5262#ifdef __BORLANDC__
5263_RTLENTRYF
5264#endif
5265sug_compare __ARGS((const void *s1, const void *s2));
5266
5267/*
5268 * Function given to qsort() to sort the suggestions on st_score.
5269 */
5270 static int
5271#ifdef __BORLANDC__
5272_RTLENTRYF
5273#endif
5274sug_compare(s1, s2)
5275 const void *s1;
5276 const void *s2;
5277{
5278 suggest_T *p1 = (suggest_T *)s1;
5279 suggest_T *p2 = (suggest_T *)s2;
5280
5281 return p1->st_score - p2->st_score;
5282}
5283
5284/*
5285 * Cleanup the suggestions:
5286 * - Sort on score.
5287 * - Remove words that won't be displayed.
5288 */
5289 static void
5290cleanup_suggestions(su)
5291 suginfo_T *su;
5292{
5293 suggest_T *stp = &SUG(su, 0);
5294 int i;
5295
5296 /* Sort the list. */
5297 qsort(su->su_ga.ga_data, (size_t)su->su_ga.ga_len,
5298 sizeof(suggest_T), sug_compare);
5299
5300 /* Truncate the list to the number of suggestions that will be displayed. */
5301 if (su->su_ga.ga_len > SUG_PROMPT_COUNT)
5302 {
5303 for (i = SUG_PROMPT_COUNT; i < su->su_ga.ga_len; ++i)
5304 vim_free(stp[i].st_word);
5305 su->su_ga.ga_len = SUG_PROMPT_COUNT;
5306 su->su_maxscore = stp[SUG_PROMPT_COUNT - 1].st_score;
5307 }
5308}
5309
5310/*
5311 * Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]".
5312 */
5313 static void
5314spell_soundfold(slang, inword, res)
5315 slang_T *slang;
5316 char_u *inword;
5317 char_u *res;
5318{
5319 fromto_T *ftp;
5320 char_u word[MAXWLEN];
5321#ifdef FEAT_MBYTE
5322 int l;
5323#endif
5324 char_u *s;
5325 char_u *t;
5326 int i, j, z;
5327 int n, k = 0;
5328 int z0;
5329 int k0;
5330 int n0;
5331 int c;
5332 int pri;
5333 int p0 = -333;
5334 int c0;
5335
5336 /* Remove accents, if wanted.
5337 * We actually remove all non-word characters. */
5338 if (slang->sl_rem_accents)
5339 {
5340 t = word;
5341 for (s = inword; *s != NUL; )
5342 {
5343#ifdef FEAT_MBYTE
5344 if (has_mbyte)
5345 {
5346 l = mb_ptr2len_check(s);
5347 if (SPELL_ISWORDP(s))
5348 {
5349 mch_memmove(t, s, l);
5350 t += l;
5351 }
5352 s += l;
5353 }
5354 else
5355#endif
5356 {
5357 if (SPELL_ISWORDP(s))
5358 *t++ = *s;
5359 ++s;
5360 }
5361 }
5362 *t = NUL;
5363 }
5364 else
5365 STRCPY(word, inword);
5366
5367 ftp = (fromto_T *)slang->sl_sal.ga_data;
5368
5369 /*
5370 * This comes from Aspell phonet.cpp. Converted from C++ to C.
5371 * TODO: support for multi-byte chars.
5372 */
5373 i = j = z = 0;
5374 while ((c = word[i]) != NUL)
5375 {
5376 n = slang->sl_sal_first[c];
5377 z0 = 0;
5378
5379 if (n >= 0)
5380 {
5381 /* check all rules for the same letter */
5382 while (ftp[n].ft_from[0] == c)
5383 {
5384 /* check whole string */
5385 k = 1; /* number of found letters */
5386 pri = 5; /* default priority */
5387 s = ftp[n].ft_from;
5388 s++; /* important for (see below) "*(s-1)" */
5389
5390 /* Skip over normal letters that match with the word. */
5391 while (*s != NUL && word[i + k] == *s
5392 && !vim_isdigit(*s) && strchr("(-<^$", *s) == NULL)
5393 {
5394 k++;
5395 s++;
5396 }
5397
5398 if (*s == '(')
5399 {
5400 /* check alternate letters in "(..)" */
5401 for (t = s + 1; *t != ')' && *t != NUL; ++t)
5402 if (*t == word[i + k])
5403 {
5404 /* match */
5405 ++k;
5406 for (s = t + 1; *s != NUL; ++s)
5407 if (*s == ')')
5408 {
5409 ++s;
5410 break;
5411 }
5412 break;
5413 }
5414 }
5415
5416 p0 = *s;
5417 k0 = k;
5418 while (*s == '-' && k > 1)
5419 {
5420 k--;
5421 s++;
5422 }
5423 if (*s == '<')
5424 s++;
5425 if (vim_isdigit(*s))
5426 {
5427 /* determine priority */
5428 pri = *s - '0';
5429 s++;
5430 }
5431 if (*s == '^' && *(s + 1) == '^')
5432 s++;
5433
5434 if (*s == NUL
5435 || (*s == '^'
5436 && (i == 0 || !SPELL_ISWORDP(word + i - 1))
5437 && (*(s + 1) != '$'
5438 || (!SPELL_ISWORDP(word + i + k0))))
5439 || (*s == '$' && i > 0
5440 && SPELL_ISWORDP(word + i - 1)
5441 && (!SPELL_ISWORDP(word + i + k0))))
5442 {
5443 /* search for followup rules, if: */
5444 /* followup and k > 1 and NO '-' in searchstring */
5445 c0 = word[i + k - 1];
5446 n0 = slang->sl_sal_first[c0];
5447
5448 if (slang->sl_followup && k > 1 && n0 >= 0
5449 && p0 != '-' && word[i + k] != NUL)
5450 {
5451 /* test follow-up rule for "word[i + k]" */
5452 while (ftp[n0].ft_from[0] == c0)
5453 {
5454
5455 /* check whole string */
5456 k0 = k;
5457 p0 = 5;
5458 s = ftp[n0].ft_from;
5459 s++;
5460 while (*s != NUL && word[i+k0] == *s
5461 && !vim_isdigit(*s)
5462 && strchr("(-<^$",*s) == NULL)
5463 {
5464 k0++;
5465 s++;
5466 }
5467 if (*s == '(')
5468 {
5469 /* check alternate letters in "(..)" */
5470 for (t = s + 1; *t != ')' && *t != NUL; ++t)
5471 if (*t == word[i + k0])
5472 {
5473 /* match */
5474 ++k0;
5475 for (s = t + 1; *s != NUL; ++s)
5476 if (*s == ')')
5477 {
5478 ++s;
5479 break;
5480 }
5481 break;
5482 }
5483 }
5484 while (*s == '-')
5485 {
5486 /* "k0" gets NOT reduced */
5487 /* because "if (k0 == k)" */
5488 s++;
5489 }
5490 if (*s == '<')
5491 s++;
5492 if (vim_isdigit(*s))
5493 {
5494 p0 = *s - '0';
5495 s++;
5496 }
5497
5498 if (*s == NUL
5499 /* *s == '^' cuts */
5500 || (*s == '$'
5501 && !SPELL_ISWORDP(word + i + k0)))
5502 {
5503 if (k0 == k)
5504 {
5505 /* this is just a piece of the string */
5506 ++n0;
5507 continue;
5508 }
5509
5510 if (p0 < pri)
5511 {
5512 /* priority too low */
5513 ++n0;
5514 continue;
5515 }
5516 /* rule fits; stop search */
5517 break;
5518 }
5519 ++n0;
5520 }
5521
5522 if (p0 >= pri && ftp[n0].ft_from[0] == c0)
5523 {
5524 ++n;
5525 continue;
5526 }
5527 }
5528
5529 /* replace string */
5530 s = ftp[n].ft_to;
5531 p0 = (ftp[n].ft_from[0] != NUL
5532 && vim_strchr(ftp[n].ft_from + 1,
5533 '<') != NULL) ? 1 : 0;
5534 if (p0 == 1 && z == 0)
5535 {
5536 /* rule with '<' is used */
5537 if (j > 0 && *s != NUL
5538 && (res[j - 1] == c || res[j - 1] == *s))
5539 j--;
5540 z0 = 1;
5541 z = 1;
5542 k0 = 0;
5543 while (*s != NUL && word[i+k0] != NUL)
5544 {
5545 word[i + k0] = *s;
5546 k0++;
5547 s++;
5548 }
5549 if (k > k0)
5550 mch_memmove(word + i + k0, word + i + k,
5551 STRLEN(word + i + k) + 1);
5552
5553 /* new "actual letter" */
5554 c = word[i];
5555 }
5556 else
5557 {
5558 /* no '<' rule used */
5559 i += k - 1;
5560 z = 0;
5561 while (*s != NUL && s[1] != NUL && j < MAXWLEN)
5562 {
5563 if (j == 0 || res[j - 1] != *s)
5564 {
5565 res[j] = *s;
5566 j++;
5567 }
5568 s++;
5569 }
5570 /* new "actual letter" */
5571 c = *s;
5572 if (ftp[n].ft_from[0] != NUL
5573 && strstr((char *)ftp[n].ft_from + 1,
5574 "^^") != NULL)
5575 {
5576 if (c != NUL)
5577 {
5578 res[j] = c;
5579 j++;
5580 }
5581 mch_memmove(word, word + i + 1,
5582 STRLEN(word + i + 1) + 1);
5583 i = 0;
5584 z0 = 1;
5585 }
5586 }
5587 break;
5588 }
5589 ++n;
5590 }
5591 }
5592
5593 if (z0 == 0)
5594 {
5595 if (k && !p0 && j < MAXWLEN && c != NUL
5596 && (!slang->sl_collapse || j == 0 || res[j - 1] != c))
5597 {
5598 /* condense only double letters */
5599 res[j] = c;
5600 j++;
5601 }
5602
5603 i++;
5604 z = 0;
5605 k = 0;
5606 }
5607 }
5608
5609 res[j] = NUL;
5610}
5611
5612/*
5613 * Compute the "edit distance" to turn "badword" into "goodword". The less
5614 * deletes/inserts/swaps are required the lower the score.
5615 * The algorithm comes from Aspell editdist.cpp, edit_distance().
5616 * TODO: make this work with multi-byte chars.
5617 */
5618 static int
5619spell_edit_score(badword, goodword)
5620 char_u *badword;
5621 char_u *goodword;
5622{
5623 int *cnt;
5624 int badlen, goodlen;
5625 int j, i;
5626 int t;
5627 int bc, gc;
5628
5629 /* We use "cnt" as an array: CNT(badword_idx, goodword_idx). */
5630#define CNT(a, b) cnt[(a) + (b) * (badlen + 1)]
5631 badlen = STRLEN(badword) + 1;
5632 goodlen = STRLEN(goodword) + 1;
5633 cnt = (int *)lalloc((long_u)(sizeof(int) * (badlen + 1) * (goodlen + 1)),
5634 TRUE);
5635 if (cnt == 0)
5636 return 0;
5637
5638 CNT(0, 0) = 0;
5639 for (j = 1; j <= goodlen; ++j)
5640 CNT(0, j) = CNT(0, j - 1) + SCORE_DEL;
5641
5642 for (i = 1; i <= badlen; ++i)
5643 {
5644 CNT(i, 0) = CNT(i - 1, 0) + SCORE_INS;
5645 for (j = 1; j <= goodlen; ++j)
5646 {
5647 bc = badword[i - 1];
5648 gc = goodword[j - 1];
5649 if (bc == gc)
5650 CNT(i, j) = CNT(i - 1, j - 1);
5651 else
5652 {
5653 /* Use a better score when there is only a case difference. */
5654 if (spelltab.st_fold[bc] == spelltab.st_fold[gc])
5655 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1);
5656 else
5657 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1);
5658
5659 if (i > 1 && j > 1 && bc == goodword[j - 2]
5660 && badword[i - 2] == gc)
5661 {
5662 t = SCORE_SWAP + CNT(i - 2, j - 2);
5663 if (t < CNT(i, j))
5664 CNT(i, j) = t;
5665 }
5666 t = SCORE_DEL + CNT(i - 1, j);
5667 if (t < CNT(i, j))
5668 CNT(i, j) = t;
5669 t = SCORE_INS + CNT(i, j - 1);
5670 if (t < CNT(i, j))
5671 CNT(i, j) = t;
5672 }
5673 }
5674 }
5675 return CNT(badlen - 1, goodlen - 1);
5676}
Bram Moolenaarcfc6c432005-06-06 21:50:35 +00005677
Bram Moolenaar402d2fe2005-04-15 21:00:38 +00005678#endif /* FEAT_SYN_HL */