| /* vi:set ts=8 sts=4 sw=4: |
| * |
| * VIM - Vi IMproved by Bram Moolenaar |
| * |
| * Do ":help uganda" in Vim to read copying and usage conditions. |
| * Do ":help credits" in Vim to see a list of people who contributed. |
| * See README.txt for an overview of the Vim source code. |
| */ |
| |
| /* |
| * spell.c: code for spell checking |
| */ |
| |
| #if defined(MSDOS) || defined(WIN16) || defined(WIN32) || defined(_WIN64) |
| # include <io.h> /* for lseek(), must be before vim.h */ |
| #endif |
| |
| #include "vim.h" |
| |
| #if defined(FEAT_SYN_HL) || defined(PROTO) |
| |
| #ifdef HAVE_FCNTL_H |
| # include <fcntl.h> |
| #endif |
| |
| /* |
| * Structure that is used to store the text from the language file. This |
| * avoids the need to allocate each individual word and copying it. It's |
| * allocated in big chunks for speed. |
| */ |
| #define SBLOCKSIZE 4096 /* default size of sb_data */ |
| typedef struct sblock_S sblock_T; |
| struct sblock_S |
| { |
| sblock_T *sb_next; /* next block in list */ |
| char_u sb_data[1]; /* data, actually longer */ |
| }; |
| |
| /* |
| * Structure used to store words and other info for one language. |
| */ |
| typedef struct slang_S slang_T; |
| |
| struct slang_S |
| { |
| slang_T *sl_next; /* next language */ |
| char_u sl_name[2]; /* language name "en", "nl", etc. */ |
| hashtab_T sl_ht; /* hashtable with all words */ |
| garray_T sl_match; /* table with pointers to matches */ |
| garray_T sl_add; /* table with pointers to additions */ |
| char_u sl_regions[13]; /* table with up to 6 region names */ |
| sblock_T *sl_block; /* list with allocated memory blocks */ |
| }; |
| |
| static slang_T *first_lang = NULL; |
| |
| /* |
| * Structure used in "b_langp", filled from 'spelllang'. |
| */ |
| typedef struct langp_S |
| { |
| slang_T *lp_slang; /* info for this language (NULL for last one) */ |
| int lp_region; /* bitmask for region or REGION_ALL */ |
| } langp_T; |
| |
/*
 * LANGP_ENTRY(ga, i): pointer to the langp_T at index "i" in growarray "ga"
 *                     (the growarray itself, not a pointer to it).
 * MATCH_ENTRY(gap, i): the "char_u *" stored at index "i" in the growarray
 *                     that "gap" points to.  Usable as an lvalue.
 * Both the index argument and the complete expansion are parenthesized, so
 * that the macros can be used with any expression and in any context.
 */
#define LANGP_ENTRY(ga, i)      (((langp_T *)(ga).ga_data) + (i))
#define MATCH_ENTRY(gap, i)     (*(((char_u **)(gap)->ga_data) + (i)))
| |
/*
 * Flags byte that is stored just before a word in the hashtable and just
 * before a match:
 *   - the two highest bits mark rare and case-sensitive words,
 *   - the remaining bits tell in which regions the word is valid; a word
 *     that is valid in all regions has REGION_ALL.
 */
#define REGION_MASK     0x3f
#define REGION_ALL      0x3f
#define CASE_MASK       0x40
#define RARE_MASK       0x80

/* Outcome of looking up a word; see spell_check(). */
#define SP_OK           0
#define SP_BAD          1
#define SP_RARE         2
#define SP_LOCAL        3
| |
| static slang_T *spell_load_lang __ARGS((char_u *lang)); |
| static void spell_load_file __ARGS((char_u *fname)); |
| static int find_region __ARGS((char_u *rp, char_u *region)); |
| |
| /* |
| * Main spell-checking function. |
| * "ptr" points to the start of a word. |
| * "*attrp" is set to the attributes for a badly spelled word. For a non-word |
| * or when it's OK it remains unchanged. |
| * This must only be called when 'spelllang' is not empty. |
| * Returns the length of the word in bytes, also when it's OK, so that the |
| * caller can skip over the word. |
| */ |
| int |
| spell_check(wp, ptr, attrp) |
| win_T *wp; /* current window */ |
| char_u *ptr; |
| int *attrp; |
| { |
| char_u *e; |
| langp_T *lp; |
| int result; |
| int len = 0; |
| hash_T hash; |
| hashitem_T *hi; |
| int c; |
| #define MAXWLEN 80 /* assume max. word len is 80 */ |
| char_u word[MAXWLEN + 1]; |
| garray_T *gap; |
| int l, h, t; |
| char_u *p; |
| int n; |
| |
| /* Find the end of the word. We already know that *ptr is a word char. */ |
| e = ptr; |
| do |
| { |
| mb_ptr_adv(e); |
| ++len; |
| } while (*e != NUL && vim_iswordc_buf(e, wp->w_buffer)); |
| |
| /* The word is bad unless we find it in the dictionary. */ |
| result = SP_BAD; |
| |
| /* Words are always stored with folded case. */ |
| (void)str_foldcase(ptr, e - ptr, word, MAXWLEN + 1); |
| hash = hash_hash(word); |
| |
| /* |
| * Loop over the languages specified in 'spelllang'. |
| * We check them all, because a match may find a longer word. |
| */ |
| for (lp = LANGP_ENTRY(wp->w_buffer->b_langp, 0); lp->lp_slang != NULL; |
| ++lp) |
| { |
| /* Check words when it wasn't recognized as a good word yet. */ |
| if (result != SP_OK) |
| { |
| /* Word lookup. Using a hash table is fast. */ |
| hi = hash_lookup(&lp->lp_slang->sl_ht, word, hash); |
| if (!HASHITEM_EMPTY(hi)) |
| { |
| /* The character before the key indicates the type of word. */ |
| c = hi->hi_key[-1]; |
| if ((c & CASE_MASK) != 0) |
| { |
| /* Need to check first letter is uppercase. If it is, |
| * check region. If it isn't it may be a rare word. */ |
| if ( |
| #ifdef FEAT_MBYTE |
| MB_ISUPPER(mb_ptr2char(ptr)) |
| #else |
| MB_ISUPPER(*ptr) |
| #endif |
| ) |
| { |
| if ((c & lp->lp_region) == 0) |
| result = SP_LOCAL; |
| else |
| result = SP_OK; |
| } |
| else if (c & RARE_MASK) |
| result = SP_RARE; |
| } |
| else |
| { |
| if ((c & lp->lp_region) == 0) |
| result = SP_LOCAL; |
| else if (c & RARE_MASK) |
| result = SP_RARE; |
| else |
| result = SP_OK; |
| } |
| } |
| } |
| |
| /* Match lookup. Uses a binary search. If there is a match adjust |
| * "e" to the end. This is also done when a word matched, because |
| * "you've" is longer than "you". */ |
| gap = &lp->lp_slang->sl_match; |
| l = 0; /* low index */ |
| h = gap->ga_len - 1; /* high index */ |
| /* keep searching, the match must be between "l" and "h" (inclusive) */ |
| while (h >= l) |
| { |
| t = (h + l) / 2; |
| p = MATCH_ENTRY(gap, t) + 1; |
| for (n = 0; p[n] != 0 && p[n] == ptr[n]; ++n) |
| ; |
| if (p[n] == 0) |
| { |
| if ((ptr[n] == 0 || !vim_iswordc_buf(ptr + n, wp->w_buffer))) |
| { |
| /* match! */ |
| e = ptr + n; |
| if (result != SP_OK) |
| { |
| if ((lp->lp_region & p[-1]) == 0) |
| result = SP_LOCAL; |
| else |
| result = SP_OK; |
| } |
| break; |
| } |
| /* match is too short, next item is new low index */ |
| l = t + 1; |
| } |
| else if (p[n] < ptr[n]) |
| /* match is before word, next item is new low index */ |
| l = t + 1; |
| else |
| /* match is after word, previous item is new high index */ |
| h = t - 1; |
| } |
| |
| /* Addition lookup. Uses a linear search, there should be very few. |
| * If there is a match adjust "e" to the end. This doesn't change |
| * whether a word was good or bad, only the length. */ |
| gap = &lp->lp_slang->sl_add; |
| for (t = 0; t < gap->ga_len; ++t) |
| { |
| p = MATCH_ENTRY(gap, t) + 1; |
| for (n = 0; p[n] != 0 && p[n] == e[n]; ++n) |
| ; |
| if (p[n] == 0 |
| && (e[n] == 0 || !vim_iswordc_buf(e + n, wp->w_buffer))) |
| { |
| /* match */ |
| e += n; |
| break; |
| } |
| } |
| } |
| |
| if (result != SP_OK) |
| { |
| if (result == SP_BAD) |
| *attrp = highlight_attr[HLF_SPB]; |
| else if (result == SP_RARE) |
| *attrp = highlight_attr[HLF_SPR]; |
| else |
| *attrp = highlight_attr[HLF_SPL]; |
| } |
| |
| return (int)(e - ptr); |
| } |
| |
| static slang_T *load_lp; /* passed from spell_load_lang() to |
| spell_load_file() */ |
| |
| /* |
| * Load language "lang[2]". |
| */ |
| static slang_T * |
| spell_load_lang(lang) |
| char_u *lang; |
| { |
| slang_T *lp; |
| char_u fname_enc[80]; |
| char_u fname_ascii[20]; |
| char_u *p; |
| |
| lp = (slang_T *)alloc(sizeof(slang_T)); |
| if (lp != NULL) |
| { |
| lp->sl_name[0] = lang[0]; |
| lp->sl_name[1] = lang[1]; |
| hash_init(&lp->sl_ht); |
| ga_init2(&lp->sl_match, sizeof(char_u *), 20); |
| ga_init2(&lp->sl_add, sizeof(char_u *), 4); |
| lp->sl_regions[0] = NUL; |
| lp->sl_block = NULL; |
| |
| /* Find all spell files for "lang" in 'runtimepath' and load them. |
| * Use 'encoding', except that we use "latin1" for "latin9". */ |
| #ifdef FEAT_MBYTE |
| if (STRLEN(p_enc) < 60 && STRCMP(p_enc, "iso-8859-15") != 0) |
| p = p_enc; |
| else |
| #endif |
| p = (char_u *)"latin1"; |
| load_lp = lp; |
| sprintf((char *)fname_enc, "spell/%c%c.%s.spl", lang[0], lang[1], p); |
| if (do_in_runtimepath(fname_enc, TRUE, spell_load_file) == FAIL) |
| { |
| /* Try again to find an ASCII spell file. */ |
| sprintf((char *)fname_ascii, "spell/%c%c.spl", lang[0], lang[1]); |
| if (do_in_runtimepath(fname_ascii, TRUE, spell_load_file) == FAIL) |
| { |
| vim_free(lp); |
| lp = NULL; |
| smsg((char_u *)_("Warning: Cannot find dictionary \"%s\""), |
| fname_enc + 6); |
| } |
| } |
| else |
| { |
| lp->sl_next = first_lang; |
| first_lang = lp; |
| } |
| } |
| |
| return lp; |
| } |
| |
| /* |
| * Load one spell file into "load_lp". |
| * Invoked through do_in_runtimepath(). |
| */ |
| static void |
| spell_load_file(fname) |
| char_u *fname; |
| { |
| int fd; |
| size_t len; |
| size_t l; |
| size_t rest = 0; |
| char_u *p = NULL, *np; |
| sblock_T *bl; |
| hash_T hash; |
| hashitem_T *hi; |
| int c; |
| int region = REGION_ALL; |
| char_u word[MAXWLEN + 1]; |
| int n; |
| |
| fd = mch_open((char *)fname, O_RDONLY | O_EXTRA, 0); |
| if (fd < 0) |
| { |
| EMSG2(_(e_notopen), fname); |
| return; |
| } |
| |
| /* Get the length of the whole file. */ |
| len = lseek(fd, (off_t)0, SEEK_END); |
| lseek(fd, (off_t)0, SEEK_SET); |
| |
| /* Loop, reading the file one block at a time. |
| * "rest" is the length of an incomplete line at the previous block. |
| * "p" points to the remainder. */ |
| while (len > 0) |
| { |
| /* Allocate a block of memory to store the info in. This is not freed |
| * until spell_reload() is called. */ |
| if (len > SBLOCKSIZE) |
| l = SBLOCKSIZE; |
| else |
| l = len; |
| len -= l; |
| bl = (sblock_T *)alloc((unsigned)(sizeof(sblock_T) - 1 + l + rest)); |
| if (bl == NULL) |
| break; |
| bl->sb_next = load_lp->sl_block; |
| load_lp->sl_block = bl; |
| |
| /* Read a block from the file. Prepend the remainder of the previous |
| * block. */ |
| if (rest > 0) |
| mch_memmove(bl->sb_data, p, rest); |
| if (read(fd, bl->sb_data + rest, l) != l) |
| { |
| EMSG2(_(e_notread), fname); |
| break; |
| } |
| l += rest; |
| rest = 0; |
| |
| /* Deal with each line that was read until we finish the block. */ |
| for (p = bl->sb_data; l > 0; p = np) |
| { |
| /* "np" points to the char after the line (CR or NL). */ |
| for (np = p; l > 0 && *np >= ' '; ++np) |
| --l; |
| if (l == 0) |
| { |
| /* Incomplete line (or end of file). */ |
| rest = np - p; |
| if (len == 0) |
| EMSG2(_("E751: Truncated spell file: %s"), fname); |
| break; |
| } |
| *np = NUL; /* terminate the line with a NUL */ |
| |
| /* Skip comment and empty lines. */ |
| c = *p; |
| if (c != '#' && np > p) |
| { |
| if (c == '=' || c == '+') |
| { |
| garray_T *gap; |
| |
| /* Match or Add item. */ |
| if (c == '=') |
| gap = &load_lp->sl_match; |
| else |
| gap = &load_lp->sl_add; |
| |
| if (ga_grow(gap, 1) == OK) |
| { |
| for (n = 0; n < gap->ga_len; ++n) |
| if ((c = STRCMP(p + 1, |
| MATCH_ENTRY(gap, n) + 1)) < 0) |
| break; |
| if (c == 0) |
| { |
| if (p_verbose > 0) |
| smsg((char_u *)_("Warning: duplicate match \"%s\" in %s"), |
| p + 1, fname); |
| } |
| else |
| { |
| mch_memmove((char_u **)gap->ga_data + n + 1, |
| (char_u **)gap->ga_data + n, |
| (gap->ga_len - n) * sizeof(char_u *)); |
| *(((char_u **)gap->ga_data) + n) = p; |
| *p = region; |
| ++gap->ga_len; |
| } |
| } |
| } |
| else if (c == '-') |
| { |
| /* region item */ |
| ++p; |
| if (*p == '-') |
| /* end of a region */ |
| region = REGION_ALL; |
| else |
| { |
| char_u *rp = load_lp->sl_regions; |
| int r; |
| |
| /* The region may be repeated: "-ca-uk". Fill |
| * "region" with the bit mask for the ones we find. */ |
| region = 0; |
| for (;;) |
| { |
| /* start of a region */ |
| r = find_region(rp, p); |
| if (r == REGION_ALL) |
| { |
| /* new region, add it */ |
| r = STRLEN(rp); |
| if (r >= 12) |
| { |
| EMSG2(_("E752: Too many regions in %s"), |
| fname); |
| r = REGION_ALL; |
| } |
| else |
| { |
| rp[r] = p[0]; |
| rp[r + 1] = p[1]; |
| rp[r + 2] = NUL; |
| r = 1 << (r / 2); |
| } |
| } |
| else |
| r = 1 << r; |
| |
| region |= r; |
| if (p[2] != '-') |
| { |
| if (p[2] != NUL) |
| EMSG2(_("E753: Invalid character in \"%s\""), |
| p - 1); |
| break; |
| } |
| p += 3; |
| } |
| } |
| } |
| else |
| { |
| /* add the word */ |
| if (c == '>') |
| c = region | RARE_MASK; |
| else |
| { |
| if (c != ' ') |
| EMSG2(_("E753: Invalid character in \"%s\""), p); |
| c = region; |
| } |
| #ifdef FEAT_MBYTE |
| if (MB_ISUPPER(mb_ptr2char(p + 1))) |
| #else |
| if (MB_ISUPPER(p[1])) |
| #endif |
| c |= CASE_MASK; |
| *p++ = c; |
| (void)str_foldcase(p, np - p, word, MAXWLEN + 1); |
| n = STRLEN(word); |
| if (n > np - p) |
| { |
| sblock_T *s; |
| |
| /* Folding case made word longer! We need to allocate |
| * memory for it. */ |
| s = (sblock_T *)alloc((unsigned)sizeof(sblock_T) |
| + n + 1); |
| if (s != NULL) |
| { |
| s->sb_next = load_lp->sl_block; |
| load_lp->sl_block = s; |
| s->sb_data[0] = p[-1]; |
| p = s->sb_data + 1; |
| } |
| } |
| mch_memmove(p, word, n + 1); |
| |
| hash = hash_hash(p); |
| hi = hash_lookup(&load_lp->sl_ht, p, hash); |
| if (!HASHITEM_EMPTY(hi)) |
| { |
| c = hi->hi_key[-1]; |
| if ((c & (CASE_MASK | RARE_MASK)) |
| == (p[-1] & (CASE_MASK | RARE_MASK))) |
| { |
| if (p_verbose > 0) |
| smsg((char_u *)_("Warning: duplicate word \"%s\" in %s"), |
| p, fname); |
| } |
| else |
| hi->hi_key[-1] |= (p[-1] & (CASE_MASK | RARE_MASK)); |
| } |
| else |
| hash_add_item(&load_lp->sl_ht, hi, p, hash); |
| } |
| } |
| |
| while (l > 0 && *np < ' ') |
| { |
| ++np; |
| --l; |
| } |
| } |
| } |
| |
| close(fd); |
| } |
| |
| /* |
| * Parse 'spelllang' and set buf->b_langp accordingly. |
| * Returns an error message or NULL. |
| */ |
| char_u * |
| did_set_spelllang(buf) |
| buf_T *buf; |
| { |
| garray_T ga; |
| char_u *lang; |
| char_u *e; |
| char_u *region; |
| int region_mask; |
| slang_T *lp; |
| int c; |
| |
| ga_init2(&ga, sizeof(langp_T), 2); |
| |
| /* loop over comma separated languages. */ |
| for (lang = buf->b_p_spl; *lang != NUL; lang = e) |
| { |
| e = vim_strchr(lang, ','); |
| if (e == NULL) |
| e = lang + STRLEN(lang); |
| if (e > lang + 2) |
| { |
| if (lang[2] != '_' || e - lang != 5) |
| { |
| ga_clear(&ga); |
| return e_invarg; |
| } |
| region = lang + 3; |
| } |
| else |
| region = NULL; |
| |
| for (lp = first_lang; lp != NULL; lp = lp->sl_next) |
| if (STRNICMP(lp->sl_name, lang, 2) == 0) |
| break; |
| |
| if (lp == NULL) |
| /* Not found, load the language. */ |
| lp = spell_load_lang(lang); |
| |
| if (lp != NULL) |
| { |
| if (region == NULL) |
| region_mask = REGION_ALL; |
| else |
| { |
| /* find region in sl_regions */ |
| c = find_region(lp->sl_regions, region); |
| if (c == REGION_ALL) |
| { |
| c = lang[5]; |
| lang[5] = NUL; |
| smsg((char_u *)_("Warning: region %s not supported"), lang); |
| lang[5] = c; |
| region_mask = REGION_ALL; |
| } |
| else |
| region_mask = 1 << c; |
| } |
| |
| if (ga_grow(&ga, 1) == FAIL) |
| { |
| ga_clear(&ga); |
| return e_outofmem; |
| } |
| LANGP_ENTRY(ga, ga.ga_len)->lp_slang = lp; |
| LANGP_ENTRY(ga, ga.ga_len)->lp_region = region_mask; |
| ++ga.ga_len; |
| } |
| |
| if (*e == ',') |
| ++e; |
| } |
| |
| /* Add a NULL entry to mark the end of the list. */ |
| if (ga_grow(&ga, 1) == FAIL) |
| { |
| ga_clear(&ga); |
| return e_outofmem; |
| } |
| LANGP_ENTRY(ga, ga.ga_len)->lp_slang = NULL; |
| ++ga.ga_len; |
| |
| /* Everything is fine, store the new b_langp value. */ |
| ga_clear(&buf->b_langp); |
| buf->b_langp = ga; |
| |
| return NULL; |
| } |
| |
| /* |
| * Find the region "region[2]" in "rp" (points to "sl_regions"). |
| * Each region is simply stored as the two characters of it's name. |
| * Returns the index if found, REGION_ALL if not found. |
| */ |
| static int |
| find_region(rp, region) |
| char_u *rp; |
| char_u *region; |
| { |
| int i; |
| |
| for (i = 0; ; i += 2) |
| { |
| if (rp[i] == NUL) |
| return REGION_ALL; |
| if (rp[i] == region[0] && rp[i + 1] == region[1]) |
| break; |
| } |
| return i / 2; |
| } |
| |
# if defined(FEAT_MBYTE) || defined(PROTO)
/*
 * Clear all spelling tables and reload them.
 * Used after 'encoding' is set.
 *
 * Every language in the "first_lang" list is freed completely: the
 * hashtable, the match and addition tables, the blocks with the text and
 * the slang_T itself.  After that 'spelllang' is handled again for every
 * buffer where it is set, which loads the languages that are in use.
 */
    void
spell_reload()
{
    buf_T       *buf;
    slang_T     *lp;
    sblock_T    *sp;

    /* Unload all allocated memory. */
    while (first_lang != NULL)
    {
        lp = first_lang;
        first_lang = lp->sl_next;

        hash_clear(&lp->sl_ht);
        ga_clear(&lp->sl_match);
        ga_clear(&lp->sl_add);
        while (lp->sl_block != NULL)
        {
            sp = lp->sl_block;
            lp->sl_block = sp->sb_next;
            vim_free(sp);
        }

        /* Also free the language itself, it was allocated in
         * spell_load_lang(). */
        vim_free(lp);
    }

    /* Go through all buffers and handle 'spelllang'.  The old "b_langp"
     * points to the freed languages, clear it first. */
    for (buf = firstbuf; buf != NULL; buf = buf->b_next)
    {
        ga_clear(&buf->b_langp);
        if (*buf->b_p_spl != NUL)
            did_set_spelllang(buf);
    }
}
# endif
| |
| #endif /* FEAT_SYN_HL */ |