updated for version 7.0068

commit: 63d5a1e57b225dd90ac6d5f24876e737683961ee [log] [tgz]
author: Bram Moolenaar <Bram@vim.org> Tue Apr 19 21:30:25 2005 +0000
committer: Bram Moolenaar <Bram@vim.org> Tue Apr 19 21:30:25 2005 +0000
tree: c78fcd0186ca5d7ac3f69b818c1ce9ae2dd5191c
parent: 0e21a3f623bc9766953882f30326783f76df39a0 [diff] [blame]
diff --git a/src/spell.c b/src/spell.c
index 9f1f00c..618b4d3 100644
--- a/src/spell.c
+++ b/src/spell.c

@@ -10,13 +10,21 @@
 /*
  * spell.c: code for spell checking
  *
- * Terminology:
- * "dword" is a dictionary word, made out of letters and digits.
- * "nword" is a word with a character that's not a letter or digit.
- * "word"  is either a "dword" or an "nword".
- */
-
-/*
+ * The basic spell checking mechanism is:
+ * 1. Isolate a word, up to the next non-word character.
+ * 2. Find the word in the hashtable of basic words.
+ * 3. If not found, look in the hashtable with "prewords".  These are prefixes
+ *    with a non-word character following a word character, e.g., "de-".
+ * 4. If still not found, for each matching prefix try if the word matches
+ *    without the prefix (and with the "chop" string added back).
+ * 5. If still not found, for each matching suffix try if the word
+ *    matches without the suffix (and with the "chop" string added back).
+ *
+ * Matching involves checking the caps type: Onecap ALLCAP KeepCap.
+ * After finding a matching word check for a leadstring (non-word characters
+ * before the word) and addstring (more text following, starting with a
+ * non-word character).
+ *
  * Why doesn't Vim use aspell/ispell/myspell/etc.?
  * See ":help develop-spell".
  */
@@ -36,9 +44,10 @@
 #define MAXWLEN 100		/* assume max. word len is this many bytes */
 
 /*
- * Structure that is used to store the text from the language file.  This
- * avoids the need to allocate space for each individual word.  It's allocated
- * in big chunks for speed.
+ * Structure that is used to store the structures and strings from the
+ * language file.  This avoids the need to allocate space for each individual
+ * word.  It's allocated in big chunks for speed.  It's freed all at once when
+ * 'encoding' changes.
  */
 #define  SBLOCKSIZE 4096	/* default size of sb_data */
 typedef struct sblock_S sblock_T;
@@ -48,7 +57,8 @@
     char_u	sb_data[1];	/* data, actually longer */
 };
 
-/* Info from "REP" entries in ".aff" file used in af_rep. */
+/* Info from "REP" entries in ".aff" file used in af_rep.
+ * TODO: This is not used yet.  Either use it or remove it. */
 typedef struct repentry_S
 {
     char_u	*re_from;
@@ -63,19 +73,30 @@
 {
     affitem_T	*ai_next;	/* next affix with same ai_add[] or NULL */
     short_u	ai_nr;		/* affix number */
-    char_u	ai_combine;	/* prefix combines with suffix */
-    char_u	ai_choplen;	/* length of ai_chop in bytes */
+    char_u	ai_flags;	/* AFF_ flags */
+    char_u	ai_choplen;	/* length of chop string in bytes */
     char_u	ai_addlen;	/* length of ai_add in bytes */
-    char_u	*ai_chop;	/* text chopped off basic word (can be NULL) */
-    char_u	ai_add[1];	/* text added to basic word (actually longer) */
+    char_u	ai_leadlen;	/* for AFF_PREWORD: length of lead string */
+    char_u	ai_taillen;	/* for AFF_PREWORD: length of tail string */
+    char_u	ai_add[1];	/* Text added to basic word. This stores:
+				 * 0: word for AFF_PREWORD or whole addition
+				 * ai_addlen + 1: chop string
+				 * + ai_choplen + 1: lead string for AFF_PREWORD
+				 * + ai_leadlen + 1: tail string for AFF_PREWORD
+				 */
 };
 
 /* Get affitem_T pointer from hashitem that uses ai_add */
 static affitem_T dumai;
 #define HI2AI(hi)	((affitem_T *)((hi)->hi_key - (dumai.ai_add - (char_u *)&dumai)))
 
+/* ai_flags: Affix item flags */
+#define AFF_COMBINE	0x01	/* prefix combines with suffix */
+#define AFF_PREWORD	0x02	/* prefix includes word */
+
 /*
- * Structure used to store words and other info for one language.
+ * Structure used to store words and other info for one language, loaded from
+ * a .spl file.
  */
 typedef struct slang_S slang_T;
 struct slang_S
@@ -86,49 +107,65 @@
     int		sl_prefcnt;	/* number of prefix NRs */
     garray_T	sl_preftab;	/* list of hashtables to lookup prefixes */
     affitem_T	*sl_prefzero;	/* list of prefixes with zero add length */
+    hashtab_T	sl_prewords;	/* prefixes that include a word */
     int		sl_suffcnt;	/* number of suffix NRs */
     garray_T	sl_sufftab;	/* list of hashtables to lookup suffixes */
     affitem_T	*sl_suffzero;	/* list of suffixes with zero add length */
-    char_u	*sl_try;	/* "TRY" from .aff file */
-    garray_T	sl_rep;		/* list of repentry_T entries from REP lines */
+    char_u	*sl_try;	/* "TRY" from .aff file  TODO: not used */
+    garray_T	sl_rep;		/* list of repentry_T entries from REP lines
+				 * TODO not used */
     char_u	sl_regions[17];	/* table with up to 8 region names plus NUL */
     sblock_T	*sl_block;	/* list with allocated memory blocks */
     int		sl_error;	/* error while loading */
 };
 
+/* First language that is loaded, start of the linked list of loaded
+ * languages. */
 static slang_T *first_lang = NULL;
 
 /*
  * Structure to store an addition to a basic word.
+ * There are many of these, keep it small!
  */
 typedef struct addword_S addword_T;
 struct addword_S
 {
     addword_T	*aw_next;	/* next addition */
     char_u	aw_flags;	/* ADD_ flags */
-    char_u	aw_leadlen;	/* length of lead in bytes */
-    char_u	aw_wordlen;	/* length of aw_word in bytes */
     char_u	aw_region;	/* region for word with this addition */
+    char_u	aw_leadlen;	/* byte length of lead in aw_word */
+    char_u	aw_wordlen;	/* byte length of first word in aw_word */
+    char_u	aw_saveb;	/* saved byte where aw_word[] is truncated at
+				   end of hashtable key; NUL when not using
+				   hashtable */
     char_u	aw_word[1];	/* text, actually longer: case-folded addition
 				   plus, with ADD_KEEPCAP: keep-case addition */
 };
 
+/* Get addword_T pointer from hashitem that uses aw_word */
+static addword_T dumaw;
+#define HI2ADDWORD(hi)	((addword_T *)((hi)->hi_key - (dumaw.aw_word - (char_u *)&dumaw)))
+
 /*
  * Structure to store a basic word.
+ * There are many of these, keep it small!
  */
 typedef struct fword_S fword_T;
 struct fword_S
 {
-    fword_T	*fw_next;	/* same basic word with different caps */
-    char_u	fw_region;	/* region bits */
-    char_u	fw_prefixcnt;	/* number of prefix numbers */
-    char_u	fw_suffixcnt;	/* number of suffix numbers */
-    short_u	fw_flags;	/* BWF_ flags */
-    void	*fw_prefix;	/* table with prefix numbers */
-    void	*fw_suffix;	/* table with suffix numbers */
+    fword_T	*fw_next;	/* same basic word with different caps and/or
+				 * affixes */
     addword_T	*fw_adds;	/* first addword_T entry */
-    char_u	fw_word[1];	/* actually longer: case folded word, or
-				   keep-case word when (flags & BWF_KEEPCAP) */
+    short_u	fw_flags;	/* BWF_ flags */
+    char_u	fw_region;	/* region bits */
+    char_u	fw_prefixcnt;	/* number of prefix NRs */
+    char_u	fw_suffixcnt;	/* number of suffix NRs */
+    char_u	fw_word[1];	/* actually longer:
+				 * 0:  case folded word or keep-case word when
+				 *     (flags & BWF_KEEPCAP)
+				 * + word length + 1: list of prefix NRs
+				 * + fw_prefixcnt [* 2]: list of suffix NRs
+				 */
 };
 
 /* Get fword_T pointer from hashitem that uses fw_word */
@@ -167,6 +204,11 @@
 				       for single-letter words) */
 #define BWF_KEEPCAP	0x0800	    /* Keep case as-is */
 
+#define BWF_ADDHASH	0x8000	    /* Internal: use hashtab for additions */
+
+#define NOWC_KEY (char_u *)"x"	    /* hashtab key used for additions without
+				       any word character */
+
 /* flags used for addition in the spell file */
 #define ADD_REGION	0x02	    /* region byte follows */
 #define ADD_ONECAP	0x04	    /* first letter must be capital */
@@ -178,7 +220,7 @@
  * (Needed to keep ADD_ flags in one byte.) */
 #define ADD2BWF(x)	(((x) & 0x0f) | (((x) & 0xf0) << 4))
 
-#define VIMSPELLMAGIC "VIMspell01"  /* string at start of Vim spell file */
+#define VIMSPELLMAGIC "VIMspell02"  /* string at start of Vim spell file */
 #define VIMSPELLMAGICL 10
 
 /*
@@ -188,31 +230,44 @@
 {
     langp_T	*mi_lp;			/* info for language and region */
     slang_T	*mi_slang;		/* info for the language */
+
+    /* pointers to original text to be checked */
     char_u	*mi_line;		/* start of line containing word */
     char_u	*mi_word;		/* start of word being checked */
     char_u	*mi_end;		/* first non-word char after mi_word */
-    char_u	*mi_wend;		/* end of matching word (is "mi_end"
+    char_u	*mi_wend;		/* end of matching word (is mi_end
 					 * or further) */
-    char_u	*mi_cword;		/* word to check, can be "mi_fword" */
-    char_u	mi_fword[MAXWLEN + 1];	/* "mi_word" to "mi_end" case-folded */
-    int		mi_faddlen;		/* length of valid bytes in "mi_fadd" */
-    char_u	*mi_faddp;		/* next char to be added to "mi_fadd" */
-    char_u	mi_fadd[MAXWLEN + 1];	/* "mi_end" and further case-folded */
+    char_u	*mi_fend;		/* next char to be added to mi_fword */
+
+    /* case-folded text */
+    char_u	mi_fword[MAXWLEN + 1];	/* mi_word case-folded */
+    int		mi_fendlen;		/* byte length of first word in
+					   mi_fword */
+    int		mi_faddlen;		/* byte length of text in mi_fword
+					   after first word */
+    char_u	*mi_cword;		/* word to check, points in mi_fword */
+    char_u	*mi_awend;		/* after next word, to check for
+					   addition (NULL when not done yet) */
+    int		mi_did_awend;		/* did compute mi_awend */
+
+    /* others */
     int		mi_result;		/* result so far: SP_BAD, SP_OK, etc. */
     int		mi_capflags;		/* BWF_ONECAP BWF_ALLCAP BWF_KEEPCAP */
 } matchinf_T;
 
 static int word_match __ARGS((matchinf_T *mip));
 static int check_adds __ARGS((matchinf_T *mip, fword_T *fw, int req_pref, int req_suf));
-static int supports_afffix __ARGS((int cnt, void *afffix, int afffixcnt, int nr));
+static void fill_awend __ARGS((matchinf_T *mip));
+static void fold_addchars __ARGS((matchinf_T *mip, int addlen));
+static int supports_affix __ARGS((int cnt, char_u *afflist, int afflistlen, int nr));
 static int prefix_match __ARGS((matchinf_T *mip));
+static int noprefix_match __ARGS((matchinf_T *mip, char_u *pword, char_u *cstart, affitem_T *ai));
 static int suffix_match __ARGS((matchinf_T *mip));
 static int match_caps __ARGS((int flags, char_u *caseword, matchinf_T *mip, char_u *cword, char_u *end));
 static slang_T *slang_alloc __ARGS((char_u *lang));
 static void slang_free __ARGS((slang_T *lp));
 static slang_T *spell_load_lang __ARGS((char_u *lang));
 static void spell_load_file __ARGS((char_u *fname, void *cookie));
-static int spell_load_affixes __ARGS((FILE *fd, slang_T *lp, int *bl_usedp, int affm, void **affp));
 static void *getroom __ARGS((slang_T *lp, int *bl_used, int len));
 static int find_region __ARGS((char_u *rp, char_u *region));
 static int captype __ARGS((char_u *word, char_u *end));
@@ -248,16 +303,22 @@
     if (*ptr >= '0' && *ptr <= '9')
 	return (int)(mi.mi_end - ptr);
 
-    /* Make case-folded copy of the Word.  Compute its hash value. */
+    /* Make case-folded copy of the word. */
     (void)str_foldcase(ptr, mi.mi_end - ptr, mi.mi_fword, MAXWLEN + 1);
     mi.mi_cword = mi.mi_fword;
+    mi.mi_fendlen = STRLEN(mi.mi_fword);
+    mi.mi_faddlen = 0;
+    mi.mi_fend = mi.mi_end;
 
-    /* The word is bad unless we find it in the dictionary. */
+    /* Check the caps type of the word. */
+    mi.mi_capflags = captype(ptr, mi.mi_end);
+
+    /* The word is bad unless we recognize it. */
     mi.mi_result = SP_BAD;
     mi.mi_wend = mi.mi_end;
-    mi.mi_faddp = mi.mi_end;
-    mi.mi_faddlen = 0;
-    mi.mi_capflags = captype(ptr, mi.mi_end);
+
+    mi.mi_awend = NULL;
+    mi.mi_did_awend = FALSE;
     mi.mi_line = line;
 
     /*
@@ -294,29 +355,158 @@
 }
 
 /*
- * Check if the word "mip->mi_cword" matches.
+ * Check if the word "mip->mi_word" matches.
+ * "mip->mi_fword" is the same word case-folded;
+ *
+ * This checks the word as a whole and for prefixes that include a word.
+ *
+ * Note that when called mi_fword only contains the word up to mip->mi_end,
+ * but when checking additions it gets longer.
  */
     static int
 word_match(mip)
     matchinf_T *mip;
 {
-    hash_T	fhash = hash_hash(mip->mi_cword);
+    hash_T	fhash = hash_hash(mip->mi_fword);
     hashitem_T	*hi;
     fword_T	*fw;
     int		valid = FALSE;
+    char_u	*p;
+    char_u	pword[MAXWLEN + 1];
+    int		charlen;
+    int		capflags_save;
+    affitem_T	*ai;
+    char_u	*cstart;
+    int		addlen;
+    int		n;
+    char_u	*save_end;
+    int		cc;
 
-    hi = hash_lookup(&mip->mi_slang->sl_words, mip->mi_cword, fhash);
-    if (HASHITEM_EMPTY(hi))
-	return FALSE;
+    hi = hash_lookup(&mip->mi_slang->sl_words, mip->mi_fword, fhash);
+    if (!HASHITEM_EMPTY(hi))
+    {
+	/*
+	 * Find a basic word for which the case of "mi_word" is correct.
+	 * If it is, check additions and use the longest one.
+	 */
+	for (fw = HI2FWORD(hi); fw != NULL; fw = fw->fw_next)
+	    if (match_caps(fw->fw_flags, fw->fw_word, mip,
+						   mip->mi_word, mip->mi_end))
+		valid |= check_adds(mip, fw, -1, -1);
+    }
 
     /*
-     * Find a basic word for which the case of word "cword" is correct.
-     * If it is, check additions and use the longest one.
+     * Try finding a matching preword for "mip->mi_word".  These are
+     * prefixes that have a non-word character after a word character:
+     * "d'", "de-", "'s-", "l'de-".  But not "'s".
+     * Also need to do this when a matching word was already found, because we
+     * might find a longer match this way (French: "qu" and "qu'a-t-elle").
      */
-    for (fw = HI2FWORD(hi); fw != NULL; fw = fw->fw_next)
-	if (match_caps(fw->fw_flags, fw->fw_word, mip,
-						   mip->mi_word, mip->mi_end))
-	    valid |= check_adds(mip, fw, -1, -1);
+    cc = mip->mi_fword[mip->mi_fendlen];
+    mip->mi_fword[mip->mi_fendlen] = NUL;
+    hi = hash_lookup(&mip->mi_slang->sl_prewords, mip->mi_fword, fhash);
+    mip->mi_fword[mip->mi_fendlen] = cc;
+    if (!HASHITEM_EMPTY(hi))
+    {
+	capflags_save = mip->mi_capflags;
+
+	/* Go through the list of matching prewords. */
+	for (ai = HI2AI(hi); ai != NULL; ai = ai->ai_next)
+	{
+	    /* Check that the lead string matches before the word. */
+	    p = ai->ai_add + ai->ai_addlen + ai->ai_choplen + 2;
+	    if (ai->ai_leadlen > 0)
+	    {
+		if (mip->mi_word - mip->mi_line < ai->ai_leadlen
+			|| STRNCMP(mip->mi_word - ai->ai_leadlen, p,
+						     ai->ai_leadlen) != 0)
+		    continue;
+		p += ai->ai_leadlen + 1;	/* advance "p" to tail */
+	    }
+	    else
+		++p;			/* advance "p" to tail */
+
+	    /* Check that the tail string matches after the word.  Need
+	     * to fold case first.  */
+	    if (ai->ai_taillen > 0)
+	    {
+		if (ai->ai_taillen >= mip->mi_faddlen)
+		{
+		    fold_addchars(mip, ai->ai_taillen);
+		    if (ai->ai_taillen > mip->mi_faddlen)
+			continue;	/* not enough chars, can't match */
+		}
+		if (STRNCMP(mip->mi_fword + mip->mi_fendlen,
+						  p, ai->ai_taillen) != 0)
+		    continue;
+	    }
+
+	    /*
+	     * This preword matches.  Remove the preword and check that
+	     * the resulting word exists.
+	     */
+
+	    /* Find the place in the original word where the tail ends,
+	     * needed for case checks. */
+#ifdef FEAT_MBYTE
+	    charlen = mb_charlen(p);
+#else
+	    charlen = ai->ai_taillen;
+#endif
+	    cstart = mip->mi_end;
+	    for (n = 0; n < charlen; ++n)
+		mb_ptr_adv(cstart);
+
+	    /* The new word starts with the chop. Then add up to the next
+	     * non-word char. */
+	    mch_memmove(pword, ai->ai_add + ai->ai_addlen + 1,
+							  ai->ai_choplen);
+	    p = mip->mi_fword + mip->mi_fendlen + ai->ai_taillen;
+	    addlen = ai->ai_taillen;
+	    while (spell_iswordc(p))
+	    {
+		++charlen;
+#ifdef FEAT_MBYTE
+		addlen += (*mb_ptr2len_check)(p);
+#else
+		++addlen;
+#endif
+		mb_ptr_adv(p);
+		if (addlen >= mip->mi_faddlen)
+		{
+		    /* Get more folded characters in mip->mi_fword. */
+		    fold_addchars(mip, addlen);
+		    if (addlen >= mip->mi_faddlen)
+			break;	/* not enough chars, can't match */
+		}
+	    }
+	    mch_memmove(pword + ai->ai_choplen,
+		    mip->mi_fword + mip->mi_fendlen + ai->ai_taillen,
+						 addlen - ai->ai_taillen);
+	    pword[ai->ai_choplen + addlen - ai->ai_taillen] = NUL;
+
+	    /* Need to set mi_end to find additions.  Also set mi_fendlen
+	     * and mi_faddlen. */
+	    save_end = mip->mi_end;
+	    while (--charlen >= 0)
+		mb_ptr_adv(mip->mi_end);
+	    mip->mi_fendlen += addlen;
+	    mip->mi_faddlen -= addlen;
+
+	    /* Find the word "pword", caseword "cstart". */
+	    n = noprefix_match(mip, pword, cstart, ai);
+	    mip->mi_end = save_end;
+	    mip->mi_fendlen -= addlen;
+	    mip->mi_faddlen += addlen;
+	    if (n)
+		valid = TRUE;
+
+	    /* If we found a valid word, we still need to try other
+	     * prewords, because it may have an addition that's longer. */
+	}
+
+	mip->mi_capflags = capflags_save;
+    }
 
     return valid;
 }
@@ -334,16 +524,41 @@
 {
     int		valid = FALSE;
     addword_T	*aw;
+    addword_T	*naw = NULL;
     char_u	*p;
     int		addlen;
-    int		fl;
+    int		cc;
+    hashitem_T	*hi;
+    char_u	*cp = NULL;
+    int		n;
 
-    /* A word may be valid without additions. */
-    if ((fw->fw_flags & BWF_VALID)
-	    && (req_pref < 0 || supports_afffix(mip->mi_slang->sl_prefcnt,
-				   fw->fw_prefix, fw->fw_prefixcnt, req_pref))
-	    && (req_suf < 0 || supports_afffix(mip->mi_slang->sl_suffcnt,
-				   fw->fw_suffix, fw->fw_suffixcnt, req_suf)))
+    /* Check if required prefixes and suffixes are supported.  These are on
+     * the basic word, not on each addition. */
+    if (req_pref >= 0 || req_suf >= 0)
+    {
+	/* Prefix NRs are stored just after the word in fw_word. */
+	cp = fw->fw_word + STRLEN(fw->fw_word) + 1;
+	if (req_pref >= 0 && !supports_affix(mip->mi_slang->sl_prefcnt,
+					      cp, fw->fw_prefixcnt, req_pref))
+	    return FALSE;
+	if (req_suf >= 0)
+	{
+	    /* Suffix NRs are stored just after the Prefix NRs. */
+	    if (fw->fw_prefixcnt > 0)
+	    {
+		if (mip->mi_slang->sl_prefcnt > 256)
+		    cp += fw->fw_prefixcnt * 2;
+		else
+		    cp += fw->fw_prefixcnt;
+	    }
+	    if (!supports_affix(mip->mi_slang->sl_suffcnt,
+					       cp, fw->fw_suffixcnt, req_suf))
+		return FALSE;
+	}
+    }
+
+    /* A word may be valid without an addition. */
+    if (fw->fw_flags & BWF_VALID)
     {
 	valid = TRUE;
 	if (mip->mi_result != SP_OK)
@@ -353,15 +568,56 @@
 	    else
 		mip->mi_result = SP_OK;
 	}
+	/* Set word end, required when matching a word after a preword. */
+	if (mip->mi_wend < mip->mi_end)
+	    mip->mi_wend = mip->mi_end;
     }
 
     /*
      * Check additions, both before and after the word.
      * This may make the word longer, thus we also need to check
      * when we already found a matching word.
+     * When the BWF_ADDHASH flag is present then fw_adds points to a hashtable
+     * for quick lookup.  Otherwise it points to the list of all possible
+     * additions.
      */
-    for (aw = fw->fw_adds; aw != NULL; aw = aw->aw_next)
+    if (fw->fw_flags & BWF_ADDHASH)
     {
+	/* Locate the text up to the next end-of-word. */
+	if (!mip->mi_did_awend)
+	    fill_awend(mip);
+	if (mip->mi_awend == NULL)
+	    return valid;	    /* there is no next word */
+
+	cc = *mip->mi_awend;
+	*mip->mi_awend = NUL;
+	hi = hash_find((hashtab_T *)fw->fw_adds,
+					     mip->mi_fword + mip->mi_fendlen);
+	*mip->mi_awend = cc;
+	if (HASHITEM_EMPTY(hi))
+	    return valid;		/* no matching addition */
+	aw = HI2ADDWORD(hi);
+
+	/* Also check additions without word characters.  If they are there,
+	 * skip the first dummy entry. */
+	hi = hash_find((hashtab_T *)fw->fw_adds, NOWC_KEY);
+	if (!HASHITEM_EMPTY(hi))
+	    naw = HI2ADDWORD(hi)->aw_next;
+    }
+    else
+	aw = fw->fw_adds;
+
+    for ( ; ; aw = aw->aw_next)
+    {
+	if (aw == NULL)
+	{
+	    /* At end of list: may also try additions without word chars. */
+	    if (naw == NULL)
+		break;
+	    aw = naw;
+	    naw = NULL;
+	}
+
 	if (aw->aw_leadlen > 0)
 	{
 	    /* There is a leader, verify that it matches. */
@@ -391,28 +647,21 @@
 	    /* Check for matching addition and no word character after it.
 	     * First make sure we have enough case-folded chars to compare
 	     * with. */
-	    while (mip->mi_faddlen <= addlen)
-	    {
-		if (*mip->mi_faddp == NUL)
-		{
-		    mip->mi_fadd[mip->mi_faddlen] = NUL;
-		    break;
-		}
-#ifdef FEAT_MBYTE
-		fl = (*mb_ptr2len_check)(mip->mi_faddp);
-#else
-		fl = 1;
-#endif
-		(void)str_foldcase(mip->mi_faddp, fl,
-				   mip->mi_fadd + mip->mi_faddlen,
-						   MAXWLEN - mip->mi_faddlen);
-		mip->mi_faddp += fl;
-		mip->mi_faddlen += STRLEN(mip->mi_fadd + mip->mi_faddlen);
-	    }
+	    if (addlen >= mip->mi_faddlen)
+		fold_addchars(mip, addlen);
 
-	    if (STRNCMP(mip->mi_fadd, p, addlen) != 0
-		    || (mip->mi_fadd[addlen] != NUL
-				     && spell_iswordc(mip->mi_fadd + addlen)))
+	    /* Put back the saved char, if needed. */
+	    if (aw->aw_saveb != NUL)
+	    {
+		cp = p + STRLEN(p);
+		*cp = aw->aw_saveb;
+	    }
+	    n = STRNCMP(mip->mi_fword + mip->mi_fendlen, p, addlen);
+	    if (aw->aw_saveb != NUL)
+		*cp = NUL;
+
+	    if (n != 0 || (mip->mi_fword[mip->mi_fendlen + addlen] != NUL
+		  && spell_iswordc(mip->mi_fword + mip->mi_fendlen + addlen)))
 		continue;
 
 	    /* Compute the length in the original word, before case folding. */
@@ -422,8 +671,8 @@
 		int	l;
 
 		p = mip->mi_end;
-		for (l = 0; l < addlen;
-				   l += (*mb_ptr2len_check)(mip->mi_fadd + l))
+		for (l = 0; l < addlen; l += (*mb_ptr2len_check)(mip->mi_fword
+						       + mip->mi_fendlen + l))
 		    mb_ptr_adv(p);
 		addlen = p - mip->mi_end;
 	    }
@@ -454,34 +703,111 @@
 }
 
 /*
- * Return TRUE if word "fw" supports afffix "nr".
+ * Locate the text up to the next end-of-word after mip->mi_end.
+ */
+    static void
+fill_awend(mip)
+    matchinf_T	*mip;
+{
+    char_u	*p = mip->mi_end;
+    int		addlen = 0;
+    int		find_word = TRUE;
+
+    mip->mi_did_awend = TRUE;
+    if (mip->mi_faddlen == 0)
+	fold_addchars(mip, 0);	    /* need to fold first char */
+
+    /* 1: find_word == TRUE: skip over non-word characters after mi_end.
+     * 2: find_word == FALSE: skip over following word characters. */
+    for (p = mip->mi_fword + mip->mi_fendlen; *p != NUL; mb_ptr_adv(p))
+    {
+	if (spell_iswordc(p) == find_word)
+	{
+	    if (!find_word)
+		break;		    /* done */
+	    find_word = !find_word;
+	}
+#ifdef FEAT_MBYTE
+	addlen += (*mb_ptr2len_check)(p);
+#else
+	++addlen;
+#endif
+	if (addlen >= mip->mi_faddlen)
+	    fold_addchars(mip, addlen);	    /* need to fold more chars */
+    }
+
+    /* If there are extra chars store the result. */
+    if (addlen != 0)
+	mip->mi_awend = p;
+}
+
+/*
+ * Fold enough characters of the checked text to be able to compare with an
+ * addition of length "addlen" plus one character (to be able to check the
+ * next character to be a non-word char).
+ * When there are not enough characters (end of line) mip->mi_faddlen will be
+ * smaller than "addlen".
+ */
+    static void
+fold_addchars(mip, addlen)
+    matchinf_T	*mip;
+    int		addlen;
+{
+    int		l;
+    char_u	*p = mip->mi_fword + mip->mi_fendlen;
+
+    while (mip->mi_faddlen <= addlen)
+    {
+	if (*mip->mi_fend == NUL)	/* end of the line */
+	{
+	    p[mip->mi_faddlen] = NUL;
+	    break;
+	}
+#ifdef FEAT_MBYTE
+	if (has_mbyte)
+	    l = (*mb_ptr2len_check)(mip->mi_fend);
+	else
+#endif
+	    l = 1;
+	(void)str_foldcase(mip->mi_fend, l, p + mip->mi_faddlen,
+				 MAXWLEN - mip->mi_fendlen - mip->mi_faddlen);
+	mip->mi_fend += l;
+	mip->mi_faddlen += STRLEN(p + mip->mi_faddlen);
+    }
+}
+
+/*
+ * Return TRUE if affix "nr" appears in affix list "afflist[afflistlen]".
  */
     static int
-supports_afffix(cnt, afffix, afffixcnt, nr)
-    int		cnt;
-    void	*afffix;
-    int		afffixcnt;
+supports_affix(cnt, afflist, afflistlen, nr)
+    int		cnt;		/* total affix NR count */
+    char_u	*afflist;
+    int		afflistlen;	/* affix count in "afflist" */
     int		nr;
 {
-    char_u	*pc;
-    short_u	*ps;
+    char_u	*pc = afflist;
     int		i;
+    int		nr_msb, nr_lsb;
 
     if (cnt <= 256)
     {
-	/* char_u affix numbers */
-	pc = afffix;
-	for (i = afffixcnt; --i >= 0; )
+	/* one byte affix numbers */
+	for (i = afflistlen; --i >= 0; )
 	    if (*pc++ == nr)
 		return TRUE;
     }
     else
     {
-	/* short_u affix numbers */
-	ps = afffix;
-	for (i = afffixcnt; --i >= 0; )
-	    if (*ps++ == nr)
+	/* two byte affix numbers, MSB first */
+	nr_msb = (unsigned)nr >> 8;
+	nr_lsb = nr & 0xff;
+	for (i = afflistlen; --i >= 0; )
+	{
+	    if (*pc++ == nr_msb && *pc == nr_lsb)
 		return TRUE;
+	    ++pc;
+	}
     }
     return FALSE;
 }
@@ -498,15 +824,12 @@
     int		cc;
     affitem_T	*ai;
     char_u	pword[MAXWLEN + 1];
-    fword_T	*fw;
     hashtab_T	*ht;
     hashitem_T	*hi;
-    int		i;
     int		found_valid = FALSE;
     int		cstart_charlen = 0;
     char_u	*cstart = mip->mi_word;
     int		capflags_save = mip->mi_capflags;
-    char_u	*p;
 
     /*
      * Check for prefixes with different character lengths.
@@ -518,7 +841,7 @@
 	{
 #ifdef FEAT_MBYTE
 	    if (has_mbyte)
-		len += mb_ptr2len_check(mip->mi_cword + len);
+		len += (*mb_ptr2len_check)(mip->mi_cword + len);
 	    else
 #endif
 		len += 1;
@@ -549,10 +872,12 @@
 	/* Loop over all matching prefixes. */
 	for ( ; ai != NULL; ai = ai->ai_next)
 	{
-	    /* Create the basic word by removing the prefix and adding the
-	     * chop string. */
-	    mch_memmove(pword, ai->ai_chop, ai->ai_choplen);
-	    STRCPY(pword + ai->ai_choplen, mip->mi_cword + ai->ai_addlen);
+	    /* Create the basic word from the chop string and the word after
+	     * the matching add string. */
+	    mch_memmove(pword, ai->ai_add + ai->ai_addlen + 1, ai->ai_choplen);
+	    mch_memmove(pword + ai->ai_choplen, mip->mi_cword + ai->ai_addlen,
+					     mip->mi_fendlen - ai->ai_addlen);
+	    pword[mip->mi_fendlen - ai->ai_addlen] = NUL;
 
 	    /* Adjust the word start for case checks, we only check the
 	     * part after the prefix. */
@@ -562,44 +887,14 @@
 		++cstart_charlen;
 	    }
 
-	    /* Removing the prefix may change the caps, e.g. for
-	     * "deAlf" removing "de" makes it ONECAP. */
-	    mip->mi_capflags = captype(cstart, mip->mi_end);
+	    /* Find the word "pword", caseword "cstart". */
+	    found_valid |= noprefix_match(mip, pword, cstart, ai);
 
-	    /* Find the basic word. */
-	    hi = hash_find(&mip->mi_slang->sl_words, pword);
-	    if (!HASHITEM_EMPTY(hi))
+	    if (found_valid && mip->mi_result == SP_OK)
 	    {
-		/* Check if the word supports this prefix. */
-		for (fw = HI2FWORD(hi); fw != NULL; fw = fw->fw_next)
-		    if (match_caps(fw->fw_flags, fw->fw_word, mip,
-							 cstart, mip->mi_end))
-			found_valid |= check_adds(mip, fw, ai->ai_nr, -1);
-
-		if (found_valid && mip->mi_result == SP_OK)
-		{
-		    /* Found a valid word, no need to try other suffixes. */
-		    mip->mi_capflags = capflags_save;
-		    return TRUE;
-		}
-	    }
-
-	    /* No matching basic word without prefix.  When combining is
-	     * allowed try with suffixes. */
-	    if (ai->ai_combine)
-	    {
-		/* Pass the word with prefix removed to suffix_match(). */
-		mip->mi_cword = pword;
-		p = mip->mi_word;
-		mip->mi_word = cstart;
-		i = suffix_match(mip);
-		mip->mi_cword = mip->mi_fword;
-		mip->mi_word = p;
-		if (i)
-		{
-		    mip->mi_capflags = capflags_save;
-		    return TRUE;
-		}
+		/* Found a valid word, no need to try other suffixes. */
+		mip->mi_capflags = capflags_save;
+		return TRUE;
 	    }
 	}
     }
@@ -609,6 +904,64 @@
 }
 
 /*
+ * Check for matching word after removing a prefix.
+ * Return TRUE if found.
+ */
+    static int
+noprefix_match(mip, pword, cstart, ai)
+    matchinf_T	*mip;
+    char_u	*pword;	    /* case-folded word */
+    char_u	*cstart;    /* original word after removed prefix */
+    affitem_T	*ai;	    /* the prefix item */
+{
+    hashitem_T	*hi;
+    fword_T	*fw;
+    int		found_valid = FALSE;
+    char_u	*word;
+    int		i;
+    int		fendlen;
+
+    /* Removing the prefix may change the caps, e.g. for
+     * "deAlf" removing "de" makes it ONECAP. */
+    mip->mi_capflags = captype(cstart, mip->mi_end);
+
+    /* Find the basic word. */
+    hi = hash_find(&mip->mi_slang->sl_words, pword);
+    if (!HASHITEM_EMPTY(hi))
+    {
+	/* Check if the word supports this prefix. */
+	for (fw = HI2FWORD(hi); fw != NULL; fw = fw->fw_next)
+	    if (match_caps(fw->fw_flags, fw->fw_word, mip,
+						 cstart, mip->mi_end))
+		found_valid |= check_adds(mip, fw, ai->ai_nr, -1);
+
+	if (found_valid && mip->mi_result == SP_OK)
+	    /* Found a valid word, no need to try other suffixes. */
+	    return TRUE;
+    }
+
+    /* No matching basic word without prefix.  When combining is
+     * allowed try with suffixes. */
+    if (ai->ai_flags & AFF_COMBINE)
+    {
+	/* Pass the word with prefix removed to suffix_match(). */
+	mip->mi_cword = pword;
+	word = mip->mi_word;
+	mip->mi_word = cstart;
+	fendlen = mip->mi_fendlen;
+	mip->mi_fendlen = STRLEN(pword);
+	i = suffix_match(mip);
+	mip->mi_cword = mip->mi_fword;
+	mip->mi_word = word;
+	mip->mi_fendlen = fendlen;
+	if (i)
+	    return TRUE;
+    }
+
+    return FALSE;
+}
+
+/*
  * Try finding a match for "mip->mi_cword" by removing suffixes.
  */
     static int
@@ -616,6 +969,8 @@
     matchinf_T	*mip;
 {
     char_u	*sufp;
+    char_u	*endw = mip->mi_cword + mip->mi_fendlen;
+    int		endw_c = *endw;
     int		charlen;
     affitem_T	*ai;
     char_u	pword[MAXWLEN + 1];
@@ -633,7 +988,7 @@
      * only, thus adds something).
      * Stop checking if there are no suffixes with so many characters.
      */
-    sufp = mip->mi_cword + STRLEN(mip->mi_cword);
+    sufp = endw;
     for (charlen = 0; charlen <= mip->mi_slang->sl_sufftab.ga_len; ++charlen)
     {
 	/* Move the pointer to the possible suffix back one character, unless
@@ -654,11 +1009,13 @@
 	    if (ht->ht_used == 0)
 		continue;
 
+	    *endw = NUL;	/* truncate after possible suffix */
 	    hi = hash_find(ht, sufp);
 	    if (HASHITEM_EMPTY(hi))
 		ai = NULL;
 	    else
 		ai = HI2AI(hi);
+	    *endw = endw_c;
 	}
 
 	if (ai != NULL)
@@ -675,7 +1032,8 @@
 		if (ai->ai_choplen == 0)
 		    pword[tlen] = NUL;
 		else
-		    mch_memmove(pword + tlen, ai->ai_chop, ai->ai_choplen + 1);
+		    mch_memmove(pword + tlen, ai->ai_add + ai->ai_addlen + 1,
+							  ai->ai_choplen + 1);
 
 		/* Find the basic word. */
 		hi = hash_find(&mip->mi_slang->sl_words, pword);
@@ -734,8 +1092,8 @@
 
     if ((capflags & BWF_KEEPCAP) == 0 && end > mip->mi_end)
     {
-	/* If "end" is past "mip->mi_end" we need to check the characters
-	 * after the basic word. */
+	/* If "end" is past "mip->mi_end" we need to adjust the caps type for
+	 * characters after the basic word. */
 #ifdef FEAT_MBYTE
 	past_second = (mip->mi_word + (*mb_ptr2len_check)(mip->mi_word)
 							       < mip->mi_end);
@@ -944,6 +1302,7 @@
 	lp->sl_name = vim_strsave(lang);
 	hash_init(&lp->sl_words);
 	ga_init2(&lp->sl_preftab, sizeof(hashtab_T), 4);
+	hash_init(&lp->sl_prewords);
 	ga_init2(&lp->sl_sufftab, sizeof(hashtab_T), 4);
 	lp->sl_prefzero = NULL;
 	lp->sl_suffzero = NULL;
@@ -965,12 +1324,31 @@
 {
     sblock_T	*sp;
     int		i;
+    fword_T	*fw;
+    int		todo;
+    hashitem_T	*hi;
 
     vim_free(lp->sl_name);
+
+    /* The words themselves are in memory blocks referenced by "sl_block".
+     * Only the hashtables for additions need to be cleared. */
+    todo = lp->sl_words.ht_used;
+    for (hi = lp->sl_words.ht_array; todo > 0; ++hi)
+    {
+	if (!HASHITEM_EMPTY(hi))
+	{
+	    --todo;
+	    fw = HI2FWORD(hi);
+	    if (fw->fw_flags & BWF_ADDHASH)
+		hash_clear((hashtab_T *)fw->fw_adds);
+	}
+    }
     hash_clear(&lp->sl_words);
+
     for (i = 0; i < lp->sl_preftab.ga_len; ++i)
 	hash_clear(((hashtab_T *)lp->sl_preftab.ga_data) + i);
     ga_clear(&lp->sl_preftab);
+    hash_clear(&lp->sl_prewords);
     for (i = 0; i < lp->sl_sufftab.ga_len; ++i)
 	hash_clear(((hashtab_T *)lp->sl_sufftab.ga_data) + i);
     ga_clear(&lp->sl_sufftab);
@@ -999,6 +1377,7 @@
     char_u	buf[MAXWLEN + 1];
     char_u	cbuf[MAXWLEN + 1];
     char_u	fbuf[MAXWLEN + 1];
+    char_u	affixbuf[256 * 2 * 2]; /* max 2 * 256 affix nrs of 2 bytes */
     char_u	*p;
     int		itm;
     int		i;
@@ -1006,6 +1385,7 @@
     int		affnr;
     int		affflags;
     int		affitemcnt;
+    int		prefixcnt, suffixcnt;
     int		bl_used = SBLOCKSIZE;
     int		widx;
     int		prefm = 0;  /* 1 if <= 256 prefixes, sizeof(short_u) otherw. */
@@ -1027,8 +1407,9 @@
     hashitem_T	*hi;
     hash_T	hash;
     int		adds;
-    addword_T	*aw;
+    addword_T	*aw, *naw;
     int		flen;
+    int		xlen;
 
     fd = fopen((char *)fname, "r");
     if (fd == NULL)
@@ -1082,14 +1463,14 @@
 	    gap = &lp->sl_preftab;
 	    aip = &lp->sl_prefzero;
 	    lp->sl_prefcnt = affcount;
-	    prefm = affcount > 256 ? sizeof(short_u) : 1;
+	    prefm = affcount > 256 ? 2 : 1;
 	}
 	else
 	{
 	    gap = &lp->sl_sufftab;
 	    aip = &lp->sl_suffzero;
 	    lp->sl_suffcnt = affcount;
-	    suffm = affcount > 256 ? sizeof(short_u) : 1;
+	    suffm = affcount > 256 ? 2 : 1;
 	}
 
 	i = (getc(fd) << 8) + getc(fd);		/* <afftotcnt> */
@@ -1100,16 +1481,15 @@
 	 */
 	for (affnr = 0; affnr < affcount; ++affnr)
 	{
-	    /* <affix>: <affflags> <affitemcnt> <affitem> ... */
-	    affflags = getc(fd);			/* <affflags> */
-	    if (affflags == EOF)
-		goto truncerr;
+	    /* <affix>: <affitemcnt> <affitem> ... */
 	    affitemcnt = (getc(fd) << 8) + getc(fd);	/* <affitemcnt> */
 	    if (affitemcnt < 0)
 		goto truncerr;
 	    for (itm = 0; itm < affitemcnt; ++itm)
 	    {
-		/* <affitem>: <affchoplen> <affchop> <affaddlen> <affadd> */
+		/* <affitem>: <affflags> <affchoplen> <affchop>
+		 *				    <affaddlen> <affadd> */
+		affflags = getc(fd);			/* <affflags> */
 		choplen = getc(fd);			/* <affchoplen> */
 		if (choplen == EOF)
 		    goto truncerr;
@@ -1121,20 +1501,25 @@
 		addlen = getc(fd);			/* <affaddlen> */
 		if (addlen == EOF)
 		    goto truncerr;
+		if (affflags & AFF_PREWORD)
+		    xlen = addlen + 2;	/* space for lead and trail string */
+		else
+		    xlen = 0;
+
 		/* Get room to store the affitem_T, chop and add strings. */
 		p = (char_u *)getroom(lp, &bl_used,
-				    sizeof(affitem_T) + choplen + addlen + 1);
+			     sizeof(affitem_T) + addlen + choplen + 1 + xlen);
 		if (p == NULL)
 		    goto errorend;
 
 		ai = (affitem_T *)p;
 		ai->ai_nr = affnr;
-		ai->ai_combine = affflags;
+		ai->ai_flags = affflags;
 		ai->ai_choplen = choplen;
 		ai->ai_addlen = addlen;
 
-		p += sizeof(affitem_T) + addlen;
-		ai->ai_chop = p;
+		/* Chop string is at ai_add[ai_addlen + 1]. */
+		p = ai->ai_add + addlen + 1;
 		STRCPY(p, buf);
 
 		p = ai->ai_add;
@@ -1142,65 +1527,116 @@
 		    p[i] = getc(fd);
 		p[i] = NUL;
 
-		/*
-		 * Add the affix to a hashtable.  Which one depends on the
-		 * length of the added string in characters.
-		 */
-#ifdef FEAT_MBYTE
-		/* Change "addlen" from length in bytes to length in chars. */
-		if (has_mbyte)
-		    addlen = mb_charlen(p);
-#endif
-		if (addlen == 0)
+		if (affflags & AFF_PREWORD)
 		{
-		    /* Link in list of zero length affixes. */
-		    ai->ai_next = *aip;
-		    *aip = ai;
-		}
-		else
-		{
-		    if (gap->ga_len < addlen)
-		    {
-			/* Longer affix, need more hashtables. */
-			if (ga_grow(gap, addlen - gap->ga_len) == FAIL)
-			    goto errorend;
+		    int	    l, leadoff, trailoff;
 
-			/* Re-allocating ga_data means that an ht_array
-			 * pointing to ht_smallarray becomes invalid.  We can
-			 * recognize this: ht_mask is at its init value. */
-			for (i = 0; i < gap->ga_len; ++i)
-			{
-			    ht = ((hashtab_T *)gap->ga_data) + i;
-			    if (ht->ht_mask == HT_INIT_SIZE - 1)
-				ht->ht_array = ht->ht_smallarray;
-			}
+		    /*
+		     * Separate lead and trail string, put word at ai_add, so
+		     * that it can be used as hashtable key.
+		     */
+		    /* lead string: up to first word char */
+		    while (*p != NUL && !spell_iswordc(p))
+			mb_ptr_adv(p);
+		    ai->ai_leadlen = p - ai->ai_add;
+		    leadoff = addlen + choplen + 2;
+		    mch_memmove(ai->ai_add + leadoff, ai->ai_add,
+							      ai->ai_leadlen);
+		    ai->ai_add[leadoff + ai->ai_leadlen] = NUL;
 
-			/* Init the newly used hashtable(s). */
-			while (gap->ga_len < addlen)
-			{
-			    hash_init(((hashtab_T *)gap->ga_data)
-							       + gap->ga_len);
-			    ++gap->ga_len;
-			}
-		    }
-		    ht = ((hashtab_T *)gap->ga_data) + addlen - 1;
-		    hash = hash_hash(p);
-		    hi = hash_lookup(ht, p, hash);
+		    /* trail string: after last word char */
+		    while (*p != NUL && spell_iswordc(p))
+			mb_ptr_adv(p);
+		    trailoff = leadoff + ai->ai_leadlen + 1;
+		    STRCPY(ai->ai_add + trailoff, p);
+		    ai->ai_taillen = STRLEN(p);
+
+		    /* word itself */
+		    l = (p - ai->ai_add) - ai->ai_leadlen;
+		    mch_memmove(ai->ai_add, ai->ai_add + ai->ai_leadlen, l);
+		    ai->ai_add[l] = NUL;
+		    hash = hash_hash(ai->ai_add);
+		    hi = hash_lookup(&lp->sl_prewords, ai->ai_add, hash);
 		    if (HASHITEM_EMPTY(hi))
 		    {
-			/* First affix with this "ai_add", add to hashtable. */
-			hash_add_item(ht, hi, p, hash);
+			/* First affix with this word, add to hashtable. */
+			hash_add_item(&lp->sl_prewords, hi, ai->ai_add, hash);
 			ai->ai_next = NULL;
 		    }
 		    else
 		    {
-			/* There already is an affix with this "ai_add", link
-			 * in the list.  */
+			/* There already is an affix with this word, link in
+			 * the list.  */
 			ai2 = HI2AI(hi);
 			ai->ai_next = ai2->ai_next;
 			ai2->ai_next = ai;
 		    }
 		}
+		else
+		{
+		    /*
+		     * Add the affix to a hashtable.  Which one depends on the
+		     * length of the added string in characters.
+		     */
+#ifdef FEAT_MBYTE
+		    /* Change "addlen" from length in bytes to length in
+		     * chars. */
+		    if (has_mbyte)
+			addlen = mb_charlen(p);
+#endif
+		    if (addlen == 0)
+		    {
+			/* Link in list of zero length affixes. */
+			ai->ai_next = *aip;
+			*aip = ai;
+		    }
+		    else
+		    {
+			if (gap->ga_len < addlen)
+			{
+			    /* Longer affix, need more hashtables. */
+			    if (ga_grow(gap, addlen - gap->ga_len) == FAIL)
+				goto errorend;
+
+			    /* Re-allocating ga_data means that an ht_array
+			     * pointing to ht_smallarray becomes invalid.  We
+			     * can recognize this: ht_mask is at its init
+			     * value. */
+			    for (i = 0; i < gap->ga_len; ++i)
+			    {
+				ht = ((hashtab_T *)gap->ga_data) + i;
+				if (ht->ht_mask == HT_INIT_SIZE - 1)
+				    ht->ht_array = ht->ht_smallarray;
+			    }
+
+			    /* Init the newly used hashtable(s). */
+			    while (gap->ga_len < addlen)
+			    {
+				hash_init(((hashtab_T *)gap->ga_data)
+							       + gap->ga_len);
+				++gap->ga_len;
+			    }
+			}
+			ht = ((hashtab_T *)gap->ga_data) + addlen - 1;
+			hash = hash_hash(p);
+			hi = hash_lookup(ht, p, hash);
+			if (HASHITEM_EMPTY(hi))
+			{
+			    /* First affix with this "ai_add", add to
+			     * hashtable. */
+			    hash_add_item(ht, hi, p, hash);
+			    ai->ai_next = NULL;
+			}
+			else
+			{
+			    /* There already is an affix with this "ai_add",
+			     * link in the list.  */
+			    ai2 = HI2AI(hi);
+			    ai->ai_next = ai2->ai_next;
+			    ai2->ai_next = ai;
+			}
+		    }
+		}
 	    }
 	}
     }
@@ -1262,19 +1698,54 @@
 	     * the case-folded word.  Note: this should only happen after the
 	     * basic word! */
 	    wlen = getc(fd);
-	    if (wlen == EOF)
+	    if (wlen < 0)
 		goto truncerr;
 	    for (i = 0; i < wlen; ++i)
 		cbuf[i] = getc(fd);
 	    cbuf[i] = NUL;
 	}
 
-	/* Find room to store the word in a fword_T. */
-	fw = (fword_T *)getroom(lp, &bl_used, (int)sizeof(fword_T) + wlen);
+	/* Optional prefixes */
+	p = affixbuf;
+	if (flags & BWF_PREFIX)
+	{
+	    cnt = getc(fd);				/* <affixcnt> */
+	    if (cnt < 0)
+		goto truncerr;
+	    prefixcnt = cnt;
+	    for (i = cnt * prefm; --i >= 0; )		/* <affixNR> */
+		*p++ = getc(fd);
+	}
+	else
+	    prefixcnt = 0;
+
+	/* Optional suffixes */
+	if (flags & BWF_SUFFIX)
+	{
+	    cnt = getc(fd);				/* <affixcnt> */
+	    if (cnt < 0)
+		goto truncerr;
+	    suffixcnt = cnt;
+	    for (i = cnt * suffm; --i >= 0; )		/* <affixNR> */
+		*p++ = getc(fd);
+	}
+	else
+	    suffixcnt = 0;
+
+	/* Find room to store the word in an fword_T. */
+	fw = (fword_T *)getroom(lp, &bl_used, (int)sizeof(fword_T) + wlen
+							    + (p - affixbuf));
 	if (fw == NULL)
 	    goto errorend;
 	mch_memmove(fw->fw_word, (flags & BWF_KEEPCAP) ? cbuf : buf, wlen + 1);
+
+	/* Put the affix NRs just after the word, if any. */
+	if (p > affixbuf)
+	    mch_memmove(fw->fw_word + wlen + 1, affixbuf, p - affixbuf);
+
 	fw->fw_flags = flags;
+	fw->fw_prefixcnt = prefixcnt;
+	fw->fw_suffixcnt = suffixcnt;
 
 	hash = hash_hash(buf);
 	hi = hash_lookup(&lp->sl_words, buf, hash);
@@ -1287,25 +1758,13 @@
 	else
 	{
 	    /* Already have this basic word in the hashtable, this one will
-	     * have different case flags. */
+	     * have different case flags and/or affixes. */
 	    fw2 = HI2FWORD(hi);
 	    fw->fw_next = fw2->fw_next;
 	    fw2->fw_next = fw;
 	    --widx;			/* don't count this one */
 	}
 
-	/* Optional prefixes and suffixes. */
-	if (flags & BWF_PREFIX)
-	    fw->fw_prefixcnt = spell_load_affixes(fd, lp, &bl_used,
-						       prefm, &fw->fw_prefix);
-	else
-	    fw->fw_prefixcnt = 0;
-	if (flags & BWF_SUFFIX)
-	    fw->fw_suffixcnt = spell_load_affixes(fd, lp, &bl_used,
-						       suffm, &fw->fw_suffix);
-	else
-	    fw->fw_suffixcnt = 0;
-
 	if (flags & BWF_REGION)
 	    fw->fw_region = getc(fd);			/* <region> */
 	else
@@ -1316,6 +1775,24 @@
 	{
 	    adds = (getc(fd) << 8) + getc(fd);		/* <addcnt> */
 
+	    if (adds > 30)
+	    {
+		/* Use a hashtable to look up the part until the next word end.
+		 * This uses more memory and involves some overhead, thus only
+		 * do it when there are many additions (e.g., for French).  */
+		ht = (hashtab_T *)getroom(lp, &bl_used, sizeof(hashtab_T));
+		if (ht == NULL)
+		    goto errorend;
+		hash_init(ht);
+		fw->fw_adds = (addword_T *)ht;
+		fw->fw_flags |= BWF_ADDHASH;
+
+		/* Preset the size of the hashtable. It's never unlocked. */
+		hash_lock_size(ht, adds + 1);
+	    }
+	    else
+		ht = NULL;
+
 	    while (--adds >= 0)
 	    {
 		/* <add>: <addflags> <addlen> [<leadlen> <addstring>]
@@ -1352,9 +1829,6 @@
 					   sizeof(addword_T) + addlen + flen);
 		if (aw == NULL)
 		    goto errorend;
-		aw->aw_next = fw->fw_adds;
-		fw->fw_adds = aw;
-		aw->aw_leadlen = leadlen;
 
 		if (flags & ADD_KEEPCAP)
 		{
@@ -1368,11 +1842,82 @@
 
 		aw->aw_flags = flags;
 		aw->aw_wordlen = addlen;
+		aw->aw_leadlen = leadlen;
 
 		if (flags & ADD_REGION)
 		    aw->aw_region = getc(fd);		/* <region> */
 		else
 		    aw->aw_region = REGION_ALL;
+
+		if (ht == NULL)
+		{
+		    /* Using simple linked list, put it in front. */
+		    aw->aw_next = fw->fw_adds;
+		    fw->fw_adds = aw;
+		    aw->aw_saveb = NUL;
+		}
+		else
+		{
+		    /* Put addition in hashtable.  For key we use the part up
+		     * to the next end-of-word. */
+		    if (leadlen == 0)
+		    {
+			p = aw->aw_word;
+			while (*p != NUL && !spell_iswordc(p))
+			    mb_ptr_adv(p);
+		    }
+
+		    if (leadlen != 0 || *p == NUL)
+		    {
+			/* Only non-word characters in addition, add it to the
+			 * list with the special key NOWC_KEY.  Also do this
+			 * when there is a leadstring, it would get too
+			 * complicated. */
+			hash = hash_hash(NOWC_KEY);
+			hi = hash_lookup(ht, NOWC_KEY, hash);
+			if (HASHITEM_EMPTY(hi))
+			{
+			    /* we use a dummy item as the list header */
+			    naw = (addword_T *)getroom(lp, &bl_used,
+					sizeof(addword_T) + STRLEN(NOWC_KEY));
+			    if (naw == NULL)
+				goto errorend;
+			    STRCPY(naw->aw_word, NOWC_KEY);
+			    hash_add_item(ht, hi, naw->aw_word, hash);
+			    naw->aw_next = aw;
+			    aw->aw_next = NULL;
+			}
+			else
+			{
+			    naw = HI2ADDWORD(hi);
+			    aw->aw_next = naw->aw_next;
+			    naw->aw_next = aw;
+			}
+			aw->aw_saveb = NUL;
+		    }
+		    else
+		    {
+			/* Truncate at next non-word character, store that
+			 * byte in "aw_saveb". */
+			while (*p != NUL && spell_iswordc(p))
+			    mb_ptr_adv(p);
+			aw->aw_saveb = *p;
+			*p = NUL;
+			hash = hash_hash(aw->aw_word);
+			hi = hash_lookup(ht, aw->aw_word, hash);
+			if (HASHITEM_EMPTY(hi))
+			{
+			    hash_add_item(ht, hi, aw->aw_word, hash);
+			    aw->aw_next = NULL;
+			}
+			else
+			{
+			    naw = HI2ADDWORD(hi);
+			    aw->aw_next = naw->aw_next;
+			    naw->aw_next = aw;
+			}
+		    }
+		}
 	    }
 	}
     }
@@ -1389,49 +1934,6 @@
 }
 
 /*
- * Read a list of affixes from the spell file.
- */
-    static int
-spell_load_affixes(fd, lp, bl_usedp, affm, affp)
-    FILE	*fd;
-    slang_T	*lp;
-    int		*bl_usedp;
-    int		affm;
-    void	**affp;
-{
-    int		cnt;
-    int		i, n;
-    char_u	*p;
-
-    cnt = getc(fd);				/* <affixcnt> */
-    if (cnt == EOF)
-	return 0;
-
-    /* Get room to store the affixNR list, either as char_u (1
-     * byte) or short_u (2 bytes). */
-    p = (char_u *)getroom(lp, bl_usedp, cnt * affm);
-    if (p == NULL)
-	return 0;
-    *affp = p;
-    for (n = 0; n < cnt; ++n)
-    {
-	i = getc(fd);			/* <affixNR> */
-	if (affm > 1)
-	{
-	    i = (i << 8) + getc(fd);
-	    *(short_u *)p = i;
-	    p += sizeof(short_u);
-	}
-	else
-	{
-	    *(char_u *)p = i;
-	    ++p;
-	}
-    }
-    return cnt;
-}
-
-/*
  * Get part of an sblock_T, at least "len" bytes long.
  * Returns NULL when out of memory.
  */
@@ -1739,6 +2241,7 @@
 				 * char after word char.  NULL otherwise. */
     char_u	*ae_add_pw;	/* For a prefix with both word and non-word
 			         * chars: first word char.  NULL otherwise. */
+    char_u	ae_preword;	/* TRUE for a prefix with one word */
     char_u	*ae_cond;	/* condition (NULL for ".") */
     regprog_T	*ae_prog;	/* regexp program for ae_cond or NULL */
     short_u	ae_affnr;	/* for old affix: new affix number */
@@ -2237,6 +2740,7 @@
     int		retval = OK;
     char_u	*p;
     garray_T	tga;
+    int		preword;
 
     /*
      * Loop over all the old affix names.
@@ -2268,12 +2772,14 @@
 	    for (oldae = oldah->ah_first; oldae != NULL && retval == OK;
 						       oldae = oldae->ae_next)
 	    {
+		preword = FALSE;
 		oldae->ae_add_nw = NULL;
 		oldae->ae_add_pw = NULL;
 		if (oldae->ae_add != NULL)
 		{
 		    /* Check for non-word characters in the affix.  If there
-		     * is one this affix will be turned into an addition.
+		     * is one a suffix will be turned into an addition, a
+		     * prefix may be turned into a leadstring.
 		     * This is stored with the old affix, that is where
 		     * trans_affixes() will check. */
 		    for (p = oldae->ae_add; *p != NUL; mb_ptr_adv(p))
@@ -2285,9 +2791,17 @@
 
 		    if (prefix && oldae->ae_add_nw != NULL)
 		    {
-			/* If a prefix has both word and non-word characters
-			 * special treatment is necessary.  If it has only
-			 * non-word characters it becomes a leadstring. */
+			/* If a prefix has non-word characters special
+			 * treatment is necessary:
+			 * - If it has only non-word characters it becomes a
+			 *   leadstring.
+			 * - If it has a sequence of word characters followed
+			 *   by a non-word char it becomes a "preword": "d'",
+			 *   "de-", "d'ai", etc.
+			 * - If it has another mix of word and non-word
+			 *   characters the part before the last word char
+			 *   becomes a leadstring: "'d", etc.
+			 */
 			for (p = oldae->ae_add; *p != NUL; mb_ptr_adv(p))
 			    if (spell_iswordc(p))
 			    {
@@ -2305,6 +2819,12 @@
 				    oldae->ae_add_nw = p;
 				    break;
 				}
+			    if (oldae->ae_add_nw != NULL)
+			    {
+				preword = TRUE;
+				oldae->ae_add_pw = NULL;
+				oldae->ae_add_nw = NULL;
+			    }
 			}
 		    }
 		}
@@ -2355,6 +2875,7 @@
 			newae->ae_add = NULL;
 		    else
 			newae->ae_add = vim_strsave(oldae->ae_add);
+		    newae->ae_preword = preword;
 
 		    /* The condition is not copied, since the new affix is
 		     * only used for words where the condition matches. */
@@ -2369,7 +2890,8 @@
 		for (gapnr = 0; gapnr < gap->ga_len; ++gapnr)
 		{
 		    gapah = ((affheader_T *)gap->ga_data) + gapnr;
-		    if (same_affentries(newah, gapah))
+		    if (newah->ah_combine == gapah->ah_combine
+					     && same_affentries(newah, gapah))
 			/* Found an existing affheader_T entry with same
 			 * affentry_T list, use its number. */
 			break;
@@ -2569,8 +3091,12 @@
 	ah = HI2AH(aff_hi);
 	for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next)
 	{
+	    /* Setup for regexp matching.  Note that we don't ignore case.
+	     * This is weird, because the rules in an .aff file don't care
+	     * about case, but it's necessary for compatibility with Myspell.
+	     */
 	    regmatch.regprog = ae->ae_prog;
-	    regmatch.rm_ic = FALSE;	/* TODO: Should this be TRUE??? */
+	    regmatch.rm_ic = FALSE;
 	    if (ae->ae_prog == NULL
 			   || vim_regexec(&regmatch, dw->dw_word, (colnr_T)0))
 	    {
@@ -3095,9 +3621,14 @@
 }
 
 /*
- * For each basic word with additions turn the affixes into other additions
- * and/or new basic words.  The result is that no affixes apply to a word with
- * additions.
+ * For each basic word with additions turn the suffixes into other additions
+ * and/or new basic words.  For each basic word with a leadstring turn the
+ * prefixes into other leadstrings and/or new basic words.
+ * The result is that no affixes apply to the additions or leadstring of a
+ * word.
+ * This is also needed when a word with an addition has a prefix and the word
+ * with prefix also exists.  E.g., "blurp's/D" (D is prefix "de") and
+ * "deblurp".  "deblurp" would match and no prefix would be tried.
  */
     static void
 expand_affixes(newwords, prefgap, suffgap)
@@ -3165,6 +3696,9 @@
 			 * when there is no prefix (pi == -1). */
 			do
 			{
+			  /* Skip prewords, they don't need to be expanded. */
+			  if (pae == NULL || !pae->ae_preword)
+			  {
 			    /* Loop over all suffix numbers.  Do without a
 			     * suffix first when there is a prefix. */
 			    for (si = (pi == -1 ? 0 : -1);
@@ -3193,10 +3727,11 @@
 					sae = sae->ae_next;
 				} while (sae != NULL);
 			    }
+			  }
 
-			    /* Advance to next prefix entry, if there is one. */
-			    if (pae != NULL)
-				pae = pae->ae_next;
+			  /* Advance to next prefix entry, if there is one. */
+			  if (pae != NULL)
+			      pae = pae->ae_next;
 			} while (pae != NULL);
 		    }
 		}
@@ -3217,6 +3752,8 @@
 /*
  * Add one word to "add_words" for basic word "bw" with additions, adding
  * prefix "pae" and suffix "sae".  Either "pae" or "sae" can be NULL.
+ * Don't do this when not necessary:
+ * - no leadstring and adding prefix doesn't result in existing word.
  */
     static void
 expand_one_aff(bw, add_words, pae, sae)
@@ -3329,6 +3866,17 @@
 	    else
 		nbw->bw_leadstring = vim_strsave(bw->bw_leadstring);
 	}
+	else if (bw->bw_prefix.ga_len > 0)
+	{
+	    /* There is no leadstring, copy the list of possible prefixes. */
+	    ga_init2(&nbw->bw_prefix, sizeof(short_u), 1);
+	    if (ga_grow(&nbw->bw_prefix, bw->bw_prefix.ga_len) == OK)
+	    {
+		mch_memmove(nbw->bw_prefix.ga_data, bw->bw_prefix.ga_data,
+				      bw->bw_prefix.ga_len * sizeof(short_u));
+		nbw->bw_prefix.ga_len = bw->bw_prefix.ga_len;
+	    }
+	}
 
 	if (bw->bw_addstring != NULL)
 	{
@@ -3405,7 +3953,7 @@
 }
 
 /*
- * Write affix info.  <affflags> <affitemcnt> <affitem> ...
+ * Write affix info.  <affitemcnt> <affitem> ...
  */
     static void
 write_affix(fd, ah)
@@ -3416,15 +3964,21 @@
     affentry_T	*ae;
     char_u	*p;
     int		round;
-
-    fputc(ah->ah_combine ? 1 : 0, fd);		/* <affflags> */
+    int		flags;
 
     /* Count the number of entries. */
     for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next)
 	++i;
     put_bytes(fd, (long_u)i, 2);		/* <affitemcnt> */
 
+    /* <affitem>: <affflags> <affchoplen> <affchop> <affaddlen> <affadd> */
     for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next)
+    {
+	flags = ah->ah_combine ? AFF_COMBINE : 0;
+	if (ae->ae_preword)
+	    flags |= AFF_PREWORD;
+	fputc(flags, fd);			/* <affflags> */
+
 	for (round = 1; round <= 2; ++round)
 	{
 	    p = round == 1 ? ae->ae_chop : ae->ae_add;
@@ -3437,6 +3991,7 @@
 		fwrite(p, STRLEN(p), (size_t)1, fd);
 	    }
 	}
+    }
 }
 
 /*
@@ -3464,7 +4019,7 @@
  *
  * <HEADER>: <fileID> <regioncnt> <regionname> ...
  *
- * <fileID>     10 bytes    "VIMspell01"
+ * <fileID>     10 bytes    "VIMspell02"
  * <regioncnt>  1 byte	    number of regions following (8 supported)
  * <regionname>	2 bytes     Region name: ca, au, etc.
  *			    First <regionname> is region 1.
@@ -3480,14 +4035,15 @@
  *                          First affixNR is 0.
  * <afftotcnt>	2 bytes	    Total number of affix items (MSB comes first).
  *
- * <affix>: <affflags> <affitemcnt> <affitem> ...
+ * <affix>: <affitemcnt> <affitem> ...
  *
- * <affflags>	1 byte	    0x01: prefix combines with suffix.
- *			    0x02-0x80: unset
  * <affitemcnt> 2 bytes	    Number of affixes with this affixNR (MSB first).
  *
- * <affitem>: <affchoplen> <affchop> <affaddlen> <affadd>
+ * <affitem>: <affflags> <affchoplen> <affchop> <affaddlen> <affadd>
  *
+ * <affflags>	1 byte	    0x01: prefix combines with suffix, AFF_COMBINE
+ *			    0x02: prefix includes word, AFF_PREWORD
+ *			    0x04-0x80: unset
  * <affchoplen> 1 byte	    Length of <affchop> in bytes.
  * <affchop>    N bytes     To be removed from basic word.
  * <affaddlen>  1 byte	    Length of <affadd> in bytes.
@@ -3516,16 +4072,19 @@
  * <string>	N bytes	    Additional bytes for word, up to byte smaller than
  *			    0x20 (space).
  *			    Must only contain case-folded word characters.
- * <flags>	1 byte	    0x01: word is valid without addition
- *			    0x02: has region byte
- *			    0x04: first letter must be upper-case
+ * <flags>	1 byte	    0x01: word is valid without addition, BWF_VALID
+ *			    0x02: has region byte, BWF_REGION
+ *			    0x04: first letter must be upper-case, BWF_ONECAP
  *			    0x08: has suffixes, <affixcnt> and <affixNR> follow
- *			    0x10: more flags, <flags2> follows next
+ *					BWF_SUFFIX
+ *			    0x10: more flags, <flags2> follows next, BWF_SECOND
  *			    0x20-0x80: can't be used, unset
- * <flags2>	1 byte	    0x01: has additions, <addcnt> and <add> follow
+ * <flags2>	1 byte	    0x01: has additions, <addcnt> and <add> follow,
+ *					BWF_ADDS
  *			    0x02: has prefixes, <affixcnt> and <affixNR> follow
- *			    0x04: all letters must be upper-case
- *			    0x08: case must match
+ *					BWF_PREFIX
+ *			    0x04: all letters must be upper-case, BWF_ALLCAP
+ *			    0x08: case must match, BWF_KEEPCAP
  *			    0x10-0x80: unset
  * <caselen>	1 byte	    Length of <caseword>.
  * <caseword>	N bytes	    Word with matching case.
@@ -3539,12 +4098,13 @@
  *
  * <add>: <addflags> <addlen> [<leadlen> <addstring>] [<region>]
  *
- * <addflags>	1 byte	    0x01: fixed case, <addstring> is the whole word
- *				  with matching case.
- *			    0x02: first letter must be upper-case
- *			    0x04: all letters must be upper-case
- *			    0x08: has region byte
- *			    0x10-0x80: unset
+ * <addflags>	1 byte	    0x01: unset
+ *			    0x02: has region byte, ADD_REGION
+ *			    0x04: first letter must be upper-case, ADD_ONECAP
+ *			    0x08-0x20: unset
+ *			    0x40: all letters must be upper-case, ADD_ALLCAP
+ *			    0x80: fixed case, <addstring> is the whole word
+ *				  with matching case, ADD_KEEPCAP.
  * <addlen>	1 byte	    Length of <addstring> in bytes.
  * <leadlen>	1 byte	    Number of bytes at start of <addstring> that must
  *			    come before the start of the basic word.
@@ -3798,17 +4358,16 @@
     {
 	flags |= BWF_VALID;
 
-	/* Add the prefix/suffix list if there are prefixes/suffixes. */
-	if (bw->bw_prefix.ga_len > 0)
-	    flags |= BWF_PREFIX;
-	if (bw->bw_suffix.ga_len > 0)
-	    flags |= BWF_SUFFIX;
-
 	/* Flags: add the region byte if the word isn't valid in all
 	 * regions. */
 	if (regionmask != 0 && (bw->bw_region & regionmask) != regionmask)
 	    flags |= BWF_REGION;
     }
+    /* Add the prefix/suffix list if there are prefixes/suffixes. */
+    if (bw->bw_leadstring == NULL && bw->bw_prefix.ga_len > 0)
+	flags |= BWF_PREFIX;
+    if (bw->bw_addstring == NULL && bw->bw_suffix.ga_len > 0)
+	flags |= BWF_SUFFIX;
 
     /* Flags: may have additions. */
     if (adds > 0)
commit	63d5a1e57b225dd90ac6d5f24876e737683961ee	[log] [tgz]
author	Bram Moolenaar <Bram@vim.org>	Tue Apr 19 21:30:25 2005 +0000
committer	Bram Moolenaar <Bram@vim.org>	Tue Apr 19 21:30:25 2005 +0000
tree	c78fcd0186ca5d7ac3f69b818c1ce9ae2dd5191c
parent	0e21a3f623bc9766953882f30326783f76df39a0 [diff] [blame]