patch 8.2.1893: fuzzy matching does not support multiple words
Problem: Fuzzy matching does not support multiple words.
Solution: Add support for matching white space separated words. (Yegappan
Lakshmanan, closes #7163)
diff --git a/src/search.c b/src/search.c
index 8133df4..18d545a 100644
--- a/src/search.c
+++ b/src/search.c
@@ -4203,16 +4203,16 @@
* Ported from the lib_fts library authored by Forrest Smith.
* https://github.com/forrestthewoods/lib_fts/tree/master/code
*
- * Blog describing the algorithm:
+ * The following blog describes the fuzzy matching algorithm:
* https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/
*
* Each matching string is assigned a score. The following factors are checked:
- * Matched letter
- * Unmatched letter
- * Consecutively matched letters
- * Proximity to start
- * Letter following a separator (space, underscore)
- * Uppercase letter following lowercase (aka CamelCase)
+ * - Matched letter
+ * - Unmatched letter
+ * - Consecutively matched letters
+ * - Proximity to start
+ * - Letter following a separator (space, underscore)
+ * - Uppercase letter following lowercase (aka CamelCase)
*
* Matched letters are good. Unmatched letters are bad. Matching near the start
* is good. Matching the first letter in the middle of a phrase is good.
@@ -4222,16 +4222,17 @@
* File paths are different from file names. File extensions may be ignorable.
* Single words care about consecutive matches but not separators or camel
* case.
- * Score starts at 0
+ * Score starts at 100
* Matched letter: +0 points
* Unmatched letter: -1 point
- * Consecutive match bonus: +5 points
- * Separator bonus: +10 points
- * Camel case bonus: +10 points
- * Unmatched leading letter: -3 points (max: -9)
+ * Consecutive match bonus: +15 points
+ * First letter bonus: +15 points
+ * Separator bonus: +30 points
+ * Camel case bonus: +30 points
+ * Unmatched leading letter: -5 points (max: -15)
*
* There is some nuance to this. Scores don’t have an intrinsic meaning. The
- * score range isn’t 0 to 100. It’s roughly [-50, 50]. Longer words have a
+ * score range isn’t 0 to 100. It’s roughly [50, 150]. Longer words have a
* lower minimum score due to unmatched letter penalty. Longer search patterns
* have a higher maximum score due to match bonuses.
*
@@ -4247,6 +4248,7 @@
*/
typedef struct
{
+ int idx; // used for stable sort
listitem_T *item;
int score;
list_T *lmatchpos;
@@ -4267,6 +4269,8 @@
#define MAX_LEADING_LETTER_PENALTY -15
// penalty for every letter that doesn't match
#define UNMATCHED_LETTER_PENALTY -1
+// penalty for gap in matching positions (-2 * k)
+#define GAP_PENALTY -2
// Score for a string that doesn't fuzzy match the pattern
#define SCORE_NONE -9999
@@ -4319,6 +4323,8 @@
// Sequential
if (currIdx == (prevIdx + 1))
score += SEQUENTIAL_BONUS;
+ else
+ score += GAP_PENALTY * (currIdx - prevIdx);
}
// Check for bonuses based on neighbor character value
@@ -4334,7 +4340,7 @@
while (sidx < currIdx)
{
neighbor = (*mb_ptr2char)(p);
- (void)mb_ptr2char_adv(&p);
+ MB_PTR_ADV(p);
sidx++;
}
curr = (*mb_ptr2char)(p);
@@ -4362,6 +4368,10 @@
return score;
}
+/*
+ * Perform a recursive search for fuzzy matching 'fuzpat' in 'str'.
+ * Return the number of matching characters.
+ */
static int
fuzzy_match_recursive(
char_u *fuzpat,
@@ -4386,11 +4396,11 @@
// Count recursions
++*recursionCount;
if (*recursionCount >= FUZZY_MATCH_RECURSION_LIMIT)
- return FALSE;
+ return 0;
// Detect end of strings
if (*fuzpat == '\0' || *str == '\0')
- return FALSE;
+ return 0;
// Loop through fuzpat and str looking for a match
first_match = TRUE;
@@ -4411,7 +4421,7 @@
// Supplied matches buffer was too short
if (nextMatch >= maxMatches)
- return FALSE;
+ return 0;
// "Copy-on-Write" srcMatches into matches
if (first_match && srcMatches)
@@ -4444,12 +4454,12 @@
// Advance
matches[nextMatch++] = strIdx;
if (has_mbyte)
- (void)mb_ptr2char_adv(&fuzpat);
+ MB_PTR_ADV(fuzpat);
else
++fuzpat;
}
if (has_mbyte)
- (void)mb_ptr2char_adv(&str);
+ MB_PTR_ADV(str);
else
++str;
strIdx++;
@@ -4469,12 +4479,12 @@
// Recursive score is better than "this"
memcpy(matches, bestRecursiveMatches, maxMatches * sizeof(matches[0]));
*outScore = bestRecursiveScore;
- return TRUE;
+ return nextMatch;
}
else if (matched)
- return TRUE; // "this" score is better than recursive
+ return nextMatch; // "this" score is better than recursive
- return FALSE; // no match
+ return 0; // no match
}
/*
@@ -4485,45 +4495,110 @@
* Scores values have no intrinsic meaning. Possible score range is not
* normalized and varies with pattern.
* Recursion is limited internally (default=10) to prevent degenerate cases
- * (fuzpat="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").
+ * (pat_arg="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").
* Uses char_u for match indices. Therefore patterns are limited to MAXMATCHES
* characters.
*
- * Returns TRUE if 'fuzpat' matches 'str'. Also returns the match score in
+ * Returns TRUE if 'pat_arg' matches 'str'. Also returns the match score in
* 'outScore' and the matching character positions in 'matches'.
*/
static int
fuzzy_match(
char_u *str,
- char_u *fuzpat,
+ char_u *pat_arg,
+ int matchseq,
int *outScore,
matchidx_T *matches,
int maxMatches)
{
int recursionCount = 0;
int len = MB_CHARLEN(str);
+ char_u *save_pat;
+ char_u *pat;
+ char_u *p;
+ int complete = FALSE;
+ int score = 0;
+ int numMatches = 0;
+ int matchCount;
*outScore = 0;
- return fuzzy_match_recursive(fuzpat, str, 0, outScore, str, len, NULL,
- matches, maxMatches, 0, &recursionCount);
+ save_pat = vim_strsave(pat_arg);
+ if (save_pat == NULL)
+ return FALSE;
+ pat = save_pat;
+ p = pat;
+
+ // Try matching each word in 'pat_arg' in 'str'
+ while (TRUE)
+ {
+ if (matchseq)
+ complete = TRUE;
+ else
+ {
+ // Extract one word from the pattern (separated by space)
+ p = skipwhite(p);
+ if (*p == NUL)
+ break;
+ pat = p;
+ while (*p != NUL && !VIM_ISWHITE(PTR2CHAR(p)))
+ {
+ if (has_mbyte)
+ MB_PTR_ADV(p);
+ else
+ ++p;
+ }
+ if (*p == NUL) // processed all the words
+ complete = TRUE;
+ *p = NUL;
+ }
+
+ score = 0;
+ recursionCount = 0;
+ matchCount = fuzzy_match_recursive(pat, str, 0, &score, str, len, NULL,
+ matches + numMatches, maxMatches - numMatches,
+ 0, &recursionCount);
+ if (matchCount == 0)
+ {
+ numMatches = 0;
+ break;
+ }
+
+ // Accumulate the match score and the number of matches
+ *outScore += score;
+ numMatches += matchCount;
+
+ if (complete)
+ break;
+
+ // try matching the next word
+ ++p;
+ }
+
+ vim_free(save_pat);
+ return numMatches != 0;
}
/*
* Sort the fuzzy matches in the descending order of the match score.
+ * For items with same score, retain the order using the index (stable sort)
*/
static int
-fuzzy_item_compare(const void *s1, const void *s2)
+fuzzy_match_item_compare(const void *s1, const void *s2)
{
int v1 = ((fuzzyItem_T *)s1)->score;
int v2 = ((fuzzyItem_T *)s2)->score;
+ int idx1 = ((fuzzyItem_T *)s1)->idx;
+ int idx2 = ((fuzzyItem_T *)s2)->idx;
- return v1 == v2 ? 0 : v1 > v2 ? -1 : 1;
+ return v1 == v2 ? (idx1 - idx2) : v1 > v2 ? -1 : 1;
}
/*
* Fuzzy search the string 'str' in a list of 'items' and return the matching
* strings in 'fmatchlist'.
+ * If 'matchseq' is TRUE, then for multi-word search strings, match all the
+ * words in sequence.
* If 'items' is a list of strings, then search for 'str' in the list.
* If 'items' is a list of dicts, then either use 'key' to lookup the string
* for each item or use 'item_cb' Funcref function to get the string.
@@ -4531,9 +4606,10 @@
* matches for each item.
*/
static void
-match_fuzzy(
+fuzzy_match_in_list(
list_T *items,
char_u *str,
+ int matchseq,
char_u *key,
callback_T *item_cb,
int retmatchpos,
@@ -4561,6 +4637,7 @@
char_u *itemstr;
typval_T rettv;
+ ptrs[i].idx = i;
ptrs[i].item = li;
ptrs[i].score = SCORE_NONE;
itemstr = NULL;
@@ -4593,25 +4670,34 @@
}
if (itemstr != NULL
- && fuzzy_match(itemstr, str, &score, matches,
+ && fuzzy_match(itemstr, str, matchseq, &score, matches,
sizeof(matches) / sizeof(matches[0])))
{
// Copy the list of matching positions in itemstr to a list, if
// 'retmatchpos' is set.
if (retmatchpos)
{
- int j;
- int strsz;
+ int j = 0;
+ char_u *p;
ptrs[i].lmatchpos = list_alloc();
if (ptrs[i].lmatchpos == NULL)
goto done;
- strsz = MB_CHARLEN(str);
- for (j = 0; j < strsz; j++)
+
+ p = str;
+ while (*p != NUL)
{
- if (list_append_number(ptrs[i].lmatchpos,
- matches[j]) == FAIL)
- goto done;
+ if (!VIM_ISWHITE(PTR2CHAR(p)))
+ {
+ if (list_append_number(ptrs[i].lmatchpos,
+ matches[j]) == FAIL)
+ goto done;
+ j++;
+ }
+ if (has_mbyte)
+ MB_PTR_ADV(p);
+ else
+ ++p;
}
}
ptrs[i].score = score;
@@ -4627,7 +4713,7 @@
// Sort the list by the descending order of the match score
qsort((void *)ptrs, (size_t)len, sizeof(fuzzyItem_T),
- fuzzy_item_compare);
+ fuzzy_match_item_compare);
// For matchfuzzy(), return a list of matched strings.
// ['str1', 'str2', 'str3']
@@ -4687,6 +4773,7 @@
callback_T cb;
char_u *key = NULL;
int ret;
+ int matchseq = FALSE;
CLEAR_POINTER(&cb);
@@ -4737,6 +4824,8 @@
return;
}
}
+ if ((di = dict_find(d, (char_u *)"matchseq", -1)) != NULL)
+ matchseq = TRUE;
}
// get the fuzzy matches
@@ -4762,8 +4851,8 @@
goto done;
}
- match_fuzzy(argvars[0].vval.v_list, tv_get_string(&argvars[1]), key,
- &cb, retmatchpos, rettv->vval.v_list);
+ fuzzy_match_in_list(argvars[0].vval.v_list, tv_get_string(&argvars[1]),
+ matchseq, key, &cb, retmatchpos, rettv->vval.v_list);
done:
free_callback(&cb);