blob: 17e932d0032a3124a8997904a688e7b0d0757086 [file] [log] [blame]
Jonathon7c7a4e62025-01-12 09:58:00 +01001/* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * VIM - Vi IMproved by Bram Moolenaar
4 *
5 * Do ":help uganda" in Vim to read copying and usage conditions.
6 * Do ":help credits" in Vim to see a list of people who contributed.
7 * See README.txt for an overview of the Vim source code.
8 */
9
10#include "vim.h"
11
12#define LN_MAX_BUFS 8
13#define LN_DECISION_MAX 255 // pow(2, LN_MAX_BUFS(8)) - 1 = 255
14
15// struct for running the diff linematch algorithm
16typedef struct diffcmppath_S diffcmppath_T;
17struct diffcmppath_S
18{
19 // to keep track of the total score of this path
20 int df_lev_score;
21 size_t df_path_n; // current index of this path
22 int df_choice_mem[LN_DECISION_MAX + 1];
23 int df_choice[LN_DECISION_MAX];
24 // to keep track of this path traveled
25 diffcmppath_T *df_decision[LN_DECISION_MAX];
26 size_t df_optimal_choice;
27};
28
29static int matching_chars(const mmfile_t *m1, const mmfile_t *m2);
30static size_t unwrap_indexes(const int *values, const int *diff_len, const size_t ndiffs);
31static size_t test_charmatch_paths(diffcmppath_T *node, int lastdecision);
32
33 static size_t
34line_len(const mmfile_t *m)
35{
36 char *s = m->ptr;
Jonathon7c7a4e62025-01-12 09:58:00 +010037 char *end;
38
zeertzjq34e1e8d2025-02-04 16:48:36 +010039 end = memchr(s, '\n', (size_t)m->size);
40 return end ? (size_t)(end - s) : (size_t)m->size;
Jonathon7c7a4e62025-01-12 09:58:00 +010041}
42
43#define MATCH_CHAR_MAX_LEN 800
44
45/// Same as matching_chars but ignore whitespace
46///
47/// @param s1
48/// @param s2
49 static int
50matching_chars_iwhite(const mmfile_t *s1, const mmfile_t *s2)
51{
52 // the newly processed strings that will be compared
53 // delete the white space characters
54 mmfile_t sp[2];
55 char p[2][MATCH_CHAR_MAX_LEN];
56
57 for (int k = 0; k < 2; k++)
58 {
59 const mmfile_t *s = k == 0 ? s1 : s2;
60 size_t pi = 0;
61 size_t slen = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(s));
62
63 for (size_t i = 0; i <= slen; i++)
64 {
65 char e = s->ptr[i];
66
67 if (e != ' ' && e != '\t')
68 {
69 p[k][pi] = e;
70 pi++;
71 }
72 }
73
74 sp[k].ptr = p[k];
75 sp[k].size = (int)pi;
76 }
77
78 return matching_chars(&sp[0], &sp[1]);
79}
80
81/// Return matching characters between "s1" and "s2" whilst respecting sequence
82/// order.
83/// Consider the case of two strings 'AAACCC' and 'CCCAAA', the
84/// return value from this function will be 3, either to match
85/// the 3 C's, or the 3 A's.
86///
87/// Examples:
88/// matching_chars("aabc", "acba") -> 2 // 'a' and 'b' in common
89/// matching_chars("123hello567", "he123ll567o") -> 8 // '123', 'll' and '567' in common
90/// matching_chars("abcdefg", "gfedcba") -> 1 // all characters in common,
91/// // but only at most 1 in sequence
92///
93/// @param m1
94/// @param m2
95 static int
96matching_chars(const mmfile_t *m1, const mmfile_t *m2)
97{
98 size_t s1len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m1));
99 size_t s2len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m2));
100 char *s1 = m1->ptr;
101 char *s2 = m2->ptr;
102 int matrix[2][MATCH_CHAR_MAX_LEN] = { 0 };
103 int icur = 1; // save space by storing only two rows for i axis
104
105 for (size_t i = 0; i < s1len; i++)
106 {
107 icur = (icur == 1 ? 0 : 1);
108 int *e1 = matrix[icur];
109 int *e2 = matrix[!icur];
110
111 for (size_t j = 0; j < s2len; j++)
112 {
113 // skip char in s1
114 if (e2[j + 1] > e1[j + 1])
115 e1[j + 1] = e2[j + 1];
116 // skip char in s2
117 if (e1[j] > e1[j + 1])
118 e1[j + 1] = e1[j];
119 // compare char in s1 and s2
120 if ((s1[i] == s2[j]) && (e2[j] + 1) > e1[j + 1])
121 e1[j + 1] = e2[j] + 1;
122 }
123 }
124
125 return matrix[icur][s2len];
126}
127
128/// count the matching characters between a variable number of strings "sp"
129/// mark the strings that have already been compared to extract them later
130/// without re-running the character match counting.
131/// @param sp
132/// @param fomvals
133/// @param n
134 static int
135count_n_matched_chars(mmfile_t **sp, const size_t n, int iwhite)
136{
137 int matched_chars = 0;
138 int matched = 0;
139
140 for (size_t i = 0; i < n; i++)
141 {
142 for (size_t j = i + 1; j < n; j++)
143 {
144 if (sp[i]->ptr != NULL && sp[j]->ptr != NULL)
145 {
146 matched++;
147 // TODO(lewis6991): handle whitespace ignoring higher up in the
148 // stack
149 matched_chars += iwhite ? matching_chars_iwhite(sp[i], sp[j])
150 : matching_chars(sp[i], sp[j]);
151 }
152 }
153 }
154
155 // prioritize a match of 3 (or more lines) equally to a match of 2 lines
156 if (matched >= 2)
157 {
158 matched_chars *= 2;
159 matched_chars /= matched;
160 }
161
162 return matched_chars;
163}
164
165 static mmfile_t
166fastforward_buf_to_lnum(mmfile_t s, linenr_T lnum)
167{
168 for (int i = 0; i < lnum - 1; i++)
169 {
zeertzjq34e1e8d2025-02-04 16:48:36 +0100170 char *line_end;
Jonathon7c7a4e62025-01-12 09:58:00 +0100171
zeertzjq34e1e8d2025-02-04 16:48:36 +0100172 line_end = memchr(s.ptr, '\n', (size_t)s.size);
173 s.size = line_end ? (int)(s.size - (line_end - s.ptr)) : 0;
174 s.ptr = line_end;
Jonathon7c7a4e62025-01-12 09:58:00 +0100175 if (!s.ptr)
176 break;
177 s.ptr++;
178 s.size--;
179 }
180
181 return s;
182}
183
184/// try all the different ways to compare these lines and use the one that
185/// results in the most matching characters
186/// @param df_iters
187/// @param paths
188/// @param npaths
189/// @param path_idx
190/// @param choice
191/// @param diffcmppath
192/// @param diff_len
193/// @param ndiffs
194/// @param diff_blk
195 static void
196try_possible_paths(
197 const int *df_iters,
198 const size_t *paths,
199 const int npaths,
200 const int path_idx,
201 int *choice,
202 diffcmppath_T *diffcmppath,
203 const int *diff_len,
204 const size_t ndiffs,
205 const mmfile_t **diff_blk,
206 int iwhite)
207{
208 if (path_idx == npaths)
209 {
210 if ((*choice) > 0)
211 {
212 int from_vals[LN_MAX_BUFS] = { 0 };
213 const int *to_vals = df_iters;
214
215 mmfile_t mm[LN_MAX_BUFS]; // stack memory for current_lines
216 mmfile_t *current_lines[LN_MAX_BUFS];
217 for (size_t k = 0; k < ndiffs; k++)
218 {
219 from_vals[k] = df_iters[k];
220 // get the index at all of the places
221 if ((*choice) & (1 << k))
222 {
223 from_vals[k]--;
224 mm[k] = fastforward_buf_to_lnum(*diff_blk[k], df_iters[k]);
225 }
226 else
227 CLEAR_FIELD(mm[k]);
228 current_lines[k] = &mm[k];
229 }
230 size_t unwrapped_idx_from = unwrap_indexes(from_vals, diff_len, ndiffs);
231 size_t unwrapped_idx_to = unwrap_indexes(to_vals, diff_len, ndiffs);
232 int matched_chars = count_n_matched_chars(current_lines, ndiffs, iwhite);
233 int score = diffcmppath[unwrapped_idx_from].df_lev_score + matched_chars;
234
235 if (score > diffcmppath[unwrapped_idx_to].df_lev_score)
236 {
237 diffcmppath[unwrapped_idx_to].df_path_n = 1;
238 diffcmppath[unwrapped_idx_to].df_decision[0] =
239 &diffcmppath[unwrapped_idx_from];
240 diffcmppath[unwrapped_idx_to].df_choice[0] = *choice;
241 diffcmppath[unwrapped_idx_to].df_lev_score = score;
242 }
243 else if (score == diffcmppath[unwrapped_idx_to].df_lev_score)
244 {
245 size_t k = diffcmppath[unwrapped_idx_to].df_path_n++;
246 diffcmppath[unwrapped_idx_to].df_decision[k] =
247 &diffcmppath[unwrapped_idx_from];
248 diffcmppath[unwrapped_idx_to].df_choice[k] = *choice;
249 }
250 }
251 return;
252 }
253
254 size_t bit_place = paths[path_idx];
255 *(choice) |= (1 << bit_place); // set it to 1
256 try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice,
257 diffcmppath, diff_len, ndiffs, diff_blk, iwhite);
258 *(choice) &= ~(1 << bit_place); // set it to 0
259 try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice,
260 diffcmppath, diff_len, ndiffs, diff_blk, iwhite);
261}
262
263/// unwrap indexes to access n dimensional tensor
264/// @param values
265/// @param diff_len
266/// @param ndiffs
267 static size_t
268unwrap_indexes(const int *values, const int *diff_len, const size_t ndiffs)
269{
270 size_t num_unwrap_scalar = 1;
271
272 for (size_t k = 0; k < ndiffs; k++)
273 num_unwrap_scalar *= (size_t)diff_len[k] + 1;
274
275 size_t path_idx = 0;
276 for (size_t k = 0; k < ndiffs; k++)
277 {
278 num_unwrap_scalar /= (size_t)diff_len[k] + 1;
279
280 int n = values[k];
281 path_idx += num_unwrap_scalar * (size_t)n;
282 }
283
284 return path_idx;
285}
286
287/// populate the values of the linematch algorithm tensor, and find the best
288/// decision for how to compare the relevant lines from each of the buffers at
289/// each point in the tensor
290/// @param df_iters
291/// @param ch_dim
292/// @param diffcmppath
293/// @param diff_len
294/// @param ndiffs
295/// @param diff_blk
296 static void
297populate_tensor(
298 int *df_iters,
299 const size_t ch_dim,
300 diffcmppath_T *diffcmppath,
301 const int *diff_len,
302 const size_t ndiffs,
303 const mmfile_t **diff_blk,
304 int iwhite)
305{
306 if (ch_dim == ndiffs)
307 {
308 int npaths = 0;
309 size_t paths[LN_MAX_BUFS];
310
311 for (size_t j = 0; j < ndiffs; j++)
312 {
313 if (df_iters[j] > 0)
314 {
315 paths[npaths] = j;
316 npaths++;
317 }
318 }
319
320 int choice = 0;
321 size_t unwrapper_idx_to = unwrap_indexes(df_iters, diff_len, ndiffs);
322
323 diffcmppath[unwrapper_idx_to].df_lev_score = -1;
324 try_possible_paths(df_iters, paths, npaths, 0, &choice, diffcmppath,
325 diff_len, ndiffs, diff_blk, iwhite);
326 return;
327 }
328
329 for (int i = 0; i <= diff_len[ch_dim]; i++)
330 {
331 df_iters[ch_dim] = i;
332 populate_tensor(df_iters, ch_dim + 1, diffcmppath, diff_len,
333 ndiffs, diff_blk, iwhite);
334 }
335}
336
337/// algorithm to find an optimal alignment of lines of a diff block with 2 or
338/// more files. The algorithm is generalized to work for any number of files
339/// which corresponds to another dimension added to the tensor used in the
340/// algorithm
341///
342/// for questions and information about the linematch algorithm please contact
343/// Jonathon White (jonathonwhite@protonmail.com)
344///
345/// for explanation, a summary of the algorithm in 3 dimensions (3 files
346/// compared) follows
347///
348/// The 3d case (for 3 buffers) of the algorithm implemented when diffopt
349/// 'linematch' is enabled. The algorithm constructs a 3d tensor to
350/// compare a diff between 3 buffers. The dimensions of the tensor are
351/// the length of the diff in each buffer plus 1 A path is constructed by
352/// moving from one edge of the cube/3d tensor to the opposite edge.
353/// Motions from one cell of the cube to the next represent decisions. In
354/// a 3d cube, there are a total of 7 decisions that can be made,
355/// represented by the enum df_path3_choice which is defined in
356/// buffer_defs.h a comparison of buffer 0 and 1 represents a motion
357/// toward the opposite edge of the cube with components along the 0 and
358/// 1 axes. a comparison of buffer 0, 1, and 2 represents a motion
359/// toward the opposite edge of the cube with components along the 0, 1,
360/// and 2 axes. A skip of buffer 0 represents a motion along only the 0
361/// axis. For each action, a point value is awarded, and the path is
362/// saved for reference later, if it is found to have been the optimal
363/// path. The optimal path has the highest score. The score is
364/// calculated as the summation of the total characters matching between
365/// all of the lines which were compared. The structure of the algorithm
366/// is that of a dynamic programming problem. We can calculate a point
367/// i,j,k in the cube as a function of i-1, j-1, and k-1. To find the
368/// score and path at point i,j,k, we must determine which path we want
369/// to use, this is done by looking at the possibilities and choosing
370/// the one which results in the local highest score. The total highest
371/// scored path is, then in the end represented by the cell in the
372/// opposite corner from the start location. The entire algorithm
373/// consists of populating the 3d cube with the optimal paths from which
374/// it may have came.
375///
376/// Optimizations:
377/// As the function to calculate the cell of a tensor at point i,j,k is a
378/// function of the cells at i-1, j-1, k-1, the whole tensor doesn't need
379/// to be stored in memory at once. In the case of the 3d cube, only two
380/// slices (along k and j axis) are stored in memory. For the 2d matrix
381/// (for 2 files), only two rows are stored at a time. The next/previous
382/// slice (or row) is always calculated from the other, and they alternate
383/// at each iteration.
384/// In the 3d case, 3 arrays are populated to memorize the score (matched
385/// characters) of the 3 buffers, so a redundant calculation of the
386/// scores does not occur
387/// @param diff_blk
388/// @param diff_len
389/// @param ndiffs
390/// @param [out] [allocated] decisions
391/// @return the length of decisions
392 size_t
393linematch_nbuffers(
394 const mmfile_t **diff_blk,
395 const int *diff_len,
396 const size_t ndiffs,
397 int **decisions,
398 int iwhite)
399{
400 assert(ndiffs <= LN_MAX_BUFS);
401
402 size_t memsize = 1;
403 size_t memsize_decisions = 0;
404 for (size_t i = 0; i < ndiffs; i++)
405 {
406 assert(diff_len[i] >= 0);
407 memsize *= (size_t)(diff_len[i] + 1);
408 memsize_decisions += (size_t)diff_len[i];
409 }
410
411 // create the flattened path matrix
412 diffcmppath_T *diffcmppath = lalloc(sizeof(diffcmppath_T) * memsize, TRUE);
413 // allocate memory here
414 for (size_t i = 0; i < memsize; i++)
415 {
416 diffcmppath[i].df_lev_score = 0;
417 diffcmppath[i].df_path_n = 0;
418 for (size_t j = 0; j < (size_t)pow(2, (double)ndiffs); j++)
419 diffcmppath[i].df_choice_mem[j] = -1;
420 }
421
422 // memory for avoiding repetitive calculations of score
423 int df_iters[LN_MAX_BUFS];
424 populate_tensor(df_iters, 0, diffcmppath, diff_len, ndiffs, diff_blk,
425 iwhite);
426
427 const size_t u = unwrap_indexes(diff_len, diff_len, ndiffs);
428 diffcmppath_T *startNode = &diffcmppath[u];
429
430 *decisions = lalloc(sizeof(int) * memsize_decisions, TRUE);
431 size_t n_optimal = 0;
432 test_charmatch_paths(startNode, 0);
433 while (startNode->df_path_n > 0)
434 {
435 size_t j = startNode->df_optimal_choice;
436 (*decisions)[n_optimal++] = startNode->df_choice[j];
437 startNode = startNode->df_decision[j];
438 }
439 // reverse array
440 for (size_t i = 0; i < (n_optimal / 2); i++)
441 {
442 int tmp = (*decisions)[i];
443 (*decisions)[i] = (*decisions)[n_optimal - 1 - i];
444 (*decisions)[n_optimal - 1 - i] = tmp;
445 }
446
447 vim_free(diffcmppath);
448
449 return n_optimal;
450}
451
452// returns the minimum amount of path changes from start to end
453 static size_t
454test_charmatch_paths(diffcmppath_T *node, int lastdecision)
455{
456 // memoization
457 if (node->df_choice_mem[lastdecision] == -1)
458 {
459 if (node->df_path_n == 0)
460 // we have reached the end of the tree
461 node->df_choice_mem[lastdecision] = 0;
462 else
463 {
464 // the minimum amount of turns required to reach the end
465 size_t minimum_turns = SIZE_MAX;
466 for (size_t i = 0; i < node->df_path_n; i++)
467 {
468 // recurse
469 size_t t = test_charmatch_paths(node->df_decision[i],
470 node->df_choice[i]) +
471 (lastdecision != node->df_choice[i] ? 1 : 0);
472 if (t < minimum_turns)
473 {
474 node->df_optimal_choice = i;
475 minimum_turns = t;
476 }
477 }
478 node->df_choice_mem[lastdecision] = (int)minimum_turns;
479 }
480 }
481
482 return (size_t)node->df_choice_mem[lastdecision];
483}