Jonathon | 7c7a4e6 | 2025-01-12 09:58:00 +0100 | [diff] [blame] | 1 | /* vi:set ts=8 sts=4 sw=4 noet: |
| 2 | * |
| 3 | * VIM - Vi IMproved by Bram Moolenaar |
| 4 | * |
| 5 | * Do ":help uganda" in Vim to read copying and usage conditions. |
| 6 | * Do ":help credits" in Vim to see a list of people who contributed. |
| 7 | * See README.txt for an overview of the Vim source code. |
| 8 | */ |
| 9 | |
| 10 | #include "vim.h" |
| 11 | |
| 12 | #define LN_MAX_BUFS 8 |
| 13 | #define LN_DECISION_MAX 255 // pow(2, LN_MAX_BUFS(8)) - 1 = 255 |
| 14 | |
| 15 | // struct for running the diff linematch algorithm |
| 16 | typedef struct diffcmppath_S diffcmppath_T; |
| 17 | struct diffcmppath_S |
| 18 | { |
| 19 | // to keep track of the total score of this path |
| 20 | int df_lev_score; |
| 21 | size_t df_path_n; // current index of this path |
| 22 | int df_choice_mem[LN_DECISION_MAX + 1]; |
| 23 | int df_choice[LN_DECISION_MAX]; |
| 24 | // to keep track of this path traveled |
| 25 | diffcmppath_T *df_decision[LN_DECISION_MAX]; |
| 26 | size_t df_optimal_choice; |
| 27 | }; |
| 28 | |
| 29 | static int matching_chars(const mmfile_t *m1, const mmfile_t *m2); |
| 30 | static size_t unwrap_indexes(const int *values, const int *diff_len, const size_t ndiffs); |
| 31 | static size_t test_charmatch_paths(diffcmppath_T *node, int lastdecision); |
| 32 | |
| 33 | static size_t |
| 34 | line_len(const mmfile_t *m) |
| 35 | { |
| 36 | char *s = m->ptr; |
Jonathon | 7c7a4e6 | 2025-01-12 09:58:00 +0100 | [diff] [blame] | 37 | char *end; |
| 38 | |
zeertzjq | 34e1e8d | 2025-02-04 16:48:36 +0100 | [diff] [blame] | 39 | end = memchr(s, '\n', (size_t)m->size); |
| 40 | return end ? (size_t)(end - s) : (size_t)m->size; |
Jonathon | 7c7a4e6 | 2025-01-12 09:58:00 +0100 | [diff] [blame] | 41 | } |
| 42 | |
| 43 | #define MATCH_CHAR_MAX_LEN 800 |
| 44 | |
| 45 | /// Same as matching_chars but ignore whitespace |
| 46 | /// |
| 47 | /// @param s1 |
| 48 | /// @param s2 |
| 49 | static int |
| 50 | matching_chars_iwhite(const mmfile_t *s1, const mmfile_t *s2) |
| 51 | { |
| 52 | // the newly processed strings that will be compared |
| 53 | // delete the white space characters |
| 54 | mmfile_t sp[2]; |
| 55 | char p[2][MATCH_CHAR_MAX_LEN]; |
| 56 | |
| 57 | for (int k = 0; k < 2; k++) |
| 58 | { |
| 59 | const mmfile_t *s = k == 0 ? s1 : s2; |
| 60 | size_t pi = 0; |
| 61 | size_t slen = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(s)); |
| 62 | |
| 63 | for (size_t i = 0; i <= slen; i++) |
| 64 | { |
| 65 | char e = s->ptr[i]; |
| 66 | |
| 67 | if (e != ' ' && e != '\t') |
| 68 | { |
| 69 | p[k][pi] = e; |
| 70 | pi++; |
| 71 | } |
| 72 | } |
| 73 | |
| 74 | sp[k].ptr = p[k]; |
| 75 | sp[k].size = (int)pi; |
| 76 | } |
| 77 | |
| 78 | return matching_chars(&sp[0], &sp[1]); |
| 79 | } |
| 80 | |
| 81 | /// Return matching characters between "s1" and "s2" whilst respecting sequence |
| 82 | /// order. |
| 83 | /// Consider the case of two strings 'AAACCC' and 'CCCAAA', the |
| 84 | /// return value from this function will be 3, either to match |
| 85 | /// the 3 C's, or the 3 A's. |
| 86 | /// |
| 87 | /// Examples: |
| 88 | /// matching_chars("aabc", "acba") -> 2 // 'a' and 'b' in common |
| 89 | /// matching_chars("123hello567", "he123ll567o") -> 8 // '123', 'll' and '567' in common |
| 90 | /// matching_chars("abcdefg", "gfedcba") -> 1 // all characters in common, |
| 91 | /// // but only at most 1 in sequence |
| 92 | /// |
| 93 | /// @param m1 |
| 94 | /// @param m2 |
| 95 | static int |
| 96 | matching_chars(const mmfile_t *m1, const mmfile_t *m2) |
| 97 | { |
| 98 | size_t s1len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m1)); |
| 99 | size_t s2len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m2)); |
| 100 | char *s1 = m1->ptr; |
| 101 | char *s2 = m2->ptr; |
| 102 | int matrix[2][MATCH_CHAR_MAX_LEN] = { 0 }; |
| 103 | int icur = 1; // save space by storing only two rows for i axis |
| 104 | |
| 105 | for (size_t i = 0; i < s1len; i++) |
| 106 | { |
| 107 | icur = (icur == 1 ? 0 : 1); |
| 108 | int *e1 = matrix[icur]; |
| 109 | int *e2 = matrix[!icur]; |
| 110 | |
| 111 | for (size_t j = 0; j < s2len; j++) |
| 112 | { |
| 113 | // skip char in s1 |
| 114 | if (e2[j + 1] > e1[j + 1]) |
| 115 | e1[j + 1] = e2[j + 1]; |
| 116 | // skip char in s2 |
| 117 | if (e1[j] > e1[j + 1]) |
| 118 | e1[j + 1] = e1[j]; |
| 119 | // compare char in s1 and s2 |
| 120 | if ((s1[i] == s2[j]) && (e2[j] + 1) > e1[j + 1]) |
| 121 | e1[j + 1] = e2[j] + 1; |
| 122 | } |
| 123 | } |
| 124 | |
| 125 | return matrix[icur][s2len]; |
| 126 | } |
| 127 | |
| 128 | /// count the matching characters between a variable number of strings "sp" |
| 129 | /// mark the strings that have already been compared to extract them later |
| 130 | /// without re-running the character match counting. |
| 131 | /// @param sp |
| 132 | /// @param fomvals |
| 133 | /// @param n |
| 134 | static int |
| 135 | count_n_matched_chars(mmfile_t **sp, const size_t n, int iwhite) |
| 136 | { |
| 137 | int matched_chars = 0; |
| 138 | int matched = 0; |
| 139 | |
| 140 | for (size_t i = 0; i < n; i++) |
| 141 | { |
| 142 | for (size_t j = i + 1; j < n; j++) |
| 143 | { |
| 144 | if (sp[i]->ptr != NULL && sp[j]->ptr != NULL) |
| 145 | { |
| 146 | matched++; |
| 147 | // TODO(lewis6991): handle whitespace ignoring higher up in the |
| 148 | // stack |
| 149 | matched_chars += iwhite ? matching_chars_iwhite(sp[i], sp[j]) |
| 150 | : matching_chars(sp[i], sp[j]); |
| 151 | } |
| 152 | } |
| 153 | } |
| 154 | |
| 155 | // prioritize a match of 3 (or more lines) equally to a match of 2 lines |
| 156 | if (matched >= 2) |
| 157 | { |
| 158 | matched_chars *= 2; |
| 159 | matched_chars /= matched; |
| 160 | } |
| 161 | |
| 162 | return matched_chars; |
| 163 | } |
| 164 | |
| 165 | static mmfile_t |
| 166 | fastforward_buf_to_lnum(mmfile_t s, linenr_T lnum) |
| 167 | { |
| 168 | for (int i = 0; i < lnum - 1; i++) |
| 169 | { |
zeertzjq | 34e1e8d | 2025-02-04 16:48:36 +0100 | [diff] [blame] | 170 | char *line_end; |
Jonathon | 7c7a4e6 | 2025-01-12 09:58:00 +0100 | [diff] [blame] | 171 | |
zeertzjq | 34e1e8d | 2025-02-04 16:48:36 +0100 | [diff] [blame] | 172 | line_end = memchr(s.ptr, '\n', (size_t)s.size); |
| 173 | s.size = line_end ? (int)(s.size - (line_end - s.ptr)) : 0; |
| 174 | s.ptr = line_end; |
Jonathon | 7c7a4e6 | 2025-01-12 09:58:00 +0100 | [diff] [blame] | 175 | if (!s.ptr) |
| 176 | break; |
| 177 | s.ptr++; |
| 178 | s.size--; |
| 179 | } |
| 180 | |
| 181 | return s; |
| 182 | } |
| 183 | |
| 184 | /// try all the different ways to compare these lines and use the one that |
| 185 | /// results in the most matching characters |
| 186 | /// @param df_iters |
| 187 | /// @param paths |
| 188 | /// @param npaths |
| 189 | /// @param path_idx |
| 190 | /// @param choice |
| 191 | /// @param diffcmppath |
| 192 | /// @param diff_len |
| 193 | /// @param ndiffs |
| 194 | /// @param diff_blk |
| 195 | static void |
| 196 | try_possible_paths( |
| 197 | const int *df_iters, |
| 198 | const size_t *paths, |
| 199 | const int npaths, |
| 200 | const int path_idx, |
| 201 | int *choice, |
| 202 | diffcmppath_T *diffcmppath, |
| 203 | const int *diff_len, |
| 204 | const size_t ndiffs, |
| 205 | const mmfile_t **diff_blk, |
| 206 | int iwhite) |
| 207 | { |
| 208 | if (path_idx == npaths) |
| 209 | { |
| 210 | if ((*choice) > 0) |
| 211 | { |
| 212 | int from_vals[LN_MAX_BUFS] = { 0 }; |
| 213 | const int *to_vals = df_iters; |
| 214 | |
| 215 | mmfile_t mm[LN_MAX_BUFS]; // stack memory for current_lines |
| 216 | mmfile_t *current_lines[LN_MAX_BUFS]; |
| 217 | for (size_t k = 0; k < ndiffs; k++) |
| 218 | { |
| 219 | from_vals[k] = df_iters[k]; |
| 220 | // get the index at all of the places |
| 221 | if ((*choice) & (1 << k)) |
| 222 | { |
| 223 | from_vals[k]--; |
| 224 | mm[k] = fastforward_buf_to_lnum(*diff_blk[k], df_iters[k]); |
| 225 | } |
| 226 | else |
| 227 | CLEAR_FIELD(mm[k]); |
| 228 | current_lines[k] = &mm[k]; |
| 229 | } |
| 230 | size_t unwrapped_idx_from = unwrap_indexes(from_vals, diff_len, ndiffs); |
| 231 | size_t unwrapped_idx_to = unwrap_indexes(to_vals, diff_len, ndiffs); |
| 232 | int matched_chars = count_n_matched_chars(current_lines, ndiffs, iwhite); |
| 233 | int score = diffcmppath[unwrapped_idx_from].df_lev_score + matched_chars; |
| 234 | |
| 235 | if (score > diffcmppath[unwrapped_idx_to].df_lev_score) |
| 236 | { |
| 237 | diffcmppath[unwrapped_idx_to].df_path_n = 1; |
| 238 | diffcmppath[unwrapped_idx_to].df_decision[0] = |
| 239 | &diffcmppath[unwrapped_idx_from]; |
| 240 | diffcmppath[unwrapped_idx_to].df_choice[0] = *choice; |
| 241 | diffcmppath[unwrapped_idx_to].df_lev_score = score; |
| 242 | } |
| 243 | else if (score == diffcmppath[unwrapped_idx_to].df_lev_score) |
| 244 | { |
| 245 | size_t k = diffcmppath[unwrapped_idx_to].df_path_n++; |
| 246 | diffcmppath[unwrapped_idx_to].df_decision[k] = |
| 247 | &diffcmppath[unwrapped_idx_from]; |
| 248 | diffcmppath[unwrapped_idx_to].df_choice[k] = *choice; |
| 249 | } |
| 250 | } |
| 251 | return; |
| 252 | } |
| 253 | |
| 254 | size_t bit_place = paths[path_idx]; |
| 255 | *(choice) |= (1 << bit_place); // set it to 1 |
| 256 | try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice, |
| 257 | diffcmppath, diff_len, ndiffs, diff_blk, iwhite); |
| 258 | *(choice) &= ~(1 << bit_place); // set it to 0 |
| 259 | try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice, |
| 260 | diffcmppath, diff_len, ndiffs, diff_blk, iwhite); |
| 261 | } |
| 262 | |
| 263 | /// unwrap indexes to access n dimensional tensor |
| 264 | /// @param values |
| 265 | /// @param diff_len |
| 266 | /// @param ndiffs |
| 267 | static size_t |
| 268 | unwrap_indexes(const int *values, const int *diff_len, const size_t ndiffs) |
| 269 | { |
| 270 | size_t num_unwrap_scalar = 1; |
| 271 | |
| 272 | for (size_t k = 0; k < ndiffs; k++) |
| 273 | num_unwrap_scalar *= (size_t)diff_len[k] + 1; |
| 274 | |
| 275 | size_t path_idx = 0; |
| 276 | for (size_t k = 0; k < ndiffs; k++) |
| 277 | { |
| 278 | num_unwrap_scalar /= (size_t)diff_len[k] + 1; |
| 279 | |
| 280 | int n = values[k]; |
| 281 | path_idx += num_unwrap_scalar * (size_t)n; |
| 282 | } |
| 283 | |
| 284 | return path_idx; |
| 285 | } |
| 286 | |
| 287 | /// populate the values of the linematch algorithm tensor, and find the best |
| 288 | /// decision for how to compare the relevant lines from each of the buffers at |
| 289 | /// each point in the tensor |
| 290 | /// @param df_iters |
| 291 | /// @param ch_dim |
| 292 | /// @param diffcmppath |
| 293 | /// @param diff_len |
| 294 | /// @param ndiffs |
| 295 | /// @param diff_blk |
| 296 | static void |
| 297 | populate_tensor( |
| 298 | int *df_iters, |
| 299 | const size_t ch_dim, |
| 300 | diffcmppath_T *diffcmppath, |
| 301 | const int *diff_len, |
| 302 | const size_t ndiffs, |
| 303 | const mmfile_t **diff_blk, |
| 304 | int iwhite) |
| 305 | { |
| 306 | if (ch_dim == ndiffs) |
| 307 | { |
| 308 | int npaths = 0; |
| 309 | size_t paths[LN_MAX_BUFS]; |
| 310 | |
| 311 | for (size_t j = 0; j < ndiffs; j++) |
| 312 | { |
| 313 | if (df_iters[j] > 0) |
| 314 | { |
| 315 | paths[npaths] = j; |
| 316 | npaths++; |
| 317 | } |
| 318 | } |
| 319 | |
| 320 | int choice = 0; |
| 321 | size_t unwrapper_idx_to = unwrap_indexes(df_iters, diff_len, ndiffs); |
| 322 | |
| 323 | diffcmppath[unwrapper_idx_to].df_lev_score = -1; |
| 324 | try_possible_paths(df_iters, paths, npaths, 0, &choice, diffcmppath, |
| 325 | diff_len, ndiffs, diff_blk, iwhite); |
| 326 | return; |
| 327 | } |
| 328 | |
| 329 | for (int i = 0; i <= diff_len[ch_dim]; i++) |
| 330 | { |
| 331 | df_iters[ch_dim] = i; |
| 332 | populate_tensor(df_iters, ch_dim + 1, diffcmppath, diff_len, |
| 333 | ndiffs, diff_blk, iwhite); |
| 334 | } |
| 335 | } |
| 336 | |
| 337 | /// algorithm to find an optimal alignment of lines of a diff block with 2 or |
| 338 | /// more files. The algorithm is generalized to work for any number of files |
| 339 | /// which corresponds to another dimension added to the tensor used in the |
| 340 | /// algorithm |
| 341 | /// |
| 342 | /// for questions and information about the linematch algorithm please contact |
| 343 | /// Jonathon White (jonathonwhite@protonmail.com) |
| 344 | /// |
| 345 | /// for explanation, a summary of the algorithm in 3 dimensions (3 files |
| 346 | /// compared) follows |
| 347 | /// |
| 348 | /// The 3d case (for 3 buffers) of the algorithm implemented when diffopt |
| 349 | /// 'linematch' is enabled. The algorithm constructs a 3d tensor to |
| 350 | /// compare a diff between 3 buffers. The dimensions of the tensor are |
| 351 | /// the length of the diff in each buffer plus 1 A path is constructed by |
| 352 | /// moving from one edge of the cube/3d tensor to the opposite edge. |
| 353 | /// Motions from one cell of the cube to the next represent decisions. In |
| 354 | /// a 3d cube, there are a total of 7 decisions that can be made, |
| 355 | /// represented by the enum df_path3_choice which is defined in |
| 356 | /// buffer_defs.h a comparison of buffer 0 and 1 represents a motion |
| 357 | /// toward the opposite edge of the cube with components along the 0 and |
| 358 | /// 1 axes. a comparison of buffer 0, 1, and 2 represents a motion |
| 359 | /// toward the opposite edge of the cube with components along the 0, 1, |
| 360 | /// and 2 axes. A skip of buffer 0 represents a motion along only the 0 |
| 361 | /// axis. For each action, a point value is awarded, and the path is |
| 362 | /// saved for reference later, if it is found to have been the optimal |
| 363 | /// path. The optimal path has the highest score. The score is |
| 364 | /// calculated as the summation of the total characters matching between |
| 365 | /// all of the lines which were compared. The structure of the algorithm |
| 366 | /// is that of a dynamic programming problem. We can calculate a point |
| 367 | /// i,j,k in the cube as a function of i-1, j-1, and k-1. To find the |
| 368 | /// score and path at point i,j,k, we must determine which path we want |
| 369 | /// to use, this is done by looking at the possibilities and choosing |
| 370 | /// the one which results in the local highest score. The total highest |
| 371 | /// scored path is, then in the end represented by the cell in the |
| 372 | /// opposite corner from the start location. The entire algorithm |
| 373 | /// consists of populating the 3d cube with the optimal paths from which |
| 374 | /// it may have came. |
| 375 | /// |
| 376 | /// Optimizations: |
| 377 | /// As the function to calculate the cell of a tensor at point i,j,k is a |
| 378 | /// function of the cells at i-1, j-1, k-1, the whole tensor doesn't need |
| 379 | /// to be stored in memory at once. In the case of the 3d cube, only two |
| 380 | /// slices (along k and j axis) are stored in memory. For the 2d matrix |
| 381 | /// (for 2 files), only two rows are stored at a time. The next/previous |
| 382 | /// slice (or row) is always calculated from the other, and they alternate |
| 383 | /// at each iteration. |
| 384 | /// In the 3d case, 3 arrays are populated to memorize the score (matched |
| 385 | /// characters) of the 3 buffers, so a redundant calculation of the |
| 386 | /// scores does not occur |
| 387 | /// @param diff_blk |
| 388 | /// @param diff_len |
| 389 | /// @param ndiffs |
| 390 | /// @param [out] [allocated] decisions |
| 391 | /// @return the length of decisions |
| 392 | size_t |
| 393 | linematch_nbuffers( |
| 394 | const mmfile_t **diff_blk, |
| 395 | const int *diff_len, |
| 396 | const size_t ndiffs, |
| 397 | int **decisions, |
| 398 | int iwhite) |
| 399 | { |
| 400 | assert(ndiffs <= LN_MAX_BUFS); |
| 401 | |
| 402 | size_t memsize = 1; |
| 403 | size_t memsize_decisions = 0; |
| 404 | for (size_t i = 0; i < ndiffs; i++) |
| 405 | { |
| 406 | assert(diff_len[i] >= 0); |
| 407 | memsize *= (size_t)(diff_len[i] + 1); |
| 408 | memsize_decisions += (size_t)diff_len[i]; |
| 409 | } |
| 410 | |
| 411 | // create the flattened path matrix |
| 412 | diffcmppath_T *diffcmppath = lalloc(sizeof(diffcmppath_T) * memsize, TRUE); |
| 413 | // allocate memory here |
| 414 | for (size_t i = 0; i < memsize; i++) |
| 415 | { |
| 416 | diffcmppath[i].df_lev_score = 0; |
| 417 | diffcmppath[i].df_path_n = 0; |
| 418 | for (size_t j = 0; j < (size_t)pow(2, (double)ndiffs); j++) |
| 419 | diffcmppath[i].df_choice_mem[j] = -1; |
| 420 | } |
| 421 | |
| 422 | // memory for avoiding repetitive calculations of score |
| 423 | int df_iters[LN_MAX_BUFS]; |
| 424 | populate_tensor(df_iters, 0, diffcmppath, diff_len, ndiffs, diff_blk, |
| 425 | iwhite); |
| 426 | |
| 427 | const size_t u = unwrap_indexes(diff_len, diff_len, ndiffs); |
| 428 | diffcmppath_T *startNode = &diffcmppath[u]; |
| 429 | |
| 430 | *decisions = lalloc(sizeof(int) * memsize_decisions, TRUE); |
| 431 | size_t n_optimal = 0; |
| 432 | test_charmatch_paths(startNode, 0); |
| 433 | while (startNode->df_path_n > 0) |
| 434 | { |
| 435 | size_t j = startNode->df_optimal_choice; |
| 436 | (*decisions)[n_optimal++] = startNode->df_choice[j]; |
| 437 | startNode = startNode->df_decision[j]; |
| 438 | } |
| 439 | // reverse array |
| 440 | for (size_t i = 0; i < (n_optimal / 2); i++) |
| 441 | { |
| 442 | int tmp = (*decisions)[i]; |
| 443 | (*decisions)[i] = (*decisions)[n_optimal - 1 - i]; |
| 444 | (*decisions)[n_optimal - 1 - i] = tmp; |
| 445 | } |
| 446 | |
| 447 | vim_free(diffcmppath); |
| 448 | |
| 449 | return n_optimal; |
| 450 | } |
| 451 | |
| 452 | // returns the minimum amount of path changes from start to end |
| 453 | static size_t |
| 454 | test_charmatch_paths(diffcmppath_T *node, int lastdecision) |
| 455 | { |
| 456 | // memoization |
| 457 | if (node->df_choice_mem[lastdecision] == -1) |
| 458 | { |
| 459 | if (node->df_path_n == 0) |
| 460 | // we have reached the end of the tree |
| 461 | node->df_choice_mem[lastdecision] = 0; |
| 462 | else |
| 463 | { |
| 464 | // the minimum amount of turns required to reach the end |
| 465 | size_t minimum_turns = SIZE_MAX; |
| 466 | for (size_t i = 0; i < node->df_path_n; i++) |
| 467 | { |
| 468 | // recurse |
| 469 | size_t t = test_charmatch_paths(node->df_decision[i], |
| 470 | node->df_choice[i]) + |
| 471 | (lastdecision != node->df_choice[i] ? 1 : 0); |
| 472 | if (t < minimum_turns) |
| 473 | { |
| 474 | node->df_optimal_choice = i; |
| 475 | minimum_turns = t; |
| 476 | } |
| 477 | } |
| 478 | node->df_choice_mem[lastdecision] = (int)minimum_turns; |
| 479 | } |
| 480 | } |
| 481 | |
| 482 | return (size_t)node->df_choice_mem[lastdecision]; |
| 483 | } |