Blame - src/linematch.c - android_external_vim

blob: 17e932d0032a3124a8997904a688e7b0d0757086 [file] [log] [blame]

Jonathon	7c7a4e6	2025-01-12 09:58:00 +0100	[diff] [blame]	1	/* vi:set ts=8 sts=4 sw=4 noet:
				2	*
				3	* VIM - Vi IMproved by Bram Moolenaar
				4	*
				5	* Do ":help uganda" in Vim to read copying and usage conditions.
				6	* Do ":help credits" in Vim to see a list of people who contributed.
				7	* See README.txt for an overview of the Vim source code.
				8	*/
				9
				10	#include "vim.h"
				11
				12	#define LN_MAX_BUFS 8
				13	#define LN_DECISION_MAX 255 // pow(2, LN_MAX_BUFS(8)) - 1 = 255
				14
				15	// struct for running the diff linematch algorithm
				16	typedef struct diffcmppath_S diffcmppath_T;
				17	struct diffcmppath_S
				18	{
				19	// to keep track of the total score of this path
				20	int df_lev_score;
				21	size_t df_path_n; // current index of this path
				22	int df_choice_mem[LN_DECISION_MAX + 1];
				23	int df_choice[LN_DECISION_MAX];
				24	// to keep track of this path traveled
				25	diffcmppath_T *df_decision[LN_DECISION_MAX];
				26	size_t df_optimal_choice;
				27	};
				28
				29	static int matching_chars(const mmfile_t m1, const mmfile_t m2);
				30	static size_t unwrap_indexes(const int values, const int diff_len, const size_t ndiffs);
				31	static size_t test_charmatch_paths(diffcmppath_T *node, int lastdecision);
				32
				33	static size_t
				34	line_len(const mmfile_t *m)
				35	{
				36	char *s = m->ptr;
Jonathon	7c7a4e6	2025-01-12 09:58:00 +0100	[diff] [blame]	37	char *end;
				38
zeertzjq	34e1e8d	2025-02-04 16:48:36 +0100	[diff] [blame]	39	end = memchr(s, '\n', (size_t)m->size);
				40	return end ? (size_t)(end - s) : (size_t)m->size;
Jonathon	7c7a4e6	2025-01-12 09:58:00 +0100	[diff] [blame]	41	}
				42
				43	#define MATCH_CHAR_MAX_LEN 800
				44
				45	/// Same as matching_chars but ignore whitespace
				46	///
				47	/// @param s1
				48	/// @param s2
				49	static int
				50	matching_chars_iwhite(const mmfile_t s1, const mmfile_t s2)
				51	{
				52	// the newly processed strings that will be compared
				53	// delete the white space characters
				54	mmfile_t sp[2];
				55	char p[2][MATCH_CHAR_MAX_LEN];
				56
				57	for (int k = 0; k < 2; k++)
				58	{
				59	const mmfile_t *s = k == 0 ? s1 : s2;
				60	size_t pi = 0;
				61	size_t slen = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(s));
				62
				63	for (size_t i = 0; i <= slen; i++)
				64	{
				65	char e = s->ptr[i];
				66
				67	if (e != ' ' && e != '\t')
				68	{
				69	p[k][pi] = e;
				70	pi++;
				71	}
				72	}
				73
				74	sp[k].ptr = p[k];
				75	sp[k].size = (int)pi;
				76	}
				77
				78	return matching_chars(&sp[0], &sp[1]);
				79	}
				80
				81	/// Return matching characters between "s1" and "s2" whilst respecting sequence
				82	/// order.
				83	/// Consider the case of two strings 'AAACCC' and 'CCCAAA', the
				84	/// return value from this function will be 3, either to match
				85	/// the 3 C's, or the 3 A's.
				86	///
				87	/// Examples:
				88	/// matching_chars("aabc", "acba") -> 2 // 'a' and 'b' in common
				89	/// matching_chars("123hello567", "he123ll567o") -> 8 // '123', 'll' and '567' in common
				90	/// matching_chars("abcdefg", "gfedcba") -> 1 // all characters in common,
				91	/// // but only at most 1 in sequence
				92	///
				93	/// @param m1
				94	/// @param m2
				95	static int
				96	matching_chars(const mmfile_t m1, const mmfile_t m2)
				97	{
				98	size_t s1len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m1));
				99	size_t s2len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m2));
				100	char *s1 = m1->ptr;
				101	char *s2 = m2->ptr;
				102	int matrix[2][MATCH_CHAR_MAX_LEN] = { 0 };
				103	int icur = 1; // save space by storing only two rows for i axis
				104
				105	for (size_t i = 0; i < s1len; i++)
				106	{
				107	icur = (icur == 1 ? 0 : 1);
				108	int *e1 = matrix[icur];
				109	int *e2 = matrix[!icur];
				110
				111	for (size_t j = 0; j < s2len; j++)
				112	{
				113	// skip char in s1
				114	if (e2[j + 1] > e1[j + 1])
				115	e1[j + 1] = e2[j + 1];
				116	// skip char in s2
				117	if (e1[j] > e1[j + 1])
				118	e1[j + 1] = e1[j];
				119	// compare char in s1 and s2
				120	if ((s1[i] == s2[j]) && (e2[j] + 1) > e1[j + 1])
				121	e1[j + 1] = e2[j] + 1;
				122	}
				123	}
				124
				125	return matrix[icur][s2len];
				126	}
				127
				128	/// count the matching characters between a variable number of strings "sp"
				129	/// mark the strings that have already been compared to extract them later
				130	/// without re-running the character match counting.
				131	/// @param sp
				132	/// @param fomvals
				133	/// @param n
				134	static int
				135	count_n_matched_chars(mmfile_t **sp, const size_t n, int iwhite)
				136	{
				137	int matched_chars = 0;
				138	int matched = 0;
				139
				140	for (size_t i = 0; i < n; i++)
				141	{
				142	for (size_t j = i + 1; j < n; j++)
				143	{
				144	if (sp[i]->ptr != NULL && sp[j]->ptr != NULL)
				145	{
				146	matched++;
				147	// TODO(lewis6991): handle whitespace ignoring higher up in the
				148	// stack
				149	matched_chars += iwhite ? matching_chars_iwhite(sp[i], sp[j])
				150	: matching_chars(sp[i], sp[j]);
				151	}
				152	}
				153	}
				154
				155	// prioritize a match of 3 (or more lines) equally to a match of 2 lines
				156	if (matched >= 2)
				157	{
				158	matched_chars *= 2;
				159	matched_chars /= matched;
				160	}
				161
				162	return matched_chars;
				163	}
				164
				165	static mmfile_t
				166	fastforward_buf_to_lnum(mmfile_t s, linenr_T lnum)
				167	{
				168	for (int i = 0; i < lnum - 1; i++)
				169	{
zeertzjq	34e1e8d	2025-02-04 16:48:36 +0100	[diff] [blame]	170	char *line_end;
Jonathon	7c7a4e6	2025-01-12 09:58:00 +0100	[diff] [blame]	171
zeertzjq	34e1e8d	2025-02-04 16:48:36 +0100	[diff] [blame]	172	line_end = memchr(s.ptr, '\n', (size_t)s.size);
				173	s.size = line_end ? (int)(s.size - (line_end - s.ptr)) : 0;
				174	s.ptr = line_end;
Jonathon	7c7a4e6	2025-01-12 09:58:00 +0100	[diff] [blame]	175	if (!s.ptr)
				176	break;
				177	s.ptr++;
				178	s.size--;
				179	}
				180
				181	return s;
				182	}
				183
				184	/// try all the different ways to compare these lines and use the one that
				185	/// results in the most matching characters
				186	/// @param df_iters
				187	/// @param paths
				188	/// @param npaths
				189	/// @param path_idx
				190	/// @param choice
				191	/// @param diffcmppath
				192	/// @param diff_len
				193	/// @param ndiffs
				194	/// @param diff_blk
				195	static void
				196	try_possible_paths(
				197	const int *df_iters,
				198	const size_t *paths,
				199	const int npaths,
				200	const int path_idx,
				201	int *choice,
				202	diffcmppath_T *diffcmppath,
				203	const int *diff_len,
				204	const size_t ndiffs,
				205	const mmfile_t **diff_blk,
				206	int iwhite)
				207	{
				208	if (path_idx == npaths)
				209	{
				210	if ((*choice) > 0)
				211	{
				212	int from_vals[LN_MAX_BUFS] = { 0 };
				213	const int *to_vals = df_iters;
				214
				215	mmfile_t mm[LN_MAX_BUFS]; // stack memory for current_lines
				216	mmfile_t *current_lines[LN_MAX_BUFS];
				217	for (size_t k = 0; k < ndiffs; k++)
				218	{
				219	from_vals[k] = df_iters[k];
				220	// get the index at all of the places
				221	if ((*choice) & (1 << k))
				222	{
				223	from_vals[k]--;
				224	mm[k] = fastforward_buf_to_lnum(*diff_blk[k], df_iters[k]);
				225	}
				226	else
				227	CLEAR_FIELD(mm[k]);
				228	current_lines[k] = &mm[k];
				229	}
				230	size_t unwrapped_idx_from = unwrap_indexes(from_vals, diff_len, ndiffs);
				231	size_t unwrapped_idx_to = unwrap_indexes(to_vals, diff_len, ndiffs);
				232	int matched_chars = count_n_matched_chars(current_lines, ndiffs, iwhite);
				233	int score = diffcmppath[unwrapped_idx_from].df_lev_score + matched_chars;
				234
				235	if (score > diffcmppath[unwrapped_idx_to].df_lev_score)
				236	{
				237	diffcmppath[unwrapped_idx_to].df_path_n = 1;
				238	diffcmppath[unwrapped_idx_to].df_decision[0] =
				239	&diffcmppath[unwrapped_idx_from];
				240	diffcmppath[unwrapped_idx_to].df_choice[0] = *choice;
				241	diffcmppath[unwrapped_idx_to].df_lev_score = score;
				242	}
				243	else if (score == diffcmppath[unwrapped_idx_to].df_lev_score)
				244	{
				245	size_t k = diffcmppath[unwrapped_idx_to].df_path_n++;
				246	diffcmppath[unwrapped_idx_to].df_decision[k] =
				247	&diffcmppath[unwrapped_idx_from];
				248	diffcmppath[unwrapped_idx_to].df_choice[k] = *choice;
				249	}
				250	}
				251	return;
				252	}
				253
				254	size_t bit_place = paths[path_idx];
				255	*(choice) \|= (1 << bit_place); // set it to 1
				256	try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice,
				257	diffcmppath, diff_len, ndiffs, diff_blk, iwhite);
				258	*(choice) &= ~(1 << bit_place); // set it to 0
				259	try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice,
				260	diffcmppath, diff_len, ndiffs, diff_blk, iwhite);
				261	}
				262
				263	/// unwrap indexes to access n dimensional tensor
				264	/// @param values
				265	/// @param diff_len
				266	/// @param ndiffs
				267	static size_t
				268	unwrap_indexes(const int values, const int diff_len, const size_t ndiffs)
				269	{
				270	size_t num_unwrap_scalar = 1;
				271
				272	for (size_t k = 0; k < ndiffs; k++)
				273	num_unwrap_scalar *= (size_t)diff_len[k] + 1;
				274
				275	size_t path_idx = 0;
				276	for (size_t k = 0; k < ndiffs; k++)
				277	{
				278	num_unwrap_scalar /= (size_t)diff_len[k] + 1;
				279
				280	int n = values[k];
				281	path_idx += num_unwrap_scalar * (size_t)n;
				282	}
				283
				284	return path_idx;
				285	}
				286
				287	/// populate the values of the linematch algorithm tensor, and find the best
				288	/// decision for how to compare the relevant lines from each of the buffers at
				289	/// each point in the tensor
				290	/// @param df_iters
				291	/// @param ch_dim
				292	/// @param diffcmppath
				293	/// @param diff_len
				294	/// @param ndiffs
				295	/// @param diff_blk
				296	static void
				297	populate_tensor(
				298	int *df_iters,
				299	const size_t ch_dim,
				300	diffcmppath_T *diffcmppath,
				301	const int *diff_len,
				302	const size_t ndiffs,
				303	const mmfile_t **diff_blk,
				304	int iwhite)
				305	{
				306	if (ch_dim == ndiffs)
				307	{
				308	int npaths = 0;
				309	size_t paths[LN_MAX_BUFS];
				310
				311	for (size_t j = 0; j < ndiffs; j++)
				312	{
				313	if (df_iters[j] > 0)
				314	{
				315	paths[npaths] = j;
				316	npaths++;
				317	}
				318	}
				319
				320	int choice = 0;
				321	size_t unwrapper_idx_to = unwrap_indexes(df_iters, diff_len, ndiffs);
				322
				323	diffcmppath[unwrapper_idx_to].df_lev_score = -1;
				324	try_possible_paths(df_iters, paths, npaths, 0, &choice, diffcmppath,
				325	diff_len, ndiffs, diff_blk, iwhite);
				326	return;
				327	}
				328
				329	for (int i = 0; i <= diff_len[ch_dim]; i++)
				330	{
				331	df_iters[ch_dim] = i;
				332	populate_tensor(df_iters, ch_dim + 1, diffcmppath, diff_len,
				333	ndiffs, diff_blk, iwhite);
				334	}
				335	}
				336
				337	/// algorithm to find an optimal alignment of lines of a diff block with 2 or
				338	/// more files. The algorithm is generalized to work for any number of files
				339	/// which corresponds to another dimension added to the tensor used in the
				340	/// algorithm
				341	///
				342	/// for questions and information about the linematch algorithm please contact
				343	/// Jonathon White (jonathonwhite@protonmail.com)
				344	///
				345	/// for explanation, a summary of the algorithm in 3 dimensions (3 files
				346	/// compared) follows
				347	///
				348	/// The 3d case (for 3 buffers) of the algorithm implemented when diffopt
				349	/// 'linematch' is enabled. The algorithm constructs a 3d tensor to
				350	/// compare a diff between 3 buffers. The dimensions of the tensor are
				351	/// the length of the diff in each buffer plus 1 A path is constructed by
				352	/// moving from one edge of the cube/3d tensor to the opposite edge.
				353	/// Motions from one cell of the cube to the next represent decisions. In
				354	/// a 3d cube, there are a total of 7 decisions that can be made,
				355	/// represented by the enum df_path3_choice which is defined in
				356	/// buffer_defs.h a comparison of buffer 0 and 1 represents a motion
				357	/// toward the opposite edge of the cube with components along the 0 and
				358	/// 1 axes. a comparison of buffer 0, 1, and 2 represents a motion
				359	/// toward the opposite edge of the cube with components along the 0, 1,
				360	/// and 2 axes. A skip of buffer 0 represents a motion along only the 0
				361	/// axis. For each action, a point value is awarded, and the path is
				362	/// saved for reference later, if it is found to have been the optimal
				363	/// path. The optimal path has the highest score. The score is
				364	/// calculated as the summation of the total characters matching between
				365	/// all of the lines which were compared. The structure of the algorithm
				366	/// is that of a dynamic programming problem. We can calculate a point
				367	/// i,j,k in the cube as a function of i-1, j-1, and k-1. To find the
				368	/// score and path at point i,j,k, we must determine which path we want
				369	/// to use, this is done by looking at the possibilities and choosing
				370	/// the one which results in the local highest score. The total highest
				371	/// scored path is, then in the end represented by the cell in the
				372	/// opposite corner from the start location. The entire algorithm
				373	/// consists of populating the 3d cube with the optimal paths from which
				374	/// it may have came.
				375	///
				376	/// Optimizations:
				377	/// As the function to calculate the cell of a tensor at point i,j,k is a
				378	/// function of the cells at i-1, j-1, k-1, the whole tensor doesn't need
				379	/// to be stored in memory at once. In the case of the 3d cube, only two
				380	/// slices (along k and j axis) are stored in memory. For the 2d matrix
				381	/// (for 2 files), only two rows are stored at a time. The next/previous
				382	/// slice (or row) is always calculated from the other, and they alternate
				383	/// at each iteration.
				384	/// In the 3d case, 3 arrays are populated to memorize the score (matched
				385	/// characters) of the 3 buffers, so a redundant calculation of the
				386	/// scores does not occur
				387	/// @param diff_blk
				388	/// @param diff_len
				389	/// @param ndiffs
				390	/// @param [out] [allocated] decisions
				391	/// @return the length of decisions
				392	size_t
				393	linematch_nbuffers(
				394	const mmfile_t **diff_blk,
				395	const int *diff_len,
				396	const size_t ndiffs,
				397	int **decisions,
				398	int iwhite)
				399	{
				400	assert(ndiffs <= LN_MAX_BUFS);
				401
				402	size_t memsize = 1;
				403	size_t memsize_decisions = 0;
				404	for (size_t i = 0; i < ndiffs; i++)
				405	{
				406	assert(diff_len[i] >= 0);
				407	memsize *= (size_t)(diff_len[i] + 1);
				408	memsize_decisions += (size_t)diff_len[i];
				409	}
				410
				411	// create the flattened path matrix
				412	diffcmppath_T diffcmppath = lalloc(sizeof(diffcmppath_T) memsize, TRUE);
				413	// allocate memory here
				414	for (size_t i = 0; i < memsize; i++)
				415	{
				416	diffcmppath[i].df_lev_score = 0;
				417	diffcmppath[i].df_path_n = 0;
				418	for (size_t j = 0; j < (size_t)pow(2, (double)ndiffs); j++)
				419	diffcmppath[i].df_choice_mem[j] = -1;
				420	}
				421
				422	// memory for avoiding repetitive calculations of score
				423	int df_iters[LN_MAX_BUFS];
				424	populate_tensor(df_iters, 0, diffcmppath, diff_len, ndiffs, diff_blk,
				425	iwhite);
				426
				427	const size_t u = unwrap_indexes(diff_len, diff_len, ndiffs);
				428	diffcmppath_T *startNode = &diffcmppath[u];
				429
				430	decisions = lalloc(sizeof(int) memsize_decisions, TRUE);
				431	size_t n_optimal = 0;
				432	test_charmatch_paths(startNode, 0);
				433	while (startNode->df_path_n > 0)
				434	{
				435	size_t j = startNode->df_optimal_choice;
				436	(*decisions)[n_optimal++] = startNode->df_choice[j];
				437	startNode = startNode->df_decision[j];
				438	}
				439	// reverse array
				440	for (size_t i = 0; i < (n_optimal / 2); i++)
				441	{
				442	int tmp = (*decisions)[i];
				443	(decisions)[i] = (decisions)[n_optimal - 1 - i];
				444	(*decisions)[n_optimal - 1 - i] = tmp;
				445	}
				446
				447	vim_free(diffcmppath);
				448
				449	return n_optimal;
				450	}
				451
				452	// returns the minimum amount of path changes from start to end
				453	static size_t
				454	test_charmatch_paths(diffcmppath_T *node, int lastdecision)
				455	{
				456	// memoization
				457	if (node->df_choice_mem[lastdecision] == -1)
				458	{
				459	if (node->df_path_n == 0)
				460	// we have reached the end of the tree
				461	node->df_choice_mem[lastdecision] = 0;
				462	else
				463	{
				464	// the minimum amount of turns required to reach the end
				465	size_t minimum_turns = SIZE_MAX;
				466	for (size_t i = 0; i < node->df_path_n; i++)
				467	{
				468	// recurse
				469	size_t t = test_charmatch_paths(node->df_decision[i],
				470	node->df_choice[i]) +
				471	(lastdecision != node->df_choice[i] ? 1 : 0);
				472	if (t < minimum_turns)
				473	{
				474	node->df_optimal_choice = i;
				475	minimum_turns = t;
				476	}
				477	}
				478	node->df_choice_mem[lastdecision] = (int)minimum_turns;
				479	}
				480	}
				481
				482	return (size_t)node->df_choice_mem[lastdecision];
				483	}