patch 8.2.4695: JSON encoding could be faster

Problem: JSON encoding could be faster.
Solution: Optimize encoding JSON strings. (closes #10086)

commit: beb0ef1ab2dbd9760345e3e03647b93914591d56 [log] [tgz]
author: LemonBoy <thatlemon@gmail.com> Tue Apr 05 15:07:32 2022 +0100
committer: Bram Moolenaar <Bram@vim.org> Tue Apr 05 15:07:32 2022 +0100
tree: 6c210b9fc90f48c2b92a65ac5b0e4d98bbd007a7
parent: 02560424bf838cadc8c19294af6b6b6c383ab291 [diff]
diff --git a/src/json.c b/src/json.c
index 5be8f7f..47bf990 100644
--- a/src/json.c
+++ b/src/json.c

@@ -114,37 +114,72 @@
 }
 #endif
 
+/*
+ * Lookup table to quickly know if the given ASCII character must be escaped.
+ */
+static const char ascii_needs_escape[128] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x0.
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x1.
+    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x2.
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x3.
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4.
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, // 0x5.
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x6.
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7.
+};
+
+/*
+ * Encode the utf-8 encoded string "str" into "gap".
+ */
     static void
 write_string(garray_T *gap, char_u *str)
 {
     char_u	*res = str;
     char_u	numbuf[NUMBUFLEN];
+    char_u	*from;
+#if defined(USE_ICONV)
+    vimconv_T   conv;
+    char_u	*converted = NULL;
+#endif
+    int		c;
 
     if (res == NULL)
-	ga_concat(gap, (char_u *)"\"\"");
-    else
     {
-#if defined(USE_ICONV)
-	vimconv_T   conv;
-	char_u	    *converted = NULL;
+	ga_concat(gap, (char_u *)"\"\"");
+	return;
+    }
 
-	if (!enc_utf8)
-	{
-	    // Convert the text from 'encoding' to utf-8, the JSON string is
-	    // always utf-8.
-	    conv.vc_type = CONV_NONE;
-	    convert_setup(&conv, p_enc, (char_u*)"utf-8");
-	    if (conv.vc_type != CONV_NONE)
-		converted = res = string_convert(&conv, res, NULL);
-	    convert_setup(&conv, NULL, NULL);
-	}
+#if defined(USE_ICONV)
+    if (!enc_utf8)
+    {
+	// Convert the text from 'encoding' to utf-8, because a JSON string is
+	// always utf-8.
+	conv.vc_type = CONV_NONE;
+	convert_setup(&conv, p_enc, (char_u*)"utf-8");
+	if (conv.vc_type != CONV_NONE)
+	    converted = res = string_convert(&conv, res, NULL);
+	convert_setup(&conv, NULL, NULL);
+    }
 #endif
-	ga_append(gap, '"');
-	while (*res != NUL)
+    ga_append(gap, '"');
+    // `from` is the beginning of a sequence of bytes we can directly copy from
+    // the input string, avoiding the overhead associated with decoding/encoding
+    // them.
+    from = res;
+    while ((c = *res) != NUL)
+    {
+	// always use utf-8 encoding, ignore 'encoding'
+	if (c < 0x80)
 	{
-	    int c;
-	    // always use utf-8 encoding, ignore 'encoding'
-	    c = utf_ptr2char(res);
+	    if (!ascii_needs_escape[c])
+	    {
+		res += 1;
+		continue;
+	    }
+
+	    if (res != from)
+		ga_concat_len(gap, from, res - from);
+	    from = res + 1;
 
 	    switch (c)
 	    {
@@ -164,25 +199,43 @@
 		    ga_append(gap, c);
 		    break;
 		default:
-		    if (c >= 0x20)
-		    {
-			numbuf[utf_char2bytes(c, numbuf)] = NUL;
-			ga_concat(gap, numbuf);
-		    }
-		    else
-		    {
-			vim_snprintf((char *)numbuf, NUMBUFLEN,
-							 "\\u%04lx", (long)c);
-			ga_concat(gap, numbuf);
-		    }
+		    vim_snprintf((char *)numbuf, NUMBUFLEN, "\\u%04lx",
+								      (long)c);
+		    ga_concat(gap, numbuf);
 	    }
-	    res += utf_ptr2len(res);
+
+	    res += 1;
 	}
-	ga_append(gap, '"');
-#if defined(USE_ICONV)
-	vim_free(converted);
-#endif
+	else
+	{
+	    int l = utf_ptr2len(res);
+
+	    if (l > 1)
+	    {
+		res += l;
+		continue;
+	    }
+
+	    // Invalid utf-8 sequence, replace it with the Unicode replacement
+	    // character U+FFFD.
+	    if (res != from)
+		ga_concat_len(gap, from, res - from);
+	    from = res + 1;
+
+	    numbuf[utf_char2bytes(0xFFFD, numbuf)] = NUL;
+	    ga_concat(gap, numbuf);
+
+	    res += l;
+	}
     }
+
+    if (res != from)
+	ga_concat_len(gap, from, res - from);
+
+    ga_append(gap, '"');
+#if defined(USE_ICONV)
+    vim_free(converted);
+#endif
 }
 
 /*

diff --git a/src/testdir/test_json.vim b/src/testdir/test_json.vim
index 0248aa9..3ee7837 100644
--- a/src/testdir/test_json.vim
+++ b/src/testdir/test_json.vim

@@ -107,6 +107,9 @@
   call assert_equal('"café"', json_encode("caf\xe9"))
   let &encoding = save_encoding
 
+  " Invalid utf-8 sequences are replaced with U+FFFD (replacement character)
+  call assert_equal('"foo' . "\ufffd" . '"', json_encode("foo\xAB"))
+
   call assert_fails('echo json_encode(function("tr"))', 'E1161: Cannot json encode a func')
   call assert_fails('echo json_encode([function("tr")])', 'E1161: Cannot json encode a func')
 

diff --git a/src/version.c b/src/version.c
index 3e220f5..f4d300b 100644
--- a/src/version.c
+++ b/src/version.c

@@ -747,6 +747,8 @@
 static int included_patches[] =
 {   /* Add new patch number below this line */
 /**/
+    4695,
+/**/
     4694,
 /**/
     4693,
commit	beb0ef1ab2dbd9760345e3e03647b93914591d56	[log] [tgz]
author	LemonBoy <thatlemon@gmail.com>	Tue Apr 05 15:07:32 2022 +0100
committer	Bram Moolenaar <Bram@vim.org>	Tue Apr 05 15:07:32 2022 +0100
tree	6c210b9fc90f48c2b92a65ac5b0e4d98bbd007a7
parent	02560424bf838cadc8c19294af6b6b6c383ab291 [diff]