More scanf cleanup.
Merge CT_CCL and CT_STRING handling before we add %m.
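Also fix an accidental scanf/wscanf difference: when a literal character
in the format doesn't match the input, wscanf now reports a matching
failure (returning the number of assignments so far) instead of an input
failure, which is what the new scanf/wscanf test expects of both.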
Add currently-disabled tests for questionable behavior noticed during
code review that isn't a regression, but should be fixed later.
Bug: http://b/68672236
Bug: https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=202240
Test: ran tests
Change-Id: I3eec9b7dfce84f63c68426406224822c52551d64
diff --git a/libc/stdio/vfscanf.c b/libc/stdio/vfscanf.c
index 887b435..c9e4385 100644
--- a/libc/stdio/vfscanf.c
+++ b/libc/stdio/vfscanf.c
@@ -324,10 +324,7 @@
/* scan arbitrary characters (sets NOSKIP) */
if (width == 0) width = 1;
if (flags & LONG) {
- if ((flags & SUPPRESS) == 0)
- wcp = va_arg(ap, wchar_t*);
- else
- wcp = NULL;
+ wcp = ((flags & SUPPRESS) == 0) ? va_arg(ap, wchar_t*) : NULL;
n = 0;
while (width != 0) {
if (n == (int)MB_CUR_MAX) {
@@ -388,20 +385,17 @@
break;
case CT_CCL:
- /* scan a (nonempty) character class (sets NOSKIP) */
- if (width == 0) width = (size_t)~0; /* `infinity' */
- /* take only those things in the class */
+ case CT_STRING:
+ // CT_CCL: scan a (nonempty) character class (sets NOSKIP).
+ // CT_STRING: like CCL, but zero-length string OK, & no NOSKIP.
+ if (width == 0) width = (size_t)~0; // 'infinity'.
if (flags & LONG) {
wchar_t twc;
- int nchars;
+ int nchars = 0;
- if ((flags & SUPPRESS) == 0)
- wcp = va_arg(ap, wchar_t*);
- else
- wcp = &twc;
+ wcp = (flags & SUPPRESS) == 0 ? va_arg(ap, wchar_t*) : &twc;
n = 0;
- nchars = 0;
- while (width != 0) {
+ while ((c == CT_CCL || !isspace(*fp->_p)) && width != 0) {
if (n == (int)MB_CUR_MAX) {
fp->_flags |= __SERR;
goto input_failure;
@@ -417,7 +411,7 @@
}
if (nconv == 0) *wcp = L'\0';
if (nconv != (size_t)-2) {
- if (wctob(*wcp) != EOF && !ccltab[wctob(*wcp)]) {
+ if ((c == CT_CCL && wctob(*wcp) != EOF && !ccltab[wctob(*wcp)]) || (c == CT_STRING && iswspace(*wcp))) {
while (n != 0) {
n--;
ungetc(buf[n], fp);
@@ -438,121 +432,46 @@
break;
}
}
- if (n != 0) {
+ if (c == CT_CCL && n != 0) {
fp->_flags |= __SERR;
goto input_failure;
}
n = nchars;
- if (n == 0) goto match_failure;
- if (!(flags & SUPPRESS)) {
- *wcp = L'\0';
- nassigned++;
- }
- } else
- /* take only those things in the class */
- if (flags & SUPPRESS) {
+ } else if (flags & SUPPRESS) {
n = 0;
- while (ccltab[*fp->_p]) {
+ while ((c == CT_CCL && ccltab[*fp->_p]) || (c == CT_STRING && !isspace(*fp->_p))) {
n++, fp->_r--, fp->_p++;
if (--width == 0) break;
if (fp->_r <= 0 && __srefill(fp)) {
- if (n == 0) goto input_failure;
+ if (c == CT_CCL && n == 0) goto input_failure;
break;
}
}
- if (n == 0) goto match_failure;
} else {
p0 = p = va_arg(ap, char*);
- while (ccltab[*fp->_p]) {
+ while ((c == CT_CCL && ccltab[*fp->_p]) || (c == CT_STRING && !isspace(*fp->_p))) {
fp->_r--;
*p++ = *fp->_p++;
if (--width == 0) break;
if (fp->_r <= 0 && __srefill(fp)) {
- if (p == p0) goto input_failure;
+ if (c == CT_CCL && p == p0) goto input_failure;
break;
}
}
n = p - p0;
- if (n == 0) goto match_failure;
- *p = '\0';
- nassigned++;
+ }
+ if (c == CT_CCL && n == 0) goto match_failure;
+ if (!(flags & SUPPRESS)) {
+ if (flags & LONG) {
+ *wcp = L'\0';
+ } else {
+ *p = '\0';
+ }
+ ++nassigned;
}
nread += n;
break;
- case CT_STRING:
- /* like CCL, but zero-length string OK, & no NOSKIP */
- if (width == 0) width = (size_t)~0;
- if (flags & LONG) {
- wchar_t twc;
-
- if ((flags & SUPPRESS) == 0)
- wcp = va_arg(ap, wchar_t*);
- else
- wcp = &twc;
- n = 0;
- while (!isspace(*fp->_p) && width != 0) {
- if (n == (int)MB_CUR_MAX) {
- fp->_flags |= __SERR;
- goto input_failure;
- }
- buf[n++] = *fp->_p;
- fp->_p++;
- fp->_r--;
- memset(&mbs, 0, sizeof(mbs));
- nconv = mbrtowc(wcp, buf, n, &mbs);
- if (nconv == (size_t)-1) {
- fp->_flags |= __SERR;
- goto input_failure;
- }
- if (nconv == 0) *wcp = L'\0';
- if (nconv != (size_t)-2) {
- if (iswspace(*wcp)) {
- while (n != 0) {
- n--;
- ungetc(buf[n], fp);
- }
- break;
- }
- nread += n;
- width--;
- if (!(flags & SUPPRESS)) wcp++;
- n = 0;
- }
- if (fp->_r <= 0 && __srefill(fp)) {
- if (n != 0) {
- fp->_flags |= __SERR;
- goto input_failure;
- }
- break;
- }
- }
- if (!(flags & SUPPRESS)) {
- *wcp = L'\0';
- nassigned++;
- }
- } else if (flags & SUPPRESS) {
- n = 0;
- while (!isspace(*fp->_p)) {
- n++, fp->_r--, fp->_p++;
- if (--width == 0) break;
- if (fp->_r <= 0 && __srefill(fp)) break;
- }
- nread += n;
- } else {
- p0 = p = va_arg(ap, char*);
- while (!isspace(*fp->_p)) {
- fp->_r--;
- *p++ = *fp->_p++;
- if (--width == 0) break;
- if (fp->_r <= 0 && __srefill(fp)) break;
- }
- *p = '\0';
- nread += p - p0;
- nassigned++;
- }
- continue;
-
case CT_INT:
/* scan an integer as if by strtoimax/strtoumax */
#ifdef hardway
diff --git a/libc/stdio/vfwscanf.c b/libc/stdio/vfwscanf.c
index 1030a62..71cd49b 100644
--- a/libc/stdio/vfwscanf.c
+++ b/libc/stdio/vfwscanf.c
@@ -182,7 +182,7 @@
if ((wi = __fgetwc_unlock(fp)) == WEOF) goto input_failure;
if (wi != c) {
__ungetwc(wi, fp);
- goto input_failure;
+ goto match_failure;
}
nread++;
continue;
@@ -402,28 +402,26 @@
break;
case CT_CCL:
- /* scan a (nonempty) character class (sets NOSKIP) */
- if (width == 0) width = (size_t)~0; /* `infinity' */
- /* take only those things in the class */
+ case CT_STRING:
+ // CT_CCL: scan a (nonempty) character class (sets NOSKIP).
+ // CT_STRING: like CCL, but zero-length string OK, & no NOSKIP.
+ if (width == 0) width = (size_t)~0; // 'infinity'.
if ((flags & SUPPRESS) && (flags & LONG)) {
n = 0;
- while ((wi = __fgetwc_unlock(fp)) != WEOF && width-- != 0 && in_ccl(wi, ccl)) n++;
+ while ((wi = __fgetwc_unlock(fp)) != WEOF && width-- != 0 && ((c == CT_CCL && in_ccl(wi, ccl)) || (c == CT_STRING && !iswspace(wi)))) n++;
if (wi != WEOF) __ungetwc(wi, fp);
- if (n == 0) goto match_failure;
} else if (flags & LONG) {
p0 = p = va_arg(ap, wchar_t*);
- while ((wi = __fgetwc_unlock(fp)) != WEOF && width-- != 0 && in_ccl(wi, ccl))
+ while ((wi = __fgetwc_unlock(fp)) != WEOF && width-- != 0 && ((c == CT_CCL && in_ccl(wi, ccl)) || (c == CT_STRING && !iswspace(wi)))) {
*p++ = (wchar_t)wi;
+ }
if (wi != WEOF) __ungetwc(wi, fp);
n = p - p0;
- if (n == 0) goto match_failure;
- *p = 0;
- nassigned++;
} else {
if (!(flags & SUPPRESS)) mbp = va_arg(ap, char*);
n = 0;
memset(&mbs, 0, sizeof(mbs));
- while ((wi = __fgetwc_unlock(fp)) != WEOF && width != 0 && in_ccl(wi, ccl)) {
+ while ((wi = __fgetwc_unlock(fp)) != WEOF && width != 0 && ((c == CT_CCL && in_ccl(wi, ccl)) || (c == CT_STRING && !iswspace(wi)))) {
if (width >= MB_CUR_MAX && !(flags & SUPPRESS)) {
nconv = wcrtomb(mbp, wi, &mbs);
if (nconv == (size_t)-1) goto input_failure;
@@ -438,57 +436,20 @@
n++;
}
if (wi != WEOF) __ungetwc(wi, fp);
- if (n == 0) goto match_failure;
- if (!(flags & SUPPRESS)) {
- *mbp = 0;
- nassigned++;
+ }
+ if (c == CT_CCL && n == 0) goto match_failure;
+ if (!(flags & SUPPRESS)) {
+ if (flags & LONG) {
+ *p = L'\0';
+ } else {
+ *mbp = '\0';
}
+ ++nassigned;
}
nread += n;
nconversions++;
break;
- case CT_STRING:
- /* like CCL, but zero-length string OK, & no NOSKIP */
- if (width == 0) width = (size_t)~0;
- if ((flags & SUPPRESS) && (flags & LONG)) {
- while ((wi = __fgetwc_unlock(fp)) != WEOF && width-- != 0 && !iswspace(wi)) nread++;
- if (wi != WEOF) __ungetwc(wi, fp);
- } else if (flags & LONG) {
- p0 = p = va_arg(ap, wchar_t*);
- while ((wi = __fgetwc_unlock(fp)) != WEOF && width-- != 0 && !iswspace(wi)) {
- *p++ = (wchar_t)wi;
- nread++;
- }
- if (wi != WEOF) __ungetwc(wi, fp);
- *p = 0;
- nassigned++;
- } else {
- if (!(flags & SUPPRESS)) mbp = va_arg(ap, char*);
- memset(&mbs, 0, sizeof(mbs));
- while ((wi = __fgetwc_unlock(fp)) != WEOF && width != 0 && !iswspace(wi)) {
- if (width >= MB_CUR_MAX && !(flags & SUPPRESS)) {
- nconv = wcrtomb(mbp, wi, &mbs);
- if (nconv == (size_t)-1) goto input_failure;
- } else {
- nconv = wcrtomb(mbbuf, wi, &mbs);
- if (nconv == (size_t)-1) goto input_failure;
- if (nconv > width) break;
- if (!(flags & SUPPRESS)) memcpy(mbp, mbbuf, nconv);
- }
- if (!(flags & SUPPRESS)) mbp += nconv;
- width -= nconv;
- nread++;
- }
- if (wi != WEOF) __ungetwc(wi, fp);
- if (!(flags & SUPPRESS)) {
- *mbp = 0;
- nassigned++;
- }
- }
- nconversions++;
- continue;
-
case CT_INT:
/* scan an integer as if by strtoimax/strtoumax */
if (width == 0 || width > sizeof(buf) / sizeof(*buf) - 1)
diff --git a/tests/stdio_test.cpp b/tests/stdio_test.cpp
index f0e0ab6..e060cd9 100644
--- a/tests/stdio_test.cpp
+++ b/tests/stdio_test.cpp
@@ -1016,6 +1016,95 @@
CheckScanf(swscanf, L"+,-/.", L"%[+--/]", 1, "+,-/");
}
+// A format literal that doesn't match the input yields 0; empty input yields EOF. See https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=202240
+TEST(STDIO_TEST, scanf_wscanf_EOF) {
+ EXPECT_EQ(0, sscanf("b", "ab"));  // Format literal a vs. input b: a mismatch, not end of input.
+ EXPECT_EQ(EOF, sscanf("", "a"));  // No input at all.
+ EXPECT_EQ(0, swscanf(L"b", L"ab"));  // The wide variant is expected to agree with sscanf.
+ EXPECT_EQ(EOF, swscanf(L"", L"a"));
+}
+
+TEST(STDIO_TEST, scanf_invalid_UTF8) {
+#if 0 // TODO: disabled. Written during code review to record desired behavior; not a regression, so fix later and then enable.
+ char buf[BUFSIZ];
+ wchar_t wbuf[BUFSIZ];
+
+ memset(buf, 0, sizeof(buf));
+ memset(wbuf, 0, sizeof(wbuf));
+ EXPECT_EQ(0, sscanf("\xc0" " foo", "%ls %s", wbuf, buf));  // Desired: the invalid byte 0xc0 makes %ls fail, so nothing is assigned.
+#endif
+}
+
+TEST(STDIO_TEST, scanf_no_match_no_termination) {  // A failed %[ or %s conversion must leave the destination untouched.
+ char buf[4] = "x";  // The leading x is a sentinel: any write (including a NUL terminator) would replace it.
+ EXPECT_EQ(0, sscanf("d", "%[abc]", buf));  // Input d is not in the class: matching failure.
+ EXPECT_EQ('x', buf[0]);
+ EXPECT_EQ(0, swscanf(L"d", L"%[abc]", buf));  // Same check through the wide scanner with a narrow destination.
+ EXPECT_EQ('x', buf[0]);
+
+ wchar_t wbuf[4] = L"x";  // Same sentinel scheme for the wide destination.
+ EXPECT_EQ(0, swscanf(L"d", L"%l[abc]", wbuf));
+ EXPECT_EQ(L'x', wbuf[0]);
+
+ EXPECT_EQ(EOF, sscanf("", "%s", buf));  // Empty input: EOF, and still no write to buf.
+ EXPECT_EQ('x', buf[0]);
+
+ EXPECT_EQ(EOF, swscanf(L"", L"%ls", wbuf));
+ EXPECT_EQ(L'x', wbuf[0]);
+}
+
+TEST(STDIO_TEST, scanf_wscanf_wide_character_class) {
+#if 0 // TODO: disabled. Written during code review to record questionable behavior; not a regression, so fix later and then enable.
+ wchar_t buf[BUFSIZ];
+
+ // A wide character shouldn't match an ASCII-only class for scanf or wscanf.
+ memset(buf, 0, sizeof(buf));
+ EXPECT_EQ(1, sscanf("xĀyz", "%l[xy]", buf));
+ EXPECT_EQ(L"x"s, std::wstring(buf));
+ memset(buf, 0, sizeof(buf));
+ EXPECT_EQ(1, swscanf(L"xĀyz", L"%l[xy]", buf));
+ EXPECT_EQ(L"x"s, std::wstring(buf));
+
+ // Even if scanf has wide characters in a class, they won't match...
+ // TODO: is that a bug?
+ memset(buf, 0, sizeof(buf));
+ EXPECT_EQ(1, sscanf("xĀyz", "%l[xĀy]", buf));
+ EXPECT_EQ(L"x"s, std::wstring(buf));
+ // ...unless you use wscanf.
+ memset(buf, 0, sizeof(buf));
+ EXPECT_EQ(1, swscanf(L"xĀyz", L"%l[xĀy]", buf));
+ EXPECT_EQ(L"xĀy"s, std::wstring(buf));
+
+ // Negation only covers ASCII for scanf...
+ memset(buf, 0, sizeof(buf));
+ EXPECT_EQ(1, sscanf("xĀyz", "%l[^ab]", buf));
+ EXPECT_EQ(L"x"s, std::wstring(buf));
+ // ...but covers wide characters for wscanf.
+ memset(buf, 0, sizeof(buf));
+ EXPECT_EQ(1, swscanf(L"xĀyz", L"%l[^ab]", buf));
+ EXPECT_EQ(L"xĀyz"s, std::wstring(buf));
+
+ // As the scanf cases above expect, non-ASCII characters are ignored in scanf classes.
+ memset(buf, 0, sizeof(buf));
+ EXPECT_EQ(1, sscanf("x"
+ "\xc4\x80" // Each byte appears in a different wide char of the class: c4 in c4 81, 80 in c5 80.
+ "\xc6\x82" // Neither byte is in the class.
+ "yz",
+ "%l[xy" "\xc5\x80" "\xc4\x81" "]", buf));
+ EXPECT_EQ(L"x", std::wstring(buf));
+ // bionic and glibc both behave badly for wscanf here, so the current behavior is recorded as expected for now...
+ memset(buf, 0, sizeof(buf));
+ EXPECT_EQ(1, swscanf(L"x"
+ L"\xc4\x80"
+ L"\xc6\x82"
+ L"yz",
+ L"%l[xy" L"\xc5\x80" L"\xc4\x81" L"]", buf));
+ // Note that this isn't L"xĀ" --- although the *bytes* matched, they're
+ // not put back together as a wide character.
+ EXPECT_EQ(L"x" L"\xc4" L"\x80", std::wstring(buf));
+#endif
+}
+
TEST(STDIO_TEST, cantwrite_EBADF) {
// If we open a file read-only...
FILE* fp = fopen("/proc/version", "r");