Diagnose/eliminate FilesystemCopierAction unit test failure.
* Added a retry count for failed write operations: for the particular
failure at hand (an EIO return value on the last write call), we would
attempt to "rewrite" the buffer up to a given number of times. This
will tell us whether the error we're getting is transient or
persistent. This mechanism will try to reposition the output stream to
where it last succeeded, and re-mark buffer as full. The retry count
is zero for all instances of FilesystemCopierAction with the exception
of the instance used in RunAsRootSimpleTest (where it's set to 3).
Note, however, that we will keep failing to operation to ensure that
the unit tests are failing (and logs can be inspected). If this proves
to be a transient error that can be worked around via retry, we'll
probably leave this mechanism in place (but will stop failing the
action).
* Added a debug message that prints the number of bytes we're trying to
write when we attempt to write the residual (i.e. last piece of) data.
This is just to be sure that we're passing the correct number.
* Removed the random selection of data to be copied during
RunAsRootSimpleTest. It is obvious by now that only the sizes that are
not divisible by the (unknown but likely a reasonably large exponent
of two) fragment size are failing.
BUG=chromium-os:31082
TEST=Builds and runs unit tests
Change-Id: I3367eee638333686ab24997297d868cee416ff96
Reviewed-on: https://gerrit.chromium.org/gerrit/27094
Reviewed-by: Gilad Arnold <garnold@chromium.org>
Tested-by: Gilad Arnold <garnold@chromium.org>
Reviewed-by: Jay Srinivasan <jaysri@chromium.org>
Commit-Ready: Gilad Arnold <garnold@chromium.org>
diff --git a/filesystem_copier_action.cc b/filesystem_copier_action.cc
index 340dd1d..1f0fbc7 100644
--- a/filesystem_copier_action.cc
+++ b/filesystem_copier_action.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+// Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -15,6 +15,8 @@
#include <string>
#include <vector>
+#include <base/string_util.h>
+#include <base/stringprintf.h>
#include <gio/gio.h>
#include <gio/gunixinputstream.h>
#include <gio/gunixoutputstream.h>
@@ -36,8 +38,8 @@
} // namespace {}
FilesystemCopierAction::FilesystemCopierAction(
- bool copying_kernel_install_path,
- bool verify_hash)
+ bool copying_kernel_install_path, bool verify_hash,
+ unsigned max_rewrite_attempts)
: copying_kernel_install_path_(copying_kernel_install_path),
verify_hash_(verify_hash),
src_stream_(NULL),
@@ -45,7 +47,12 @@
read_done_(false),
failed_(false),
cancelled_(false),
- filesystem_size_(kint64max) {
+ filesystem_size_(kint64max),
+ total_bytes_written_(0),
+ num_curr_rewrite_attempts_(0),
+ num_total_rewrite_attempts_(0),
+ max_rewrite_attempts_(max_rewrite_attempts),
+ is_debug_last_read_(false) {
// A lot of code works on the implicit assumption that processing is done on
// exactly 2 ping-pong buffers.
COMPILE_ASSERT(arraysize(buffer_) == 2 &&
@@ -156,6 +163,18 @@
cancelled_ = g_cancellable_is_cancelled(canceller_[index]) == TRUE;
ssize_t bytes_read = g_input_stream_read_finish(src_stream_, res, &error);
+
+ // TODO(garnold) this is debugging code which needs to be removed once we
+ // figure out the reasons for write I/O error.
+ ssize_t bytes_expected = std::min(static_cast<int64_t>(buffer_[0].size()),
+ filesystem_size_);
+ if (!is_debug_last_read_ && bytes_read > 0 && bytes_read < bytes_expected) {
+ LOG(INFO) << "[debug] read fewer bytes than expected ("
+ << bytes_read << " < " << bytes_expected
+ << "), this is likely the final read";
+ is_debug_last_read_ = true;
+ }
+
if (bytes_read < 0) {
LOG(ERROR) << "Read failed: " << utils::GetAndFreeGError(&error);
failed_ = true;
@@ -196,7 +215,6 @@
GAsyncResult *res) {
int index = buffer_state_[0] == kBufferStateWriting ? 0 : 1;
CHECK(buffer_state_[index] == kBufferStateWriting);
- buffer_state_[index] = kBufferStateEmpty;
GError* error = NULL;
CHECK(canceller_[index]);
@@ -205,16 +223,52 @@
ssize_t bytes_written = g_output_stream_write_finish(dst_stream_,
res,
&error);
+
if (bytes_written < static_cast<ssize_t>(buffer_valid_size_[index])) {
if (bytes_written < 0) {
- LOG(ERROR) << "Write failed: " << utils::GetAndFreeGError(&error);
+ LOG(ERROR) << "Write error: " << utils::GetAndFreeGError(&error);
} else {
- LOG(ERROR) << "Write was short: wrote " << bytes_written
- << " but expected to write " << buffer_valid_size_[index];
+ LOG(ERROR) << "Wrote too few bytes: " << bytes_written
+ << " instead of " << buffer_valid_size_[index];
}
- failed_ = true;
+
+ bool do_fail = true; // fail, unless we can retry
+
+ if (cancelled_) {
+ LOG(ERROR) << "write operation cancelled, copying failed";
+ } else if (num_curr_rewrite_attempts_ < max_rewrite_attempts_) {
+ // Try again: mark buffer as full, reposition the output stream.
+ num_curr_rewrite_attempts_++;
+ num_total_rewrite_attempts_++;
+ off_t ret =
+ lseek(g_unix_output_stream_get_fd(G_UNIX_OUTPUT_STREAM(dst_stream_)),
+ static_cast<off_t>(total_bytes_written_),
+ SEEK_SET);
+ if (ret == static_cast<off_t>(total_bytes_written_)) {
+ buffer_state_[index] = kBufferStateFull;
+ do_fail = false;
+ LOG(INFO) << "attempting to rewrite current buffer at offset "
+ << total_bytes_written_ << " (" << num_curr_rewrite_attempts_
+ << " of " << max_rewrite_attempts_ << " attempts)";
+ } else {
+ PLOG(ERROR)
+ << "can't reposition output stream for rewrite, copying failed";
+ }
+ } else {
+ LOG(ERROR) << "rewrite attempts exhausted (" << max_rewrite_attempts_
+ << "), copying failed";
+ }
+
+ if (do_fail)
+ failed_ = true;
+ } else {
+ total_bytes_written_ += static_cast<size_t>(bytes_written);
+ num_curr_rewrite_attempts_ = 0;
}
+ if (buffer_state_[index] == kBufferStateWriting)
+ buffer_state_[index] = kBufferStateEmpty;
+
SpawnAsyncActions();
}
@@ -259,6 +313,12 @@
buffer_state_[i] = kBufferStateReading;
} else if (!writing && !verify_hash_ &&
buffer_state_[i] == kBufferStateFull) {
+ // TODO(garnold) debugging code, to be removed.
+ if (is_debug_last_read_) {
+ LOG(INFO) << "[debug] spawning async write of "
+ << buffer_valid_size_[i] << " bytes";
+ }
+
g_output_stream_write_async(
dst_stream_,
buffer_[i].data(),
@@ -274,6 +334,15 @@
if (!reading && !writing) {
// We're done!
ActionExitCode code = kActionCodeSuccess;
+ // TODO(garnold) we're signaling a failure if buffer rewriting was attempted
+ // during the copy action; this is necessary for keeping our unit tests
+ // failing, allowing us to diagnose the root cause for write I/O errors.
+ // When diagnosis is complete, this error return code needs to be reversed.
+ if (num_total_rewrite_attempts_) {
+ code = kActionCodeError;
+ LOG(ERROR) << "[debug] rewrite was attempted "
+ << num_total_rewrite_attempts_ << " time(s), copying failed";
+ }
if (hasher_.Finalize()) {
LOG(INFO) << "Hash: " << hasher_.hash();
if (verify_hash_) {