[RE-SkiaVk] Utilize Skia's new VK_ERROR_DEVICE_LOST callback to log
Skia will invoke this callback when it encounters VK_ERROR_DEVICE_LOST.
Since RE is enabling the VK_EXT_device_fault extension (when available),
additional data should be provided from the driver. If the extension is
not available, Skia will provide a generic `description` indicating
that no info is available.
The goal of this logging is to provide as much verbose information as
possible in the logs, while condensing any differentiating info into a
more succinct crash message to aid clustering.
Bug: 313369997
Test: manually forced crash
Change-Id: Ibda00573a1db9d8aaa83057207f0f699cc756e33
diff --git a/libs/renderengine/skia/SkiaVkRenderEngine.cpp b/libs/renderengine/skia/SkiaVkRenderEngine.cpp
index 3af85c0..5cf0310 100644
--- a/libs/renderengine/skia/SkiaVkRenderEngine.cpp
+++ b/libs/renderengine/skia/SkiaVkRenderEngine.cpp
@@ -34,6 +34,7 @@
#include <cstdint>
#include <memory>
+#include <string>
#include <vector>
#include <vulkan/vulkan.h>
@@ -67,6 +68,13 @@
DestroySemaphoreInfo(VkSemaphore semaphore) : mSemaphore(semaphore) {}
};
+namespace {
+void onVkDeviceFault(void* callbackContext, const std::string& description,
+ const std::vector<VkDeviceFaultAddressInfoEXT>& addressInfos,
+ const std::vector<VkDeviceFaultVendorInfoEXT>& vendorInfos,
+ const std::vector<std::byte>& vendorBinaryData);
+} // anonymous namespace
+
struct VulkanInterface {
bool initialized = false;
VkInstance instance;
@@ -79,6 +87,7 @@
VkPhysicalDeviceFeatures2* physicalDeviceFeatures2 = nullptr;
VkPhysicalDeviceSamplerYcbcrConversionFeatures* samplerYcbcrConversionFeatures = nullptr;
VkPhysicalDeviceProtectedMemoryFeatures* protectedMemoryFeatures = nullptr;
+ VkPhysicalDeviceFaultFeaturesEXT* deviceFaultFeatures = nullptr;
GrVkGetProc grGetProc;
bool isProtected;
bool isRealtimePriority;
@@ -100,6 +109,8 @@
backendContext.fDeviceFeatures2 = physicalDeviceFeatures2;
backendContext.fGetProc = grGetProc;
backendContext.fProtectedContext = isProtected ? GrProtected::kYes : GrProtected::kNo;
+ backendContext.fDeviceLostContext = this; // VulkanInterface is long-lived
+ backendContext.fDeviceLostProc = onVkDeviceFault;
return backendContext;
};
@@ -178,6 +189,67 @@
}
};
+namespace {
+void onVkDeviceFault(void* callbackContext, const std::string& description,
+ const std::vector<VkDeviceFaultAddressInfoEXT>& addressInfos,
+ const std::vector<VkDeviceFaultVendorInfoEXT>& vendorInfos,
+ const std::vector<std::byte>& vendorBinaryData) {
+ VulkanInterface* interface = static_cast<VulkanInterface*>(callbackContext);
+ const string protectedStr = interface->isProtected ? "protected" : "non-protected";
+ // The final crash string should contain as much differentiating info as possible, up to 1024
+ // bytes. As this final message is constructed, the same information is also dumped to the logs
+ // but in a more verbose format. Building the crash string is unsightly, so the clearer logging
+ // statement is always placed first to give context.
+ ALOGE("VK_ERROR_DEVICE_LOST (%s context): %s", protectedStr.c_str(), description.c_str());
+ string crashStr = "VK_ERROR_DEVICE_LOST (" + protectedStr;
+
+ if (!addressInfos.empty()) {
+ ALOGE("%zu VkDeviceFaultAddressInfoEXT:", addressInfos.size());
+ crashStr += ", " + std::to_string(addressInfos.size()) + " address info (";
+ for (VkDeviceFaultAddressInfoEXT addressInfo : addressInfos) {
+ ALOGE(" addressType: %d", (int)addressInfo.addressType);
+ ALOGE(" reportedAddress: %" PRIu64, addressInfo.reportedAddress);
+ ALOGE(" addressPrecision: %" PRIu64, addressInfo.addressPrecision);
+ crashStr += std::to_string(addressInfo.addressType) + ":" +
+ std::to_string(addressInfo.reportedAddress) + ":" +
+ std::to_string(addressInfo.addressPrecision) + ", ";
+ }
+ crashStr.resize(crashStr.size() - 2); // Remove trailing ", "
+ crashStr += ")";
+ }
+
+ if (!vendorInfos.empty()) {
+ ALOGE("%zu VkDeviceFaultVendorInfoEXT:", vendorInfos.size());
+ crashStr += ", " + std::to_string(vendorInfos.size()) + " vendor info (";
+ for (VkDeviceFaultVendorInfoEXT vendorInfo : vendorInfos) {
+ ALOGE(" description: %s", vendorInfo.description);
+ ALOGE(" vendorFaultCode: %" PRIu64, vendorInfo.vendorFaultCode);
+ ALOGE(" vendorFaultData: %" PRIu64, vendorInfo.vendorFaultData);
+ // Omit descriptions for individual vendor info structs in the crash string, as the
+ // fault code and fault data fields should be enough for clustering, and the verbosity
+ // isn't worth it. Additionally, vendors may just set the general description field of
+ // the overall fault to the description of the first element in this list, and that
+ // overall description will be placed at the end of the crash string.
+ crashStr += std::to_string(vendorInfo.vendorFaultCode) + ":" +
+ std::to_string(vendorInfo.vendorFaultData) + ", ";
+ }
+ crashStr.resize(crashStr.size() - 2); // Remove trailing ", "
+ crashStr += ")";
+ }
+
+ if (!vendorBinaryData.empty()) {
+ // TODO: b/322830575 - Log in base64, or dump directly to a file that gets put in bugreports
+ ALOGE("%zu bytes of vendor-specific binary data (please notify Android's Core Graphics"
+ " Stack team if you observe this message).",
+ vendorBinaryData.size());
+ crashStr += ", " + std::to_string(vendorBinaryData.size()) + " bytes binary";
+ }
+
+ crashStr += "): " + description;
+ LOG_ALWAYS_FATAL("%s", crashStr.c_str());
+};
+} // anonymous namespace
+
static GrVkGetProc sGetProc = [](const char* proc_name, VkInstance instance, VkDevice device) {
if (device != VK_NULL_HANDLE) {
return vkGetDeviceProcAddr(device, proc_name);
@@ -429,6 +501,14 @@
tailPnext = &interface.protectedMemoryFeatures->pNext;
}
+ if (interface.grExtensions.hasExtension(VK_EXT_DEVICE_FAULT_EXTENSION_NAME, 1)) {
+ interface.deviceFaultFeatures = new VkPhysicalDeviceFaultFeaturesEXT;
+ interface.deviceFaultFeatures->sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FAULT_FEATURES_EXT;
+ interface.deviceFaultFeatures->pNext = nullptr;
+ *tailPnext = interface.deviceFaultFeatures;
+ tailPnext = &interface.deviceFaultFeatures->pNext;
+ }
+
vkGetPhysicalDeviceFeatures2(physicalDevice, interface.physicalDeviceFeatures2);
// Looks like this would slow things down and we can't depend on it on all platforms
interface.physicalDeviceFeatures2->features.robustBufferAccess = VK_FALSE;
@@ -545,6 +625,10 @@
delete interface->physicalDeviceFeatures2;
}
+ if (interface->deviceFaultFeatures) {
+ delete interface->deviceFaultFeatures;
+ }
+
interface->samplerYcbcrConversionFeatures = nullptr;
interface->physicalDeviceFeatures2 = nullptr;
interface->protectedMemoryFeatures = nullptr;