pvmfw: Implement software dirty state handling

Flush only dirty pages when dropping RW memory regions. Implement
handling of the dirty bit in software: map writable regions as
read-only and mark their pages writable-dirty when a write access
causes a permission fault.
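
A minimal sketch of the per-page state machine (hypothetical types;
the TLB and cache maintenance steps are elided):

  /// Model of one page's software-managed dirty state.
  #[derive(Clone, Copy, Debug, PartialEq)]
  enum PteState {
      /// Mapped read-only with DBM set; not written to yet.
      ReadOnlyClean,
      /// Read-only bit cleared after a permission fault on write.
      WritableDirty,
  }

  struct Page {
      state: PteState,
  }

  impl Page {
      /// Exception handler path: a write faults on a clean page, so
      /// clear the read-only bit to make it writable-dirty.
      fn handle_permission_fault(&mut self) {
          assert_eq!(self.state, PteState::ReadOnlyClean);
          self.state = PteState::WritableDirty;
      }

      /// Drop path: only writable-dirty pages need a cache flush.
      fn needs_flush(&self) -> bool {
          self.state == PteState::WritableDirty
      }
  }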

Bug: 269738062
Test: atest MicrodroidTestApp

Change-Id: I2e73a7cc867bae8b68c2a3b68d382405327f99e8
diff --git a/pvmfw/src/entry.rs b/pvmfw/src/entry.rs
index 4d2d696..ca74740 100644
--- a/pvmfw/src/entry.rs
+++ b/pvmfw/src/entry.rs
@@ -35,6 +35,7 @@
 use log::warn;
 use log::LevelFilter;
 use vmbase::{console, layout, logger, main, power::reboot};
+use zeroize::Zeroize;
 
 #[derive(Debug, Clone)]
 pub enum RebootReason {
@@ -82,18 +83,13 @@
 }
 
 impl<'a> MemorySlices<'a> {
-    fn new(
-        fdt: usize,
-        kernel: usize,
-        kernel_size: usize,
-        memory: &mut MemoryTracker,
-    ) -> Result<Self, RebootReason> {
+    fn new(fdt: usize, kernel: usize, kernel_size: usize) -> Result<Self, RebootReason> {
         // SAFETY - SIZE_2MB is non-zero.
         const FDT_SIZE: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(helpers::SIZE_2MB) };
         // TODO - Only map the FDT as read-only, until we modify it right before jump_to_payload()
         // e.g. by generating a DTBO for a template DT in main() and, on return, re-map DT as RW,
         // overwrite with the template DT and apply the DTBO.
-        let range = memory.alloc_mut(fdt, FDT_SIZE).map_err(|e| {
+        let range = MEMORY.lock().as_mut().unwrap().alloc_mut(fdt, FDT_SIZE).map_err(|e| {
             error!("Failed to allocate the FDT range: {e}");
             RebootReason::InternalError
         })?;
@@ -110,13 +106,13 @@
 
         let memory_range = info.memory_range;
         debug!("Resizing MemoryTracker to range {memory_range:#x?}");
-        memory.shrink(&memory_range).map_err(|_| {
-            error!("Failed to use memory range value from DT: {memory_range:#x?}");
+        MEMORY.lock().as_mut().unwrap().shrink(&memory_range).map_err(|e| {
+            error!("Failed to use memory range value from DT: {memory_range:#x?}: {e}");
             RebootReason::InvalidFdt
         })?;
 
         if get_hypervisor().has_cap(HypervisorCap::DYNAMIC_MEM_SHARE) {
-            memory.init_dynamic_shared_pool().map_err(|e| {
+            MEMORY.lock().as_mut().unwrap().init_dynamic_shared_pool().map_err(|e| {
                 error!("Failed to initialize dynamically shared pool: {e}");
                 RebootReason::InternalError
             })?;
@@ -126,14 +122,14 @@
                 RebootReason::InvalidFdt
             })?;
 
-            memory.init_static_shared_pool(range).map_err(|e| {
+            MEMORY.lock().as_mut().unwrap().init_static_shared_pool(range).map_err(|e| {
                 error!("Failed to initialize pre-shared pool {e}");
                 RebootReason::InvalidFdt
             })?;
         }
 
         let kernel_range = if let Some(r) = info.kernel_range {
-            memory.alloc_range(&r).map_err(|e| {
+            MEMORY.lock().as_mut().unwrap().alloc_range(&r).map_err(|e| {
                 error!("Failed to obtain the kernel range with DT range: {e}");
                 RebootReason::InternalError
             })?
@@ -145,7 +141,7 @@
                 RebootReason::InvalidPayload
             })?;
 
-            memory.alloc(kernel, kernel_size).map_err(|e| {
+            MEMORY.lock().as_mut().unwrap().alloc(kernel, kernel_size).map_err(|e| {
                 error!("Failed to obtain the kernel range with legacy range: {e}");
                 RebootReason::InternalError
             })?
@@ -160,7 +156,7 @@
 
         let ramdisk = if let Some(r) = info.initrd_range {
             debug!("Located ramdisk at {r:?}");
-            let r = memory.alloc_range(&r).map_err(|e| {
+            let r = MEMORY.lock().as_mut().unwrap().alloc_range(&r).map_err(|e| {
                 error!("Failed to obtain the initrd range: {e}");
                 RebootReason::InvalidRamdisk
             })?;
@@ -242,7 +238,7 @@
     debug!("... Success!");
 
     MEMORY.lock().replace(MemoryTracker::new(page_table));
-    let slices = MemorySlices::new(fdt, payload, payload_size, MEMORY.lock().as_mut().unwrap())?;
+    let slices = MemorySlices::new(fdt, payload, payload_size)?;
 
     rand::init().map_err(|e| {
         error!("Failed to initialize rand: {e}");
@@ -252,7 +248,8 @@
     // This wrapper allows main() to be blissfully ignorant of platform details.
     let next_bcc = crate::main(slices.fdt, slices.kernel, slices.ramdisk, bcc_slice, debug_policy)?;
 
-    helpers::flushed_zeroize(bcc_slice);
+    // Writable-dirty pages will be flushed when the MemoryTracker is dropped.
+    bcc_slice.zeroize();
 
     info!("Expecting a bug making MMIO_GUARD_UNMAP return NOT_SUPPORTED on success");
     MEMORY.lock().as_mut().unwrap().mmio_unmap_all().map_err(|e| {
@@ -395,13 +392,10 @@
 }
 
 unsafe fn get_appended_data_slice() -> &'static mut [u8] {
-    let base = helpers::align_up(layout::binary_end(), helpers::SIZE_4KB).unwrap();
-    // pvmfw is contained in a 2MiB region so the payload can't be larger than the 2MiB alignment.
-    let size = helpers::align_up(base, helpers::SIZE_2MB).unwrap() - base;
-
+    let range = mmu::PageTable::appended_payload_range();
     // SAFETY: This region is mapped and the linker script prevents it from overlapping with other
     // objects.
-    unsafe { slice::from_raw_parts_mut(base as *mut u8, size) }
+    unsafe { slice::from_raw_parts_mut(range.start as *mut u8, range.len()) }
 }
 
 enum AppendedConfigType {
diff --git a/pvmfw/src/exceptions.rs b/pvmfw/src/exceptions.rs
index 39641b0..e819729 100644
--- a/pvmfw/src/exceptions.rs
+++ b/pvmfw/src/exceptions.rs
@@ -89,15 +89,27 @@
     }
 }
 
+#[inline]
+fn handle_translation_fault(far: usize) -> Result<(), HandleExceptionError> {
+    let mut guard = MEMORY.try_lock().ok_or(HandleExceptionError::PageTableUnavailable)?;
+    let memory = guard.as_mut().ok_or(HandleExceptionError::PageTableNotInitialized)?;
+    Ok(memory.handle_mmio_fault(far)?)
+}
+
+#[inline]
+fn handle_permission_fault(far: usize) -> Result<(), HandleExceptionError> {
+    let mut guard = MEMORY.try_lock().ok_or(HandleExceptionError::PageTableUnavailable)?;
+    let memory = guard.as_mut().ok_or(HandleExceptionError::PageTableNotInitialized)?;
+    Ok(memory.handle_permission_fault(far)?)
+}
+
 fn handle_exception(esr: Esr, far: usize) -> Result<(), HandleExceptionError> {
     // Handle all translation faults on both read and write, and MMIO guard map
     // flagged invalid pages or blocks that caused the exception.
+    // Handle permission faults on DBM-flagged entries by marking them writable-dirty on write.
     match esr {
-        Esr::DataAbortTranslationFault => {
-            let mut locked = MEMORY.try_lock().ok_or(HandleExceptionError::PageTableUnavailable)?;
-            let memory = locked.as_mut().ok_or(HandleExceptionError::PageTableNotInitialized)?;
-            Ok(memory.handle_mmio_fault(far)?)
-        }
+        Esr::DataAbortTranslationFault => handle_translation_fault(far),
+        Esr::DataAbortPermissionFault => handle_permission_fault(far),
         _ => Err(HandleExceptionError::UnknownException),
     }
 }
diff --git a/pvmfw/src/helpers.rs b/pvmfw/src/helpers.rs
index 403c7e4..4b669d7 100644
--- a/pvmfw/src/helpers.rs
+++ b/pvmfw/src/helpers.rs
@@ -186,3 +186,46 @@
         core::ffi::CStr::from_bytes_with_nul(concat!($str, "\0").as_bytes()).unwrap()
     }};
 }
+
+/// Executes a data synchronization barrier.
+#[macro_export]
+macro_rules! dsb {
+    ($option:literal) => {{
+        // Safe because this is just a memory barrier and does not affect Rust.
+        #[allow(unused_unsafe)] // In case the macro is used within an unsafe block.
+        unsafe {
+            core::arch::asm!(concat!("dsb ", $option), options(nomem, nostack, preserves_flags));
+        }
+    }};
+}
+
+/// Executes an instruction synchronization barrier.
+#[macro_export]
+macro_rules! isb {
+    () => {{
+        // Safe because this is just a memory barrier and does not affect Rust.
+        #[allow(unused_unsafe)] // In case the macro is used within an unsafe block.
+        unsafe {
+            core::arch::asm!("isb", options(nomem, nostack, preserves_flags));
+        }
+    }};
+}
+
+/// Invalidates cached leaf PTE entries by virtual address.
+#[macro_export]
+macro_rules! tlbi {
+    ($option:literal, $asid:expr, $addr:expr) => {{
+        let asid: usize = $asid;
+        let addr: usize = $addr;
+        // Safe because it invalidates the TLB and doesn't affect Rust. When the address matches a
+        // block entry larger than the page size, all translations for the block are invalidated.
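+        // The Xt operand encodes the ASID in bits [63:48] and VA[55:12] in bits [43:0].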
+        #[allow(unused_unsafe)] // In case the macro is used within an unsafe block.
+        unsafe {
+            core::arch::asm!(
+                concat!("tlbi ", $option, ", {x}"),
+                x = in(reg) (asid << 48) | (addr >> 12),
+                options(nomem, nostack, preserves_flags)
+            );
+        }
+    }};
+}
diff --git a/pvmfw/src/memory.rs b/pvmfw/src/memory.rs
index 1a2b4b7..4ed3072 100644
--- a/pvmfw/src/memory.rs
+++ b/pvmfw/src/memory.rs
@@ -18,6 +18,7 @@
 
 use crate::helpers::{self, page_4kb_of, RangeExt, PVMFW_PAGE_SIZE, SIZE_4MB};
 use crate::mmu;
+use crate::{dsb, isb, tlbi};
 use aarch64_paging::paging::{Attributes, Descriptor, MemoryRegion as VaRange};
 use alloc::alloc::alloc_zeroed;
 use alloc::alloc::dealloc;
@@ -29,6 +30,7 @@
 use core::cmp::max;
 use core::cmp::min;
 use core::fmt;
+use core::iter::once;
 use core::num::NonZeroUsize;
 use core::ops::Range;
 use core::ptr::NonNull;
@@ -50,7 +52,7 @@
 pub static MEMORY: SpinMutex<Option<MemoryTracker>> = SpinMutex::new(None);
 unsafe impl Send for MemoryTracker {}
 
-#[derive(Clone, Copy, Debug, Default)]
+#[derive(Clone, Copy, Debug, Default, PartialEq)]
 enum MemoryType {
     #[default]
     ReadOnly,
@@ -121,6 +123,10 @@
     SharedPoolSetFailure,
     /// Invalid page table entry.
     InvalidPte,
+    /// Failed to flush memory region.
+    FlushRegionFailed,
+    /// Failed to set PTE dirty state.
+    SetPteDirtyFailed,
 }
 
 impl fmt::Display for MemoryTrackerError {
@@ -138,6 +144,8 @@
             Self::SharedMemorySetFailure => write!(f, "Failed to set SHARED_MEMORY"),
             Self::SharedPoolSetFailure => write!(f, "Failed to set SHARED_POOL"),
             Self::InvalidPte => write!(f, "Page table entry is not valid"),
+            Self::FlushRegionFailed => write!(f, "Failed to flush memory region"),
+            Self::SetPteDirtyFailed => write!(f, "Failed to set PTE dirty state"),
         }
     }
 }
@@ -386,19 +394,38 @@
         // Maps a single device page, breaking up block mappings if necessary.
         self.page_table.map_device(&page_range).map_err(|_| MemoryTrackerError::FailedToMap)
     }
+
+    /// Flushes all writable-dirty pages in the tracked memory regions and the appended payload.
+    fn flush_dirty_pages(&mut self) -> Result<()> {
+        // Collect memory ranges for which dirty state is tracked.
+        let writable_regions =
+            self.regions.iter().filter(|r| r.mem_type == MemoryType::ReadWrite).map(|r| &r.range);
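+        // The appended payload is writable but not tracked in `regions`, so include it explicitly.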
+        let payload_range = mmu::PageTable::appended_payload_range();
+        // Execute a barrier instruction to ensure all hardware updates to the page table have been
+        // observed before reading PTE flags to determine dirty state.
+        dsb!("ish");
+        // Now flush writable-dirty pages in those regions.
+        for range in writable_regions.chain(once(&payload_range)) {
+            self.page_table
+                .modify_range(range, &flush_dirty_range)
+                .map_err(|_| MemoryTrackerError::FlushRegionFailed)?;
+        }
+        Ok(())
+    }
+
+    /// Handles a permission fault on a read-only block by setting the writable-dirty state.
+    /// This should be called from the exception handler when hardware dirty state management
+    /// is disabled or unavailable.
+    pub fn handle_permission_fault(&mut self, addr: usize) -> Result<()> {
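+        // A one-byte range is enough to select the page table entry covering the faulting address.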
+        self.page_table
+            .modify_range(&(addr..addr + 1), &mark_dirty_block)
+            .map_err(|_| MemoryTrackerError::SetPteDirtyFailed)
+    }
 }
 
 impl Drop for MemoryTracker {
     fn drop(&mut self) {
-        for region in &self.regions {
-            match region.mem_type {
-                MemoryType::ReadWrite => {
-                    // TODO(b/269738062): Use PT's dirty bit to only flush pages that were touched.
-                    helpers::flush_region(region.range.start, region.range.len())
-                }
-                MemoryType::ReadOnly => {}
-            }
-        }
+        self.flush_dirty_pages().unwrap();
         self.unshare_all_memory()
     }
 }
@@ -494,11 +521,14 @@
     level: usize,
 ) -> result::Result<(), ()> {
     let flags = desc.flags().expect("Unsupported PTE flags set");
+    if !is_leaf_pte(&flags, level) {
+        return Ok(());
+    }
     // This function will be called on an address range that corresponds to a device. Only if a
     // page has been accessed (written to or read from), will it contain the VALID flag and be MMIO
     // guard mapped. Therefore, we can skip unmapping invalid pages, they were never MMIO guard
     // mapped anyway.
-    if is_leaf_pte(&flags, level) && flags.contains(Attributes::VALID) {
+    if flags.contains(Attributes::VALID) {
         assert!(
             flags.contains(mmu::MMIO_LAZY_MAP_FLAG),
             "Attempting MMIO guard unmap for non-device pages"
@@ -519,3 +549,48 @@
     }
     Ok(())
 }
+
+/// Flushes the region referred to by the descriptor, if it is in the writable-dirty state.
+fn flush_dirty_range(
+    va_range: &VaRange,
+    desc: &mut Descriptor,
+    level: usize,
+) -> result::Result<(), ()> {
+    // Only flush ranges corresponding to dirty leaf PTEs.
+    let flags = desc.flags().ok_or(())?;
+    if !is_leaf_pte(&flags, level) {
+        return Ok(());
+    }
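+    // With DBM-based tracking, a leaf PTE that is no longer read-only has been written to (dirty).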
+    if !flags.contains(Attributes::READ_ONLY) {
+        helpers::flush_region(va_range.start().0, va_range.len());
+    }
+    Ok(())
+}
+
+/// Clears the read-only flag on a PTE, making it writable-dirty. Used to handle permission faults
+/// on read-only descriptors when the dirty state is managed in software.
+fn mark_dirty_block(
+    va_range: &VaRange,
+    desc: &mut Descriptor,
+    level: usize,
+) -> result::Result<(), ()> {
+    let flags = desc.flags().ok_or(())?;
+    if !is_leaf_pte(&flags, level) {
+        return Ok(());
+    }
+    if flags.contains(Attributes::DBM) {
+        assert!(flags.contains(Attributes::READ_ONLY), "unexpected PTE writable state");
+        desc.modify_flags(Attributes::empty(), Attributes::READ_ONLY);
+        // Updating the read-only bit of a PTE requires TLB invalidation.
+        // A TLB maintenance instruction is only guaranteed to be complete after a DSB instruction.
+        // An ISB instruction is required to ensure the effects of completed TLB maintenance
+        // instructions are visible to instructions fetched afterwards.
+        // See ARM ARM E2.3.10, and G5.9.
+        tlbi!("vale1", mmu::PageTable::ASID, va_range.start().0);
+        dsb!("ish");
+        isb!();
+        Ok(())
+    } else {
+        Err(())
+    }
+}
diff --git a/pvmfw/src/mmu.rs b/pvmfw/src/mmu.rs
index ed9b209..c72ceea 100644
--- a/pvmfw/src/mmu.rs
+++ b/pvmfw/src/mmu.rs
@@ -36,20 +36,13 @@
 const CODE: Attributes = MEMORY.union(Attributes::READ_ONLY);
 const DATA: Attributes = MEMORY.union(Attributes::EXECUTE_NEVER);
 const RODATA: Attributes = DATA.union(Attributes::READ_ONLY);
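+// Data mapped read-only with DBM set; the permission fault handler makes it writable-dirty.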
+const DATA_DBM: Attributes = RODATA.union(Attributes::DBM);
 
 /// High-level API for managing MMU mappings.
 pub struct PageTable {
     idmap: IdMap,
 }
 
-fn appended_payload_range() -> Range<usize> {
-    let start = helpers::align_up(layout::binary_end(), helpers::SIZE_4KB).unwrap();
-    // pvmfw is contained in a 2MiB region so the payload can't be larger than the 2MiB alignment.
-    let end = helpers::align_up(start, helpers::SIZE_2MB).unwrap();
-
-    start..end
-}
-
 /// Region allocated for the stack.
 pub fn stack_range() -> Range<usize> {
     const STACK_PAGES: usize = 8;
@@ -58,18 +51,28 @@
 }
 
 impl PageTable {
-    const ASID: usize = 1;
+    pub const ASID: usize = 1;
     const ROOT_LEVEL: usize = 1;
 
+    /// Returns the memory range reserved for the appended payload.
+    pub fn appended_payload_range() -> Range<usize> {
+        let start = helpers::align_up(layout::binary_end(), helpers::SIZE_4KB).unwrap();
+        // pvmfw is contained in a 2MiB region so the payload can't be larger than the 2MiB
+        // alignment.
+        let end = helpers::align_up(start, helpers::SIZE_2MB).unwrap();
+        start..end
+    }
+
     /// Creates an instance pre-populated with pvmfw's binary layout.
     pub fn from_static_layout() -> Result<Self, MapError> {
         let mut page_table = Self { idmap: IdMap::new(Self::ASID, Self::ROOT_LEVEL) };
 
+        // The stack and scratch ranges are explicitly zeroed and flushed before jumping to the
+        // payload, so dirty state management can be omitted for them.
+        page_table.map_range(&layout::scratch_range(), DATA)?;
+        page_table.map_range(&stack_range(), DATA)?;
         page_table.map_code(&layout::text_range())?;
-        page_table.map_data(&layout::scratch_range())?;
-        page_table.map_data(&stack_range())?;
         page_table.map_rodata(&layout::rodata_range())?;
-        page_table.map_data(&appended_payload_range())?;
+        page_table.map_data(&Self::appended_payload_range())?;
 
         Ok(page_table)
     }
@@ -87,7 +90,7 @@
     }
 
     pub fn map_data(&mut self, range: &Range<usize>) -> Result<(), MapError> {
-        self.map_range(range, DATA)
+        self.map_range(range, DATA_DBM)
     }
 
     pub fn map_code(&mut self, range: &Range<usize>) -> Result<(), MapError> {