sf: optimize luma sampling code

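Removing the per-pixel normalization math from the
luma-sampling code (each channel was divided by 255
and the resulting luma multiplied by 255 again)
reduces its running time by about half
(~.5ms -> ~.25ms at 1.1MHz on a little core
in some experiments). getLuma() is also marked
inline.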

Bug: 127973145
Bug: 134922154
Test: trace and compare sampleArea times when
      scrolling in news app

Change-Id: Ie53d9595bea6685cf45f53972b42daa5e32fcc8e
diff --git a/services/surfaceflinger/RegionSamplingThread.cpp b/services/surfaceflinger/RegionSamplingThread.cpp
index 7fa33f5..fdc68aa 100644
--- a/services/surfaceflinger/RegionSamplingThread.cpp
+++ b/services/surfaceflinger/RegionSamplingThread.cpp
@@ -258,7 +258,7 @@
 
 namespace {
 // Using Rec. 709 primaries
-float getLuma(float r, float g, float b) {
+inline float getLuma(float r, float g, float b) {
     constexpr auto rec709_red_primary = 0.2126f;
     constexpr auto rec709_green_primary = 0.7152f;
     constexpr auto rec709_blue_primary = 0.0722f;
@@ -289,10 +289,10 @@
         const uint32_t* rowBase = data + row * stride;
         for (int32_t column = area.left; column < area.right; ++column) {
             uint32_t pixel = rowBase[column];
-            const float r = (pixel & 0xFF) / 255.0f;
-            const float g = ((pixel >> 8) & 0xFF) / 255.0f;
-            const float b = ((pixel >> 16) & 0xFF) / 255.0f;
-            const uint8_t luma = std::round(getLuma(r, g, b) * 255.0f);
+            const float r = pixel & 0xFF;
+            const float g = (pixel >> 8) & 0xFF;
+            const float b = (pixel >> 16) & 0xFF;
+            const uint8_t luma = std::round(getLuma(r, g, b));
             ++brightnessBuckets[luma];
             if (brightnessBuckets[luma] > majoritySampleNum) return luma / 255.0f;
         }