eclair snapshot
diff --git a/libpixelflinger/Android.mk b/libpixelflinger/Android.mk
index 50eb5f5..0cc85d9 100644
--- a/libpixelflinger/Android.mk
+++ b/libpixelflinger/Android.mk
@@ -64,12 +64,14 @@
 LOCAL_MODULE:= libpixelflinger
 LOCAL_SRC_FILES := $(PIXELFLINGER_SRC_FILES)
 LOCAL_CFLAGS := $(PIXELFLINGER_CFLAGS)
+
 ifneq ($(BUILD_TINY_ANDROID),true)
 # Really this should go away entirely or at least not depend on
 # libhardware, but this at least gets us built.
 LOCAL_SHARED_LIBRARIES += libhardware_legacy
 LOCAL_CFLAGS += -DWITH_LIB_HARDWARE
 endif
+
 ifeq ($(TARGET_ARCH),arm)
 LOCAL_WHOLE_STATIC_LIBRARIES := libpixelflinger_armv6
 endif
diff --git a/libpixelflinger/codeflinger/blending.cpp b/libpixelflinger/codeflinger/blending.cpp
index f10217b..083042c 100644
--- a/libpixelflinger/codeflinger/blending.cpp
+++ b/libpixelflinger/codeflinger/blending.cpp
@@ -221,17 +221,7 @@
                 build_blend_factor(dst_factor, fd,
                         component, pixel, fragment, fb, scratches);
                 mul_factor_add(temp, fb, dst_factor, component_t(fragment));
-                if (fd==GGL_ONE_MINUS_SRC_ALPHA) {
-                    // XXX: in theory this is not correct, we should
-                    // saturate here. However, this mode is often
-                    // used for displaying alpha-premultiplied graphics,
-                    // in which case, saturation is not necessary.
-                    // unfortunatelly, we have no way to know.
-                    // This is a case, where we sacrifice correctness for
-                    // performance. we should probably have some heuristics.
-                } else {
-                    component_sat(temp);
-                }
+                component_sat(temp);
             }
         } else {
             // compute fs
diff --git a/libpixelflinger/t32cb16blend.S b/libpixelflinger/t32cb16blend.S
index d4b2579..caf9eb7 100644
--- a/libpixelflinger/t32cb16blend.S
+++ b/libpixelflinger/t32cb16blend.S
@@ -21,53 +21,80 @@
 	
 	.global scanline_t32cb16blend_arm
 
-// uses r6, r7, lr
 
-.macro pixel,   DREG, SRC, FB, OFFSET
+/*
+ * .macro pixel
+ *
+ * \DREG is a 32-bit register containing *two* original destination RGB565 
+ *       pixels, with the even one in the low-16 bits, and the odd one in the
+ *       high 16 bits.
+ *
+ * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
+ *
+ * \FB is the target register: ODD=0 overwrites it, ODD=1 ORs into it.
+ *
+ * \ODD is either 0 or 1: 0 blends the lower (even) 16-bit pixel of DREG
+ *      into FB, 1 blends the upper (odd) 16-bit pixel.
+ *
+ *
+ * clobbered: r6, r7, lr
+ *
+ */
 
-    // SRC = AARRGGBB
+.macro pixel,   DREG, SRC, FB, ODD
+
+    // SRC = 0xAABBGGRR
     mov     r7, \SRC, lsr #24           // sA
     add     r7, r7, r7, lsr #7          // sA + (sA >> 7)
     rsb     r7, r7, #0x100              // sA = 0x100 - (sA+(sA>>7))
 
 1:
 
-.if \OFFSET
+.if \ODD
 
     // red
-    mov     lr, \DREG, lsr #(\OFFSET + 6 + 5)
+    mov     lr, \DREG, lsr #(16 + 11)
     smulbb  lr, r7, lr
     mov     r6, \SRC, lsr #3
     and     r6, r6, #0x1F
     add     lr, r6, lr, lsr #8
-    orr     \FB, lr, lsl #(\OFFSET + 11)
+    cmp     lr, #0x1F
+    orrhs   \FB, \FB, #(0x1F<<(16 + 11))
+    orrlo   \FB, \FB, lr, lsl #(16 + 11)
 
         // green
-        and     r6, \DREG, #(0x3F<<(\OFFSET + 5))
+        and     r6, \DREG, #(0x3F<<(16 + 5))
         smulbt  r6, r7, r6
         mov     lr, \SRC, lsr #(8+2)
         and     lr, lr, #0x3F
         add     r6, lr, r6, lsr #(5+8)
-        orr     \FB, \FB, r6, lsl #(\OFFSET + 5)
+        cmp     r6, #0x3F
+        orrhs   \FB, \FB, #(0x3F<<(16 + 5))
+        orrlo   \FB, \FB, r6, lsl #(16 + 5)
 
             // blue
-            and     lr, \DREG, #(0x1F << \OFFSET)
+            and     lr, \DREG, #(0x1F << 16)
             smulbt  lr, r7, lr
             mov     r6, \SRC, lsr #(8+8+3)
             and     r6, r6, #0x1F
             add     lr, r6, lr, lsr #8
-            orr     \FB, \FB, lr, lsl #\OFFSET
+            cmp     lr, #0x1F
+            orrhs   \FB, \FB, #(0x1F << 16)
+            orrlo   \FB, \FB, lr, lsl #16
 
 .else
 
     // red
-    mov     lr, \DREG, lsr #(6+5)
+    mov     lr, \DREG, lsr #11
     and     lr, lr, #0x1F
     smulbb  lr, r7, lr
     mov     r6, \SRC, lsr #3
     and     r6, r6, #0x1F
     add     lr, r6, lr, lsr #8
-    mov     \FB, lr, lsl #11
+    cmp     lr, #0x1F
+    movhs   \FB, #(0x1F<<11)
+    movlo   \FB, lr, lsl #11
+
 
         // green
         and     r6, \DREG, #(0x3F<<5)
@@ -75,7 +102,9 @@
         mov     lr, \SRC, lsr #(8+2)
         and     lr, lr, #0x3F
         add     r6, lr, r6, lsr #(5+8)
-        orr     \FB, \FB, r6, lsl #5
+        cmp     r6, #0x3F
+        orrhs   \FB, \FB, #(0x3F<<5)
+        orrlo   \FB, \FB, r6, lsl #5
 
             // blue
             and     lr, \DREG, #0x1F
@@ -83,7 +112,9 @@
             mov     r6, \SRC, lsr #(8+8+3)
             and     r6, r6, #0x1F
             add     lr, r6, lr, lsr #8
-            orr     \FB, \FB, lr
+            cmp     lr, #0x1F
+            orrhs   \FB, \FB, #0x1F
+            orrlo   \FB, \FB, lr
 
 .endif
 
@@ -128,7 +159,7 @@
     subs    r2, r2, #2
     blo     9f
 
-    // The main loop is unrolled twice and process 4 pixels
+    // The main loop is unrolled twice and processes 4 pixels
 8:  ldmia   r1!, {r4, r5}
     // stream the source
     pld     [r1, #32]
@@ -142,7 +173,7 @@
     // stream the destination
     pld     [r0, #32]
     pixel   r3, r4, r12, 0
-    pixel   r3, r5, r12, 16
+    pixel   r3, r5, r12, 1
     // effectively, we're getting write-combining by virtue of the
     // cpu's write-back cache.
     str     r12, [r0, #-4]