@@ -0,0 +1,1184 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains a macro ('generate_composite_function') which can
+ * construct 2D image processing functions, based on a common template.
+ * Any combination of source, destination and mask images with 8bpp,
+ * 16bpp, 24bpp or 32bpp color formats is supported.
+ *
+ * This macro takes care of:
+ * - handling of leading and trailing unaligned pixels
+ * - doing most of the work related to L2 cache preload
+ * - encouraging the use of software pipelining for better instruction
+ *   scheduling
+ *
+ * The user of this macro has to provide some configuration parameters
+ * (bit depths for the images, prefetch distance, etc.) and a set of
+ * macros, which should implement basic code chunks responsible for
+ * pixel processing. See the 'pixman-arm-neon-asm.S' file for usage
+ * examples.
+ *
+ * TODO:
+ * - try overlapped pixel method (from Ian Rickards) when processing
+ *   exactly two blocks of pixels
+ * - maybe add an option to do reverse scanline processing
+ */
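+
+/*
+ * Illustration only (not part of this file): a typical instantiation,
+ * modeled on the ones in pixman-arm-neon-asm.S, looks roughly like the
+ * sketch below. The composite function name, the three user-supplied
+ * pixblock macros and the prefetch distance are assumptions used purely
+ * as an example here.
+ *
+ *   generate_composite_function \
+ *       pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
+ *       FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ *       8, 6, \
+ *       default_init, \
+ *       default_cleanup, \
+ *       pixman_composite_src_8888_0565_process_pixblock_head, \
+ *       pixman_composite_src_8888_0565_process_pixblock_tail, \
+ *       pixman_composite_src_8888_0565_process_pixblock_tail_head
+ *
+ * Here 32/0/16 are the source/mask/destination bit depths, 8 is the
+ * pixblock size and 6 is an illustrative prefetch distance.
+ */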
+
+/*
+ * Bit flags for the 'generate_composite_function' macro which are used
+ * to tune the behavior of the generated functions.
+ */
+.set FLAG_DST_WRITEONLY, 0
+.set FLAG_DST_READWRITE, 1
+.set FLAG_DEINTERLEAVE_32BPP, 2
+
+/*
+ * Offset in the stack where the mask and source pointer/stride can be accessed
+ * from the 'init' macro. This is useful for special handling of a solid mask.
+ */
+.set ARGS_STACK_OFFSET, 40
+
+/*
+ * Constants for selecting the preferred prefetch type.
+ */
+.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */
+.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */
+.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */
+
+/*
+ * Definitions of supplementary pixld/pixst macros (for partial load/store of
+ * pixel data).
+ */
+
+.macro pixldst1 op, elem_size, reg1, mem_operand, abits
+.if abits > 0
+ op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
+.else
+ op&.&elem_size {d&reg1}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
+.if abits > 0
+ op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
+.else
+ op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
+.if abits > 0
+ op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
+.else
+ op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
+ op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
+ op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
+.endm
+
+.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
+ op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
+.if numbytes == 32
+ pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
+ %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif numbytes == 16
+ pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
+.elseif numbytes == 8
+ pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
+.elseif numbytes == 4
+ .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
+ pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
+ .elseif elem_size == 16
+ pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
+ pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
+ .else
+ pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
+ pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
+ pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
+ pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
+ .endif
+.elseif numbytes == 2
+ .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
+ pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
+ .else
+ pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
+ pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
+ .endif
+.elseif numbytes == 1
+ pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
+.else
+ .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
+ %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+ pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+ pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixst numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
+ %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+ pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+ pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixld_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+ pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+ pixld numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+.macro pixst_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+ pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+ pixst numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
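+
+/*
+ * Illustration (derived from the macros above, not an extra API): the
+ * invocation 'pixld_a 8, 16, 4, DST_R' loads eight 16bpp pixels and
+ * expands through pixld/pixldst/pixldst2 into
+ *
+ *     vld1.16 {d6, d7}, [DST_R, :128]!
+ *
+ * i.e. 16 bytes into d6-d7 with a 128-bit alignment hint, while a plain
+ * 'pixld 8, 16, 4, DST_R' emits the same load without the ':128' hint.
+ */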
+
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
+ * aliases to be defined)
+ */
+.macro pixld1_s elem_size, reg1, mem_operand
+.if elem_size == 16
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP1, mem_operand, TMP1, asl #1
+ mov TMP2, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP2, mem_operand, TMP2, asl #1
+ vld1.16 {d&reg1&[0]}, [TMP1, :16]
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP1, mem_operand, TMP1, asl #1
+ vld1.16 {d&reg1&[1]}, [TMP2, :16]
+ mov TMP2, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP2, mem_operand, TMP2, asl #1
+ vld1.16 {d&reg1&[2]}, [TMP1, :16]
+ vld1.16 {d&reg1&[3]}, [TMP2, :16]
+.elseif elem_size == 32
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP1, mem_operand, TMP1, asl #2
+ mov TMP2, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP2, mem_operand, TMP2, asl #2
+ vld1.32 {d&reg1&[0]}, [TMP1, :32]
+ vld1.32 {d&reg1&[1]}, [TMP2, :32]
+.else
+ .error "unsupported"
+.endif
+.endm
+
+.macro pixld2_s elem_size, reg1, reg2, mem_operand
+.if 0 /* elem_size == 32 */
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X, asl #1
+ add TMP1, mem_operand, TMP1, asl #2
+ mov TMP2, VX, asr #16
+ sub VX, VX, UNIT_X
+ add TMP2, mem_operand, TMP2, asl #2
+ vld1.32 {d&reg1&[0]}, [TMP1, :32]
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X, asl #1
+ add TMP1, mem_operand, TMP1, asl #2
+ vld1.32 {d&reg2&[0]}, [TMP2, :32]
+ mov TMP2, VX, asr #16
+ add VX, VX, UNIT_X
+ add TMP2, mem_operand, TMP2, asl #2
+ vld1.32 {d&reg1&[1]}, [TMP1, :32]
+ vld1.32 {d&reg2&[1]}, [TMP2, :32]
+.else
+ pixld1_s elem_size, reg1, mem_operand
+ pixld1_s elem_size, reg2, mem_operand
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand
+.if elem_size == 16
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP1, mem_operand, TMP1, asl #1
+ vld1.16 {d&reg1&[idx]}, [TMP1, :16]
+.elseif elem_size == 32
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP1, mem_operand, TMP1, asl #2
+ vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+.endif
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+.if numbytes == 32
+ pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
+ pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
+ pixdeinterleave elem_size, %(basereg+4)
+.elseif numbytes == 16
+ pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
+.elseif numbytes == 8
+ pixld1_s elem_size, %(basereg+1), mem_operand
+.elseif numbytes == 4
+ .if elem_size == 32
+ pixld0_s elem_size, %(basereg+0), 1, mem_operand
+ .elseif elem_size == 16
+ pixld0_s elem_size, %(basereg+0), 2, mem_operand
+ pixld0_s elem_size, %(basereg+0), 3, mem_operand
+ .else
+ pixld0_s elem_size, %(basereg+0), 4, mem_operand
+ pixld0_s elem_size, %(basereg+0), 5, mem_operand
+ pixld0_s elem_size, %(basereg+0), 6, mem_operand
+ pixld0_s elem_size, %(basereg+0), 7, mem_operand
+ .endif
+.elseif numbytes == 2
+ .if elem_size == 16
+ pixld0_s elem_size, %(basereg+0), 1, mem_operand
+ .else
+ pixld0_s elem_size, %(basereg+0), 2, mem_operand
+ pixld0_s elem_size, %(basereg+0), 3, mem_operand
+ .endif
+.elseif numbytes == 1
+ pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.else
+ .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if bpp > 0
+ pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.endif
+.endm
+
+.macro vuzp8 reg1, reg2
+ vuzp.8 d&reg1, d&reg2
+.endm
+
+.macro vzip8 reg1, reg2
+ vzip.8 d&reg1, d&reg2
+.endm
+
+/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixdeinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vuzp8 %(basereg+0), %(basereg+1)
+ vuzp8 %(basereg+2), %(basereg+3)
+ vuzp8 %(basereg+1), %(basereg+3)
+ vuzp8 %(basereg+0), %(basereg+2)
+.endif
+.endm
+
+/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vzip8 %(basereg+0), %(basereg+2)
+ vzip8 %(basereg+1), %(basereg+3)
+ vzip8 %(basereg+2), %(basereg+3)
+ vzip8 %(basereg+0), %(basereg+1)
+.endif
+.endm
+
+/*
+ * This is a macro for implementing cache preload. The main idea is that
+ * the cache preload logic is mostly independent of the rest of the pixel
+ * processing code. It starts at the top left pixel and moves forward
+ * across pixels and can jump across scanlines. The prefetch distance is
+ * handled in an 'incremental' way: it starts from 0 and advances to the
+ * optimal distance over time. After reaching the optimal prefetch distance,
+ * it is kept constant. There are some checks which prevent prefetching
+ * unneeded pixel lines below the image (but it can still prefetch a bit
+ * more data on the right side of the image - not a big issue, and it may
+ * actually be helpful when rendering text glyphs). An additional trick is
+ * the use of an LDR instruction for prefetch instead of PLD when moving to
+ * the next line: the point is that we have a high chance of getting a TLB
+ * miss in this case, and PLD would be useless.
+ *
+ * This sounds like it may introduce a noticeable overhead (when working with
+ * fully cached data). But in reality, due to having a separate pipeline and
+ * instruction queue for the NEON unit in ARM Cortex-A8, normal ARM code can
+ * execute simultaneously with NEON and be completely shadowed by it. Thus
+ * we get no performance overhead at all (*). This looks like a very nice
+ * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
+ * but we can still implement some rather advanced prefetch logic in software
+ * for almost zero cost!
+ *
+ * (*) The overhead of the prefetcher is visible when running some trivial
+ * pixel processing like a simple copy. Anyway, having prefetch is a must
+ * when working with graphics data.
+ */
+.macro PF a, x:vararg
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
+ a x
+.endif
+.endm
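+
+/*
+ * Sketch of how PF is used: 'PF add PF_X, PF_X, #1' expands to
+ * 'add PF_X, PF_X, #1' when PREFETCH_TYPE_CURRENT is PREFETCH_TYPE_ADVANCED
+ * and to nothing at all otherwise, so the prefetch code below can be
+ * written unconditionally and simply disappears for the SIMPLE and NONE
+ * prefetch types.
+ */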
+
+.macro cache_preload std_increment, boost_increment
+.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
+.if regs_shortage
+ PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+.endif
+.if std_increment != 0
+ PF add PF_X, PF_X, #std_increment
+.endif
+ PF tst PF_CTL, #0xF
+ PF addne PF_X, PF_X, #boost_increment
+ PF subne PF_CTL, PF_CTL, #1
+ PF cmp PF_X, ORIG_W
+.if src_bpp_shift >= 0
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+.endif
+.if dst_r_bpp != 0
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+.endif
+.if mask_bpp_shift >= 0
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+.endif
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+.if src_bpp_shift >= 0
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endif
+.if dst_r_bpp != 0
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+.endif
+.if mask_bpp_shift >= 0
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+.endif
+.endif
+.endm
+
+.macro cache_preload_simple
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
+.if src_bpp > 0
+ pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
+.endif
+.if dst_r_bpp > 0
+ pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
+.endif
+.if mask_bpp > 0
+ pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
+.endif
+.endif
+.endm
+
+.macro fetch_mask_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+.endm
+
+/*
+ * Macro which is used to process leading pixels until the destination
+ * pointer is properly aligned (at a 16 byte boundary). When the destination
+ * buffer uses a 16bpp format, this is unnecessary, or even pointless.
+ */
+.macro ensure_destination_ptr_alignment process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+.if dst_w_bpp != 24
+ tst DST_R, #0xF
+ beq 2f
+
+.irp lowbit, 1, 2, 4, 8, 16
+local skip1
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_R, #lowbit
+ beq 1f
+.endif
+ pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+ pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+.if dst_r_bpp > 0
+ pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+.else
+ add DST_R, DST_R, #lowbit
+.endif
+ PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+ sub W, W, #(lowbit * 8 / dst_w_bpp)
+1:
+.endif
+.endr
+ pixdeinterleave src_bpp, src_basereg
+ pixdeinterleave mask_bpp, mask_basereg
+ pixdeinterleave dst_r_bpp, dst_r_basereg
+
+ process_pixblock_head
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+ process_pixblock_tail
+
+ pixinterleave dst_w_bpp, dst_w_basereg
+.irp lowbit, 1, 2, 4, 8, 16
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_W, #lowbit
+ beq 1f
+.endif
+ pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+1:
+.endif
+.endr
+.endif
+2:
+.endm
+
+/*
+ * Special code for processing up to (pixblock_size - 1) remaining
+ * trailing pixels. As SIMD processing performs operations on
+ * pixblock_size pixels, anything smaller than this has to be loaded
+ * and stored in a special way. Loading and storing of pixel data is
+ * performed in such a way that we fill some 'slots' in the NEON
+ * registers (some slots are naturally unused), then perform the
+ * compositing operation as usual. In the end, the data is taken from
+ * these 'slots' and saved to memory.
+ *
+ * cache_preload_flag - allows prefetch to be suppressed if
+ * set to 0
+ * dst_aligned_flag - selects whether the destination buffer
+ * is aligned
+ */
+.macro process_trailing_pixels cache_preload_flag, \
+ dst_aligned_flag, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+ tst W, #(pixblock_size - 1)
+ beq 2f
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+ tst W, #chunk_size
+ beq 1f
+ pixld_src chunk_size, src_bpp, src_basereg, SRC
+ pixld chunk_size, mask_bpp, mask_basereg, MASK
+.if dst_aligned_flag != 0
+ pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.else
+ pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.endif
+.if cache_preload_flag != 0
+ PF add PF_X, PF_X, #chunk_size
+.endif
+1:
+.endif
+.endr
+ pixdeinterleave src_bpp, src_basereg
+ pixdeinterleave mask_bpp, mask_basereg
+ pixdeinterleave dst_r_bpp, dst_r_basereg
+
+ process_pixblock_head
+.if cache_preload_flag != 0
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+.endif
+ process_pixblock_tail
+ pixinterleave dst_w_bpp, dst_w_basereg
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+ tst W, #chunk_size
+ beq 1f
+.if dst_aligned_flag != 0
+ pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.else
+ pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.endif
+1:
+.endif
+.endr
+2:
+.endm
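+
+/*
+ * Worked example (derived from the .irp loops above): with pixblock_size
+ * set to 8 and 5 trailing pixels left (W & 7 == 5), the first .irp loop
+ * loads a chunk of 4 pixels and a chunk of 1 pixel into different
+ * register 'slots', a single head + tail pass processes them together,
+ * and the second .irp loop stores the same 4 + 1 split back to memory.
+ */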
+
+/*
+ * Macro which performs all the operations needed to switch to the next
+ * scanline and start the next loop iteration unless all the scanlines
+ * are already processed.
+ */
+.macro advance_to_next_scanline start_of_loop_label
+.if regs_shortage
+ ldrd W, [sp] /* load W and H (width and height) from stack */
+.else
+ mov W, ORIG_W
+.endif
+ add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
+.if src_bpp != 0
+ add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
+.endif
+.if mask_bpp != 0
+ add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
+.endif
+.if (dst_w_bpp != 24)
+ sub DST_W, DST_W, W, lsl #dst_bpp_shift
+.endif
+.if (src_bpp != 24) && (src_bpp != 0)
+ sub SRC, SRC, W, lsl #src_bpp_shift
+.endif
+.if (mask_bpp != 24) && (mask_bpp != 0)
+ sub MASK, MASK, W, lsl #mask_bpp_shift
+.endif
+ subs H, H, #1
+ mov DST_R, DST_W
+.if regs_shortage
+ str H, [sp, #4] /* save updated height to stack */
+.endif
+ bge start_of_loop_label
+.endm
+
+/*
+ * Registers are allocated in the following way by default:
+ * d0, d1, d2, d3 - reserved for loading source pixel data
+ * d4, d5, d6, d7 - reserved for loading destination pixel data
+ * d24, d25, d26, d27 - reserved for loading mask pixel data
+ * d28, d29, d30, d31 - final destination pixel data for writeback to memory
+ */
+.macro generate_composite_function fname, \
+ src_bpp_, \
+ mask_bpp_, \
+ dst_w_bpp_, \
+ flags, \
+ pixblock_size_, \
+ prefetch_distance, \
+ init, \
+ cleanup, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head, \
+ dst_w_basereg_ = 28, \
+ dst_r_basereg_ = 4, \
+ src_basereg_ = 0, \
+ mask_basereg_ = 24
+
+ pixman_asm_function fname
+
+ push {r4-r12, lr} /* save all registers */
+
+/*
+ * Select the prefetch type for this function. If the prefetch distance is
+ * set to 0, prefetching is disabled entirely; if one of the color formats
+ * is 24bpp, SIMPLE prefetch is used instead of ADVANCED.
+ */
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
+.if prefetch_distance == 0
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
+ ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
+.endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set src_bpp, src_bpp_
+ .set mask_bpp, mask_bpp_
+ .set dst_w_bpp, dst_w_bpp_
+ .set pixblock_size, pixblock_size_
+ .set dst_w_basereg, dst_w_basereg_
+ .set dst_r_basereg, dst_r_basereg_
+ .set src_basereg, src_basereg_
+ .set mask_basereg, mask_basereg_
+
+ .macro pixld_src x:vararg
+ pixld x
+ .endm
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
+/*
+ * Assign symbolic names to registers
+ */
+ W .req r0 /* width (is updated during processing) */
+ H .req r1 /* height (is updated during processing) */
+ DST_W .req r2 /* destination buffer pointer for writes */
+ DST_STRIDE .req r3 /* destination image stride */
+ SRC .req r4 /* source buffer pointer */
+ SRC_STRIDE .req r5 /* source image stride */
+ DST_R .req r6 /* destination buffer pointer for reads */
+
+ MASK .req r7 /* mask pointer */
+ MASK_STRIDE .req r8 /* mask stride */
+
+ PF_CTL .req r9 /* combined lines counter and prefetch */
+ /* distance increment counter */
+ PF_X .req r10 /* pixel index in a scanline for current */
+ /* prefetch position */
+ PF_SRC .req r11 /* pointer to source scanline start */
+ /* for prefetch purposes */
+ PF_DST .req r12 /* pointer to destination scanline start */
+ /* for prefetch purposes */
+ PF_MASK .req r14 /* pointer to mask scanline start */
+ /* for prefetch purposes */
+/*
+ * Check whether we have enough registers for all the local variables.
+ * If we don't have enough registers, the original width and height are
+ * kept on top of the stack (and the 'regs_shortage' variable is set to
+ * indicate this for the rest of the code). Even if there are enough
+ * registers, the allocation scheme may be a bit different depending on
+ * whether the source or mask is unused.
+ */
+.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
+ ORIG_W .req r10 /* saved original width */
+ DUMMY .req r12 /* temporary register */
+ .set regs_shortage, 0
+.elseif mask_bpp == 0
+ ORIG_W .req r7 /* saved original width */
+ DUMMY .req r8 /* temporary register */
+ .set regs_shortage, 0
+.elseif src_bpp == 0
+ ORIG_W .req r4 /* saved original width */
+ DUMMY .req r5 /* temporary register */
+ .set regs_shortage, 0
+.else
+ ORIG_W .req r1 /* saved original width */
+ DUMMY .req r1 /* temporary register */
+ .set regs_shortage, 1
+.endif
+
+ .set mask_bpp_shift, -1
+.if src_bpp == 32
+ .set src_bpp_shift, 2
+.elseif src_bpp == 24
+ .set src_bpp_shift, 0
+.elseif src_bpp == 16
+ .set src_bpp_shift, 1
+.elseif src_bpp == 8
+ .set src_bpp_shift, 0
+.elseif src_bpp == 0
+ .set src_bpp_shift, -1
+.else
+ .error "requested src bpp (src_bpp) is not supported"
+.endif
+.if mask_bpp == 32
+ .set mask_bpp_shift, 2
+.elseif mask_bpp == 24
+ .set mask_bpp_shift, 0
+.elseif mask_bpp == 8
+ .set mask_bpp_shift, 0
+.elseif mask_bpp == 0
+ .set mask_bpp_shift, -1
+.else
+ .error "requested mask bpp (mask_bpp) is not supported"
+.endif
+.if dst_w_bpp == 32
+ .set dst_bpp_shift, 2
+.elseif dst_w_bpp == 24
+ .set dst_bpp_shift, 0
+.elseif dst_w_bpp == 16
+ .set dst_bpp_shift, 1
+.elseif dst_w_bpp == 8
+ .set dst_bpp_shift, 0
+.else
+ .error "requested dst bpp (dst_w_bpp) is not supported"
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+ .set dst_r_bpp, dst_w_bpp
+.else
+ .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+ .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+ .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+.if prefetch_distance < 0 || prefetch_distance > 15
+ .error "invalid prefetch distance (prefetch_distance)"
+.endif
+
+.if src_bpp > 0
+ ldr SRC, [sp, #40]
+.endif
+.if mask_bpp > 0
+ ldr MASK, [sp, #48]
+.endif
+ PF mov PF_X, #0
+.if src_bpp > 0
+ ldr SRC_STRIDE, [sp, #44]
+.endif
+.if mask_bpp > 0
+ ldr MASK_STRIDE, [sp, #52]
+.endif
+ mov DST_R, DST_W
+
+.if src_bpp == 24
+ sub SRC_STRIDE, SRC_STRIDE, W
+ sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
+.endif
+.if mask_bpp == 24
+ sub MASK_STRIDE, MASK_STRIDE, W
+ sub MASK_STRIDE, MASK_STRIDE, W, lsl #1
+.endif
+.if dst_w_bpp == 24
+ sub DST_STRIDE, DST_STRIDE, W
+ sub DST_STRIDE, DST_STRIDE, W, lsl #1
+.endif
+
+/*
+ * Set up the advanced prefetcher's initial state
+ */
+ PF mov PF_SRC, SRC
+ PF mov PF_DST, DST_R
+ PF mov PF_MASK, MASK
+ /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+ PF mov PF_CTL, H, lsl #4
+ PF add PF_CTL, #(prefetch_distance - 0x10)
+
+ init
+.if regs_shortage
+ push {r0, r1}
+.endif
+ subs H, H, #1
+.if regs_shortage
+ str H, [sp, #4] /* save updated height to stack */
+.else
+ mov ORIG_W, W
+.endif
+ blt 9f
+ cmp W, #(pixblock_size * 2)
+ blt 8f
+/*
+ * This is the start of the pipelined loop, which is optimized for
+ * long scanlines
+ */
+0:
+ ensure_destination_ptr_alignment process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+
+ /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+ pixld_a pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+ PF add PF_X, PF_X, #pixblock_size
+ process_pixblock_head
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+ subs W, W, #(pixblock_size * 2)
+ blt 2f
+1:
+ process_pixblock_tail_head
+ cache_preload_simple
+ subs W, W, #pixblock_size
+ bge 1b
+2:
+ process_pixblock_tail
+ pixst_a pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+
+ /* Process the remaining trailing pixels in the scanline */
+ process_trailing_pixels 1, 1, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+ advance_to_next_scanline 0b
+
+.if regs_shortage
+ pop {r0, r1}
+.endif
+ cleanup
+ pop {r4-r12, pc} /* exit */
+/*
+ * This is the start of the loop, designed to process images with small width
+ * (less than pixblock_size * 2 pixels). In this case neither pipelining
+ * nor prefetch is used.
+ */
+8:
+ /* Process exactly pixblock_size pixels if needed */
+ tst W, #pixblock_size
+ beq 1f
+ pixld pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+ process_pixblock_head
+ process_pixblock_tail
+ pixst pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+1:
+ /* Process the remaining trailing pixels in the scanline */
+ process_trailing_pixels 0, 0, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+ advance_to_next_scanline 8b
+9:
+.if regs_shortage
+ pop {r0, r1}
+.endif
+ cleanup
+ pop {r4-r12, pc} /* exit */
+
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
+ .unreq SRC
+ .unreq MASK
+ .unreq DST_R
+ .unreq DST_W
+ .unreq ORIG_W
+ .unreq W
+ .unreq H
+ .unreq SRC_STRIDE
+ .unreq DST_STRIDE
+ .unreq MASK_STRIDE
+ .unreq PF_CTL
+ .unreq PF_X
+ .unreq PF_SRC
+ .unreq PF_DST
+ .unreq PF_MASK
+ .unreq DUMMY
+ .endfunc
+.endm
+
+/*
+ * A simplified variant of the function generation template for single
+ * scanline processing (used to implement pixman combine functions)
+ */
+.macro generate_composite_function_scanline use_nearest_scaling, \
+ fname, \
+ src_bpp_, \
+ mask_bpp_, \
+ dst_w_bpp_, \
+ flags, \
+ pixblock_size_, \
+ init, \
+ cleanup, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head, \
+ dst_w_basereg_ = 28, \
+ dst_r_basereg_ = 4, \
+ src_basereg_ = 0, \
+ mask_basereg_ = 24
+
+ pixman_asm_function fname
+
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set src_bpp, src_bpp_
+ .set mask_bpp, mask_bpp_
+ .set dst_w_bpp, dst_w_bpp_
+ .set pixblock_size, pixblock_size_
+ .set dst_w_basereg, dst_w_basereg_
+ .set dst_r_basereg, dst_r_basereg_
+ .set src_basereg, src_basereg_
+ .set mask_basereg, mask_basereg_
+
+.if use_nearest_scaling != 0
+ /*
+ * Assign symbolic names to registers for nearest scaling
+ */
+ W .req r0
+ DST_W .req r1
+ SRC .req r2
+ VX .req r3
+ UNIT_X .req ip
+ MASK .req lr
+ TMP1 .req r4
+ TMP2 .req r5
+ DST_R .req r6
+ SRC_WIDTH_FIXED .req r7
+
+ .macro pixld_src x:vararg
+ pixld_s x
+ .endm
+
+ ldr UNIT_X, [sp]
+ push {r4-r8, lr}
+ ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)]
+ .if mask_bpp != 0
+ ldr MASK, [sp, #(24 + 8)]
+ .endif
+.else
+ /*
+ * Assign symbolic names to registers
+ */
+ W .req r0 /* width (is updated during processing) */
+ DST_W .req r1 /* destination buffer pointer for writes */
+ SRC .req r2 /* source buffer pointer */
+ DST_R .req ip /* destination buffer pointer for reads */
+ MASK .req r3 /* mask pointer */
+
+ .macro pixld_src x:vararg
+ pixld x
+ .endm
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+ .set dst_r_bpp, dst_w_bpp
+.else
+ .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+ .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+ .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
+
+ init
+ mov DST_R, DST_W
+
+ cmp W, #pixblock_size
+ blt 8f
+
+ ensure_destination_ptr_alignment process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+
+ subs W, W, #pixblock_size
+ blt 7f
+
+ /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+ pixld_a pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+ process_pixblock_head
+ subs W, W, #pixblock_size
+ blt 2f
+1:
+ process_pixblock_tail_head
+ subs W, W, #pixblock_size
+ bge 1b
+2:
+ process_pixblock_tail
+ pixst_a pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+7:
+ /* Process the remaining trailing pixels in the scanline (dst aligned) */
+ process_trailing_pixels 0, 1, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+
+ cleanup
+.if use_nearest_scaling != 0
+ pop {r4-r8, pc} /* exit */
+.else
+ bx lr /* exit */
+.endif
+8:
+ /* Process the remaining trailing pixels in the scanline (dst unaligned) */
+ process_trailing_pixels 0, 0, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+
+ cleanup
+
+.if use_nearest_scaling != 0
+ pop {r4-r8, pc} /* exit */
+
+ .unreq DST_R
+ .unreq SRC
+ .unreq W
+ .unreq VX
+ .unreq UNIT_X
+ .unreq TMP1
+ .unreq TMP2
+ .unreq DST_W
+ .unreq MASK
+ .unreq SRC_WIDTH_FIXED
+
+.else
+ bx lr /* exit */
+
+ .unreq SRC
+ .unreq MASK
+ .unreq DST_R
+ .unreq DST_W
+ .unreq W
+.endif
+
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
+ .endfunc
+.endm
+
+.macro generate_composite_function_single_scanline x:vararg
+ generate_composite_function_scanline 0, x
+.endm
+
+.macro generate_composite_function_nearest_scanline x:vararg
+ generate_composite_function_scanline 1, x
+.endm
+
+/* Default prologue/epilogue, nothing special needs to be done */
+
+.macro default_init
+.endm
+
+.macro default_cleanup
+.endm
+
+/*
+ * Prologue/epilogue variant which additionally saves/restores d8-d15
+ * registers (they need to be saved/restored by the callee according to
+ * the ABI). This is required if the code needs to use all the NEON registers.
+ */
+
+.macro default_init_need_all_regs
+ vpush {d8-d15}
+.endm
+
+.macro default_cleanup_need_all_regs
+ vpop {d8-d15}
+.endm
+
+/******************************************************************************/
+
+/*
+ * Conversion of 8 r5g6b5 pixels packed in a 128-bit register (in)
+ * into a planar a8r8g8b8 format (with a, r, g, b color components
+ * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
+ *
+ * Warning: the conversion is destructive and the original
+ * value (in) is lost.
+ */
+.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
+ vshrn.u16 out_r, in, #8
+ vshrn.u16 out_g, in, #3
+ vsli.u16 in, in, #5
+ vmov.u8 out_a, #255
+ vsri.u8 out_r, out_r, #5
+ vsri.u8 out_g, out_g, #6
+ vshrn.u16 out_b, in, #2
+.endm
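+
+/*
+ * Worked example for the macro above (values derived from the code, shown
+ * for one 16-bit lane): a pure red r5g6b5 pixel 0xF800 produces
+ * out_a = 0xFF, out_r = 0xFF, out_g = 0x00, out_b = 0x00; the vsri
+ * instructions replicate the top bits of each channel so that 5- and
+ * 6-bit values expand to the full 0x00-0xFF range.
+ */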
+
+.macro convert_0565_to_x888 in, out_r, out_g, out_b
+ vshrn.u16 out_r, in, #8
+ vshrn.u16 out_g, in, #3
+ vsli.u16 in, in, #5
+ vsri.u8 out_r, out_r, #5
+ vsri.u8 out_g, out_g, #6
+ vshrn.u16 out_b, in, #2
+.endm
+
+/*
+ * Conversion from planar a8r8g8b8 format (with r, g, b color components
+ * in 64-bit registers in_r, in_g, in_b respectively; alpha is ignored)
+ * into 8 r5g6b5 pixels packed in a 128-bit register (out). Requires two
+ * temporary 128-bit registers (tmp1, tmp2)
+ */
+.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
+ vshll.u8 tmp1, in_g, #8
+ vshll.u8 out, in_r, #8
+ vshll.u8 tmp2, in_b, #8
+ vsri.u16 out, tmp1, #5
+ vsri.u16 out, tmp2, #11
+.endm
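+
+/*
+ * Worked example for the macro above (one lane, derived from the code):
+ * in_r = 0xFF, in_g = 0x00, in_b = 0x00 yields out = 0xF800, i.e. the
+ * top 5 bits of red, 6 bits of green and 5 bits of blue are packed by
+ * the vshll/vsri sequence; the alpha channel is simply not used.
+ */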
+
+/*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in the (out0, out1) register pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
+ * value from 'in' is lost
+ */
+.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+ vshl.u16 out0, in, #5 /* G top 6 bits */
+ vshl.u16 tmp, in, #11 /* B top 5 bits */
+ vsri.u16 in, in, #5 /* R is ready in top bits */
+ vsri.u16 out0, out0, #6 /* G is ready in top bits */
+ vsri.u16 tmp, tmp, #5 /* B is ready in top bits */
+ vshr.u16 out1, in, #8 /* R is in place */
+ vsri.u16 out0, tmp, #8 /* G & B are in place */
+ vzip.u16 out0, out1 /* everything is in place */
+.endm
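+
+/*
+ * Worked example for the macro above (one pixel, derived from the code):
+ * the r5g6b5 value 0xF800 (pure red) ends up as the x8r8g8b8 value
+ * 0x00FF0000 after the final vzip.u16 interleaves the R halfwords from
+ * 'out1' with the G/B halfwords assembled in 'out0'.
+ */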