
Remove ARM32 assembly/pixman blitters

Isaac Aronson 1 year ago
parent
commit
0f351cd6af

+ 0 - 62
CMakeLists.txt

@@ -292,7 +292,6 @@ dep_option(SDL_MMX                 "Use MMX assembly routines" ON "SDL_ASSEMBLY;
 dep_option(SDL_ALTIVEC             "Use Altivec assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_POWERPC32 OR SDL_CPU_POWERPC64" OFF)
 dep_option(SDL_ARMSIMD             "Use SIMD assembly blitters on ARM" OFF "SDL_ASSEMBLY;SDL_CPU_ARM32" OFF)
 dep_option(SDL_ARMNEON             "Use NEON assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_ARM32 OR SDL_CPU_ARM64" OFF)
-dep_option(SDL_ARMNEON_BLITTERS    "Use NEON assembly blitters on ARM32" OFF "SDL_VIDEO;SDL_ASSEMBLY;SDL_ARMNEON;SDL_CPU_ARM32" OFF)
 dep_option(SDL_LSX                 "Use LSX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF)
 dep_option(SDL_LASX                "Use LASX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF)
 
@@ -883,67 +882,6 @@ if(SDL_ASSEMBLY)
       endif()
     endif()
 
-    if(SDL_ARMSIMD)
-      cmake_push_check_state()
-      string(APPEND CMAKE_REQUIRED_FLAGS " -x assembler-with-cpp")
-      list(APPEND CMAKE_REQUIRED_LINK_OPTIONS -x none)
-      check_c_source_compiles("
-        .text
-        .arch armv6
-        .object_arch armv4
-        .arm
-        .altmacro
-        #ifndef __ARM_EABI__
-        #error EABI is required (to be sure that calling conventions are compatible)
-        #endif
-        main:
-        .global main
-        pld [r0]
-        uqadd8 r0, r0, r0
-      " ARMSIMD_FOUND)
-      cmake_pop_check_state()
-
-      if(ARMSIMD_FOUND)
-        set(HAVE_ARMSIMD TRUE)
-        set(SDL_ARM_SIMD_BLITTERS 1)
-        enable_language(ASM)
-        sdl_glob_sources("${SDL3_SOURCE_DIR}/src/video/arm/pixman-arm-simd*.S")
-        set_property(SOURCE ${ARMSIMD_SOURCES} APPEND PROPERTY COMPILE_OPTIONS -x assembler-with-cpp)
-        set(WARN_ABOUT_ARM_SIMD_ASM_MIT TRUE)
-      endif()
-    endif()
-
-    if(SDL_ARMNEON_BLITTERS)
-      cmake_push_check_state()
-      string(APPEND CMAKE_REQUIRED_FLAGS " -x assembler-with-cpp")
-      list(APPEND CMAKE_REQUIRED_LINK_OPTIONS -x none)
-      check_c_source_compiles("
-        .text
-        .fpu neon
-        .arch armv7a
-        .object_arch armv4
-        .eabi_attribute 10, 0
-        .arm
-        .altmacro
-        #ifndef __ARM_EABI__
-        #error EABI is required (to be sure that calling conventions are compatible)
-        #endif
-        main:
-        .global main
-        pld [r0]
-        vmovn.u16 d0, q0
-      " COMPILER_SUPPORTS_ARMNEON_ASSEMBLY)
-      cmake_pop_check_state()
-      if(COMPILER_SUPPORTS_ARMNEON_ASSEMBLY)
-        set(HAVE_ARMNEON_BLITTERS TRUE)
-        set(SDL_ARM_NEON_BLITTERS 1)
-        enable_language(ASM)
-        sdl_glob_sources("${SDL3_SOURCE_DIR}/src/video/arm/pixman-arm-neon*.S")
-        set_property(SOURCE ${ARMNEON_SOURCES} APPEND PROPERTY COMPILE_OPTIONS -x assembler-with-cpp)
-        set(WARN_ABOUT_ARM_NEON_ASM_MIT TRUE)
-      endif()
-    endif()
-
     if(SDL_ARMNEON)
       check_c_source_compiles("
         #include <arm_neon.h>
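The removed checks work in two stages: CMake first compile-tests a small assembly snippet, and only on success defines SDL_ARM_SIMD_BLITTERS / SDL_ARM_NEON_BLITTERS; the C dispatchers then pair that compile-time gate with a runtime CPU query. A minimal sketch of the pattern, using names from the removed sources (the surrounding dispatcher is elided):

    #ifdef SDL_ARM_SIMD_BLITTERS      /* compile-time: the toolchain can assemble the blitters */
        if (SDL_HasARMSIMD()) {       /* runtime: the CPU actually implements ARMv6 SIMD */
            return BlitRGBtoRGBPixelAlphaARMSIMD;
        }
    #endif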

+ 0 - 4
cmake/3rdparty.cmake

@@ -25,10 +25,6 @@ function(get_clang_tidy_ignored_files OUTVAR)
       # HIDAPI Steam controller
       "controller_constants.h"
       "controller_structs.h"
-      # Nokia Pixman
-      "pixman-arm-asm.h"
-      "pixman-arm-neon-asm.h"
-      "pixman-arm-simd-asm.h"
       # YUV2RGB
       "yuv_rgb.c"
       "yuv_rgb_lsx_func.h"

+ 0 - 6
src/video/SDL_blit.h

@@ -23,12 +23,6 @@
 #ifndef SDL_blit_h_
 #define SDL_blit_h_
 
-/* pixman ARM blitters are 32 bit only : */
-#if defined(__aarch64__) || defined(_M_ARM64)
-#undef SDL_ARM_SIMD_BLITTERS
-#undef SDL_ARM_NEON_BLITTERS
-#endif
-
 /* Table to do pixel byte expansion */
 extern const Uint8 *SDL_expand_byte[9];
 extern const Uint16 SDL_expand_byte_10[];

+ 1 - 87
src/video/SDL_blit_A.c

@@ -421,66 +421,6 @@ static void SDL_TARGETING("mmx") BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
 
 #endif /* SDL_MMX_INTRINSICS */
 
-#ifdef SDL_ARM_SIMD_BLITTERS
-void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
-
-static void BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo *info)
-{
-    int32_t width = info->dst_w;
-    int32_t height = info->dst_h;
-    uint16_t *dstp = (uint16_t *)info->dst;
-    int32_t dststride = width + (info->dst_skip >> 1);
-    uint32_t *srcp = (uint32_t *)info->src;
-    int32_t srcstride = width + (info->src_skip >> 2);
-
-    BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
-}
-
-void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
-
-static void BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo *info)
-{
-    int32_t width = info->dst_w;
-    int32_t height = info->dst_h;
-    uint32_t *dstp = (uint32_t *)info->dst;
-    int32_t dststride = width + (info->dst_skip >> 2);
-    uint32_t *srcp = (uint32_t *)info->src;
-    int32_t srcstride = width + (info->src_skip >> 2);
-
-    BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
-}
-#endif
-
-#ifdef SDL_ARM_NEON_BLITTERS
-void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
-
-static void BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo *info)
-{
-    int32_t width = info->dst_w;
-    int32_t height = info->dst_h;
-    uint16_t *dstp = (uint16_t *)info->dst;
-    int32_t dststride = width + (info->dst_skip >> 1);
-    uint32_t *srcp = (uint32_t *)info->src;
-    int32_t srcstride = width + (info->src_skip >> 2);
-
-    BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
-}
-
-void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
-
-static void BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo *info)
-{
-    int32_t width = info->dst_w;
-    int32_t height = info->dst_h;
-    uint32_t *dstp = (uint32_t *)info->dst;
-    int32_t dststride = width + (info->dst_skip >> 2);
-    uint32_t *srcp = (uint32_t *)info->src;
-    int32_t srcstride = width + (info->src_skip >> 2);
-
-    BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
-}
-#endif
-
 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
 {
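The deleted wrappers adapt SDL's blit info to the assembly ABI: SDL tracks the per-row gap as a byte count (src_skip/dst_skip), while the assembly expects strides in pixels, hence the width + (skip >> log2(bytes_per_pixel)) arithmetic above. A small worked sketch of the conversion (the concrete numbers are hypothetical):

    /* A 640-pixel-wide ARGB8888 row with a 2592-byte pitch leaves
     * dst_skip = 2592 - 640*4 = 32 bytes of padding, so the stride handed
     * to the asm is 640 + (32 >> 2) = 648 pixels. */
    static int32_t stride_in_pixels(int32_t width, int32_t skip_bytes, int log2_bpp)
    {
        return width + (skip_bytes >> log2_bpp);
    }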
@@ -1274,21 +1214,7 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
             }
 
         case 2:
-#if defined(SDL_ARM_NEON_BLITTERS) || defined(SDL_ARM_SIMD_BLITTERS)
-            if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && df->Gmask == 0x7e0 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
-#ifdef SDL_ARM_NEON_BLITTERS
-                if (SDL_HasNEON()) {
-                    return BlitARGBto565PixelAlphaARMNEON;
-                }
-#endif
-#ifdef SDL_ARM_SIMD_BLITTERS
-                if (SDL_HasARMSIMD()) {
-                    return BlitARGBto565PixelAlphaARMSIMD;
-                }
-#endif
-            }
-#endif
-            if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
+            if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
                 if (df->Gmask == 0x7e0) {
                     return BlitARGBto565PixelAlpha;
                 } else if (df->Gmask == 0x3e0) {
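The dispatcher above identifies the 16-bit destination purely from its channel masks: six green bits at offset 5 mean RGB565, five mean the 555 layout. Spelled out as predicates (hypothetical helper names; the constants come from the dispatch above):

    static int is_565(Uint32 Gmask) { return Gmask == 0x7e0; } /* 0000 0111 1110 0000: 6 green bits */
    static int is_555(Uint32 Gmask) { return Gmask == 0x3e0; } /* 0000 0011 1110 0000: 5 green bits */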
@@ -1311,18 +1237,6 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
                     }
                 }
 #endif /* SDL_MMX_INTRINSICS */
-                if (sf->Amask == 0xff000000) {
-#ifdef SDL_ARM_NEON_BLITTERS
-                    if (SDL_HasNEON()) {
-                        return BlitRGBtoRGBPixelAlphaARMNEON;
-                    }
-#endif
-#ifdef SDL_ARM_SIMD_BLITTERS
-                    if (SDL_HasARMSIMD()) {
-                        return BlitRGBtoRGBPixelAlphaARMSIMD;
-                    }
-#endif
-                }
             }
             return BlitNtoNPixelAlpha;
 

+ 2 - 81
src/video/SDL_fillrect.c

@@ -247,54 +247,6 @@ int SDL_FillSurfaceRect(SDL_Surface *dst, const SDL_Rect *rect, Uint32 color)
     return SDL_FillSurfaceRects(dst, rect, 1, color);
 }
 
-#ifdef SDL_ARM_NEON_BLITTERS
-void FillSurfaceRect8ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
-void FillSurfaceRect16ARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
-void FillSurfaceRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
-
-static void fill_8_neon(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
-{
-    FillSurfaceRect8ARMNEONAsm(w, h, (uint8_t *)pixels, pitch >> 0, color);
-    return;
-}
-
-static void fill_16_neon(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
-{
-    FillSurfaceRect16ARMNEONAsm(w, h, (uint16_t *)pixels, pitch >> 1, color);
-    return;
-}
-
-static void fill_32_neon(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
-{
-    FillSurfaceRect32ARMNEONAsm(w, h, (uint32_t *)pixels, pitch >> 2, color);
-    return;
-}
-#endif
-
-#ifdef SDL_ARM_SIMD_BLITTERS
-void FillSurfaceRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
-void FillSurfaceRect16ARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
-void FillSurfaceRect32ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
-
-static void fill_8_simd(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
-{
-    FillSurfaceRect8ARMSIMDAsm(w, h, (uint8_t *)pixels, pitch >> 0, color);
-    return;
-}
-
-static void fill_16_simd(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
-{
-    FillSurfaceRect16ARMSIMDAsm(w, h, (uint16_t *)pixels, pitch >> 1, color);
-    return;
-}
-
-static void fill_32_simd(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
-{
-    FillSurfaceRect32ARMSIMDAsm(w, h, (uint32_t *)pixels, pitch >> 2, color);
-    return;
-}
-#endif
-
 int SDL_FillSurfaceRects(SDL_Surface *dst, const SDL_Rect *rects, int count,
                   Uint32 color)
 {
@@ -339,39 +291,8 @@ int SDL_FillSurfaceRects(SDL_Surface *dst, const SDL_Rect *rects, int count,
         return SDL_SetError("SDL_FillSurfaceRects(): Unsupported surface format");
     }
 
-#ifdef SDL_ARM_NEON_BLITTERS
-    if (SDL_HasNEON() && dst->format->bytes_per_pixel != 3 && !fill_function) {
-        switch (dst->format->bytes_per_pixel) {
-        case 1:
-            fill_function = fill_8_neon;
-            break;
-        case 2:
-            fill_function = fill_16_neon;
-            break;
-        case 4:
-            fill_function = fill_32_neon;
-            break;
-        }
-    }
-#endif
-#ifdef SDL_ARM_SIMD_BLITTERS
-    if (SDL_HasARMSIMD() && dst->format->bytes_per_pixel != 3 && !fill_function) {
-        switch (dst->format->bytes_per_pixel) {
-        case 1:
-            fill_function = fill_8_simd;
-            break;
-        case 2:
-            fill_function = fill_16_simd;
-            break;
-        case 4:
-            fill_function = fill_32_simd;
-            break;
-        }
-    }
-#endif
-
-    if (!fill_function) {
-        switch (dst->format->bytes_per_pixel) {
+    if (fill_function == NULL) {
+        switch (dst->format->bytes_per_pixel) {
         case 1:
         {
             color |= (color << 8);
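As the context at the top of this file's diff shows, the single-rectangle helper simply forwards to the batch call, so the two are interchangeable. A usage sketch (the surface is assumed to exist and to use ARGB8888, so the raw color word below is an assumption):

    SDL_Rect rect = { 16, 16, 64, 64 };
    SDL_FillSurfaceRect(surface, &rect, 0xFFFF0000);     /* opaque red; forwards to... */
    SDL_FillSurfaceRects(surface, &rect, 1, 0xFFFF0000); /* ...this equivalent batch call */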

+ 0 - 36
src/video/arm/pixman-arm-asm.h

@@ -1,36 +0,0 @@
-/*
- * Copyright © 2010 Nokia Corporation
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Mozilla Corporation not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Mozilla Corporation makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
- *
- */
-
-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
-	.func fname
-	.global fname
-#ifdef __ELF__
-	.hidden fname
-	.type fname, %function
-#endif
-fname:
-.endm

+ 0 - 375
src/video/arm/pixman-arm-neon-asm.S

@@ -1,375 +0,0 @@
-/*
- * Copyright © 2009 Nokia Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
- */
-
-/*
- * Copyright (c) 2018 RISC OS Open Ltd
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* Prevent the stack from becoming executable for no reason... */
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
-    .text
-    .fpu neon
-    .arch armv7a
-    .object_arch armv4
-    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
-    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
-    .arm
-    .altmacro
-    .p2align 2
-
-#include "pixman-arm-asm.h"
-#include "pixman-arm-neon-asm.h"
-
-/* Global configuration options and preferences */
-
-/*
- * The code can optionally make use of unaligned memory accesses to improve
- * performance of handling leading/trailing pixels for each scanline.
- * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
- * example in linux if unaligned memory accesses are not configured to
- * generate exceptions.
- */
-.set RESPECT_STRICT_ALIGNMENT, 1
-
-/*
- * Set default prefetch type. There is a choice between the following options:
- *
- * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
- * as NOP to workaround some HW bugs or for whatever other reason)
- *
- * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
- * advanced prefetch introduces heavy overhead)
- *
- * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
- * which can run ARM and NEON instructions simultaneously so that extra ARM
- * instructions do not add (many) extra cycles, but improve prefetch efficiency)
- *
- * Note: some types of function can't support advanced prefetch and fall back
- *       to the simple one (those which handle 24bpp pixels)
- */
-.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
-
-/* Prefetch distance in pixels for simple prefetch */
-.set PREFETCH_DISTANCE_SIMPLE, 64
-
-/******************************************************************************/
-
-/* We can actually do significantly better than the Pixman macros, at least for
- * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
- * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
- */
-
-.macro generate_fillrect_function name, bpp, log2Bpp
-/*
- * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
- * On entry:
- * a1 = width, pixels
- * a2 = height, rows
- * a3 = pointer to top-left destination pixel
- * a4 = stride, pixels
- * [sp] = pixel value to fill with
- * Within the function:
- * v1 = width remaining
- * v2 = vst offset
- * v3 = alternate pointer
- * ip = data ARM register
- */
-pixman_asm_function name
-    vld1.\bpp   {d0[],d1[]}, [sp]
-    sub         a4, a1
-    vld1.\bpp   {d2[],d3[]}, [sp]
-    cmp         a1, #(15+64) >> \log2Bpp
-    push        {v1-v3,lr}
-    vmov        ip, s0
-    blo         51f
-
-    /* Long-row case */
-    mov         v2, #64
-1:  mov         v1, a1
-    ands        v3, a3, #15
-    beq         2f
-    /* Leading pixels */
-    rsb         v3, v3, #16  /* number of leading bytes until 16-byte aligned */
-    sub         v1, v1, v3, lsr #\log2Bpp
-    rbit        v3, v3
-.if bpp <= 16
-.if bpp == 8
-    tst         a3, #1       /* bit 0 unaffected by rsb so can avoid register interlock */
-    strneb      ip, [a3], #1
-    tst         v3, #1<<30
-.else
-    tst         a3, #2       /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
-.endif
-    strneh      ip, [a3], #2
-.endif
-    movs        v3, v3, lsl #3
-    vstmcs      a3!, {s0}
-    vstmmi      a3!, {d0}
-2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
-    add         v3, a3, #32
-    /* Inner loop */
-3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
-    subs        v1, v1, #64 >> \log2Bpp
-    vst1.\bpp   {q0-q1}, [v3 :128], v2
-    bhs         3b
-    /* Trailing pixels */
-4:  movs        v1, v1, lsl #27 + \log2Bpp
-    bcc         5f
-    vst1.\bpp   {q0-q1}, [a3 :128]!
-5:  bpl         6f
-    vst1.\bpp   {q0}, [a3 :128]!
-6:  movs        v1, v1, lsl #2
-    vstmcs      a3!, {d0}
-    vstmmi      a3!, {s0}
-.if bpp <= 16
-    movs        v1, v1, lsl #2
-    strcsh      ip, [a3], #2
-.if bpp == 8
-    strmib      ip, [a3], #1
-.endif
-.endif
-    subs        a2, a2, #1
-    add         a3, a3, a4, lsl #\log2Bpp
-    bhi         1b
-    pop         {v1-v3,pc}
-
-    /* Short-row case */
-51: movs        v1, a1
-.if bpp == 8
-    tst         a3, #3
-    beq         53f
-52: subs        v1, v1, #1
-    blo         57f
-    strb        ip, [a3], #1
-    tst         a3, #3
-    bne         52b
-.elseif bpp == 16
-    tstne       a3, #2
-    subne       v1, v1, #1
-    strneh      ip, [a3], #2
-.endif
-53: cmp         v1, #32 >> \log2Bpp
-    bcc         54f
-    vst1.\bpp   {q0-q1}, [a3]!
-    sub         v1, v1, #32 >> \log2Bpp
-    /* Trailing pixels */
-54: movs        v1, v1, lsl #27 + \log2Bpp
-    bcc         55f
-    vst1.\bpp   {q0-q1}, [a3]!
-55: bpl         56f
-    vst1.\bpp   {q0}, [a3]!
-56: movs        v1, v1, lsl #2
-    vstmcs      a3!, {d0}
-    vstmmi      a3!, {s0}
-.if bpp <= 16
-    movs        v1, v1, lsl #2
-    strcsh      ip, [a3], #2
-.if bpp == 8
-    strmib      ip, [a3], #1
-.endif
-.endif
-    subs        a2, a2, #1
-    add         a3, a3, a4, lsl #\log2Bpp
-    bhi         51b
-57: pop         {v1-v3,pc}
-
-.endfunc
-.endm
-
-generate_fillrect_function FillSurfaceRect32ARMNEONAsm, 32, 2
-generate_fillrect_function FillSurfaceRect16ARMNEONAsm, 16, 1
-generate_fillrect_function FillSurfaceRect8ARMNEONAsm,  8,  0
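The macro above encodes the strategy its comment promises: store leading pixels until the destination is 16-byte aligned, stream the bulk 64 bytes per iteration, then decompose the tail. A portable C sketch of one 32bpp row, assuming <stdint.h> (the NEON scheduling responsible for the quoted speedup is not reproduced):

    static void fill_row32(uint32_t *dst, int32_t w, uint32_t color)
    {
        while (w > 0 && ((uintptr_t)dst & 15)) { *dst++ = color; --w; } /* leading pixels */
        while (w >= 16) {                       /* 64-byte inner loop */
            for (int i = 0; i < 16; ++i) {
                dst[i] = color;
            }
            dst += 16;
            w -= 16;
        }
        while (w-- > 0) { *dst++ = color; }     /* trailing pixels */
    }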
-
-/******************************************************************************/
-
-.macro RGBtoRGBPixelAlpha_process_pixblock_head
-    vmvn        d30, d3  /* get inverted source alpha */
-    vmov        d31, d7  /* dest alpha is always unchanged */
-    vmull.u8    q14, d0, d3
-    vmlal.u8    q14, d4, d30
-    vmull.u8    q0, d1, d3
-    vmlal.u8    q0, d5, d30
-    vmull.u8    q1, d2, d3
-    vmlal.u8    q1, d6, d30
-    vrshr.u16   q2, q14, #8
-    vrshr.u16   q3, q0, #8
-    vraddhn.u16 d28, q14, q2
-    vrshr.u16   q2, q1, #8
-    vraddhn.u16 d29, q0, q3
-    vraddhn.u16 d30, q1, q2
-.endm
-
-.macro RGBtoRGBPixelAlpha_process_pixblock_tail
-    /* nothing */
-.endm
-
-.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
-    vld4.8      {d0-d3}, [SRC]!
-                                    PF add PF_X, PF_X, #8
-        vst4.8      {d28-d31}, [DST_W :128]!
-                                    PF tst PF_CTL, #0xF
-    vld4.8      {d4-d7}, [DST_R :128]!
-                                    PF addne PF_X, PF_X, #8
-    vmvn        d30, d3  /* get inverted source alpha */
-    vmov        d31, d7  /* dest alpha is always unchanged */
-    vmull.u8    q14, d0, d3
-                                    PF subne PF_CTL, PF_CTL, #1
-    vmlal.u8    q14, d4, d30
-                                    PF cmp PF_X, ORIG_W
-    vmull.u8    q0, d1, d3
-                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-    vmlal.u8    q0, d5, d30
-                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-    vmull.u8    q1, d2, d3
-                                    PF subge PF_X, PF_X, ORIG_W
-    vmlal.u8    q1, d6, d30
-                                    PF subges PF_CTL, PF_CTL, #0x10
-    vrshr.u16   q2, q14, #8
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-    vrshr.u16   q3, q0, #8
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
-    vraddhn.u16 d28, q14, q2
-    vrshr.u16   q2, q1, #8
-    vraddhn.u16 d29, q0, q3
-    vraddhn.u16 d30, q1, q2
-.endm
-
-generate_composite_function \
-    BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    RGBtoRGBPixelAlpha_process_pixblock_head, \
-    RGBtoRGBPixelAlpha_process_pixblock_tail, \
-    RGBtoRGBPixelAlpha_process_pixblock_tail_head
-
- /******************************************************************************/
-
-.macro ARGBto565PixelAlpha_process_pixblock_head
-    vmvn        d6, d3
-    vshr.u8     d1, #2
-    vshr.u8     d3, #3
-    vshr.u8     d0, #3
-    vshrn.u16   d7, q2, #3
-    vshrn.u16   d25, q2, #8
-    vbic.i16    q2, #0xe0
-    vshr.u8     d6, #3
-    vshr.u8     d7, #2
-    vshr.u8     d2, #3
-    vmovn.u16   d24, q2
-    vshr.u8     d25, #3
-    vmull.u8    q13, d1, d3
-    vmlal.u8    q13, d7, d6
-    vmull.u8    q14, d0, d3
-    vmlal.u8    q14, d24, d6
-    vmull.u8    q15, d2, d3
-    vmlal.u8    q15, d25, d6
-.endm
-
-.macro ARGBto565PixelAlpha_process_pixblock_tail
-    vsra.u16    q13, #5
-    vsra.u16    q14, #5
-    vsra.u16    q15, #5
-    vrshr.u16   q13, #5
-    vrshr.u16   q14, #5
-    vrshr.u16   q15, #5
-    vsli.u16    q14, q13, #5
-    vsli.u16    q14, q15, #11
-.endm
-
-.macro ARGBto565PixelAlpha_process_pixblock_tail_head
-    vld4.8      {d0-d3}, [SRC]!
-                                    PF add PF_X, PF_X, #8
-        vsra.u16    q13, #5
-                                    PF tst PF_CTL, #0xF
-        vsra.u16    q14, #5
-                                    PF addne PF_X, PF_X, #8
-        vsra.u16    q15, #5
-                                    PF subne PF_CTL, PF_CTL, #1
-        vrshr.u16   q13, #5
-                                    PF cmp PF_X, ORIG_W
-        vrshr.u16   q14, #5
-                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-        vrshr.u16   q15, #5
-                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-    vld1.8      {d4-d5}, [DST_R]!
-                                    PF subge PF_X, PF_X, ORIG_W
-        vsli.u16    q14, q13, #5
-                                    PF subges PF_CTL, PF_CTL, #0x10
-        vsli.u16    q14, q15, #11
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-        vst1.8      {q14}, [DST_W :128]!
-    vmvn        d6, d3
-    vshr.u8     d1, #2
-    vshr.u8     d3, #3
-    vshr.u8     d0, #3
-    vshrn.u16   d7, q2, #3
-    vshrn.u16   d25, q2, #8
-    vbic.i16    q2, #0xe0
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
-    vshr.u8     d6, #3
-    vshr.u8     d7, #2
-    vshr.u8     d2, #3
-    vmovn.u16   d24, q2
-    vshr.u8     d25, #3
-    vmull.u8    q13, d1, d3
-    vmlal.u8    q13, d7, d6
-    vmull.u8    q14, d0, d3
-    vmlal.u8    q14, d24, d6
-    vmull.u8    q15, d2, d3
-    vmlal.u8    q15, d25, d6
-.endm
-
-generate_composite_function \
-    BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    6, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    ARGBto565PixelAlpha_process_pixblock_head, \
-    ARGBto565PixelAlpha_process_pixblock_tail, \
-    ARGBto565PixelAlpha_process_pixblock_tail_head

+ 0 - 1184
src/video/arm/pixman-arm-neon-asm.h

@@ -1,1184 +0,0 @@
-/*
- * Copyright © 2009 Nokia Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
- */
-
-/*
- * This file contains a macro ('generate_composite_function') which can
- * construct 2D image processing functions, based on a common template.
- * Any combinations of source, destination and mask images with 8bpp,
- * 16bpp, 24bpp, 32bpp color formats are supported.
- *
- * This macro takes care of:
- *  - handling of leading and trailing unaligned pixels
- *  - doing most of the work related to L2 cache preload
- *  - encouraging the use of software pipelining for better instruction
- *    scheduling
- *
- * The user of this macro has to provide some configuration parameters
- * (bit depths for the images, prefetch distance, etc.) and a set of
- * macros, which should implement basic code chunks responsible for
- * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
- * examples.
- *
- * TODO:
- *  - try overlapped pixel method (from Ian Rickards) when processing
- *    exactly two blocks of pixels
- *  - maybe add an option to do reverse scanline processing
- */
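The control flow the template emits follows the "head (tail_head) ... (tail_head) tail" pattern named further down in this file; roughly, as illustrative C-style pseudocode (names mirror the macro arguments):

    /*
     * for (each scanline) {
     *     consume leading pixels until the destination is 16-byte aligned;
     *     load the first pixel block;
     *     process_pixblock_head();            // start block N
     *     while (a full block remains)
     *         process_pixblock_tail_head();   // finish N, start N+1, prefetch
     *     process_pixblock_tail();            // finish the last full block
     *     store it, then handle the < pixblock_size trailing pixels;
     * }
     */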
-
-/*
- * Bit flags for 'generate_composite_function' macro which are used
- * to tune generated functions behavior.
- */
-.set FLAG_DST_WRITEONLY,       0
-.set FLAG_DST_READWRITE,       1
-.set FLAG_DEINTERLEAVE_32BPP,  2
-
-/*
- * Offset in stack where mask and source pointer/stride can be accessed
- * from 'init' macro. This is useful for doing special handling for solid mask.
- */
-.set ARGS_STACK_OFFSET,        40
-
-/*
- * Constants for selecting preferable prefetch type.
- */
-.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
-.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
-.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
-
-/*
- * Definitions of supplementary pixld/pixst macros (for partial load/store of
- * pixel data).
- */
-
-.macro pixldst1 op, elem_size, reg1, mem_operand, abits
-.if abits > 0
-    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
-.else
-    op&.&elem_size {d&reg1}, [&mem_operand&]!
-.endif
-.endm
-
-.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
-.if abits > 0
-    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
-.else
-    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
-.endif
-.endm
-
-.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
-.if abits > 0
-    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
-.else
-    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
-.endif
-.endm
-
-.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
-    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
-.endm
-
-.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
-    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
-.endm
-
-.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
-    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
-.endm
-
-.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
-.if numbytes == 32
-    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
-                              %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif numbytes == 16
-    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
-.elseif numbytes == 8
-    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
-.elseif numbytes == 4
-    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
-        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
-    .elseif elem_size == 16
-        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
-        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
-    .else
-        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
-        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
-        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
-        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
-    .endif
-.elseif numbytes == 2
-    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
-        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
-    .else
-        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
-        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
-    .endif
-.elseif numbytes == 1
-    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
-.else
-    .error "unsupported size: numbytes"
-.endif
-.endm
-
-.macro pixld numpix, bpp, basereg, mem_operand, abits=0
-.if bpp > 0
-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
-                      %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif (bpp == 24) && (numpix == 8)
-    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
-.elseif (bpp == 24) && (numpix == 4)
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
-.elseif (bpp == 24) && (numpix == 2)
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
-.elseif (bpp == 24) && (numpix == 1)
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
-.else
-    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
-.endif
-.endif
-.endm
-
-.macro pixst numpix, bpp, basereg, mem_operand, abits=0
-.if bpp > 0
-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
-                      %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif (bpp == 24) && (numpix == 8)
-    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
-.elseif (bpp == 24) && (numpix == 4)
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
-.elseif (bpp == 24) && (numpix == 2)
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
-.elseif (bpp == 24) && (numpix == 1)
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
-.else
-    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
-.endif
-.endif
-.endm
-
-.macro pixld_a numpix, bpp, basereg, mem_operand
-.if (bpp * numpix) <= 128
-    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
-.else
-    pixld numpix, bpp, basereg, mem_operand, 128
-.endif
-.endm
-
-.macro pixst_a numpix, bpp, basereg, mem_operand
-.if (bpp * numpix) <= 128
-    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
-.else
-    pixst numpix, bpp, basereg, mem_operand, 128
-.endif
-.endm
-
-/*
- * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
- * aliases to be defined)
- */
-.macro pixld1_s elem_size, reg1, mem_operand
-.if elem_size == 16
-    mov     TMP1, VX, asr #16
-    adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
-    bpl     5b
-    add     TMP1, mem_operand, TMP1, asl #1
-    mov     TMP2, VX, asr #16
-    adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
-    bpl     5b
-    add     TMP2, mem_operand, TMP2, asl #1
-    vld1.16 {d&reg1&[0]}, [TMP1, :16]
-    mov     TMP1, VX, asr #16
-    adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
-    bpl     5b
-    add     TMP1, mem_operand, TMP1, asl #1
-    vld1.16 {d&reg1&[1]}, [TMP2, :16]
-    mov     TMP2, VX, asr #16
-    adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
-    bpl     5b
-    add     TMP2, mem_operand, TMP2, asl #1
-    vld1.16 {d&reg1&[2]}, [TMP1, :16]
-    vld1.16 {d&reg1&[3]}, [TMP2, :16]
-.elseif elem_size == 32
-    mov     TMP1, VX, asr #16
-    adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
-    bpl     5b
-    add     TMP1, mem_operand, TMP1, asl #2
-    mov     TMP2, VX, asr #16
-    adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
-    bpl     5b
-    add     TMP2, mem_operand, TMP2, asl #2
-    vld1.32 {d&reg1&[0]}, [TMP1, :32]
-    vld1.32 {d&reg1&[1]}, [TMP2, :32]
-.else
-    .error "unsupported"
-.endif
-.endm
-
-.macro pixld2_s elem_size, reg1, reg2, mem_operand
-.if 0 /* elem_size == 32 */
-    mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X, asl #1
-    add     TMP1, mem_operand, TMP1, asl #2
-    mov     TMP2, VX, asr #16
-    sub     VX, VX, UNIT_X
-    add     TMP2, mem_operand, TMP2, asl #2
-    vld1.32 {d&reg1&[0]}, [TMP1, :32]
-    mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X, asl #1
-    add     TMP1, mem_operand, TMP1, asl #2
-    vld1.32 {d&reg2&[0]}, [TMP2, :32]
-    mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
-    add     TMP2, mem_operand, TMP2, asl #2
-    vld1.32 {d&reg1&[1]}, [TMP1, :32]
-    vld1.32 {d&reg2&[1]}, [TMP2, :32]
-.else
-    pixld1_s elem_size, reg1, mem_operand
-    pixld1_s elem_size, reg2, mem_operand
-.endif
-.endm
-
-.macro pixld0_s elem_size, reg1, idx, mem_operand
-.if elem_size == 16
-    mov     TMP1, VX, asr #16
-    adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
-    bpl     5b
-    add     TMP1, mem_operand, TMP1, asl #1
-    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
-.elseif elem_size == 32
-    mov     TMP1, VX, asr #16
-    adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
-    bpl     5b
-    add     TMP1, mem_operand, TMP1, asl #2
-    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
-.endif
-.endm
-
-.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
-.if numbytes == 32
-    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
-    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
-    pixdeinterleave elem_size, %(basereg+4)
-.elseif numbytes == 16
-    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
-.elseif numbytes == 8
-    pixld1_s elem_size, %(basereg+1), mem_operand
-.elseif numbytes == 4
-    .if elem_size == 32
-        pixld0_s elem_size, %(basereg+0), 1, mem_operand
-    .elseif elem_size == 16
-        pixld0_s elem_size, %(basereg+0), 2, mem_operand
-        pixld0_s elem_size, %(basereg+0), 3, mem_operand
-    .else
-        pixld0_s elem_size, %(basereg+0), 4, mem_operand
-        pixld0_s elem_size, %(basereg+0), 5, mem_operand
-        pixld0_s elem_size, %(basereg+0), 6, mem_operand
-        pixld0_s elem_size, %(basereg+0), 7, mem_operand
-    .endif
-.elseif numbytes == 2
-    .if elem_size == 16
-        pixld0_s elem_size, %(basereg+0), 1, mem_operand
-    .else
-        pixld0_s elem_size, %(basereg+0), 2, mem_operand
-        pixld0_s elem_size, %(basereg+0), 3, mem_operand
-    .endif
-.elseif numbytes == 1
-    pixld0_s elem_size, %(basereg+0), 1, mem_operand
-.else
-    .error "unsupported size: numbytes"
-.endif
-.endm
-
-.macro pixld_s numpix, bpp, basereg, mem_operand
-.if bpp > 0
-    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
-.endif
-.endm
-
-.macro vuzp8 reg1, reg2
-    vuzp.8 d&reg1, d&reg2
-.endm
-
-.macro vzip8 reg1, reg2
-    vzip.8 d&reg1, d&reg2
-.endm
-
-/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
-.macro pixdeinterleave bpp, basereg
-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-    vuzp8 %(basereg+0), %(basereg+1)
-    vuzp8 %(basereg+2), %(basereg+3)
-    vuzp8 %(basereg+1), %(basereg+3)
-    vuzp8 %(basereg+0), %(basereg+2)
-.endif
-.endm
-
-/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
-.macro pixinterleave bpp, basereg
-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-    vzip8 %(basereg+0), %(basereg+2)
-    vzip8 %(basereg+1), %(basereg+3)
-    vzip8 %(basereg+2), %(basereg+3)
-    vzip8 %(basereg+0), %(basereg+1)
-.endif
-.endm
-
-/*
- * This is a macro for implementing cache preload. The main idea is that
- * cache preload logic is mostly independent of the rest of the pixel
- * processing code. It starts at the top left pixel and moves forward
- * across pixels and can jump across scanlines. Prefetch distance is
- * handled in an 'incremental' way: it starts from 0 and advances to the
- * optimal distance over time. After reaching optimal prefetch distance,
- * it is kept constant. There are some checks which prevent prefetching
- * unneeded pixel lines below the image (but it still can prefetch a bit
- * more data on the right side of the image - not a big issue and may
- * actually be helpful when rendering text glyphs). An additional trick is
- * the use of LDR instruction for prefetch instead of PLD when moving to
- * the next line, the point is that we have a high chance of getting TLB
- * miss in this case, and PLD would be useless.
- *
- * This sounds like it may introduce a noticeable overhead (when working with
- * fully cached data). But in reality, due to having a separate pipeline and
- * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
- * execute simultaneously with NEON and be completely shadowed by it. Thus
- * we get no performance overhead at all (*). This looks like a very nice
- * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
- * but still can implement some rather advanced prefetch logic in software
- * for almost zero cost!
- *
- * (*) The overhead of the prefetcher is visible when running some trivial
- * pixel processing like a simple copy. Anyway, having prefetch is a must
- * when working with graphics data.
- */
-.macro PF a, x:vararg
-.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
-    a x
-.endif
-.endm
-
-.macro cache_preload std_increment, boost_increment
-.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
-.if regs_shortage
-    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
-.endif
-.if std_increment != 0
-    PF add PF_X, PF_X, #std_increment
-.endif
-    PF tst PF_CTL, #0xF
-    PF addne PF_X, PF_X, #boost_increment
-    PF subne PF_CTL, PF_CTL, #1
-    PF cmp PF_X, ORIG_W
-.if src_bpp_shift >= 0
-    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-.endif
-.if dst_r_bpp != 0
-    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-.endif
-.if mask_bpp_shift >= 0
-    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
-.endif
-    PF subge PF_X, PF_X, ORIG_W
-    PF subges PF_CTL, PF_CTL, #0x10
-.if src_bpp_shift >= 0
-    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-.endif
-.if dst_r_bpp != 0
-    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
-.endif
-.if mask_bpp_shift >= 0
-    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
-.endif
-.endif
-.endm
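A rough C model of one cache_preload step, using the register roles documented later in this file (pf_x is the prefetch index in pixels; the low nibble of pf_ctl holds the remaining distance boosts, the high bits the remaining rows). A sketch, not the exact instruction sequence:

    static void cache_preload_step(int *pf_x, int *pf_ctl, int block, int width)
    {
        *pf_x += block;            /* std_increment: keep pace with processing */
        if (*pf_ctl & 0xF) {       /* still ramping toward the optimal distance */
            *pf_x += block;        /* boost_increment, a limited number of times */
            *pf_ctl -= 1;
        }
        /* PLD src/dst/mask at index *pf_x here */
        if (*pf_x >= width) {      /* prefetch crossed into the next scanline */
            *pf_x -= width;
            *pf_ctl -= 0x10;       /* one fewer row left to prefetch */
            /* touch the next row with a plain load: PLD is useless on a TLB miss */
        }
    }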
-
-.macro cache_preload_simple
-.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
-.if src_bpp > 0
-    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
-.endif
-.if dst_r_bpp > 0
-    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
-.endif
-.if mask_bpp > 0
-    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
-.endif
-.endif
-.endm
-
-.macro fetch_mask_pixblock
-    pixld       pixblock_size, mask_bpp, \
-                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-.endm
-
-/*
- * Macro which is used to process leading pixels until destination
- * pointer is properly aligned (at 16 bytes boundary). When destination
- * buffer uses 16bpp format, this is unnecessary, or even pointless.
- */
-.macro ensure_destination_ptr_alignment process_pixblock_head, \
-                                        process_pixblock_tail, \
-                                        process_pixblock_tail_head
-.if dst_w_bpp != 24
-    tst         DST_R, #0xF
-    beq         2f
-
-.irp lowbit, 1, 2, 4, 8, 16
-local skip1
-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
-.if lowbit < 16 /* we don't need more than 16-byte alignment */
-    tst         DST_R, #lowbit
-    beq         1f
-.endif
-    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
-    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
-.if dst_r_bpp > 0
-    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
-.else
-    add         DST_R, DST_R, #lowbit
-.endif
-    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
-    sub         W, W, #(lowbit * 8 / dst_w_bpp)
-1:
-.endif
-.endr
-    pixdeinterleave src_bpp, src_basereg
-    pixdeinterleave mask_bpp, mask_basereg
-    pixdeinterleave dst_r_bpp, dst_r_basereg
-
-    process_pixblock_head
-    cache_preload 0, pixblock_size
-    cache_preload_simple
-    process_pixblock_tail
-
-    pixinterleave dst_w_bpp, dst_w_basereg
-.irp lowbit, 1, 2, 4, 8, 16
-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
-.if lowbit < 16 /* we don't need more than 16-byte alignment */
-    tst         DST_W, #lowbit
-    beq         1f
-.endif
-    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
-1:
-.endif
-.endr
-.endif
-2:
-.endm
-
-/*
- * Special code for processing up to (pixblock_size - 1) remaining
- * trailing pixels. As SIMD processing performs operation on
- * pixblock_size pixels, anything smaller than this has to be loaded
- * and stored in a special way. Loading and storing of pixel data is
- * performed in such a way that we fill some 'slots' in the NEON
- * registers (some slots naturally are unused), then perform compositing
- * operation as usual. In the end, the data is taken from these 'slots'
- * and saved to memory.
- *
- * cache_preload_flag   - allows suppressing prefetch if
- *                      set to 0
- * dst_aligned_flag   - selects whether destination buffer
- *                      is aligned
- */
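Because the remainder is always smaller than pixblock_size, decomposing it by its bits costs at most one load/compose/store per power of two. A C sketch of the chunking loop (the per-chunk processing is elided):

    static void process_trailing(int w) /* w = W & (pixblock_size - 1) */
    {
        for (int chunk = 16; chunk >= 1; chunk >>= 1) {
            if (w & chunk) {
                /* load, composite and store 'chunk' pixels here */
            }
        }
    }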
-.macro process_trailing_pixels cache_preload_flag, \
-                               dst_aligned_flag, \
-                               process_pixblock_head, \
-                               process_pixblock_tail, \
-                               process_pixblock_tail_head
-    tst         W, #(pixblock_size - 1)
-    beq         2f
-.irp chunk_size, 16, 8, 4, 2, 1
-.if pixblock_size > chunk_size
-    tst         W, #chunk_size
-    beq         1f
-    pixld_src   chunk_size, src_bpp, src_basereg, SRC
-    pixld       chunk_size, mask_bpp, mask_basereg, MASK
-.if dst_aligned_flag != 0
-    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
-.else
-    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
-.endif
-.if cache_preload_flag != 0
-    PF add      PF_X, PF_X, #chunk_size
-.endif
-1:
-.endif
-.endr
-    pixdeinterleave src_bpp, src_basereg
-    pixdeinterleave mask_bpp, mask_basereg
-    pixdeinterleave dst_r_bpp, dst_r_basereg
-
-    process_pixblock_head
-.if cache_preload_flag != 0
-    cache_preload 0, pixblock_size
-    cache_preload_simple
-.endif
-    process_pixblock_tail
-    pixinterleave dst_w_bpp, dst_w_basereg
-.irp chunk_size, 16, 8, 4, 2, 1
-.if pixblock_size > chunk_size
-    tst         W, #chunk_size
-    beq         1f
-.if dst_aligned_flag != 0
-    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
-.else
-    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
-.endif
-1:
-.endif
-.endr
-2:
-.endm
-
-/*
- * Macro, which performs all the needed operations to switch to the next
- * scanline and start the next loop iteration unless all the scanlines
- * are already processed.
- */
-.macro advance_to_next_scanline start_of_loop_label
-.if regs_shortage
-    ldrd        W, [sp] /* load W and H (width and height) from stack */
-.else
-    mov         W, ORIG_W
-.endif
-    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
-.if src_bpp != 0
-    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
-.endif
-.if mask_bpp != 0
-    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
-.endif
-.if (dst_w_bpp != 24)
-    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
-.endif
-.if (src_bpp != 24) && (src_bpp != 0)
-    sub         SRC, SRC, W, lsl #src_bpp_shift
-.endif
-.if (mask_bpp != 24) && (mask_bpp != 0)
-    sub         MASK, MASK, W, lsl #mask_bpp_shift
-.endif
-    subs        H, H, #1
-    mov         DST_R, DST_W
-.if regs_shortage
-    str         H, [sp, #4] /* save updated height to stack */
-.endif
-    bge         start_of_loop_label
-.endm
-
-/*
- * Registers are allocated in the following way by default:
- * d0, d1, d2, d3     - reserved for loading source pixel data
- * d4, d5, d6, d7     - reserved for loading destination pixel data
- * d24, d25, d26, d27 - reserved for loading mask pixel data
- * d28, d29, d30, d31 - final destination pixel data for writeback to memory
- */
-.macro generate_composite_function fname, \
-                                   src_bpp_, \
-                                   mask_bpp_, \
-                                   dst_w_bpp_, \
-                                   flags, \
-                                   pixblock_size_, \
-                                   prefetch_distance, \
-                                   init, \
-                                   cleanup, \
-                                   process_pixblock_head, \
-                                   process_pixblock_tail, \
-                                   process_pixblock_tail_head, \
-                                   dst_w_basereg_ = 28, \
-                                   dst_r_basereg_ = 4, \
-                                   src_basereg_   = 0, \
-                                   mask_basereg_  = 24
-
-    pixman_asm_function fname
-
-    push        {r4-r12, lr}        /* save all registers */
-
-/*
- * Select prefetch type for this function. If prefetch distance is
- * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
- * has to be used instead of ADVANCED.
- */
-    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
-.if prefetch_distance == 0
-    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
-.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
-        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
-    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
-.endif
-
-/*
- * Make some macro arguments globally visible and accessible
- * from other macros
- */
-    .set src_bpp, src_bpp_
-    .set mask_bpp, mask_bpp_
-    .set dst_w_bpp, dst_w_bpp_
-    .set pixblock_size, pixblock_size_
-    .set dst_w_basereg, dst_w_basereg_
-    .set dst_r_basereg, dst_r_basereg_
-    .set src_basereg, src_basereg_
-    .set mask_basereg, mask_basereg_
-
-    .macro pixld_src x:vararg
-        pixld x
-    .endm
-    .macro fetch_src_pixblock
-        pixld_src   pixblock_size, src_bpp, \
-                    (src_basereg - pixblock_size * src_bpp / 64), SRC
-    .endm
-/*
- * Assign symbolic names to registers
- */
-    W           .req        r0      /* width (is updated during processing) */
-    H           .req        r1      /* height (is updated during processing) */
-    DST_W       .req        r2      /* destination buffer pointer for writes */
-    DST_STRIDE  .req        r3      /* destination image stride */
-    SRC         .req        r4      /* source buffer pointer */
-    SRC_STRIDE  .req        r5      /* source image stride */
-    DST_R       .req        r6      /* destination buffer pointer for reads */
-
-    MASK        .req        r7      /* mask pointer */
-    MASK_STRIDE .req        r8      /* mask stride */
-
-    PF_CTL      .req        r9      /* combined lines counter and prefetch */
-                                    /* distance increment counter */
-    PF_X        .req        r10     /* pixel index in a scanline for current */
-                                    /* prefetch position */
-    PF_SRC      .req        r11     /* pointer to source scanline start */
-                                    /* for prefetch purposes */
-    PF_DST      .req        r12     /* pointer to destination scanline start */
-                                    /* for prefetch purposes */
-    PF_MASK     .req        r14     /* pointer to mask scanline start */
-                                    /* for prefetch purposes */
-/*
- * Check whether we have enough registers for all the local variables.
- * If we don't have enough registers, original width and height are
- * kept on top of stack (and 'regs_shortage' variable is set to indicate
- * this for the rest of code). Even if there are enough registers, the
- * allocation scheme may be a bit different depending on whether source
- * or mask is not used.
- */
-.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
-    ORIG_W      .req        r10     /* saved original width */
-    DUMMY       .req        r12     /* temporary register */
-    .set        regs_shortage, 0
-.elseif mask_bpp == 0
-    ORIG_W      .req        r7      /* saved original width */
-    DUMMY       .req        r8      /* temporary register */
-    .set        regs_shortage, 0
-.elseif src_bpp == 0
-    ORIG_W      .req        r4      /* saved original width */
-    DUMMY       .req        r5      /* temporary register */
-    .set        regs_shortage, 0
-.else
-    ORIG_W      .req        r1      /* saved original width */
-    DUMMY       .req        r1      /* temporary register */
-    .set        regs_shortage, 1
-.endif
-
-    .set mask_bpp_shift, -1
-.if src_bpp == 32
-    .set src_bpp_shift, 2
-.elseif src_bpp == 24
-    .set src_bpp_shift, 0
-.elseif src_bpp == 16
-    .set src_bpp_shift, 1
-.elseif src_bpp == 8
-    .set src_bpp_shift, 0
-.elseif src_bpp == 0
-    .set src_bpp_shift, -1
-.else
-    .error "requested src bpp (src_bpp) is not supported"
-.endif
-.if mask_bpp == 32
-    .set mask_bpp_shift, 2
-.elseif mask_bpp == 24
-    .set mask_bpp_shift, 0
-.elseif mask_bpp == 8
-    .set mask_bpp_shift, 0
-.elseif mask_bpp == 0
-    .set mask_bpp_shift, -1
-.else
-    .error "requested mask bpp (mask_bpp) is not supported"
-.endif
-.if dst_w_bpp == 32
-    .set dst_bpp_shift, 2
-.elseif dst_w_bpp == 24
-    .set dst_bpp_shift, 0
-.elseif dst_w_bpp == 16
-    .set dst_bpp_shift, 1
-.elseif dst_w_bpp == 8
-    .set dst_bpp_shift, 0
-.else
-    .error "requested dst bpp (dst_w_bpp) is not supported"
-.endif
-
-.if (((flags) & FLAG_DST_READWRITE) != 0)
-    .set dst_r_bpp, dst_w_bpp
-.else
-    .set dst_r_bpp, 0
-.endif
-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
-    .set DEINTERLEAVE_32BPP_ENABLED, 1
-.else
-    .set DEINTERLEAVE_32BPP_ENABLED, 0
-.endif
-
-.if prefetch_distance < 0 || prefetch_distance > 15
-    .error "invalid prefetch distance (prefetch_distance)"
-.endif
-
-.if src_bpp > 0
-    ldr         SRC, [sp, #40]
-.endif
-.if mask_bpp > 0
-    ldr         MASK, [sp, #48]
-.endif
-    PF mov      PF_X, #0
-.if src_bpp > 0
-    ldr         SRC_STRIDE, [sp, #44]
-.endif
-.if mask_bpp > 0
-    ldr         MASK_STRIDE, [sp, #52]
-.endif
-    mov         DST_R, DST_W
-
-.if src_bpp == 24
-    sub         SRC_STRIDE, SRC_STRIDE, W
-    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
-.endif
-.if mask_bpp == 24
-    sub         MASK_STRIDE, MASK_STRIDE, W
-    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
-.endif
-.if dst_w_bpp == 24
-    sub         DST_STRIDE, DST_STRIDE, W
-    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
-.endif
-
-/*
- * Setup advanced prefetcher initial state
- */
-    PF mov      PF_SRC, SRC
-    PF mov      PF_DST, DST_R
-    PF mov      PF_MASK, MASK
-    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
-    PF mov      PF_CTL, H, lsl #4
-    PF add      PF_CTL, #(prefetch_distance - 0x10)
-
-    init
-.if regs_shortage
-    push        {r0, r1}
-.endif
-    subs        H, H, #1
-.if regs_shortage
-    str         H, [sp, #4] /* save updated height to stack */
-.else
-    mov         ORIG_W, W
-.endif
-    blt         9f
-    cmp         W, #(pixblock_size * 2)
-    blt         8f
-/*
- * This is the start of the pipelined loop, which is optimized for
- * long scanlines
- */
-0:
-    ensure_destination_ptr_alignment process_pixblock_head, \
-                                     process_pixblock_tail, \
-                                     process_pixblock_tail_head
-
-    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
-    pixld_a     pixblock_size, dst_r_bpp, \
-                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
-    fetch_src_pixblock
-    pixld       pixblock_size, mask_bpp, \
-                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-    PF add      PF_X, PF_X, #pixblock_size
-    process_pixblock_head
-    cache_preload 0, pixblock_size
-    cache_preload_simple
-    subs        W, W, #(pixblock_size * 2)
-    blt         2f
-1:
-    process_pixblock_tail_head
-    cache_preload_simple
-    subs        W, W, #pixblock_size
-    bge         1b
-2:
-    process_pixblock_tail
-    pixst_a     pixblock_size, dst_w_bpp, \
-                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
-
-    /* Process the remaining trailing pixels in the scanline */
-    process_trailing_pixels 1, 1, \
-                            process_pixblock_head, \
-                            process_pixblock_tail, \
-                            process_pixblock_tail_head
-    advance_to_next_scanline 0b
-
-.if regs_shortage
-    pop         {r0, r1}
-.endif
-    cleanup
-    pop         {r4-r12, pc}  /* exit */
-/*
- * This is the start of the loop, designed to process images with small width
- * (less than pixblock_size * 2 pixels). In this case neither pipelining
- * nor prefetching is used.
- */
-8:
-    /* Process exactly pixblock_size pixels if needed */
-    tst         W, #pixblock_size
-    beq         1f
-    pixld       pixblock_size, dst_r_bpp, \
-                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
-    fetch_src_pixblock
-    pixld       pixblock_size, mask_bpp, \
-                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-    process_pixblock_head
-    process_pixblock_tail
-    pixst       pixblock_size, dst_w_bpp, \
-                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
-1:
-    /* Process the remaining trailing pixels in the scanline */
-    process_trailing_pixels 0, 0, \
-                            process_pixblock_head, \
-                            process_pixblock_tail, \
-                            process_pixblock_tail_head
-    advance_to_next_scanline 8b
-9:
-.if regs_shortage
-    pop         {r0, r1}
-.endif
-    cleanup
-    pop         {r4-r12, pc}  /* exit */
-
-    .purgem     fetch_src_pixblock
-    .purgem     pixld_src
-
-    .unreq      SRC
-    .unreq      MASK
-    .unreq      DST_R
-    .unreq      DST_W
-    .unreq      ORIG_W
-    .unreq      W
-    .unreq      H
-    .unreq      SRC_STRIDE
-    .unreq      DST_STRIDE
-    .unreq      MASK_STRIDE
-    .unreq      PF_CTL
-    .unreq      PF_X
-    .unreq      PF_SRC
-    .unreq      PF_DST
-    .unreq      PF_MASK
-    .unreq      DUMMY
-    .endfunc
-.endm
-
-/*
- * A simplified variant of the function generation template for single
- * scanline processing (for implementing pixman combine functions)
- */
-.macro generate_composite_function_scanline        use_nearest_scaling, \
-                                                   fname, \
-                                                   src_bpp_, \
-                                                   mask_bpp_, \
-                                                   dst_w_bpp_, \
-                                                   flags, \
-                                                   pixblock_size_, \
-                                                   init, \
-                                                   cleanup, \
-                                                   process_pixblock_head, \
-                                                   process_pixblock_tail, \
-                                                   process_pixblock_tail_head, \
-                                                   dst_w_basereg_ = 28, \
-                                                   dst_r_basereg_ = 4, \
-                                                   src_basereg_   = 0, \
-                                                   mask_basereg_  = 24
-
-    pixman_asm_function fname
-
-    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
-/*
- * Make some macro arguments globally visible and accessible
- * from other macros
- */
-    .set src_bpp, src_bpp_
-    .set mask_bpp, mask_bpp_
-    .set dst_w_bpp, dst_w_bpp_
-    .set pixblock_size, pixblock_size_
-    .set dst_w_basereg, dst_w_basereg_
-    .set dst_r_basereg, dst_r_basereg_
-    .set src_basereg, src_basereg_
-    .set mask_basereg, mask_basereg_
-
-.if use_nearest_scaling != 0
-    /*
-     * Assign symbolic names to registers for nearest scaling
-     */
-    W           .req        r0
-    DST_W       .req        r1
-    SRC         .req        r2
-    VX          .req        r3
-    UNIT_X      .req        ip
-    MASK        .req        lr
-    TMP1        .req        r4
-    TMP2        .req        r5
-    DST_R       .req        r6
-    SRC_WIDTH_FIXED .req        r7
-
-    .macro pixld_src x:vararg
-        pixld_s x
-    .endm
-
-    ldr         UNIT_X, [sp]
-    push        {r4-r8, lr}
-    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
-    .if mask_bpp != 0
-    ldr         MASK, [sp, #(24 + 8)]
-    .endif
-.else
-    /*
-     * Assign symbolic names to registers
-     */
-    W           .req        r0      /* width (is updated during processing) */
-    DST_W       .req        r1      /* destination buffer pointer for writes */
-    SRC         .req        r2      /* source buffer pointer */
-    DST_R       .req        ip      /* destination buffer pointer for reads */
-    MASK        .req        r3      /* mask pointer */
-
-    .macro pixld_src x:vararg
-        pixld x
-    .endm
-.endif
-
-.if (((flags) & FLAG_DST_READWRITE) != 0)
-    .set dst_r_bpp, dst_w_bpp
-.else
-    .set dst_r_bpp, 0
-.endif
-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
-    .set DEINTERLEAVE_32BPP_ENABLED, 1
-.else
-    .set DEINTERLEAVE_32BPP_ENABLED, 0
-.endif
-
-    .macro fetch_src_pixblock
-        pixld_src   pixblock_size, src_bpp, \
-                    (src_basereg - pixblock_size * src_bpp / 64), SRC
-    .endm
-
-    init
-    mov         DST_R, DST_W
-
-    cmp         W, #pixblock_size
-    blt         8f
-
-    ensure_destination_ptr_alignment process_pixblock_head, \
-                                     process_pixblock_tail, \
-                                     process_pixblock_tail_head
-
-    subs        W, W, #pixblock_size
-    blt         7f
-
-    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
-    pixld_a     pixblock_size, dst_r_bpp, \
-                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
-    fetch_src_pixblock
-    pixld       pixblock_size, mask_bpp, \
-                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-    process_pixblock_head
-    subs        W, W, #pixblock_size
-    blt         2f
-1:
-    process_pixblock_tail_head
-    subs        W, W, #pixblock_size
-    bge         1b
-2:
-    process_pixblock_tail
-    pixst_a     pixblock_size, dst_w_bpp, \
-                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
-7:
-    /* Process the remaining trailing pixels in the scanline (dst aligned) */
-    process_trailing_pixels 0, 1, \
-                            process_pixblock_head, \
-                            process_pixblock_tail, \
-                            process_pixblock_tail_head
-
-    cleanup
-.if use_nearest_scaling != 0
-    pop         {r4-r8, pc}  /* exit */
-.else
-    bx          lr  /* exit */
-.endif
-8:
-    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
-    process_trailing_pixels 0, 0, \
-                            process_pixblock_head, \
-                            process_pixblock_tail, \
-                            process_pixblock_tail_head
-
-    cleanup
-
-.if use_nearest_scaling != 0
-    pop         {r4-r8, pc}  /* exit */
-
-    .unreq      DST_R
-    .unreq      SRC
-    .unreq      W
-    .unreq      VX
-    .unreq      UNIT_X
-    .unreq      TMP1
-    .unreq      TMP2
-    .unreq      DST_W
-    .unreq      MASK
-    .unreq      SRC_WIDTH_FIXED
-
-.else
-    bx          lr  /* exit */
-
-    .unreq      SRC
-    .unreq      MASK
-    .unreq      DST_R
-    .unreq      DST_W
-    .unreq      W
-.endif
-
-    .purgem     fetch_src_pixblock
-    .purgem     pixld_src
-
-    .endfunc
-.endm
-
-.macro generate_composite_function_single_scanline x:vararg
-    generate_composite_function_scanline 0, x
-.endm
-
-.macro generate_composite_function_nearest_scanline x:vararg
-    generate_composite_function_scanline 1, x
-.endm
-
-/* Default prologue/epilogue, nothing special needs to be done */
-
-.macro default_init
-.endm
-
-.macro default_cleanup
-.endm
-
-/*
- * Prologue/epilogue variant which additionally saves/restores d8-d15
- * registers (they need to be saved/restored by the callee according to the ABI).
- * This is required if the code needs to use all the NEON registers.
- */
-
-.macro default_init_need_all_regs
-    vpush       {d8-d15}
-.endm
-
-.macro default_cleanup_need_all_regs
-    vpop        {d8-d15}
-.endm
-
-/******************************************************************************/
-
-/*
- * Conversion of 8 r5g6b5 pixels packed in a 128-bit register (in)
- * into a planar a8r8g8b8 format (with a, r, g, b color components
- * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
- *
- * Warning: the conversion is destructive and the original
- *          value (in) is lost.
- */
-.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
-    vshrn.u16   out_r, in,    #8
-    vshrn.u16   out_g, in,    #3
-    vsli.u16    in,    in,    #5
-    vmov.u8     out_a, #255
-    vsri.u8     out_r, out_r, #5
-    vsri.u8     out_g, out_g, #6
-    vshrn.u16   out_b, in,    #2
-.endm
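
For reference, convert_0565_to_8888 above performs the same channel expansion
as this scalar C sketch (illustrative only; the helper name is not from the
source):

    #include <stdint.h>

    static uint32_t rgb565_to_argb8888(uint16_t p)
    {
        uint32_t r5 = (p >> 11) & 0x1f;
        uint32_t g6 = (p >> 5)  & 0x3f;
        uint32_t b5 =  p        & 0x1f;
        /* the vsri steps replicate the top bits into the low bits,
           so 0x1f maps to 0xff and 0x00 maps to 0x00 exactly */
        uint32_t r8 = (r5 << 3) | (r5 >> 2);
        uint32_t g8 = (g6 << 2) | (g6 >> 4);
        uint32_t b8 = (b5 << 3) | (b5 >> 2);
        return 0xff000000u | (r8 << 16) | (g8 << 8) | b8;
    }
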
-
-.macro convert_0565_to_x888 in, out_r, out_g, out_b
-    vshrn.u16   out_r, in,    #8
-    vshrn.u16   out_g, in,    #3
-    vsli.u16    in,    in,    #5
-    vsri.u8     out_r, out_r, #5
-    vsri.u8     out_g, out_g, #6
-    vshrn.u16   out_b, in,    #2
-.endm
-
-/*
- * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
- * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5
- * pixels packed in 128-bit register (out). Requires two temporary 128-bit
- * registers (tmp1, tmp2)
- */
-.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
-    vshll.u8    tmp1, in_g, #8
-    vshll.u8    out, in_r, #8
-    vshll.u8    tmp2, in_b, #8
-    vsri.u16    out, tmp1, #5
-    vsri.u16    out, tmp2, #11
-.endm
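
The reverse direction simply truncates each channel, as in this scalar sketch
(hypothetical helper, not part of the source):

    #include <stdint.h>

    static uint16_t argb8888_to_rgb565(uint32_t p)
    {
        uint32_t r = (p >> 16) & 0xff;
        uint32_t g = (p >> 8)  & 0xff;
        uint32_t b =  p        & 0xff;
        /* vshll widens each byte into the top of a 16-bit lane; the two
           vsri instructions then shift g and b into place below r */
        return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    }
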
-
-/*
- * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
- * returned in (out0, out1) registers pair. Requires one temporary
- * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
- * value from 'in' is lost
- */
-.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
-    vshl.u16    out0, in,   #5  /* G top 6 bits */
-    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
-    vsri.u16    in,   in,   #5  /* R is ready in top bits */
-    vsri.u16    out0, out0, #6  /* G is ready in top bits */
-    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
-    vshr.u16    out1, in,   #8  /* R is in place */
-    vsri.u16    out0, tmp,  #8  /* G & B is in place */
-    vzip.u16    out0, out1      /* everything is in place */
-.endm

+ 0 - 532
src/video/arm/pixman-arm-simd-asm.S

@@ -1,532 +0,0 @@
-/*
- * Copyright (c) 2016 RISC OS Open Ltd
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* Prevent the stack from becoming executable */
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
-	.text
-	.arch armv6
-	.object_arch armv4
-	.arm
-	.altmacro
-	.p2align 2
-
-#include "pixman-arm-asm.h"
-#include "pixman-arm-simd-asm.h"
-
-/* A head macro should do all processing which results in an output of up to
- * 16 bytes, as far as the final load instruction. The corresponding tail macro
- * should complete the processing of the up-to-16 bytes. The calling macro will
- * sometimes choose to insert a preload or a decrement of X between them.
- *   cond           ARM condition code for code block
- *   numbytes       Number of output bytes that should be generated this time
- *   firstreg       First WK register in which to place output
- *   unaligned_src  Whether to use non-wordaligned loads of source image
- *   unaligned_mask Whether to use non-wordaligned loads of mask image
- *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
- */
-
-/******************************************************************************/
-
-.macro FillRect32_init
-        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
-        mov     STRIDE_S, SRC
-        mov     MASK, SRC
-        mov     STRIDE_M, SRC
-.endm
-
-.macro FillRect16_init
-        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
-        orr     SRC, SRC, lsl #16
-        mov     STRIDE_S, SRC
-        mov     MASK, SRC
-        mov     STRIDE_M, SRC
-.endm
-
-.macro FillRect8_init
-        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
-        orr     SRC, SRC, lsl #8
-        orr     SRC, SRC, lsl #16
-        mov     STRIDE_S, SRC
-        mov     MASK, SRC
-        mov     STRIDE_M, SRC
-.endm
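
The three init macros above splat the fill colour across a full 32-bit word
(and then across four registers) so a single stm can write 16 bytes per
iteration. In C terms (a sketch; the helper names are illustrative):

    #include <stdint.h>

    static uint32_t splat8(uint8_t v)   /* FillRect8_init  */
    {
        uint32_t x = v;
        x |= x << 8;    /* orr SRC, SRC, lsl #8  */
        x |= x << 16;   /* orr SRC, SRC, lsl #16 */
        return x;
    }

    static uint32_t splat16(uint16_t v) /* FillRect16_init */
    {
        uint32_t x = v;
        x |= x << 16;   /* orr SRC, SRC, lsl #16 */
        return x;
    }
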
-
-.macro FillRect_process_tail  cond, numbytes, firstreg
-    WK4     .req    SRC
-    WK5     .req    STRIDE_S
-    WK6     .req    MASK
-    WK7     .req    STRIDE_M
-        pixst   cond, numbytes, 4, DST
-    .unreq  WK4
-    .unreq  WK5
-    .unreq  WK6
-    .unreq  WK7
-.endm
-
-generate_composite_function \
-    FillSurfaceRect32ARMSIMDAsm, 0, 0, 32, \
-    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
-    0, /* prefetch distance doesn't apply */ \
-    FillRect32_init \
-    nop_macro, /* newline */ \
-    nop_macro /* cleanup */ \
-    nop_macro /* process head */ \
-    FillRect_process_tail
-
-generate_composite_function \
-    FillSurfaceRect16ARMSIMDAsm, 0, 0, 16, \
-    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
-    0, /* prefetch distance doesn't apply */ \
-    FillRect16_init \
-    nop_macro, /* newline */ \
-    nop_macro /* cleanup */ \
-    nop_macro /* process head */ \
-    FillRect_process_tail
-
-generate_composite_function \
-    FillSurfaceRect8ARMSIMDAsm, 0, 0, 8, \
-    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
-    0, /* prefetch distance doesn't apply */ \
-    FillRect8_init \
-    nop_macro, /* newline */ \
-    nop_macro /* cleanup */ \
-    nop_macro /* process head */ \
-    FillRect_process_tail
-
-/******************************************************************************/
-
-/* This differs from the over_8888_8888 routine in Pixman in that the destination
- * alpha component is always left unchanged, and RGB components are not
- * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that
- * renormalisation is done by multiplying by 257/256 (with rounding) rather than
- * simply shifting right by 8 bits - removing the need to special-case alpha=0xff.
- */
-
-.macro RGBtoRGBPixelAlpha_init
-        line_saved_regs STRIDE_S, ORIG_W
-        mov     MASK, #0x80
-.endm
-
-.macro RGBtoRGBPixelAlpha_1pixel_translucent  s, d, tmp0, tmp1, tmp2, tmp3, half
-        uxtb    tmp3, s
-        uxtb    tmp0, d
-        sub     tmp0, tmp3, tmp0
-        uxtb    tmp3, s, ror #16
-        uxtb    tmp1, d, ror #16
-        sub     tmp1, tmp3, tmp1
-        uxtb    tmp3, s, ror #8
-        mov     s, s, lsr #24
-        uxtb    tmp2, d, ror #8
-        sub     tmp2, tmp3, tmp2
-        smlabb  tmp0, tmp0, s, half
-        smlabb  tmp1, tmp1, s, half
-        smlabb  tmp2, tmp2, s, half
-        add     tmp0, tmp0, asr #8
-        add     tmp1, tmp1, asr #8
-        add     tmp2, tmp2, asr #8
-        pkhbt   tmp0, tmp0, tmp1, lsl #16
-        and     tmp2, tmp2, #0xff00
-        uxtb16  tmp0, tmp0, ror #8
-        orr     tmp0, tmp0, tmp2
-        uadd8   d, d, tmp0
-.endm
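
Per channel, the macro above computes d += (s - d) * a with the 257/256
renormalisation described in the comment block. A scalar sketch of one channel
(assuming arithmetic right shift of negative values, as on the ARM target):

    #include <stdint.h>

    static uint8_t blend_channel(uint8_t s, uint8_t d, uint8_t a)
    {
        int t = (s - d) * a + 0x80;     /* smlabb with the 0x80 rounding term */
        t += t >> 8;                    /* add tmp, tmp, asr #8: *257/256     */
        return (uint8_t)(d + (t >> 8)); /* byte extracted by uxtb16, then uadd8 */
    }
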
-
-.macro RGBtoRGBPixelAlpha_1pixel_opaque  s, d
-        and     d, d, #0xff000000
-        bic     s, s, #0xff000000
-        orr     d, d, s
-.endm
-
-.macro RGBtoRGBPixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 16
-        ldm     SRC!, {WK0, WK1}
-        ldm     SRC!, {STRIDE_S, STRIDE_M}
-        ldrd    WK2, WK3, [DST], #16
-        orr     SCRATCH, WK0, WK1
-        and     ORIG_W, WK0, WK1
-        orr     SCRATCH, SCRATCH, STRIDE_S
-        and     ORIG_W, ORIG_W, STRIDE_S
-        orr     SCRATCH, SCRATCH, STRIDE_M
-        and     ORIG_W, ORIG_W, STRIDE_M
-        tst     SCRATCH, #0xff000000
- .elseif numbytes == 8
-        ldm     SRC!, {WK0, WK1}
-        ldm     DST!, {WK2, WK3}
-        orr     SCRATCH, WK0, WK1
-        and     ORIG_W, WK0, WK1
-        tst     SCRATCH, #0xff000000
- .else // numbytes == 4
-        ldr     WK0, [SRC], #4
-        ldr     WK2, [DST], #4
-        tst     WK0, #0xff000000
- .endif
-.endm
-
-.macro RGBtoRGBPixelAlpha_process_tail  cond, numbytes, firstreg
-        beq     20f @ all transparent
- .if numbytes == 16
-        cmp     ORIG_W, #0xff000000
-        bhs     10f @ all opaque
-        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
-        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
-        strd    WK2, WK3, [DST, #-16]
-        ldrd    WK0, WK1, [SRC, #-8]
-        ldrd    WK2, WK3, [DST, #-8]
-        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
-        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
-        b       19f
-10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
-        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
-        strd    WK2, WK3, [DST, #-16]
-        ldrd    WK0, WK1, [SRC, #-8]
-        ldrd    WK2, WK3, [DST, #-8]
-        RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
-        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
-19:     strd    WK2, WK3, [DST, #-8]
- .elseif numbytes == 8
-        cmp     ORIG_W, #0xff000000
-        bhs     10f @ all opaque
-        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
-        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
-        b       19f
-10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
-        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
-19:     strd    WK2, WK3, [DST, #-8]
- .else // numbytes == 4
-        cmp     WK0, #0xff000000
-        bhs     10f @ opaque
-        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
-        b       19f
-10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
-19:     str     WK2, [DST, #-4]
- .endif
-20:
-.endm
-
-generate_composite_function \
-    BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, \
-    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
-    2, /* prefetch distance */ \
-    RGBtoRGBPixelAlpha_init, \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    RGBtoRGBPixelAlpha_process_head, \
-    RGBtoRGBPixelAlpha_process_tail
-
-/******************************************************************************/
-
-.macro ARGBto565PixelAlpha_init
-        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
-        mov     MASK, #0x001f
-        mov     STRIDE_M, #0x0010
-        orr     MASK, MASK, MASK, lsl #16
-        orr     STRIDE_M, STRIDE_M, STRIDE_M, lsl #16
-.endm
-
-.macro ARGBto565PixelAlpha_newline
-        mov     STRIDE_S, #0x0200
-.endm
-
-/* On entry:
- * s holds 1 32bpp source pixel
- * d holds 1 16bpp destination pixel
- * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
- * other registers are temporaries
- * On exit:
- * Constant registers preserved
- */
-
-.macro ARGBto565PixelAlpha_1pixel_translucent  s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
-        mov     alpha, s, lsr #27
-        and     misc, s, #0xfc00
-        and     g, d, #0x07e0
-        pkhbt   rb, d, d, lsl #5
-        rsb     misc, g, misc, lsr #5
-        and     s, rbmask, s, lsr #3
-        and     rb, rbmask, rb
-        sub     s, s, rb
-        smlabb  misc, misc, alpha, ghalf
-        mla     s, s, alpha, rbhalf
-        add     misc, misc, misc, lsl #5
-        add     g, g, misc, asr #10
-        add     s, s, s, lsl #5
-        and     g, g, #0x07e0
-        add     rb, rb, s, asr #10
-        and     rb, rb, rbmask
-        pkhbt   rb, rb, rb, lsl #11
-        orr     d, rb, g
-        orr     d, d, rb, lsr #16
-.endm
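
The arithmetic here blends directly in 5/6-bit channel space: alpha is reduced
to 5 bits (s >> 27), and the multiply-by-33-then-shift-by-10 sequence
approximates division by 31 (full 5-bit alpha). A per-channel scalar sketch
(hypothetical helper):

    static int blend565_channel(int s, int d, int a5) /* a5 = alpha8 >> 3 */
    {
        int t = (s - d) * a5 + 16; /* mla/smlabb with the rounding term */
        t += t << 5;               /* t *= 33 ...                       */
        return d + (t >> 10);      /* ... then >> 10: roughly t / 31    */
    }
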
-
-/* On entry:
- * s holds 1 32bpp source pixel
- * d holds 1 16bpp destination pixel
- * rbmask holds 0x001f001f
- * On exit:
- * Constant registers preserved
- */
-
-.macro ARGBto565PixelAlpha_1pixel_opaque  s, d, rbmask
-        and     d, rbmask, s, lsr #3
-        and     s, s, #0xfc00
-        orr     d, d, d, lsr #5
-        orr     d, d, s, lsr #5
-.endm
-
-/* On entry:
- * s1, s2 hold 2 32bpp source pixels
- * d holds 2 16bpp destination pixels
- * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
- * other registers are temporaries
- * On exit:
- * Constant registers preserved
- * Blended results have been written through destination pointer
- */
-
-.macro ARGBto565PixelAlpha_2pixels_translucent  s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
-        mov     alpha, s1, lsr #27
-        and     misc, s1, #0xfc00
-        and     g, d, #0x07e0
-        pkhbt   rb, d, d, lsl #5
-        rsb     misc, g, misc, lsr #5
-        and     s1, rbmask, s1, lsr #3
-        and     rb, rbmask, rb
-        sub     s1, s1, rb
-        smlabb  misc, misc, alpha, ghalf
-        mla     s1, s1, alpha, rbhalf
-          uxth    d, d, ror #16
-        add     misc, misc, misc, lsl #5
-          mov     alpha, s2, lsr #27
-        add     g, g, misc, asr #10
-        add     s1, s1, s1, lsl #5
-        and     g, g, #0x07e0
-        add     rb, rb, s1, asr #10
-        and     rb, rb, rbmask
-          and     misc, s2, #0xfc00
-        pkhbt   rb, rb, rb, lsl #11
-          and     s1, d, #0x07e0
-          pkhbt   d, d, d, lsl #5
-          rsb     misc, s1, misc, lsr #5
-          and     s2, rbmask, s2, lsr #3
-          and     d, rbmask, d
-          sub     s2, s2, d
-          smlabb  misc, misc, alpha, ghalf
-          mla     s2, s2, alpha, rbhalf
-        orr     alpha, rb, g
-          add     misc, misc, misc, lsl #5
-        orr     alpha, alpha, rb, lsr #16
-          add     s1, s1, misc, asr #10
-          add     s2, s2, s2, lsl #5
-          and     s1, s1, #0x07e0
-          add     d, d, s2, asr #10
-          and     d, d, rbmask
-        strh    alpha, [DST, #-4]
-          pkhbt   d, d, d, lsl #11
-          orr     alpha, d, s1
-          orr     alpha, alpha, d, lsr #16
-          strh    alpha, [DST, #-2]
-.endm
-
-/* On entry:
- * s1, s2 hold 2 32bpp source pixels
- * rbmask holds 0x001f001f
- * other registers are temporaries
- * On exit:
- * Constant registers preserved
- * Blended results have been written through destination pointer
- */
-
-.macro ARGBto565PixelAlpha_2pixels_opaque  s1, s2, d, rbmask, g
-        and     g, s1, #0xfc00
-        and     d, rbmask, s1, lsr #3
-          and     s1, rbmask, s2, lsr #3
-        orr     d, d, d, lsr #5
-        orr     d, d, g, lsr #5
-          and     g, s2, #0xfc00
-        strh    d, [DST, #-4]
-          orr     s1, s1, s1, lsr #5
-          orr     s1, s1, g, lsr #5
-          strh    s1, [DST, #-2]
-.endm
-
-.macro ARGBto565PixelAlpha_2pixels_head
-        ldrd    WK0, WK1, [SRC], #8
-        ldr     WK2, [DST], #4
-        orr     SCRATCH, WK0, WK1
-        and     ORIG_W, WK0, WK1
-        tst     SCRATCH, #0xff000000
-.endm
-
-.macro ARGBto565PixelAlpha_2pixels_tail
-        beq     20f @ all transparent
-        cmp     ORIG_W, #0xff000000
-        bhs     10f @ all opaque
-        ARGBto565PixelAlpha_2pixels_translucent  WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
-        b       20f
-10:     ARGBto565PixelAlpha_2pixels_opaque  WK0, WK1, WK2, MASK, SCRATCH
-20:
-.endm
-
-.macro ARGBto565PixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 16
-        ARGBto565PixelAlpha_2pixels_head
-        ARGBto565PixelAlpha_2pixels_tail
-        ARGBto565PixelAlpha_2pixels_head
-        ARGBto565PixelAlpha_2pixels_tail
- .endif
- .if numbytes >= 8
-        ARGBto565PixelAlpha_2pixels_head
-        ARGBto565PixelAlpha_2pixels_tail
- .endif
- .if numbytes >= 4
-        ARGBto565PixelAlpha_2pixels_head
- .else // numbytes == 2
-        ldr     WK0, [SRC], #4
-        ldrh    WK2, [DST], #2
-        tst     WK0, #0xff000000
- .endif
-.endm
-
-.macro ARGBto565PixelAlpha_process_tail  cond, numbytes, firstreg
- .if numbytes >= 4
-        ARGBto565PixelAlpha_2pixels_tail
- .else // numbytes == 2
-        beq     20f @ all transparent
-        cmp     WK0, #0xff000000
-        bhs     10f @ opaque
-        ARGBto565PixelAlpha_1pixel_translucent  WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
-        b       19f
-10:     ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK
-19:     strh    WK2, [DST, #-2]
-20:
- .endif
-.endm
-
-generate_composite_function \
-    BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, \
-    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
-    2, /* prefetch distance */ \
-    ARGBto565PixelAlpha_init, \
-    ARGBto565PixelAlpha_newline, \
-    nop_macro, /* cleanup */ \
-    ARGBto565PixelAlpha_process_head, \
-    ARGBto565PixelAlpha_process_tail
-
- /******************************************************************************/
-
-.macro BGR888toRGB888_1pixel cond, reg, tmp
-        uxtb16&cond  tmp, WK&reg, ror #8
-        uxtb16&cond  WK&reg, WK&reg, ror #16
-        orr&cond     WK&reg, WK&reg, tmp, lsl #8
-.endm
-
-.macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2
-        uxtb16&cond  tmp1, WK&reg1, ror #8
-        uxtb16&cond  WK&reg1, WK&reg1, ror #16
-        uxtb16&cond  tmp2, WK&reg2, ror #8
-        uxtb16&cond  WK&reg2, WK&reg2, ror #16
-        orr&cond     WK&reg1, WK&reg1, tmp1, lsl #8
-        orr&cond     WK&reg2, WK&reg2, tmp2, lsl #8
-.endm
-
-.macro BGR888toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   cond, numbytes, firstreg, SRC, unaligned_src
-.endm
-
-.macro BGR888toRGB888_process_tail  cond, numbytes, firstreg
- .if numbytes >= 8
-        BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M
-  .if numbytes == 16
-        BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M
-  .endif
- .else @ numbytes == 4
-        BGR888toRGB888_1pixel cond, %(firstreg+0), MASK
- .endif
-.endm
-
-generate_composite_function \
-    Blit_XBGR8888_XRGB8888ARMSIMDAsm, 32, 0, 32, \
-    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
-    2, /* prefetch distance */ \
-    nop_macro, /* init */ \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    BGR888toRGB888_process_head, \
-    BGR888toRGB888_process_tail
-
-/******************************************************************************/
-
-.macro RGB444toRGB888_init
-        ldr     MASK, =0x0f0f0f0f
-        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
-        msr     CPSR_s, #0x50000
-.endm
-
-.macro RGB444toRGB888_1pixel reg, mask, tmp
-        pkhbt   WK&reg, WK&reg, WK&reg, lsl #12      @ 0000aaaarrrrggggaaaarrrrggggbbbb
-        and     WK&reg, mask, WK&reg                 @ 0000aaaa0000gggg0000rrrr0000bbbb
-        orr     WK&reg, WK&reg, WK&reg, lsl #4       @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
-        pkhtb   tmp, WK&reg, WK&reg, asr #8          @ aaaaaaaaggggggggggggggggrrrrrrrr
-        pkhbt   WK&reg, WK&reg, WK&reg, lsl #8       @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
-        sel     WK&reg, WK&reg, tmp                  @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
-.endm
-
-.macro RGB444toRGB888_2pixels in, out1, out2, mask, tmp1, tmp2
-        and     tmp1, mask, WK&in                    @ 0000RRRR0000BBBB0000rrrr0000bbbb
-        and     tmp2, mask, WK&in, lsr #4            @ 0000AAAA0000GGGG0000aaaa0000gggg
-        orr     tmp1, tmp1, tmp1, lsl #4             @ RRRRRRRRBBBBBBBBrrrrrrrrbbbbbbbb
-        orr     tmp2, tmp2, tmp2, lsl #4             @ AAAAAAAAGGGGGGGGaaaaaaaagggggggg
-        pkhtb   WK&out2, tmp2, tmp1, asr #16         @ AAAAAAAAGGGGGGGGRRRRRRRRBBBBBBBB
-        pkhbt   WK&out1, tmp1, tmp2, lsl #16         @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
-        pkhtb   tmp2, WK&out2, WK&out2, asr #8       @ AAAAAAAAGGGGGGGGGGGGGGGGRRRRRRRR
-        pkhtb   tmp1, WK&out1, WK&out1, asr #8       @ aaaaaaaaggggggggggggggggrrrrrrrr
-        pkhbt   WK&out1, WK&out1, WK&out1, lsl #8    @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
-        pkhbt   WK&out2, WK&out2, WK&out2, lsl #8    @ GGGGGGGGRRRRRRRRRRRRRRRRBBBBBBBB
-        sel     WK&out1, WK&out1, tmp1               @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
-        sel     WK&out2, WK&out2, tmp2               @ AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB
-.endm
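
Both macros above rely on the fact that replicating a 4-bit value into the
adjacent nibble scales it to 8 bits exactly (0xN becomes 0xNN). A scalar
sketch for one ARGB4444 pixel (hypothetical helper):

    #include <stdint.h>

    static uint32_t argb4444_to_argb8888(uint16_t p)
    {
        uint32_t a = (p >> 12) & 0xf, r = (p >> 8) & 0xf;
        uint32_t g = (p >> 4)  & 0xf, b =  p       & 0xf;
        /* (n << 4) | n is the "orr WK, WK, WK, lsl #4" step */
        return ((a << 4 | a) << 24) | ((r << 4 | r) << 16)
             | ((g << 4 | g) << 8)  |  (b << 4 | b);
    }
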
-
-.macro RGB444toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   cond, numbytes/2, firstreg, SRC, unaligned_src
-.endm
-
-.macro RGB444toRGB888_process_tail  cond, numbytes, firstreg
- .if numbytes >= 8
-  .if numbytes == 16
-        RGB444toRGB888_2pixels %(firstreg+1), %(firstreg+2), %(firstreg+3), MASK, STRIDE_M, SCRATCH
-  .endif
-        RGB444toRGB888_2pixels %(firstreg+0), %(firstreg+0), %(firstreg+1), MASK, STRIDE_M, SCRATCH
- .else @ numbytes == 4
-        RGB444toRGB888_1pixel %(firstreg+0), MASK, SCRATCH
- .endif
-.endm
-
-generate_composite_function \
-    Blit_RGB444_XRGB8888ARMSIMDAsm, 16, 0, 32, \
-    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
-    2, /* prefetch distance */ \
-    RGB444toRGB888_init, \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    RGB444toRGB888_process_head, \
-    RGB444toRGB888_process_tail

+ 0 - 1034
src/video/arm/pixman-arm-simd-asm.h

@@ -1,1034 +0,0 @@
-/*
- * Copyright (c) 2012 Raspberry Pi Foundation
- * Copyright (c) 2012 RISC OS Open Ltd
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/*
- * Because the alignment of pixel data to cachelines, and even the number of
- * cachelines per row, can vary from row to row, and because of the need to
- * preload each scanline once and only once, this prefetch strategy treats
- * each row of pixels independently. When a pixel row is long enough, there
- * are three distinct phases of prefetch:
- * * an inner loop section, where each time a cacheline of data is
- *    processed, another cacheline is preloaded (the exact distance ahead is
- *    determined empirically using profiling results from lowlevel-blt-bench)
- * * a leading section, where enough cachelines are preloaded to ensure no
- *    cachelines escape being preloaded when the inner loop starts
- * * a trailing section, where a limited number (0 or more) of cachelines
- *    are preloaded to deal with data (if any) that hangs off the end of the
- *    last iteration of the inner loop, plus any trailing bytes that were not
- *    enough to make up one whole iteration of the inner loop
- * 
- * There are (in general) three distinct code paths, selected between
- * depending upon how long the pixel row is. If it is long enough that there
- * is at least one iteration of the inner loop (as described above) then
- * this is described as the "wide" case. If it is shorter than that, but
- * there are still enough bytes output that there is at least one 16-byte-
- * long, 16-byte-aligned write to the destination (the optimum type of
- * write), then this is the "medium" case. If it is not even this long, then
- * this is the "narrow" case, and there is no attempt to align writes to
- * 16-byte boundaries. In the "medium" and "narrow" cases, all the
- * cachelines containing data from the pixel row are prefetched up-front.
- */
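
The wide/medium/narrow selection happens in the entry code of
generate_composite_function_common, later in this header. As pseudo-C (the
names are illustrative, but the thresholds match the cmp instructions there):

    enum path { NARROW, MEDIUM, WIDE };

    enum path classify_row(int w, int dst_w_bpp, int pix_per_block,
                           int prefetch_distance)
    {
        if (w < 2 * 16 * 8 / dst_w_bpp - 1)  /* no aligned 16-byte write fits */
            return NARROW;
        if (w < (prefetch_distance + 3) * pix_per_block - 1) /* pipeline too short */
            return MEDIUM;
        return WIDE;
    }
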
-
-/*
- * Determine whether we put the arguments on the stack for debugging.
- */
-#undef DEBUG_PARAMS
-
-/*
- * Bit flags for 'generate_composite_function' macro which are used
- * to tune generated functions behavior.
- */
-.set FLAG_DST_WRITEONLY,         0
-.set FLAG_DST_READWRITE,         1
-.set FLAG_COND_EXEC,             0
-.set FLAG_BRANCH_OVER,           2
-.set FLAG_PROCESS_PRESERVES_PSR, 0
-.set FLAG_PROCESS_CORRUPTS_PSR,  4
-.set FLAG_PROCESS_DOESNT_STORE,  0
-.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
-.set FLAG_NO_SPILL_LINE_VARS,        0
-.set FLAG_SPILL_LINE_VARS_WIDE,      16
-.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
-.set FLAG_SPILL_LINE_VARS,           48
-.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
-.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
-.set FLAG_PROCESS_PRESERVES_WK0,     0
-.set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
-.set FLAG_PRELOAD_DST,               0
-.set FLAG_NO_PRELOAD_DST,            256
-
-/*
- * Number of bytes by which to adjust preload offset of destination
- * buffer (allows preload instruction to be moved before the load(s))
- */
-.set DST_PRELOAD_BIAS, 0
-
-/*
- * Offset into stack where mask and source pointer/stride can be accessed.
- */
-#ifdef DEBUG_PARAMS
-.set ARGS_STACK_OFFSET,        (9*4+9*4)
-#else
-.set ARGS_STACK_OFFSET,        (9*4)
-#endif
-
-/*
- * Offset into stack where space allocated during init macro can be accessed.
- */
-.set LOCALS_STACK_OFFSET,     0
-
-/*
- * Constants for selecting preferable prefetch type.
- */
-.set PREFETCH_TYPE_NONE,       0
-.set PREFETCH_TYPE_STANDARD,   1
-
-/*
- * Definitions of macros for load/store of pixel data.
- */
-
-.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
- .if numbytes == 16
-  .if unaligned == 1
-        op&r&cond    WK&reg0, [base], #4
-        op&r&cond    WK&reg1, [base], #4
-        op&r&cond    WK&reg2, [base], #4
-        op&r&cond    WK&reg3, [base], #4
-  .else
-        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
-  .endif
- .elseif numbytes == 8
-  .if unaligned == 1
-        op&r&cond    WK&reg0, [base], #4
-        op&r&cond    WK&reg1, [base], #4
-  .else
-        op&m&cond&ia base!, {WK&reg0,WK&reg1}
-  .endif
- .elseif numbytes == 4
-        op&r&cond    WK&reg0, [base], #4
- .elseif numbytes == 2
-        op&r&cond&h  WK&reg0, [base], #2
- .elseif numbytes == 1
-        op&r&cond&b  WK&reg0, [base], #1
- .else
-  .error "unsupported size: numbytes"
- .endif
-.endm
-
-.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
- .if numbytes == 16
-        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
- .elseif numbytes == 8
-        stm&cond&db base, {WK&reg0,WK&reg1}
- .elseif numbytes == 4
-        str&cond    WK&reg0, [base, #-4]
- .elseif numbytes == 2
-        str&cond&h  WK&reg0, [base, #-2]
- .elseif numbytes == 1
-        str&cond&b  WK&reg0, [base, #-1]
- .else
-  .error "unsupported size: numbytes"
- .endif
-.endm
-
-.macro pixld cond, numbytes, firstreg, base, unaligned
-        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
-.endm
-
-.macro pixst cond, numbytes, firstreg, base
- .if (flags) & FLAG_DST_READWRITE
-        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
- .else
-        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
- .endif
-.endm
-
-.macro PF a, x:vararg
- .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
-        a x
- .endif
-.endm
-
-
-.macro preload_leading_step1  bpp, ptr, base
-/* If the destination is already 16-byte aligned, then we need to preload
- * between 0 and prefetch_distance (inclusive) cache lines ahead so there
- * are no gaps when the inner loop starts.
- */
- .if bpp > 0
-        PF  bic,    ptr, base, #31
-  .set OFFSET, 0
-  .rept prefetch_distance+1
-        PF  pld,    [ptr, #OFFSET]
-   .set OFFSET, OFFSET+32
-  .endr
- .endif
-.endm
-
-.macro preload_leading_step2  bpp, bpp_shift, ptr, base
-/* However, if the destination is not 16-byte aligned, we may need to
- * preload more cache lines than that. The question we need to ask is:
- * are the bytes corresponding to the leading pixels more than the amount
- * by which the source pointer will be rounded down for preloading, and if
- * so, by how many cache lines? Effectively, we want to calculate
- *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
- *     inner_loop_offset = (src+leading_bytes)&31
- *     extra_needed = leading_bytes - inner_loop_offset
- * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
- * possible when there are 4 src bytes for every 1 dst byte).
- */
- .if bpp > 0
-  .ifc base,DST
-        /* The test can be simplified further when preloading the destination */
-        PF  tst,    base, #16
-        PF  beq,    61f
-  .else
-   .if bpp/dst_w_bpp == 4
-        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
-        PF  and,    SCRATCH, SCRATCH, #31
-        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
-        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
-        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */
-        PF  bcs,    61f
-        PF  bpl,    60f
-        PF  pld,    [ptr, #32*(prefetch_distance+2)]
-   .else
-        PF  mov,    SCRATCH, base, lsl #32-5
-        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
-        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
-        PF  bls,    61f
-   .endif
-  .endif
-60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
-61:
- .endif
-.endm
-
-#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
-.macro preload_middle   bpp, base, scratch_holds_offset
- .if bpp > 0
-        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
-  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
-   .if scratch_holds_offset
-        PF  pld,    [base, SCRATCH]
-   .else
-        PF  bic,    SCRATCH, base, #31
-        PF  pld,    [SCRATCH, #32*prefetch_distance]
-   .endif
-  .endif
- .endif
-.endm
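
IS_END_OF_GROUP is a branch-free way of firing once per group of SIZE
consecutive indices, on the last index of each group (for SIZE a power of
two). A quick demonstration:

    #include <stdio.h>

    #define IS_END_OF_GROUP(INDEX,SIZE) \
        ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))

    int main(void)
    {
        for (int i = 0; i < 8; i++)  /* groups of 4: fires at i = 3 and 7 */
            printf("%d %d\n", i, IS_END_OF_GROUP(i, 4) ? 1 : 0);
        return 0;
    }
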
-
-.macro preload_trailing  bpp, bpp_shift, base
- .if bpp > 0
-  .if bpp*pix_per_block > 256
-        /* Calculations are more complex if more than one fetch per block */
-        PF  and,    WK1, base, #31
-        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
-        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
-        PF  bic,    SCRATCH, base, #31
-80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
-        PF  add,    SCRATCH, SCRATCH, #32
-        PF  subs,   WK1, WK1, #32
-        PF  bhi,    80b
-  .else
-        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
-        PF  mov,    SCRATCH, base, lsl #32-5
-        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
-        PF  adceqs, SCRATCH, SCRATCH, #0
-        /* The instruction above has two effects: ensures Z is only
-         * set if C was clear (so Z indicates that both shifted quantities
-         * were 0), and clears C if Z was set (so C indicates that the sum
-         * of the shifted quantities was greater than, and not equal to, 32) */
-        PF  beq,    82f
-        PF  bic,    SCRATCH, base, #31
-        PF  bcc,    81f
-        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
-81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
-82:
-  .endif
- .endif
-.endm
-
-
-.macro preload_line    narrow_case, bpp, bpp_shift, base
-/* "narrow_case" - just means that the macro was invoked from the "narrow"
- *    code path rather than the "medium" one - because in the narrow case,
- *    the row of pixels is known to output no more than 30 bytes, so
- *    (assuming the source pixels are no wider than the destination
- *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
- *    meaning there's no need for a loop.
- * "bpp" - number of bits per pixel in the channel (source, mask or
- *    destination) that's being preloaded, or 0 if this channel is not used
- *    for reading
- * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
- * "base" - base address register of channel to preload (SRC, MASK or DST)
- */
- .if bpp > 0
-  .if narrow_case && (bpp <= dst_w_bpp)
-        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
-        PF  bic,    WK0, base, #31
-        PF  pld,    [WK0]
-        PF  add,    WK1, base, X, LSL #bpp_shift
-        PF  sub,    WK1, WK1, #1
-        PF  bic,    WK1, WK1, #31
-        PF  cmp,    WK1, WK0
-        PF  beq,    90f
-        PF  pld,    [WK1]
-90:
-  .else
-        PF  bic,    WK0, base, #31
-        PF  pld,    [WK0]
-        PF  add,    WK1, base, X, lsl #bpp_shift
-        PF  sub,    WK1, WK1, #1
-        PF  bic,    WK1, WK1, #31
-        PF  cmp,    WK1, WK0
-        PF  beq,    92f
-91:     PF  add,    WK0, WK0, #32
-        PF  cmp,    WK0, WK1
-        PF  pld,    [WK0]
-        PF  bne,    91b
-92:
-  .endif
- .endif
-.endm
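
The general (loop) branch of preload_line touches every 32-byte cacheline
covered by the row. A C sketch using the GCC/Clang prefetch builtin in place
of pld:

    #include <stdint.h>

    static void preload_row(const void *base, unsigned x, int bpp_shift)
    {
        uintptr_t first = (uintptr_t)base & ~(uintptr_t)31;
        uintptr_t last  = ((uintptr_t)base + ((uintptr_t)x << bpp_shift) - 1)
                          & ~(uintptr_t)31;
        for (uintptr_t p = first; p <= last; p += 32)
            __builtin_prefetch((const void *)p);
    }
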
-
-
-.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
-        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
- .if decrementx
-        sub&cond X, X, #8*numbytes/dst_w_bpp
- .endif
-        process_tail  cond, numbytes, firstreg
- .if !((flags) & FLAG_PROCESS_DOES_STORE)
-        pixst   cond, numbytes, firstreg, DST
- .endif
-.endm
-
-.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
- .if (flags) & FLAG_BRANCH_OVER
-  .ifc cond,mi
-        bpl     100f
-  .endif
-  .ifc cond,cs
-        bcc     100f
-  .endif
-  .ifc cond,ne
-        beq     100f
-  .endif
-        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
-100:
- .else
-        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
- .endif
-.endm
-
-.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
- .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
-        /* Can't interleave reads and writes */
-        test
-        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
-  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
-        test
-  .endif
-        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
- .else
-        /* Can interleave reads and writes for better scheduling */
-        test
-        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
-        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
-  .if decrementx
-        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
-        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
-  .endif
-        process_tail  cond1, numbytes1, firstreg1
-        process_tail  cond2, numbytes2, firstreg2
-        pixst   cond1, numbytes1, firstreg1, DST
-        pixst   cond2, numbytes2, firstreg2, DST
- .endif
-.endm
-
-
-.macro test_bits_1_0_ptr
- .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
-        movs    SCRATCH, X, lsl #32-1  /* C,N = bits 1,0 of DST */
- .else
-        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
- .endif
-.endm
-
-.macro test_bits_3_2_ptr
- .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
-        movs    SCRATCH, X, lsl #32-3  /* C,N = bits 3, 2 of DST */
- .else
-        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
- .endif
-.endm
-
-.macro leading_15bytes  process_head, process_tail
-        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
- .set DECREMENT_X, 1
- .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
-  .set DECREMENT_X, 0
-        sub     X, X, WK0, lsr #dst_bpp_shift
-        str     X, [sp, #LINE_SAVED_REG_COUNT*4]
-        mov     X, WK0
- .endif
-        /* Use unaligned loads in all cases for simplicity */
- .if dst_w_bpp == 8
-        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
- .elseif dst_w_bpp == 16
-        test_bits_1_0_ptr
-        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
- .endif
-        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
- .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
-        ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]
- .endif
-.endm
-
-.macro test_bits_3_2_pix
-        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
-.endm
-
-.macro test_bits_1_0_pix
- .if dst_w_bpp == 8
-        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
- .else
-        movs    SCRATCH, X, lsr #1
- .endif
-.endm
-
-.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
-        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
- .if dst_w_bpp == 16
-        test_bits_1_0_pix
-        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
- .elseif dst_w_bpp == 8
-        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
- .endif
-.endm
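
Together, leading_15bytes and trailing_15bytes peel off sub-16-byte chunks by
testing individual bits of the byte count, so no loop is needed. Schematically
(process(n) stands in for a head/tail macro pair handling n bytes; this is a
sketch, not code from the source):

    size_t lead = (16 - ((uintptr_t)dst & 15)) & 15; /* bytes to alignment */
    if (lead & 1) process(1);
    if (lead & 2) process(2);
    if (lead & 4) process(4);
    if (lead & 8) process(8);
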
-
-
-.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
-110:
- .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
- .rept pix_per_block*dst_w_bpp/128
-        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
-  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
-        preload_middle  src_bpp, SRC, 1
-  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
-        preload_middle  mask_bpp, MASK, 1
-  .else
-        preload_middle  src_bpp, SRC, 0
-        preload_middle  mask_bpp, MASK, 0
-  .endif
-  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
-        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
-         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
-         * preloads for, to achieve staggered prefetches for multiple channels, because there are
-         * always two STMs per prefetch, so there is always an opposite STM on which to put the
-         * preload. Note, no need to BIC the base register here */
-        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
-  .endif
-        process_tail  , 16, 0
-  .if !((flags) & FLAG_PROCESS_DOES_STORE)
-        pixst   , 16, 0, DST
-  .endif
-  .set SUBBLOCK, SUBBLOCK+1
- .endr
-        subs    X, X, #pix_per_block
-        bhs     110b
-.endm
-
-.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
-        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
- .if dst_r_bpp > 0
-        tst     DST, #16
-        bne     111f
-        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
-        b       112f
-111:
- .endif
-        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
-112:
-        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
- .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
-        PF  and,    WK0, X, #pix_per_block-1
- .endif
-        preload_trailing  src_bpp, src_bpp_shift, SRC
-        preload_trailing  mask_bpp, mask_bpp_shift, MASK
- .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
-        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
- .endif
-        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
-        /* The remainder of the line is handled identically to the medium case */
-        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
-.endm
-
-.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
-120:
-        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
-        process_tail  , 16, 0
- .if !((flags) & FLAG_PROCESS_DOES_STORE)
-        pixst   , 16, 0, DST
- .endif
-        subs    X, X, #128/dst_w_bpp
-        bhs     120b
-        /* Trailing pixels */
-        tst     X, #128/dst_w_bpp - 1
-        beq     exit_label
-        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
-.endm
-
-.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
-        tst     X, #16*8/dst_w_bpp
-        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
-        /* Trailing pixels */
-        /* In the narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
-        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
-.endm
-
-.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
- /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
- .if mask_bpp == 8 || mask_bpp == 16
-        tst     MASK, #3
-        bne     141f
- .endif
-  .if src_bpp == 8 || src_bpp == 16
-        tst     SRC, #3
-        bne     140f
-  .endif
-        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
-  .if src_bpp == 8 || src_bpp == 16
-        b       exit_label
-140:
-        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
-  .endif
- .if mask_bpp == 8 || mask_bpp == 16
-        b       exit_label
-141:
-  .if src_bpp == 8 || src_bpp == 16
-        tst     SRC, #3
-        bne     142f
-  .endif
-        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
-  .if src_bpp == 8 || src_bpp == 16
-        b       exit_label
-142:
-        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
-  .endif
- .endif
-.endm
-
-
-.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
- .if SINGLE_SCANLINE
-  .ifc "last_one",""
-        b       198f
-  .endif
- .else
- .if vars_spilled
-        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive */
-        /* This is ldmia sp,{} */
-        .word   0xE89D0000 | LINE_SAVED_REGS
- .endif
-        subs    Y, Y, #1
- .if vars_spilled
-  .if (LINE_SAVED_REGS) & (1<<1)
-        str     Y, [sp]
-  .endif
- .endif
-        add     DST, DST, STRIDE_D
- .if src_bpp > 0
-        add     SRC, SRC, STRIDE_S
- .endif
- .if mask_bpp > 0
-        add     MASK, MASK, STRIDE_M
- .endif
- .if restore_x
-        mov     X, ORIG_W
- .endif
-        bhs     loop_label
- .ifc "last_one",""
-  .if vars_spilled
-        b       197f
-  .else
-        b       198f
-  .endif
- .else
-  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
-        b       198f
-  .endif
- .endif
- .endif
-.endm
-
-
-.macro generate_composite_function_common fname, \
-                                          src_bpp_, \
-                                          mask_bpp_, \
-                                          dst_w_bpp_, \
-                                          flags_, \
-                                          prefetch_distance_, \
-                                          init, \
-                                          newline, \
-                                          cleanup, \
-                                          process_head, \
-                                          process_tail, \
-                                          process_inner_loop
-
-    pixman_asm_function fname
-
-/*
- * Make some macro arguments globally visible and accessible
- * from other macros
- */
- .set src_bpp, src_bpp_
- .set mask_bpp, mask_bpp_
- .set dst_w_bpp, dst_w_bpp_
- .set flags, flags_
- .set prefetch_distance, prefetch_distance_
-
-/*
- * Select prefetch type for this function.
- */
- .if prefetch_distance == 0
-  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
- .else
-  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
- .endif
-
- .if src_bpp == 32
-  .set src_bpp_shift, 2
- .elseif src_bpp == 24
-  .set src_bpp_shift, 0
- .elseif src_bpp == 16
-  .set src_bpp_shift, 1
- .elseif src_bpp == 8
-  .set src_bpp_shift, 0
- .elseif src_bpp == 0
-  .set src_bpp_shift, -1
- .else
-  .error "requested src bpp (src_bpp) is not supported"
- .endif
-
- .if mask_bpp == 32
-  .set mask_bpp_shift, 2
- .elseif mask_bpp == 24
-  .set mask_bpp_shift, 0
- .elseif mask_bpp == 8
-  .set mask_bpp_shift, 0
- .elseif mask_bpp == 0
-  .set mask_bpp_shift, -1
- .else
-  .error "requested mask bpp (mask_bpp) is not supported"
- .endif
-
- .if dst_w_bpp == 32
-  .set dst_bpp_shift, 2
- .elseif dst_w_bpp == 24
-  .set dst_bpp_shift, 0
- .elseif dst_w_bpp == 16
-  .set dst_bpp_shift, 1
- .elseif dst_w_bpp == 8
-  .set dst_bpp_shift, 0
- .else
-  .error "requested dst bpp (dst_w_bpp) is not supported"
- .endif
-
- .if (((flags) & FLAG_DST_READWRITE) != 0)
-  .set dst_r_bpp, dst_w_bpp
- .else
-  .set dst_r_bpp, 0
- .endif
-
- .set pix_per_block, 16*8/dst_w_bpp
- .if src_bpp != 0
-  .if 32*8/src_bpp > pix_per_block
-   .set pix_per_block, 32*8/src_bpp
-  .endif
- .endif
- .if mask_bpp != 0
-  .if 32*8/mask_bpp > pix_per_block
-   .set pix_per_block, 32*8/mask_bpp
-  .endif
- .endif
- .if dst_r_bpp != 0
-  .if 32*8/dst_r_bpp > pix_per_block
-   .set pix_per_block, 32*8/dst_r_bpp
-  .endif
- .endif
-
-/* The standard entry conditions set up by pixman-arm-common.h are:
- * r0 = width (pixels)
- * r1 = height (rows)
- * r2 = pointer to top-left pixel of destination
- * r3 = destination stride (pixels)
- * [sp] = source pixel value, or pointer to top-left pixel of source
- * [sp,#4] = 0 or source stride (pixels)
- * The following arguments are unused for non-mask operations
- * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
- * [sp,#12] = 0 or mask stride (pixels)
- *
- * or in the single-scanline case:
- * r0 = width (pixels)
- * r1 = pointer to top-left pixel of destination
- * r2 = pointer to top-left pixel of source
- * The following argument is unused for non-mask operations
- * r3 = pointer to top-left pixel of mask
- */
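
Rewritten as C prototypes under AAPCS (r0-r3 carry the first four arguments,
the rest arrive on the stack; the names are illustrative only):

    #include <stdint.h>

    void composite(int width, int height, uint32_t *dst, int dst_stride,
                   /* stack: */ uintptr_t src_value_or_ptr, int src_stride,
                                uintptr_t mask_value_or_ptr, int mask_stride);

    void composite_scanline(int width, uint32_t *dst,
                            const uint32_t *src, const uint32_t *mask);
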
-
-/*
- * Assign symbolic names to registers
- */
-    X           .req    r0  /* pixels to go on this line */
- .if SINGLE_SCANLINE
-    DST         .req    r1  /* destination pixel pointer */
-    SRC         .req    r2  /* source pixel pointer */
-    MASK        .req    r3  /* mask pixel pointer (if applicable) */
-    Y           .req    r4  /* temporary */
-    STRIDE_D    .req    r5  /* temporary */
-    STRIDE_S    .req    r6  /* temporary */
-    STRIDE_M    .req    r7  /* temporary */
- .else
-    Y           .req    r1  /* lines to go */
-    DST         .req    r2  /* destination pixel pointer */
-    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
-    SRC         .req    r4  /* source pixel pointer */
-    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
-    MASK        .req    r6  /* mask pixel pointer (if applicable) */
-    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
- .endif
-    WK0         .req    r8  /* pixel data registers */
-    WK1         .req    r9
-    WK2         .req    r10
-    WK3         .req    r11
-    SCRATCH     .req    r12
-    ORIG_W      .req    r14 /* width (pixels) */
-
-        push    {r4-r11, lr}        /* save all registers */
-
- .if !SINGLE_SCANLINE
-        subs    Y, Y, #1
-        blo     199f
- .endif
-
-#ifdef DEBUG_PARAMS
-        sub     sp, sp, #9*4
-#endif
-
- .if !SINGLE_SCANLINE
- .if src_bpp > 0
-        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
-        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
- .endif
- .if mask_bpp > 0
-        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
-        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
- .endif
- .endif
-        
-#ifdef DEBUG_PARAMS
-        add     Y, Y, #1
-        stmia   sp, {r0-r7,pc}
-        sub     Y, Y, #1
-#endif
-
-        init
-
- .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
-        /* Reserve a word in which to store X during leading pixels */
-        sub     sp, sp, #4
-  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
-  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
- .endif
-        
- .if !SINGLE_SCANLINE
-        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
-        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
- .if src_bpp > 0
-        lsl     STRIDE_S, #src_bpp_shift
-        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
- .endif
- .if mask_bpp > 0
-        lsl     STRIDE_M, #mask_bpp_shift
-        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
- .endif
- .endif
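The lsl/sub pairs above convert the per-row pixel strides into "bytes from the end of one row's processed span to the start of the next"; equivalently in C (name illustrative):

#include <stdint.h>

/* (stride_px << shift) - (width_px << shift), as computed by lsl + sub. */
static int32_t row_advance_bytes(int32_t stride_px, int32_t width_px, int shift)
{
    return (stride_px - width_px) << shift;
}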
- 
-        /* Too narrow to guarantee even one 16-byte-aligned, 16-byte block write? */
-        cmp     X, #2*16*8/dst_w_bpp - 1
-        blo     170f
- .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
-        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
-        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
-        blo     160f
-
-        /* Wide case */
-        /* Adjust X so that the decrement instruction can also test for
-         * inner loop termination. We want it to stop when there are
-         * (prefetch_distance+1) complete blocks to go. */
-        sub     X, X, #(prefetch_distance+2)*pix_per_block
-  .if !SINGLE_SCANLINE
-        mov     ORIG_W, X
-  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
-        /* stmdb sp!, {<LINE_SAVED_REGS>}, emitted as a raw opcode because
-         * the register list is only known as an assembly-time constant */
-        .word   0xE92D0000 | LINE_SAVED_REGS
-   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
-   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
-  .endif
-  .endif
-151:    /* New line */
-        newline
-        preload_leading_step1  src_bpp, WK1, SRC
-        preload_leading_step1  mask_bpp, WK2, MASK
-  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
-        preload_leading_step1  dst_r_bpp, WK3, DST
-  .endif
-        
-        ands    WK0, DST, #15
-        beq     154f
-        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */
-
-        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
-        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
-  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
-        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
-  .endif
-
-        leading_15bytes  process_head, process_tail
-        
-154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
-  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
-        and     SCRATCH, SRC, #31
-        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
-  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
-        and     SCRATCH, MASK, #31
-        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
-  .endif
-  .ifc "process_inner_loop",""
-        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
-  .else
-        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
-  .endif
-
-157:    /* Check for another line */
-        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
-  .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_WIDE)
-   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
-   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
-  .endif
- .endif
-
- .ltorg
-
-160:    /* Medium case */
- .if !SINGLE_SCANLINE
-        mov     ORIG_W, X
- .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
-        /* stmdb sp!, {<LINE_SAVED_REGS>}, emitted as a raw opcode because
-         * the register list is only known as an assembly-time constant */
-        .word   0xE92D0000 | LINE_SAVED_REGS
-  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
-  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
- .endif
- .endif
-161:    /* New line */
-        newline
-        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
-        preload_line 0, mask_bpp, mask_bpp_shift, MASK
- .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
-        preload_line 0, dst_r_bpp, dst_bpp_shift, DST
- .endif
-        
-        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
-        ands    WK0, DST, #15
-        beq     164f
-        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */
-        
-        leading_15bytes  process_head, process_tail
-        
-164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
-        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
-        
-167:    /* Check for another line */
-        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
-
- .ltorg
-
-170:    /* Narrow case: too few pixels to guarantee even one aligned 16-byte block (fewer than 31 bytes of output at 8 bpp) */
- .if !SINGLE_SCANLINE
- .if dst_w_bpp < 32
-        mov     ORIG_W, X
- .endif
- .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
-        /* stmdb sp!, {<LINE_SAVED_REGS>}, emitted as a raw opcode because
-         * the register list is only known as an assembly-time constant */
-        .word   0xE92D0000 | LINE_SAVED_REGS
- .endif
- .endif
-171:    /* New line */
-        newline
-        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
-        preload_line 1, mask_bpp, mask_bpp_shift, MASK
- .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
-        preload_line 1, dst_r_bpp, dst_bpp_shift, DST
- .endif
-        
- .if dst_w_bpp == 8
-        tst     DST, #3
-        beq     174f
-172:    subs    X, X, #1
-        blo     177f
-        process_head  , 1, 0, 1, 1, 0
-        process_tail  , 1, 0
-  .if !((flags) & FLAG_PROCESS_DOES_STORE)
-        pixst   , 1, 0, DST
-  .endif
-        tst     DST, #3
-        bne     172b
- .elseif dst_w_bpp == 16
-        tst     DST, #2
-        beq     174f
-        subs    X, X, #1
-        blo     177f
-        process_head  , 2, 0, 1, 1, 0
-        process_tail  , 2, 0
-  .if !((flags) & FLAG_PROCESS_DOES_STORE)
-        pixst   , 2, 0, DST
-  .endif
- .endif
-
-174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
-        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
-
-177:    /* Check for another line */
-        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
- .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE)
-  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
-  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
- .endif
-
-197:
- .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS)
-        add     sp, sp, #LINE_SAVED_REG_COUNT*4
- .endif
-198:
- .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
-  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
-  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
-        add     sp, sp, #4
- .endif
-
-        cleanup
-
-#ifdef DEBUG_PARAMS
-        add     sp, sp, #9*4 /* junk the debug copy of arguments */
-#endif
-199:
-        pop     {r4-r11, pc}  /* exit */
-
- .ltorg
-
-    .unreq  X
-    .unreq  Y
-    .unreq  DST
-    .unreq  STRIDE_D
-    .unreq  SRC
-    .unreq  STRIDE_S
-    .unreq  MASK
-    .unreq  STRIDE_M
-    .unreq  WK0
-    .unreq  WK1
-    .unreq  WK2
-    .unreq  WK3
-    .unreq  SCRATCH
-    .unreq  ORIG_W
-    .endfunc
-.endm
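For orientation, a C sketch of the width dispatch implemented in the macro body above, between the narrow (170:), medium (160:) and wide (151:) paths. It assumes the pix_per_block value computed earlier, and that fills (no src, mask, or destination read) fall straight through to the medium path, as the source comment notes; names are illustrative:

#include <stdint.h>

enum path { NARROW, MEDIUM, WIDE };

static enum path choose_path(uint32_t width_px, int dst_w_bpp,
                             int pix_per_block, int prefetch_distance,
                             int is_fill)
{
    if (width_px < 2u * 16 * 8 / dst_w_bpp - 1)
        return NARROW;   /* no aligned 16-byte block assured: cmp/blo to 170f */
    if (is_fill)
        return MEDIUM;   /* wide and medium cases are the same for fill */
    if (width_px < (uint32_t)(prefetch_distance + 3) * pix_per_block - 1)
        return MEDIUM;   /* too short to keep the prefetch ahead: blo to 160f */
    return WIDE;
}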
-
-.macro generate_composite_function fname, \
-                                   src_bpp_, \
-                                   mask_bpp_, \
-                                   dst_w_bpp_, \
-                                   flags_, \
-                                   prefetch_distance_, \
-                                   init, \
-                                   newline, \
-                                   cleanup, \
-                                   process_head, \
-                                   process_tail, \
-                                   process_inner_loop
- .set SINGLE_SCANLINE, 0
-generate_composite_function_common \
-    fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \
-    init, newline, cleanup, process_head, process_tail, process_inner_loop
-.endm
-
-.macro generate_composite_function_single_scanline fname, \
-                                                   src_bpp_, \
-                                                   mask_bpp_, \
-                                                   dst_w_bpp_, \
-                                                   flags_, \
-                                                   prefetch_distance_, \
-                                                   init, \
-                                                   newline, \
-                                                   cleanup, \
-                                                   process_head, \
-                                                   process_tail, \
-                                                   process_inner_loop
- .set SINGLE_SCANLINE, 1
-generate_composite_function_common \
-    fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \
-    init, newline, cleanup, process_head, process_tail, process_inner_loop
-.endm
-
-.macro line_saved_regs  x:vararg
- .set LINE_SAVED_REGS, 0
- .set LINE_SAVED_REG_COUNT, 0
- .irp SAVED_REG,x
-  .ifc "SAVED_REG","Y"
-   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
-   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
-  .endif
-  .ifc "SAVED_REG","STRIDE_D"
-   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
-   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
-  .endif
-  .ifc "SAVED_REG","STRIDE_S"
-   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
-   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
-  .endif
-  .ifc "SAVED_REG","STRIDE_M"
-   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
-   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
-  .endif
-  .ifc "SAVED_REG","ORIG_W"
-   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
-   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
-  .endif
- .endr
- .if SINGLE_SCANLINE
-  .set LINE_SAVED_REG_COUNT, 0
- .endif
-.endm
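line_saved_regs builds an ARM register bitmask (bit n = rn) that is OR-ed into the raw stmdb opcode above, plus a matching register count for the later sp adjustments; in C terms (macro names illustrative):

#include <stdint.h>

#define REG_Y        (1u << 1)   /* r1 */
#define REG_STRIDE_D (1u << 3)   /* r3 */
#define REG_STRIDE_S (1u << 5)   /* r5 */
#define REG_STRIDE_M (1u << 7)   /* r7 */
#define REG_ORIG_W   (1u << 14)  /* r14 */

/* e.g. "line_saved_regs Y, ORIG_W" yields the opcode for stmdb sp!, {r1, r14} */
static const uint32_t stmdb_y_origw = 0xE92D0000u | REG_Y | REG_ORIG_W;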
-
-.macro nop_macro x:vararg
-.endm