@@ -95,6 +95,177 @@
 
 /******************************************************************************/
 
+/* We can actually do significantly better than the Pixman macros, at least for
+ * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
+ * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
+ */
+
+.macro generate_fillrect_function name, bpp, log2Bpp
+/*
+ * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
+ * On entry:
+ * a1 = width, pixels
+ * a2 = height, rows
+ * a3 = pointer to top-left destination pixel
+ * a4 = stride, pixels
+ * [sp] = pixel value to fill with
+ * Within the function:
+ * v1 = width remaining
+ * v2 = vst offset
+ * v3 = alternate pointer
+ * ip = data ARM register
+ */
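+/* Note: a1-a4, v1-v3 and ip are the standard APCS register aliases that
+ * GNU as accepts for r0-r3, r4-r6 and r12 respectively.
+ */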
+pixman_asm_function name
+    vld1.\bpp   {d0[],d1[]}, [sp]
+    sub         a4, a1
+    vld1.\bpp   {d2[],d3[]}, [sp]
+    cmp         a1, #(15+64) >> \log2Bpp
+    push        {v1-v3,lr}
+    vmov        ip, s0
+    blo         51f
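+/* The fill value is the fifth argument, which AAPCS places on the stack, so
+ * at entry [sp] addresses it directly: the two element loads replicate it
+ * across every lane of q0 and q1, giving 32 bytes of prepared data, while
+ * vmov mirrors it into ip for the scalar edge stores. sub a4, a1 turns the
+ * stride into the per-row pointer skip. Rows too short to cover a
+ * worst-case 15-byte alignment lead-in plus one 64-byte loop iteration
+ * branch to the short-row code at 51.
+ */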
+
+    /* Long-row case */
+    mov         v2, #64
+1:  mov         v1, a1
+    ands        v3, a3, #15
+    beq         2f
+    /* Leading pixels */
+    rsb         v3, v3, #16 /* number of leading bytes until 16-byte aligned */
+    sub         v1, v1, v3, lsr #\log2Bpp
+    rbit        v3, v3
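+    /* v3 holds the 1..15 leading byte count; rbit reverses it so each
+     * power-of-two chunk flag can be shifted out of the top end. The 1- and
+     * 2-byte flags feed the conditional byte/halfword stores below, then a
+     * single movs...lsl #3 parks the 4-byte flag in C and the 8-byte flag
+     * in N for the conditional VFP stores. */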
+.if bpp <= 16
+.if bpp == 8
+    tst         a3, #1 /* bit 0 unaffected by rsb so can avoid register interlock */
+    strneb      ip, [a3], #1
+    tst         v3, #1<<30
+.else
+    tst         a3, #2 /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
+.endif
+    strneh      ip, [a3], #2
+.endif
+    movs        v3, v3, lsl #3
+    vstmcs      a3!, {s0}
+    vstmmi      a3!, {d0}
+2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
+    add         v3, a3, #32
+    /* Inner loop */
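+    /* Two pointers 32 bytes apart, each advancing by 64 bytes per
+     * iteration, write the row as two interleaved streams; keeping the
+     * address updates on separate registers helps the in-order Cortex-A53
+     * keep its store pipeline busy. */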
+3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
+    subs        v1, v1, #64 >> \log2Bpp
+    vst1.\bpp   {q0-q1}, [v3 :128], v2
+    bhs         3b
+    /* Trailing pixels */
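+    /* On loop exit v1 has underflowed by 64>>log2Bpp. Shifting it left by
+     * 27+log2Bpp discards the borrowed high bits and re-expresses the
+     * residue as a byte count at the top of the register: the 32-byte flag
+     * lands in C and the 16-byte flag in N, and each further lsl #2 exposes
+     * the next two smaller chunk flags. NEON stores cannot be made
+     * conditional, hence the short forward branches around the vst1s. */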
+4:  movs        v1, v1, lsl #27 + \log2Bpp
+    bcc         5f
+    vst1.\bpp   {q0-q1}, [a3 :128]!
+5:  bpl         6f
+    vst1.\bpp   {q0}, [a3 :128]!
+6:  movs        v1, v1, lsl #2
+    vstmcs      a3!, {d0}
+    vstmmi      a3!, {s0}
+.if bpp <= 16
+    movs        v1, v1, lsl #2
+    strcsh      ip, [a3], #2
+.if bpp == 8
+    strmib      ip, [a3], #1
+.endif
+.endif
+    subs        a2, a2, #1
+    add         a3, a3, a4, lsl #\log2Bpp
+    bhi         1b
+    pop         {v1-v3,pc}
+
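+/* Short rows do not repay the cost of aligning to 16 bytes: they align only
+ * to a 4-byte boundary and then use unaligned NEON stores, which is why the
+ * vst1 instructions below carry no :128 alignment qualifier. Note how the
+ * movs at 51 lets the 16bpp case chain conditions: tstne only tests the
+ * alignment bit when the width is non-zero.
+ */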
+    /* Short-row case */
+51: movs        v1, a1
+.if bpp == 8
+    tst         a3, #3
+    beq         53f
+52: subs        v1, v1, #1
+    blo         57f
+    strb        ip, [a3], #1
+    tst         a3, #3
+    bne         52b
+.elseif bpp == 16
+    tstne       a3, #2
+    subne       v1, v1, #1
+    strneh      ip, [a3], #2
+.endif
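+    /* The rest of the row reuses the chunk-flag scheme from the long-row
+     * tail: up to two 32-byte blocks, then conditional stores peel 16, 8,
+     * 4 and (for 8/16bpp) 2 and 1 byte chunks. */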
+53: cmp         v1, #32 >> \log2Bpp
+    bcc         54f
+    vst1.\bpp   {q0-q1}, [a3]!
+    sub         v1, v1, #32 >> \log2Bpp
+    /* Trailing pixels */
+54: movs        v1, v1, lsl #27 + \log2Bpp
+    bcc         55f
+    vst1.\bpp   {q0-q1}, [a3]!
+55: bpl         56f
+    vst1.\bpp   {q0}, [a3]!
+56: movs        v1, v1, lsl #2
+    vstmcs      a3!, {d0}
+    vstmmi      a3!, {s0}
+.if bpp <= 16
+    movs        v1, v1, lsl #2
+    strcsh      ip, [a3], #2
+.if bpp == 8
+    strmib      ip, [a3], #1
+.endif
+.endif
+    subs        a2, a2, #1
+    add         a3, a3, a4, lsl #\log2Bpp
+    bhi         51b
+57: pop         {v1-v3,pc}
+
+.endfunc
+.endm
+
+generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
+generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
+generate_fillrect_function FillRect8ARMNEONAsm, 8, 0
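+/* Each expansion binds one pixel size: bpp selects the element size for the
+ * vld1/vst1 instructions, and log2Bpp is log2 of the bytes per pixel, used
+ * to convert between pixel and byte counts by shifting. From C these are
+ * presumably declared to match the comment at the top of the macro, e.g.
+ * (a sketch, not necessarily the verbatim SDL declaration):
+ *   void FillRect32ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst,
+ *                             int32_t dst_stride, uint8_t src);
+ */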
+
+/******************************************************************************/
+
 .macro RGBtoRGBPixelAlpha_process_pixblock_head
     vmvn        d30, d3 /* get inverted source alpha */
     vmov        d31, d7 /* dest alpha is always unchanged */