Bläddra i källkod

Revert "Remove buffer in SSE4.1, use unpacklo and packus intrinsics"

This reverts commit 149cd55840e7fdfd5ecde5c545aede5822abd14e.
Isaac Aronson 1 år sedan
förälder
incheckning
f6f12d0451
2 ändrade filer med 30 tillägg och 33 borttagningar
  1. 2 7
      src/video/SDL_blit_A_avx2.c
  2. 28 26
      src/video/SDL_blit_A_sse4_1.c

+ 2 - 7
src/video/SDL_blit_A_avx2.c

@@ -84,10 +84,9 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
                 Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
                 Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
                 __m128i c_src = _mm_loadu_si64(src_ptr);
-                c_src = _mm_unpacklo_epi8(_mm_shuffle_epi8(c_src, colorShiftMask), _mm_setzero_si128());
-                __m128i c_dst = _mm_unpacklo_epi8(_mm_loadu_si64(dst_ptr), _mm_setzero_si128());
+                c_src = _mm_shuffle_epi8(c_src, colorShiftMask);
+                __m128i c_dst = _mm_loadu_si64(dst_ptr);
                 __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, sse4_1AlphaMask);
-                c_mix = _mm_packus_epi16(c_mix, _mm_setzero_si128());
                 _mm_storeu_si64(dst_ptr, c_mix);
                 remaining_pixels -= 2;
                 offset += 2;
@@ -104,11 +103,7 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
                 __m128i c_src = _mm_loadu_si32(&pixel);
                 __m128i c_dst = _mm_loadu_si32(dst_ptr);
                 #endif
-                c_src = _mm_unpacklo_epi8(c_src, _mm_setzero_si128());
-                c_dst = _mm_unpacklo_epi8(c_dst, _mm_setzero_si128());
                 __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst, sse4_1AlphaMask);
-                mixed_pixel = _mm_srli_epi16(mixed_pixel, 8);
-                mixed_pixel = _mm_unpacklo_epi8(mixed_pixel, _mm_setzero_si128());
                 /* Old GCC has bad or no _mm_storeu_si32 */
                 #if defined(__GNUC__) && (__GNUC__ < 11)
                 *dst_ptr = _mm_extract_epi32(mixed_pixel, 0);

+ 28 - 26
src/video/SDL_blit_A_sse4_1.c

@@ -13,33 +13,38 @@
  * A helper function to create an alpha mask for use with MixRGBA_SSE4_1 based on pixel format
  */
 __m128i SDL_TARGETING("sse4.1") GetSDL_PixelFormatAlphaMask_SSE4_1(const SDL_PixelFormat* dstfmt) {
-    Uint8 index = dstfmt->Ashift / 4;
+    Uint8 index = dstfmt->Ashift / 8;
     /* Handle case where bad input sent */
     if (dstfmt->Ashift == dstfmt->Bshift && dstfmt->Ashift == 0) {
-        index = 6;
+        index = 3;
     }
     return _mm_set_epi8(
-            -1, index + 8, -1, index + 8, -1, index + 8, -1, index + 8,
+            -1, index + 4, -1, index + 4, -1, index + 4, -1, index + 4,
             -1, index, -1, index, -1, index, -1, index);
 }
 
 /**
 * Using the SSE4.1 instruction set, blit two pixels with alpha blending
- * @param src_color A pointer to two 32-bit pixels of ARGB format to blit into dst
- * @param dst_color A pointer to two 32-bit pixels of ARGB format to retain visual data for while alpha blending
+ * @param src A pointer to two 32-bit pixels of ARGB format to blit into dst
+ * @param dst A pointer to two 32-bit pixels of ARGB format to retain visual data for while alpha blending
  * @return A 128-bit wide vector of two alpha-blended pixels in ARGB format
  */
-__m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(const __m128i src_color, const __m128i dst_color, const __m128i alphaMask) {
+__m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(const __m128i src, const __m128i dst, const __m128i alphaMask) {
+    __m128i src_color = _mm_cvtepu8_epi16(src);
+    __m128i dst_color = _mm_cvtepu8_epi16(dst);
     /**
      * Combines a shuffle and an _mm_cvtepu8_epi16 operation into one operation by moving the lower 8 bits of the alpha
      * channel around to create 16-bit integers.
      */
-    __m128i alpha = _mm_shuffle_epi8(src_color, alphaMask);
+    __m128i alpha = _mm_shuffle_epi8(src, alphaMask);
     __m128i sub = _mm_sub_epi16(src_color, dst_color);
     __m128i mul = _mm_mullo_epi16(sub, alpha);
-    mul = _mm_srli_epi16(mul, 8);
+    const __m128i SHUFFLE_REDUCE = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, -1, -1,
+        15, 13, 11, 9, 7, 5, 3, 1);
+    __m128i reduced = _mm_shuffle_epi8(mul, SHUFFLE_REDUCE);
 
-    return _mm_add_epi8(mul, dst_color);
+    return _mm_add_epi8(reduced, dst);
 }
 
 Uint32 AlignPixelToSDL_PixelFormat(Uint32 color, const SDL_PixelFormat* srcfmt, const SDL_PixelFormat* dstfmt) {
@@ -102,22 +107,23 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) {
     SDL_PixelFormat *dstfmt = info->dst_fmt;
 
     int chunks = width / 4;
+    Uint8 *buffer = (Uint8*)SDL_malloc(chunks * 16 * sizeof(Uint8));
     const __m128i colorShiftMask = GetSDL_PixelFormatShuffleMask(srcfmt, dstfmt);
     const __m128i alphaMask = GetSDL_PixelFormatAlphaMask_SSE4_1(dstfmt);
 
     while (height--) {
         /* Process 4-wide chunks of source color data that may be in wrong format into buffer */
         for (int i = 0; i < chunks; i += 1) {
-            __m128i c_src = _mm_loadu_si128((__m128i*)(src + i * 16));
-            c_src = _mm_shuffle_epi8(c_src, colorShiftMask);
-            __m128i c_dst = _mm_loadu_si128((__m128i*)(dst + i * 16));
-            __m128i src_lo = _mm_unpacklo_epi8(c_src, _mm_setzero_si128());
-            __m128i dst_lo = _mm_unpacklo_epi8(c_dst, _mm_setzero_si128());
-            __m128i mix_lo = MixRGBA_SSE4_1(src_lo, dst_lo, alphaMask);
-            __m128i src_hi = _mm_unpackhi_epi8(c_src, _mm_setzero_si128());
-            __m128i dst_hi = _mm_unpackhi_epi8(c_dst, _mm_setzero_si128());
-            __m128i mix_hi = MixRGBA_SSE4_1(src_hi, dst_hi, alphaMask);
-            _mm_storeu_si128((__m128i*)(dst + i * 16), _mm_packus_epi16(mix_lo, mix_hi));
+            __m128i colors = _mm_loadu_si128((__m128i*)(src + i * 16));
+            _mm_storeu_si128((__m128i*)(buffer + i * 16), _mm_shuffle_epi8(colors, colorShiftMask));
+        }
+
+        /* Alpha-blend in 2-wide chunks from buffer into destination */
+        for (int i = 0; i < chunks * 2; i += 1) {
+            __m128i c_src = _mm_loadu_si64((buffer + (i * 8)));
+            __m128i c_dst = _mm_loadu_si64((dst + i * 8));
+            __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, alphaMask);
+            _mm_storeu_si64(dst + i * 8, c_mix);
         }
 
         /* Handle remaining pixels when width is not a multiple of 4 */
@@ -128,10 +134,9 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) {
                 Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
                 Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
                 __m128i c_src = _mm_loadu_si64(src_ptr);
-                c_src = _mm_unpacklo_epi8(_mm_shuffle_epi8(c_src, colorShiftMask), _mm_setzero_si128());
-                __m128i c_dst = _mm_unpacklo_epi8(_mm_loadu_si64(dst_ptr), _mm_setzero_si128());
+                c_src = _mm_shuffle_epi8(c_src, colorShiftMask);
+                __m128i c_dst = _mm_loadu_si64(dst_ptr);
                 __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, alphaMask);
-                c_mix = _mm_packus_epi16(c_mix, _mm_setzero_si128());
                 _mm_storeu_si64(dst_ptr, c_mix);
                 remaining_pixels -= 2;
                 offset += 2;
@@ -148,11 +153,7 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) {
                 __m128i c_src = _mm_loadu_si32(&pixel);
                 __m128i c_dst = _mm_loadu_si32(dst_ptr);
                 #endif
-                c_src = _mm_unpacklo_epi8(c_src, _mm_setzero_si128());
-                c_dst = _mm_unpacklo_epi8(c_dst, _mm_setzero_si128());
                 __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst, alphaMask);
-                mixed_pixel = _mm_srli_epi16(mixed_pixel, 8);
-                mixed_pixel = _mm_unpacklo_epi8(mixed_pixel, _mm_setzero_si128());
                 /* Old GCC has bad or no _mm_storeu_si32 */
                 #if defined(__GNUC__) && (__GNUC__ < 11)
                 *dst_ptr = _mm_extract_epi32(mixed_pixel, 0);
@@ -168,6 +169,7 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) {
         src += srcskip;
         dst += dstskip;
     }
+    SDL_free(buffer);
 }
 
 #endif