9 months ago · fde2378ccd
--- a/VisualC-GDK/SDL/SDL.vcxproj
+++ b/VisualC-GDK/SDL/SDL.vcxproj
@@ -551,8 +551,6 @@
 
				     <ClInclude Include="..\..\src\video\khronos\vulkan\vulkan_xlib_xrandr.h" />
			
 
				     <ClInclude Include="..\..\src\video\SDL_blit.h" />
			
 
				     <ClInclude Include="..\..\src\video\SDL_blit_auto.h" />
			
 
				-    <ClInclude Include="..\..\src\video\SDL_blit_A_avx2.h" />
			
 
				-    <ClInclude Include="..\..\src\video\SDL_blit_A_sse4_1.h" />
			
 
				     <ClInclude Include="..\..\src\video\SDL_blit_copy.h" />
			
 
				     <ClInclude Include="..\..\src\video\SDL_blit_slow.h" />
			
 
				     <ClInclude Include="..\..\src\video\SDL_clipboard_c.h" />
			
@@ -824,8 +822,6 @@
 
				     <ClCompile Include="..\..\src\video\SDL_blit_1.c" />
			
 
				     <ClCompile Include="..\..\src\video\SDL_blit_A.c" />
			
 
				     <ClCompile Include="..\..\src\video\SDL_blit_auto.c" />
			
 
				-    <ClCompile Include="..\..\src\video\SDL_blit_A_avx2.c" />
			
 
				-    <ClCompile Include="..\..\src\video\SDL_blit_A_sse4_1.c" />
			
 
				     <ClCompile Include="..\..\src\video\SDL_blit_copy.c" />
			
 
				     <ClCompile Include="..\..\src\video\SDL_blit_N.c" />
			
 
				     <ClCompile Include="..\..\src\video\SDL_blit_slow.c" />
			
--- a/VisualC-GDK/SDL/SDL.vcxproj.filters
+++ b/VisualC-GDK/SDL/SDL.vcxproj.filters
@@ -458,12 +458,6 @@
 
				     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb.h" />
			
 
				     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_sse_func.h" />
			
 
				     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_std_func.h" />
			
 
				-    <ClCompile Include="..\..\src\video\SDL_blit_A_avx2.c">
			
 
				-      <Filter>video</Filter>
			
 
				-    </ClCompile>
			
 
				-    <ClCompile Include="..\..\src\video\SDL_blit_A_sse4_1.c">
			
 
				-      <Filter>video</Filter>
			
 
				-    </ClCompile>
			
 
				   </ItemGroup>
			
 
				   <ItemGroup>
			
 
				     <ResourceCompile Include="..\..\src\core\windows\version.rc" />
			
--- a/VisualC-WinRT/SDL-UWP.vcxproj
+++ b/VisualC-WinRT/SDL-UWP.vcxproj
@@ -178,8 +178,6 @@
 
				     <ClInclude Include="..\src\video\dummy\SDL_nullvideo.h" />
			
 
				     <ClInclude Include="..\src\video\SDL_blit.h" />
			
 
				     <ClInclude Include="..\src\video\SDL_blit_auto.h" />
			
 
				-    <ClInclude Include="..\src\video\SDL_blit_A_avx2.h" />
			
 
				-    <ClInclude Include="..\src\video\SDL_blit_A_sse4_1.h" />
			
 
				     <ClInclude Include="..\src\video\SDL_blit_copy.h" />
			
 
				     <ClInclude Include="..\src\video\SDL_blit_slow.h" />
			
 
				     <ClInclude Include="..\src\video\SDL_clipboard_c.h" />
			
@@ -549,8 +547,6 @@
 
				     <ClCompile Include="..\src\video\SDL_blit_1.c" />
			
 
				     <ClCompile Include="..\src\video\SDL_blit_A.c" />
			
 
				     <ClCompile Include="..\src\video\SDL_blit_auto.c" />
			
 
				-    <ClCompile Include="..\src\video\SDL_blit_A_avx2.c" />
			
 
				-    <ClCompile Include="..\src\video\SDL_blit_A_sse4_1.c" />
			
 
				     <ClCompile Include="..\src\video\SDL_blit_copy.c" />
			
 
				     <ClCompile Include="..\src\video\SDL_blit_N.c" />
			
 
				     <ClCompile Include="..\src\video\SDL_blit_slow.c" />
			
--- a/VisualC-WinRT/SDL-UWP.vcxproj.filters
+++ b/VisualC-WinRT/SDL-UWP.vcxproj.filters
@@ -952,12 +952,6 @@
 
				       <Filter>Header Files</Filter>
			
 
				     </ClInclude>
			
 
				     <ClInclude Include="..\src\video\SDL_clipboard_c.h" />
			
 
				-    <ClInclude Include="..\src\video\SDL_blit_A_avx2.h">
			
 
				-      <Filter>Source Files</Filter>
			
 
				-    </ClInclude>
			
 
				-    <ClInclude Include="..\src\video\SDL_blit_A_sse4_1.h">
			
 
				-      <Filter>Source Files</Filter>
			
 
				-    </ClInclude>
			
 
				   </ItemGroup>
			
 
				   <ItemGroup>
			
 
				     <ClCompile Include="..\src\joystick\windows\SDL_windows_gaming_input.c">
			
@@ -978,11 +972,5 @@
 
				     <ClCompile Include="..\src\core\windows\pch_cpp.cpp">
			
 
				       <Filter>Source Files</Filter>
			
 
				     </ClCompile>
			
 
				-    <ClCompile Include="..\src\video\SDL_blit_A_avx2.c">
			
 
				-      <Filter>Source Files</Filter>
			
 
				-    </ClCompile>
			
 
				-    <ClCompile Include="..\src\video\SDL_blit_A_sse4_1.c">
			
 
				-      <Filter>Source Files</Filter>
			
 
				-    </ClCompile>
			
 
				   </ItemGroup>
			
 
				 </Project>
			
--- a/VisualC/SDL/SDL.vcxproj
+++ b/VisualC/SDL/SDL.vcxproj
@@ -457,8 +457,6 @@
 
				     <ClInclude Include="..\..\src\video\offscreen\SDL_offscreenwindow.h" />
			
 
				     <ClInclude Include="..\..\src\video\SDL_blit.h" />
			
 
				     <ClInclude Include="..\..\src\video\SDL_blit_auto.h" />
			
 
				-    <ClInclude Include="..\..\src\video\SDL_blit_A_avx2.h" />
			
 
				-    <ClInclude Include="..\..\src\video\SDL_blit_A_sse4_1.h" />
			
 
				     <ClInclude Include="..\..\src\video\SDL_blit_copy.h" />
			
 
				     <ClInclude Include="..\..\src\video\SDL_blit_slow.h" />
			
 
				     <ClInclude Include="..\..\src\video\SDL_clipboard_c.h" />
			
@@ -682,8 +680,6 @@
 
				     <ClCompile Include="..\..\src\video\SDL_blit_1.c" />
			
 
				     <ClCompile Include="..\..\src\video\SDL_blit_A.c" />
			
 
				     <ClCompile Include="..\..\src\video\SDL_blit_auto.c" />
			
 
				-    <ClCompile Include="..\..\src\video\SDL_blit_A_avx2.c" />
			
 
				-    <ClCompile Include="..\..\src\video\SDL_blit_A_sse4_1.c" />
			
 
				     <ClCompile Include="..\..\src\video\SDL_blit_copy.c" />
			
 
				     <ClCompile Include="..\..\src\video\SDL_blit_N.c" />
			
 
				     <ClCompile Include="..\..\src\video\SDL_blit_slow.c" />
			
--- a/VisualC/SDL/SDL.vcxproj.filters
+++ b/VisualC/SDL/SDL.vcxproj.filters
@@ -871,12 +871,6 @@
 
				     </ClInclude>
			
 
				     <ClInclude Include="..\..\src\hidapi\SDL_hidapi_c.h" />
			
 
				     <ClInclude Include="..\..\src\thread\generic\SDL_sysrwlock_c.h" />
			
 
				-    <ClInclude Include="..\..\src\video\SDL_blit_A_avx2.h">
			
 
				-      <Filter>video</Filter>
			
 
				-    </ClInclude>
			
 
				-    <ClInclude Include="..\..\src\video\SDL_blit_A_sse4_1.h">
			
 
				-      <Filter>video</Filter>
			
 
				-    </ClInclude>
			
 
				     <ClInclude Include="..\..\src\thread\generic\SDL_sysrwlock_c.h" />
			
 
				     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_common.h" />
			
 
				     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_internal.h" />
			
@@ -1525,12 +1519,6 @@
 
				       <Filter>stdlib</Filter>
			
 
				     </ClCompile>
			
 
				     <ClCompile Include="..\..\src\thread\generic\SDL_sysrwlock.c" />
			
 
				-    <ClCompile Include="..\..\src\video\SDL_blit_A_avx2.c">
			
 
				-      <Filter>video</Filter>
			
 
				-    </ClCompile>
			
 
				-    <ClCompile Include="..\..\src\video\SDL_blit_A_sse4_1.c">
			
 
				-      <Filter>video</Filter>
			
 
				-    </ClCompile>
			
 
				     <ClCompile Include="..\..\src\thread\generic\SDL_sysrwlock.c" />
			
 
				     <ClCompile Include="..\..\src\video\yuv2rgb\yuv_rgb_lsx.c" />
			
 
				     <ClCompile Include="..\..\src\video\yuv2rgb\yuv_rgb_sse.c" />
			
--- a/src/video/SDL_blit.h
+++ b/src/video/SDL_blit.h
@@ -526,27 +526,65 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface);
 
				         ALPHA_BLEND_CHANNEL(sG, dG, A);                       \
			
 
				         ALPHA_BLEND_CHANNEL(sB, dB, A);                       \
			
 
				     } while (0)
			
 
				-/* Blend two 32-bit pixels with the same format */
			
 
				-#define ALPHA_BLEND_RGBA_4(src, dst, ashift)                            \
			
 
				-    do {                                                                \
			
 
				-        Uint32 srcA = (src >> ashift) & 0xFF;                           \
			
 
				-        src |= ((Uint32)0xFF) << ashift;                                \
			
 
				-                                                                        \
			
 
				-        Uint32 srcRB = src & 0x00FF00FF;                                \
			
 
				-        Uint32 dstRB = dst & 0x00FF00FF;                                \
			
 
				-                                                                        \
			
 
				-        Uint32 srcGA = (src >> 8) & 0x00FF00FF;                         \
			
 
				-        Uint32 dstGA = (dst >> 8) & 0x00FF00FF;                         \
			
 
				-                                                                        \
			
 
				-        Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB; \
			
 
				-        resRB += 0x00010001;                                            \
			
 
				-        resRB += (resRB >> 8) & 0x00FF00FF;                             \
			
 
				-        resRB = (resRB >> 8) & 0x00FF00FF;                              \
			
 
				-        Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA; \
			
 
				-        resGA += 0x00010001;                                            \
			
 
				-        resGA += (resGA >> 8) & 0x00FF00FF;                             \
			
 
				-        resGA &= 0xFF00FF00;                                            \
			
 
				-        dst = resRB | resGA;                                            \
			
 
				+
			
 
				+/* Blend two 8888 pixels with the same format */
			
 
				+/* Calculates dst = ((src * factor) + (dst * (255 - factor))) / 255 */
			
 
				+/* FIXME: SDL_SIZE_MAX might not be an integer literal */
			
 
				+#if defined(SIZE_MAX) && (SIZE_MAX == 0xffffffffffffffff)
			
 
				+#define FACTOR_BLEND_8888(src, dst, factor)                        \
			
 
				+    do {                                                           \
			
 
				+        Uint64 src64 = src;                                        \
			
 
				+        src64 = (src64 | (src64 << 24)) & 0x00FF00FF00FF00FF;      \
			
 
				+                                                                   \
			
 
				+        Uint64 dst64 = dst;                                        \
			
 
				+        dst64 = (dst64 | (dst64 << 24)) & 0x00FF00FF00FF00FF;      \
			
 
				+                                                                   \
			
 
				+        dst64 = ((src64 - dst64) * factor) + (dst64 << 8) - dst64; \
			
 
				+        dst64 += 0x0001000100010001;                               \
			
 
				+        dst64 += (dst64 >> 8) & 0x00FF00FF00FF00FF;                \
			
 
				+        dst64 &= 0xFF00FF00FF00FF00;                               \
			
 
				+                                                                   \
			
 
				+        dst = (Uint32)((dst64 >> 8) | (dst64 >> 32));              \
			
 
				+    } while (0)
			
 
				+#else
			
 
				+#define FACTOR_BLEND_8888(src, dst, factor)                               \
			
 
				+    do {                                                                  \
			
 
				+        Uint32 src02 = src & 0x00FF00FF;                                  \
			
 
				+        Uint32 dst02 = dst & 0x00FF00FF;                                  \
			
 
				+                                                                          \
			
 
				+        Uint32 src13 = (src >> 8) & 0x00FF00FF;                           \
			
 
				+        Uint32 dst13 = (dst >> 8) & 0x00FF00FF;                           \
			
 
				+                                                                          \
			
 
				+        Uint32 res02 = ((src02 - dst02) * factor) + (dst02 << 8) - dst02; \
			
 
				+        res02 += 0x00010001;                                              \
			
 
				+        res02 += (res02 >> 8) & 0x00FF00FF;                               \
			
 
				+        res02 = (res02 >> 8) & 0x00FF00FF;                                \
			
 
				+                                                                          \
			
 
				+        Uint32 res13 = ((src13 - dst13) * factor) + (dst13 << 8) - dst13; \
			
 
				+        res13 += 0x00010001;                                              \
			
 
				+        res13 += (res13 >> 8) & 0x00FF00FF;                               \
			
 
				+        res13 &= 0xFF00FF00;                                              \
			
 
				+        dst = res02 | res13;                                              \
			
 
				+    } while (0)
			
 
				+#endif
			
 
				+
			
 
				+/* Alpha blend two 8888 pixels with the same formats. */
			
 
				+#define ALPHA_BLEND_8888(src, dst, fmt)            \
			
 
				+    do {                                           \
			
 
				+        Uint32 srcA = (src >> fmt->Ashift) & 0xFF; \
			
 
				+        Uint32 tmp = src | fmt->Amask;             \
			
 
				+        FACTOR_BLEND_8888(tmp, dst, srcA);         \
			
 
				+    } while (0)
			
 
				+
			
 
				+/* Alpha blend two 8888 pixels with differing formats. */
			
 
				+#define ALPHA_BLEND_SWIZZLE_8888(src, dst, srcfmt, dstfmt)                  \
			
 
				+    do {                                                                    \
			
 
				+        Uint32 srcA = (src >> srcfmt->Ashift) & 0xFF;                       \
			
 
				+        Uint32 tmp = (((src >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) | \
			
 
				+                     (((src >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) | \
			
 
				+                     (((src >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) | \
			
 
				+                     dstfmt->Amask;                                         \
			
 
				+        FACTOR_BLEND_8888(tmp, dst, srcA);                                  \
			
 
				     } while (0)
			
 
				 /* Blend the RGBA values of two pixels */
			
 
				 #define ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA) \
			
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -24,16 +24,6 @@
 
				 
			
 
				 #include "SDL_blit.h"
			
 
				 
			
 
				-#ifdef SDL_SSE4_1_INTRINSICS
			
 
				-#include "SDL_blit_A_sse4_1.h"
			
 
				-#endif
			
 
				-#ifdef SDL_AVX2_INTRINSICS
			
 
				-#include "SDL_blit_A_avx2.h"
			
 
				-#endif
			
 
				-#if defined(SDL_SSE4_1_INTRINSICS) || defined(SDL_AVX2_INTRINSICS)
			
 
				-#include "SDL3/SDL_cpuinfo.h"
			
 
				-#endif
			
 
				-
			
 
				 /* Functions to perform alpha blended blitting */
			
 
				 
			
 
				 /* N->1 blending with per-surface alpha */
			
@@ -327,98 +317,6 @@ static void SDL_TARGETING("mmx") BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
 
				     }
			
 
				 }
			
 
				 
			
 
				-/* fast ARGB888->(A)RGB888 blending with pixel alpha */
			
 
				-static void SDL_TARGETING("mmx") BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
			
 
				-{
			
 
				-    int width = info->dst_w;
			
 
				-    int height = info->dst_h;
			
 
				-    Uint32 *srcp = (Uint32 *)info->src;
			
 
				-    int srcskip = info->src_skip >> 2;
			
 
				-    Uint32 *dstp = (Uint32 *)info->dst;
			
 
				-    int dstskip = info->dst_skip >> 2;
			
 
				-    SDL_PixelFormat *sf = info->src_fmt;
			
 
				-    Uint32 amask = sf->Amask;
			
 
				-    Uint32 ashift = sf->Ashift;
			
 
				-    Uint64 multmask, multmask2;
			
 
				-
			
 
				-    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2, mm_one_alpha;
			
 
				-
			
 
				-    mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
			
 
				-    if (amask == 0xFF000000) { /* 1 in the alpha channel -> mm_one_alpha */
			
 
				-        mm_one_alpha = _mm_set_pi16(1, 0, 0, 0);
			
 
				-    } else if (amask == 0x00FF0000) {
			
 
				-        mm_one_alpha = _mm_set_pi16(0, 1, 0, 0);
			
 
				-    } else if (amask == 0x0000FF00) {
			
 
				-        mm_one_alpha = _mm_set_pi16(0, 0, 1, 0);
			
 
				-    } else {
			
 
				-        mm_one_alpha = _mm_set_pi16(0, 0, 0, 1);
			
 
				-    }
			
 
				-
			
 
				-    multmask = 0x00FF;
			
 
				-    multmask <<= ((Uint64)ashift * 2);
			
 
				-    multmask2 = 0x00FF00FF00FF00FFULL;
			
 
				-
			
 
				-    while (height--) {
			
 
				-        /* *INDENT-OFF* */ /* clang-format off */
			
 
				-        DUFFS_LOOP4({
			
 
				-        Uint32 alpha = *srcp & amask;
			
 
				-        if (alpha == 0) {
			
 
				-            /* do nothing */
			
 
				-        } else if (alpha == amask) {
			
 
				-            *dstp = *srcp;
			
 
				-        } else {
			
 
				-            src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
			
 
				-            src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
			
 
				-
			
 
				-            dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
			
 
				-            dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
			
 
				-
			
 
				-            mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
			
 
				-            mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
			
 
				-            mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
			
 
				-            mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
			
 
				-            mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha */
			
 
				-            mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
			
 
				-
			
 
				-            /*
			
 
				-                Alpha blending is:
			
 
				-                    dstRGB = (srcRGB * srcA) + (dstRGB * (1-srcA))
			
 
				-                    dstA = srcA + (dstA * (1-srcA)) *
			
 
				-
			
 
				-                Here, 'src1' is:
			
 
				-                    srcRGB * srcA
			
 
				-                    srcA
			
 
				-                And 'dst1' is:
			
 
				-                    dstRGB * (1-srcA)
			
 
				-                    dstA * (1-srcA)
			
 
				-                so that *dstp is 'src1 + dst1'
			
 
				-
			
 
				-                src1 is computed using mullo_pi16: (X * mask) >> 8, but is approximate for srcA ((srcA * 255) >> 8).
			
 
				-
			
 
				-                need to a 1 to get an exact result: (srcA * 256) >> 8 == srcA
			
 
				-             */
			
 
				-            mm_alpha = _mm_add_pi16(mm_alpha, mm_one_alpha);
			
 
				-
			
 
				-            /* blend */
			
 
				-            src1 = _mm_mullo_pi16(src1, mm_alpha);
			
 
				-            src1 = _mm_srli_pi16(src1, 8);
			
 
				-            dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
			
 
				-            dst1 = _mm_srli_pi16(dst1, 8);
			
 
				-            dst1 = _mm_add_pi16(src1, dst1);
			
 
				-            dst1 = _mm_packs_pu16(dst1, mm_zero);
			
 
				-
			
 
				-            *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
			
 
				-        }
			
 
				-        ++srcp;
			
 
				-        ++dstp;
			
 
				-        }, width);
			
 
				-        /* *INDENT-ON* */ /* clang-format on */
			
 
				-        srcp += srcskip;
			
 
				-        dstp += dstskip;
			
 
				-    }
			
 
				-    _mm_empty();
			
 
				-}
			
 
				-
			
 
				 #endif /* SDL_MMX_INTRINSICS */
			
 
				 
			
 
				 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
			
@@ -1121,6 +1019,247 @@ static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
 
				     }
			
 
				 }
			
 
				 
			
 
				+/* Fast 32-bit RGBA->RGBA blending with pixel alpha */
			
 
				+static void Blit8888to8888PixelAlpha(SDL_BlitInfo *info)
			
 
				+{
			
 
				+    int width = info->dst_w;
			
 
				+    int height = info->dst_h;
			
 
				+    Uint8 *src = info->src;
			
 
				+    int srcskip = info->src_skip;
			
 
				+    Uint8 *dst = info->dst;
			
 
				+    int dstskip = info->dst_skip;
			
 
				+    SDL_PixelFormat *srcfmt = info->src_fmt;
			
 
				+
			
 
				+    while (height--) {
			
 
				+        int i = 0;
			
 
				+
			
 
				+        for (; i < width; ++i) {
			
 
				+            Uint32 src32 = *(Uint32 *)src;
			
 
				+            Uint32 dst32 = *(Uint32 *)dst;
			
 
				+            ALPHA_BLEND_8888(src32, dst32, srcfmt);
			
 
				+            *(Uint32 *)dst = dst32;
			
 
				+            src += 4;
			
 
				+            dst += 4;
			
 
				+        }
			
 
				+
			
 
				+        src += srcskip;
			
 
				+        dst += dstskip;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+/* Fast 32-bit RGBA->RGB(A) blending with pixel alpha and src swizzling */
			
 
				+static void Blit8888to8888PixelAlphaSwizzle(SDL_BlitInfo *info)
			
 
				+{
			
 
				+    int width = info->dst_w;
			
 
				+    int height = info->dst_h;
			
 
				+    Uint8 *src = info->src;
			
 
				+    int srcskip = info->src_skip;
			
 
				+    Uint8 *dst = info->dst;
			
 
				+    int dstskip = info->dst_skip;
			
 
				+    SDL_PixelFormat *srcfmt = info->src_fmt;
			
 
				+    SDL_PixelFormat *dstfmt = info->dst_fmt;
			
 
				+
			
 
				+    while (height--) {
			
 
				+        int i = 0;
			
 
				+
			
 
				+        for (; i < width; ++i) {
			
 
				+            Uint32 src32 = *(Uint32 *)src;
			
 
				+            Uint32 dst32 = *(Uint32 *)dst;
			
 
				+            ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
			
 
				+            *(Uint32 *)dst = dst32;
			
 
				+            src += 4;
			
 
				+            dst += 4;
			
 
				+        }
			
 
				+
			
 
				+        src += srcskip;
			
 
				+        dst += dstskip;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+#ifdef SDL_SSE4_1_INTRINSICS
			
 
				+
			
 
				+static void SDL_TARGETING("sse4.1") Blit8888to8888PixelAlphaSwizzleSSE41(SDL_BlitInfo *info)
			
 
				+{
			
 
				+    int width = info->dst_w;
			
 
				+    int height = info->dst_h;
			
 
				+    Uint8 *src = info->src;
			
 
				+    int srcskip = info->src_skip;
			
 
				+    Uint8 *dst = info->dst;
			
 
				+    int dstskip = info->dst_skip;
			
 
				+    SDL_PixelFormat *srcfmt = info->src_fmt;
			
 
				+    SDL_PixelFormat *dstfmt = info->dst_fmt;
			
 
				+
			
 
				+    // The byte offsets for the start of each pixel
			
 
				+    const __m128i mask_offsets = _mm_set_epi8(
			
 
				+        12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
			
 
				+
			
 
				+    const __m128i convert_mask = _mm_add_epi32(
			
 
				+        _mm_set1_epi32(
			
 
				+            ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
			
 
				+            ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
			
 
				+            ((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
			
 
				+        mask_offsets);
			
 
				+
			
 
				+    const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
			
 
				+    const __m128i alpha_fill_mask = _mm_set1_epi32((int)dstfmt->Amask);
			
 
				+
			
 
				+    while (height--) {
			
 
				+        int i = 0;
			
 
				+
			
 
				+        for (; i + 4 <= width; i += 4) {
			
 
				+            // Load 4 src pixels
			
 
				+            __m128i src128 = _mm_loadu_si128((__m128i *)src);
			
 
				+
			
 
				+            // Load 4 dst pixels
			
 
				+            __m128i dst128 = _mm_loadu_si128((__m128i *)dst);
			
 
				+
			
 
				+            // Extract the alpha from each pixel and splat it into all the channels
			
 
				+            __m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask);
			
 
				+
			
 
				+            // Convert to dst format
			
 
				+            src128 = _mm_shuffle_epi8(src128, convert_mask);
			
 
				+
			
 
				+            // Set the alpha channels of src to 255
			
 
				+            src128 = _mm_or_si128(src128, alpha_fill_mask);
			
 
				+
			
 
				+            __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128());
			
 
				+            __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128());
			
 
				+
			
 
				+            __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128());
			
 
				+            __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128());
			
 
				+
			
 
				+            __m128i srca_lo = _mm_unpacklo_epi8(srcA, _mm_setzero_si128());
			
 
				+            __m128i srca_hi = _mm_unpackhi_epi8(srcA, _mm_setzero_si128());
			
 
				+
			
 
				+            // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
			
 
				+            dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo),
			
 
				+                                      _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
			
 
				+            dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi),
			
 
				+                                      _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
			
 
				+
			
 
				+            // dst += 0x1U (use 0x80 to round instead of floor)
			
 
				+            dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
			
 
				+            dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
			
 
				+
			
 
				+            // dst += dst >> 8
			
 
				+            dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
			
 
				+            dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
			
 
				+
			
 
				+            // Blend the pixels together and save the result
			
 
				+            _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(dst_lo, dst_hi));
			
 
				+
			
 
				+            src += 16;
			
 
				+            dst += 16;
			
 
				+        }
			
 
				+
			
 
				+        for (; i < width; ++i) {
			
 
				+            Uint32 src32 = *(Uint32 *)src;
			
 
				+            Uint32 dst32 = *(Uint32 *)dst;
			
 
				+            ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
			
 
				+            *(Uint32 *)dst = dst32;
			
 
				+            src += 4;
			
 
				+            dst += 4;
			
 
				+        }
			
 
				+
			
 
				+        src += srcskip;
			
 
				+        dst += dstskip;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				+#ifdef SDL_AVX2_INTRINSICS
			
 
				+
			
 
				+static void SDL_TARGETING("avx2") Blit8888to8888PixelAlphaSwizzleAVX2(SDL_BlitInfo *info)
			
 
				+{
			
 
				+    int width = info->dst_w;
			
 
				+    int height = info->dst_h;
			
 
				+    Uint8 *src = info->src;
			
 
				+    int srcskip = info->src_skip;
			
 
				+    Uint8 *dst = info->dst;
			
 
				+    int dstskip = info->dst_skip;
			
 
				+    SDL_PixelFormat *srcfmt = info->src_fmt;
			
 
				+    SDL_PixelFormat *dstfmt = info->dst_fmt;
			
 
				+
			
 
				+    // The byte offsets for the start of each pixel
			
 
				+    const __m256i mask_offsets = _mm256_set_epi8(
			
 
				+        28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
			
 
				+
			
 
				+    const __m256i convert_mask = _mm256_add_epi32(
			
 
				+        _mm256_set1_epi32(
			
 
				+            ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
			
 
				+            ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
			
 
				+            ((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
			
 
				+        mask_offsets);
			
 
				+
			
 
				+    const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
			
 
				+    const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstfmt->Amask);
			
 
				+
			
 
				+    while (height--) {
			
 
				+        int i = 0;
			
 
				+
			
 
				+        for (; i + 8 <= width; i += 8) {
			
 
				+            // Load 8 src pixels
			
 
				+            __m256i src256 = _mm256_loadu_si256((__m256i *)src);
			
 
				+
			
 
				+            // Load 8 dst pixels
			
 
				+            __m256i dst256 = _mm256_loadu_si256((__m256i *)dst);
			
 
				+
			
 
				+            // Extract the alpha from each pixel and splat it into all the channels
			
 
				+            __m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask);
			
 
				+
			
 
				+            // Convert to dst format
			
 
				+            src256 = _mm256_shuffle_epi8(src256, convert_mask);
			
 
				+
			
 
				+            // Set the alpha channels of src to 255
			
 
				+            src256 = _mm256_or_si256(src256, alpha_fill_mask);
			
 
				+
			
 
				+            __m256i src_lo = _mm256_unpacklo_epi8(src256, _mm256_setzero_si256());
			
 
				+            __m256i src_hi = _mm256_unpackhi_epi8(src256, _mm256_setzero_si256());
			
 
				+
			
 
				+            __m256i dst_lo = _mm256_unpacklo_epi8(dst256, _mm256_setzero_si256());
			
 
				+            __m256i dst_hi = _mm256_unpackhi_epi8(dst256, _mm256_setzero_si256());
			
 
				+
			
 
				+            __m256i srca_lo = _mm256_unpacklo_epi8(srcA, _mm256_setzero_si256());
			
 
				+            __m256i srca_hi = _mm256_unpackhi_epi8(srcA, _mm256_setzero_si256());
			
 
				+
			
 
				+            // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
			
 
				+            dst_lo = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_lo, dst_lo), srca_lo),
			
 
				+                                      _mm256_sub_epi16(_mm256_slli_epi16(dst_lo, 8), dst_lo));
			
 
				+            dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi),
			
 
				+                                      _mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi));
			
 
				+
			
 
				+            // dst += 0x1U (use 0x80 to round instead of floor)
			
 
				+            dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1));
			
 
				+            dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1));
			
 
				+
			
 
				+            // dst += dst >> 8
			
 
				+            dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8);
			
 
				+            dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8);
			
 
				+
			
 
				+            // Blend the pixels together and save the result
			
 
				+            _mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(dst_lo, dst_hi));
			
 
				+
			
 
				+            src += 32;
			
 
				+            dst += 32;
			
 
				+        }
			
 
				+
			
 
				+        for (; i < width; ++i) {
			
 
				+            Uint32 src32 = *(Uint32 *)src;
			
 
				+            Uint32 dst32 = *(Uint32 *)dst;
			
 
				+            ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
			
 
				+            *(Uint32 *)dst = dst32;
			
 
				+            src += 4;
			
 
				+            dst += 4;
			
 
				+        }
			
 
				+
			
 
				+        src += srcskip;
			
 
				+        dst += dstskip;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				 /* General (slow) N->N blending with pixel alpha */
			
 
				 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
			
 
				 {
			
@@ -1134,7 +1273,6 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
 
				     SDL_PixelFormat *dstfmt = info->dst_fmt;
			
 
				     int srcbpp;
			
 
				     int dstbpp;
			
 
				-    SDL_bool freeFormat;
			
 
				     Uint32 Pixel;
			
 
				     unsigned sR, sG, sB, sA;
			
 
				     unsigned dR, dG, dB, dA;
			
@@ -1142,62 +1280,23 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
 
				     /* Set up some basic variables */
			
 
				     srcbpp = srcfmt->bytes_per_pixel;
			
 
				     dstbpp = dstfmt->bytes_per_pixel;
			
 
				-    freeFormat = SDL_FALSE;
			
 
				-
			
 
				-#ifdef SDL_AVX2_INTRINSICS
			
 
				-    if (srcbpp == 4 && dstbpp == 4 && width >= 4 && SDL_HasAVX2()) {
			
 
				-        BlitNtoNPixelAlpha_AVX2(info);
			
 
				-        return;
			
 
				-    }
			
 
				-#endif
			
 
				-
			
 
				-#ifdef SDL_SSE4_1_INTRINSICS
			
 
				-    // TODO: Re-enable
			
 
				-	if (srcbpp == 4 && dstbpp == 4 && width >= 2 && SDL_HasSSE41()) {
			
 
				-        BlitNtoNPixelAlpha_SSE4_1(info);
			
 
				-	    return;
			
 
				-    }
			
 
				-#endif
			
 
				-    /* Handle case where bad input sent */
			
 
				-    if (dstfmt->Ashift == 0 && dstfmt->Ashift == dstfmt->Bshift) {
			
 
				-        dstfmt = SDL_CreatePixelFormat(SDL_PIXELFORMAT_ARGB8888);
			
 
				-        freeFormat = SDL_TRUE;
			
 
				-    }
			
 
				 
			
 
				     while (height--) {
			
 
				-        /* if (srcbpp == 4 && dstbpp == 4 && dstfmt->Ashift == 24 && dstfmt->Rshift == 16 && dstfmt->Gshift == 8 &&
			
 
				-            dstfmt->Bshift == 0) {
			
 
				-            DUFFS_LOOP4(
			
 
				-            {
			
 
				-            PIXEL_TO_ARGB_PIXEL(*(Uint32 *) src, srcfmt, Pixel);
			
 
				-            Uint32 blended = *(Uint32 *) dst;
			
 
				-            ALPHA_BLEND_RGBA_4(Pixel, blended);
			
 
				-            *(Uint32*)dst = blended;
			
 
				-            src += srcbpp;
			
 
				-            dst += dstbpp;
			
 
				-            },
			
 
				-            width);
			
 
				-        } else { */
			
 
				-            /* *INDENT-OFF* */ /* clang-format off */
			
 
				-            DUFFS_LOOP4(
			
 
				-            {
			
 
				-            DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
			
 
				-            if (sA) {
			
 
				-                DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
			
 
				-                ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
			
 
				-                ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
			
 
				-            }
			
 
				-            src += srcbpp;
			
 
				-            dst += dstbpp;
			
 
				-            },
			
 
				-            width);
			
 
				-            /* *INDENT-ON* */ /* clang-format on */
			
 
				-            src += srcskip;
			
 
				-            dst += dstskip;
			
 
				-        // }
			
 
				-    }
			
 
				-    if (freeFormat) {
			
 
				-        SDL_DestroyPixelFormat(dstfmt);
			
 
				+        DUFFS_LOOP4(
			
 
				+        {
			
 
				+        DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
			
 
				+        if (sA) {
			
 
				+            DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
			
 
				+            ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
			
 
				+            ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
			
 
				+        }
			
 
				+        src += srcbpp;
			
 
				+        dst += dstbpp;
			
 
				+        },
			
 
				+        width);
			
 
				+        /* *INDENT-ON* */ /* clang-format on */
			
 
				+        src += srcskip;
			
 
				+        dst += dstskip;
			
 
				     }
			
 
				 }
			
 
				 
			
@@ -1229,19 +1328,23 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
 
				             return BlitNtoNPixelAlpha;
			
 
				 
			
 
				         case 4:
			
 
				-#if defined(SDL_SSE4_1_INTRINSICS) || defined(SDL_AVX2_INTRINSICS)
			
 
				-            if (sf->bytes_per_pixel == 4 && df->bytes_per_pixel == 4 && (SDL_HasSSE41() || SDL_HasAVX2())) {
			
 
				-                return BlitNtoNPixelAlpha;
			
 
				-            }
			
 
				+            if (SDL_PIXELLAYOUT(sf->format) == SDL_PACKEDLAYOUT_8888 && sf->Amask &&
			
 
				+                SDL_PIXELLAYOUT(df->format) == SDL_PACKEDLAYOUT_8888) {
			
 
				+#ifdef SDL_AVX2_INTRINSICS
			
 
				+                if (SDL_HasAVX2()) {
			
 
				+                    return Blit8888to8888PixelAlphaSwizzleAVX2;
			
 
				+                }
			
 
				 #endif
			
 
				-            if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->bytes_per_pixel == 4) {
			
 
				-#ifdef SDL_MMX_INTRINSICS
			
 
				-                if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
			
 
				-                    if (SDL_HasMMX()) {
			
 
				-                        return BlitRGBtoRGBPixelAlphaMMX;
			
 
				-                    }
			
 
				+#ifdef SDL_SSE4_1_INTRINSICS
			
 
				+                if (SDL_HasSSE41()) {
			
 
				+                    return Blit8888to8888PixelAlphaSwizzleSSE41;
			
 
				+                }
			
 
				+#endif
			
 
				+                if (sf->format == df->format) {
			
 
				+                    return Blit8888to8888PixelAlpha;
			
 
				+                } else {
			
 
				+                    return Blit8888to8888PixelAlphaSwizzle;
			
 
				                 }
			
 
				-#endif /* SDL_MMX_INTRINSICS */
			
 
				             }
			
 
				             return BlitNtoNPixelAlpha;
			
 
				 
			
--- a/src/video/SDL_blit_A_avx2.c
+++ b/src/video/SDL_blit_A_avx2.c
@@ -1,124 +0,0 @@
 
				-#include "SDL_internal.h"
			
 
				-
			
 
				-#if SDL_HAVE_BLIT_A
			
 
				-
			
 
				-#ifdef SDL_AVX2_INTRINSICS
			
 
				-
			
 
				-#include "SDL_blit.h"
			
 
				-
			
 
				-void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
			
 
				-{
			
 
				-    int width = info->dst_w;
			
 
				-    int height = info->dst_h;
			
 
				-    Uint8 *src = info->src;
			
 
				-    int srcskip = info->src_skip;
			
 
				-    Uint8 *dst = info->dst;
			
 
				-    int dstskip = info->dst_skip;
			
 
				-    SDL_PixelFormat *srcfmt = info->src_fmt;
			
 
				-    SDL_PixelFormat *dstfmt = info->dst_fmt;
			
 
				-
			
 
				-    // The byte offsets for the start of each pixel
			
 
				-    const __m256i mask_offsets = _mm256_set_epi8(
			
 
				-        28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
			
 
				-
			
 
				-    const __m256i convert_mask = _mm256_add_epi32(
			
 
				-        _mm256_set1_epi32(
			
 
				-            ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
			
 
				-            ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
			
 
				-            ((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
			
 
				-        mask_offsets);
			
 
				-
			
 
				-    const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
			
 
				-    const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstfmt->Amask);
			
 
				-
			
 
				-    while (height--) {
			
 
				-        int i = 0;
			
 
				-
			
 
				-        for (; i + 8 <= width; i += 8) {
			
 
				-            // Load 8 src pixels
			
 
				-            __m256i src256 = _mm256_loadu_si256((__m256i *)src);
			
 
				-
			
 
				-            // Load 8 dst pixels
			
 
				-            __m256i dst256 = _mm256_loadu_si256((__m256i *)dst);
			
 
				-
			
 
				-            // Extract the alpha from each pixel and splat it into all the channels
			
 
				-            __m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask);
			
 
				-
			
 
				-            // Convert to dst format
			
 
				-            src256 = _mm256_shuffle_epi8(src256, convert_mask);
			
 
				-
			
 
				-            // Set the alpha channels of src to 255
			
 
				-            src256 = _mm256_or_si256(src256, alpha_fill_mask);
			
 
				-
			
 
				-            __m256i src_lo = _mm256_unpacklo_epi8(src256, _mm256_setzero_si256());
			
 
				-            __m256i src_hi = _mm256_unpackhi_epi8(src256, _mm256_setzero_si256());
			
 
				-
			
 
				-            __m256i dst_lo = _mm256_unpacklo_epi8(dst256, _mm256_setzero_si256());
			
 
				-            __m256i dst_hi = _mm256_unpackhi_epi8(dst256, _mm256_setzero_si256());
			
 
				-
			
 
				-            __m256i srca_lo = _mm256_unpacklo_epi8(srcA, _mm256_setzero_si256());
			
 
				-            __m256i srca_hi = _mm256_unpackhi_epi8(srcA, _mm256_setzero_si256());
			
 
				-
			
 
				-            // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
			
 
				-            dst_lo = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_lo, dst_lo), srca_lo),
			
 
				-                                      _mm256_sub_epi16(_mm256_slli_epi16(dst_lo, 8), dst_lo));
			
 
				-            dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi),
			
 
				-                                      _mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi));
			
 
				-
			
 
				-            // dst += 0x1U (use 0x80 to round instead of floor)
			
 
				-            dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1));
			
 
				-            dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1));
			
 
				-
			
 
				-            // dst += dst >> 8
			
 
				-            dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8);
			
 
				-            dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8);
			
 
				-
			
 
				-            // Blend the pixels together and save the result
			
 
				-            _mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(dst_lo, dst_hi));
			
 
				-
			
 
				-            src += 32;
			
 
				-            dst += 32;
			
 
				-        }
			
 
				-
			
 
				-        for (; i < width; ++i) {
			
 
				-            Uint32 src32 = *(Uint32 *)src;
			
 
				-            Uint32 dst32 = *(Uint32 *)dst;
			
 
				-
			
 
				-            Uint32 srcA = (src32 >> srcfmt->Ashift) & 0xFF;
			
 
				-
			
 
				-            src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) |
			
 
				-                    (((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) |
			
 
				-                    (((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) |
			
 
				-                    dstfmt->Amask;
			
 
				-
			
 
				-            Uint32 srcRB = src32 & 0x00FF00FF;
			
 
				-            Uint32 dstRB = dst32 & 0x00FF00FF;
			
 
				-
			
 
				-            Uint32 srcGA = (src32 >> 8) & 0x00FF00FF;
			
 
				-            Uint32 dstGA = (dst32 >> 8) & 0x00FF00FF;
			
 
				-
			
 
				-            Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB;
			
 
				-            resRB += 0x00010001;
			
 
				-            resRB += (resRB >> 8) & 0x00FF00FF;
			
 
				-            resRB = (resRB >> 8) & 0x00FF00FF;
			
 
				-
			
 
				-            Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA;
			
 
				-            resGA += 0x00010001;
			
 
				-            resGA += (resGA >> 8) & 0x00FF00FF;
			
 
				-            resGA &= 0xFF00FF00;
			
 
				-            dst32 = resRB | resGA;
			
 
				-
			
 
				-            *(Uint32 *)dst = dst32;
			
 
				-
			
 
				-            src += 4;
			
 
				-            dst += 4;
			
 
				-        }
			
 
				-
			
 
				-        src += srcskip;
			
 
				-        dst += dstskip;
			
 
				-    }
			
 
				-}
			
 
				-
			
 
				-#endif
			
 
				-
			
 
				-#endif
			
--- a/src/video/SDL_blit_A_avx2.h
+++ b/src/video/SDL_blit_A_avx2.h
@@ -1,4 +0,0 @@
 
				-#ifndef SDL_SDL_BLIT_A_AVX2_H
			
 
				-#define SDL_SDL_BLIT_A_AVX2_H
			
 
				-void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info);
			
 
				-#endif //SDL_SDL_BLIT_A_AVX2_H
			
--- a/src/video/SDL_blit_A_sse4_1.c
+++ b/src/video/SDL_blit_A_sse4_1.c
@@ -1,124 +0,0 @@
 
				-#include "SDL_internal.h"
			
 
				-
			
 
				-#if SDL_HAVE_BLIT_A
			
 
				-
			
 
				-#ifdef SDL_SSE4_1_INTRINSICS
			
 
				-
			
 
				-#include "SDL_blit.h"
			
 
				-
			
 
				-void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info)
			
 
				-{
			
 
				-    int width = info->dst_w;
			
 
				-    int height = info->dst_h;
			
 
				-    Uint8 *src = info->src;
			
 
				-    int srcskip = info->src_skip;
			
 
				-    Uint8 *dst = info->dst;
			
 
				-    int dstskip = info->dst_skip;
			
 
				-    SDL_PixelFormat *srcfmt = info->src_fmt;
			
 
				-    SDL_PixelFormat *dstfmt = info->dst_fmt;
			
 
				-
			
 
				-    // The byte offsets for the start of each pixel
			
 
				-    const __m128i mask_offsets = _mm_set_epi8(
			
 
				-        12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
			
 
				-
			
 
				-    const __m128i convert_mask = _mm_add_epi32(
			
 
				-        _mm_set1_epi32(
			
 
				-            ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
			
 
				-            ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
			
 
				-            ((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
			
 
				-        mask_offsets);
			
 
				-
			
 
				-    const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
			
 
				-    const __m128i alpha_fill_mask = _mm_set1_epi32((int)dstfmt->Amask);
			
 
				-
			
 
				-    while (height--) {
			
 
				-        int i = 0;
			
 
				-
			
 
				-        for (; i + 4 <= width; i += 4) {
			
 
				-            // Load 4 src pixels
			
 
				-            __m128i src128 = _mm_loadu_si128((__m128i *)src);
			
 
				-
			
 
				-            // Load 4 dst pixels
			
 
				-            __m128i dst128 = _mm_loadu_si128((__m128i *)dst);
			
 
				-
			
 
				-            // Extract the alpha from each pixel and splat it into all the channels
			
 
				-            __m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask);
			
 
				-
			
 
				-            // Convert to dst format
			
 
				-            src128 = _mm_shuffle_epi8(src128, convert_mask);
			
 
				-
			
 
				-            // Set the alpha channels of src to 255
			
 
				-            src128 = _mm_or_si128(src128, alpha_fill_mask);
			
 
				-
			
 
				-            __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128());
			
 
				-            __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128());
			
 
				-
			
 
				-            __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128());
			
 
				-            __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128());
			
 
				-
			
 
				-            __m128i srca_lo = _mm_unpacklo_epi8(srcA, _mm_setzero_si128());
			
 
				-            __m128i srca_hi = _mm_unpackhi_epi8(srcA, _mm_setzero_si128());
			
 
				-
			
 
				-            // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
			
 
				-            dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo),
			
 
				-                                      _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
			
 
				-            dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi),
			
 
				-                                      _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
			
 
				-
			
 
				-            // dst += 0x1U (use 0x80 to round instead of floor)
			
 
				-            dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
			
 
				-            dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
			
 
				-
			
 
				-            // dst += dst >> 8
			
 
				-            dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
			
 
				-            dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
			
 
				-
			
 
				-            // Blend the pixels together and save the result
			
 
				-            _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(dst_lo, dst_hi));
			
 
				-
			
 
				-            src += 16;
			
 
				-            dst += 16;
			
 
				-        }
			
 
				-
			
 
				-        for (; i < width; ++i) {
			
 
				-            Uint32 src32 = *(Uint32 *)src;
			
 
				-            Uint32 dst32 = *(Uint32 *)dst;
			
 
				-
			
 
				-            Uint32 srcA = (src32 >> srcfmt->Ashift) & 0xFF;
			
 
				-
			
 
				-            src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) |
			
 
				-                    (((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) |
			
 
				-                    (((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) |
			
 
				-                    dstfmt->Amask;
			
 
				-
			
 
				-            Uint32 srcRB = src32 & 0x00FF00FF;
			
 
				-            Uint32 dstRB = dst32 & 0x00FF00FF;
			
 
				-
			
 
				-            Uint32 srcGA = (src32 >> 8) & 0x00FF00FF;
			
 
				-            Uint32 dstGA = (dst32 >> 8) & 0x00FF00FF;
			
 
				-
			
 
				-            Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB;
			
 
				-            resRB += 0x00010001;
			
 
				-            resRB += (resRB >> 8) & 0x00FF00FF;
			
 
				-            resRB = (resRB >> 8) & 0x00FF00FF;
			
 
				-
			
 
				-            Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA;
			
 
				-            resGA += 0x00010001;
			
 
				-            resGA += (resGA >> 8) & 0x00FF00FF;
			
 
				-            resGA &= 0xFF00FF00;
			
 
				-            dst32 = resRB | resGA;
			
 
				-
			
 
				-            *(Uint32 *)dst = dst32;
			
 
				-
			
 
				-            src += 4;
			
 
				-            dst += 4;
			
 
				-        }
			
 
				-
			
 
				-        src += srcskip;
			
 
				-        dst += dstskip;
			
 
				-    }
			
 
				-}
			
 
				-
			
 
				-#endif
			
 
				-
			
 
				-#endif
			
--- a/src/video/SDL_blit_A_sse4_1.h
+++ b/src/video/SDL_blit_A_sse4_1.h
@@ -1,4 +0,0 @@
 
				-#ifndef SDL_SDL_BLIT_A_SSE4_1_H
			
 
				-#define SDL_SDL_BLIT_A_SSE4_1_H
			
 
				-void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info);
			
 
				-#endif //SDL_SDL_BLIT_A_SSE4_1_H