10 months ago · 3e0581c625
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -739,6 +739,7 @@ if(SDL_ASSEMBLY)
 
				     cmake_pop_check_state()
			
 
				     if(COMPILER_SUPPORTS_SSE4_1)
			
 
				       set(HAVE_SSE4_1 TRUE)
			
 
				+      sdl_glob_sources("${SDL3_SOURCE_DIR}/src/video/intrin/SDL_blit_A_sse4_1.c")
			
 
				     endif()
			
 
				   endif()
			
 
				   if(SDL_SSE4_2)
			
@@ -802,6 +803,7 @@ if(SDL_ASSEMBLY)
 
				         return 0;
			
 
				       }" COMPILER_SUPPORTS_AVX2)
			
 
				     cmake_pop_check_state()
			
 
				+    sdl_glob_sources("${SDL3_SOURCE_DIR}/src/video/intrin/SDL_blit_A_*.c")
			
 
				     if(COMPILER_SUPPORTS_AVX2)
			
 
				       set(HAVE_AVX2 TRUE)
			
 
				     endif()
			
--- a/VisualC-GDK/SDL/SDL.vcxproj
+++ b/VisualC-GDK/SDL/SDL.vcxproj
@@ -529,6 +529,8 @@
 
				     <ClInclude Include="..\..\src\video\dummy\SDL_nullframebuffer_c.h" />
			
 
				     <ClInclude Include="..\..\src\video\dummy\SDL_nullvideo.h" />
			
 
				     <ClInclude Include="..\..\src\video\gdk\SDL_gdktextinput.h" />
			
 
				+    <ClInclude Include="..\..\src\video\intrin\SDL_blit_A_avx2.h" />
			
 
				+    <ClInclude Include="..\..\src\video\intrin\SDL_blit_A_sse4.1.h" />
			
 
				     <ClInclude Include="..\..\src\video\khronos\vulkan\vk_icd.h" />
			
 
				     <ClInclude Include="..\..\src\video\khronos\vulkan\vk_layer.h" />
			
 
				     <ClInclude Include="..\..\src\video\khronos\vulkan\vk_platform.h" />
			
@@ -817,6 +819,8 @@
 
				       <PrecompiledHeaderOutputFile Condition="'$(Configuration)|$(Platform)'=='Debug|Gaming.Desktop.x64'">$(IntDir)$(TargetName)_cpp.pch</PrecompiledHeaderOutputFile>
			
 
				       <PrecompiledHeaderOutputFile Condition="'$(Configuration)|$(Platform)'=='Release|Gaming.Desktop.x64'">$(IntDir)$(TargetName)_cpp.pch</PrecompiledHeaderOutputFile>
			
 
				     </ClCompile>
			
 
				+    <ClCompile Include="..\..\src\video\intrin\SDL_blit_A_avx2.c" />
			
 
				+    <ClCompile Include="..\..\src\video\intrin\SDL_blit_A_sse4_1.c" />
			
 
				     <ClCompile Include="..\..\src\video\SDL_blit.c" />
			
 
				     <ClCompile Include="..\..\src\video\SDL_blit_0.c" />
			
 
				     <ClCompile Include="..\..\src\video\SDL_blit_1.c" />
			
@@ -863,4 +867,4 @@
 
				   <ImportGroup Label="ExtensionTargets">
			
 
				     <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
			
 
				   </ImportGroup>
			
 
				-</Project>
			
 
				+</Project>
			
--- a/VisualC-GDK/SDL/SDL.vcxproj.filters
+++ b/VisualC-GDK/SDL/SDL.vcxproj.filters
@@ -458,8 +458,14 @@
 
				     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb.h" />
			
 
				     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_sse_func.h" />
			
 
				     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_std_func.h" />
			
 
				+    <ClCompile Include="..\..\src\video\intrin\SDL_blit_A_avx2.c">
			
 
				+      <Filter>video\intrin</Filter>
			
 
				+    </ClCompile>
			
 
				+    <ClCompile Include="..\..\src\video\intrin\SDL_blit_A_sse4_1.c">
			
 
				+      <Filter>video\intrin</Filter>
			
 
				+    </ClCompile>
			
 
				   </ItemGroup>
			
 
				   <ItemGroup>
			
 
				     <ResourceCompile Include="..\..\src\core\windows\version.rc" />
			
 
				   </ItemGroup>
			
 
				-</Project>
			
 
				+</Project>
			
--- a/VisualC/SDL/SDL.vcxproj
+++ b/VisualC/SDL/SDL.vcxproj
@@ -429,6 +429,8 @@
 
				     <ClInclude Include="..\..\src\video\dummy\SDL_nullevents_c.h" />
			
 
				     <ClInclude Include="..\..\src\video\dummy\SDL_nullframebuffer_c.h" />
			
 
				     <ClInclude Include="..\..\src\video\dummy\SDL_nullvideo.h" />
			
 
				+    <ClInclude Include="..\..\src\video\intrin\SDL_blit_A_avx2.h" />
			
 
				+    <ClInclude Include="..\..\src\video\intrin\SDL_blit_A_sse4.1.h" />
			
 
				     <ClInclude Include="..\..\src\video\khronos\vulkan\vk_icd.h" />
			
 
				     <ClInclude Include="..\..\src\video\khronos\vulkan\vk_layer.h" />
			
 
				     <ClInclude Include="..\..\src\video\khronos\vulkan\vk_platform.h" />
			
@@ -669,6 +671,8 @@
 
				     <ClCompile Include="..\..\src\video\dummy\SDL_nullevents.c" />
			
 
				     <ClCompile Include="..\..\src\video\dummy\SDL_nullframebuffer.c" />
			
 
				     <ClCompile Include="..\..\src\video\dummy\SDL_nullvideo.c" />
			
 
				+    <ClCompile Include="..\..\src\video\intrin\SDL_blit_A_avx2.c" />
			
 
				+    <ClCompile Include="..\..\src\video\intrin\SDL_blit_A_sse4_1.c" />
			
 
				     <ClCompile Include="..\..\src\video\offscreen\SDL_offscreenevents.c" />
			
 
				     <ClCompile Include="..\..\src\video\offscreen\SDL_offscreenframebuffer.c" />
			
 
				     <ClCompile Include="..\..\src\video\offscreen\SDL_offscreenopengles.c" />
			
--- a/VisualC/SDL/SDL.vcxproj.filters
+++ b/VisualC/SDL/SDL.vcxproj.filters
@@ -172,6 +172,9 @@
 
				     <Filter Include="render\direct3d12">
			
 
				       <UniqueIdentifier>{f48c2b17-1bee-4fec-a7c8-24cf619abe08}</UniqueIdentifier>
			
 
				     </Filter>
			
 
				+    <Filter Include="video\intrin">
			
 
				+      <UniqueIdentifier>{653672cc-90ae-4eba-a256-6479f2c31804}</UniqueIdentifier>
			
 
				+    </Filter>
			
 
				     <Filter Include="main">
			
 
				       <UniqueIdentifier>{00001967ea2801028a046a722a070000}</UniqueIdentifier>
			
 
				     </Filter>
			
@@ -868,6 +871,13 @@
 
				     </ClInclude>
			
 
				     <ClInclude Include="..\..\src\hidapi\SDL_hidapi_c.h" />
			
 
				     <ClInclude Include="..\..\src\thread\generic\SDL_sysrwlock_c.h" />
			
 
				+    <ClInclude Include="..\..\src\video\intrin\SDL_blit_A_avx2.h">
			
 
				+      <Filter>video\intrin</Filter>
			
 
				+    </ClInclude>
			
 
				+    <ClInclude Include="..\..\src\video\intrin\SDL_blit_A_sse4.1.h">
			
 
				+      <Filter>video\intrin</Filter>
			
 
				+    </ClInclude>
			
 
				+    <ClInclude Include="..\..\src\thread\generic\SDL_sysrwlock_c.h" />
			
 
				     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_common.h" />
			
 
				     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_internal.h" />
			
 
				     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_lsx.h" />
			
@@ -1515,6 +1525,13 @@
 
				       <Filter>stdlib</Filter>
			
 
				     </ClCompile>
			
 
				     <ClCompile Include="..\..\src\thread\generic\SDL_sysrwlock.c" />
			
 
				+    <ClCompile Include="..\..\src\video\intrin\SDL_blit_A_avx2.c">
			
 
				+      <Filter>video\intrin</Filter>
			
 
				+    </ClCompile>
			
 
				+    <ClCompile Include="..\..\src\video\intrin\SDL_blit_A_sse4_1.c">
			
 
				+      <Filter>video\intrin</Filter>
			
 
				+    </ClCompile>
			
 
				+    <ClCompile Include="..\..\src\thread\generic\SDL_sysrwlock.c" />
			
 
				     <ClCompile Include="..\..\src\video\yuv2rgb\yuv_rgb_lsx.c" />
			
 
				     <ClCompile Include="..\..\src\video\yuv2rgb\yuv_rgb_sse.c" />
			
 
				     <ClCompile Include="..\..\src\video\yuv2rgb\yuv_rgb_std.c" />
			
@@ -1549,4 +1566,4 @@
 
				   <ItemGroup>
			
 
				     <MASM Include="..\..\src\stdlib\SDL_mslibc_x64.masm" />
			
 
				   </ItemGroup>
			
 
				-</Project>
			
 
				+</Project>
			
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -24,6 +24,16 @@
 
				 
			
 
				 #include "SDL_blit.h"
			
 
				 
			
 
				+#ifdef SDL_SSE4_1_INTRINSICS
			
 
				+#include "intrin/SDL_blit_A_sse4.1.h"
			
 
				+#endif
			
 
				+#ifdef SDL_AVX2_INTRINSICS
			
 
				+#include "intrin/SDL_blit_A_avx2.h"
			
 
				+#endif
			
 
				+#if defined(SDL_SSE4_1_INTRINSICS) || defined(SDL_AVX2_INTRINSICS)
			
 
				+#include "SDL3/SDL_cpuinfo.h"
			
 
				+#endif
			
 
				+
			
 
				 /* Functions to perform alpha blended blitting */
			
 
				 
			
 
				 /* N->1 blending with per-surface alpha */
			
@@ -1296,6 +1306,20 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
 
				     srcbpp = srcfmt->bytes_per_pixel;
			
 
				     dstbpp = dstfmt->bytes_per_pixel;
			
 
				 
			
 
				+#ifdef SDL_AVX2_INTRINSICS
			
 
				+    if (srcbpp == 4 && dstbpp == 4 && width >= 4 && SDL_HasAVX2()) {
			
 
				+        BlitNtoNPixelAlpha_AVX2(info);
			
 
				+        return;
			
 
				+    }
			
 
				+#endif
			
 
				+
			
 
				+#ifdef SDL_SSE4_1_INTRINSICS
			
 
				+	if (srcbpp == 4 && dstbpp == 4 && width >= 2 && SDL_HasSSE41()) {
			
 
				+        BlitNtoNPixelAlpha_SSE4_1(info);
			
 
				+	    return;
			
 
				+    }
			
 
				+#endif
			
 
				+
			
 
				     while (height--) {
			
 
				         /* *INDENT-OFF* */ /* clang-format off */
			
 
				         DUFFS_LOOP4(
			
@@ -1358,6 +1382,11 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
 
				             return BlitNtoNPixelAlpha;
			
 
				 
			
 
				         case 4:
			
 
				+#if defined(SDL_SSE4_1_INTRINSICS) || defined(SDL_AVX2_INTRINSICS)
			
 
				+            if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4) {
			
 
				+                return BlitNtoNPixelAlpha;
			
 
				+            }
			
 
				+#endif
			
 
				             if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->bytes_per_pixel == 4) {
			
 
				 #ifdef SDL_MMX_INTRINSICS
			
 
				                 if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
			
@@ -1469,3 +1498,4 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
 
				 }
			
 
				 
			
 
				 #endif /* SDL_HAVE_BLIT_A */
			
 
				+
			
--- a/src/video/intrin/SDL_blit_A_avx2.c
+++ b/src/video/intrin/SDL_blit_A_avx2.c
@@ -0,0 +1,126 @@
 
				+#include "SDL_internal.h"
			
 
				+
			
 
				+#if SDL_HAVE_BLIT_A
			
 
				+
			
 
				+#ifdef SDL_AVX2_INTRINSICS
			
 
				+
			
 
				+#include "../SDL_blit.h"
			
 
				+#include "SDL_blit_A_sse4.1.h"
			
 
				+
			
 
				+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
			
 
				+__attribute__((target("avx2")))
			
 
				+#endif
			
 
				+/**
			
 
				+ * Using the AVX2 instruction set, blit eight pixels with alpha blending
			
 
				+ * @param src A pointer to four 32-bit pixels of ARGB format to blit into dst
			
 
				+ * @param dst A pointer to four 32-bit pixels of ARGB format to retain visual data for while alpha blending
			
 
				+ * @return A 128-bit wide vector of four alpha-blended pixels in ARGB format
			
 
				+ */
			
 
				+__m128i MixRGBA_AVX2(__m128i src, __m128i dst) {
			
 
				+    __m256i src_color = _mm256_cvtepu8_epi16(src);
			
 
				+    __m256i dst_color = _mm256_cvtepu8_epi16(dst);
			
 
				+    const __m256i SHUFFLE_ALPHA = _mm256_set_epi8(
			
 
				+         -1, 30, -1, 30, -1, 30, -1, 30,
			
 
				+         -1, 22, -1, 22, -1, 22, -1, 22,
			
 
				+         -1, 14, -1, 14, -1, 14, -1, 14,
			
 
				+         -1, 6, -1, 6, -1, 6, -1, 6);
			
 
				+    __m256i alpha = _mm256_shuffle_epi8(src_color, SHUFFLE_ALPHA);
			
 
				+    __m256i sub = _mm256_sub_epi16(src_color, dst_color);
			
 
				+    __m256i mul = _mm256_mullo_epi16(sub, alpha);
			
 
				+    /**
			
 
				+     * With an 8-bit shuffle, one can only move integers within a lane. The 256-bit AVX2 lane is actually 4 64-bit
			
 
				+     * lanes. We pack the integers into the start of each lane. The second shuffle operates on these 64-bit integers to
			
 
				+     * put them into the correct order for transport back to the surface in the correct format.
			
 
				+     */
			
 
				+    const __m256i SHUFFLE_REDUCE = _mm256_set_epi8(
			
 
				+            -1, -1, -1, -1, -1, -1, -1, -1,
			
 
				+            31, 29, 27, 25, 23, 21, 19, 17,
			
 
				+            -1, -1, -1, -1, -1, -1, -1, -1,
			
 
				+            15, 13, 11, 9, 7, 5, 3, 1);
			
 
				+    __m256i reduced = _mm256_shuffle_epi8(mul, SHUFFLE_REDUCE);
			
 
				+    __m256i packed = _mm256_permute4x64_epi64(reduced, _MM_SHUFFLE(3, 1, 2, 0));
			
 
				+    __m128i mix = _mm256_castsi256_si128(packed);
			
 
				+    return _mm_add_epi8(mix, dst);
			
 
				+}
			
 
				+
			
 
				+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
			
 
				+__attribute__((target("avx2")))
			
 
				+#endif
			
 
				+void BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
			
 
				+{
			
 
				+    int width = info->dst_w;
			
 
				+    int height = info->dst_h;
			
 
				+    Uint8 *src = info->src;
			
 
				+    int srcskip = info->src_skip;
			
 
				+    Uint8 *dst = info->dst;
			
 
				+    int dstskip = info->dst_skip;
			
 
				+    SDL_PixelFormat *srcfmt = info->src_fmt;
			
 
				+
			
 
				+    int chunks = width / 4;
			
 
				+    Uint8 *buf = SDL_malloc(sizeof(Uint8) * chunks * 16);
			
 
				+
			
 
				+    while (height--) {
			
 
				+        /* Process 4-wide chunks of source color data that may be in wrong format */
			
 
				+        for (int i = 0; i < chunks; i += 1) {
			
 
				+            __m128i c_src = convertPixelFormatsx4(_mm_loadu_si128((__m128i*) (src + i * 16)), srcfmt);
			
 
				+            _mm_store_si128((__m128i*)(buf + i * 16), c_src);
			
 
				+        }
			
 
				+
			
 
				+        /* Alpha-blend in 4-wide chunk from src into destination */
			
 
				+        for (int i = 0; i < chunks; i += 1) {
			
 
				+            __m128i c_src = _mm_loadu_si128((__m128i*) (buf + i * 16));
			
 
				+            __m128i c_dst = _mm_loadu_si128((__m128i*) (dst + i * 16));
			
 
				+            __m128i c_mix = MixRGBA_AVX2(c_src, c_dst);
			
 
				+            _mm_storeu_si128((__m128i*) (dst + i * 16), c_mix);
			
 
				+        }
			
 
				+
			
 
				+        /* Handle remaining pixels when width is not a multiple of 4 */
			
 
				+        if (width % 4 != 0) {
			
 
				+            int remaining_pixels = width % 4;
			
 
				+            int offset = width - remaining_pixels;
			
 
				+            if (remaining_pixels >= 2) {
			
 
				+                Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
			
 
				+                Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
			
 
				+                __m128i c_src = _mm_loadu_si64(src_ptr);
			
 
				+                c_src = convertPixelFormatsx4(c_src, srcfmt);
			
 
				+                __m128i c_dst = _mm_loadu_si64(dst_ptr);
			
 
				+                __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst);
			
 
				+                _mm_storeu_si64(dst_ptr, c_mix);
			
 
				+                remaining_pixels -= 2;
			
 
				+                offset += 2;
			
 
				+            }
			
 
				+            if (remaining_pixels == 1) {
			
 
				+                Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
			
 
				+                Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
			
 
				+                Uint32 pixel = convertPixelFormat(*src_ptr, srcfmt);
			
 
				+                /* Old GCC has bad or no _mm_loadu_si32 */
			
 
				+                #if defined(__GNUC__) && (__GNUC__ < 11)
			
 
				+                __m128i c_src = _mm_set_epi32(0, 0, 0, pixel);
			
 
				+                __m128i c_dst = _mm_set_epi32(0, 0, 0, *dst_ptr);
			
 
				+                #else
			
 
				+                __m128i c_src = _mm_loadu_si32(&pixel);
			
 
				+                __m128i c_dst = _mm_loadu_si32(dst_ptr);
			
 
				+                #endif
			
 
				+                __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst);
			
 
				+                /* Old GCC has bad or no _mm_storeu_si32 */
			
 
				+                #if defined(__GNUC__) && (__GNUC__ < 11)
			
 
				+                *dst_ptr = _mm_extract_epi32(mixed_pixel, 0);
			
 
				+                #else
			
 
				+                _mm_storeu_si32(dst_ptr, mixed_pixel);
			
 
				+                #endif
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        src += 4 * width;
			
 
				+        dst += 4 * width;
			
 
				+
			
 
				+        src += srcskip;
			
 
				+        dst += dstskip;
			
 
				+    }
			
 
				+    SDL_free(buf);
			
 
				+
			
 
				+}
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				+#endif
			
--- a/src/video/intrin/SDL_blit_A_avx2.h
+++ b/src/video/intrin/SDL_blit_A_avx2.h
@@ -0,0 +1,7 @@
 
				+#ifndef SDL_SDL_BLIT_A_AVX2_H
			
 
				+#define SDL_SDL_BLIT_A_AVX2_H
			
 
				+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
			
 
				+__attribute__((target("avx2")))
			
 
				+#endif
			
 
				+void BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info);
			
 
				+#endif //SDL_SDL_BLIT_A_AVX2_H
			
--- a/src/video/intrin/SDL_blit_A_sse4.1.h
+++ b/src/video/intrin/SDL_blit_A_sse4.1.h
@@ -0,0 +1,24 @@
 
				+#ifndef SDL_SDL_BLIT_A_SSE4_1_H
			
 
				+#define SDL_SDL_BLIT_A_SSE4_1_H
			
 
				+
			
 
				+#ifdef SDL_SSE4_1_INTRINSICS
			
 
				+Uint32 convertPixelFormat(Uint32 color, const SDL_PixelFormat* srcFormat);
			
 
				+
			
 
				+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
			
 
				+__attribute__((target("sse4.1")))
			
 
				+#endif
			
 
				+__m128i convertPixelFormatsx4(__m128i colors, const SDL_PixelFormat* srcFormat);
			
 
				+
			
 
				+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
			
 
				+__attribute__((target("sse4.1")))
			
 
				+#endif
			
 
				+__m128i MixRGBA_SSE4_1(__m128i src, __m128i dst);
			
 
				+
			
 
				+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
			
 
				+__attribute__((target("sse4.1")))
			
 
				+#endif
			
 
				+void BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info);
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				+#endif //SDL_SDL_BLIT_A_SSE4_1_H
			
--- a/src/video/intrin/SDL_blit_A_sse4_1.c
+++ b/src/video/intrin/SDL_blit_A_sse4_1.c
@@ -0,0 +1,146 @@
 
				+#include "SDL_internal.h"
			
 
				+
			
 
				+#if SDL_HAVE_BLIT_A
			
 
				+
			
 
				+#ifdef SDL_SSE4_1_INTRINSICS
			
 
				+
			
 
				+#include "../SDL_blit.h"
			
 
				+
			
 
				+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
			
 
				+__attribute__((target("sse4.1")))
			
 
				+#endif
			
 
				+/**
			
 
				+ * Using the SSE4.1 instruction set, blit four pixels with alpha blending
			
 
				+ * @param src A pointer to two 32-bit pixels of ARGB format to blit into dst
			
 
				+ * @param dst A pointer to two 32-bit pixels of ARGB format to retain visual data for while alpha blending
			
 
				+ * @return A 128-bit wide vector of two alpha-blended pixels in ARGB format
			
 
				+ */
			
 
				+__m128i MixRGBA_SSE4_1(__m128i src, __m128i dst) {
			
 
				+    __m128i src_color = _mm_cvtepu8_epi16(src);
			
 
				+    __m128i dst_color = _mm_cvtepu8_epi16(dst);
			
 
				+    /**
			
 
				+     * Combines a shuffle and an _mm_cvtepu8_epi16 operation into one operation by moving the lower 8 bits of the alpha
			
 
				+     * channel around to create 16-bit integers.
			
 
				+     */
			
 
				+    const __m128i SHUFFLE_ALPHA = _mm_set_epi8(
			
 
				+         -1, 7, -1, 7, -1, 7, -1, 7,
			
 
				+         -1, 3, -1, 3, -1, 3, -1, 3);
			
 
				+    __m128i alpha = _mm_shuffle_epi8(src, SHUFFLE_ALPHA);
			
 
				+    __m128i sub = _mm_sub_epi16(src_color, dst_color);
			
 
				+    __m128i mul = _mm_mullo_epi16(sub, alpha);
			
 
				+    const __m128i SHUFFLE_REDUCE = _mm_set_epi8(
			
 
				+        -1, -1, -1, -1, -1, -1, -1, -1,
			
 
				+        15, 13, 11, 9, 7, 5, 3, 1);
			
 
				+    __m128i reduced = _mm_shuffle_epi8(mul, SHUFFLE_REDUCE);
			
 
				+
			
 
				+    return _mm_add_epi8(reduced, dst);
			
 
				+}
			
 
				+
			
 
				+Uint32 convertPixelFormat(Uint32 color, const SDL_PixelFormat* srcFormat) {
			
 
				+    Uint8 a = (color >> srcFormat->Ashift) & 0xFF;
			
 
				+    Uint8 r = (color >> srcFormat->Rshift) & 0xFF;
			
 
				+    Uint8 g = (color >> srcFormat->Gshift) & 0xFF;
			
 
				+    Uint8 b = (color >> srcFormat->Bshift) & 0xFF;
			
 
				+
			
 
				+    return (a << 24) | (r << 16) | (g << 8) | b;
			
 
				+}
			
 
				+
			
 
				+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
			
 
				+__attribute__((target("sse4.1")))
			
 
				+#endif
			
 
				+/*
			
 
				+ * This helper function converts arbitrary pixel format data into ARGB form with a 4 pixel-wide shuffle
			
 
				+ */
			
 
				+__m128i convertPixelFormatsx4(__m128i colors, const SDL_PixelFormat* srcFormat) {
			
 
				+    // Create shuffle masks based on the source SDL_PixelFormat to ARGB
			
 
				+    __m128i srcShuffleMask = _mm_set_epi8(
			
 
				+        srcFormat->Ashift / 8 + 12, srcFormat->Rshift / 8 + 12, srcFormat->Gshift / 8 + 12, srcFormat->Bshift / 8 + 12,
			
 
				+        srcFormat->Ashift / 8 + 8, srcFormat->Rshift / 8 + 8, srcFormat->Gshift / 8 + 8, srcFormat->Bshift / 8 + 8,
			
 
				+        srcFormat->Ashift / 8 + 4, srcFormat->Rshift / 8 + 4, srcFormat->Gshift / 8 + 4, srcFormat->Bshift / 8 + 4,
			
 
				+        srcFormat->Ashift / 8, srcFormat->Rshift / 8, srcFormat->Gshift / 8, srcFormat->Bshift / 8
			
 
				+    );
			
 
				+
			
 
				+    // Shuffle the colors
			
 
				+    return _mm_shuffle_epi8(colors, srcShuffleMask);
			
 
				+}
			
 
				+
			
 
				+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
			
 
				+__attribute__((target("sse4.1")))
			
 
				+#endif
			
 
				+void BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) {
			
 
				+    int width = info->dst_w;
			
 
				+    int height = info->dst_h;
			
 
				+    Uint8 *src = info->src;
			
 
				+    int srcskip = info->src_skip;
			
 
				+    Uint8 *dst = info->dst;
			
 
				+    int dstskip = info->dst_skip;
			
 
				+    SDL_PixelFormat *srcfmt = info->src_fmt;
			
 
				+
			
 
				+    int chunks = width / 4;
			
 
				+    Uint8 *buffer = (Uint8*)SDL_malloc(chunks * 16 * sizeof(Uint8));
			
 
				+
			
 
				+    while (height--) {
			
 
				+        /* Process 4-wide chunks of source color data that may be in wrong format into buffer */
			
 
				+        for (int i = 0; i < chunks; i += 1) {
			
 
				+            __m128i colors = _mm_loadu_si128((__m128i*)(src + i * 16));
			
 
				+            _mm_storeu_si128((__m128i*)(buffer + i * 16), convertPixelFormatsx4(colors, srcfmt));
			
 
				+        }
			
 
				+
			
 
				+        /* Alpha-blend in 2-wide chunks from buffer into destination */
			
 
				+        for (int i = 0; i < chunks * 2; i += 1) {
			
 
				+            __m128i c_src = _mm_loadu_si64((buffer + (i * 8)));
			
 
				+            __m128i c_dst = _mm_loadu_si64((dst + i * 8));
			
 
				+            __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst);
			
 
				+            _mm_storeu_si64(dst + i * 8, c_mix);
			
 
				+        }
			
 
				+
			
 
				+        /* Handle remaining pixels when width is not a multiple of 4 */
			
 
				+        if (width % 4 != 0) {
			
 
				+            int remaining_pixels = width % 4;
			
 
				+            int offset = width - remaining_pixels;
			
 
				+            if (remaining_pixels >= 2) {
			
 
				+                Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
			
 
				+                Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
			
 
				+                __m128i c_src = _mm_loadu_si64(src_ptr);
			
 
				+                c_src = convertPixelFormatsx4(c_src, srcfmt);
			
 
				+                __m128i c_dst = _mm_loadu_si64(dst_ptr);
			
 
				+                __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst);
			
 
				+                _mm_storeu_si64(dst_ptr, c_mix);
			
 
				+                remaining_pixels -= 2;
			
 
				+                offset += 2;
			
 
				+            }
			
 
				+            if (remaining_pixels == 1) {
			
 
				+                Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
			
 
				+                Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
			
 
				+                Uint32 pixel = convertPixelFormat(*src_ptr, srcfmt);
			
 
				+                /* Old GCC has bad or no _mm_loadu_si32 */
			
 
				+                #if defined(__GNUC__) && (__GNUC__ < 11)
			
 
				+                __m128i c_src = _mm_set_epi32(0, 0, 0, pixel);
			
 
				+                __m128i c_dst = _mm_set_epi32(0, 0, 0, *dst_ptr);
			
 
				+                #else
			
 
				+                __m128i c_src = _mm_loadu_si32(&pixel);
			
 
				+                __m128i c_dst = _mm_loadu_si32(dst_ptr);
			
 
				+                #endif
			
 
				+                __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst);
			
 
				+                /* Old GCC has bad or no _mm_storeu_si32 */
			
 
				+                #if defined(__GNUC__) && (__GNUC__ < 11)
			
 
				+                *dst_ptr = _mm_extract_epi32(mixed_pixel, 0);
			
 
				+                #else
			
 
				+                _mm_storeu_si32(dst_ptr, mixed_pixel);
			
 
				+                #endif
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        src += 4 * width;
			
 
				+        dst += 4 * width;
			
 
				+
			
 
				+        src += srcskip;
			
 
				+        dst += dstskip;
			
 
				+    }
			
 
				+
			
 
				+    SDL_free(buffer);
			
 
				+}
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				+#endif