From 1437254152868e8fa3f11d5b5f08dd3ee5d89ee4 Mon Sep 17 00:00:00 2001
From: Andrei Karas <akaras@inbox.ru>
Date: Thu, 25 May 2017 02:08:50 +0300
Subject: Switch replaceAOGLColor to use a custom dispatcher.

---
 src/CMakeLists.txt                                 |   3 -
 src/Makefile.am                                    |   3 -
 src/resources/dye/dye_unittest.cc                  |  39 +--
 src/resources/dye/dyepalette.cpp                   |  12 +
 src/resources/dye/dyepalette.h                     |  42 +--
 src/resources/dye/dyepalette_replaceaoglcolor.cpp  | 372 ++++++++++++++++++---
 .../dye/dyepalette_replaceaoglcolor_avx2.hpp       |  55 ---
 .../dye/dyepalette_replaceaoglcolor_default.hpp    | 110 ------
 .../dye/dyepalette_replaceaoglcolor_sse2.hpp       |  55 ---
 src/resources/openglimagehelper.cpp                |   2 +-
 src/resources/safeopenglimagehelper.cpp            |   2 +-
 11 files changed, 349 insertions(+), 346 deletions(-)
 delete mode 100644 src/resources/dye/dyepalette_replaceaoglcolor_avx2.hpp
 delete mode 100644 src/resources/dye/dyepalette_replaceaoglcolor_default.hpp
 delete mode 100644 src/resources/dye/dyepalette_replaceaoglcolor_sse2.hpp

(limited to 'src')

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1e558f861..d0fa5f27a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -642,9 +642,6 @@ SET(SRCS
     resources/dye/dyepalette.h
     resources/dye/dyepalette_replaceacolor.cpp
     resources/dye/dyepalette_replaceaoglcolor.cpp
-    resources/dye/dyepalette_replaceaoglcolor_avx2.hpp
-    resources/dye/dyepalette_replaceaoglcolor_default.hpp
-    resources/dye/dyepalette_replaceaoglcolor_sse2.hpp
     resources/dye/dyepalette_replacescolor.cpp
     resources/dye/dyepalette_replacesoglcolor.cpp
     resources/dye/dyepaletteptr.h
diff --git a/src/Makefile.am b/src/Makefile.am
index 55dee77e3..3281ff06c 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -426,9 +426,6 @@ BASE_SRC += events/actionevent.h \
 	      resources/dye/dyepalette.h \
 	      resources/dye/dyepalette_replaceacolor.cpp \
 	      resources/dye/dyepalette_replaceaoglcolor.cpp \
-	      resources/dye/dyepalette_replaceaoglcolor_avx2.hpp \
-	      resources/dye/dyepalette_replaceaoglcolor_default.hpp \
-	      resources/dye/dyepalette_replaceaoglcolor_sse2.hpp \
 	      resources/dye/dyepalette_replacescolor.cpp \
 	      resources/dye/dyepalette_replacesoglcolor.cpp \
 	      resources/dye/dyepaletteptr.h \
diff --git a/src/resources/dye/dye_unittest.cc b/src/resources/dye/dye_unittest.cc
index b9831d087..09b207371 100644
--- a/src/resources/dye/dye_unittest.cc
+++ b/src/resources/dye/dye_unittest.cc
@@ -268,7 +268,7 @@ TEST_CASE("Dye replaceAOGLColor 1 1", "")
     DyePalette palette("#00ff0010,00001120", 8);
     uint32_t data[1];
     data[0] = buildHex(0x10, 0x03, 0x02, 0x01);
-    palette.replaceAOGLColor(&data[0], 1);
+    DYEPALETTE(palette, AOGLColor)(&data[0], 1);
     REQUIRE(data[0] == buildHex(0x10, 0x03, 0x02, 0x01));
 }
 
@@ -277,7 +277,7 @@ TEST_CASE("Dye replaceAOGLColor 1 2", "")
     DyePalette palette("#00ff0120,020311ff", 8);
     uint32_t data[1];
     data[0] = buildHex(0x20, 0x01, 0xff, 0x00);
-    palette.replaceAOGLColor(&data[0], 1);
+    DYEPALETTE(palette, AOGLColor)(&data[0], 1);
     REQUIRE(data[0] == buildHex(0xff, 0x11, 0x03, 0x02));
 }
 
@@ -286,7 +286,7 @@ TEST_CASE("Dye replaceAOGLColor 1 3", "")
     DyePalette palette("#40404040,20000000,0100ee40,102030ff", 8);
     uint32_t data[1];
     data[0] = buildHex(0x40, 0xee, 0x00, 0x01);
-    palette.replaceAOGLColor(&data[0], 1);
+    DYEPALETTE(palette, AOGLColor)(&data[0], 1);
     REQUIRE(data[0] == buildHex(0xff, 0x30, 0x20, 0x10));
 }
 
@@ -296,7 +296,7 @@ TEST_CASE("Dye replaceAOGLColor 2 1", "")
     uint32_t data[2];
     data[0] = buildHex(0x40, 0xee, 0x00, 0x01);
     data[1] = buildHex(0x40, 0xee, 0x00, 0x01);
-    palette.replaceAOGLColor(&data[0], 2);
+    DYEPALETTE(palette, AOGLColor)(&data[0], 2);
     REQUIRE(data[0] == buildHex(0xff, 0x30, 0x20, 0x10));
     REQUIRE(data[1] == buildHex(0xff, 0x30, 0x20, 0x10));
 }
@@ -309,7 +309,7 @@ TEST_CASE("Dye replaceAOGLColor 4 1", "")
     data[1] = buildHex(0x40, 0xee, 0x00, 0x01);
     data[2] = buildHex(0x41, 0xee, 0x00, 0x01);
     data[3] = buildHex(0x40, 0xee, 0x00, 0x01);
-    palette.replaceAOGLColor(&data[0], 4);
+    DYEPALETTE(palette, AOGLColor)(&data[0], 4);
     REQUIRE(data[0] == buildHex(0xff, 0x30, 0x20, 0x10));
     REQUIRE(data[1] == buildHex(0xff, 0x30, 0x20, 0x10));
     REQUIRE(data[2] == buildHex(0x41, 0xee, 0x00, 0x01));
@@ -328,7 +328,7 @@ TEST_CASE("Dye replaceAOGLColor 8 1", "")
     data[5] = buildHex(0x40, 0x40, 0x40, 0x40);
     data[6] = buildHex(0x41, 0xe0, 0x00, 0x01);
     data[7] = buildHex(0x40, 0xee, 0x00, 0x01);
-    palette.replaceAOGLColor(&data[0], 8);
+    DYEPALETTE(palette, AOGLColor)(&data[0], 8);
     REQUIRE(data[0] == buildHex(0xff, 0x30, 0x20, 0x10));
     REQUIRE(data[1] == buildHex(0xff, 0x30, 0x20, 0x10));
     REQUIRE(data[2] == buildHex(0x41, 0xee, 0x00, 0x01));
@@ -415,29 +415,6 @@ TEST_CASE("Dye replaceAOGLColor 8 1 default", "")
     REQUIRE(data[7] == buildHex(0xff, 0x30, 0x20, 0x10));
 }
 
-TEST_CASE("Dye replaceAOGLColor 8 1 simd", "")
-{
-    DyePalette palette("#40404040,20000000,0100ee40,102030ff", 8);
-    uint32_t data[8];
-    data[0] = buildHex(0x40, 0xee, 0x00, 0x01);
-    data[1] = buildHex(0x40, 0xee, 0x00, 0x01);
-    data[2] = buildHex(0x41, 0xee, 0x00, 0x01);
-    data[3] = buildHex(0x40, 0xee, 0x00, 0x01);
-    data[4] = buildHex(0x40, 0xee, 0x00, 0x01);
-    data[5] = buildHex(0x40, 0x40, 0x40, 0x40);
-    data[6] = buildHex(0x41, 0xe0, 0x00, 0x01);
-    data[7] = buildHex(0x40, 0xee, 0x00, 0x01);
-    palette.replaceAOGLColorSimd(&data[0], 8);
-    REQUIRE(data[0] == buildHex(0xff, 0x30, 0x20, 0x10));
-    REQUIRE(data[1] == buildHex(0xff, 0x30, 0x20, 0x10));
-    REQUIRE(data[2] == buildHex(0x41, 0xee, 0x00, 0x01));
-    REQUIRE(data[3] == buildHex(0xff, 0x30, 0x20, 0x10));
-    REQUIRE(data[4] == buildHex(0xff, 0x30, 0x20, 0x10));
-    REQUIRE(data[5] == buildHex(0x00, 0x00, 0x00, 0x20));
-    REQUIRE(data[6] == buildHex(0x41, 0xe0, 0x00, 0x01));
-    REQUIRE(data[7] == buildHex(0xff, 0x30, 0x20, 0x10));
-}
-
 TEST_CASE("Dye replaceAOGLColor 8 1 sse2", "")
 {
     DyePalette palette("#40404040,20000000,0100ee40,102030ff", 8);
@@ -450,7 +427,7 @@ TEST_CASE("Dye replaceAOGLColor 8 1 sse2", "")
     data[5] = buildHex(0x40, 0x40, 0x40, 0x40);
     data[6] = buildHex(0x41, 0xe0, 0x00, 0x01);
     data[7] = buildHex(0x40, 0xee, 0x00, 0x01);
-    palette.replaceAOGLColorSse2(&data[0], 8);
+    DYEPALETTE(palette, AOGLColorSse2)(&data[0], 8);
     REQUIRE(data[0] == buildHex(0xff, 0x30, 0x20, 0x10));
     REQUIRE(data[1] == buildHex(0xff, 0x30, 0x20, 0x10));
     REQUIRE(data[2] == buildHex(0x41, 0xee, 0x00, 0x01));
@@ -473,7 +450,7 @@ TEST_CASE("Dye replaceAOGLColor 8 1 avx2", "")
     data[5] = buildHex(0x40, 0x40, 0x40, 0x40);
     data[6] = buildHex(0x41, 0xe0, 0x00, 0x01);
     data[7] = buildHex(0x40, 0xee, 0x00, 0x01);
-    palette.replaceAOGLColorAvx2(&data[0], 8);
+    DYEPALETTE(palette, AOGLColorAvx2)(&data[0], 8);
     REQUIRE(data[0] == buildHex(0xff, 0x30, 0x20, 0x10));
     REQUIRE(data[1] == buildHex(0xff, 0x30, 0x20, 0x10));
     REQUIRE(data[2] == buildHex(0x41, 0xee, 0x00, 0x01));
diff --git a/src/resources/dye/dyepalette.cpp b/src/resources/dye/dyepalette.cpp
index bf12b7047..b8346058f 100644
--- a/src/resources/dye/dyepalette.cpp
+++ b/src/resources/dye/dyepalette.cpp
@@ -53,6 +53,9 @@ DyeFunctionPtr DyePalette::funcReplaceSOGLColorAvx2 = nullptr;
 DyeFunctionPtr DyePalette::funcReplaceAColor = nullptr;
 DyeFunctionPtr DyePalette::funcReplaceAColorSse2 = nullptr;
 DyeFunctionPtr DyePalette::funcReplaceAColorAvx2 = nullptr;
+DyeFunctionPtr DyePalette::funcReplaceAOGLColor = nullptr;
+DyeFunctionPtr DyePalette::funcReplaceAOGLColorSse2 = nullptr;
+DyeFunctionPtr DyePalette::funcReplaceAOGLColorAvx2 = nullptr;
 
 DyePalette::DyePalette(const std::string &restrict description,
                        const uint8_t blockSize) :
@@ -254,6 +257,9 @@ void DyePalette::initFunctions()
         funcReplaceAColor = &DyePalette::replaceAColorAvx2;
         funcReplaceAColorAvx2 = &DyePalette::replaceAColorAvx2;
         funcReplaceAColorSse2 = &DyePalette::replaceAColorSse2;
+        funcReplaceAOGLColor = &DyePalette::replaceAOGLColorAvx2;
+        funcReplaceAOGLColorAvx2 = &DyePalette::replaceAOGLColorAvx2;
+        funcReplaceAOGLColorSse2 = &DyePalette::replaceAOGLColorSse2;
     }
     else if (flags & Cpu::FEATURE_SSE2)
     {
@@ -266,6 +272,9 @@ void DyePalette::initFunctions()
         funcReplaceAColor = &DyePalette::replaceAColorSse2;
         funcReplaceAColorAvx2 = &DyePalette::replaceAColorSse2;
         funcReplaceAColorSse2 = &DyePalette::replaceAColorSse2;
+        funcReplaceAOGLColor = &DyePalette::replaceAOGLColorSse2;
+        funcReplaceAOGLColorAvx2 = &DyePalette::replaceAOGLColorSse2;
+        funcReplaceAOGLColorSse2 = &DyePalette::replaceAOGLColorSse2;
     }
     else
 #endif  // SIMD_SUPPORTED
@@ -279,5 +288,8 @@ void DyePalette::initFunctions()
         funcReplaceAColor = &DyePalette::replaceAColorDefault;
         funcReplaceAColorAvx2 = &DyePalette::replaceAColorDefault;
         funcReplaceAColorSse2 = &DyePalette::replaceAColorDefault;
+        funcReplaceAOGLColor = &DyePalette::replaceAOGLColorDefault;
+        funcReplaceAOGLColorAvx2 = &DyePalette::replaceAOGLColorDefault;
+        funcReplaceAOGLColorSse2 = &DyePalette::replaceAOGLColorDefault;
     }
 }
diff --git a/src/resources/dye/dyepalette.h b/src/resources/dye/dyepalette.h
index 1ac15c9a0..06473ec69 100644
--- a/src/resources/dye/dyepalette.h
+++ b/src/resources/dye/dyepalette.h
@@ -132,56 +132,17 @@ class DyePalette final
                                   const int bufSize) const restrict2;
 #endif  // SIMD_SUPPORTED
 
-        /**
-         * replace colors for OpenGL for A dye.
-         */
-        void replaceAOGLColor(uint32_t *restrict pixels,
-                              const int bufSize) const restrict2;
-
         /**
          * replace colors for OpenGL for A dye.
          */
         void replaceAOGLColorDefault(uint32_t *restrict pixels,
                                      const int bufSize) const restrict2;
 
-        /**
-         * replace colors for OpenGL for A dye.
-         */
-        FUNCTION_SIMD_DEFAULT
-        void replaceAOGLColorSimd(uint32_t *restrict pixels,
-                                  const int bufSize) const restrict2;
-
-        /**
-         * replace colors for OpenGL for A dye.
-         */
-        FUNCTION_SIMD_DEFAULT
-        void replaceAOGLColorSse2(uint32_t *restrict pixels,
-                                  const int bufSize) const restrict2;
-
-        /**
-         * replace colors for OpenGL for A dye.
-         */
-        FUNCTION_SIMD_DEFAULT
-        void replaceAOGLColorAvx2(uint32_t *restrict pixels,
-                                  const int bufSize) const restrict2;
-
 #ifdef SIMD_SUPPORTED
         /**
          * replace colors for OpenGL for A dye.
          */
         __attribute__ ((target ("sse2")))
-        void replaceAOGLColorSimd(uint32_t *restrict pixels,
-                                  const int bufSize) const restrict2;
-        /**
-         * replace colors for OpenGL for A dye.
-         */
-        __attribute__ ((target ("avx2")))
-        void replaceAOGLColorSimd(uint32_t *restrict pixels,
-                                  const int bufSize) const restrict2;
-        /**
-         * replace colors for OpenGL for A dye.
-         */
-        __attribute__ ((target ("sse2")))
         void replaceAOGLColorSse2(uint32_t *restrict pixels,
                                   const int bufSize) const restrict2;
         /**
@@ -211,6 +172,9 @@ class DyePalette final
         static DyeFunctionPtr funcReplaceAColor;
         static DyeFunctionPtr funcReplaceAColorSse2;
         static DyeFunctionPtr funcReplaceAColorAvx2;
+        static DyeFunctionPtr funcReplaceAOGLColor;
+        static DyeFunctionPtr funcReplaceAOGLColorSse2;
+        static DyeFunctionPtr funcReplaceAOGLColorAvx2;
 
 #ifndef UNITTESTS
     private:
diff --git a/src/resources/dye/dyepalette_replaceaoglcolor.cpp b/src/resources/dye/dyepalette_replaceaoglcolor.cpp
index 16b54a666..48c929f95 100644
--- a/src/resources/dye/dyepalette_replaceaoglcolor.cpp
+++ b/src/resources/dye/dyepalette_replaceaoglcolor.cpp
@@ -35,23 +35,97 @@
 
 #include "debug.h"
 
-void DyePalette::replaceAOGLColor(uint32_t *restrict pixels,
-                                  const int bufSize) const restrict2
-{
-#ifdef SIMD_SUPPORTED
-    if (bufSize >= 8)
-        replaceAOGLColorSimd(pixels, bufSize);
-    else
-        replaceAOGLColorDefault(pixels, bufSize);
-#else  // SIMD_SUPPORTED
-#include "resources/dye/dyepalette_replaceaoglcolor_default.hpp"
-#endif  // SIMD_SUPPORTED
-}
-
 void DyePalette::replaceAOGLColorDefault(uint32_t *restrict pixels,
                                          const int bufSize) const restrict2
 {
-#include "resources/dye/dyepalette_replaceaoglcolor_default.hpp"
+    std::vector<DyeColor>::const_iterator it_end = mColors.end();
+    const size_t sz = mColors.size();
+    if (!sz || !pixels)
+        return;
+    if (sz % 2)
+        -- it_end;
+
+#ifdef ENABLE_CILKPLUS
+    cilk_for (int ptr = 0; ptr < bufSize; ptr ++)
+    {
+        uint8_t *const p = reinterpret_cast<uint8_t *>(&pixels[ptr]);
+        const unsigned int data = pixels[ptr];
+
+        std::vector<DyeColor>::const_iterator it = mColors.begin();
+        while (it != it_end)
+        {
+            const DyeColor &col = *it;
+            ++ it;
+            const DyeColor &col2 = *it;
+
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+            const unsigned int coldata = (col.value[0] << 24U)
+                | (col.value[1] << 16U)
+                | (col.value[2] << 8U)
+                | col.value[3];
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+            const unsigned int coldata = (col.value[0])
+                | (col.value[1] << 8U)
+                | (col.value[2] << 16U)
+                | (col.value[3] << 24U);
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+            if (data == coldata)
+            {
+                p[0] = col2.value[0];
+                p[1] = col2.value[1];
+                p[2] = col2.value[2];
+                p[3] = col2.value[3];
+                break;
+            }
+
+            ++ it;
+        }
+    }
+
+#else  // ENABLE_CILKPLUS
+
+    for (const uint32_t *const p_end = pixels + CAST_SIZE(bufSize);
+         pixels != p_end;
+         ++pixels)
+    {
+        uint8_t *const p = reinterpret_cast<uint8_t *>(pixels);
+        const unsigned int data = *pixels;
+
+        std::vector<DyeColor>::const_iterator it = mColors.begin();
+        while (it != it_end)
+        {
+            const DyeColor &col = *it;
+            ++ it;
+            const DyeColor &col2 = *it;
+
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+            const unsigned int coldata = (col.value[0] << 24U)
+                | (col.value[1] << 16U)
+                | (col.value[2] << 8U)
+                | col.value[3];
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+            const unsigned int coldata = (col.value[0])
+                | (col.value[1] << 8U)
+                | (col.value[2] << 16U)
+                | (col.value[3] << 24U);
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+            if (data == coldata)
+            {
+                p[0] = col2.value[0];
+                p[1] = col2.value[1];
+                p[2] = col2.value[2];
+                p[3] = col2.value[3];
+                break;
+            }
+
+            ++ it;
+        }
+    }
+#endif  // ENABLE_CILKPLUS
 }
 
 #ifdef SIMD_SUPPORTED
@@ -64,54 +138,256 @@ static void print256(const char *const text, const __m256i &val)
 */
 
 __attribute__ ((target ("sse2")))
-void DyePalette::replaceAOGLColorSimd(uint32_t *restrict pixels,
+void DyePalette::replaceAOGLColorSse2(uint32_t *restrict pixels,
                                       const int bufSize) const restrict2
 {
-#include "resources/dye/dyepalette_replaceaoglcolor_sse2.hpp"
-}
+    std::vector<DyeColor>::const_iterator it_end = mColors.end();
+    const size_t sz = mColors.size();
+    if (!sz || !pixels)
+        return;
+    if (sz % 2)
+        -- it_end;
 
-__attribute__ ((target ("avx2")))
-void DyePalette::replaceAOGLColorSimd(uint32_t *restrict pixels,
-                                      const int bufSize) const restrict2
-{
-#include "resources/dye/dyepalette_replaceaoglcolor_avx2.hpp"
-}
+    if (bufSize >= 8)
+    {
+        for (int ptr = 0; ptr < bufSize; ptr += 4)
+        {
+//            __m128i base = _mm_load_si128(reinterpret_cast<__m128i*>(
+//                &pixels[ptr]));
+            __m128i base = _mm_loadu_si128(reinterpret_cast<__m128i*>(
+                &pixels[ptr]));
 
-__attribute__ ((target ("sse2")))
-void DyePalette::replaceAOGLColorSse2(uint32_t *restrict pixels,
-                                      const int bufSize) const restrict2
-{
-#include "resources/dye/dyepalette_replaceaoglcolor_sse2.hpp"
+            std::vector<DyeColor>::const_iterator it = mColors.begin();
+            while (it != it_end)
+            {
+                const DyeColor &col = *it;
+                ++ it;
+                const DyeColor &col2 = *it;
+
+                __m128i newMask = _mm_set1_epi32(col2.valueAOgl);
+                __m128i cmpMask = _mm_set1_epi32(col.valueAOgl);
+                __m128i cmpRes = _mm_cmpeq_epi32(base, cmpMask);
+                __m128i srcAnd = _mm_andnot_si128(cmpRes, base);
+                __m128i dstAnd = _mm_and_si128(cmpRes, newMask);
+                base = _mm_or_si128(srcAnd, dstAnd);
+
+                ++ it;
+            }
+//            _mm_store_si128(reinterpret_cast<__m128i*>(&pixels[ptr]), base);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(&pixels[ptr]), base);
+        }
+    }
+    else
+    {
+#ifdef ENABLE_CILKPLUS
+        cilk_for (int ptr = 0; ptr < bufSize; ptr ++)
+        {
+            uint8_t *const p = reinterpret_cast<uint8_t *>(&pixels[ptr]);
+            const unsigned int data = pixels[ptr];
+
+            std::vector<DyeColor>::const_iterator it = mColors.begin();
+            while (it != it_end)
+            {
+                const DyeColor &col = *it;
+                ++ it;
+                const DyeColor &col2 = *it;
+
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+                const unsigned int coldata = (col.value[0] << 24U)
+                    | (col.value[1] << 16U)
+                    | (col.value[2] << 8U)
+                    | col.value[3];
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                const unsigned int coldata = (col.value[0])
+                    | (col.value[1] << 8U)
+                    | (col.value[2] << 16U)
+                    | (col.value[3] << 24U);
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                if (data == coldata)
+                {
+                    p[0] = col2.value[0];
+                    p[1] = col2.value[1];
+                    p[2] = col2.value[2];
+                    p[3] = col2.value[3];
+                    break;
+                }
+
+                ++ it;
+            }
+        }
+
+#else  // ENABLE_CILKPLUS
+
+        for (const uint32_t *const p_end = pixels + CAST_SIZE(bufSize);
+             pixels != p_end;
+             ++pixels)
+        {
+            uint8_t *const p = reinterpret_cast<uint8_t *>(pixels);
+            const unsigned int data = *pixels;
+
+            std::vector<DyeColor>::const_iterator it = mColors.begin();
+            while (it != it_end)
+            {
+                const DyeColor &col = *it;
+                ++ it;
+                const DyeColor &col2 = *it;
+
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+                const unsigned int coldata = (col.value[0] << 24U)
+                    | (col.value[1] << 16U)
+                    | (col.value[2] << 8U)
+                    | col.value[3];
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                const unsigned int coldata = (col.value[0])
+                    | (col.value[1] << 8U)
+                    | (col.value[2] << 16U)
+                    | (col.value[3] << 24U);
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                if (data == coldata)
+                {
+                    p[0] = col2.value[0];
+                    p[1] = col2.value[1];
+                    p[2] = col2.value[2];
+                    p[3] = col2.value[3];
+                    break;
+                }
+
+                ++ it;
+            }
+        }
+#endif  // ENABLE_CILKPLUS
+    }
 }
 
 __attribute__ ((target ("avx2")))
 void DyePalette::replaceAOGLColorAvx2(uint32_t *restrict pixels,
                                       const int bufSize) const restrict2
 {
-#include "resources/dye/dyepalette_replaceaoglcolor_avx2.hpp"
-}
+    std::vector<DyeColor>::const_iterator it_end = mColors.end();
+    const size_t sz = mColors.size();
+    if (!sz || !pixels)
+        return;
+    if (sz % 2)
+        -- it_end;
 
-#endif  // SIMD_SUPPORTED
+    if (bufSize >= 8)
+    {
+        for (int ptr = 0; ptr < bufSize; ptr += 8)
+        {
+//            __m256i base = _mm256_load_si256(reinterpret_cast<__m256i*>(
+//                &pixels[ptr]));
+            __m256i base = _mm256_loadu_si256(reinterpret_cast<__m256i*>(
+                &pixels[ptr]));
 
-FUNCTION_SIMD_DEFAULT
-void DyePalette::replaceAOGLColorSimd(uint32_t *restrict pixels,
-                                      const int bufSize) const restrict2
-{
-#include "resources/dye/dyepalette_replaceaoglcolor_default.hpp"
-}
+            std::vector<DyeColor>::const_iterator it = mColors.begin();
+            while (it != it_end)
+            {
+                const DyeColor &col = *it;
+                ++ it;
+                const DyeColor &col2 = *it;
 
-FUNCTION_SIMD_DEFAULT
-void DyePalette::replaceAOGLColorSse2(uint32_t *restrict pixels,
-                                      const int bufSize) const restrict2
-{
-#include "resources/dye/dyepalette_replaceaoglcolor_default.hpp"
-}
+                __m256i newMask = _mm256_set1_epi32(col2.valueAOgl);
+                __m256i cmpMask = _mm256_set1_epi32(col.valueAOgl);
+                __m256i cmpRes = _mm256_cmpeq_epi32(base, cmpMask);
+                __m256i srcAnd = _mm256_andnot_si256(cmpRes, base);
+                __m256i dstAnd = _mm256_and_si256(cmpRes, newMask);
+                base = _mm256_or_si256(srcAnd, dstAnd);
 
-FUNCTION_SIMD_DEFAULT
-void DyePalette::replaceAOGLColorAvx2(uint32_t *restrict pixels,
-                                      const int bufSize) const restrict2
-{
-#include "resources/dye/dyepalette_replaceaoglcolor_default.hpp"
+                ++ it;
+            }
+//            _mm256_store_si256(reinterpret_cast<__m256i*>(&pixels[ptr]), base);
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(&pixels[ptr]), base);
+        }
+    }
+    else
+    {
+#ifdef ENABLE_CILKPLUS
+        cilk_for (int ptr = 0; ptr < bufSize; ptr ++)
+        {
+            uint8_t *const p = reinterpret_cast<uint8_t *>(&pixels[ptr]);
+            const unsigned int data = pixels[ptr];
+
+            std::vector<DyeColor>::const_iterator it = mColors.begin();
+            while (it != it_end)
+            {
+                const DyeColor &col = *it;
+                ++ it;
+                const DyeColor &col2 = *it;
+
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+                const unsigned int coldata = (col.value[0] << 24U)
+                    | (col.value[1] << 16U)
+                    | (col.value[2] << 8U)
+                    | col.value[3];
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                const unsigned int coldata = (col.value[0])
+                    | (col.value[1] << 8U)
+                    | (col.value[2] << 16U)
+                    | (col.value[3] << 24U);
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                if (data == coldata)
+                {
+                    p[0] = col2.value[0];
+                    p[1] = col2.value[1];
+                    p[2] = col2.value[2];
+                    p[3] = col2.value[3];
+                    break;
+                }
+
+                ++ it;
+            }
+        }
+
+#else  // ENABLE_CILKPLUS
+
+        for (const uint32_t *const p_end = pixels + CAST_SIZE(bufSize);
+             pixels != p_end;
+             ++pixels)
+        {
+            uint8_t *const p = reinterpret_cast<uint8_t *>(pixels);
+            const unsigned int data = *pixels;
+
+            std::vector<DyeColor>::const_iterator it = mColors.begin();
+            while (it != it_end)
+            {
+                const DyeColor &col = *it;
+                ++ it;
+                const DyeColor &col2 = *it;
+
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+                const unsigned int coldata = (col.value[0] << 24U)
+                    | (col.value[1] << 16U)
+                    | (col.value[2] << 8U)
+                    | col.value[3];
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                const unsigned int coldata = (col.value[0])
+                    | (col.value[1] << 8U)
+                    | (col.value[2] << 16U)
+                    | (col.value[3] << 24U);
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                if (data == coldata)
+                {
+                    p[0] = col2.value[0];
+                    p[1] = col2.value[1];
+                    p[2] = col2.value[2];
+                    p[3] = col2.value[3];
+                    break;
+                }
+
+                ++ it;
+            }
+        }
+#endif  // ENABLE_CILKPLUS
+    }
 }
 
+#endif   // SIMD_SUPPORTED
 #endif  // USE_OPENGL
diff --git a/src/resources/dye/dyepalette_replaceaoglcolor_avx2.hpp b/src/resources/dye/dyepalette_replaceaoglcolor_avx2.hpp
deleted file mode 100644
index d16367c4b..000000000
--- a/src/resources/dye/dyepalette_replaceaoglcolor_avx2.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- *  The ManaPlus Client
- *  Copyright (C) 2007-2009  The Mana World Development Team
- *  Copyright (C) 2009-2010  The Mana Developers
- *  Copyright (C) 2011-2017  The ManaPlus Developers
- *
- *  This file is part of The ManaPlus Client.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-    std::vector<DyeColor>::const_iterator it_end = mColors.end();
-    const size_t sz = mColors.size();
-    if (!sz || !pixels)
-        return;
-    if (sz % 2)
-        -- it_end;
-
-    for (int ptr = 0; ptr < bufSize; ptr += 8)
-    {
-//        __m256i base = _mm256_load_si256(reinterpret_cast<__m256i*>(
-//            &pixels[ptr]));
-        __m256i base = _mm256_loadu_si256(reinterpret_cast<__m256i*>(
-            &pixels[ptr]));
-
-        std::vector<DyeColor>::const_iterator it = mColors.begin();
-        while (it != it_end)
-        {
-            const DyeColor &col = *it;
-            ++ it;
-            const DyeColor &col2 = *it;
-
-            __m256i newMask = _mm256_set1_epi32(col2.valueAOgl);
-            __m256i cmpMask = _mm256_set1_epi32(col.valueAOgl);
-            __m256i cmpRes = _mm256_cmpeq_epi32(base, cmpMask);
-            __m256i srcAnd = _mm256_andnot_si256(cmpRes, base);
-            __m256i dstAnd = _mm256_and_si256(cmpRes, newMask);
-            base = _mm256_or_si256(srcAnd, dstAnd);
-
-            ++ it;
-        }
-//        _mm256_store_si256(reinterpret_cast<__m256i*>(&pixels[ptr]), base);
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(&pixels[ptr]), base);
-    }
diff --git a/src/resources/dye/dyepalette_replaceaoglcolor_default.hpp b/src/resources/dye/dyepalette_replaceaoglcolor_default.hpp
deleted file mode 100644
index 75604c608..000000000
--- a/src/resources/dye/dyepalette_replaceaoglcolor_default.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- *  The ManaPlus Client
- *  Copyright (C) 2007-2009  The Mana World Development Team
- *  Copyright (C) 2009-2010  The Mana Developers
- *  Copyright (C) 2011-2017  The ManaPlus Developers
- *
- *  This file is part of The ManaPlus Client.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-    std::vector<DyeColor>::const_iterator it_end = mColors.end();
-    const size_t sz = mColors.size();
-    if (!sz || !pixels)
-        return;
-    if (sz % 2)
-        -- it_end;
-
-#ifdef ENABLE_CILKPLUS
-    cilk_for (int ptr = 0; ptr < bufSize; ptr ++)
-    {
-        uint8_t *const p = reinterpret_cast<uint8_t *>(&pixels[ptr]);
-        const unsigned int data = pixels[ptr];
-
-        std::vector<DyeColor>::const_iterator it = mColors.begin();
-        while (it != it_end)
-        {
-            const DyeColor &col = *it;
-            ++ it;
-            const DyeColor &col2 = *it;
-
-#if SDL_BYTEORDER == SDL_BIG_ENDIAN
-            const unsigned int coldata = (col.value[0] << 24U)
-                | (col.value[1] << 16U)
-                | (col.value[2] << 8U)
-                | col.value[3];
-#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
-
-            const unsigned int coldata = (col.value[0])
-                | (col.value[1] << 8U)
-                | (col.value[2] << 16U)
-                | (col.value[3] << 24U);
-#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
-
-            if (data == coldata)
-            {
-                p[0] = col2.value[0];
-                p[1] = col2.value[1];
-                p[2] = col2.value[2];
-                p[3] = col2.value[3];
-                break;
-            }
-
-            ++ it;
-        }
-    }
-
-#else  // ENABLE_CILKPLUS
-
-    for (const uint32_t *const p_end = pixels + CAST_SIZE(bufSize);
-         pixels != p_end;
-         ++pixels)
-    {
-        uint8_t *const p = reinterpret_cast<uint8_t *>(pixels);
-        const unsigned int data = *pixels;
-
-        std::vector<DyeColor>::const_iterator it = mColors.begin();
-        while (it != it_end)
-        {
-            const DyeColor &col = *it;
-            ++ it;
-            const DyeColor &col2 = *it;
-
-#if SDL_BYTEORDER == SDL_BIG_ENDIAN
-            const unsigned int coldata = (col.value[0] << 24U)
-                | (col.value[1] << 16U)
-                | (col.value[2] << 8U)
-                | col.value[3];
-#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
-
-            const unsigned int coldata = (col.value[0])
-                | (col.value[1] << 8U)
-                | (col.value[2] << 16U)
-                | (col.value[3] << 24U);
-#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
-
-            if (data == coldata)
-            {
-                p[0] = col2.value[0];
-                p[1] = col2.value[1];
-                p[2] = col2.value[2];
-                p[3] = col2.value[3];
-                break;
-            }
-
-            ++ it;
-        }
-    }
-#endif  // ENABLE_CILKPLUS
diff --git a/src/resources/dye/dyepalette_replaceaoglcolor_sse2.hpp b/src/resources/dye/dyepalette_replaceaoglcolor_sse2.hpp
deleted file mode 100644
index 6186d9ffd..000000000
--- a/src/resources/dye/dyepalette_replaceaoglcolor_sse2.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- *  The ManaPlus Client
- *  Copyright (C) 2007-2009  The Mana World Development Team
- *  Copyright (C) 2009-2010  The Mana Developers
- *  Copyright (C) 2011-2017  The ManaPlus Developers
- *
- *  This file is part of The ManaPlus Client.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-    std::vector<DyeColor>::const_iterator it_end = mColors.end();
-    const size_t sz = mColors.size();
-    if (!sz || !pixels)
-        return;
-    if (sz % 2)
-        -- it_end;
-
-    for (int ptr = 0; ptr < bufSize; ptr += 4)
-    {
-//        __m128i base = _mm_load_si128(reinterpret_cast<__m128i*>(
-//            &pixels[ptr]));
-        __m128i base = _mm_loadu_si128(reinterpret_cast<__m128i*>(
-            &pixels[ptr]));
-
-        std::vector<DyeColor>::const_iterator it = mColors.begin();
-        while (it != it_end)
-        {
-            const DyeColor &col = *it;
-            ++ it;
-            const DyeColor &col2 = *it;
-
-            __m128i newMask = _mm_set1_epi32(col2.valueAOgl);
-            __m128i cmpMask = _mm_set1_epi32(col.valueAOgl);
-            __m128i cmpRes = _mm_cmpeq_epi32(base, cmpMask);
-            __m128i srcAnd = _mm_andnot_si128(cmpRes, base);
-            __m128i dstAnd = _mm_and_si128(cmpRes, newMask);
-            base = _mm_or_si128(srcAnd, dstAnd);
-
-            ++ it;
-        }
-//        _mm_store_si128(reinterpret_cast<__m128i*>(&pixels[ptr]), base);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(&pixels[ptr]), base);
-    }
diff --git a/src/resources/openglimagehelper.cpp b/src/resources/openglimagehelper.cpp
index 9f19c3e95..d854c0666 100644
--- a/src/resources/openglimagehelper.cpp
+++ b/src/resources/openglimagehelper.cpp
@@ -96,7 +96,7 @@ Image *OpenGLImageHelper::load(SDL_RWops *const rw, Dye const &dye)
         {
             const DyePalette *const pal = dye.getAPalete();
             if (pal)
-                pal->replaceAOGLColor(pixels, surf->w * surf->h);
+                DYEPALETTEP(pal, AOGLColor)(pixels, surf->w * surf->h);
             break;
         }
         case 0:
diff --git a/src/resources/safeopenglimagehelper.cpp b/src/resources/safeopenglimagehelper.cpp
index cb318d8b9..676812d12 100644
--- a/src/resources/safeopenglimagehelper.cpp
+++ b/src/resources/safeopenglimagehelper.cpp
@@ -93,7 +93,7 @@ Image *SafeOpenGLImageHelper::load(SDL_RWops *const rw,
         {
             const DyePalette *const pal = dye.getAPalete();
             if (pal)
-                pal->replaceAOGLColor(pixels, surf->w * surf->h);
+                DYEPALETTEP(pal, AOGLColor)(pixels, surf->w * surf->h);
             break;
         }
         case 0:
-- 
cgit v1.2.3-70-g09d2