From 69aaf75b49cdf385b03469b82dd05480abf6e8b5 Mon Sep 17 00:00:00 2001
From: Andrei Karas <akaras@inbox.ru>
Date: Thu, 25 May 2017 00:55:33 +0300
Subject: Switch replaceSOGLColor to use a custom dispatcher.

---
 src/CMakeLists.txt                                 |   3 -
 src/Makefile.am                                    |   3 -
 src/resources/dye/dye_unittest.cc                  |  39 +--
 src/resources/dye/dyepalette.cpp                   |  12 +
 src/resources/dye/dyepalette.h                     |  43 +--
 src/resources/dye/dyepalette_replacesoglcolor.cpp  | 376 ++++++++++++++++++---
 .../dye/dyepalette_replacesoglcolor_avx2.hpp       |  55 ---
 .../dye/dyepalette_replacesoglcolor_default.hpp    | 108 ------
 .../dye/dyepalette_replacesoglcolor_sse2.hpp       |  55 ---
 src/resources/openglimagehelper.cpp                |   2 +-
 src/resources/safeopenglimagehelper.cpp            |   2 +-
 11 files changed, 352 insertions(+), 346 deletions(-)
 delete mode 100644 src/resources/dye/dyepalette_replacesoglcolor_avx2.hpp
 delete mode 100644 src/resources/dye/dyepalette_replacesoglcolor_default.hpp
 delete mode 100644 src/resources/dye/dyepalette_replacesoglcolor_sse2.hpp

(limited to 'src')

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7fe0b9445..e94ad3dc7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -650,9 +650,6 @@ SET(SRCS
     resources/dye/dyepalette_replaceaoglcolor_sse2.hpp
     resources/dye/dyepalette_replacescolor.cpp
     resources/dye/dyepalette_replacesoglcolor.cpp
-    resources/dye/dyepalette_replacesoglcolor_avx2.hpp
-    resources/dye/dyepalette_replacesoglcolor_default.hpp
-    resources/dye/dyepalette_replacesoglcolor_sse2.hpp
     resources/dye/dyepaletteptr.h
     resources/effectdescription.h
     resources/emoteinfo.h
diff --git a/src/Makefile.am b/src/Makefile.am
index 13206a9ae..c7e6a2e66 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -434,9 +434,6 @@ BASE_SRC += events/actionevent.h \
 	      resources/dye/dyepalette_replaceaoglcolor_sse2.hpp \
 	      resources/dye/dyepalette_replacescolor.cpp \
 	      resources/dye/dyepalette_replacesoglcolor.cpp \
-	      resources/dye/dyepalette_replacesoglcolor_avx2.hpp \
-	      resources/dye/dyepalette_replacesoglcolor_default.hpp \
-	      resources/dye/dyepalette_replacesoglcolor_sse2.hpp \
 	      resources/dye/dyepaletteptr.h \
 	      resources/fboinfo.h \
 	      resources/frame.h \
diff --git a/src/resources/dye/dye_unittest.cc b/src/resources/dye/dye_unittest.cc
index 5d2cfdb8d..6d6e6028e 100644
--- a/src/resources/dye/dye_unittest.cc
+++ b/src/resources/dye/dye_unittest.cc
@@ -69,7 +69,7 @@ TEST_CASE("Dye replaceSOGLColor 1 1", "")
     DyePalette palette("#00ff00,000011", 6);
     uint32_t data[1];
     data[0] = buildHex(0x01, 0x02, 0x03, 0x10);
-    palette.replaceSOGLColor(&data[0], 1);
+    DYEPALETTE(palette, SOGLColor)(&data[0], 1);
     REQUIRE(data[0] == buildHex(0x01, 0x02, 0x03, 0x10));
 }
 
@@ -78,7 +78,7 @@ TEST_CASE("Dye replaceSOGLColor 1 2", "")
     DyePalette palette("#01ff02,030411", 6);
     uint32_t data[1];
     data[0] = buildHex(0x20, 0x02, 0xff, 0x01);
-    palette.replaceSOGLColor(&data[0], 1);
+    DYEPALETTE(palette, SOGLColor)(&data[0], 1);
     REQUIRE(data[0] == buildHex(0x20, 0x11, 0x04, 0x03));
 }
 
@@ -87,7 +87,7 @@ TEST_CASE("Dye replaceSOGLColor 1 3", "")
     DyePalette palette("#404040,200000,0100ee,102030", 6);
     uint32_t data[1];
     data[0] = buildHex(0x40, 0xee, 0x00, 0x01);
-    palette.replaceSOGLColor(&data[0], 1);
+    DYEPALETTE(palette, SOGLColor)(&data[0], 1);
     REQUIRE(data[0] == buildHex(0x40, 0x30, 0x20, 0x10));
 }
 
@@ -97,7 +97,7 @@ TEST_CASE("Dye replaceSOGLColor 2 1", "")
     uint32_t data[2];
     data[0] = buildHex(0x20, 0x02, 0xff, 0x01);
     data[1] = buildHex(0x30, 0x02, 0xff, 0x01);
-    palette.replaceSOGLColor(&data[0], 2);
+    DYEPALETTE(palette, SOGLColor)(&data[0], 2);
     REQUIRE(data[0] == buildHex(0x20, 0x11, 0x04, 0x03));
     REQUIRE(data[1] == buildHex(0x30, 0x11, 0x04, 0x03));
 }
@@ -110,7 +110,7 @@ TEST_CASE("Dye replaceSOGLColor 4 1", "")
     data[1] = buildHex(0x30, 0x02, 0xff, 0x01);
     data[2] = buildHex(0x40, 0x02, 0xff, 0x01);
     data[3] = buildHex(0x50, 0x02, 0xff, 0x02);
-    palette.replaceSOGLColor(&data[0], 4);
+    DYEPALETTE(palette, SOGLColor)(&data[0], 4);
     REQUIRE(data[0] == buildHex(0x20, 0x11, 0x04, 0x03));
     REQUIRE(data[1] == buildHex(0x30, 0x11, 0x04, 0x03));
     REQUIRE(data[2] == buildHex(0x40, 0x11, 0x04, 0x03));
@@ -129,7 +129,7 @@ TEST_CASE("Dye replaceSOGLColor 8 1", "")
     data[5] = buildHex(0x30, 0x02, 0xff, 0x01);
     data[6] = buildHex(0x40, 0x02, 0xff, 0x01);
     data[7] = buildHex(0x60, 0x02, 0xff, 0x02);
-    palette.replaceSOGLColor(&data[0], 8);
+    DYEPALETTE(palette, SOGLColor)(&data[0], 8);
     REQUIRE(data[0] == buildHex(0x20, 0x11, 0x04, 0x03));
     REQUIRE(data[1] == buildHex(0x30, 0x11, 0x04, 0x03));
     REQUIRE(data[2] == buildHex(0x40, 0x11, 0x04, 0x03));
@@ -216,29 +216,6 @@ TEST_CASE("Dye replaceSOGLColor 8 1 default", "")
     REQUIRE(data[7] == buildHex(0x60, 0x02, 0xff, 0x02));
 }
 
-TEST_CASE("Dye replaceSOGLColor 8 1 simd", "")
-{
-    DyePalette palette("#01ff02,030411,01ee02,010203", 6);
-    uint32_t data[8];
-    data[0] = buildHex(0x20, 0x02, 0xff, 0x01);
-    data[1] = buildHex(0x30, 0x02, 0xff, 0x01);
-    data[2] = buildHex(0x40, 0x02, 0xff, 0x01);
-    data[3] = buildHex(0x50, 0x02, 0xff, 0x02);
-    data[4] = buildHex(0x20, 0x02, 0xff, 0x01);
-    data[5] = buildHex(0x30, 0x02, 0xff, 0x01);
-    data[6] = buildHex(0x40, 0x02, 0xff, 0x01);
-    data[7] = buildHex(0x60, 0x02, 0xff, 0x02);
-    palette.replaceSOGLColorSimd(&data[0], 8);
-    REQUIRE(data[0] == buildHex(0x20, 0x11, 0x04, 0x03));
-    REQUIRE(data[1] == buildHex(0x30, 0x11, 0x04, 0x03));
-    REQUIRE(data[2] == buildHex(0x40, 0x11, 0x04, 0x03));
-    REQUIRE(data[3] == buildHex(0x50, 0x02, 0xff, 0x02));
-    REQUIRE(data[4] == buildHex(0x20, 0x11, 0x04, 0x03));
-    REQUIRE(data[5] == buildHex(0x30, 0x11, 0x04, 0x03));
-    REQUIRE(data[6] == buildHex(0x40, 0x11, 0x04, 0x03));
-    REQUIRE(data[7] == buildHex(0x60, 0x02, 0xff, 0x02));
-}
-
 TEST_CASE("Dye replaceSOGLColor 8 1 sse2", "")
 {
     DyePalette palette("#01ff02,030411,01ee02,010203", 6);
@@ -251,7 +228,7 @@ TEST_CASE("Dye replaceSOGLColor 8 1 sse2", "")
     data[5] = buildHex(0x30, 0x02, 0xff, 0x01);
     data[6] = buildHex(0x40, 0x02, 0xff, 0x01);
     data[7] = buildHex(0x60, 0x02, 0xff, 0x02);
-    palette.replaceSOGLColorSse2(&data[0], 8);
+    DYEPALETTE(palette, SOGLColorSse2)(&data[0], 8);
     REQUIRE(data[0] == buildHex(0x20, 0x11, 0x04, 0x03));
     REQUIRE(data[1] == buildHex(0x30, 0x11, 0x04, 0x03));
     REQUIRE(data[2] == buildHex(0x40, 0x11, 0x04, 0x03));
@@ -274,7 +251,7 @@ TEST_CASE("Dye replaceSOGLColor 8 1 avx2", "")
     data[5] = buildHex(0x30, 0x02, 0xff, 0x01);
     data[6] = buildHex(0x40, 0x02, 0xff, 0x01);
     data[7] = buildHex(0x60, 0x02, 0xff, 0x02);
-    palette.replaceSOGLColorAvx2(&data[0], 8);
+    DYEPALETTE(palette, SOGLColorAvx2)(&data[0], 8);
     REQUIRE(data[0] == buildHex(0x20, 0x11, 0x04, 0x03));
     REQUIRE(data[1] == buildHex(0x30, 0x11, 0x04, 0x03));
     REQUIRE(data[2] == buildHex(0x40, 0x11, 0x04, 0x03));
diff --git a/src/resources/dye/dyepalette.cpp b/src/resources/dye/dyepalette.cpp
index 5a350c8e2..c341d140e 100644
--- a/src/resources/dye/dyepalette.cpp
+++ b/src/resources/dye/dyepalette.cpp
@@ -47,6 +47,9 @@
 DyeFunctionPtr DyePalette::funcReplaceSColor = nullptr;
 DyeFunctionPtr DyePalette::funcReplaceSColorSse2 = nullptr;
 DyeFunctionPtr DyePalette::funcReplaceSColorAvx2 = nullptr;
+DyeFunctionPtr DyePalette::funcReplaceSOGLColor = nullptr;
+DyeFunctionPtr DyePalette::funcReplaceSOGLColorSse2 = nullptr;
+DyeFunctionPtr DyePalette::funcReplaceSOGLColorAvx2 = nullptr;
 
 DyePalette::DyePalette(const std::string &restrict description,
                        const uint8_t blockSize) :
@@ -242,12 +245,18 @@ void DyePalette::initFunctions()
         funcReplaceSColor = &DyePalette::replaceSColorAvx2;
         funcReplaceSColorAvx2 = &DyePalette::replaceSColorAvx2;
         funcReplaceSColorSse2 = &DyePalette::replaceSColorSse2;
+        funcReplaceSOGLColor = &DyePalette::replaceSOGLColorAvx2;
+        funcReplaceSOGLColorAvx2 = &DyePalette::replaceSOGLColorAvx2;
+        funcReplaceSOGLColorSse2 = &DyePalette::replaceSOGLColorSse2;
     }
     else if (flags & Cpu::FEATURE_SSE2)
     {
         funcReplaceSColor = &DyePalette::replaceSColorSse2;
         funcReplaceSColorAvx2 = &DyePalette::replaceSColorSse2;
         funcReplaceSColorSse2 = &DyePalette::replaceSColorSse2;
+        funcReplaceSOGLColor = &DyePalette::replaceSOGLColorSse2;
+        funcReplaceSOGLColorAvx2 = &DyePalette::replaceSOGLColorSse2;
+        funcReplaceSOGLColorSse2 = &DyePalette::replaceSOGLColorSse2;
     }
     else
 #endif  // SIMD_SUPPORTED
@@ -255,5 +264,8 @@ void DyePalette::initFunctions()
         funcReplaceSColor = &DyePalette::replaceSColorDefault;
         funcReplaceSColorAvx2 = &DyePalette::replaceSColorDefault;
         funcReplaceSColorSse2 = &DyePalette::replaceSColorDefault;
+        funcReplaceSOGLColor = &DyePalette::replaceSOGLColorDefault;
+        funcReplaceSOGLColorAvx2 = &DyePalette::replaceSOGLColorDefault;
+        funcReplaceSOGLColorSse2 = &DyePalette::replaceSOGLColorDefault;
     }
 }
diff --git a/src/resources/dye/dyepalette.h b/src/resources/dye/dyepalette.h
index da165305e..4129928fa 100644
--- a/src/resources/dye/dyepalette.h
+++ b/src/resources/dye/dyepalette.h
@@ -152,56 +152,16 @@ class DyePalette final
 #endif  // SIMD_SUPPORTED
 
 #ifdef USE_OPENGL
-        /**
-         * replace colors for OpenGL for S dye.
-         */
-        void replaceSOGLColor(uint32_t *restrict pixels,
-                              const int bufSize) const restrict2;
-
         /**
          * replace colors for OpenGL for S dye.
          */
         void replaceSOGLColorDefault(uint32_t *restrict pixels,
                                      const int bufSize) const restrict2;
-
-        /**
-         * replace colors for OpenGL for S dye.
-         */
-        FUNCTION_SIMD_DEFAULT
-        void replaceSOGLColorSimd(uint32_t *restrict pixels,
-                                  const int bufSize) const restrict2;
-
-        /**
-         * replace colors for OpenGL for S dye.
-         */
-        FUNCTION_SIMD_DEFAULT
-        void replaceSOGLColorSse2(uint32_t *restrict pixels,
-                                  const int bufSize) const restrict2;
-
-        /**
-         * replace colors for OpenGL for S dye.
-         */
-        FUNCTION_SIMD_DEFAULT
-        void replaceSOGLColorAvx2(uint32_t *restrict pixels,
-                                  const int bufSize) const restrict2;
-
 #ifdef SIMD_SUPPORTED
         /**
          * replace colors for OpenGL for S dye.
          */
         __attribute__ ((target ("sse2")))
-        void replaceSOGLColorSimd(uint32_t *restrict pixels,
-                                  const int bufSize) const restrict2;
-        /**
-         * replace colors for OpenGL for S dye.
-         */
-        __attribute__ ((target ("avx2")))
-        void replaceSOGLColorSimd(uint32_t *restrict pixels,
-                                  const int bufSize) const restrict2;
-        /**
-         * replace colors for OpenGL for S dye.
-         */
-        __attribute__ ((target ("sse2")))
         void replaceSOGLColorSse2(uint32_t *restrict pixels,
                                   const int bufSize) const restrict2;
         /**
@@ -285,6 +245,9 @@ class DyePalette final
         static DyeFunctionPtr funcReplaceSColor;
         static DyeFunctionPtr funcReplaceSColorSse2;
         static DyeFunctionPtr funcReplaceSColorAvx2;
+        static DyeFunctionPtr funcReplaceSOGLColor;
+        static DyeFunctionPtr funcReplaceSOGLColorSse2;
+        static DyeFunctionPtr funcReplaceSOGLColorAvx2;
 
 #ifndef UNITTESTS
     private:
diff --git a/src/resources/dye/dyepalette_replacesoglcolor.cpp b/src/resources/dye/dyepalette_replacesoglcolor.cpp
index 57ffd6b50..005523b4b 100644
--- a/src/resources/dye/dyepalette_replacesoglcolor.cpp
+++ b/src/resources/dye/dyepalette_replacesoglcolor.cpp
@@ -35,26 +35,98 @@
 
 #include "debug.h"
 
-void DyePalette::replaceSOGLColor(uint32_t *restrict pixels,
-                                  const int bufSize) const restrict2
-{
-#ifdef SIMD_SUPPORTED
-    if (bufSize >= 8)
-        replaceSOGLColorSimd(pixels, bufSize);
-    else
-        replaceSOGLColorDefault(pixels, bufSize);
-#else  // SIMD_SUPPORTED
-#include "resources/dye/dyepalette_replacesoglcolor_default.hpp"
-#endif  // SIMD_SUPPORTED
-}
-
 void DyePalette::replaceSOGLColorDefault(uint32_t *restrict pixels,
                                          const int bufSize) const restrict2
 {
-#include "resources/dye/dyepalette_replacesoglcolor_default.hpp"
-}
+    std::vector<DyeColor>::const_iterator it_end = mColors.end();
+    const size_t sz = mColors.size();
+    if (!sz || !pixels)
+        return;
+    if (sz % 2)
+        -- it_end;
+
+#ifdef ENABLE_CILKPLUS
+    cilk_for (int ptr = 0; ptr < bufSize; ptr ++)
+    {
+        uint8_t *const p = reinterpret_cast<uint8_t *>(&pixels[ptr]);
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+        const unsigned int data = (pixels[ptr]) & 0xffffff00;
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+        const unsigned int data = (pixels[ptr]) & 0x00ffffff;
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+        std::vector<DyeColor>::const_iterator it = mColors.begin();
+        while (it != it_end)
+        {
+            const DyeColor &col = *it;
+            ++ it;
+            const DyeColor &col2 = *it;
+
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+            const unsigned int coldata = (col.value[0] << 24)
+                | (col.value[1] << 16) | (col.value[2] << 8);
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+            const unsigned int coldata = (col.value[0])
+                | (col.value[1] << 8) | (col.value[2] << 16);
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+            if (data == coldata)
+            {
+                p[0] = col2.value[0];
+                p[1] = col2.value[1];
+                p[2] = col2.value[2];
+                break;
+            }
+
+            ++ it;
+        }
+    }
+
+#else  // ENABLE_CILKPLUS
+
+    for (const uint32_t *const p_end = pixels + CAST_SIZE(bufSize);
+         pixels != p_end;
+         ++pixels)
+    {
+        uint8_t *const p = reinterpret_cast<uint8_t *>(pixels);
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+        const unsigned int data = (*pixels) & 0xffffff00;
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+        const unsigned int data = (*pixels) & 0x00ffffff;
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+        std::vector<DyeColor>::const_iterator it = mColors.begin();
+        while (it != it_end)
+        {
+            const DyeColor &col = *it;
+            ++ it;
+            const DyeColor &col2 = *it;
+
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+            const unsigned int coldata = (col.value[0] << 24)
+                | (col.value[1] << 16) | (col.value[2] << 8);
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
 
+            const unsigned int coldata = (col.value[0])
+                | (col.value[1] << 8) | (col.value[2] << 16);
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
 
+            if (data == coldata)
+            {
+                p[0] = col2.value[0];
+                p[1] = col2.value[1];
+                p[2] = col2.value[2];
+                break;
+            }
+
+            ++ it;
+        }
+    }
+#endif  // ENABLE_CILKPLUS
+}
 
 #ifdef SIMD_SUPPORTED
 /*
@@ -66,54 +138,260 @@ static void print256(const char *const text, const __m256i &val)
 */
 
 __attribute__ ((target ("sse2")))
-void DyePalette::replaceSOGLColorSimd(uint32_t *restrict pixels,
+void DyePalette::replaceSOGLColorSse2(uint32_t *restrict pixels,
                                       const int bufSize) const restrict2
 {
-#include "resources/dye/dyepalette_replacesoglcolor_sse2.hpp"
-}
+    std::vector<DyeColor>::const_iterator it_end = mColors.end();
+    const size_t sz = mColors.size();
+    if (!sz || !pixels)
+        return;
+    if (sz % 2)
+        -- it_end;
 
-__attribute__ ((target ("avx2")))
-void DyePalette::replaceSOGLColorSimd(uint32_t *restrict pixels,
-                                      const int bufSize) const restrict2
-{
-#include "resources/dye/dyepalette_replacesoglcolor_avx2.hpp"
-}
+    if (bufSize >= 8)
+    {
+        for (int ptr = 0; ptr < bufSize; ptr += 4)
+        {
+            __m128i mask = _mm_set1_epi32(0x00ffffff);
+//            __m128i base = _mm_load_si128(reinterpret_cast<__m128i*>(
+//             &pixels[ptr]));
+            __m128i base = _mm_loadu_si128(reinterpret_cast<__m128i*>(
+                &pixels[ptr]));
 
-__attribute__ ((target ("sse2")))
-void DyePalette::replaceSOGLColorSse2(uint32_t *restrict pixels,
-                                      const int bufSize) const restrict2
-{
-#include "resources/dye/dyepalette_replacesoglcolor_sse2.hpp"
+            std::vector<DyeColor>::const_iterator it = mColors.begin();
+            while (it != it_end)
+            {
+                const DyeColor &col = *it;
+                ++ it;
+                const DyeColor &col2 = *it;
+
+                __m128i base2 = _mm_and_si128(mask, base);
+                __m128i newMask = _mm_set1_epi32(col2.valueSOgl);
+                __m128i cmpMask = _mm_set1_epi32(col.valueSOgl);
+                __m128i cmpRes = _mm_cmpeq_epi32(base2, cmpMask);
+                cmpRes = _mm_and_si128(mask, cmpRes);
+                __m128i srcAnd = _mm_andnot_si128(cmpRes, base);
+                __m128i dstAnd = _mm_and_si128(cmpRes, newMask);
+                base = _mm_or_si128(srcAnd, dstAnd);
+                ++ it;
+            }
+//            _mm_store_si128(reinterpret_cast<__m128i*>(&pixels[ptr]), base);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(&pixels[ptr]), base);
+        }
+    }
+    else
+    {
+#ifdef ENABLE_CILKPLUS
+        cilk_for (int ptr = 0; ptr < bufSize; ptr ++)
+        {
+            uint8_t *const p = reinterpret_cast<uint8_t *>(&pixels[ptr]);
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+            const unsigned int data = (pixels[ptr]) & 0xffffff00;
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+            const unsigned int data = (pixels[ptr]) & 0x00ffffff;
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+            std::vector<DyeColor>::const_iterator it = mColors.begin();
+            while (it != it_end)
+            {
+                const DyeColor &col = *it;
+                ++ it;
+                const DyeColor &col2 = *it;
+
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+                const unsigned int coldata = (col.value[0] << 24)
+                    | (col.value[1] << 16) | (col.value[2] << 8);
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                const unsigned int coldata = (col.value[0])
+                    | (col.value[1] << 8) | (col.value[2] << 16);
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                if (data == coldata)
+                {
+                    p[0] = col2.value[0];
+                    p[1] = col2.value[1];
+                    p[2] = col2.value[2];
+                    break;
+                }
+
+                ++ it;
+            }
+        }
+
+#else  // ENABLE_CILKPLUS
+
+        for (const uint32_t *const p_end = pixels + CAST_SIZE(bufSize);
+             pixels != p_end;
+             ++pixels)
+        {
+            uint8_t *const p = reinterpret_cast<uint8_t *>(pixels);
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+            const unsigned int data = (*pixels) & 0xffffff00;
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+            const unsigned int data = (*pixels) & 0x00ffffff;
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+            std::vector<DyeColor>::const_iterator it = mColors.begin();
+            while (it != it_end)
+            {
+                const DyeColor &col = *it;
+                ++ it;
+                const DyeColor &col2 = *it;
+
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+                const unsigned int coldata = (col.value[0] << 24)
+                    | (col.value[1] << 16) | (col.value[2] << 8);
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                const unsigned int coldata = (col.value[0])
+                    | (col.value[1] << 8) | (col.value[2] << 16);
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                if (data == coldata)
+                {
+                    p[0] = col2.value[0];
+                    p[1] = col2.value[1];
+                    p[2] = col2.value[2];
+                    break;
+                }
+
+                ++ it;
+            }
+        }
+#endif  // ENABLE_CILKPLUS
+    }
 }
 
 __attribute__ ((target ("avx2")))
 void DyePalette::replaceSOGLColorAvx2(uint32_t *restrict pixels,
                                       const int bufSize) const restrict2
 {
-#include "resources/dye/dyepalette_replacesoglcolor_avx2.hpp"
-}
+    std::vector<DyeColor>::const_iterator it_end = mColors.end();
+    const size_t sz = mColors.size();
+    if (!sz || !pixels)
+        return;
+    if (sz % 2)
+        -- it_end;
 
-#endif  // SIMD_SUPPORTED
+    if (bufSize >= 8)
+    {
+        for (int ptr = 0; ptr < bufSize; ptr += 8)
+        {
+            __m256i mask = _mm256_set1_epi32(0x00ffffff);
+//          __m256i base = _mm256_load_si256(reinterpret_cast<__m256i*>(
+//              &pixels[ptr]));
+            __m256i base = _mm256_loadu_si256(reinterpret_cast<__m256i*>(
+                &pixels[ptr]));
 
-FUNCTION_SIMD_DEFAULT
-void DyePalette::replaceSOGLColorSimd(uint32_t *restrict pixels,
-                                      const int bufSize) const restrict2
-{
-#include "resources/dye/dyepalette_replacesoglcolor_default.hpp"
-}
+            std::vector<DyeColor>::const_iterator it = mColors.begin();
+            while (it != it_end)
+            {
+                const DyeColor &col = *it;
+                ++ it;
+                const DyeColor &col2 = *it;
 
-FUNCTION_SIMD_DEFAULT
-void DyePalette::replaceSOGLColorSse2(uint32_t *restrict pixels,
-                                      const int bufSize) const restrict2
-{
-#include "resources/dye/dyepalette_replacesoglcolor_default.hpp"
-}
+                __m256i base2 = _mm256_and_si256(mask, base);
+                __m256i newMask = _mm256_set1_epi32(col2.valueSOgl);
+                __m256i cmpMask = _mm256_set1_epi32(col.valueSOgl);
+                __m256i cmpRes = _mm256_cmpeq_epi32(base2, cmpMask);
+                cmpRes = _mm256_and_si256(mask, cmpRes);
+                __m256i srcAnd = _mm256_andnot_si256(cmpRes, base);
+                __m256i dstAnd = _mm256_and_si256(cmpRes, newMask);
+                base = _mm256_or_si256(srcAnd, dstAnd);
+                ++ it;
+            }
+//            _mm256_store_si256(reinterpret_cast<__m256i*>(&pixels[ptr]), base);
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(&pixels[ptr]), base);
+        }
+    }
+    else
+    {
+#ifdef ENABLE_CILKPLUS
+        cilk_for (int ptr = 0; ptr < bufSize; ptr ++)
+        {
+            uint8_t *const p = reinterpret_cast<uint8_t *>(&pixels[ptr]);
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+            const unsigned int data = (pixels[ptr]) & 0xffffff00;
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
 
-FUNCTION_SIMD_DEFAULT
-void DyePalette::replaceSOGLColorAvx2(uint32_t *restrict pixels,
-                                      const int bufSize) const restrict2
-{
-#include "resources/dye/dyepalette_replacesoglcolor_default.hpp"
+            const unsigned int data = (pixels[ptr]) & 0x00ffffff;
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+            std::vector<DyeColor>::const_iterator it = mColors.begin();
+            while (it != it_end)
+            {
+                const DyeColor &col = *it;
+                ++ it;
+                const DyeColor &col2 = *it;
+
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+                const unsigned int coldata = (col.value[0] << 24)
+                    | (col.value[1] << 16) | (col.value[2] << 8);
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                const unsigned int coldata = (col.value[0])
+                    | (col.value[1] << 8) | (col.value[2] << 16);
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                if (data == coldata)
+                {
+                    p[0] = col2.value[0];
+                    p[1] = col2.value[1];
+                    p[2] = col2.value[2];
+                    break;
+                }
+
+                ++ it;
+            }
+        }
+
+#else  // ENABLE_CILKPLUS
+
+        for (const uint32_t *const p_end = pixels + CAST_SIZE(bufSize);
+             pixels != p_end;
+             ++pixels)
+        {
+            uint8_t *const p = reinterpret_cast<uint8_t *>(pixels);
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+            const unsigned int data = (*pixels) & 0xffffff00;
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+            const unsigned int data = (*pixels) & 0x00ffffff;
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+            std::vector<DyeColor>::const_iterator it = mColors.begin();
+            while (it != it_end)
+            {
+                const DyeColor &col = *it;
+                ++ it;
+                const DyeColor &col2 = *it;
+
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+                const unsigned int coldata = (col.value[0] << 24)
+                    | (col.value[1] << 16) | (col.value[2] << 8);
+#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                const unsigned int coldata = (col.value[0])
+                    | (col.value[1] << 8) | (col.value[2] << 16);
+#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
+
+                if (data == coldata)
+                {
+                    p[0] = col2.value[0];
+                    p[1] = col2.value[1];
+                    p[2] = col2.value[2];
+                    break;
+                }
+
+                ++ it;
+            }
+        }
+#endif  // ENABLE_CILKPLUS
+    }
 }
 
+#endif  // SIMD_SUPPORTED
 #endif  // USE_OPENGL
diff --git a/src/resources/dye/dyepalette_replacesoglcolor_avx2.hpp b/src/resources/dye/dyepalette_replacesoglcolor_avx2.hpp
deleted file mode 100644
index 6e45f807c..000000000
--- a/src/resources/dye/dyepalette_replacesoglcolor_avx2.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- *  The ManaPlus Client
- *  Copyright (C) 2011-2017  The ManaPlus Developers
- *
- *  This file is part of The ManaPlus Client.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-    std::vector<DyeColor>::const_iterator it_end = mColors.end();
-    const size_t sz = mColors.size();
-    if (!sz || !pixels)
-        return;
-    if (sz % 2)
-        -- it_end;
-
-    for (int ptr = 0; ptr < bufSize; ptr += 8)
-    {
-        __m256i mask = _mm256_set1_epi32(0x00ffffff);
-//        __m256i base = _mm256_load_si256(reinterpret_cast<__m256i*>(
-//            &pixels[ptr]));
-        __m256i base = _mm256_loadu_si256(reinterpret_cast<__m256i*>(
-            &pixels[ptr]));
-
-        std::vector<DyeColor>::const_iterator it = mColors.begin();
-        while (it != it_end)
-        {
-            const DyeColor &col = *it;
-            ++ it;
-            const DyeColor &col2 = *it;
-
-            __m256i base2 = _mm256_and_si256(mask, base);
-            __m256i newMask = _mm256_set1_epi32(col2.valueSOgl);
-            __m256i cmpMask = _mm256_set1_epi32(col.valueSOgl);
-            __m256i cmpRes = _mm256_cmpeq_epi32(base2, cmpMask);
-            cmpRes = _mm256_and_si256(mask, cmpRes);
-            __m256i srcAnd = _mm256_andnot_si256(cmpRes, base);
-            __m256i dstAnd = _mm256_and_si256(cmpRes, newMask);
-            base = _mm256_or_si256(srcAnd, dstAnd);
-            ++ it;
-        }
-//        _mm256_store_si256(reinterpret_cast<__m256i*>(&pixels[ptr]), base);
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(&pixels[ptr]), base);
-    }
diff --git a/src/resources/dye/dyepalette_replacesoglcolor_default.hpp b/src/resources/dye/dyepalette_replacesoglcolor_default.hpp
deleted file mode 100644
index 7669b99bf..000000000
--- a/src/resources/dye/dyepalette_replacesoglcolor_default.hpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- *  The ManaPlus Client
- *  Copyright (C) 2011-2017  The ManaPlus Developers
- *
- *  This file is part of The ManaPlus Client.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-    std::vector<DyeColor>::const_iterator it_end = mColors.end();
-    const size_t sz = mColors.size();
-    if (!sz || !pixels)
-        return;
-    if (sz % 2)
-        -- it_end;
-
-#ifdef ENABLE_CILKPLUS
-    cilk_for (int ptr = 0; ptr < bufSize; ptr ++)
-    {
-        uint8_t *const p = reinterpret_cast<uint8_t *>(&pixels[ptr]);
-#if SDL_BYTEORDER == SDL_BIG_ENDIAN
-        const unsigned int data = (pixels[ptr]) & 0xffffff00;
-#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
-
-        const unsigned int data = (pixels[ptr]) & 0x00ffffff;
-#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
-
-        std::vector<DyeColor>::const_iterator it = mColors.begin();
-        while (it != it_end)
-        {
-            const DyeColor &col = *it;
-            ++ it;
-            const DyeColor &col2 = *it;
-
-#if SDL_BYTEORDER == SDL_BIG_ENDIAN
-            const unsigned int coldata = (col.value[0] << 24)
-                | (col.value[1] << 16) | (col.value[2] << 8);
-#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
-
-            const unsigned int coldata = (col.value[0])
-                | (col.value[1] << 8) | (col.value[2] << 16);
-#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
-
-            if (data == coldata)
-            {
-                p[0] = col2.value[0];
-                p[1] = col2.value[1];
-                p[2] = col2.value[2];
-                break;
-            }
-
-            ++ it;
-        }
-    }
-
-#else  // ENABLE_CILKPLUS
-
-    for (const uint32_t *const p_end = pixels + CAST_SIZE(bufSize);
-         pixels != p_end;
-         ++pixels)
-    {
-        uint8_t *const p = reinterpret_cast<uint8_t *>(pixels);
-#if SDL_BYTEORDER == SDL_BIG_ENDIAN
-        const unsigned int data = (*pixels) & 0xffffff00;
-#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
-
-        const unsigned int data = (*pixels) & 0x00ffffff;
-#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
-
-        std::vector<DyeColor>::const_iterator it = mColors.begin();
-        while (it != it_end)
-        {
-            const DyeColor &col = *it;
-            ++ it;
-            const DyeColor &col2 = *it;
-
-#if SDL_BYTEORDER == SDL_BIG_ENDIAN
-            const unsigned int coldata = (col.value[0] << 24)
-                | (col.value[1] << 16) | (col.value[2] << 8);
-#else  // SDL_BYTEORDER == SDL_BIG_ENDIAN
-
-            const unsigned int coldata = (col.value[0])
-                | (col.value[1] << 8) | (col.value[2] << 16);
-#endif  // SDL_BYTEORDER == SDL_BIG_ENDIAN
-
-            if (data == coldata)
-            {
-                p[0] = col2.value[0];
-                p[1] = col2.value[1];
-                p[2] = col2.value[2];
-                break;
-            }
-
-            ++ it;
-        }
-    }
-#endif  // ENABLE_CILKPLUS
diff --git a/src/resources/dye/dyepalette_replacesoglcolor_sse2.hpp b/src/resources/dye/dyepalette_replacesoglcolor_sse2.hpp
deleted file mode 100644
index a59c53979..000000000
--- a/src/resources/dye/dyepalette_replacesoglcolor_sse2.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- *  The ManaPlus Client
- *  Copyright (C) 2011-2017  The ManaPlus Developers
- *
- *  This file is part of The ManaPlus Client.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-    std::vector<DyeColor>::const_iterator it_end = mColors.end();
-    const size_t sz = mColors.size();
-    if (!sz || !pixels)
-        return;
-    if (sz % 2)
-        -- it_end;
-
-    for (int ptr = 0; ptr < bufSize; ptr += 4)
-    {
-        __m128i mask = _mm_set1_epi32(0x00ffffff);
-//        __m128i base = _mm_load_si128(reinterpret_cast<__m128i*>(
-//            &pixels[ptr]));
-        __m128i base = _mm_loadu_si128(reinterpret_cast<__m128i*>(
-            &pixels[ptr]));
-
-        std::vector<DyeColor>::const_iterator it = mColors.begin();
-        while (it != it_end)
-        {
-            const DyeColor &col = *it;
-            ++ it;
-            const DyeColor &col2 = *it;
-
-            __m128i base2 = _mm_and_si128(mask, base);
-            __m128i newMask = _mm_set1_epi32(col2.valueSOgl);
-            __m128i cmpMask = _mm_set1_epi32(col.valueSOgl);
-            __m128i cmpRes = _mm_cmpeq_epi32(base2, cmpMask);
-            cmpRes = _mm_and_si128(mask, cmpRes);
-            __m128i srcAnd = _mm_andnot_si128(cmpRes, base);
-            __m128i dstAnd = _mm_and_si128(cmpRes, newMask);
-            base = _mm_or_si128(srcAnd, dstAnd);
-            ++ it;
-        }
-//        _mm_store_si128(reinterpret_cast<__m128i*>(&pixels[ptr]), base);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(&pixels[ptr]), base);
-    }
diff --git a/src/resources/openglimagehelper.cpp b/src/resources/openglimagehelper.cpp
index 391c3b9b5..9f19c3e95 100644
--- a/src/resources/openglimagehelper.cpp
+++ b/src/resources/openglimagehelper.cpp
@@ -89,7 +89,7 @@ Image *OpenGLImageHelper::load(SDL_RWops *const rw, Dye const &dye)
         {
             const DyePalette *const pal = dye.getSPalete();
             if (pal)
-                pal->replaceSOGLColor(pixels, surf->w * surf->h);
+                DYEPALETTEP(pal, SOGLColor)(pixels, surf->w * surf->h);
             break;
         }
         case 2:
diff --git a/src/resources/safeopenglimagehelper.cpp b/src/resources/safeopenglimagehelper.cpp
index 676e2108c..cb318d8b9 100644
--- a/src/resources/safeopenglimagehelper.cpp
+++ b/src/resources/safeopenglimagehelper.cpp
@@ -86,7 +86,7 @@ Image *SafeOpenGLImageHelper::load(SDL_RWops *const rw,
         {
             const DyePalette *const pal = dye.getSPalete();
             if (pal)
-                pal->replaceSOGLColor(pixels, surf->w * surf->h);
+                DYEPALETTEP(pal, SOGLColor)(pixels, surf->w * surf->h);
             break;
         }
         case 2:
-- 
cgit v1.2.3-70-g09d2