#pragma once

#include "lib8tion/config.h"
#include "scale8.h"
#include "lib8tion/lib8static.h"
#include "intmap.h"
#include "fl/namespace.h"

#include "fl/compiler_control.h"


FL_DISABLE_WARNING_PUSH
FL_DISABLE_WARNING_UNUSED_PARAMETER
FL_DISABLE_WARNING_RETURN_TYPE
FL_DISABLE_WARNING_IMPLICIT_INT_CONVERSION

FASTLED_NAMESPACE_BEGIN

/// @file math8.h
/// Fast, efficient 8-bit math functions specifically
/// designed for high-performance LED programming.

/// @ingroup lib8tion
/// @{

/// @defgroup Math Basic Math Operations
/// Fast, efficient 8-bit math functions specifically
/// designed for high-performance LED programming.
///
/// Because of the AVR (Arduino) and ARM assembly language
/// implementations provided, using these functions often
/// results in smaller and faster code than the equivalent
/// program using plain "C" arithmetic and logic.
/// @{

/// Add one byte to another, saturating at 0xFF
/// @param i first byte to add
/// @param j second byte to add
/// @returns the sum of i + j, capped at 0xFF
LIB8STATIC_ALWAYS_INLINE uint8_t qadd8(uint8_t i, uint8_t j) {
#if QADD8_C == 1
    unsigned int t = i + j;
    if (t > 255)
        t = 255;
    return static_cast<uint8_t>(t);
#elif QADD8_AVRASM == 1
    asm volatile(
        /* First, add j to i, conditioning the C flag */
        "add %0, %1    \n\t"

        /* Now test the C flag.
        If C is clear, we branch around a load of 0xFF into i.
        If C is set, we go ahead and load 0xFF into i.
        */
        "brcc L_%=     \n\t"
        "ldi %0, 0xFF  \n\t"
        "L_%=: "
        : "+d"(i) // r16-r31, restricted by ldi
        : "r"(j));
    return i;
#elif QADD8_ARM_DSP_ASM == 1
    asm volatile("uqadd8 %0, %0, %1" : "+r"(i) : "r"(j));
    return i;
#else
#error "No implementation for qadd8 available."
#endif
}

/// Add one byte to another, saturating at 0x7F and -0x80
/// @param i first byte to add
/// @param j second byte to add
/// @returns the sum of i + j, capped at 0x7F and -0x80
LIB8STATIC_ALWAYS_INLINE int8_t qadd7(int8_t i, int8_t j) {
#if QADD7_C == 1
    int16_t t = i + j;
    if (t > 127)
        t = 127;
    else if (t < -128)
        t = -128;
    return static_cast<int8_t>(t);
#elif QADD7_AVRASM == 1
    asm volatile(
        /* First, add j to i, conditioning the V and C flags */
        "add %0, %1    \n\t"

        /* Now test the V flag.
        If V is clear, we branch to end.
        If V is set, we go ahead and load 0x7F into i.
        */
        "brvc L_%=     \n\t"
        "ldi %0, 0x7F  \n\t"

        /* When both numbers are negative, C is set.
        Adding it to make result negative. */
        "adc %0, __zero_reg__\n\t"
        "L_%=: "
        : "+d"(i) // r16-r31, restricted by ldi
        : "r"(j));
    return i;
#elif QADD7_ARM_DSP_ASM == 1
    asm volatile("qadd8 %0, %0, %1" : "+r"(i) : "r"(j));
    return i;
#else
#error "No implementation for qadd7 available."
#endif
}

/// Subtract one byte from another, saturating at 0x00
/// @param i byte to subtract from
/// @param j byte to subtract
/// @returns i - j with a floor of 0
LIB8STATIC_ALWAYS_INLINE uint8_t qsub8(uint8_t i, uint8_t j) {
#if QSUB8_C == 1
    int t = i - j;
    if (t < 0)
        t = 0;
    return static_cast<uint8_t>(t);
#elif QSUB8_AVRASM == 1

    asm volatile(
        /* First, subtract j from i, conditioning the C flag */
        "sub %0, %1    \n\t"

        /* Now test the C flag.
        If C is clear, we branch around a load of 0x00 into i.
        If C is set, we go ahead and load 0x00 into i.
        */
        "brcc L_%=     \n\t"
        "ldi %0, 0x00  \n\t"
        "L_%=: "
        : "+d"(i) // r16-r31, restricted by ldi
        : "r"(j));
    return i;
#else
#error "No implementation for qsub8 available."
#endif
}

/// Add one byte to another, with 8-bit result
/// @note This does not saturate and may overflow!
/// @param i first byte to add
/// @param j second byte to add
/// @returns the sum of i + j, 8-bit
LIB8STATIC_ALWAYS_INLINE uint8_t add8(uint8_t i, uint8_t j) {
#if ADD8_C == 1
    int t = i + j;
    return static_cast<uint8_t>(t);
#elif ADD8_AVRASM == 1
    // Add j to i, period.
    asm volatile("add %0, %1" : "+r"(i) : "r"(j));
    return i;
#else
#error "No implementation for add8 available."
#endif
}

/// Add one byte to two bytes, with 16-bit result
/// @note This does not saturate and may overflow!
/// @param i first value to add, 8-bit
/// @param j second value to add, 16-bit
/// @returns the sum of i + j, 16-bit
LIB8STATIC_ALWAYS_INLINE uint16_t add8to16(uint8_t i, uint16_t j) {
#if ADD8_C == 1
    uint16_t t = i + j;
    return t;
#elif ADD8_AVRASM == 1
    // Add i(one byte) to j(two bytes)
    asm volatile("add %A[j], %[i]              \n\t"
                 "adc %B[j], __zero_reg__      \n\t"
                 : [j] "+r"(j)
                 : [i] "r"(i));
    return i;
#else
#error "No implementation for add8to16 available."
#endif
}

/// Subtract one byte from another, 8-bit result
/// @note This does not saturate and may overflow!
/// @param i byte to subtract from
/// @param j byte to subtract
/// @returns i - j
LIB8STATIC_ALWAYS_INLINE uint8_t sub8(uint8_t i, uint8_t j) {
#if SUB8_C == 1
    int t = i - j;
    return static_cast<uint8_t>(t);
#elif SUB8_AVRASM == 1
    // Subtract j from i, period.
    asm volatile("sub %0, %1" : "+r"(i) : "r"(j));
    return i;
#else
#error "No implementation for sub8 available."
#endif
}

/// Calculate an integer average of two unsigned
/// 8-bit integer values (uint8_t), rounded down.
/// Fractional results are rounded down, e.g. avg8(20,41) = 30
/// @param i first value to average
/// @param j second value to average
/// @returns mean average of i and j, rounded down
LIB8STATIC_ALWAYS_INLINE uint8_t avg8(uint8_t i, uint8_t j) {
#if AVG8_C == 1
    return (i + j) >> 1;
#elif AVG8_AVRASM == 1
    asm volatile(
        /* First, add j to i, 9th bit overflows into C flag */
        "add %0, %1    \n\t"
        /* Divide by two, moving C flag into high 8th bit */
        "ror %0        \n\t"
        : "+r"(i)
        : "r"(j));
    return i;
#else
#error "No implementation for avg8 available."
#endif
}

/// Calculate an integer average of two unsigned
/// 16-bit integer values (uint16_t), rounded down.
/// Fractional results are rounded down, e.g. avg16(20,41) = 30
/// @param i first value to average
/// @param j second value to average
/// @returns mean average of i and j, rounded down
LIB8STATIC_ALWAYS_INLINE uint16_t avg16(uint16_t i, uint16_t j) {
#if AVG16_C == 1
    // return (uint32_t)((uint32_t)(i) + (uint32_t)(j)) >> 1;
    uint32_t tmp = i;
    tmp += j;
    return static_cast<uint16_t>(tmp >> 1);
#elif AVG16_AVRASM == 1
    asm volatile(
        /* First, add jLo (heh) to iLo, 9th bit overflows into C flag */
        "add %A[i], %A[j]    \n\t"
        /* Now, add C + jHi to iHi, 17th bit overflows into C flag */
        "adc %B[i], %B[j]    \n\t"
        /* Divide iHi by two, moving C flag into high 16th bit, old 9th bit now
           in
           C */
        "ror %B[i]        \n\t"
        /* Divide iLo by two, moving C flag into high 8th bit */
        "ror %A[i]        \n\t"
        : [i] "+r"(i)
        : [j] "r"(j));
    return i;
#else
#error "No implementation for avg16 available."
#endif
}

/// Calculate an integer average of two unsigned
/// 8-bit integer values (uint8_t), rounded up.
/// Fractional results are rounded up, e.g. avg8r(20,41) = 31
/// @param i first value to average
/// @param j second value to average
/// @returns mean average of i and j, rounded up
LIB8STATIC_ALWAYS_INLINE uint8_t avg8r(uint8_t i, uint8_t j) {
#if AVG8R_C == 1
    return (i + j + 1) >> 1;
#elif AVG8R_AVRASM == 1
    asm volatile(
        /* First, add j to i, 9th bit overflows into C flag */
        "add %0, %1          \n\t"
        /* Divide by two, moving C flag into high 8th bit, old 1st bit now in C
         */
        "ror %0              \n\t"
        /* Add C flag */
        "adc %0, __zero_reg__\n\t"
        : "+r"(i)
        : "r"(j));
    return i;
#else
#error "No implementation for avg8r available."
#endif
}

/// Calculate an integer average of two unsigned
/// 16-bit integer values (uint16_t), rounded up.
/// Fractional results are rounded up, e.g. avg16r(20,41) = 31
/// @param i first value to average
/// @param j second value to average
/// @returns mean average of i and j, rounded up
LIB8STATIC_ALWAYS_INLINE uint16_t avg16r(uint16_t i, uint16_t j) {
#if AVG16R_C == 1
    // return (uint32_t)((uint32_t)(i) + (uint32_t)(j) + 1) >> 1;
    uint32_t tmp = i;
    tmp += j;
    tmp += 1;
    return static_cast<uint16_t>(tmp >> 1);
#elif AVG16R_AVRASM == 1
    asm volatile(
        /* First, add jLo (heh) to iLo, 9th bit overflows into C flag */
        "add %A[i], %A[j]    \n\t"
        /* Now, add C + jHi to iHi, 17th bit overflows into C flag */
        "adc %B[i], %B[j]    \n\t"
        /* Divide iHi by two, moving C flag into high 16th bit, old 9th bit now
           in
           C */
        "ror %B[i]        \n\t"
        /* Divide iLo by two, moving C flag into high 8th bit, old 1st bit now
           in
           C */
        "ror %A[i]        \n\t"
        /* Add C flag */
        "adc %A[i], __zero_reg__\n\t"
        "adc %B[i], __zero_reg__\n\t"
        : [i] "+r"(i)
        : [j] "r"(j));
    return i;
#else
#error "No implementation for avg16r available."
#endif
}

/// Calculate an integer average of two signed 7-bit
/// integers (int8_t).
/// If the first argument is even, result is rounded down.
/// If the first argument is odd, result is rounded up.
/// @param i first value to average
/// @param j second value to average
/// @returns mean average of i and j, rounded
LIB8STATIC_ALWAYS_INLINE int8_t avg7(int8_t i, int8_t j) {
#if AVG7_C == 1
    return (i >> 1) + (j >> 1) + (i & 0x1);
#elif AVG7_AVRASM == 1
    asm volatile("asr %1        \n\t"
                 "asr %0        \n\t"
                 "adc %0, %1    \n\t"
                 : "+r"(i)
                 : "r"(j));
    return i;
#else
#error "No implementation for avg7 available."
#endif
}

/// Calculate an integer average of two signed 15-bit
/// integers (int16_t).
/// If the first argument is even, result is rounded down.
/// If the first argument is odd, result is rounded up.
/// @param i first value to average
/// @param j second value to average
/// @returns mean average of i and j, rounded
LIB8STATIC_ALWAYS_INLINE int16_t avg15(int16_t i, int16_t j) {
#if AVG15_C == 1
    return (i >> 1) + (j >> 1) + (i & 0x1);
#elif AVG15_AVRASM == 1
    asm volatile(
        /* first divide j by 2, throwing away lowest bit */
        "asr %B[j]          \n\t"
        "ror %A[j]          \n\t"
        /* now divide i by 2, with lowest bit going into C */
        "asr %B[i]          \n\t"
        "ror %A[i]          \n\t"
        /* add j + C to i */
        "adc %A[i], %A[j]   \n\t"
        "adc %B[i], %B[j]   \n\t"
        : [i] "+r"(i)
        : [j] "r"(j));
    return i;
#else
#error "No implementation for avg15 available."
#endif
}

/// Calculate the remainder of one unsigned 8-bit
/// value divided by anoter, aka A % M.
/// Implemented by repeated subtraction, which is
/// very compact, and very fast if A is "probably"
/// less than M.  If A is a large multiple of M,
/// the loop has to execute multiple times.  However,
/// even in that case, the loop is only two
/// instructions long on AVR, i.e., quick.
/// @param a dividend byte
/// @param m divisor byte
/// @returns remainder of a / m (i.e. a % m)
LIB8STATIC_ALWAYS_INLINE uint8_t mod8(uint8_t a, uint8_t m) {
#if defined(__AVR__)
    asm volatile("L_%=:  sub %[a],%[m]    \n\t"
                 "       brcc L_%=        \n\t"
                 "       add %[a],%[m]    \n\t"
                 : [a] "+r"(a)
                 : [m] "r"(m));
#else
    while (a >= m)
        a -= m;
#endif
    return a;
}

/// Add two numbers, and calculate the modulo
/// of the sum and a third number, M.
/// In other words, it returns (A+B) % M.
/// It is designed as a compact mechanism for
/// incrementing a "mode" switch and wrapping
/// around back to "mode 0" when the switch
/// goes past the end of the available range.
/// e.g. if you have seven modes, this switches
/// to the next one and wraps around if needed:
///   @code{.cpp}
///   mode = addmod8( mode, 1, 7);
///   @endcode
/// @param a dividend byte
/// @param b value to add to the dividend
/// @param m divisor byte
/// @returns remainder of (a + b) / m
/// @see mod8() for notes on performance.
LIB8STATIC uint8_t addmod8(uint8_t a, uint8_t b, uint8_t m) {
#if defined(__AVR__)
    asm volatile("       add %[a],%[b]    \n\t"
                 "L_%=:  sub %[a],%[m]    \n\t"
                 "       brcc L_%=        \n\t"
                 "       add %[a],%[m]    \n\t"
                 : [a] "+r"(a)
                 : [b] "r"(b), [m] "r"(m));
#else
    a += b;
    while (a >= m)
        a -= m;
#endif
    return a;
}

/// Subtract two numbers, and calculate the modulo
/// of the difference and a third number, M.
/// In other words, it returns (A-B) % M.
/// It is designed as a compact mechanism for
/// decrementing a "mode" switch and wrapping
/// around back to "mode 0" when the switch
/// goes past the start of the available range.
/// e.g. if you have seven modes, this switches
/// to the previous one and wraps around if needed:
///   @code{.cpp}
///   mode = submod8( mode, 1, 7);
///   @endcode
/// @param a dividend byte
/// @param b value to subtract from the dividend
/// @param m divisor byte
/// @returns remainder of (a - b) / m
/// @see mod8() for notes on performance.
LIB8STATIC uint8_t submod8(uint8_t a, uint8_t b, uint8_t m) {
#if defined(__AVR__)
    asm volatile("       sub %[a],%[b]    \n\t"
                 "L_%=:  sub %[a],%[m]    \n\t"
                 "       brcc L_%=        \n\t"
                 "       add %[a],%[m]    \n\t"
                 : [a] "+r"(a)
                 : [b] "r"(b), [m] "r"(m));
#else
    a -= b;
    while (a >= m)
        a -= m;
#endif
    return a;
}

/// 8x8 bit multiplication, with 8-bit result.
/// @param i first byte to multiply
/// @param j second byte to multiply
/// @returns the product of i * j
/// @note This does not saturate and may overflow!
LIB8STATIC_ALWAYS_INLINE uint8_t mul8(uint8_t i, uint8_t j) {
#if MUL8_C == 1
    return ((int)i * (int)(j)) & 0xFF;
#elif MUL8_AVRASM == 1
    asm volatile(
        /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
        "mul %0, %1          \n\t"
        /* Extract the LOW 8-bits (r0) */
        "mov %0, r0          \n\t"
        /* Restore r1 to "0"; it's expected to always be that */
        "clr __zero_reg__    \n\t"
        : "+r"(i)
        : "r"(j)
        : "r0", "r1");
    return i;
#else
#error "No implementation for mul8 available."
#endif
}

/// 8x8 bit multiplication with 8-bit result, saturating at 0xFF.
/// @param i first byte to multiply
/// @param j second byte to multiply
/// @returns the product of i * j, capping at 0xFF
LIB8STATIC_ALWAYS_INLINE uint8_t qmul8(uint8_t i, uint8_t j) {
#if QMUL8_C == 1
    unsigned p = (unsigned)i * (unsigned)j;
    if (p > 255)
        p = 255;
    return p;
#elif QMUL8_AVRASM == 1
    asm volatile(
        /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
        "  mul %0, %1          \n\t"
        /* Extract the LOW 8-bits (r0) */
        "  mov %0, r0          \n\t"
        /* If high byte of result is zero, all is well. */
        "  tst r1              \n\t"
        "  breq Lnospill_%=    \n\t"
        /* If high byte of result > 0, saturate to 0xFF */
        "  ldi %0, 0xFF         \n\t"
        "Lnospill_%=:          \n\t"
        /* Restore r1 to "0"; it's expected to always be that */
        "  clr __zero_reg__    \n\t"
        : "+d"(i) // r16-r31, restricted by ldi
        : "r"(j)
        : "r0", "r1");
    return i;
#else
#error "No implementation for qmul8 available."
#endif
}

/// Take the absolute value of a signed 8-bit uint8_t.
LIB8STATIC_ALWAYS_INLINE int8_t abs8(int8_t i) {
#if ABS8_C == 1
    if (i < 0)
        i = -i;
    return i;
#elif ABS8_AVRASM == 1
    asm volatile(
        /* First, check the high bit, and prepare to skip if it's clear */
        "sbrc %0, 7 \n"

        /* Negate the value */
        "neg %0     \n"

        : "+r"(i)
        : "r"(i));
    return i;
#else
#error "No implementation for abs8 available."
#endif
}

/// Square root for 16-bit integers.
/// About three times faster and five times smaller
/// than Arduino's general `sqrt` on AVR.
LIB8STATIC uint8_t sqrt16(uint16_t x) {
    if (x <= 1) {
        return x;
    }

    uint8_t low = 1; // lower bound
    uint8_t hi, mid;

    if (x > 7904) {
        hi = 255;
    } else {
        hi = (x >> 5) + 8; // initial estimate for upper bound
    }

    do {
        mid = (low + hi) >> 1;
        if ((uint16_t)(mid * mid) > x) {
            hi = mid - 1;
        } else {
            if (mid == 255) {
                return 255;
            }
            low = mid + 1;
        }
    } while (hi >= low);

    return low - 1;
}

LIB8STATIC_ALWAYS_INLINE uint8_t sqrt8(uint8_t x) {
    return sqrt16(map8_to_16(x));
}

/// Blend a variable proportion (0-255) of one byte to another.
/// @param a the starting byte value
/// @param b the byte value to blend toward
/// @param amountOfB the proportion (0-255) of b to blend
/// @returns a byte value between a and b, inclusive
#if (FASTLED_BLEND_FIXED == 1)
LIB8STATIC uint8_t blend8(uint8_t a, uint8_t b, uint8_t amountOfB) {

    // The BLEND_FIXED formula is
    //
    //   result = (  A*(amountOfA) + B*(amountOfB)              )/ 256
    //
    // …where amountOfA = 255-amountOfB.
    //
    // This formula will never return 255, which is why the BLEND_FIXED +
    // SCALE8_FIXED version is
    //
    //   result = (  A*(amountOfA) + A + B*(amountOfB) + B      ) / 256
    //
    // We can rearrange this formula for some great optimisations.
    //
    //   result = (  A*(amountOfA) + A + B*(amountOfB) + B      ) / 256
    //          = (  A*(255-amountOfB) + A + B*(amountOfB) + B  ) / 256
    //          = (  A*(256-amountOfB) + B*(amountOfB) + B      ) / 256
    //          = (  A*256 + B + B*(amountOfB) - A*(amountOfB)  ) / 256  // this
    //          is the version used in SCALE8_FIXED AVR below = (  A*256 + B +
    //          (B-A)*(amountOfB)              ) / 256  // this is the version
    //          used in SCALE8_FIXED C below

    uint16_t partial;
    uint8_t result;

#if BLEND8_C == 1

#if (FASTLED_SCALE8_FIXED == 1)
    partial = (a << 8) | b; // A*256 + B

    // on many platforms this compiles to a single multiply of (B-A) * amountOfB
    partial += (b * amountOfB);
    partial -= (a * amountOfB);

#else
    uint8_t amountOfA = 255 - amountOfB;

    // on the other hand, this compiles to two multiplies, and gives the "wrong"
    // answer :]
    partial = (a * amountOfA);
    partial += (b * amountOfB);
#endif

    result = partial >> 8;

    return result;

#elif BLEND8_AVRASM == 1

#if (FASTLED_SCALE8_FIXED == 1)

    // 1 or 2 cycles depending on how the compiler optimises
    partial = (a << 8) | b;

    // 7 cycles
    asm volatile("  mul %[a], %[amountOfB]        \n\t"
                 "  sub %A[partial], r0           \n\t"
                 "  sbc %B[partial], r1           \n\t"
                 "  mul %[b], %[amountOfB]        \n\t"
                 "  add %A[partial], r0           \n\t"
                 "  adc %B[partial], r1           \n\t"
                 "  clr __zero_reg__              \n\t"
                 : [partial] "+r"(partial)
                 : [amountOfB] "r"(amountOfB), [a] "r"(a), [b] "r"(b)
                 : "r0", "r1");

#else

    // non-SCALE8-fixed version

    // 7 cycles
    asm volatile(
        /* partial = b * amountOfB */
        "  mul %[b], %[amountOfB]        \n\t"
        "  movw %A[partial], r0          \n\t"

        /* amountOfB (aka amountOfA) = 255 - amountOfB */
        "  com %[amountOfB]              \n\t"

        /* partial += a * amountOfB (aka amountOfA) */
        "  mul %[a], %[amountOfB]        \n\t"

        "  add %A[partial], r0           \n\t"
        "  adc %B[partial], r1           \n\t"

        "  clr __zero_reg__              \n\t"

        : [partial] "=r"(partial), [amountOfB] "+r"(amountOfB)
        : [a] "r"(a), [b] "r"(b)
        : "r0", "r1");

#endif

    result = partial >> 8;

    return result;

#else
#error "No implementation for blend8 available."
#endif
}

#else
LIB8STATIC uint8_t blend8(uint8_t a, uint8_t b, uint8_t amountOfB) {
    // This version loses precision in the integer math
    // and can actually return results outside of the range
    // from a to b.  Its use is not recommended.
    uint8_t result;
    uint8_t amountOfA = 255 - amountOfB;
    result = scale8_LEAVING_R1_DIRTY(a, amountOfA) +
             scale8_LEAVING_R1_DIRTY(b, amountOfB);
    cleanup_R1();
    return result;
}
#endif

/// @} Math
/// @} lib8tion

FASTLED_NAMESPACE_END

FL_DISABLE_WARNING_POP