Diffstat (limited to 'lib/lib8tion/math8.h')
-rw-r--r--  lib/lib8tion/math8.h  552
1 file changed, 552 insertions, 0 deletions
diff --git a/lib/lib8tion/math8.h b/lib/lib8tion/math8.h
new file mode 100644
index 000000000..8c6b6c227
--- /dev/null
+++ b/lib/lib8tion/math8.h
@@ -0,0 +1,552 @@
#ifndef __INC_LIB8TION_MATH_H
#define __INC_LIB8TION_MATH_H

#include "scale8.h"

///@ingroup lib8tion

///@defgroup Math Basic math operations
/// Fast, efficient 8-bit math functions specifically
/// designed for high-performance LED programming.
///
/// Because of the AVR (Arduino) and ARM assembly language
/// implementations provided, using these functions often
/// results in smaller and faster code than the equivalent
/// program using plain "C" arithmetic and logic.
///@{


/// add one byte to another, saturating at 0xFF
/// @param i - first byte to add
/// @param j - second byte to add
/// @returns the sum of i & j, capped at 0xFF
LIB8STATIC_ALWAYS_INLINE uint8_t qadd8( uint8_t i, uint8_t j)
{
#if QADD8_C == 1
    uint16_t t = i + j;
    if (t > 255) t = 255;
    return t;
#elif QADD8_AVRASM == 1
    asm volatile(
        /* First, add j to i, conditioning the C flag */
        "add %0, %1    \n\t"

        /* Now test the C flag.
           If C is clear, we branch around a load of 0xFF into i.
           If C is set, we go ahead and load 0xFF into i.
         */
        "brcc L_%=     \n\t"
        "ldi %0, 0xFF  \n\t"
        "L_%=: "
        : "+a" (i)
        : "a" (j) );
    return i;
#elif QADD8_ARM_DSP_ASM == 1
    asm volatile( "uqadd8 %0, %0, %1" : "+r" (i) : "r" (j));
    return i;
#else
#error "No implementation for qadd8 available."
#endif
}
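
// Worked example (illustrative comment, not part of the original source):
//   qadd8( 200, 100) == 255;  the true sum, 300, is capped at 0xFF.
//   qadd8( 100, 100) == 200;  no saturation needed.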

/// Add one byte to another, saturating at 0x7F
/// @param i - first byte to add
/// @param j - second byte to add
/// @returns the sum of i & j, capped at 0x7F
LIB8STATIC_ALWAYS_INLINE int8_t qadd7( int8_t i, int8_t j)
{
#if QADD7_C == 1
    int16_t t = i + j;
    if (t > 127) t = 127;
    return t;
#elif QADD7_AVRASM == 1
    asm volatile(
        /* First, add j to i, conditioning the V flag */
        "add %0, %1    \n\t"

        /* Now test the V flag.
           If V is clear, we branch around a load of 0x7F into i.
           If V is set, we go ahead and load 0x7F into i.
         */
        "brvc L_%=     \n\t"
        "ldi %0, 0x7F  \n\t"
        "L_%=: "
        : "+a" (i)
        : "a" (j) );

    return i;
#elif QADD7_ARM_DSP_ASM == 1
    asm volatile( "qadd8 %0, %0, %1" : "+r" (i) : "r" (j));
    return i;
#else
#error "No implementation for qadd7 available."
#endif
}
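
// Worked example (illustrative comment, not part of the original source):
//   qadd7( 100, 100) == 127;  the true sum, 200, is capped at 0x7F.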

/// subtract one byte from another, saturating at 0x00
/// @returns i - j with a floor of 0
LIB8STATIC_ALWAYS_INLINE uint8_t qsub8( uint8_t i, uint8_t j)
{
#if QSUB8_C == 1
    int16_t t = i - j;
    if (t < 0) t = 0;
    return t;
#elif QSUB8_AVRASM == 1

    asm volatile(
        /* First, subtract j from i, conditioning the C flag */
        "sub %0, %1    \n\t"

        /* Now test the C flag.
           If C is clear, we branch around a load of 0x00 into i.
           If C is set, we go ahead and load 0x00 into i.
         */
        "brcc L_%=     \n\t"
        "ldi %0, 0x00  \n\t"
        "L_%=: "
        : "+a" (i)
        : "a" (j) );

    return i;
#else
#error "No implementation for qsub8 available."
#endif
}
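
// Worked example (illustrative comment, not part of the original source):
//   qsub8( 10, 50) == 0;  the true difference, -40, is floored at 0x00.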

/// add one byte to another, with one byte result
LIB8STATIC_ALWAYS_INLINE uint8_t add8( uint8_t i, uint8_t j)
{
#if ADD8_C == 1
    uint16_t t = i + j;
    return t;
#elif ADD8_AVRASM == 1
    // Add j to i, period.
    asm volatile( "add %0, %1" : "+a" (i) : "a" (j));
    return i;
#else
#error "No implementation for add8 available."
#endif
}

/// add one byte to a two-byte value, with a two-byte result
LIB8STATIC_ALWAYS_INLINE uint16_t add8to16( uint8_t i, uint16_t j)
{
#if ADD8_C == 1
    uint16_t t = i + j;
    return t;
#elif ADD8_AVRASM == 1
    // Add i(one byte) to j(two bytes)
    asm volatile( "add %A[j], %[i]          \n\t"
                  "adc %B[j], __zero_reg__  \n\t"
                  : [j] "+a" (j)
                  : [i] "a" (i)
                  );
    return j;
#else
#error "No implementation for add8to16 available."
#endif
}
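
// Worked example (illustrative comment, not part of the original source):
//   add8to16( 1, 0x00FF) == 0x0100;  the single byte is added into the 16-bit value.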


/// subtract one byte from another, 8-bit result
LIB8STATIC_ALWAYS_INLINE uint8_t sub8( uint8_t i, uint8_t j)
{
#if SUB8_C == 1
    int16_t t = i - j;
    return t;
#elif SUB8_AVRASM == 1
    // Subtract j from i, period.
    asm volatile( "sub %0, %1" : "+a" (i) : "a" (j));
    return i;
#else
#error "No implementation for sub8 available."
#endif
}

/// Calculate an integer average of two unsigned
/// 8-bit integer values (uint8_t).
/// Fractional results are rounded down, e.g. avg8(20,41) = 30
LIB8STATIC_ALWAYS_INLINE uint8_t avg8( uint8_t i, uint8_t j)
{
#if AVG8_C == 1
    return (i + j) >> 1;
#elif AVG8_AVRASM == 1
    asm volatile(
        /* First, add j to i, 9th bit overflows into C flag */
        "add %0, %1    \n\t"
        /* Divide by two, moving C flag into high 8th bit */
        "ror %0        \n\t"
        : "+a" (i)
        : "a" (j) );
    return i;
#else
#error "No implementation for avg8 available."
#endif
}

/// Calculate an integer average of two unsigned
/// 16-bit integer values (uint16_t).
/// Fractional results are rounded down, e.g. avg16(20,41) = 30
LIB8STATIC_ALWAYS_INLINE uint16_t avg16( uint16_t i, uint16_t j)
{
#if AVG16_C == 1
    return (uint32_t)((uint32_t)(i) + (uint32_t)(j)) >> 1;
#elif AVG16_AVRASM == 1
    asm volatile(
        /* First, add jLo (heh) to iLo, 9th bit overflows into C flag */
        "add %A[i], %A[j]    \n\t"
        /* Now, add C + jHi to iHi, 17th bit overflows into C flag */
        "adc %B[i], %B[j]    \n\t"
        /* Divide iHi by two, moving C flag into high 16th bit, old 9th bit now in C */
        "ror %B[i]           \n\t"
        /* Divide iLo by two, moving C flag into high 8th bit */
        "ror %A[i]           \n\t"
        : [i] "+a" (i)
        : [j] "a" (j) );
    return i;
#else
#error "No implementation for avg16 available."
#endif
}


/// Calculate an integer average of two signed 7-bit
/// integers (int8_t).
/// If the first argument is even, the result is rounded down.
/// If the first argument is odd, the result is rounded up.
LIB8STATIC_ALWAYS_INLINE int8_t avg7( int8_t i, int8_t j)
{
#if AVG7_C == 1
    return ((i + j) >> 1) + (i & 0x1);
#elif AVG7_AVRASM == 1
    asm volatile(
        "asr %1        \n\t"
        "asr %0        \n\t"
        "adc %0, %1    \n\t"
        : "+a" (i)
        : "a" (j) );
    return i;
#else
#error "No implementation for avg7 available."
#endif
}
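
// Worked examples of the rounding rule (illustrative comment, not part of the original source):
//   avg7( 5, 10) == 8;  first argument odd, so 7.5 rounds up.
//   avg7( 4, 11) == 7;  first argument even, so 7.5 rounds down.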

/// Calculate an integer average of two signed 15-bit
/// integers (int16_t).
/// If the first argument is even, the result is rounded down.
/// If the first argument is odd, the result is rounded up.
LIB8STATIC_ALWAYS_INLINE int16_t avg15( int16_t i, int16_t j)
{
#if AVG15_C == 1
    return ((int32_t)((int32_t)(i) + (int32_t)(j)) >> 1) + (i & 0x1);
#elif AVG15_AVRASM == 1
    asm volatile(
        /* first divide j by 2, throwing away lowest bit */
        "asr %B[j]          \n\t"
        "ror %A[j]          \n\t"
        /* now divide i by 2, with lowest bit going into C */
        "asr %B[i]          \n\t"
        "ror %A[i]          \n\t"
        /* add j + C to i */
        "adc %A[i], %A[j]   \n\t"
        "adc %B[i], %B[j]   \n\t"
        : [i] "+a" (i)
        : [j] "a" (j) );
    return i;
#else
#error "No implementation for avg15 available."
#endif
}
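
// Worked examples of the rounding rule (illustrative comment, not part of the original source):
//   avg15( 301, 100) == 201;  first argument odd, so 200.5 rounds up.
//   avg15( 300, 101) == 200;  first argument even, so 200.5 rounds down.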


/// Calculate the remainder of one unsigned 8-bit
/// value divided by another, aka A % M.
/// Implemented by repeated subtraction, which is
/// very compact, and very fast if A is 'probably'
/// less than M. If A is a large multiple of M,
/// the loop has to execute multiple times. However,
/// even in that case, the loop is only two
/// instructions long on AVR, i.e., quick.
LIB8STATIC_ALWAYS_INLINE uint8_t mod8( uint8_t a, uint8_t m)
{
#if defined(__AVR__)
    asm volatile (
        "L_%=:  sub %[a],%[m]    \n\t"
        "       brcc L_%=        \n\t"
        "       add %[a],%[m]    \n\t"
        : [a] "+r" (a)
        : [m] "r" (m)
        );
#else
    while( a >= m) a -= m;
#endif
    return a;
}
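
// Worked example (illustrative comment, not part of the original source):
//   mod8( 12, 5) == 2;  the loop subtracts 5 until the value drops below 5,
//   then adds 5 back once to undo the final, overshooting subtraction.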

/// Add two numbers, and calculate the modulo
/// of the sum and a third number, M.
/// In other words, it returns (A+B) % M.
/// It is designed as a compact mechanism for
/// incrementing a 'mode' switch and wrapping
/// around back to 'mode 0' when the switch
/// goes past the end of the available range.
/// e.g. if you have seven modes, this switches
/// to the next one and wraps around if needed:
///   mode = addmod8( mode, 1, 7);
/// See 'mod8' for notes on performance.
LIB8STATIC uint8_t addmod8( uint8_t a, uint8_t b, uint8_t m)
{
#if defined(__AVR__)
    asm volatile (
        "       add %[a],%[b]    \n\t"
        "L_%=:  sub %[a],%[m]    \n\t"
        "       brcc L_%=        \n\t"
        "       add %[a],%[m]    \n\t"
        : [a] "+r" (a)
        : [b] "r" (b), [m] "r" (m)
        );
#else
    a += b;
    while( a >= m) a -= m;
#endif
    return a;
}
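
// Worked example (illustrative comment, not part of the original source):
//   with seven modes (0..6), addmod8( 6, 1, 7) == 0, wrapping back to mode 0.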

/// Subtract two numbers, and calculate the modulo
/// of the difference and a third number, M.
/// In other words, it returns (A-B) % M.
/// It is designed as a compact mechanism for
/// decrementing a 'mode' switch and keeping the
/// result within the available range, e.g.:
///   mode = submod8( mode, 1, 7);
/// See 'mod8' for notes on performance.
LIB8STATIC uint8_t submod8( uint8_t a, uint8_t b, uint8_t m)
{
#if defined(__AVR__)
    asm volatile (
        "       sub %[a],%[b]    \n\t"
        "L_%=:  sub %[a],%[m]    \n\t"
        "       brcc L_%=        \n\t"
        "       add %[a],%[m]    \n\t"
        : [a] "+r" (a)
        : [b] "r" (b), [m] "r" (m)
        );
#else
    a -= b;
    while( a >= m) a -= m;
#endif
    return a;
}
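
// Worked example (illustrative comment, not part of the original source):
//   submod8( 9, 1, 7) == 1, i.e. (9 - 1) % 7.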

/// 8x8 bit multiplication, with 8 bit result
LIB8STATIC_ALWAYS_INLINE uint8_t mul8( uint8_t i, uint8_t j)
{
#if MUL8_C == 1
    return ((uint16_t)i * (uint16_t)(j) ) & 0xFF;
#elif MUL8_AVRASM == 1
    asm volatile(
        /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
        "mul %0, %1          \n\t"
        /* Extract the LOW 8-bits (r0) */
        "mov %0, r0          \n\t"
        /* Restore r1 to "0"; it's expected to always be that */
        "clr __zero_reg__    \n\t"
        : "+a" (i)
        : "a" (j)
        : "r0", "r1");

    return i;
#else
#error "No implementation for mul8 available."
#endif
}
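
// Worked example (illustrative comment, not part of the original source):
//   mul8( 20, 20) == 144;  the full product, 400 (0x0190), keeps only its low byte (0x90).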


/// saturating 8x8 bit multiplication, with 8 bit result
/// @returns the product of i * j, capping at 0xFF
LIB8STATIC_ALWAYS_INLINE uint8_t qmul8( uint8_t i, uint8_t j)
{
#if QMUL8_C == 1
    int p = ((uint16_t)i * (uint16_t)(j) );
    if( p > 255) p = 255;
    return p;
#elif QMUL8_AVRASM == 1
    asm volatile(
        /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
        "   mul %0, %1          \n\t"
        /* If high byte of result is zero, all is well. */
        "   tst r1              \n\t"
        "   breq Lnospill_%=    \n\t"
        /* If high byte of result > 0, saturate low byte to 0xFF */
        "   ldi %0,0xFF         \n\t"
        "   rjmp Ldone_%=       \n\t"
        "Lnospill_%=:           \n\t"
        /* Extract the LOW 8-bits (r0) */
        "   mov %0, r0          \n\t"
        "Ldone_%=:              \n\t"
        /* Restore r1 to "0"; it's expected to always be that */
        "   clr __zero_reg__    \n\t"
        : "+a" (i)
        : "a" (j)
        : "r0", "r1");

    return i;
#else
#error "No implementation for qmul8 available."
#endif
}
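
// Worked example (illustrative comment, not part of the original source):
//   qmul8( 20, 20) == 255;  the full product, 400, is capped at 0xFF,
//   while qmul8( 10, 20) == 200 is returned unchanged.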


/// take abs() of a signed 8-bit int8_t
LIB8STATIC_ALWAYS_INLINE int8_t abs8( int8_t i)
{
#if ABS8_C == 1
    if( i < 0) i = -i;
    return i;
#elif ABS8_AVRASM == 1


    asm volatile(
        /* First, check the high bit, and prepare to skip if it's clear */
        "sbrc %0, 7 \n"

        /* Negate the value */
        "neg %0     \n"

        : "+r" (i) : "r" (i) );
    return i;
#else
#error "No implementation for abs8 available."
#endif
}
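
// Worked example (illustrative comment, not part of the original source):
//   abs8( -42) == 42.  Note that -128 has no positive counterpart in int8_t,
//   so abs8( -128) cannot produce a correct result.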

/// square root for 16-bit integers
/// About three times faster and five times smaller
/// than Arduino's general sqrt on AVR.
LIB8STATIC uint8_t sqrt16(uint16_t x)
{
    if( x <= 1) {
        return x;
    }

    uint8_t low = 1; // lower bound
    uint8_t hi, mid;

    if( x > 7904) {
        hi = 255;
    } else {
        hi = (x >> 5) + 8; // initial estimate for upper bound
    }

    do {
        mid = (low + hi) >> 1;
        if ((uint16_t)(mid * mid) > x) {
            hi = mid - 1;
        } else {
            if( mid == 255) {
                return 255;
            }
            low = mid + 1;
        }
    } while (hi >= low);

    return low - 1;
}
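
// Worked example (illustrative comment, not part of the original source):
//   sqrt16( 100) == 10, and sqrt16( 99) == 9;  fractional results round down.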

/// blend a variable proportion (0-255) of one byte to another
/// @param a - the starting byte value
/// @param b - the byte value to blend toward
/// @param amountOfB - the proportion (0-255) of b to blend
/// @returns a byte value between a and b, inclusive
#if (FASTLED_BLEND_FIXED == 1)
LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
{
#if BLEND8_C == 1
    uint16_t partial;
    uint8_t result;

    uint8_t amountOfA = 255 - amountOfB;

    partial = (a * amountOfA);
#if (FASTLED_SCALE8_FIXED == 1)
    partial += a;
    //partial = add8to16( a, partial);
#endif

    partial += (b * amountOfB);
#if (FASTLED_SCALE8_FIXED == 1)
    partial += b;
    //partial = add8to16( b, partial);
#endif

    result = partial >> 8;

    return result;

#elif BLEND8_AVRASM == 1
    uint16_t partial;
    uint8_t result;

    asm volatile (
        /* partial = b * amountOfB */
        "  mul %[b], %[amountOfB]        \n\t"
        "  movw %A[partial], r0          \n\t"

        /* amountOfB (aka amountOfA) = 255 - amountOfB */
        "  com %[amountOfB]              \n\t"

        /* partial += a * amountOfB (aka amountOfA) */
        "  mul %[a], %[amountOfB]        \n\t"

        "  add %A[partial], r0           \n\t"
        "  adc %B[partial], r1           \n\t"

        "  clr __zero_reg__              \n\t"

#if (FASTLED_SCALE8_FIXED == 1)
        /* partial += a */
        "  add %A[partial], %[a]         \n\t"
        "  adc %B[partial], __zero_reg__ \n\t"

        // partial += b
        "  add %A[partial], %[b]         \n\t"
        "  adc %B[partial], __zero_reg__ \n\t"
#endif

        : [partial] "=r" (partial),
          [amountOfB] "+a" (amountOfB)
        : [a] "a" (a),
          [b] "a" (b)
        : "r0", "r1"
        );

    result = partial >> 8;

    return result;

#else
#error "No implementation for blend8 available."
#endif
}

#else
LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
{
    // This version loses precision in the integer math
    // and can actually return results outside of the range
    // from a to b. Its use is not recommended.
    uint8_t result;
    uint8_t amountOfA = 255 - amountOfB;
    result = scale8_LEAVING_R1_DIRTY( a, amountOfA)
           + scale8_LEAVING_R1_DIRTY( b, amountOfB);
    cleanup_R1();
    return result;
}
#endif
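
// Worked example (illustrative comment, not part of the original source), assuming the
// FASTLED_BLEND_FIXED and FASTLED_SCALE8_FIXED code paths are the ones compiled in:
//   blend8( 0, 255, 128) == 128, blend8( a, b, 0) == a, and blend8( a, b, 255) == b.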


///@}
#endif