1 files changed, 542 insertions, 0 deletions
diff --git a/lib/lib8tion/scale8.h b/lib/lib8tion/scale8.h
new file mode 100644
index 000000000..9895fd4d7
--- /dev/null
+++ b/lib/lib8tion/scale8.h
@@ -0,0 +1,542 @@
+#ifndef __INC_LIB8TION_SCALE_H
+#define __INC_LIB8TION_SCALE_H
+///@ingroup lib8tion
+///@defgroup Scaling Scaling functions
+/// Fast, efficient 8-bit scaling functions specifically
+/// designed for high-performance LED programming.
+///
+/// Because of the AVR(Arduino) and ARM assembly language
+/// implementations provided, using these functions often
+/// results in smaller and faster code than the equivalent
+/// program using plain "C" arithmetic and logic.
+///@{
+///  Scale one byte by a second one, which is treated as
+///  the numerator of a fraction whose denominator is 256.
+///  In other words, it computes i * (scale / 256).
+///  When FASTLED_SCALE8_FIXED == 1 the result is (i * (scale + 1)) >> 8,
+///  so a scale of 255 returns i unchanged; otherwise it is (i * scale) >> 8.
+///  4 clocks AVR with MUL, 2 clocks ARM
+///  @param i     the value to scale
+///  @param scale the numerator of the scale fraction (scale / 256)
+///  @returns the scaled value
+LIB8STATIC_ALWAYS_INLINE uint8_t scale8( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1
+#if (FASTLED_SCALE8_FIXED == 1)
+    return (((uint16_t)i) * (1+(uint16_t)(scale))) >> 8;
+#else
+    return ((uint16_t)i * (uint16_t)(scale) ) >> 8;
+#endif
+#elif SCALE8_AVRASM == 1
+#if defined(LIB8_ATTINY)
+    // Shift-and-add multiply that keeps only the high byte of the product.
+#if (FASTLED_SCALE8_FIXED == 1)
+    // Pre-loaded with i: this is the answer when scale + 1 wraps to zero.
+    uint8_t work=i;
+#else
+    uint8_t work=0;
+#endif
+    // Single marker bit; it is shifted out after eight iterations.
+    uint8_t cnt=0x80;
+    asm volatile(
+#if (FASTLED_SCALE8_FIXED == 1)
+        // scale += 1; if that wrapped to zero (scale was 255) return work == i
+        "  inc %[scale]                 \n\t"
+        "  breq DONE_%=                 \n\t"
+        "  clr %[work]                  \n\t"
+#endif
+        // Neither 'inc' nor 'clr' touches the carry flag, so clear it here:
+        // when bit 0 of scale is clear the 'add' is skipped and the first
+        // 'ror' would otherwise rotate a stale carry into the result.
+        "  clc                          \n\t"
+        "LOOP_%=:                       \n\t"
+        /*"  sbrc %[scale], 0             \n\t"
+        "  add %[work], %[i]            \n\t"
+        "  ror %[work]                  \n\t"
+        "  lsr %[scale]                 \n\t"
+        "  clc                          \n\t"*/
+        // if the current low bit of scale is set, add i into the accumulator
+        "  sbrc %[scale], 0             \n\t"
+        "  add %[work], %[i]            \n\t"
+        // shift the accumulator right, pulling in the carry from the add
+        "  ror %[work]                  \n\t"
+        "  lsr %[scale]                 \n\t"
+        // carry becomes set only when the marker bit leaves cnt (8th pass)
+        "  lsr %[cnt]                   \n\t"
+        "brcc LOOP_%=                   \n\t"
+        "DONE_%=:                       \n\t"
+        // scale is modified by 'inc'/'lsr', so it must be a read-write operand
+        : [work] "+r" (work), [cnt] "+r" (cnt), [scale] "+r" (scale)
+        : [i] "r" (i)
+        :
+      );
+    return work;
+#else
+    asm volatile(
+#if (FASTLED_SCALE8_FIXED==1)
+        // Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0
+        "mul %0, %1          \n\t"
+        // Add i to r0, possibly setting the carry flag
+        "add r0, %0         \n\t"
+        // load the immediate 0 into i (note, this does _not_ touch any flags)
+        "ldi %0, 0x00       \n\t"
+        // i = 0 + r1 + carry, i.e. the high byte of (i * scale) + i
+        "adc %0, r1          \n\t"
+#else
+         /* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
+         "mul %0, %1          \n\t"
+         /* Move the high 8-bits of the product (r1) back to i */
+         "mov %0, r1          \n\t"
+         /* Restore r1 to "0"; it's expected to always be that */
+#endif
+         "clr __zero_reg__    \n\t"
+         : "+a" (i)      /* writes to i */
+         : "a"  (scale)  /* uses scale */
+         : "r0", "r1"    /* clobbers r0, r1 */ );
+    /* Return the result */
+    return i;
+#endif
+#else
+#error "No implementation for scale8 available."
+#endif
+}
+///  The "video" version of scale8 guarantees that the output will
+///  only be zero if one or both of the inputs are zero.  If both
+///  inputs are non-zero, the output is guaranteed to be non-zero.
+///  This makes for better 'video'/LED dimming, at the cost of
+///  several additional cycles.  Computes ((i * scale) >> 8), plus 1 when both i and scale are non-zero.
+LIB8STATIC_ALWAYS_INLINE uint8_t scale8_video( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1 || defined(LIB8_ATTINY)
+    uint8_t j = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);
+    // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    // uint8_t j = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
+    return j;
+#elif SCALE8_AVRASM == 1
+    uint8_t j=0;
+    asm volatile(
+        "  tst %[i]\n\t"                /* i == 0: branch to the end, leaving j == 0 */
+        "  breq L_%=\n\t"
+        "  mul %[i], %[scale]\n\t"      /* r1:r0 = i * scale */
+        "  mov %[j], r1\n\t"            /* j = high byte of the product */
+        "  clr __zero_reg__\n\t"        /* restore r1 to zero */
+        "  cpse %[scale], r1\n\t"       /* scale == 0: skip the increment below */
+        "  subi %[j], 0xFF\n\t"         /* j += 1 (subtracting 0xFF wraps to +1) */
+        "L_%=: \n\t"
+        : [j] "+a" (j)
+        : [i] "a" (i), [scale] "a" (scale)
+        : "r0", "r1");
+    return j;
+    // Earlier variant, kept for reference:
+    // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    // asm volatile(
+    //      "      tst %0           \n"
+    //      "      breq L_%=        \n"
+    //      "      mul %0, %1       \n"
+    //      "      mov %0, r1       \n"
+    //      "      add %0, %2       \n"
+    //      "      clr __zero_reg__ \n"
+    //      "L_%=:                  \n"
+    //      : "+a" (i)
+    //      : "a" (scale), "a" (nonzeroscale)
+    //      : "r0", "r1");
+    // // Return the result
+    // return i;
+#else
+#error "No implementation for scale8_video available."
+#endif
+}
+/// This version of scale8 does not clean up the R1 register on AVR.
+/// If you are doing several 'scale8's in a row, use this, and
+/// then explicitly call cleanup_R1.  Returns the same value scale8 computes: (i * scale) >> 8, or (i * (scale + 1)) >> 8 when FASTLED_SCALE8_FIXED == 1.
+LIB8STATIC_ALWAYS_INLINE uint8_t scale8_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1
+#if (FASTLED_SCALE8_FIXED == 1)
+    return (((uint16_t)i) * ((uint16_t)(scale)+1)) >> 8;
+#else
+    return ((int)i * (int)(scale) ) >> 8;
+#endif
+#elif SCALE8_AVRASM == 1
+    asm volatile(
+      #if (FASTLED_SCALE8_FIXED==1)
+              // Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0
+              "mul %0, %1          \n\t"
+              // Add i to r0, possibly setting the carry flag
+              "add r0, %0         \n\t"
+              // load the immediate 0 into i (note, this does _not_ touch any flags)
+              "ldi %0, 0x00       \n\t"
+              // i = 0 + r1 + carry, i.e. the high byte of (i * scale) + i
+              "adc %0, r1          \n\t"
+      #else
+         /* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
+         "mul %0, %1    \n\t"
+         /* Move the high 8-bits of the product (r1) back to i */
+         "mov %0, r1    \n\t"
+      #endif
+         /* R1 IS LEFT DIRTY HERE; YOU MUST ZERO IT OUT YOURSELF (see cleanup_R1) */
+         /* "clr __zero_reg__    \n\t" */
+         : "+a" (i)      /* writes to i */
+         : "a"  (scale)  /* uses scale */
+         : "r0", "r1"    /* clobbers r0, r1 */ );
+    // Return the result; r1 still holds the high byte of the last product
+    return i;
+#else
+#error "No implementation for scale8_LEAVING_R1_DIRTY available."
+#endif
+}
+/// This version of scale8_video does not clean up the R1 register on AVR.
+/// If you are doing several 'scale8_video's in a row, use this, and
+/// then explicitly call cleanup_R1.  Computes ((i * scale) >> 8), plus 1 when both i and scale are non-zero.
+LIB8STATIC_ALWAYS_INLINE uint8_t scale8_video_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1 || defined(LIB8_ATTINY)
+    uint8_t j = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);
+    // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    // uint8_t j = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
+    return j;
+#elif SCALE8_AVRASM == 1
+    uint8_t j=0;
+    asm volatile(
+        "  tst %[i]\n\t"                /* i == 0: branch to the end, leaving j == 0 */
+        "  breq L_%=\n\t"
+        "  mul %[i], %[scale]\n\t"      /* r1:r0 = i * scale; Z is set when the product is zero */
+        "  mov %[j], r1\n\t"            /* j = high byte of the product (mov leaves the flags alone) */
+        "  breq L_%=\n\t"               /* zero product (scale == 0, as i != 0 here): no increment */
+        "  subi %[j], 0xFF\n\t"         /* j += 1 (subtracting 0xFF wraps to +1) */
+        "L_%=: \n\t"
+        : [j] "+a" (j)
+        : [i] "a" (i), [scale] "a" (scale)
+        : "r0", "r1");
+    return j;
+    // Earlier variant, kept for reference:
+    // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    // asm volatile(
+    //      "      tst %0           \n"
+    //      "      breq L_%=        \n"
+    //      "      mul %0, %1       \n"
+    //      "      mov %0, r1       \n"
+    //      "      add %0, %2       \n"
+    //      "      clr __zero_reg__ \n"
+    //      "L_%=:                  \n"
+    //      : "+a" (i)
+    //      : "a" (scale), "a" (nonzeroscale)
+    //      : "r0", "r1");
+    // // Return the result
+    // return i;
+#else
+#error "No implementation for scale8_video_LEAVING_R1_DIRTY available."
+#endif
+}
+/// Clean up the r1 register after a series of *LEAVING_R1_DIRTY calls. Compiles to nothing unless CLEANUP_R1_AVRASM == 1.
+LIB8STATIC_ALWAYS_INLINE void cleanup_R1(void)
+{
+#if CLEANUP_R1_AVRASM == 1
+    // Restore r1 to "0"; it's expected to always be that
+    asm volatile( "clr __zero_reg__  \n\t" : : : "r1" );
+#endif
+}
+/// Scale a 16-bit unsigned value by an 8-bit value,
+/// considered as numerator of a fraction whose denominator
+/// is 256. In other words, it computes i * (scale / 256).
+/// With FASTLED_SCALE8_FIXED == 1 the result is (i * (scale + 1)) >> 8,
+/// otherwise (i * scale) >> 8.
+/// @param i     the 16-bit value to scale
+/// @param scale the numerator of the scale fraction (scale / 256)
+/// @returns the scaled 16-bit value
+LIB8STATIC_ALWAYS_INLINE uint16_t scale16by8( uint16_t i, fract8 scale )
+{
+#if SCALE16BY8_C == 1
+    uint16_t result;
+    // Widen to 32 bits before multiplying: the product needs up to 24 bits
+    // and would be truncated on targets where int is only 16 bits wide.
+#if FASTLED_SCALE8_FIXED == 1
+    result = ((uint32_t)(i) * (1+(uint32_t)(scale))) >> 8;
+#else
+    result = ((uint32_t)(i) * (uint32_t)(scale)) / 256;
+#endif
+    return result;
+#elif SCALE16BY8_AVRASM == 1
+#if FASTLED_SCALE8_FIXED == 1
+    uint16_t result = 0;
+    asm volatile(
+                 // result.A = HighByte( (i.A x scale) + i.A )
+                 "  mul %A[i], %[scale]                 \n\t"
+                 "  add r0, %A[i]                       \n\t"
+            //   "  adc r1, [zero]                      \n\t"
+            //   "  mov %A[result], r1                  \n\t"
+                 // result.A starts at 0, so this stores r1 plus the carry
+                 "  adc %A[result], r1                  \n\t"
+                 // result.A-B += i.B x scale
+                 "  mul %B[i], %[scale]                 \n\t"
+                 "  add %A[result], r0                  \n\t"
+                 "  adc %B[result], r1                  \n\t"
+                 // cleanup r1
+                 "  clr __zero_reg__                    \n\t"
+                 // result.A-B += i.B
+                 "  add %A[result], %B[i]               \n\t"
+                 "  adc %B[result], __zero_reg__        \n\t"
+                 : [result] "+r" (result)
+                 : [i] "r" (i), [scale] "r" (scale)
+                 : "r0", "r1"
+                 );
+    return result;
+#else
+    uint16_t result = 0;
+    asm volatile(
+         // result.A = HighByte(i.A x j )
+         "  mul %A[i], %[scale]                 \n\t"
+         "  mov %A[result], r1                  \n\t"
+         //"  clr %B[result]                      \n\t"
+         // result.A-B += i.B x j
+         "  mul %B[i], %[scale]                 \n\t"
+         "  add %A[result], r0                  \n\t"
+         "  adc %B[result], r1                  \n\t"
+         // cleanup r1
+         "  clr __zero_reg__                    \n\t"
+         : [result] "+r" (result)
+         : [i] "r" (i), [scale] "r" (scale)
+         : "r0", "r1"
+         );
+    return result;
+#endif
+#else
+    #error "No implementation for scale16by8 available."
+#endif
+}
+/// Scale a 16-bit unsigned value by a 16-bit value,
+/// considered as numerator of a fraction whose denominator
+/// is 65536. In other words, it computes i * (scale / 65536); with FASTLED_SCALE8_FIXED == 1 it is (i * (scale + 1)) / 65536.
+LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale )
+{
+  #if SCALE16_C == 1
+    uint16_t result;
+#if FASTLED_SCALE8_FIXED == 1
+    result = ((uint32_t)(i) * (1+(uint32_t)(scale))) / 65536;
+#else
+    result = ((uint32_t)(i) * (uint32_t)(scale)) / 65536;
+#endif
+    return result;
+#elif SCALE16_AVRASM == 1
+#if FASTLED_SCALE8_FIXED == 1
+    // implemented sort of like
+    //   result = ((i * scale) + i ) / 65536
+    //
+    // why not like this, you may ask?
+    //   result = (i * (scale+1)) / 65536
+    // the answer is that if scale is 65535, then scale+1
+    // will be zero, which is not what we want.
+    uint32_t result;
+    asm volatile(
+                 // result.A-B  = i.A x scale.A
+                 "  mul %A[i], %A[scale]                 \n\t"
+                 //  save results...
+                 // basic idea:
+                 //"  mov %A[result], r0                 \n\t"
+                 //"  mov %B[result], r1                 \n\t"
+                 // which can be written as...
+                 "  movw %A[result], r0                   \n\t"
+                 // Because we're going to add i.A-B to
+                 // result.A-D, we DO need to keep both
+                 // the r0 and r1 portions of the product
+                 // UNlike in the 'unfixed scale8' version.
+                 // So the movw here is needed.
+                 : [result] "=r" (result)
+                 : [i] "r" (i),
+                 [scale] "r" (scale)
+                 : "r0", "r1"
+                 );
+    asm volatile(
+                 // result.C-D  = i.B x scale.B
+                 "  mul %B[i], %B[scale]                 \n\t"
+                 //"  mov %C[result], r0                 \n\t"
+                 //"  mov %D[result], r1                 \n\t"
+                 "  movw %C[result], r0                   \n\t"
+                 : [result] "+r" (result)
+                 : [i] "r" (i),
+                 [scale] "r" (scale)
+                 : "r0", "r1"
+                 );
+    const uint8_t  zero = 0;    // zero in a register, used to propagate carries
+    asm volatile(
+                 // result.B-D += i.B x scale.A
+                 "  mul %B[i], %A[scale]                 \n\t"
+                 "  add %B[result], r0                   \n\t"
+                 "  adc %C[result], r1                   \n\t"
+                 "  adc %D[result], %[zero]              \n\t"
+                 // result.B-D += i.A x scale.B
+                 "  mul %A[i], %B[scale]                 \n\t"
+                 "  add %B[result], r0                   \n\t"
+                 "  adc %C[result], r1                   \n\t"
+                 "  adc %D[result], %[zero]              \n\t"
+                 // cleanup r1 (restore it to zero)
+                 "  clr r1                               \n\t"
+                 : [result] "+r" (result)
+                 : [i] "r" (i),
+                 [scale] "r" (scale),
+                 [zero] "r" (zero)
+                 : "r0", "r1"
+                 );
+    asm volatile(
+                 // result.A-D += i.A-B   (the "+ i" term of the fixed formula)
+                 "  add %A[result], %A[i]                \n\t"
+                 "  adc %B[result], %B[i]                \n\t"
+                 "  adc %C[result], %[zero]              \n\t"
+                 "  adc %D[result], %[zero]              \n\t"
+                 : [result] "+r" (result)
+                 : [i] "r" (i),
+                 [zero] "r" (zero)
+                 );
+    result = result >> 16;      // keep the upper 16 bits, i.e. divide by 65536
+    return result;
+#else
+    uint32_t result;
+    asm volatile(
+                 // result.A-B  = i.A x scale.A
+                 "  mul %A[i], %A[scale]                 \n\t"
+                 //  save results...
+                 // basic idea:
+                 //"  mov %A[result], r0                 \n\t"
+                 //"  mov %B[result], r1                 \n\t"
+                 // which can be written as...
+                 "  movw %A[result], r0                   \n\t"
+                 // We actually don't need to do anything with r0,
+                 // as result.A is never used again here, so we
+                 // could just move the high byte, but movw is
+                 // one clock cycle, just like mov, so might as
+                 // well, in case we want to use this code for
+                 // a generic 16x16 multiply somewhere.
+                 : [result] "=r" (result)
+                 : [i] "r" (i),
+                   [scale] "r" (scale)
+                 : "r0", "r1"
+                 );
+    asm volatile(
+                 // result.C-D  = i.B x scale.B
+                 "  mul %B[i], %B[scale]                 \n\t"
+                 //"  mov %C[result], r0                 \n\t"
+                 //"  mov %D[result], r1                 \n\t"
+                 "  movw %C[result], r0                   \n\t"
+                 : [result] "+r" (result)
+                 : [i] "r" (i),
+                   [scale] "r" (scale)
+                 : "r0", "r1"
+                 );
+    const uint8_t  zero = 0;    // zero in a register, used to propagate carries
+    asm volatile(
+                 // result.B-D += i.B x scale.A
+                 "  mul %B[i], %A[scale]                 \n\t"
+                 "  add %B[result], r0                   \n\t"
+                 "  adc %C[result], r1                   \n\t"
+                 "  adc %D[result], %[zero]              \n\t"
+                 // result.B-D += i.A x scale.B
+                 "  mul %A[i], %B[scale]                 \n\t"
+                 "  add %B[result], r0                   \n\t"
+                 "  adc %C[result], r1                   \n\t"
+                 "  adc %D[result], %[zero]              \n\t"
+                 // cleanup r1 (restore it to zero)
+                 "  clr r1                               \n\t"
+                 : [result] "+r" (result)
+                 : [i] "r" (i),
+                   [scale] "r" (scale),
+                   [zero] "r" (zero)
+                 : "r0", "r1"
+                 );
+    result = result >> 16;      // keep the upper 16 bits, i.e. divide by 65536
+    return result;
+#endif
+#else
+    #error "No implementation for scale16 available."
+#endif
+}
+///@}
+///@defgroup Dimming Dimming and brightening functions
+///
+/// Dimming and brightening functions
+///
+/// The eye does not respond in a linear way to light.
+/// High speed PWM'd LEDs at 50% duty cycle appear far
+/// brighter than the 'half as bright' you might expect.
+///
+/// If you want your midpoint brightness level (128) to
+/// appear half as bright as 'full' brightness (255), you
+/// have to apply a 'dimming function'.
+///@{
+/// Dim a brightness value by scaling it by itself: scale8( x, x).
+LIB8STATIC uint8_t dim8_raw( uint8_t x)
+{
+    const uint8_t dimmed = scale8( x, x);
+    return dimmed;
+}
+/// Dim a brightness value for video by scaling it by itself with
+/// scale8_video( x, x), so a non-zero input never dims all the way to zero.
+LIB8STATIC uint8_t dim8_video( uint8_t x)
+{
+    const uint8_t dimmed = scale8_video( x, x);
+    return dimmed;
+}
+/// Dimming function that is linear in its lower half: values below 128
+/// are halved (rounding up), values of 128 and above use scale8( x, x).
+LIB8STATIC uint8_t dim8_lin( uint8_t x )
+{
+    // Top bit clear means x < 128, so x + 1 cannot overflow a byte.
+    if( (x & 0x80) == 0 ) {
+        return (uint8_t)((x + 1) / 2);
+    }
+    return scale8( x, x);
+}
+/// Brighten a value: dims the complement (255 - x) with scale8 and
+/// returns the complement of that result.
+LIB8STATIC uint8_t brighten8_raw( uint8_t x)
+{
+    const uint8_t inverted = 255 - x;
+    const uint8_t dimmed = scale8( inverted, inverted);
+    return 255 - dimmed;
+}
+/// Brighten a value (video variant): dims the complement (255 - x) with
+/// scale8_video and returns the complement of that result.
+LIB8STATIC uint8_t brighten8_video( uint8_t x)
+{
+    const uint8_t inverted = 255 - x;
+    const uint8_t dimmed = scale8_video( inverted, inverted);
+    return 255 - dimmed;
+}
+/// Brighten a value (linear variant): applies the dim8_lin curve to the
+/// complement (255 - x) and returns the complement of that result.
+LIB8STATIC uint8_t brighten8_lin( uint8_t x )
+{
+    const uint8_t inverted = 255 - x;
+    // 128 and above: scale by itself; below 128: halve, rounding up.
+    const uint8_t dimmed = (inverted >= 0x80)
+        ? scale8( inverted, inverted)
+        : (uint8_t)((inverted + 1) / 2);
+    return 255 - dimmed;
+}
+///@}
+#endif

diff --git a/lib/lib8tion/scale8.h b/lib/lib8tion/scale8.h new file mode 100644 index 000000000..9895fd4d7 --- /dev/null +++ b/lib/lib8tion/scale8.h
@@ -0,0 +1,542 @@
	1	#ifndef __INC_LIB8TION_SCALE_H
	2	#define __INC_LIB8TION_SCALE_H
	3
	4	///@ingroup lib8tion
	5
	6	///@defgroup Scaling Scaling functions
	7	/// Fast, efficient 8-bit scaling functions specifically
	8	/// designed for high-performance LED programming.
	9	///
	10	/// Because of the AVR(Arduino) and ARM assembly language
	11	/// implementations provided, using these functions often
	12	/// results in smaller and faster code than the equivalent
	13	/// program using plain "C" arithmetic and logic.
	14	///@{
	15
	16	/// Scale one byte by a second one, which is treated as
	17	/// the numerator of a fraction whose denominator is 256.
	18	/// In other words, it computes i * (scale / 256); with FASTLED_SCALE8_FIXED == 1 it is (i * (scale + 1)) >> 8.
	19	/// 4 clocks AVR with MUL, 2 clocks ARM
	20	LIB8STATIC_ALWAYS_INLINE uint8_t scale8( uint8_t i, fract8 scale)
	21	{
	22	#if SCALE8_C == 1
	23	#if (FASTLED_SCALE8_FIXED == 1)
	24	return (((uint16_t)i) * (1+(uint16_t)(scale))) >> 8;
	25	#else
	26	return ((uint16_t)i * (uint16_t)(scale) ) >> 8;
	27	#endif
	28	#elif SCALE8_AVRASM == 1
	29	#if defined(LIB8_ATTINY)
	30	#if (FASTLED_SCALE8_FIXED == 1)
	31	uint8_t work=i; // the answer when scale + 1 wraps to zero (scale == 255)
	32	#else
	33	uint8_t work=0;
	34	#endif
	35	uint8_t cnt=0x80; // marker bit, shifted out after eight iterations
	36	asm volatile(
	37	#if (FASTLED_SCALE8_FIXED == 1)
	38	" inc %[scale] \n\t"
	39	" breq DONE_%= \n\t"
	40	" clr %[work] \n\t"
	41	#endif
	42	" clc \n\t" "LOOP_%=: \n\t" // inc/clr leave carry untouched: clear it so the first 'ror' cannot rotate in a stale carry
	43	/*" sbrc %[scale], 0 \n\t"
	44	" add %[work], %[i] \n\t"
	45	" ror %[work] \n\t"
	46	" lsr %[scale] \n\t"
	47	" clc \n\t"*/
	48	" sbrc %[scale], 0 \n\t"
	49	" add %[work], %[i] \n\t"
	50	" ror %[work] \n\t"
	51	" lsr %[scale] \n\t"
	52	" lsr %[cnt] \n\t"
	53	"brcc LOOP_%= \n\t"
	54	"DONE_%=: \n\t"
	55	: [work] "+r" (work), [cnt] "+r" (cnt), [scale] "+r" (scale) // scale is modified by inc/lsr, so it is read-write
	56	: [i] "r" (i)
	57	:
	58	);
	59	return work;
	60	#else
	61	asm volatile(
	62	#if (FASTLED_SCALE8_FIXED==1)
	63	// Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0
	64	"mul %0, %1 \n\t"
	65	// Add i to r0, possibly setting the carry flag
	66	"add r0, %0 \n\t"
	67	// load the immediate 0 into i (note, this does _not_ touch any flags)
	68	"ldi %0, 0x00 \n\t"
	69	// i = 0 + r1 + carry, i.e. the high byte of (i * scale) + i
	70	"adc %0, r1 \n\t"
	71	#else
	72	/* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
	73	"mul %0, %1 \n\t"
	74	/* Move the high 8-bits of the product (r1) back to i */
	75	"mov %0, r1 \n\t"
	76	/* Restore r1 to "0"; it's expected to always be that */
	77	#endif
	78	"clr __zero_reg__ \n\t"
	79
	80	: "+a" (i) /* writes to i */
	81	: "a" (scale) /* uses scale */
	82	: "r0", "r1" /* clobbers r0, r1 */ );
	83
	84	/* Return the result */
	85	return i;
	86	#endif
	87	#else
	88	#error "No implementation for scale8 available."
	89	#endif
	90	}
	91
	92
	93	/// The "video" version of scale8 guarantees that the output will
	94	/// only be zero if one or both of the inputs are zero. If both
	95	/// inputs are non-zero, the output is guaranteed to be non-zero.
	96	/// This makes for better 'video'/LED dimming, at the cost of
	97	/// several additional cycles. Computes ((i * scale) >> 8), plus 1 when both i and scale are non-zero.
	98	LIB8STATIC_ALWAYS_INLINE uint8_t scale8_video( uint8_t i, fract8 scale)
	99	{
	100	#if SCALE8_C == 1 \|\| defined(LIB8_ATTINY)
	101	uint8_t j = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);
	102	// uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
	103	// uint8_t j = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
	104	return j;
	105	#elif SCALE8_AVRASM == 1
	106	uint8_t j=0;
	107	asm volatile(
	108	" tst %[i]\n\t" /* i == 0: branch to the end, leaving j == 0 */
	109	" breq L_%=\n\t"
	110	" mul %[i], %[scale]\n\t" /* r1:r0 = i * scale */
	111	" mov %[j], r1\n\t" /* j = high byte of the product */
	112	" clr __zero_reg__\n\t" /* restore r1 to zero */
	113	" cpse %[scale], r1\n\t" /* scale == 0: skip the increment below */
	114	" subi %[j], 0xFF\n\t" /* j += 1 (subtracting 0xFF wraps to +1) */
	115	"L_%=: \n\t"
	116	: [j] "+a" (j)
	117	: [i] "a" (i), [scale] "a" (scale)
	118	: "r0", "r1");
	119
	120	return j;
	121	// uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
	122	// asm volatile(
	123	// " tst %0 \n"
	124	// " breq L_%= \n"
	125	// " mul %0, %1 \n"
	126	// " mov %0, r1 \n"
	127	// " add %0, %2 \n"
	128	// " clr __zero_reg__ \n"
	129	// "L_%=: \n"
	130
	131	// : "+a" (i)
	132	// : "a" (scale), "a" (nonzeroscale)
	133	// : "r0", "r1");
	134
	135	// // Return the result
	136	// return i;
	137	#else
	138	#error "No implementation for scale8_video available."
	139	#endif
	140	}
	141
	142
	143	/// This version of scale8 does not clean up the R1 register on AVR.
	144	/// If you are doing several 'scale8's in a row, use this, and
	145	/// then explicitly call cleanup_R1. Returns (i * scale) >> 8, or (i * (scale + 1)) >> 8 when FASTLED_SCALE8_FIXED == 1.
	146	LIB8STATIC_ALWAYS_INLINE uint8_t scale8_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
	147	{
	148	#if SCALE8_C == 1
	149	#if (FASTLED_SCALE8_FIXED == 1)
	150	return (((uint16_t)i) * ((uint16_t)(scale)+1)) >> 8;
	151	#else
	152	return ((int)i * (int)(scale) ) >> 8;
	153	#endif
	154	#elif SCALE8_AVRASM == 1
	155	asm volatile(
	156	#if (FASTLED_SCALE8_FIXED==1)
	157	// Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0
	158	"mul %0, %1 \n\t"
	159	// Add i to r0, possibly setting the carry flag
	160	"add r0, %0 \n\t"
	161	// load the immediate 0 into i (note, this does _not_ touch any flags)
	162	"ldi %0, 0x00 \n\t"
	163	// i = 0 + r1 + carry, i.e. the high byte of (i * scale) + i
	164	"adc %0, r1 \n\t"
	165	#else
	166	/* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
	167	"mul %0, %1 \n\t"
	168	/* Move the high 8-bits of the product (r1) back to i */
	169	"mov %0, r1 \n\t"
	170	#endif
	171	/* R1 IS LEFT DIRTY HERE; YOU MUST ZERO IT OUT YOURSELF (see cleanup_R1) */
	172	/* "clr __zero_reg__ \n\t" */
	173
	174	: "+a" (i) /* writes to i */
	175	: "a" (scale) /* uses scale */
	176	: "r0", "r1" /* clobbers r0, r1 */ );
	177
	178	// Return the result; r1 still holds the high byte of the last product
	179	return i;
	180	#else
	181	#error "No implementation for scale8_LEAVING_R1_DIRTY available."
	182	#endif
	183	}
	184
	185
	186	/// This version of scale8_video does not clean up the R1 register on AVR.
	187	/// If you are doing several 'scale8_video's in a row, use this, and
	188	/// then explicitly call cleanup_R1. Computes ((i * scale) >> 8), plus 1 when both i and scale are non-zero.
	189	LIB8STATIC_ALWAYS_INLINE uint8_t scale8_video_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
	190	{
	191	#if SCALE8_C == 1 \|\| defined(LIB8_ATTINY)
	192	uint8_t j = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);
	193	// uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
	194	// uint8_t j = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
	195	return j;
	196	#elif SCALE8_AVRASM == 1
	197	uint8_t j=0;
	198	asm volatile(
	199	" tst %[i]\n\t" /* i == 0: branch to the end, leaving j == 0 */
	200	" breq L_%=\n\t"
	201	" mul %[i], %[scale]\n\t" /* r1:r0 = i * scale; Z is set when the product is zero */
	202	" mov %[j], r1\n\t" /* j = high byte of the product (mov leaves the flags alone) */
	203	" breq L_%=\n\t" /* zero product (scale == 0, as i != 0 here): no increment */
	204	" subi %[j], 0xFF\n\t" /* j += 1 (subtracting 0xFF wraps to +1) */
	205	"L_%=: \n\t"
	206	: [j] "+a" (j)
	207	: [i] "a" (i), [scale] "a" (scale)
	208	: "r0", "r1");
	209
	210	return j;
	211	// uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
	212	// asm volatile(
	213	// " tst %0 \n"
	214	// " breq L_%= \n"
	215	// " mul %0, %1 \n"
	216	// " mov %0, r1 \n"
	217	// " add %0, %2 \n"
	218	// " clr __zero_reg__ \n"
	219	// "L_%=: \n"
	220
	221	// : "+a" (i)
	222	// : "a" (scale), "a" (nonzeroscale)
	223	// : "r0", "r1");
	224
	225	// // Return the result
	226	// return i;
	227	#else
	228	#error "No implementation for scale8_video_LEAVING_R1_DIRTY available."
	229	#endif
	230	}
	231
	232	/// Clean up the r1 register after a series of *LEAVING_R1_DIRTY calls. Compiles to nothing unless CLEANUP_R1_AVRASM == 1.
	233	LIB8STATIC_ALWAYS_INLINE void cleanup_R1(void)
	234	{
	235	#if CLEANUP_R1_AVRASM == 1
	236	// Restore r1 to "0"; it's expected to always be that
	237	asm volatile( "clr __zero_reg__ \n\t" : : : "r1" );
	238	#endif
	239	}
	240
	241
	242	/// Scale a 16-bit unsigned value by an 8-bit value,
	243	/// considered as numerator of a fraction whose denominator
	244	/// is 256. In other words, it computes i * (scale / 256); with FASTLED_SCALE8_FIXED == 1 it is (i * (scale + 1)) >> 8.
	245
	246	LIB8STATIC_ALWAYS_INLINE uint16_t scale16by8( uint16_t i, fract8 scale )
	247	{
	248	#if SCALE16BY8_C == 1
	249	uint16_t result; // the product below is widened to 32 bits: it needs up to 24 bits and would be truncated where int is 16 bits
	250	#if FASTLED_SCALE8_FIXED == 1
	251	result = ((uint32_t)(i) * (1+(uint32_t)(scale))) >> 8;
	252	#else
	253	result = ((uint32_t)(i) * (uint32_t)(scale)) / 256;
	254	#endif
	255	return result;
	256	#elif SCALE16BY8_AVRASM == 1
	257	#if FASTLED_SCALE8_FIXED == 1
	258	uint16_t result = 0;
	259	asm volatile(
	260	// result.A = HighByte( (i.A x scale) + i.A )
	261	" mul %A[i], %[scale] \n\t"
	262	" add r0, %A[i] \n\t"
	263	// " adc r1, [zero] \n\t"
	264	// " mov %A[result], r1 \n\t"
	265	" adc %A[result], r1 \n\t"
	266
	267	// result.A-B += i.B x scale
	268	" mul %B[i], %[scale] \n\t"
	269	" add %A[result], r0 \n\t"
	270	" adc %B[result], r1 \n\t"
	271
	272	// cleanup r1
	273	" clr __zero_reg__ \n\t"
	274
	275	// result.A-B += i.B
	276	" add %A[result], %B[i] \n\t"
	277	" adc %B[result], __zero_reg__ \n\t"
	278
	279	: [result] "+r" (result)
	280	: [i] "r" (i), [scale] "r" (scale)
	281	: "r0", "r1"
	282	);
	283	return result;
	284	#else
	285	uint16_t result = 0;
	286	asm volatile(
	287	// result.A = HighByte(i.A x j )
	288	" mul %A[i], %[scale] \n\t"
	289	" mov %A[result], r1 \n\t"
	290	//" clr %B[result] \n\t"
	291
	292	// result.A-B += i.B x j
	293	" mul %B[i], %[scale] \n\t"
	294	" add %A[result], r0 \n\t"
	295	" adc %B[result], r1 \n\t"
	296
	297	// cleanup r1
	298	" clr __zero_reg__ \n\t"
	299
	300	: [result] "+r" (result)
	301	: [i] "r" (i), [scale] "r" (scale)
	302	: "r0", "r1"
	303	);
	304	return result;
	305	#endif
	306	#else
	307	#error "No implementation for scale16by8 available."
	308	#endif
	309	}
	310
	311	/// Scale a 16-bit unsigned value by a 16-bit value,
	312	/// considered as numerator of a fraction whose denominator
	313	/// is 65536. In other words, it computes i * (scale / 65536); with FASTLED_SCALE8_FIXED == 1 it is (i * (scale + 1)) / 65536.
	314
	315	LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale )
	316	{
	317	#if SCALE16_C == 1
	318	uint16_t result;
	319	#if FASTLED_SCALE8_FIXED == 1
	320	result = ((uint32_t)(i) * (1+(uint32_t)(scale))) / 65536;
	321	#else
	322	result = ((uint32_t)(i) * (uint32_t)(scale)) / 65536;
	323	#endif
	324	return result;
	325	#elif SCALE16_AVRASM == 1
	326	#if FASTLED_SCALE8_FIXED == 1
	327	// implemented sort of like
	328	// result = ((i * scale) + i ) / 65536
	329	//
	330	// why not like this, you may ask?
	331	// result = (i * (scale+1)) / 65536
	332	// the answer is that if scale is 65535, then scale+1
	333	// will be zero, which is not what we want.
	334	uint32_t result;
	335	asm volatile(
	336	// result.A-B = i.A x scale.A
	337	" mul %A[i], %A[scale] \n\t"
	338	// save results...
	339	// basic idea:
	340	//" mov %A[result], r0 \n\t"
	341	//" mov %B[result], r1 \n\t"
	342	// which can be written as...
	343	" movw %A[result], r0 \n\t"
	344	// Because we're going to add i.A-B to
	345	// result.A-D, we DO need to keep both
	346	// the r0 and r1 portions of the product
	347	// UNlike in the 'unfixed scale8' version.
	348	// So the movw here is needed.
	349	: [result] "=r" (result)
	350	: [i] "r" (i),
	351	[scale] "r" (scale)
	352	: "r0", "r1"
	353	);
	354
	355	asm volatile(
	356	// result.C-D = i.B x scale.B
	357	" mul %B[i], %B[scale] \n\t"
	358	//" mov %C[result], r0 \n\t"
	359	//" mov %D[result], r1 \n\t"
	360	" movw %C[result], r0 \n\t"
	361	: [result] "+r" (result)
	362	: [i] "r" (i),
	363	[scale] "r" (scale)
	364	: "r0", "r1"
	365	);
	366
	367	const uint8_t zero = 0; // zero in a register, used to propagate carries
	368	asm volatile(
	369	// result.B-D += i.B x scale.A
	370	" mul %B[i], %A[scale] \n\t"
	371
	372	" add %B[result], r0 \n\t"
	373	" adc %C[result], r1 \n\t"
	374	" adc %D[result], %[zero] \n\t"
	375
	376	// result.B-D += i.A x scale.B
	377	" mul %A[i], %B[scale] \n\t"
	378
	379	" add %B[result], r0 \n\t"
	380	" adc %C[result], r1 \n\t"
	381	" adc %D[result], %[zero] \n\t"
	382
	383	// cleanup r1 (restore it to zero)
	384	" clr r1 \n\t"
	385
	386	: [result] "+r" (result)
	387	: [i] "r" (i),
	388	[scale] "r" (scale),
	389	[zero] "r" (zero)
	390	: "r0", "r1"
	391	);
	392
	393	asm volatile(
	394	// result.A-D += i.A-B (the "+ i" term of the fixed formula)
	395	" add %A[result], %A[i] \n\t"
	396	" adc %B[result], %B[i] \n\t"
	397	" adc %C[result], %[zero] \n\t"
	398	" adc %D[result], %[zero] \n\t"
	399	: [result] "+r" (result)
	400	: [i] "r" (i),
	401	[zero] "r" (zero)
	402	);
	403
	404	result = result >> 16; // keep the upper 16 bits, i.e. divide by 65536
	405	return result;
	406	#else
	407	uint32_t result;
	408	asm volatile(
	409	// result.A-B = i.A x scale.A
	410	" mul %A[i], %A[scale] \n\t"
	411	// save results...
	412	// basic idea:
	413	//" mov %A[result], r0 \n\t"
	414	//" mov %B[result], r1 \n\t"
	415	// which can be written as...
	416	" movw %A[result], r0 \n\t"
	417	// We actually don't need to do anything with r0,
	418	// as result.A is never used again here, so we
	419	// could just move the high byte, but movw is
	420	// one clock cycle, just like mov, so might as
	421	// well, in case we want to use this code for
	422	// a generic 16x16 multiply somewhere.
	423
	424	: [result] "=r" (result)
	425	: [i] "r" (i),
	426	[scale] "r" (scale)
	427	: "r0", "r1"
	428	);
	429
	430	asm volatile(
	431	// result.C-D = i.B x scale.B
	432	" mul %B[i], %B[scale] \n\t"
	433	//" mov %C[result], r0 \n\t"
	434	//" mov %D[result], r1 \n\t"
	435	" movw %C[result], r0 \n\t"
	436	: [result] "+r" (result)
	437	: [i] "r" (i),
	438	[scale] "r" (scale)
	439	: "r0", "r1"
	440	);
	441
	442	const uint8_t zero = 0; // zero in a register, used to propagate carries
	443	asm volatile(
	444	// result.B-D += i.B x scale.A
	445	" mul %B[i], %A[scale] \n\t"
	446
	447	" add %B[result], r0 \n\t"
	448	" adc %C[result], r1 \n\t"
	449	" adc %D[result], %[zero] \n\t"
	450
	451	// result.B-D += i.A x scale.B
	452	" mul %A[i], %B[scale] \n\t"
	453
	454	" add %B[result], r0 \n\t"
	455	" adc %C[result], r1 \n\t"
	456	" adc %D[result], %[zero] \n\t"
	457
	458	// cleanup r1 (restore it to zero)
	459	" clr r1 \n\t"
	460
	461	: [result] "+r" (result)
	462	: [i] "r" (i),
	463	[scale] "r" (scale),
	464	[zero] "r" (zero)
	465	: "r0", "r1"
	466	);
	467
	468	result = result >> 16; // keep the upper 16 bits, i.e. divide by 65536
	469	return result;
	470	#endif
	471	#else
	472	#error "No implementation for scale16 available."
	473	#endif
	474	}
	475	///@}
	476
	477	///@defgroup Dimming Dimming and brightening functions
	478	///
	479	/// Dimming and brightening functions
	480	///
	481	/// The eye does not respond in a linear way to light.
	482	/// High speed PWM'd LEDs at 50% duty cycle appear far
	483	/// brighter than the 'half as bright' you might expect.
	484	///
	485	/// If you want your midpoint brightness level (128) to
	486	/// appear half as bright as 'full' brightness (255), you
	487	/// have to apply a 'dimming function'.
	488	///@{
	489
	490	/// Dim a brightness value by scaling it by itself: returns scale8( x, x).
	491	LIB8STATIC uint8_t dim8_raw( uint8_t x)
	492	{
	493	return scale8( x, x);
	494	}
	495
	496	/// Dim a brightness value for video: returns scale8_video( x, x), so a non-zero input never dims all the way to zero.
	497	LIB8STATIC uint8_t dim8_video( uint8_t x)
	498	{
	499	return scale8_video( x, x);
	500	}
	501
	502	/// Dimming function that is linear in its lower half: values below 128 are halved (rounding up), values of 128 and above use scale8( x, x).
	503	LIB8STATIC uint8_t dim8_lin( uint8_t x )
	504	{
	505	if( x & 0x80 ) { // top bit set: x >= 128
	506	x = scale8( x, x);
	507	} else {
	508	x += 1; // cannot overflow: x < 128 here
	509	x /= 2;
	510	}
	511	return x;
	512	}
	513
	514	/// Brighten a value: dims the complement (255 - x) with scale8 and returns the complement of that result.
	515	LIB8STATIC uint8_t brighten8_raw( uint8_t x)
	516	{
	517	uint8_t ix = 255 - x; // complement of the input
	518	return 255 - scale8( ix, ix);
	519	}
	520
	521	/// Brighten a value (video variant): dims the complement (255 - x) with scale8_video and returns the complement of that result.
	522	LIB8STATIC uint8_t brighten8_video( uint8_t x)
	523	{
	524	uint8_t ix = 255 - x; // complement of the input
	525	return 255 - scale8_video( ix, ix);
	526	}
	527
	528	/// Brighten a value (linear variant): applies the dim8_lin curve to the complement (255 - x) and returns the complement of that result.
	529	LIB8STATIC uint8_t brighten8_lin( uint8_t x )
	530	{
	531	uint8_t ix = 255 - x; // complement of the input
	532	if( ix & 0x80 ) { // top bit set: ix >= 128
	533	ix = scale8( ix, ix);
	534	} else {
	535	ix += 1; // cannot overflow: ix < 128 here
	536	ix /= 2;
	537	}
	538	return 255 - ix;
	539	}
	540
	541	///@}
	542	#endif