Merge branch 'MK3_3.12_Optimizations' of https://github.com/leptun/Prusa-Firmware into MK3_3.12_Optimizations

2022-02-01 18:37:57 +00:00 · 2022-02-01 18:37:57 +00:00 · 8681d84c44
commit 8681d84c44
parent 53dfcf9d6f aec2db7683
1 changed files with 31 additions and 26 deletions
--- a/Firmware/speed_lookuptable.h
+++ b/Firmware/speed_lookuptable.h
@@ -8,30 +8,36 @@ extern const uint16_t speed_lookuptable_slow[256][2] PROGMEM;
 #ifndef _NO_ASM
-// intRes = intIn1 * intIn2 >> 8
+// return ((x * y) >> 8) with rounding when shifting right
-// uses:
+FORCE_INLINE uint16_t MUL8x16R8(uint8_t x, uint16_t y) {
-// r26 to store 0
+    uint16_t out;
-// r27 to store the byte 1 of the 24 bit result
+    __asm__ (
-#define MultiU16X8toH16(intRes, charIn1, intIn2) \
+    // %0 out
-asm volatile ( \
+    // %1 x
-"clr r26 \n\t" \
+    // %2 y
-"mul %A1, %B2 \n\t" \
+    // uint8_t: %An or %n
-"movw %A0, r0 \n\t" \
+    // uint16_t: %Bn %An
-"mul %A1, %A2 \n\t" \
+    // __uint24: %Cn %Bn %An
-"add %A0, r1 \n\t" \
+    // uint32_t: %Dn %Cn %Bn %An
-"adc %B0, r26 \n\t" \
+    //
-"lsr r0 \n\t" \
+    //
-"adc %A0, r26 \n\t" \
+    //    B2 A2 *
-"adc %B0, r26 \n\t" \
+    //       A1
-"clr r1 \n\t" \
+    //---------
-: \
+    // B0 A0 RR
-"=&r" (intRes) \
+    "mul %B2, %A1" "\n\t"
-: \
+    "movw %0, r0" "\n\t"
-"d" (charIn1), \
+    "mul %A2, %A1" "\n\t"
-"d" (intIn2) \
+    "lsl r0" "\n\t"         //push MSB to carry for rounding
-: \
+    "adc %A0, r1" "\n\t"    //add with carry (for rounding)
-"r26" \
+    "clr r1" "\n\t"         //make r1 __zero_reg__ again
-)
+    "adc %B0, r1" "\n\t"    //propagate carry of addition (add 0 with carry)
+    : "=&r" (out)
+    : "r" (x), "r" (y)
+    : "r0", "r1"            //clobbers: Technically these are either scratch registers or always 0 registers, but I'm making sure the compiler knows just in case.
+    );
+    return out;
+}
 // intRes = longIn1 * longIn2 >> 24
 // uses:
@@ -115,8 +121,7 @@ FORCE_INLINE unsigned short calc_timer(uint16_t step_rate, uint8_t& step_loops)
    unsigned short table_address = (unsigned short)&speed_lookuptable_fast[(unsigned char)(step_rate>>8)][0];
    unsigned char tmp_step_rate = (step_rate & 0x00ff);
    uint16_t gain = (uint16_t)pgm_read_word_near(table_address+2);
-    MultiU16X8toH16(timer, tmp_step_rate, gain);
+    timer = (unsigned short)pgm_read_word_near(table_address) - MUL8x16R8(tmp_step_rate, gain);
-    timer = (unsigned short)pgm_read_word_near(table_address) - timer;
  }
  else { // lower step rates
    unsigned short table_address = (unsigned short)&speed_lookuptable_slow[0][0];