From 77966135e81a3f6667de63a04527dc27d1f7fc9a Mon Sep 17 00:00:00 2001
From: X-Ryl669 <boite.pour.spam@gmail.com>
Date: Sat, 6 Feb 2021 05:43:51 +0100
Subject: [PATCH] Fix ARM delay function (#20901)

---
 Marlin/src/HAL/DUE/HAL_SPI.cpp                |   8 +-
 .../dogm/u8g_com_HAL_DUE_st7920_sw_spi.cpp    |   1 +
 .../dogm/u8g_com_HAL_DUE_sw_spi_shared.cpp    |   1 +
 Marlin/src/HAL/STM32/HAL.cpp                  |  11 --
 Marlin/src/HAL/STM32/HAL.h                    |   1 +
 Marlin/src/HAL/STM32/HAL_SPI.cpp              |  30 ++-
 Marlin/src/HAL/shared/Delay.cpp               | 176 ++++++++++++++++++
 Marlin/src/HAL/shared/Delay.h                 | 132 +++++++------
 Marlin/src/MarlinCore.cpp                     |   3 +
 Marlin/src/core/macros.h                      |   2 +
 Marlin/src/gcode/gcode_d.cpp                  |   8 +-
 11 files changed, 279 insertions(+), 94 deletions(-)
 create mode 100644 Marlin/src/HAL/shared/Delay.cpp

diff --git a/Marlin/src/HAL/DUE/HAL_SPI.cpp b/Marlin/src/HAL/DUE/HAL_SPI.cpp
index 342c3737353..6cb2912c12a 100644
--- a/Marlin/src/HAL/DUE/HAL_SPI.cpp
+++ b/Marlin/src/HAL/DUE/HAL_SPI.cpp
@@ -240,7 +240,7 @@
   }
 
   // all the others
-  static uint32_t spiDelayCyclesX4 = (F_CPU) / 1000000; // 4µs => 125khz
+  static uint32_t spiDelayCyclesX4 = 4 * (F_CPU) / 1000000; // 4µs => 125khz
 
   static uint8_t spiTransferX(uint8_t b) { // using Mode 0
     int bits = 8;
@@ -249,12 +249,12 @@
       b <<= 1; // little setup time
 
       WRITE(SD_SCK_PIN, HIGH);
-      __delay_4cycles(spiDelayCyclesX4);
+      DELAY_CYCLES(spiDelayCyclesX4);
 
       b |= (READ(SD_MISO_PIN) != 0);
 
       WRITE(SD_SCK_PIN, LOW);
-      __delay_4cycles(spiDelayCyclesX4);
+      DELAY_CYCLES(spiDelayCyclesX4);
     } while (--bits);
     return b;
   }
@@ -510,7 +510,7 @@
         spiRxBlock = (pfnSpiRxBlock)spiRxBlockX;
         break;
       default:
-        spiDelayCyclesX4 = ((F_CPU) / 1000000) >> (6 - spiRate);
+        spiDelayCyclesX4 = ((F_CPU) / 1000000) >> (6 - spiRate) << 2; // spiRate of 2 gives the maximum error with current CPU
         spiTransferTx = (pfnSpiTransfer)spiTransferX;
         spiTransferRx = (pfnSpiTransfer)spiTransferX;
         spiTxBlock = (pfnSpiTxBlock)spiTxBlockX;
diff --git a/Marlin/src/HAL/DUE/dogm/u8g_com_HAL_DUE_st7920_sw_spi.cpp b/Marlin/src/HAL/DUE/dogm/u8g_com_HAL_DUE_st7920_sw_spi.cpp
index 7df180cbaa8..d01cd4dd6b2 100644
--- a/Marlin/src/HAL/DUE/dogm/u8g_com_HAL_DUE_st7920_sw_spi.cpp
+++ b/Marlin/src/HAL/DUE/dogm/u8g_com_HAL_DUE_st7920_sw_spi.cpp
@@ -59,6 +59,7 @@
 
 #if ENABLED(U8GLIB_ST7920)
 
+#include "../../../inc/MarlinConfig.h"
 #include "../../shared/Delay.h"
 
 #include <U8glib.h>
diff --git a/Marlin/src/HAL/DUE/dogm/u8g_com_HAL_DUE_sw_spi_shared.cpp b/Marlin/src/HAL/DUE/dogm/u8g_com_HAL_DUE_sw_spi_shared.cpp
index 615a386c354..4fb7a6e2c31 100644
--- a/Marlin/src/HAL/DUE/dogm/u8g_com_HAL_DUE_sw_spi_shared.cpp
+++ b/Marlin/src/HAL/DUE/dogm/u8g_com_HAL_DUE_sw_spi_shared.cpp
@@ -59,6 +59,7 @@
 
 #if HAS_MARLINUI_U8GLIB
 
+#include "../../../inc/MarlinConfig.h"
 #include "../../shared/Delay.h"
 
 #include <U8glib.h>
diff --git a/Marlin/src/HAL/STM32/HAL.cpp b/Marlin/src/HAL/STM32/HAL.cpp
index 06744f16b83..e694a16a820 100644
--- a/Marlin/src/HAL/STM32/HAL.cpp
+++ b/Marlin/src/HAL/STM32/HAL.cpp
@@ -57,17 +57,6 @@ uint16_t HAL_adc_result;
 // Public functions
 // ------------------------
 
-// Needed for DELAY_NS() / DELAY_US() on CORTEX-M7
-#if (defined(__arm__) || defined(__thumb__)) && __CORTEX_M == 7
-  // HAL pre-initialization task
-  // Force the preinit function to run between the premain() and main() function
-  // of the STM32 arduino core
-  __attribute__((constructor (102)))
-  void HAL_preinit() {
-    enableCycleCounter();
-  }
-#endif
-
 // HAL initialization task
 void HAL_init() {
   FastIO_init();
diff --git a/Marlin/src/HAL/STM32/HAL.h b/Marlin/src/HAL/STM32/HAL.h
index 8f6c0a5990d..be0cc309626 100644
--- a/Marlin/src/HAL/STM32/HAL.h
+++ b/Marlin/src/HAL/STM32/HAL.h
@@ -194,6 +194,7 @@ void flashFirmware(const int16_t);
 typedef void (*systickCallback_t)(void);
 void systick_attach_callback(systickCallback_t cb);
 void HAL_SYSTICK_Callback();
+
 extern volatile uint32_t systick_uptime_millis;
 
 #define HAL_CAN_SET_PWM_FREQ   // This HAL supports PWM Frequency adjustment
diff --git a/Marlin/src/HAL/STM32/HAL_SPI.cpp b/Marlin/src/HAL/STM32/HAL_SPI.cpp
index eef480777bb..c9f23e6fa31 100644
--- a/Marlin/src/HAL/STM32/HAL_SPI.cpp
+++ b/Marlin/src/HAL/STM32/HAL_SPI.cpp
@@ -51,18 +51,28 @@ static SPISettings spiConfig;
     OUT_WRITE(SD_MOSI_PIN, HIGH);
   }
 
-  static uint16_t delay_STM32_soft_spi;
+  // Use function with compile-time value so we can actually reach the desired frequency
+  // Need to adjust this a little bit: on a 72MHz clock, we have 14ns/clock
+  // and we'll use ~3 cycles to jump to the method and going back, so it'll take ~40ns from the given clock here
+  #define CALLING_COST_NS  (3U * 1000000000U) / (F_CPU)
+  void (*delaySPIFunc)();
+  void delaySPI_125()  { DELAY_NS(125 - CALLING_COST_NS); }
+  void delaySPI_250()  { DELAY_NS(250 - CALLING_COST_NS); }
+  void delaySPI_500()  { DELAY_NS(500 - CALLING_COST_NS); }
+  void delaySPI_1000() { DELAY_NS(1000 - CALLING_COST_NS); }
+  void delaySPI_2000() { DELAY_NS(2000 - CALLING_COST_NS); }
+  void delaySPI_4000() { DELAY_NS(4000 - CALLING_COST_NS); }
 
   void spiInit(uint8_t spiRate) {
     // Use datarates Marlin uses
     switch (spiRate) {
-      case SPI_FULL_SPEED:   delay_STM32_soft_spi =  125; break;  // desired: 8,000,000  actual: ~1.1M
-      case SPI_HALF_SPEED:   delay_STM32_soft_spi =  125; break;  // desired: 4,000,000  actual: ~1.1M
-      case SPI_QUARTER_SPEED:delay_STM32_soft_spi =  250; break;  // desired: 2,000,000  actual: ~890K
-      case SPI_EIGHTH_SPEED: delay_STM32_soft_spi =  500; break;  // desired: 1,000,000  actual: ~590K
-      case SPI_SPEED_5:      delay_STM32_soft_spi = 1000; break;  // desired:   500,000  actual: ~360K
-      case SPI_SPEED_6:      delay_STM32_soft_spi = 2000; break;  // desired:   250,000  actual: ~210K
-      default:               delay_STM32_soft_spi = 4000; break;  // desired:   125,000  actual: ~123K
+      case SPI_FULL_SPEED:   delaySPIFunc =  &delaySPI_125; break;  // desired: 8,000,000  actual: ~1.1M
+      case SPI_HALF_SPEED:   delaySPIFunc =  &delaySPI_125; break;  // desired: 4,000,000  actual: ~1.1M
+      case SPI_QUARTER_SPEED:delaySPIFunc =  &delaySPI_250; break;  // desired: 2,000,000  actual: ~890K
+      case SPI_EIGHTH_SPEED: delaySPIFunc =  &delaySPI_500; break;  // desired: 1,000,000  actual: ~590K
+      case SPI_SPEED_5:      delaySPIFunc = &delaySPI_1000; break;  // desired:   500,000  actual: ~360K
+      case SPI_SPEED_6:      delaySPIFunc = &delaySPI_2000; break;  // desired:   250,000  actual: ~210K
+      default:               delaySPIFunc = &delaySPI_4000; break;  // desired:   125,000  actual: ~123K
     }
     SPI.begin();
   }
@@ -75,9 +85,9 @@ static SPISettings spiConfig;
       WRITE(SD_SCK_PIN, LOW);
       WRITE(SD_MOSI_PIN, b & 0x80);
 
-      DELAY_NS(delay_STM32_soft_spi);
+      delaySPIFunc();
       WRITE(SD_SCK_PIN, HIGH);
-      DELAY_NS(delay_STM32_soft_spi);
+      delaySPIFunc();
 
       b <<= 1;        // little setup time
       b |= (READ(SD_MISO_PIN) != 0);
diff --git a/Marlin/src/HAL/shared/Delay.cpp b/Marlin/src/HAL/shared/Delay.cpp
new file mode 100644
index 00000000000..1ae1b3e381f
--- /dev/null
+++ b/Marlin/src/HAL/shared/Delay.cpp
@@ -0,0 +1,176 @@
+/**
+ * Marlin 3D Printer Firmware
+ * Copyright (c) 2020 MarlinFirmware [https://github.com/MarlinFirmware/Marlin]
+ *
+ * Based on Sprinter and grbl.
+ * Copyright (c) 2011 Camiel Gubbels / Erik van der Zalm
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ */
+#include "Delay.h"
+
+#include "../../inc/MarlinConfig.h"
+
+#if defined(__arm__) || defined(__thumb__)
+
+  static uint32_t ASM_CYCLES_PER_ITERATION = 4;   // Initial bet which will be adjusted in calibrate_delay_loop
+
+  // Simple assembler loop counting down
+  void delay_asm(uint32_t cy) {
+    cy = _MAX(cy / ASM_CYCLES_PER_ITERATION, 1U); // Zero is forbidden here
+    __asm__ __volatile__(
+      A(".syntax unified") // is to prevent CM0,CM1 non-unified syntax
+      L("1")
+      A("subs %[cnt],#1")
+      A("bne 1b")
+      : [cnt]"+r"(cy)   // output: +r means input+output
+      :                 // input:
+      : "cc"            // clobbers:
+    );
+  }
+
+  // We can't use CMSIS since it's not available on all platform, so fallback to hardcoded register values
+  #define HW_REG(X)   *(volatile uint32_t *)(X)
+  #define _DWT_CTRL   0xE0001000
+  #define _DWT_CYCCNT 0xE0001004      // CYCCNT is 32bits, takes 37s or so to wrap.
+  #define _DEM_CR     0xE000EDFC
+  #define _LAR        0xE0001FB0
+
+  // Use hardware cycle counter instead, it's much safer
+  void delay_dwt(uint32_t count) {
+    // Reuse the ASM_CYCLES_PER_ITERATION variable to avoid wasting another useless variable
+    register uint32_t start = HW_REG(_DWT_CYCCNT) - ASM_CYCLES_PER_ITERATION, elapsed;
+    do {
+      elapsed = HW_REG(_DWT_CYCCNT) - start;
+    } while (elapsed < count);
+  }
+
+  // Pointer to asm function, calling the functions has a ~20 cycles overhead
+  DelayImpl DelayCycleFnc = delay_asm;
+
+  void calibrate_delay_loop() {
+    // Check if we have a working DWT implementation in the CPU (see https://developer.arm.com/documentation/ddi0439/b/Data-Watchpoint-and-Trace-Unit/DWT-Programmers-Model)
+    if (!HW_REG(_DWT_CTRL)) {
+      // No DWT present, so fallback to plain old ASM nop counting
+      // Unfortunately, we don't exactly know how many iteration it'll take to decrement a counter in a loop
+      // It depends on the CPU architecture, the code current position (flash vs SRAM)
+      // So, instead of wild guessing and making mistake, instead
+      // compute it once for all
+      ASM_CYCLES_PER_ITERATION = 1;
+      // We need to fetch some reference clock before waiting
+      cli();
+        uint32_t start = micros();
+        delay_asm(1000); // On a typical CPU running in MHz, waiting 1000 "unknown cycles" means it'll take between 1ms to 6ms, that's perfectly acceptable
+        uint32_t end = micros();
+      sei();
+      uint32_t expectedCycles = (end - start) * ((F_CPU) / 1000000UL); // Convert microseconds to cycles
+      // Finally compute the right scale
+      ASM_CYCLES_PER_ITERATION = (uint32_t)(expectedCycles / 1000);
+
+      // No DWT present, likely a Cortex M0 so NOP counting is our best bet here
+      DelayCycleFnc = delay_asm;
+    }
+    else {
+      // Enable DWT counter
+      // From https://stackoverflow.com/a/41188674/1469714
+      HW_REG(_DEM_CR) = HW_REG(_DEM_CR) | 0x01000000;   // Enable trace
+      #if __CORTEX_M == 7
+        HW_REG(_LAR) = 0xC5ACCE55;                      // Unlock access to DWT registers, see https://developer.arm.com/documentation/ihi0029/e/ section B2.3.10
+      #endif
+      HW_REG(_DWT_CYCCNT) = 0;                          // Clear DWT cycle counter
+      HW_REG(_DWT_CTRL) = HW_REG(_DWT_CTRL) | 1;        // Enable DWT cycle counter
+
+      // Then calibrate the constant offset from the counter
+      ASM_CYCLES_PER_ITERATION = 0;
+      uint32_t s = HW_REG(_DWT_CYCCNT);
+      uint32_t e = HW_REG(_DWT_CYCCNT);  // (e - s) contains the number of cycle required to read the cycle counter
+      delay_dwt(0);
+      uint32_t f = HW_REG(_DWT_CYCCNT);  // (f - e) contains the delay to call the delay function + the time to read the cycle counter
+      ASM_CYCLES_PER_ITERATION = (f - e) - (e - s);
+
+      // Use safer DWT function
+      DelayCycleFnc = delay_dwt;
+    }
+  }
+
+  #if ENABLED(MARLIN_DEV_MODE)
+    void dump_delay_accuracy_check()
+    {
+      auto report_call_time = [](PGM_P const name, const uint32_t cycles, const uint32_t total, const bool do_flush=true) {
+        SERIAL_ECHOPGM("Calling ");
+        serialprintPGM(name);
+        SERIAL_ECHOLNPAIR(" for ", cycles, "cycles took: ", total, "cycles");
+        if (do_flush) SERIAL_FLUSH();
+      };
+
+      uint32_t s, e;
+
+      SERIAL_ECHOLNPAIR("Computed delay calibration value: ", ASM_CYCLES_PER_ITERATION);
+      SERIAL_FLUSH();
+      // Display the results of the calibration above
+      constexpr uint32_t testValues[] = { 1, 5, 10, 20, 50, 100, 150, 200, 350, 500, 750, 1000 };
+      for (auto i : testValues) {
+        s = micros(); DELAY_US(i); e = micros();
+        report_call_time(PSTR("delay"), i, e - s);
+      }
+
+      if (HW_REG(_DWT_CTRL)) {
+        for (auto i : testValues) {
+          s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES(i); e = HW_REG(_DWT_CYCCNT);
+          report_call_time(PSTR("delay"), i, e - s);
+        }
+
+        // Measure the delay to call a real function compared to a function pointer
+        s = HW_REG(_DWT_CYCCNT); delay_dwt(1); e = HW_REG(_DWT_CYCCNT);
+        report_call_time(PSTR("delay_dwt"), 1, e - s);
+
+        static PGMSTR(dcd, "DELAY_CYCLES directly ");
+
+        s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES( 1); e = HW_REG(_DWT_CYCCNT);
+        report_call_time(dcd,  1, e - s, false);
+
+        s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES( 5); e = HW_REG(_DWT_CYCCNT);
+        report_call_time(dcd,  5, e - s, false);
+
+        s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES(10); e = HW_REG(_DWT_CYCCNT);
+        report_call_time(dcd, 10, e - s, false);
+
+        s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES(20); e = HW_REG(_DWT_CYCCNT);
+        report_call_time(dcd, 20, e - s, false);
+
+        s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES(50); e = HW_REG(_DWT_CYCCNT);
+        report_call_time(dcd, 50, e - s, false);
+
+        s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES(100); e = HW_REG(_DWT_CYCCNT);
+        report_call_time(dcd, 100, e - s, false);
+
+        s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES(200); e = HW_REG(_DWT_CYCCNT);
+        report_call_time(dcd, 200, e - s, false);
+      }
+    }
+  #endif // MARLIN_DEV_MODE
+
+
+#else
+
+  void calibrate_delay_loop() {}
+  #if ENABLED(MARLIN_DEV_MODE)
+    void dump_delay_accuracy_check() {
+      static PGMSTR(none, "N/A on this platform");
+      serialprintPGM(none);
+    }
+  #endif
+
+#endif
diff --git a/Marlin/src/HAL/shared/Delay.h b/Marlin/src/HAL/shared/Delay.h
index a48f3f79cfd..dc1f158b44a 100644
--- a/Marlin/src/HAL/shared/Delay.h
+++ b/Marlin/src/HAL/shared/Delay.h
@@ -21,6 +21,8 @@
  */
 #pragma once
 
+#include "../../inc/MarlinConfigPre.h"
+
 /**
  * Busy wait delay cycles routines:
  *
@@ -31,79 +33,68 @@
 
 #include "../../core/macros.h"
 
+void calibrate_delay_loop();
+
 #if defined(__arm__) || defined(__thumb__)
 
-  #if __CORTEX_M == 7
+  // We want to have delay_cycle function with the lowest possible overhead, so we adjust at the function at runtime based on the current CPU best feature
+  typedef void (*DelayImpl)(uint32_t);
+  extern DelayImpl DelayCycleFnc;
 
-    // Cortex-M3 through M7 can use the cycle counter of the DWT unit
-    // https://www.anthonyvh.com/2017/05/18/cortex_m-cycle_counter/
+  // I've measured 36 cycles on my system to call the cycle waiting method, but it shouldn't change much to have a bit more margin, it only consume a bit more flash
+  #define TRIP_POINT_FOR_CALLING_FUNCTION   40
 
-    FORCE_INLINE static void enableCycleCounter() {
-      CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;
-
-      #if __CORTEX_M == 7
-        DWT->LAR = 0xC5ACCE55; // Unlock DWT on the M7
-      #endif
-
-      DWT->CYCCNT = 0;
-      DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk;
+  // A simple recursive template class that output exactly one 'nop' of code per recursion
+  template <int N> struct NopWriter {
+    FORCE_INLINE static void build() {
+      __asm__ __volatile__("nop");
+      NopWriter<N-1>::build();
     }
+  };
+  // End the loop
+  template <> struct NopWriter<0> { FORCE_INLINE static void build() {} };
 
-    FORCE_INLINE volatile uint32_t getCycleCount() { return DWT->CYCCNT; }
-
-    FORCE_INLINE static void DELAY_CYCLES(const uint32_t x) {
-      const uint32_t endCycles = getCycleCount() + x;
-      while (PENDING(getCycleCount(), endCycles)) {}
-    }
-
-  #else
-
-    // https://blueprints.launchpad.net/gcc-arm-embedded/+spec/delay-cycles
-
-    #define nop() __asm__ __volatile__("nop;\n\t":::)
-
-    FORCE_INLINE static void __delay_4cycles(uint32_t cy) { // +1 cycle
-      #if ARCH_PIPELINE_RELOAD_CYCLES < 2
-        #define EXTRA_NOP_CYCLES A("nop")
-      #else
-        #define EXTRA_NOP_CYCLES ""
-      #endif
-
-      __asm__ __volatile__(
-        A(".syntax unified") // is to prevent CM0,CM1 non-unified syntax
-        L("1")
-        A("subs %[cnt],#1")
-        EXTRA_NOP_CYCLES
-        A("bne 1b")
-        : [cnt]"+r"(cy)   // output: +r means input+output
-        :                 // input:
-        : "cc"            // clobbers:
-      );
-    }
-
-    // Delay in cycles
-    FORCE_INLINE static void DELAY_CYCLES(uint32_t x) {
-
-      if (__builtin_constant_p(x)) {
-        #define MAXNOPS 4
-
-        if (x <= (MAXNOPS)) {
-          switch (x) { case 4: nop(); case 3: nop(); case 2: nop(); case 1: nop(); }
-        }
-        else { // because of +1 cycle inside delay_4cycles
-          const uint32_t rem = (x - 1) % (MAXNOPS);
-          switch (rem) { case 3: nop(); case 2: nop(); case 1: nop(); }
-          if ((x = (x - 1) / (MAXNOPS)))
-            __delay_4cycles(x); // if need more then 4 nop loop is more optimal
-        }
-        #undef MAXNOPS
+  namespace Private {
+    // Split recursing template in 2 different class so we don't reach the maximum template instantiation depth limit
+    template <bool belowTP, int N> struct Helper {
+      FORCE_INLINE static void build() {
+        DelayCycleFnc(N - 2); //  Approximative cost of calling the function (might be off by one or 2 cycles)
       }
-      else if ((x >>= 2))
-        __delay_4cycles(x);
-    }
-    #undef nop
+    };
 
-  #endif
+    template <int N> struct Helper<true, N> {
+      FORCE_INLINE static void build() {
+        NopWriter<N - 1>::build();
+      }
+    };
+
+    template <> struct Helper<true, 0> {
+      FORCE_INLINE static void build() {}
+    };
+
+  }
+  // Select a behavior based on the constexpr'ness of the parameter
+  // If called with a compile-time parameter, then write as many NOP as required to reach the asked cycle count
+  // (there is some tripping point here to start looping when it's more profitable than gruntly executing NOPs)
+  // If not called from a compile-time parameter, fallback to a runtime loop counting version instead
+  template <bool compileTime, int Cycles>
+  struct SmartDelay {
+    FORCE_INLINE SmartDelay(int) {
+      if (Cycles == 0) return;
+      Private::Helper<Cycles < TRIP_POINT_FOR_CALLING_FUNCTION, Cycles>::build();
+    }
+  };
+  // Runtime version below. There is no way this would run under less than ~TRIP_POINT_FOR_CALLING_FUNCTION cycles
+  template <int T>
+  struct SmartDelay<false, T> {
+    FORCE_INLINE SmartDelay(int v) { DelayCycleFnc(v); }
+  };
+
+  #define DELAY_CYCLES(X) do { SmartDelay<IS_CONSTEXPR(X), IS_CONSTEXPR(X) ? X : 0> _smrtdly_X(X); } while(0)
+
+  // For delay in microseconds, no smart delay selection is required, directly call the delay function
+  // Teensy compiler is too old and does not accept smart delay compile-time / run-time selection correctly
+  #define DELAY_US(x) DelayCycleFnc((x) * ((F_CPU) / 1000000UL))
 
 #elif defined(__AVR__)
 
@@ -144,10 +135,15 @@
   }
   #undef nop
 
+  // Delay in microseconds
+  #define DELAY_US(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL))
+
 #elif defined(__PLAT_LINUX__) || defined(ESP32)
 
-  // specified inside platform
+  // DELAY_CYCLES specified inside platform
 
+  // Delay in microseconds
+  #define DELAY_US(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL))
 #else
 
   #error "Unsupported MCU architecture"
@@ -157,5 +153,5 @@
 // Delay in nanoseconds
 #define DELAY_NS(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL) / 1000UL)
 
-// Delay in microseconds
-#define DELAY_US(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL))
+
+
diff --git a/Marlin/src/MarlinCore.cpp b/Marlin/src/MarlinCore.cpp
index f0d4fa05a6e..626f934ddc0 100644
--- a/Marlin/src/MarlinCore.cpp
+++ b/Marlin/src/MarlinCore.cpp
@@ -1005,6 +1005,9 @@ void setup() {
   SERIAL_ECHO_MSG("Compiled: " __DATE__);
   SERIAL_ECHO_MSG(STR_FREE_MEMORY, freeMemory(), STR_PLANNER_BUFFER_BYTES, (int)sizeof(block_t) * (BLOCK_BUFFER_SIZE));
 
+  // Some HAL need precise delay adjustment
+  calibrate_delay_loop();
+
   // Init buzzer pin(s)
   #if USE_BEEPER
     SETUP_RUN(buzzer.init());
diff --git a/Marlin/src/core/macros.h b/Marlin/src/core/macros.h
index 905b85d56da..8361248e1c2 100644
--- a/Marlin/src/core/macros.h
+++ b/Marlin/src/core/macros.h
@@ -61,6 +61,8 @@
 #define _O2          __attribute__((optimize("O2")))
 #define _O3          __attribute__((optimize("O3")))
 
+#define IS_CONSTEXPR(...) __builtin_constant_p(__VA_ARGS__) // Only valid solution with C++14. Should use std::is_constant_evaluated() in C++20 instead
+
 #ifndef UNUSED
   #define UNUSED(x) ((void)(x))
 #endif
diff --git a/Marlin/src/gcode/gcode_d.cpp b/Marlin/src/gcode/gcode_d.cpp
index 0bd29558753..653ae6a553f 100644
--- a/Marlin/src/gcode/gcode_d.cpp
+++ b/Marlin/src/gcode/gcode_d.cpp
@@ -30,6 +30,8 @@
   #include "../HAL/shared/eeprom_if.h"
   #include "../HAL/shared/Delay.h"
 
+  extern void dump_delay_accuracy_check();
+
   /**
    * Dn: G-code for development and testing
    *
@@ -141,7 +143,7 @@
         }
       } break;
 
-      case 5: { // D4 Read / Write onboard Flash
+      case 5: { // D5 Read / Write onboard Flash
         #define FLASH_SIZE 1024
         uint8_t *pointer = parser.hex_adr_val('A');
         uint16_t len = parser.ushortval('C', 1);
@@ -162,6 +164,10 @@
         }
       } break;
 
+      case 6: // D6 Check delay loop accuracy
+        dump_delay_accuracy_check();
+      break;
+
       case 100: { // D100 Disable heaters and attempt a hard hang (Watchdog Test)
         SERIAL_ECHOLNPGM("Disabling heaters and attempting to trigger Watchdog");
         SERIAL_ECHOLNPGM("(USE_WATCHDOG " TERN(USE_WATCHDOG, "ENABLED", "DISABLED") ")");