diff --git a/Marlin/Configuration.h b/Marlin/Configuration.h
index c933b6887c..25a2571549 100644
--- a/Marlin/Configuration.h
+++ b/Marlin/Configuration.h
@@ -108,8 +108,9 @@
  * 250000 works in most cases, but you might try a lower speed if
  * you commonly experience drop-outs during host printing.
+ * You may try up to 1000000 to speed up SD file transfer.
- * :[2400, 9600, 19200, 38400, 57600, 115200, 250000]
+ * :[2400, 9600, 19200, 38400, 57600, 115200, 250000, 500000, 1000000]
 #define BAUDRATE 250000
diff --git a/Marlin/Configuration_adv.h b/Marlin/Configuration_adv.h
index fbab9b8191..402c5b18cf 100644
--- a/Marlin/Configuration_adv.h
+++ b/Marlin/Configuration_adv.h
@@ -751,7 +751,7 @@
 #define MAX_CMD_SIZE 96
 #define BUFSIZE 4
-// Transfer Buffer Size
+// Transmission to Host Buffer Size
 // To save 386 bytes of PROGMEM (and TX_BUFFER_SIZE+3 bytes of RAM) set to 0.
 // To buffer a simple "ok" you need 4 bytes.
 // For ADVANCED_OK (M105) you need 32 bytes.
@@ -760,6 +760,28 @@
 // :[0, 2, 4, 8, 16, 32, 64, 128, 256]
 #define TX_BUFFER_SIZE 0
+// Host Receive Buffer Size
+// Without XON/XOFF flow control (see SERIAL_XON_XOFF below) 32 bytes should be enough.
+// To use flow control, set this buffer size to at least 1024 bytes.
+// :[0, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+//#define RX_BUFFER_SIZE 1024
+#if RX_BUFFER_SIZE >= 1024
+  // Enable to have the controller send XON/XOFF control characters to
+  // the host to signal the RX buffer is becoming full.
+  //#define SERIAL_XON_XOFF
+  // Enable this option to collect and display the maximum
+  // RX queue usage after transferring a file to SD.
+  // Enable this option to collect and display the number
+  // of dropped bytes after a file transfer to SD.
 // Enable an emergency-command parser to intercept certain commands as they
 // enter the serial receive buffer, so they cannot be blocked.
 // Currently handles M108, M112, M410
diff --git a/Marlin/src/HAL/HAL_AVR/MarlinSerial.cpp b/Marlin/src/HAL/HAL_AVR/MarlinSerial.cpp
index 6e6430f77a..ca83f24c02 100644
--- a/Marlin/src/HAL/HAL_AVR/MarlinSerial.cpp
+++ b/Marlin/src/HAL/HAL_AVR/MarlinSerial.cpp
@@ -27,16 +27,31 @@
  * Modified 23 November 2006 by David A. Mellis
  * Modified 28 September 2010 by Mark Sproul
  * Modified 14 February 2016 by Andreas Hardtung (added tx buffer)
+ * Modified 01 October 2017 by Eduardo José Tagle (added XON/XOFF)
 #ifdef __AVR__
-#include "MarlinSerial.h"
-#include "../../Marlin.h"
 // Disable HardwareSerial.cpp to support chips without a UART (Attiny, etc.)
+#include "../../inc/MarlinConfig.h"
 #if !defined(USBCON) && (defined(UBRRH) || defined(UBRR0H) || defined(UBRR1H) || defined(UBRR2H) || defined(UBRR3H))
+  #include "MarlinSerial.h"
+  #include "../../Marlin.h"
+  // Receive ring buffer. head is the index where the next received byte is
+  // stored; tail is the index of the next byte to be read. head == tail
+  // means the buffer is empty.
+  struct ring_buffer_r {
+    unsigned char buffer[RX_BUFFER_SIZE];
+    volatile ring_buffer_pos_t head, tail;
+  };
+  // Transmit ring buffer, only compiled when a TX buffer is configured.
+  // Bytes are taken from tail for transmission; head == tail means empty.
+  #if TX_BUFFER_SIZE > 0
+    struct ring_buffer_t {
+      unsigned char buffer[TX_BUFFER_SIZE];
+      volatile uint8_t head, tail;
+    };
+  #endif
     ring_buffer_r rx_buffer = { { 0 }, 0, 0 };
     #if TX_BUFFER_SIZE > 0
@@ -45,6 +60,23 @@
+    // xon_xoff_state layout: bit 7 set = the control char was already sent;
+    // low 5 bits = the XON/XOFF char that was (or is waiting to be) sent.
+    constexpr uint8_t XON_XOFF_CHAR_SENT = 0x80;  // XON / XOFF Character was sent
+    constexpr uint8_t XON_XOFF_CHAR_MASK = 0x1F;  // XON / XOFF character to send
+    // XON / XOFF character definitions
+    constexpr uint8_t XON_CHAR  = 17;
+    constexpr uint8_t XOFF_CHAR = 19;
+    // Must be declared after the constants it is built from.
+    // Initial state: XON, flagged as already sent.
+    uint8_t xon_xoff_state = XON_XOFF_CHAR_SENT | XON_CHAR;
+  #endif
+    uint8_t rx_dropped_bytes = 0;
+  #endif
+    ring_buffer_pos_t rx_max_enqueued = 0;
+  #endif
     #include "../../module/stepper.h"
@@ -136,20 +168,78 @@
-  FORCE_INLINE void store_char(unsigned char c) {
-      const uint8_t h = rx_buffer.head,
-                    i = (uint8_t)(h + 1) & (RX_BUFFER_SIZE - 1);
+  FORCE_INLINE void store_rxd_char() {
+    const ring_buffer_pos_t h = rx_buffer.head,
+                            i = (ring_buffer_pos_t)(h + 1) & (ring_buffer_pos_t)(RX_BUFFER_SIZE - 1);
-      // if we should be storing the received character into the location
-      // just before the tail (meaning that the head would advance to the
-      // current location of the tail), we're about to overflow the buffer
-      // and so we don't write the character or advance the head.
-      if (i != rx_buffer.tail) {
-        rx_buffer.buffer[h] = c;
-        rx_buffer.head = i;
+    // If the character is to be stored at the index just before the tail
+    // (such that the head would advance to the current tail), the buffer is
+    // critical, so don't write the character or advance the head.
+    if (i != rx_buffer.tail) {
+      rx_buffer.buffer[h] = M_UDRx;
+      rx_buffer.head = i;
+    }
+    else {
+      (void)M_UDRx;
+        if (!++rx_dropped_bytes) ++rx_dropped_bytes;
+      #endif
+    }
+      // calculate count of bytes stored into the RX buffer
+      ring_buffer_pos_t rx_count = (ring_buffer_pos_t)(rx_buffer.head - rx_buffer.tail) & (ring_buffer_pos_t)(RX_BUFFER_SIZE - 1);
+      // Keep track of the maximum count of enqueued bytes
+      NOLESS(rx_max_enqueued, rx_count);
+    #endif
+      // for high speed transfers, we can use XON/XOFF protocol to do
+      // software handshake and avoid overruns.
+      if ((xon_xoff_state & XON_XOFF_CHAR_MASK) == XON_CHAR) {
+        // calculate count of bytes stored into the RX buffer
+        ring_buffer_pos_t rx_count = (ring_buffer_pos_t)(rx_buffer.head - rx_buffer.tail) & (ring_buffer_pos_t)(RX_BUFFER_SIZE - 1);
+        // If we are at or above 12.5% (1/8) of RX buffer capacity, send XOFF
+        // before we run out of RX buffer space. We need 325 bytes @ 250kbits/s
+        // to let the host react and stop sending bytes. This translates to
+        // 13 ms of propagation time.
+        if (rx_count >= (RX_BUFFER_SIZE) / 8) {
+          // If TX interrupts are disabled and data register is empty,
+          // just write the byte to the data register and be done. This
+          // shortcut helps significantly improve the effective datarate
+          // at high (>500kbit/s) bitrates, where interrupt overhead
+          // becomes a slowdown.
+          if (!TEST(M_UCSRxB, M_UDRIEx) && TEST(M_UCSRxA, M_UDREx)) {
+            // Send an XOFF character
+            M_UDRx = XOFF_CHAR;
+            // clear the TXC bit -- "can be cleared by writing a one to its bit
+            // location". This makes sure flush() won't return until the bytes
+            // actually got written
+            SBI(M_UCSRxA, M_TXCx);
+            // And remember it was sent
+            xon_xoff_state = XOFF_CHAR | XON_XOFF_CHAR_SENT;
+          }
+          else {
+            // TX interrupts disabled, but buffer still not empty ... or
+            // TX interrupts enabled. Reenable TX ints and schedule XOFF
+            // character to be sent
+            #if TX_BUFFER_SIZE > 0
+              SBI(M_UCSRxB, M_UDRIEx);
+              xon_xoff_state = XOFF_CHAR;
+            #else
+              // We are not using TX interrupts, we will have to send this manually
+              while (!TEST(M_UCSRxA, M_UDREx)) { /* nada */ };
+              M_UDRx = XOFF_CHAR;
+              // And remember we already sent it
+              xon_xoff_state = XOFF_CHAR | XON_XOFF_CHAR_SENT;
+            #endif
+          }
+        }
+    #endif // SERIAL_XON_XOFF
@@ -160,37 +250,41 @@
     FORCE_INLINE void _tx_udr_empty_irq(void) {
       // If interrupts are enabled, there must be more data in the output
-      // buffer. Send the next byte
-      const uint8_t t = tx_buffer.tail,
-                    c = tx_buffer.buffer[t];
-      tx_buffer.tail = (t + 1) & (TX_BUFFER_SIZE - 1);
+      // buffer.
-      M_UDRx = c;
+        // Do a priority insertion of an XON/XOFF char, if needed.
+        const uint8_t state = xon_xoff_state;
+        if (!(state & XON_XOFF_CHAR_SENT)) {
+          M_UDRx = state & XON_XOFF_CHAR_MASK;
+          xon_xoff_state = state | XON_XOFF_CHAR_SENT;
+        }
+        else
+      #endif
+      { // Send the next byte
+        const uint8_t t = tx_buffer.tail, c = tx_buffer.buffer[t];
+        tx_buffer.tail = (t + 1) & (TX_BUFFER_SIZE - 1);
+        M_UDRx = c;
+      }
       // clear the TXC bit -- "can be cleared by writing a one to its bit
       // location". This makes sure flush() won't return until the bytes
       // actually got written
       SBI(M_UCSRxA, M_TXCx);
-      if (tx_buffer.head == tx_buffer.tail) {
-        // Buffer empty, so disable interrupts
+      // Disable interrupts if the buffer is empty
+      if (tx_buffer.head == tx_buffer.tail)
         CBI(M_UCSRxB, M_UDRIEx);
-      }
     #ifdef M_USARTx_UDRE_vect
-      ISR(M_USARTx_UDRE_vect) {
-        _tx_udr_empty_irq();
-      }
+      ISR(M_USARTx_UDRE_vect) { _tx_udr_empty_irq(); }
   #endif // TX_BUFFER_SIZE
   #ifdef M_USARTx_RX_vect
-    ISR(M_USARTx_RX_vect) {
-      const unsigned char c = M_UDRx;
-      store_char(c);
-    }
+    ISR(M_USARTx_RX_vect) { store_rxd_char(); }
   // Public Methods
@@ -200,9 +294,9 @@
     bool useU2X = true;
     #if F_CPU == 16000000UL && SERIAL_PORT == 0
-      // hard-coded exception for compatibility with the bootloader shipped
-      // with the Duemilanove and previous boards and the firmware on the 8U2
-      // on the Uno and Mega 2560.
+      // Hard-coded exception for compatibility with the bootloader shipped
+      // with the Duemilanove and previous boards, and the firmware on the
+      // 8U2 on the Uno and Mega 2560.
       if (baud == 57600) useU2X = false;
@@ -237,8 +331,9 @@
   void MarlinSerial::checkRx(void) {
     if (TEST(M_UCSRxA, M_RXCx)) {
-      const uint8_t c = M_UDRx;
-      store_char(c);
+        store_rxd_char();
@@ -252,47 +347,76 @@
   int MarlinSerial::read(void) {
     int v;
-      const uint8_t t = rx_buffer.tail;
+      const ring_buffer_pos_t t = rx_buffer.tail;
       if (rx_buffer.head == t)
         v = -1;
       else {
         v = rx_buffer.buffer[t];
-        rx_buffer.tail = (uint8_t)(t + 1) & (RX_BUFFER_SIZE - 1);
+        rx_buffer.tail = (ring_buffer_pos_t)(t + 1) & (RX_BUFFER_SIZE - 1);
+          if ((xon_xoff_state & XON_XOFF_CHAR_MASK) == XOFF_CHAR) {
+            // Get count of bytes in the RX buffer
+            ring_buffer_pos_t rx_count = (ring_buffer_pos_t)(rx_buffer.head - rx_buffer.tail) & (ring_buffer_pos_t)(RX_BUFFER_SIZE - 1);
+            // When below 10% of RX buffer capacity, send XON before
+            // running out of RX buffer bytes
+            if (rx_count < (RX_BUFFER_SIZE) / 10) {
+              xon_xoff_state = XON_CHAR | XON_XOFF_CHAR_SENT;
+              CRITICAL_SECTION_END;       // End critical section before returning!
+              writeNoHandshake(XON_CHAR);
+              return v;
+            }
+          }
+        #endif
     return v;
-  uint8_t MarlinSerial::available(void) {
+  ring_buffer_pos_t MarlinSerial::available(void) {
-      const uint8_t h = rx_buffer.head,
-                    t = rx_buffer.tail;
+      const ring_buffer_pos_t h = rx_buffer.head, t = rx_buffer.tail;
-    return (uint8_t)(RX_BUFFER_SIZE + h - t) & (RX_BUFFER_SIZE - 1);
+    return (ring_buffer_pos_t)(RX_BUFFER_SIZE + h - t) & (RX_BUFFER_SIZE - 1);
   void MarlinSerial::flush(void) {
-    // RX
-    // don't reverse this or there may be problems if the RX interrupt
-    // occurs after reading the value of rx_buffer_head but before writing
-    // the value to rx_buffer_tail; the previous value of rx_buffer_head
-    // may be written to rx_buffer_tail, making it appear as if the buffer
-    // were full, not empty.
+    // Don't change this order of operations. If the RX interrupt occurs between
+    // reading rx_buffer_head and updating rx_buffer_tail, the previous rx_buffer_head
+    // may be written to rx_buffer_tail, making the buffer appear full rather than empty.
       rx_buffer.head = rx_buffer.tail;
+      if ((xon_xoff_state & XON_XOFF_CHAR_MASK) == XOFF_CHAR) {
+        xon_xoff_state = XON_CHAR | XON_XOFF_CHAR_SENT;
+        writeNoHandshake(XON_CHAR);
+      }
+    #endif
   #if TX_BUFFER_SIZE > 0
     uint8_t MarlinSerial::availableForWrite(void) {
-        const uint8_t h = tx_buffer.head,
-                      t = tx_buffer.tail;
+        const uint8_t h = tx_buffer.head, t = tx_buffer.tail;
       return (uint8_t)(TX_BUFFER_SIZE + h - t) & (TX_BUFFER_SIZE - 1);
+    /**
+     * write() variant compiled when TX_BUFFER_SIZE > 0: send one byte.
+     * If xon_xoff_state shows an XON/XOFF character that has not been sent
+     * yet, that character is written first and flagged as sent; then c
+     * follows via writeNoHandshake().
+     */
     void MarlinSerial::write(const uint8_t c) {
+        const uint8_t state = xon_xoff_state;
+        if (!(state & XON_XOFF_CHAR_SENT)) {
+          // Send 2 chars: XON/XOFF, then a user-specified char
+          writeNoHandshake(state & XON_XOFF_CHAR_MASK);
+          xon_xoff_state = state | XON_XOFF_CHAR_SENT;
+        }
+      #endif
+      writeNoHandshake(c);
+    }
+    void MarlinSerial::writeNoHandshake(const uint8_t c) {
       _written = true;
         bool emty = (tx_buffer.head == tx_buffer.tail);
@@ -353,20 +477,34 @@
       // If we get here, nothing is queued anymore (DRIE is disabled) and
       // the hardware finished tranmission (TXC is set).
-  }
+    }
-  #else
-    void MarlinSerial::write(uint8_t c) {
-      while (!TEST(M_UCSRxA, M_UDREx))
-        ;
+  #else // TX_BUFFER_SIZE == 0
+    /**
+     * Unbuffered write (TX_BUFFER_SIZE == 0): send one byte to the host.
+     * If an XON/XOFF character is still pending (its "sent" flag is clear
+     * in xon_xoff_state), transmit it first and mark it sent. The byte c is
+     * then transmitted exactly once, through writeNoHandshake(), which does
+     * the wait-for-empty-data-register and the actual register write.
+     */
+    void MarlinSerial::write(const uint8_t c) {
+        // Do a priority insertion of an XON/XOFF char, if needed.
+        const uint8_t state = xon_xoff_state;
+        if (!(state & XON_XOFF_CHAR_SENT)) {
+          writeNoHandshake(state & XON_XOFF_CHAR_MASK);
+          xon_xoff_state = state | XON_XOFF_CHAR_SENT;
+        }
+      #endif
+      writeNoHandshake(c);
+    }
+    void MarlinSerial::writeNoHandshake(const uint8_t c) {
+      while (!TEST(M_UCSRxA, M_UDREx)) ;
       M_UDRx = c;
-  #endif
-  // end NEW
-  /// imports from print.h
+  #endif // TX_BUFFER_SIZE == 0
+  /**
+   * Imports from print.h
+   */
   void MarlinSerial::print(char c, int base) {
     print((long)c, base);
@@ -516,4 +654,4 @@
   HardwareSerial bluetoothSerial;
+#endif // __AVR__
diff --git a/Marlin/src/HAL/HAL_AVR/MarlinSerial.h b/Marlin/src/HAL/HAL_AVR/MarlinSerial.h
index 0448e5502c..ef4f165e0c 100644
--- a/Marlin/src/HAL/HAL_AVR/MarlinSerial.h
+++ b/Marlin/src/HAL/HAL_AVR/MarlinSerial.h
@@ -21,13 +21,13 @@
-  MarlinSerial.h - Hardware serial library for Wiring
-  Copyright (c) 2006 Nicholas Zambetti.  All right reserved.
-  Modified 28 September 2010 by Mark Sproul
-  Modified 14 February 2016 by Andreas Hardtung (added tx buffer)
+ * MarlinSerial.h - Hardware serial library for Wiring
+ * Copyright (c) 2006 Nicholas Zambetti.  All right reserved.
+ *
+ * Modified 28 September 2010 by Mark Sproul
+ * Modified 14 February 2016 by Andreas Hardtung (added tx buffer)
+ * Modified 01 October 2017 by Eduardo José Tagle (added XON/XOFF)
+ */
@@ -89,34 +89,33 @@
   #ifndef TX_BUFFER_SIZE
     #define TX_BUFFER_SIZE 32
-  #if !((RX_BUFFER_SIZE == 256) ||(RX_BUFFER_SIZE == 128) ||(RX_BUFFER_SIZE == 64) ||(RX_BUFFER_SIZE == 32) ||(RX_BUFFER_SIZE == 16) ||(RX_BUFFER_SIZE == 8) ||(RX_BUFFER_SIZE == 4) ||(RX_BUFFER_SIZE == 2))
-    #error "RX_BUFFER_SIZE has to be a power of 2 and >= 2"
-  #endif
-  #if !((TX_BUFFER_SIZE == 256) ||(TX_BUFFER_SIZE == 128) ||(TX_BUFFER_SIZE == 64) ||(TX_BUFFER_SIZE == 32) ||(TX_BUFFER_SIZE == 16) ||(TX_BUFFER_SIZE == 8) ||(TX_BUFFER_SIZE == 4) ||(TX_BUFFER_SIZE == 2) ||(TX_BUFFER_SIZE == 0))
-    #error TX_BUFFER_SIZE has to be a power of 2 or 0
+    #error "XON/XOFF requires RX_BUFFER_SIZE >= 1024 for reliable transfers without drops."
-  struct ring_buffer_r {
-    unsigned char buffer[RX_BUFFER_SIZE];
-    volatile uint8_t head;
-    volatile uint8_t tail;
-  };
-  #if TX_BUFFER_SIZE > 0
-    struct ring_buffer_t {
-      unsigned char buffer[TX_BUFFER_SIZE];
-      volatile uint8_t head;
-      volatile uint8_t tail;
-    };
+    #error "RX_BUFFER_SIZE must be a power of 2 greater than 1."
-    extern ring_buffer_r rx_buffer;
-    #if TX_BUFFER_SIZE > 0
-      extern ring_buffer_t tx_buffer;
-    #endif
+    #error "TX_BUFFER_SIZE must be 0 or a power of 2 greater than 1."
+  // Index type for the RX ring buffer: an 8-bit index can only address 256
+  // slots, so larger buffers use 16-bit head/tail positions.
+  #if RX_BUFFER_SIZE > 256
+    typedef uint16_t ring_buffer_pos_t;
+  #else
+    typedef uint8_t ring_buffer_pos_t;
+  #endif
+    extern uint8_t rx_dropped_bytes;
+  #endif
+    extern ring_buffer_pos_t rx_max_enqueued;
+  #endif  
   class MarlinSerial { //: public Stream
@@ -126,19 +125,23 @@
       static int peek(void);
       static int read(void);
       static void flush(void);
-      static uint8_t available(void);
+      static ring_buffer_pos_t available(void);
       static void checkRx(void);
       static void write(const uint8_t c);
       #if TX_BUFFER_SIZE > 0
         static uint8_t availableForWrite(void);
         static void flushTX(void);
+      static void writeNoHandshake(const uint8_t c);
-    private:
-      static void printNumber(unsigned long, const uint8_t);
-      static void printFloat(double, uint8_t);
+        FORCE_INLINE static uint32_t dropped() { return rx_dropped_bytes; }
+      #endif
+        FORCE_INLINE static ring_buffer_pos_t rxMaxEnqueued() { return rx_max_enqueued; }
+      #endif  
-    public:
       static FORCE_INLINE void write(const char* str) { while (*str) write(*str++); }
       static FORCE_INLINE void write(const uint8_t* buffer, size_t size) { while (size--) write(*buffer++); }
       static FORCE_INLINE void print(const String& s) { for (int i = 0; i < (int)s.length(); i++) write(s[i]); }
@@ -163,6 +166,10 @@
       static void println(double, int = 2);
       static void println(void);
       operator bool() { return true; }
+    private:
+      static void printNumber(unsigned long, const uint8_t);
+      static void printFloat(double, uint8_t);
   extern MarlinSerial customizedSerial;
diff --git a/Marlin/src/core/macros.h b/Marlin/src/core/macros.h
index e2f25eed4b..a89b6dafce 100644
--- a/Marlin/src/core/macros.h
+++ b/Marlin/src/core/macros.h
@@ -106,6 +106,7 @@
 #define CIRCLE_CIRC(R) (2.0 * M_PI * (R))
 #define SIGN(a) ((a>0)-(a<0))
+// True when x is a non-zero power of two. Note: x is evaluated more than once.
+#define IS_POWER_OF_2(x) ((x) && !((x) & ((x) - 1)))
 // Macros to contrain values
 #define NOLESS(v,n) do{ if (v < n) v = n; }while(0)
diff --git a/Marlin/src/gcode/queue.cpp b/Marlin/src/gcode/queue.cpp
index 09c49933a6..fd28d20cfe 100644
--- a/Marlin/src/gcode/queue.cpp
+++ b/Marlin/src/gcode/queue.cpp
@@ -221,9 +221,9 @@ inline void get_serial_commands() {
    * Loop while serial characters are incoming and the queue is not full
-  while (commands_in_queue < BUFSIZE && MYSERIAL.available() > 0) {
-    char serial_char = MYSERIAL.read();
+  int c;
+  while (commands_in_queue < BUFSIZE && (c = MYSERIAL.read()) >= 0) {
+    char serial_char = c;
      * If the character ends the line
@@ -323,12 +323,9 @@ inline void get_serial_commands() {
       // The command will be injected when EOL is reached
     else if (serial_char == '\\') {  // Handle escapes
-      if (MYSERIAL.available() > 0) {
-        // if we have one more character, copy it over
-        serial_char = MYSERIAL.read();
-        if (!serial_comment_mode) serial_line_buffer[serial_count++] = serial_char;
-      }
-      // otherwise do nothing
+      // If one more character is available, copy it over verbatim.
+      // Store the byte just read (c): serial_char still holds the backslash.
+      if ((c = MYSERIAL.read()) >= 0 && !serial_comment_mode)
+        serial_line_buffer[serial_count++] = (char)c;
     else { // it's not a newline, carriage return or escape char
       if (serial_char == ';') serial_comment_mode = true;
@@ -448,6 +445,15 @@ void advance_command_queue() {
         // M29 closes the file
+          SERIAL_ECHOLNPAIR("Dropped bytes: ", customizedSerial.dropped());
+        #endif
+          SERIAL_ECHOLNPAIR("Max RX Queue Size: ", customizedSerial.rxMaxEnqueued());
+        #endif
       else {