divu5() and divu3(). Fast replacements for unsigned division by 3, 5, 6, 10, 15


this sort of follows on having helped divmod10() thread: http://forum.arduino.cc/index.php?topic=167414.msg1280458#msg1280458

i have program large number of division 3 , division 5. built in functions division slow, faster way needed.

the 2 methods below achieve same result faster making use of reciprocal multiplication.
both of these 30 32 clock cycles including function call/ret overhead, compared on 200 cycles built in functions.

example useage:
code: [select]

 unsigned int x = 300;
 x = divu5(x); //instead of: x = x / 5;
 if (x == 60){
   serial.print("it works :d");
 }


for unsigned integer division 3      [32 clocks including 'call' , 'ret']
code: [select]
unsigned int divu3(unsigned int n) __attribute__((noinline));
unsigned int divu3(unsigned int n) {
 unsigned long working;
 asm volatile(
   "ldi  %a1, %3   \n\t"
   "ldi  %b1, 0xff \n\t"
   "ldi  %c1, 0x00 \n\t"
   "ldi  %d1, 0x00 \n\t"
   "cpi  %a0, 0xff \n\t"
   "cpc  %b0, %b1  \n\t"
   "brlo .+4       \n\t"
   "ldi  %c1, 0x01 \n\t"  //final answer++
   "rjmp .+4       \n\t"
   "subi %a0, 0xff \n\t"  //n++
   "sbci %b0, 0xff \n\t"
   "mul  %a0, %a1  \n\t"  //a * x
   "mov  %b1,  r1  \n\t"
   
   "add  %b1,  r0  \n\t"  //a* x'
   "adc  %c1,  r1  \n\t"
   
   "mul  %b0, %a1  \n\t"
   "add  %b1,  r0  \n\t"  //a* x'
   "adc  %c1,  r1  \n\t"
   "adc  %d1, %d1  \n\t"  //d0 known 0, need grab carry
   
   "add  %c1,  r0  \n\t"  //a* x'
   "adc  %d1,  r1  \n\t"
   
   "movw %a0, %c1  \n\t"  // >> 16
   
   "eor   r1,  r1  \n\t"
   
   : "=r" (n), "=r" (working)
   : "0" (n), "m" (0x55)
   : "r1", "r0"
 );
 return n;
}


for unsigned integer division 5      [32 clocks including 'call' , 'ret']
code: [select]

unsigned int divu5(unsigned int n) __attribute__((noinline));
unsigned int divu5(unsigned int n) {
 unsigned long working;
 asm volatile(
   "ldi  %a1, %3   \n\t"
   "ldi  %b1, 0xff \n\t"
   "ldi  %c1, 0x00 \n\t"
   "ldi  %d1, 0x00 \n\t"
   "cpi  %a0, 0xff \n\t"
   "cpc  %b0, %b1  \n\t"
   "brlo .+4       \n\t"
   "ldi  %c1, 0x01 \n\t"  //final answer++
   "rjmp .+4       \n\t"
   "subi %a0, 0xff \n\t"  //n++
   "sbci %b0, 0xff \n\t"
   "mul  %a0, %a1  \n\t"  //a * x
   "mov  %b1,  r1  \n\t"
   
   "add  %b1,  r0  \n\t"  //a* x'
   "adc  %c1,  r1  \n\t"
   
   "mul  %b0, %a1  \n\t"
   "add  %b1,  r0  \n\t"  //a* x'
   "adc  %c1,  r1  \n\t"
   "adc  %d1, %d1  \n\t"  //d1 known 0, need grab carry
   
   "add  %c1,  r0  \n\t"  //a* x'
   "adc  %d1,  r1  \n\t"
   
   "movw %a0, %c1  \n\t"  // >> 16
   
   "eor   r1,  r1  \n\t"
   
   : "=r" (n), "=r" (working)
   : "0" (n), "m" (0x33)
   : "r1", "r0"
 );
 return n;
}


edit: updated code correct minor glitch (correction costs 2 clock cycles).

it can extended others such 60:      [31 clocks including 'call' , 'ret']
code: [select]
unsigned int divu60(unsigned int n) __attribute__((noinline));
unsigned int divu60(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %a1, %3   \n\t"
    "ldi  %c1, 0x00 \n\t"
    "ldi  %d1, 0x00 \n\t"
   
    "lsr  %b0       \n\t"
    "ror  %a0       \n\t"
    "lsr  %b0       \n\t"  // have divided 4, divide 15.
    "ror  %a0       \n\t"
   
    "subi %a0, 0xff \n\t"  //n++
    "sbci %b0, 0xff \n\t"
    "mul  %a0, %a1  \n\t"  //a * x
    "mov  %b1,  r1  \n\t"
   
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
   
    "mul  %b0, %a1  \n\t"
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    "adc  %d1, %d1  \n\t"  //d1 known 0, need grab carry
   
    "add  %c1,  r0  \n\t"  //a* x'
    "adc  %d1,  r1  \n\t"
   
    "movw %a0, %c1  \n\t"  // >> 16
   
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "m" (0x11)
    : "r1", "r0"
  );
  return n;
}


30:      [29 clocks including 'call' , 'ret']
code: [select]
unsigned int divu30(unsigned int n) __attribute__((noinline));
unsigned int divu30(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %a1, %3   \n\t"
    "ldi  %c1, 0x00 \n\t"
    "ldi  %d1, 0x00 \n\t"
   
    "lsr  %b0       \n\t"  // have divided 2, divide 15.
    "ror  %a0       \n\t"
   
    "subi %a0, 0xff \n\t"  //n++
    "sbci %b0, 0xff \n\t"
    "mul  %a0, %a1  \n\t"  //a * x
    "mov  %b1,  r1  \n\t"
   
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
   
    "mul  %b0, %a1  \n\t"
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    "adc  %d1, %d1  \n\t"  //d1 known 0, need grab carry
   
    "add  %c1,  r0  \n\t"  //a* x'
    "adc  %d1,  r1  \n\t"
   
    "movw %a0, %c1  \n\t"  // >> 16
   
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "m" (0x11)
    : "r1", "r0"
  );
  return n;
}


15:      [32 clocks including 'call' , 'ret']
code: [select]
unsigned int divu15(unsigned int n) __attribute__((noinline));
unsigned int divu15(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %a1, %3   \n\t"
    "ldi  %b1, 0xff \n\t"
    "ldi  %c1, 0x00 \n\t"
    "ldi  %d1, 0x00 \n\t"
    "cpi  %a0, 0xff \n\t"
    "cpc  %b0, %b1  \n\t"
    "brlo .+4       \n\t"
    "ldi  %c1, 0x01 \n\t"  //final answer++
    "rjmp .+4       \n\t"
    "subi %a0, 0xff \n\t"  //n++
    "sbci %b0, 0xff \n\t"
    "mul  %a0, %a1  \n\t"  //a * x
    "mov  %b1,  r1  \n\t"
   
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
   
    "mul  %b0, %a1  \n\t"
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    "adc  %d1, %d1  \n\t"  //d1 known 0, need grab carry
   
    "add  %c1,  r0  \n\t"  //a* x'
    "adc  %d1,  r1  \n\t"
   
    "movw %a0, %c1  \n\t"  // >> 16
   
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "m" (0x11)
    : "r1", "r0"
  );
  return n;
}


10:      [29 clocks including 'call' , 'ret']
code: [select]
unsigned int divu10(unsigned int n) __attribute__((noinline));
unsigned int divu10(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %a1, %3   \n\t"
    "ldi  %c1, 0x00 \n\t"
    "ldi  %d1, 0x00 \n\t"
   
    "lsr  %b0       \n\t"  // have divided 2, divide 5.
    "ror  %a0       \n\t"
   
    "subi %a0, 0xff \n\t"  //n++
    "sbci %b0, 0xff \n\t"
   
    "mul  %a0, %a1  \n\t"  //a * x
    "mov  %b1,  r1  \n\t"
   
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
   
    "mul  %b0, %a1  \n\t"
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    "adc  %d1, %d1  \n\t"  //d1 known 0, need grab carry
   
    "add  %c1,  r0  \n\t"  //a* x'
    "adc  %d1,  r1  \n\t"
   
    "movw %a0, %c1  \n\t"  // >> 16
       
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "m" (0x33)
    : "r1", "r0"
  );
  return n;
}


and 6:      [29 clocks including 'call' , 'ret']
code: [select]
unsigned int divu6(unsigned int n) __attribute__((noinline));
unsigned int divu6(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %a1, %3   \n\t"
    "ldi  %c1, 0x00 \n\t"
    "ldi  %d1, 0x00 \n\t"
   
    "lsr  %b0       \n\t"  // have divided 2, divide 3.
    "ror  %a0       \n\t"
   
    "subi %a0, 0xff \n\t"  //n++
    "sbci %b0, 0xff \n\t"
    "mul  %a0, %a1  \n\t"  //a * x
    "mov  %b1,  r1  \n\t"
   
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
   
    "mul  %b0, %a1  \n\t"
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    "adc  %d1, %d1  \n\t"  //d1 known 0, need grab carry
   
    "add  %c1,  r0  \n\t"  //a* x'
    "adc  %d1,  r1  \n\t"
   
    "movw %a0, %c1  \n\t"  // >> 16
   
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "m" (0x55)
    : "r1", "r0"
  );
  return n;
}


edit: minor correction each function remove glitch n = 0xffff


Arduino Forum > Development > Other Software Development > divu5() and divu3(). Fast replacements for unsigned division by 3, 5, 6, 10, 15


arduino

Comments

Popular posts from this blog

Convierte tu Raspberry en un NAS. Firmware fvdw-sl 15.3 - Raspberry Pi Forums

How to format a Get Request

avrdude: verification error, first mismatch at byte 0x0000 0x0c != 0x62