MFT

February 25, 2015

this sort of follows on having helped divmod10() thread: http://forum.arduino.cc/index.php?topic=167414.msg1280458#msg1280458

i have program large number of division 3 , division 5. built in functions division slow, faster way needed.

the 2 methods below achieve same result faster making use of reciprocal multiplication.
both of these 30 32 clock cycles including function call/ret overhead, compared on 200 cycles built in functions.

example useage:

code: [select]


  unsigned int x = 300;
  x = divu5(x); //instead of: x = x / 5;
  if (x == 60){
    serial.print("it works :d");
  }

for unsigned integer division 3 [32 clocks including 'call' , 'ret']

code: [select]

unsigned int divu3(unsigned int n) __attribute__((noinline));
unsigned int divu3(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %a1, %3   \n\t"
    "ldi  %b1, 0xff \n\t"
    "ldi  %c1, 0x00 \n\t"
    "ldi  %d1, 0x00 \n\t"
    "cpi  %a0, 0xff \n\t"
    "cpc  %b0, %b1  \n\t"
    "brlo .+4       \n\t"
    "ldi  %c1, 0x01 \n\t"  //final answer++
    "rjmp .+4       \n\t"
    "subi %a0, 0xff \n\t"  //n++
    "sbci %b0, 0xff \n\t"
    "mul  %a0, %a1  \n\t"  //a * x
    "mov  %b1,  r1  \n\t"
    
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    
    "mul  %b0, %a1  \n\t"
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    "adc  %d1, %d1  \n\t"  //d0 known 0, need grab carry
    
    "add  %c1,  r0  \n\t"  //a* x'
    "adc  %d1,  r1  \n\t"
    
    "movw %a0, %c1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "m" (0x55)
    : "r1", "r0"
  );
  return n;
}

for unsigned integer division 5 [32 clocks including 'call' , 'ret']

code: [select]


unsigned int divu5(unsigned int n) __attribute__((noinline));
unsigned int divu5(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %a1, %3   \n\t"
    "ldi  %b1, 0xff \n\t"
    "ldi  %c1, 0x00 \n\t"
    "ldi  %d1, 0x00 \n\t"
    "cpi  %a0, 0xff \n\t"
    "cpc  %b0, %b1  \n\t"
    "brlo .+4       \n\t"
    "ldi  %c1, 0x01 \n\t"  //final answer++
    "rjmp .+4       \n\t"
    "subi %a0, 0xff \n\t"  //n++
    "sbci %b0, 0xff \n\t"
    "mul  %a0, %a1  \n\t"  //a * x
    "mov  %b1,  r1  \n\t"
    
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    
    "mul  %b0, %a1  \n\t"
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    "adc  %d1, %d1  \n\t"  //d1 known 0, need grab carry
    
    "add  %c1,  r0  \n\t"  //a* x'
    "adc  %d1,  r1  \n\t"
    
    "movw %a0, %c1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "m" (0x33)
    : "r1", "r0"
  );
  return n;
}

edit: updated code correct minor glitch (correction costs 2 clock cycles).

it can extended others such 60: [31 clocks including 'call' , 'ret']

code: [select]

unsigned int divu60(unsigned int n) __attribute__((noinline));
unsigned int divu60(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %a1, %3   \n\t"
    "ldi  %c1, 0x00 \n\t"
    "ldi  %d1, 0x00 \n\t"
    
    "lsr  %b0       \n\t" 
    "ror  %a0       \n\t"
    "lsr  %b0       \n\t"  // have divided 4, divide 15.
    "ror  %a0       \n\t"
    
    "subi %a0, 0xff \n\t"  //n++
    "sbci %b0, 0xff \n\t"
    "mul  %a0, %a1  \n\t"  //a * x
    "mov  %b1,  r1  \n\t"
    
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    
    "mul  %b0, %a1  \n\t"
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    "adc  %d1, %d1  \n\t"  //d1 known 0, need grab carry
    
    "add  %c1,  r0  \n\t"  //a* x'
    "adc  %d1,  r1  \n\t"
    
    "movw %a0, %c1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "m" (0x11)
    : "r1", "r0"
  );
  return n;
}

30: [29 clocks including 'call' , 'ret']

code: [select]

unsigned int divu30(unsigned int n) __attribute__((noinline));
unsigned int divu30(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %a1, %3   \n\t"
    "ldi  %c1, 0x00 \n\t"
    "ldi  %d1, 0x00 \n\t"
    
    "lsr  %b0       \n\t"  // have divided 2, divide 15.
    "ror  %a0       \n\t"
    
    "subi %a0, 0xff \n\t"  //n++
    "sbci %b0, 0xff \n\t"
    "mul  %a0, %a1  \n\t"  //a * x
    "mov  %b1,  r1  \n\t"
    
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    
    "mul  %b0, %a1  \n\t"
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    "adc  %d1, %d1  \n\t"  //d1 known 0, need grab carry
    
    "add  %c1,  r0  \n\t"  //a* x'
    "adc  %d1,  r1  \n\t"
    
    "movw %a0, %c1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "m" (0x11)
    : "r1", "r0"
  );
  return n;
}

15: [32 clocks including 'call' , 'ret']

code: [select]

unsigned int divu15(unsigned int n) __attribute__((noinline));
unsigned int divu15(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %a1, %3   \n\t"
    "ldi  %b1, 0xff \n\t"
    "ldi  %c1, 0x00 \n\t"
    "ldi  %d1, 0x00 \n\t"
    "cpi  %a0, 0xff \n\t"
    "cpc  %b0, %b1  \n\t"
    "brlo .+4       \n\t"
    "ldi  %c1, 0x01 \n\t"  //final answer++
    "rjmp .+4       \n\t"
    "subi %a0, 0xff \n\t"  //n++
    "sbci %b0, 0xff \n\t"
    "mul  %a0, %a1  \n\t"  //a * x
    "mov  %b1,  r1  \n\t"
    
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    
    "mul  %b0, %a1  \n\t"
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    "adc  %d1, %d1  \n\t"  //d1 known 0, need grab carry
    
    "add  %c1,  r0  \n\t"  //a* x'
    "adc  %d1,  r1  \n\t"
    
    "movw %a0, %c1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "m" (0x11)
    : "r1", "r0"
  );
  return n;
}

10: [29 clocks including 'call' , 'ret']

code: [select]

unsigned int divu10(unsigned int n) __attribute__((noinline));
unsigned int divu10(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %a1, %3   \n\t"
    "ldi  %c1, 0x00 \n\t"
    "ldi  %d1, 0x00 \n\t"
    
    "lsr  %b0       \n\t"  // have divided 2, divide 5.
    "ror  %a0       \n\t"
    
    "subi %a0, 0xff \n\t"  //n++
    "sbci %b0, 0xff \n\t"
    
    "mul  %a0, %a1  \n\t"  //a * x
    "mov  %b1,  r1  \n\t"
    
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    
    "mul  %b0, %a1  \n\t"
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    "adc  %d1, %d1  \n\t"  //d1 known 0, need grab carry
    
    "add  %c1,  r0  \n\t"  //a* x'
    "adc  %d1,  r1  \n\t"
    
    "movw %a0, %c1  \n\t"  // >> 16
        
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "m" (0x33)
    : "r1", "r0"
  );
  return n;
}

and 6: [29 clocks including 'call' , 'ret']

code: [select]

unsigned int divu6(unsigned int n) __attribute__((noinline));
unsigned int divu6(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %a1, %3   \n\t"
    "ldi  %c1, 0x00 \n\t"
    "ldi  %d1, 0x00 \n\t"
    
    "lsr  %b0       \n\t"  // have divided 2, divide 3.
    "ror  %a0       \n\t"
    
    "subi %a0, 0xff \n\t"  //n++
    "sbci %b0, 0xff \n\t"
    "mul  %a0, %a1  \n\t"  //a * x
    "mov  %b1,  r1  \n\t"
    
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    
    "mul  %b0, %a1  \n\t"
    "add  %b1,  r0  \n\t"  //a* x'
    "adc  %c1,  r1  \n\t"
    "adc  %d1, %d1  \n\t"  //d1 known 0, need grab carry
    
    "add  %c1,  r0  \n\t"  //a* x'
    "adc  %d1,  r1  \n\t"
    
    "movw %a0, %c1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "m" (0x55)
    : "r1", "r0"
  );
  return n;
}

edit: minor correction each function remove glitch n = 0xffff

Arduino Forum > Development > Other Software Development > divu5() and divu3(). Fast replacements for unsigned division by 3, 5, 6, 10, 15

arduino

Search This Blog

MFT

divu5() and divu3(). Fast replacements for unsigned division by 3, 5, 6, 10, 15

Comments

Post a Comment

Popular posts from this blog

Convierte tu Raspberry en un NAS. Firmware fvdw-sl 15.3 - Raspberry Pi Forums

How to format a Get Request

avrdude: verification error, first mismatch at byte 0x0000 0x0c != 0x62