divu5() and divu3(). Fast replacements for unsigned division by 3, 5, 6, 10, 15
this sort of follows on having helped divmod10() thread: http://forum.arduino.cc/index.php?topic=167414.msg1280458#msg1280458
i have program large number of division 3 , division 5. built in functions division slow, faster way needed.
the 2 methods below achieve same result faster making use of reciprocal multiplication.
both of these30 32 clock cycles including function call/ret overhead, compared on 200 cycles built in functions.
example useage:
for unsigned integer division 3 [32 clocks including 'call' , 'ret']
for unsigned integer division 5 [32 clocks including 'call' , 'ret']
edit: updated code correct minor glitch (correction costs 2 clock cycles).
i have program large number of division 3 , division 5. built in functions division slow, faster way needed.
the 2 methods below achieve same result faster making use of reciprocal multiplication.
both of these
example useage:
code: [select]
unsigned int x = 300;
x = divu5(x); //instead of: x = x / 5;
if (x == 60){
serial.print("it works :d");
}
for unsigned integer division 3 [32 clocks including 'call' , 'ret']
code: [select]
unsigned int divu3(unsigned int n) __attribute__((noinline));
unsigned int divu3(unsigned int n) {
unsigned long working;
asm volatile(
"ldi %a1, %3 \n\t"
"ldi %b1, 0xff \n\t"
"ldi %c1, 0x00 \n\t"
"ldi %d1, 0x00 \n\t"
"cpi %a0, 0xff \n\t"
"cpc %b0, %b1 \n\t"
"brlo .+4 \n\t"
"ldi %c1, 0x01 \n\t" //final answer++
"rjmp .+4 \n\t"
"subi %a0, 0xff \n\t" //n++
"sbci %b0, 0xff \n\t"
"mul %a0, %a1 \n\t" //a * x
"mov %b1, r1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"mul %b0, %a1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"adc %d1, %d1 \n\t" //d0 known 0, need grab carry
"add %c1, r0 \n\t" //a* x'
"adc %d1, r1 \n\t"
"movw %a0, %c1 \n\t" // >> 16
"eor r1, r1 \n\t"
: "=r" (n), "=r" (working)
: "0" (n), "m" (0x55)
: "r1", "r0"
);
return n;
}
for unsigned integer division 5 [32 clocks including 'call' , 'ret']
code: [select]
unsigned int divu5(unsigned int n) __attribute__((noinline));
unsigned int divu5(unsigned int n) {
unsigned long working;
asm volatile(
"ldi %a1, %3 \n\t"
"ldi %b1, 0xff \n\t"
"ldi %c1, 0x00 \n\t"
"ldi %d1, 0x00 \n\t"
"cpi %a0, 0xff \n\t"
"cpc %b0, %b1 \n\t"
"brlo .+4 \n\t"
"ldi %c1, 0x01 \n\t" //final answer++
"rjmp .+4 \n\t"
"subi %a0, 0xff \n\t" //n++
"sbci %b0, 0xff \n\t"
"mul %a0, %a1 \n\t" //a * x
"mov %b1, r1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"mul %b0, %a1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"adc %d1, %d1 \n\t" //d1 known 0, need grab carry
"add %c1, r0 \n\t" //a* x'
"adc %d1, r1 \n\t"
"movw %a0, %c1 \n\t" // >> 16
"eor r1, r1 \n\t"
: "=r" (n), "=r" (working)
: "0" (n), "m" (0x33)
: "r1", "r0"
);
return n;
}
edit: updated code correct minor glitch (correction costs 2 clock cycles).
it can extended others such 60: [31 clocks including 'call' , 'ret']
30: [29 clocks including 'call' , 'ret']
15: [32 clocks including 'call' , 'ret']
10: [29 clocks including 'call' , 'ret']
and 6: [29 clocks including 'call' , 'ret']
edit: minor correction each function remove glitch n = 0xffff
code: [select]
unsigned int divu60(unsigned int n) __attribute__((noinline));
unsigned int divu60(unsigned int n) {
unsigned long working;
asm volatile(
"ldi %a1, %3 \n\t"
"ldi %c1, 0x00 \n\t"
"ldi %d1, 0x00 \n\t"
"lsr %b0 \n\t"
"ror %a0 \n\t"
"lsr %b0 \n\t" // have divided 4, divide 15.
"ror %a0 \n\t"
"subi %a0, 0xff \n\t" //n++
"sbci %b0, 0xff \n\t"
"mul %a0, %a1 \n\t" //a * x
"mov %b1, r1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"mul %b0, %a1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"adc %d1, %d1 \n\t" //d1 known 0, need grab carry
"add %c1, r0 \n\t" //a* x'
"adc %d1, r1 \n\t"
"movw %a0, %c1 \n\t" // >> 16
"eor r1, r1 \n\t"
: "=r" (n), "=r" (working)
: "0" (n), "m" (0x11)
: "r1", "r0"
);
return n;
}
30: [29 clocks including 'call' , 'ret']
code: [select]
unsigned int divu30(unsigned int n) __attribute__((noinline));
unsigned int divu30(unsigned int n) {
unsigned long working;
asm volatile(
"ldi %a1, %3 \n\t"
"ldi %c1, 0x00 \n\t"
"ldi %d1, 0x00 \n\t"
"lsr %b0 \n\t" // have divided 2, divide 15.
"ror %a0 \n\t"
"subi %a0, 0xff \n\t" //n++
"sbci %b0, 0xff \n\t"
"mul %a0, %a1 \n\t" //a * x
"mov %b1, r1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"mul %b0, %a1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"adc %d1, %d1 \n\t" //d1 known 0, need grab carry
"add %c1, r0 \n\t" //a* x'
"adc %d1, r1 \n\t"
"movw %a0, %c1 \n\t" // >> 16
"eor r1, r1 \n\t"
: "=r" (n), "=r" (working)
: "0" (n), "m" (0x11)
: "r1", "r0"
);
return n;
}
15: [32 clocks including 'call' , 'ret']
code: [select]
unsigned int divu15(unsigned int n) __attribute__((noinline));
unsigned int divu15(unsigned int n) {
unsigned long working;
asm volatile(
"ldi %a1, %3 \n\t"
"ldi %b1, 0xff \n\t"
"ldi %c1, 0x00 \n\t"
"ldi %d1, 0x00 \n\t"
"cpi %a0, 0xff \n\t"
"cpc %b0, %b1 \n\t"
"brlo .+4 \n\t"
"ldi %c1, 0x01 \n\t" //final answer++
"rjmp .+4 \n\t"
"subi %a0, 0xff \n\t" //n++
"sbci %b0, 0xff \n\t"
"mul %a0, %a1 \n\t" //a * x
"mov %b1, r1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"mul %b0, %a1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"adc %d1, %d1 \n\t" //d1 known 0, need grab carry
"add %c1, r0 \n\t" //a* x'
"adc %d1, r1 \n\t"
"movw %a0, %c1 \n\t" // >> 16
"eor r1, r1 \n\t"
: "=r" (n), "=r" (working)
: "0" (n), "m" (0x11)
: "r1", "r0"
);
return n;
}
10: [29 clocks including 'call' , 'ret']
code: [select]
unsigned int divu10(unsigned int n) __attribute__((noinline));
unsigned int divu10(unsigned int n) {
unsigned long working;
asm volatile(
"ldi %a1, %3 \n\t"
"ldi %c1, 0x00 \n\t"
"ldi %d1, 0x00 \n\t"
"lsr %b0 \n\t" // have divided 2, divide 5.
"ror %a0 \n\t"
"subi %a0, 0xff \n\t" //n++
"sbci %b0, 0xff \n\t"
"mul %a0, %a1 \n\t" //a * x
"mov %b1, r1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"mul %b0, %a1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"adc %d1, %d1 \n\t" //d1 known 0, need grab carry
"add %c1, r0 \n\t" //a* x'
"adc %d1, r1 \n\t"
"movw %a0, %c1 \n\t" // >> 16
"eor r1, r1 \n\t"
: "=r" (n), "=r" (working)
: "0" (n), "m" (0x33)
: "r1", "r0"
);
return n;
}
and 6: [29 clocks including 'call' , 'ret']
code: [select]
unsigned int divu6(unsigned int n) __attribute__((noinline));
unsigned int divu6(unsigned int n) {
unsigned long working;
asm volatile(
"ldi %a1, %3 \n\t"
"ldi %c1, 0x00 \n\t"
"ldi %d1, 0x00 \n\t"
"lsr %b0 \n\t" // have divided 2, divide 3.
"ror %a0 \n\t"
"subi %a0, 0xff \n\t" //n++
"sbci %b0, 0xff \n\t"
"mul %a0, %a1 \n\t" //a * x
"mov %b1, r1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"mul %b0, %a1 \n\t"
"add %b1, r0 \n\t" //a* x'
"adc %c1, r1 \n\t"
"adc %d1, %d1 \n\t" //d1 known 0, need grab carry
"add %c1, r0 \n\t" //a* x'
"adc %d1, r1 \n\t"
"movw %a0, %c1 \n\t" // >> 16
"eor r1, r1 \n\t"
: "=r" (n), "=r" (working)
: "0" (n), "m" (0x55)
: "r1", "r0"
);
return n;
}
edit: minor correction each function remove glitch n = 0xffff
Arduino Forum > Development > Other Software Development > divu5() and divu3(). Fast replacements for unsigned division by 3, 5, 6, 10, 15
arduino
Comments
Post a Comment