16-bit fast multiplication routine

Jerry Messina · Post by **Jerry Messina** » Wed Sep 24, 2014 7:17 pm

The PIC18 has a built-in hardware multiplier that can be used to accelerate multiplication operations.

The 8x8 multiplier is used by SF when multipying two bytes together, but when you multiply anything
larger than that it uses a software algorithm, so it's much slower.

Here's a routine that takes advantage of the multiplier to do a 16x16 operation. As you can see from some of
the comments, it's about 10x as fast as the native multiply, even with the overhead of the function call

Code: Select all

//
// umul16x16() - perform unsigned 16x16 multiply using the
// PIC18 8x8 hardware multiplier and return the 32-bit result
// the method is taken from the examples found in the
// hardware multiplier section of the datasheet
//
function umul16x16(arg1 as word, arg2 as word) as longword
    structure m32_t
        r0 as byte
        r1 as byte
        r2 as byte
        r3 as byte
        w as word       // extra temp space (also used by r24 type)
        // aliases
        w0 as r0.asword
        w1 as r2.asword
        dw as r0.aslongword
        r24 as r1.aslongword
    end structure
 
    dim d as m32_t
    
    d.w0 = arg1.bytes(0) * arg2.bytes(0)    // arg1L * arg2L
    d.w1 = arg1.bytes(1) * arg2.bytes(1)    // arg1H * arg2H
    d.w = arg1.bytes(0) * arg2.bytes(1)     // arg1L * arg2H
    d.r24 = d.r24 + d.w                     // add cross products
    d.w = arg1.bytes(1) * arg2.bytes(0)     // arg1H * arg2L
    d.r24 = d.r24 + d.w                     // add cross products
    result = d.dw
end function


// examples
 Dim A As Word
 Dim B As Word
 Dim C1, C2 As LongWord

 A = 2
 B = 1
 C1 = A * B               // 65us @ 32MHz
 C2 = umul16x16(A, B)     // 8us @ 32MHz

 A = 2048
 B = 1024
 C1 = A * B               // 65us @ 32MHz
 C2 = umul16x16(A, B)     // 8us @ 32MHz

 A = 65432
 B = 65431
 C1 = A * B               // 81us @ 32MHz
 C2 = umul16x16(A, B)     // 8us @ 32MHz

 A = 65535
 B = 65535
 C1 = A * B               // 89us @ 32MHz
 C2 = umul16x16(A, B)     // 8us @ 32MHz

while(true)
end while

Jerry Messina · Post by **Jerry Messina** » Thu Sep 25, 2014 7:06 pm

And here's a follow on to the previous post that has both 16-bit signed (smul16x16) and unsigned routines (umul16x16)
Like before, using the hardware multiplier is 8-10x faster

Code: Select all

//
// umul16x16() - perform unsigned 16x16 multiply using the
// PIC18 8x8 hardware multiplier and return the 32-bit result
// the method is taken from the examples found in the
// hardware multiplier section of the datasheet
//
function umul16x16(arg1 as word, arg2 as word) as longword
    structure m32_t
        r0 as byte
        r1 as byte
        r2 as byte
        r3 as byte
        w as word       // extra temp space
        // aliases
        w0 as r0.asword
        w1 as r2.asword
        dw as r0.aslongword
        r24 as r1.aslongword
    end structure
 
    dim d as m32_t
    
    d.w0 = arg1.bytes(0) * arg2.bytes(0)    // arg1L * arg2L
    d.w1 = arg1.bytes(1) * arg2.bytes(1)    // arg1H * arg2H
    d.w = arg1.bytes(0) * arg2.bytes(1)     // arg1L * arg2H
    d.r24 = d.r24 + d.w                     // add cross products
    d.w = arg1.bytes(1) * arg2.bytes(0)     // arg1H * arg2L
    d.r24 = d.r24 + d.w                     // add cross products
    result = d.dw
end function

//
// smul16x16() - perform signed 16x16 multiply using the
// PIC18 8x8 hardware multiplier and return the 32-bit signed result
//
function smul16x16(arg1 as integer, arg2 as integer) as longint
    structure m32_t
        r0 as byte
        r1 as byte
        r2 as byte
        r3 as byte
        w as word       // extra temp space
        w0 as r0.asword
        w1 as r2.asword
        dw as r0.aslongword
        li as r0.aslongint
        r24 as r1.aslongword
    end structure
 
    dim d as m32_t
    
    d.w0 = arg1.bytes(0) * arg2.bytes(0)    // arg1L * arg2L
    d.w1 = arg1.bytes(1) * arg2.bytes(1)    // arg1H * arg2H
    d.w = arg1.bytes(0) * arg2.bytes(1)     // arg1L * arg2H
    d.r24 = d.r24 + d.w                     // add cross products
    d.w = arg1.bytes(1) * arg2.bytes(0)     // arg1H * arg2L
    d.r24 = d.r24 + d.w                     // add cross products
    
    // check if arg2 is negative...
    if (arg2.bits(15) = 1) then
        d.w1 = d.w1 - word(arg1)
    endif
    // check if arg1 is negative...
    if (arg1.bits(15) = 1) then
        d.w1 = d.w1 - word(arg2)
    endif
    result = d.li
end function

//
// examples and test code
//
dim b1, b2 as byte          // 8-bit unsigned
dim si1, si2 as shortint    // 8-bit signed  -128 to 127 

dim w1, w2 as word          // 16-bit unsigned
dim i1, i2 as integer       // 16-bit signed  -32768 to 32767 

dim lw1, lw2 as longword    // 32-bit unsigned
dim li1, li2 as longint     // 32-bit signed -2147483648 to 2147483647 

dim fail as boolean

dim t, t1, t2 as word          // elap time
dim TMR0ON as T0CON.bits(7)

// check to make sure SF is using the hardware multiplier for 8-bit operations
// the functions above assume it does
// (look as the asm code for 'MULWF' instructions)
w1 = b1 * b2        // 8-bit unsigned
i1 = si1 * si2      // 8-bit signed

//
// unsigned examples
//
fail = false

w1 = 65535
w2 = 1
lw1 = w1 * w2               // sf method (~65us @ 32MHz)
lw2 = umul16x16(w1, w2)     // fast method (8us @ 32MHz)
if (lw1 <> lw2) then        // make sure results match
    fail = true
endif

w1 = 65535
w2 = 65535
lw1 = w1 * w2               // sf method (~89us @ 32MHz)
lw2 = umul16x16(w1, w2)     // fast method
if (lw1 <> lw2) then        // make sure results match
    fail = true
endif

//
// signed examples
//
fail = false

i1 = 32767
i2 = 32767
li1 = i1 * i2               // sf method
li2 = smul16x16(i1, i2)     // fast method
if (li1 <> li2) then        // make sure results match
    fail = true
endif

i1 = 32767
i2 = -32768
li1 = i1 * i2
li2 = smul16x16(i1, i2)
if (li1 <> li2) then
    fail = true
endif

i1 = -32768
i2 = 32767
li1 = i1 * i2
li2 = smul16x16(i1, i2)
if (li1 <> li2) then
    fail = true
endif

i1 = -32768
i2 = -32768
li1 = i1 * i2
li2 = smul16x16(i1, i2)
if (li1 <> li2) then
    fail = true
endif

{
//
// example code to count instruction cycles
// you can also use the simulator in MPLAB (which will give the time as well)
//

// set TMR0 to count instruction cycles
T0CON = %00001000

// just time the worst-case execution time (doesn't change much for hdw method)
w1 = 65535
w2 = 1

write_tmr(0)
TMR0ON = 1
lw1 = w1 * w2               // sf standard method
TMR0ON = 0
read_tmr(t1)                // t1 contains number of instruction cycles

write_tmr(0)
TMR0ON = 1
lw2 = umul16x16(w1, w2)     // fast method
TMR0ON = 0
read_tmr(t2)                // t2 contains number of instruction cycles

t = 0
}

while(true)
end while

Swordfish

16-bit fast multiplication routine

16-bit fast multiplication routine

Re: 16-bit fast multiplication routine