;   Copyright (C) 2016 Free Software Foundation, Inc.
;   Contributed by Alex Panek.
;
; This file is free software; you can redistribute it and/or modify it
; under the terms of the GNU General Public License as published by the
; Free Software Foundation; either version 3, or (at your option) any
; later version.
;
; This file is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; General Public License for more details.
;
; Under Section 7 of GPL version 3, you are granted additional
; permissions described in the GCC Runtime Library Exception, version
; 3.1, as published by the Free Software Foundation.
;
; You should have received a copy of the GNU General Public License and
; a copy of the GCC Runtime Library Exception along with this program;
; see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
; <http://www.gnu.org/licenses/>.


#include "vregs.h"

    .text


#ifdef __RL78_MUL_NONE__

; ___mulsi3 (no hardware multiplier): 32 x 32 -> 32 bit multiply built from
; 8 x 8 -> 16 bit `mulu x` steps (AX = A * X).
;
; inputs:   BC:AX = (un)signed 32 bit value
;           [sp+4]..[sp+7]  = (un)signed 32 bit value
; output:   BC:AX  = the lower 32 bits of the product of the two inputs
; clobbers: -  (DE and HL are used as scratch but are pushed and popped here)
;
; Only the partial products that reach bits 0-31 of the product are computed:
;   bits  0-15 : a0*b0
;   bits  8-23 : a0*b1 + a1*b0
;   bits 16-31 : a0*b2 + a1*b1 + a2*b0
;   bits 24-31 : a0*b3 + a1*b2 + a2*b1 + a3*b0   (only the low byte is kept)
;
; NOTE(review): r_2, r_6 and r_7 are defined in vregs.h (not visible here);
; they are used below as memory operands that read C, L and H respectively --
; confirm which register bank that mapping assumes.
START_FUNC ___mulsi3   ; approx. 108 clocks on the S1 core

; First operand bytes on entry:
; X(r_0) = a0, A(r_1) = a1, C(r_2)  = a2, B(r_3)  = a3
; Second operand bytes after the two pushes below (entry offsets + 4):
; [sp+8] = b0, [sp+9] = b1, [sp+10] = b2, [sp+11] = b3

    push   de
    push   hl

    movw   hl, ax      ; L(r_6) = a0, H(r_7) = a1

    ; --- bits 24-31: a0*b3 + a1*b2 + a2*b1 + a3*b0 ---
    mov    a, [sp+11]  ; A = b3 (X still holds a0)
    mulu   x
    movw   de, ax      ; DE = a0*b3

    movw   ax, [sp+10] ; X = b2 (A gets b3, overwritten by the next instruction)
    mov    a, h        ; A = a1
    mulu   x
    addw   ax, de
    movw   de, ax      ; DE = a0*b3 + a1*b2

    mov    x, r_2      ; X = a2
    mov    a, [sp+9]   ; A = b1
    mulu   x
    addw   ax, de
    movw   de, ax      ; DE = a0*b3 + a1*b2 + a2*b1

    movw   ax, [sp+8]  ; X = b0 (A gets b1, overwritten by the next instruction)
    mov    a, b        ; A = a3
    mulu   x
    addw   ax, de      ; AX = a0*b3 + a1*b2 + a2*b1 + a3*b0
    shlw   ax, 8       ; keep only the low byte of the sum and move it to bits 8-15,
    movw   de, ax      ; i.e. bits 24-31 of the result; call this value S

    ; --- bits 16-31: S + a0*b2 + a1*b1 + a2*b0 ---
    movw   ax, [sp+10] ; X = b2
    mov    a, l        ; A = a0
    mulu   x
    addw   ax, de
    movw   de, ax      ; DE = S + a0*b2

    mov    x, r_7      ; X = a1
    mov    a, [sp+9]   ; A = b1
    mulu   x
    addw   ax, de
    movw   de, ax      ; DE = S + a0*b2 + a1*b1

    movw   ax, [sp+8]  ; X = b0
    mov    a, c        ; A = a2
    mulu   x
    addw   ax, de      ; AX = S + a0*b2 + a1*b1 + a2 * b0
    movw   bc, ax      ; move to BC (base for the hi word of the result);
                       ; a2 and a3 are no longer needed from here on

    ; --- bits 8-23: a0*b1 + a1*b0 ---
    mov    x, r_6      ; X = a0
    mov    a, [sp+9]   ; A = b1
    mulu   x
    movw   de, ax      ; DE = a0 * b1

    movw   ax, [sp+8]  ; X = b0
    mov    a, h        ; A = a1
    mulu   x
    addw   ax, de
    movw   de, ax      ; DE = a0 * b1 + a1 * b0 (carry of the addw is still pending)

    sknc               ; the prev sum is aligned at bit 8 of the result, so its carry
    inc    b           ; lands on bit 24, i.e. bit 0 of B (the most significant byte)

    shrw   ax, 8       ; add the corresponding bits (8-15) of the (a0 * b1 + a1 * b0) sum
    addw   ax, bc      ; to the high word of the result
    movw   bc, ax

    ; --- bits 0-15: a0*b0, plus the low byte of the previous sum at bits 8-15 ---
    movw   ax, [sp+8]  ; X = b0
    mov    a, l        ; A = a0
    mulu   x           ; AX = a0 * b0
    add    a, e        ; add the corresponding bits (0-7) of the (a0 * b1 + a1 * b0) sum
                       ; to the bits 8-15 of the result
    sknc               ; which may generate another carry into the higher 16 bits
    incw   bc          ; of the result

    pop    hl          ; restore in reverse push order
    pop    de
    ret

END_FUNC ___mulsi3

#endif /* __RL78_MUL_NONE__ */


#ifdef __RL78_MUL_G13__

; ___mulsi3 (G13 multiplier peripheral): 32 x 32 -> 32 bit multiply done as
; one 16 x 16 multiply followed by two 16 x 16 multiply-accumulate operations.
;
; inputs:   BC:AX = (un)signed 32 bit value
;           [sp+4]..[sp+7]  = (un)signed 32 bit value
; output:   BC:AX  = the lower 32 bits of the product of the two inputs
; clobbers: -
;
; Notation: %h1 / %H1 = low / high word of the first operand,
;           %h2 / %H2 = low / high word of the second operand.
;   low  word of result = low word of (%h1 * %h2)
;   high word of result = high word of (%h1 * %h2) + %H1 * %h2 + %h1 * %H2
;                         (only the low 16 bits of that sum are kept)
START_FUNC ___mulsi3

    push   bc              ; we'll need these later so push them on the stack =>
    push   ax              ; [sp] = %h1, [sp+2] = %H1, [sp+8] = %h2, [sp+10] = %H2

    mov    !MDUC, #0       ; unsigned multiply operation

    movw   MDAL, ax        ; %h1
    movw   ax, [sp+8]      ; %h2
    movw   MDAH, ax        ; <--- this starts the multiplication operation

    nop                    ; MDBH:MDBL = MDAL * MDAH (must wait 1 cycle)

    movw   ax, MDBL
    movw   bc, ax          ; get and save the low word of the result into BC

    mov    !MDUC, #0x40    ; unsigned multiply-accumulation operation

    movw   ax, MDBH        ; move the hi word of the (%h1 * %h2) result into
    movw   !MDCL, ax       ; the low 16 bits of the 32-bit accumulator
                           ; (MDCH is never written or read here: only MDCL is used)

    movw   ax, [sp+2]      ; %H1
    movw   MDAL, ax
    movw   ax, [sp+8]      ; %h2
    movw   MDAH, ax        ; <--- this starts the 1st multiply-accumulation operation

    nop                    ; MDCH:MDCL += MDAL * MDAH

    movw   ax, [sp]        ; %h1
    movw   MDAL, ax        ; %h1 (prev op result is ready in MDCH:MDCL)
    movw   ax, [sp+10]     ; %H2
    movw   MDAH, ax        ; <--- this starts the 2nd multiply-accumulation operation

    pop    ax              ; since the multiply-accumulation takes 2 clocks to finish
    pop    ax              ; use these clocks to clean up the stack (popped values are discarded)

    movw   ax, !MDCL       ; then load the hi word of the result from MDCL
    xchw   ax, bc          ; BC holds the low part so exchange them before returning

    ret

END_FUNC ___mulsi3

#endif /* __RL78_MUL_G13__ */


#ifdef __RL78_MUL_G14__

; ___mulsi3 (G14 multiply instructions): 32 x 32 -> 32 bit multiply done as
; one `mulhu` (16 x 16 -> 32) followed by two `machu` multiply-accumulates.
;
; inputs:   BC:AX = (un)signed 32 bit value
;           [sp+4]..[sp+7]  = (un)signed 32 bit value
; output:   BC:AX  = the lower 32 bits of the product of the two inputs
; clobbers: DE
;
; Notation: %h1 / %H1 = low / high word of the first operand,
;           %h2 / %H2 = low / high word of the second operand.
;   low  word of result = low word of (%h1 * %h2)
;   high word of result = high word of (%h1 * %h2) + %H1 * %h2 + %h1 * %H2
;                         (only the low 16 bits of that sum are kept)
START_FUNC ___mulsi3

    push   bc              ; we'll need these later so push them on the stack =>
    push   ax              ; [sp] = %h1, [sp+2] = %H1, [sp+8] = %h2, [sp+10] = %H2

    movw   bc, ax          ; %h1
    movw   ax, [sp+8]      ; %h2
    mulhu                  ; BC:AX = AX * BC

    movw   de, ax          ; save the low word of the result in DE

    movw   ax, bc
    movw   MACRL, ax       ; init low word of the accumulator with the hi word of (%h1 * %h2)
                           ; (MACRH is left as is: only MACRL is read back below)

    movw   ax, [sp+8]      ; %h2
    movw   bc, ax
    movw   ax, [sp+2]      ; %H1
    machu                  ; MACRH:MACRL += AX * BC

    movw   ax, [sp+10]     ; %H2
    movw   bc, ax
    movw   ax, [sp]        ; %h1
    machu                  ; MACRH:MACRL += AX * BC

    addw   sp, #4          ; clean-up the copy of the 1st operand

    ; Build the return value with register-to-register moves only; the low
    ; word is still live in DE, so there is no need to read it back through
    ; the r_4 memory alias from vregs.h.
    movw   ax, MACRL       ; hi word of the result = low part of the accumulator
    movw   bc, ax          ; return hi word of the result in BC
    movw   ax, de          ; and the lo word (saved from mulhu) in AX

    ret

END_FUNC ___mulsi3

#endif /* __RL78_MUL_G14__ */
