;   Copyright (C) 2016, 2017 Free Software Foundation, Inc.
;   Contributed by Alex Panek.
;
; This file is free software; you can redistribute it and/or modify it
; under the terms of the GNU General Public License as published by the
; Free Software Foundation; either version 3, or (at your option) any
; later version.
;
; This file is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; General Public License for more details.
;
; Under Section 7 of GPL version 3, you are granted additional
; permissions described in the GCC Runtime Library Exception, version
; 3.1, as published by the Free Software Foundation.
;
; You should have received a copy of the GNU General Public License and
; a copy of the GCC Runtime Library Exception along with this program;
; see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
; <http://www.gnu.org/licenses/>.


#include "vregs.h"

    .text


; ___usmulsidi3: multiply an unsigned 32-bit value by a signed 32-bit value.
;
; inputs:   [sp+6] :[sp+4] = 1st operand (32-bit unsigned)
;           [sp+10]:[sp+8] = 2nd operand (32-bit signed)
; output:   r14:r12:r10:r8 = the 64-bit signed product of the two values
; clobbers: A here; the remainder depends on the path taken
;           (NOTE(review): the full clobber list has not been audited)
START_FUNC ___usmulsidi3

    mov    a, [sp+11]      ; A = most significant byte of the 2nd operand
    bf     a.7, $2f        ; sign bit clear: both values are non-negative, so the
                           ; plain unsigned multiply at label 2 gives the answer
    clr1   a.6             ; the shared code at label 1 reads A.6 as "1st operand
                           ; is negative"; the 1st operand is unsigned, so clear it
    br     $1f             ; 2nd operand is negative (A.7 set): take the more
                           ; complex sign-correcting computation at label 1

; ___mulsidi3: multiply two signed 32-bit values.
;
; inputs:   [sp+6] :[sp+4] = 1st operand (32-bit signed)
;           [sp+10]:[sp+8] = 2nd operand (32-bit signed)
; output:   r14:r12:r10:r8 = the 64-bit signed product of the two values
; clobbers: A and CY here; the remainder depends on the path taken
;           (NOTE(review): the full clobber list has not been audited)
START_ANOTHER_FUNC ___mulsidi3

    mov    a, [sp+7]       ; A = most significant byte of the 1st operand
    mov1   CY, a.7         ; CY = sign bit of the 1st operand
    mov    a, [sp+11]      ; A = most significant byte of the 2nd operand
    mov1   a.6, CY         ; now A.7 = sign of 2nd operand, A.6 = sign of 1st operand
    and    a, #11000000B   ; keep only the two sign flags
    bnz    $1f             ; if either operand is negative, jump to the more
                           ; complex computation, otherwise fall-through into
                           ; the unsigned multiply that follows

#ifdef __RL78_MUL_NONE__

; ___umulsidi3, software-only variant (built when __RL78_MUL_NONE__ is defined).
;
; Schoolbook multiplication on 16-bit words.  Writing the 1st operand as A:B
; and the 2nd operand as C:D (high word : low word):
;
;     product = (AxC << 32) + ((AxD + BxC) << 16) + BxD
;
; Each 16x16 -> 32 partial product is obtained from ___umulhisi3, which takes
; its factors in AX and BC and returns the result in BC:AX.
;
; inputs:   [sp+6] :[sp+4] = 1st operand (32-bit unsigned)
;           [sp+10]:[sp+8] = 2nd operand (32-bit unsigned)
; output:   r14:r12:r10:r8 = the 64-bit unsigned product of the two values
; clobbers: AX, BC and CY, plus whatever ___umulhisi3 itself clobbers
START_ANOTHER_FUNC ___umulsidi3

    ; --- upper half of the provisional result: AxC -> r14:r12 ---
2:  movw   ax, [sp+6]      ; A
    movw   bc, ax
    movw   ax, [sp+10]     ; C
    call   !!___umulhisi3  ; BC:AX = AxC
    movw   r12, ax
    movw   ax, bc
    movw   r14, ax

    ; --- lower half of the provisional result: BxD -> r10:r8 ---
    movw   ax, [sp+4]      ; B
    movw   bc, ax
    movw   ax, [sp+8]      ; D
    call   !!___umulhisi3  ; BC:AX = BxD
    movw   r8, ax          ; bits 0-15 are final from here on
    movw   ax, bc
    movw   r10, ax

    ; --- first cross product: AxD, accumulated by the helper at label 0 ---
    movw   ax, [sp+6]      ; A
    movw   bc, ax
    movw   ax, [sp+8]      ; D
    call   $!0f            ; factors are already in registers, so the extra
                           ; return address on the stack does not matter

    ; --- second cross product: BxC, accumulated by falling into label 0 ---
    movw   ax, [sp+4]      ; B
    movw   bc, ax
    movw   ax, [sp+10]     ; C

    ; Helper: multiply AX by BC and add the 32-bit product into r14:r12:r10
    ; at bit position 16.  Reached once through the call above (returns to
    ; the BxC set-up) and once by fall-through (returns to our caller).
0:  call   !!___umulhisi3  ; BC:AX = AxD on the first pass, BxC on the second

    addw   ax, r10         ; low word of the cross product goes into bits 16-31
    movw   r10, ax         ; (CY = carry out of bits 16-31)

    movw   ax, bc          ; high word of the cross product ...
    sknc
    incw   ax              ; ... plus the carry from the addition above

    addw   ax, r12         ; goes into bits 32-47 (CY = carry out of them)
    movw   r12, ax

    sknc
    incw   r14             ; propagate that carry into bits 48-63

    ret

END_ANOTHER_FUNC ___umulsidi3

#endif /* __RL78_MUL_NONE__ */

#ifdef __RL78_MUL_G13__

; ___umulsidi3 using the memory-mapped multiplier registers MDUC, MDAL,
; MDAH, MDBL, MDBH, MDCL and MDCH (built when __RL78_MUL_G13__ is defined).
;
; Writing the 1st operand as A:B and the 2nd operand as C:D
; (high word : low word):
;
;     product = (AxC << 32) + ((AxD + BxC) << 16) + BxD
;
; AxC and BxD are computed with two plain multiplies.  The middle 32 bits
; are then formed in the MDCH:MDCL accumulator, which is preloaded with
; low(AxC):high(BxD) and receives AxD and BxC through two
; multiply-accumulate operations.  Carries out of the accumulator are
; added to the top word r14.
;
; inputs:   [sp+6] :[sp+4] = 1st operand (32-bit unsigned)
;           [sp+10]:[sp+8] = 2nd operand (32-bit unsigned)
; output:   r14:r12:r10:r8 = the 64-bit unsigned product of the two values
; clobbers: BC (AX and CY are overwritten as well, and the multiplier
;           registers listed above are left modified)
START_ANOTHER_FUNC ___umulsidi3

2:  mov    !MDUC, #0       ; unsigned multiply operation

    movw   ax, [sp+6]      ; A
    movw   MDAL, ax
    movw   ax, [sp+10]     ; C
    movw   MDAH, ax        ; <--- this starts the multiplication operation

    nop                    ; MDBH:MDBL = MDAL * MDAH (must wait 1 cycle)

    movw   ax, MDBL        ; store the AxC value by parts: keep its lower part in BC
    movw   bc, ax          ; cannot store to MDCH (not in mult/acc mode)
    movw   ax, MDBH        ; and put its higher part
    movw   r14, ax         ; in the high 16 bits of the 64-bit result

    movw   ax, [sp+4]      ; B
    movw   MDAL, ax
    movw   ax, [sp+8]      ; D
    movw   MDAH, ax        ; <--- this starts the multiplication operation

    nop                    ; MDBH:MDBL = MDAL * MDAH (must wait 1 cycle)

    movw   ax, MDBL        ; store low word of BxD in the lower 16-bits of the result
    movw   r8, ax          ; (this is final - does not need to be further modified)

    mov    !MDUC, #0x40    ; switch MDUC to unsigned multiply-accumulation operation
   
    movw   ax, bc          ; get the lower part of AxC value and store it
    movw   !MDCH, ax       ; into the higher part of the MDC accumulator
    
    movw   ax, MDBH        ; copy the higher 16-bits of the BxD value
    movw   !MDCL, ax       ; to the lower part of the accumulator

    movw   ax, [sp+6]      ; A
    movw   MDAL, ax
    movw   ax, [sp+8]      ; D
    movw   MDAH, ax        ; <--- this starts the 1st multiply-accumulation operation

    movw   ax, [sp+4]      ; B (loading it here also fills the wait for the 1st result)
    movw   MDAL, ax        ; <--- 1st multiply-acc result ready

    mov    a, !MDUC        ; read the status before the 2nd operation is started
    mov1   CY, a.2         ; get the overflow/carry of the mult-acc op

    movw   ax, [sp+10]     ; C
    movw   MDAH, ax        ; <--- this starts the 2nd multiply-accumulation operation

    sknc                   ; CY still holds the carry of the 1st accumulation
    incw   r14             ; add it to the highest word of the result
    nop                    ; *I think* one additional cycle is needed here if CY == 0
                           ; (i.e. when the incw above is skipped)

    movw   ax, !MDCL       ; copy the two words of the MDC accumulator
    movw   r10, ax         ; to the middle words of the 64-bit result
    movw   ax, !MDCH
    movw   r12, ax

    mov    a, !MDUC
    mov1   CY, a.2         ; get the overflow/carry of the mult-acc op

    sknc
    incw   r14             ; update the highest word of the result, if necessary

    ret

END_ANOTHER_FUNC ___umulsidi3

#endif /* __RL78_MUL_G13__ */

#ifdef __RL78_MUL_G14__

; ___umulsidi3 using the mulhu / machu instructions and the MACRH:MACRL
; accumulator (built when __RL78_MUL_G14__ is defined).
;
; Writing the 1st operand as A:B and the 2nd operand as C:D
; (high word : low word):
;
;     product = (AxC << 32) + ((AxD + BxC) << 16) + BxD
;
; AxC and BxD are computed with mulhu.  The middle 32 bits are then formed
; in MACRH:MACRL, which is preloaded with low(AxC):high(BxD) and receives
; AxD and BxC through two machu operations.  A carry out of the accumulator
; (reported in CY) is added to the top word r14.
;
; inputs:   [sp+6] :[sp+4] = 1st operand (32-bit unsigned)
;           [sp+10]:[sp+8] = 2nd operand (32-bit unsigned)
; output:   r14:r12:r10:r8 = the 64-bit unsigned product of the two values
; clobbers: AX, BC, CY and the accumulator MACRH:MACRL
START_ANOTHER_FUNC ___umulsidi3

2:  movw   ax, [sp+6]      ; A
    movw   bc, ax
    movw   ax, [sp+10]     ; C
    mulhu                  ; BC:AX = AX * BC

    movw   MACRH, ax       ; store the lower part of the result into the high part
    movw   ax, bc          ; of the accumulator and its higher part
    movw   r14, ax         ; into the high 16 bits of the 64-bit result

    movw   ax, [sp+4]      ; B
    movw   bc, ax
    movw   ax, [sp+8]      ; D
    mulhu                  ; BC:AX = AX * BC

    movw   r8, ax          ; store low word of BxD in the lower 16-bits of the result
    movw   ax, bc
    movw   MACRL, ax       ; and the high word to the lower part of the accumulator

    movw   ax, [sp+6]      ; A
    movw   bc, ax
    movw   ax, [sp+8]      ; D
    machu                  ; MACRH:MACRL += AX * BC

    sknc
    incw   r14             ; take into account a possible carry

    movw   ax, [sp+4]      ; B
    movw   bc, ax
    movw   ax, [sp+10]     ; C
    machu                  ; MACRH:MACRL += AX * BC

    movw   ax, MACRL       ; copy the two words of the MACR accumulator
    movw   r10, ax         ; to the middle words of the 64-bit result
    movw   ax, MACRH
    movw   r12, ax         ; (these copies are relied upon to leave CY untouched)

    sknc                   ; CY here is still the one produced by the 2nd machu
    incw   r14             ; take into account a possible carry

    ret

END_ANOTHER_FUNC ___umulsidi3

#endif /* __RL78_MUL_G14__ */

    ; this part is shared between the G10/G13/G14 implementations
    ;
    ; Sign-correcting path of ___mulsidi3 / ___usmulsidi3.  On entry:
    ;   A.7 = 1 if the 2nd operand is negative
    ;   A.6 = 1 if the 1st operand is negative
    ; and at least one of the two bits is set.
    ;
    ; The operands are first multiplied as if they were unsigned.  Reading a
    ; negative 32-bit value as unsigned makes it 2^32 too large, so for each
    ; negative operand the unsigned product is too large by the *other*
    ; operand shifted left by 32 bits.  The correction is therefore to
    ; subtract the other operand from the upper 32 bits (r14:r12).
1:  mov    l, a            ; save the sign bits info in L because L will not be
                           ; clobbered by ___umulsidi3 and possible descendants

    ; Push copies of both operands for the call.  Every push lowers SP by 2,
    ; which is why the same offset is used twice in a row: it addresses the
    ; next lower word of the caller's operand each time.
    movw   ax, [sp+6]      ; high word of the 1st operand
    push   ax
    movw   ax, [sp+6]      ; low word of the 1st operand
    push   ax
    movw   ax, [sp+14]     ; high word of the 2nd operand
    push   ax
    movw   ax, [sp+14]     ; low word of the 2nd operand
    push   ax              ; the copies end up in swapped operand order, which
                           ; makes no difference for a product
    call   $!___umulsidi3  ; call the unsigned version (result in r8..r15)
    addw   sp, #8          ; drop the four pushed words

    mov    c, r_6          ; move the sign bits info to C
                           ; (r_6 / r_2 are presumably the vregs.h aliases of L / C)
    movw   hl, sp          ; 'cause we need HL for access to stack operands in subw

    bf     r_2.7, $3f      ; 2nd operand not negative: then the 1st one must be,
                           ; so go straight to the second correction

    movw   ax, r12         ; subtract operand 1 from the higher 32-bits of the result
    subw   ax, [hl+4]
    movw   r12, ax

    movw   ax, r14
    sknc
    decw   ax              ; propagate the borrow of the low-word subtraction
    subw   ax, [hl+6]
    movw   r14, ax

    bf     r_2.6, $4f      ; 1st operand not negative: no second correction needed

3:  movw   ax, r12        ; subtract operand 2 from the higher 32-bits of the result
    subw   ax, [hl+8]
    movw   r12, ax

    movw   ax, r14
    sknc
    decw   ax              ; propagate the borrow of the low-word subtraction
    subw   ax, [hl+10]
    movw   r14, ax

4:  ret

END_ANOTHER_FUNC ___mulsidi3

END_FUNC ___usmulsidi3
