// file kernel/n/x86-64/karatsuba.S: Karatsuba multiplication
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                     Multiplication de Karatsuba                       |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                            # +------------------+
                            # |  Multiplication  |
                            # +------------------+
        

# Internal Karatsuba multiplication, register calling convention.
#
# input:
#   a = natural number of length la     rsi = &a, rdx = la
#   b = natural number of length lb     rbx = &b, rcx = lb
#   c = natural number of length la+lb  rdi = &c
# constraints: 0 < lb <= la
#
# output:
#   c <- a * b
#
# clobbered registers: all except r14,r15
# NOTE(review): r12 is written by the recombination code at .Lsn_kara_aux,
# so it belongs to the clobbered set. The helper routines called from here
# (.Lsn_fasub, .Lsn_fadd_1, .Lsn_finc, .Lsn_fdec, .Lsn_fmul_n2) are defined
# elsewhere; their register and flag contracts are assumed, not visible.

#ifdef assembly_sn_karamul
        ALIGN(32)
#ifdef debug_karamul
.Lsn_fkaramul_buggy:
#else
.Lsn_fkaramul:
#endif

#undef L
#define L(x) .Lsn_fkaramul_##x

        # small multiplication => quadratic algorithm (tail jump, the
        # argument registers are passed through unchanged)
        cmpq   $karamul_lim, %rcx
        jbe   .Lsn_fmul_n2

        # local variables: offsets from rsp, valid once the seven pushes
        # below are done
        #   d       = buffer of 2p digits reserved just above the saved words
        #   x       = sign of (a0-a1)*(b0-b1), kept in bit 0
        #   p, q, r = slice lengths
        #   a, b, c = saved operand addresses
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #undef  _q_
        #undef  _r_
        #undef  _x_
        #define _d_ 56(%rsp)
        #define _x_ 48(%rsp)
        #define _p_ 40(%rsp)
        #define _q_ 32(%rsp)
        #define _r_ 24(%rsp)
        #define _a_ 16(%rsp)
        #define _b_  8(%rsp)
        #define _c_   (%rsp)

        # split a = a0 + a1*BASE^p and b = b0 + b1*BASE^p, where a0 and b0
        # have p digits, a1 has q digits and b1 has r digits
        movq   %rdx,    %rbp
        shrq   $1,      %rbp
        adcq   $0,      %rbp            # rbp <- p = ceil(la/2)
        subq   %rbp,    %rcx            # rcx <- r = lb - p
        jbe    L(tranches)              # lb <= p: cut a into slices instead
        subq   %rbp,    %rdx            # rdx <- q = la - p
        movq   %rbp,    %rax
        shlq   $4,      %rax            # 16*p bytes: room for 2p digits on the stack
	ALLOCA
        pushq  $0                       # x <- 0
        pushq  %rbp                     # save p
        pushq  %rdx                     # save q
        pushq  %rcx                     # save r
        pushq  %rsi                     # save &a
        pushq  %rbx                     # save &b
        pushq  %rdi                     # save &c

        # compute |a0 - a1| and |b0 - b1| into c
        # (assumes .Lsn_fasub takes rsi/rdx = first operand and length,
        # rbx/rcx = second operand and length, rdi = destination, and returns
        # the sign of the difference in CF with rcx = 0 - TODO confirm)
        movq   %rdx,    %rcx            # rcx <- q
        movq   %rbp,    %rdx            # rdx <- p
        leaq   (%rsi,%rdx,8), %rbx      # rbx <- &a1
        call   .Lsn_fasub               # c[0..] <- |a0 - a1| (p digits are read back below)
        adcq   %rcx,    _x_             # x <- sign(a0-a1)

        movq   _b_,     %rsi
        movq   _c_,     %rdi
        movq   _r_,     %rcx
        movq   _p_,     %rdx            # rdx <- p
        leaq   (%rsi,%rdx,8), %rbx      # rbx <- &b1
        leaq   (%rdi,%rdx,8), %rdi      # rdi <- &c[p]
        call   .Lsn_fasub               # c[p..2p-1] <- |b0 - b1|
        adcq   %rcx,    _x_             # x ^= sign(b0-b1) (only bit 0 of the sum is used)

        # d <- |a0-a1|*|b0-b1| (recursive call on two p-digit operands)
        movq   _c_,     %rsi
        movq   _p_,     %rdx
        movq   %rdx,    %rcx            # rcx <- p
        leaq   (%rsi,%rdx,8), %rbx      # rbx <- &c[p]
        leaq   _d_,     %rdi
        call   .Lsn_fkaramul

        # c <- a1b1 : a0b0
        # low half: c[0..2p-1] <- a0*b0
        movq   _a_,     %rsi
        movq   _b_,     %rbx
        movq   _c_,     %rdi
        movq   _p_,     %rcx
        movq   %rcx,    %rdx            # rdx <- p
        call   .Lsn_fkaramul

        # high half: c[2p..2p+q+r-1] <- a1*b1
        movq   _c_,     %rdi
        movq   _b_,     %rbx
        movq   _a_,     %rsi
        movq   _r_,     %rcx
        movq   _q_,     %rdx
        movq   _p_,     %rax
        leaq   (%rsi,%rax,8), %rsi      # rsi <- &a1
        leaq   (%rbx,%rax,8), %rbx      # rbx <- &b1
        leaq   (%rdi,%rax,8), %rdi      # rdi <- &c1
        leaq   (%rdi,%rax,8), %rdi      # rdi <- &c[2p]
        call   .Lsn_fkaramul

        # Shared Karatsuba recombination; also the landing point for karasqr,
        # which jumps here with the same stack frame.
        # On entry (offsets from rsp as defined by the code that built the
        # frame): c, b, a, r, q, p, x, then the buffer d, with
        #   c = a1b1 : a0b0 and d = |a0-a1|*|b0-b1|
        # On exit c holds the full product and the frame has been released.
.Lsn_kara_aux:

	# reassign the local variables: c, p and q+r now live in registers
	movq    _c_, %r9
	movq    _p_, %r10
	mov     _q_, %r11   # q <- q+r
	addq    _r_, %r11
	
	#undef  _c_
	#undef  _p_
	#undef  _q_
	#undef  _s_
	#define _c_  %r9
	#define _p_  %r10
	#define _q_  %r11
	#define _s_  %r12

        # c += (a0b0 + a1b1)*BASE^p
        #
        # Write c = c0 : c1 : c2 : c3 with c0, c1, c2 of p digits each and c3
        # of q+r-p digits. The target is
        #   c[p..]  <- c1 + c0 + c2
        #   c[2p..] <- c2 + c1 + c3
        # obtained as c2 <- c1 + c2 (carry k1), c1 <- c0 + c2 (carry k2),
        # c[2p..] += c3, after which k1 is added at c[3p] and k1 + k2 at c[2p].
        #
        # s (r12) collects the two carries. Only its bits 0 and 1 are read
        # afterwards, so its initial content does not matter.
        # NOTE(review): the pointer values quoted after each call assume that
        # .Lsn_fadd_1 and .Lsn_finc advance rsi, rbx, rdi past the digits they
        # processed and keep r9-r12 intact - TODO confirm in their source.

        movq   _p_,     %rcx
	leaq   (_c_, _p_,8), %rbx       # rbx <- &c[p]
        leaq   (%rbx,_p_,8), %rsi       # rsi <- &c[2p]
	movq   %rsi,    %rdi            # rdi <- &c[2p]
        call   .Lsn_fadd_1              # c[2p..3p-1] += c[p..2p-1]
        rclq   $1,      _s_             # shift carry k1 into bit 0 of s

        movq   _c_,     %rsi
        movq   _p_,     %rcx
	leaq   (_c_,_p_,8), %rdi        # rdi <- &c[p]
        call   .Lsn_fadd_1              # c[p..2p-1] <- c[0..p-1] + c[2p..3p-1]
        rclq   $1,      _s_             # now bit 1 of s = k1, bit 0 of s = k2

        movq   %rdi,    %rsi            # rsi <- &c[2p]
        movq   _q_,     %rdx
	movq   _q_,     %rcx
        subq   _p_,     %rcx            # rcx <- q+r-p
        jz     L(short_c)               # c3 is empty: nothing to add
        call   .Lsn_finc                # c[2p..2p+q+r-1] += c[3p..2p+q+r-1]

        # propagate the first carry (k1, bit 1 of s) through c[3p..2p+q+r-1]
        # rbx is expected to point at &c[2p+q+r], indexed with a negative count
        bt     $1,      _s_
        jnc    L(short_c)
        movq   _p_,     %rdx
        subq   _q_,     %rdx            # rdx <- p - q - r
1:
        incq   (%rbx,%rdx,8)
        jnz    L(short_c)               # digit did not wrap: carry absorbed
        incq   %rdx
        jnz    1b                       # stop at the end of c
L(short_c):

        # propagate both carries through c[2p..2p+q+r-1]
        # (the two carries are simply summed, so their order is irrelevant)
        bt     $0,      _s_
        setc   %al                      # al <- bit 0 of s = k2 (second addition)
	movzx  %al,     %rax
        subq   _q_,     %rcx            # rcx <- -(q+r); relies on rcx = 0 here
        bt     $1,      _s_             # CF <- bit 1 of s = k1 (first addition)
        adcq   %rax,    (%rbx,%rcx,8)   # c[2p] += k1 + k2
        jnc    L(done_ret)
1:
        incq   %rcx
        jz     L(done_ret)              # reached the end of c
        incq   (%rbx,%rcx,8)
        jz     1b                       # digit wrapped to zero: keep carrying
L(done_ret):

        # c[p..2p+q+r-1] -= (a0 - a1)*(b0 - b1)
        # d holds the absolute value of that product and bit 0 of x its sign:
        # a negative product is added (.Lsn_finc), a positive one is
        # subtracted (.Lsn_fdec)
	leaq   .Lsn_fdec(%rip), %rax    # rax <- jump address (default: subtract)
	leaq   .Lsn_finc(%rip), %rbp
	bt     $0,      _x_
	cmovc  %rbp,    %rax            # product negative => add instead
        leaq   _d_,     %rbx
        leaq   (_c_,_p_,8), %rsi        # rsi <- &c[p]
        leaq   (_p_,_p_,1), %rcx        # rcx <- 2p
        leaq   (_p_,_q_,1), %rdx        # rdx <- p+q+r
	call   *%rax
        # clean the stack: rbx is expected to point just past the 2p digits
        # of d, where the return address lies - TODO confirm against the
        # callee contract
        movq   %rbx,    %rsp
        ret

        # here lb <= ceil(la/2): a is cut into slices of length lb and every
        # slice is multiplied by b, the partial products being accumulated in c
        ALIGN(4)
L(tranches):
        addq   %rbp,    %rcx            # rcx <- lb (undo the earlier r = lb - p)
	
        # The code below is copied verbatim into toommul, replacing the two
        # calls to sn_fkaramul with calls to sn_ftoommul.
        # Remember to propagate any update to that copy!

        # local variables: offsets from rsp, valid once the five pushes below
        # are done; d is the buffer of lb digits reserved just above them
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _la_
        #undef  _lb_
        #define _d_  40(%rsp)
        #define _la_ 32(%rsp)
        #define _lb_ 24(%rsp)
        #define _a_  16(%rsp)
        #define _b_   8(%rsp)
        #define _c_    (%rsp)

        leaq   (,%rcx,8), %rax
        ALLOCA                          # reserve lb digits on the stack
        pushq  %rdx                     # save la
        pushq  %rcx                     # save lb

        # first multiplication: c <- a[0..t-1]*b, where t = la % lb, or
        # t = lb when la is a multiple of lb
        movq   %rdx,    %rax
        movq   $0,      %rdx            # rdx:rax <- la
        divq   %rcx                     # rdx <- la % lb
        testq  %rdx,    %rdx            # when la is a multiple of lb ...
        jnz    1f
        movq   %rcx,    %rdx            # ... take a full slice: t = lb
1:
        xchgq  %rbx,    %rsi            # swap the operands ...
        xchgq  %rcx,    %rdx            # so that rdx >= rcx (rdx = lb, rcx = t)
        leaq   (%rbx,%rcx,8), %rax
        pushq  %rax                     # save &a[t] (a advanced past the first slice)
        pushq  %rsi                     # save &b
        leaq   (%rdi,%rcx,8), %rax
        pushq  %rax                     # save &c[t] (c advanced by the same amount)
        subq   %rcx,    _la_            # la -= t
        call   .Lsn_fkaramul

        # following multiplications, one slice of lb digits per iteration
        ALIGN(4)
L(loop):
        # c[0..lb-1] is the upper part of the previous product; save it,
        # since the next product overwrites c[0..2lb-1]
        movq   _c_,     %rsi
        leaq   _d_,     %rdi
        movq   _lb_,    %rcx
        cld;   rep movsq                # d <- c[0..lb-1]

        movq   _c_,     %rdi
        movq   _b_,     %rsi
        movq   _a_,     %rbx
        movq   _lb_,    %rdx
        movq   %rdx,    %rcx            # rcx <- lb
        call   .Lsn_fkaramul            # c[0..2lb-1] <- a[0..lb-1]*b

        movq   _c_,     %rsi
        leaq   _d_,     %rbx
        movq   _lb_,    %rcx
        leaq   (,%rcx,2), %rdx          # rdx <- 2*lb
        call   .Lsn_finc                # c <- c + d

        movq   _lb_,    %rax
        leaq   (,%rax,8), %rcx
        addq   %rcx,    _c_             # c+=lb
        addq   %rcx,    _a_             # a+=lb
        subq   %rax,    _la_            # la -= lb
        jne    L(loop)                  # loop until every digit of a is consumed

        # done: drop the five saved words and the lb digits of d (rax = lb);
        # leaq leaves rsp on the return address
        leaq   40(%rsp,%rax,8), %rsp    # clean the stack
        ret

                              # +---------------+
                              # |  C interface  |
                              # +---------------+

#  void xn(karamul)(chiffre *a, long la, chiffre *b, long lb, chiffre *c)
#
#  input:
#  a = natural number of length la
#  b = natural number of length lb
#  c = natural number of length la+lb, not overlapping a or b
#  constraints: 0 < lb <= la
#
#  output:
#  c <- a*b
#
#  C-callable wrapper: moves the C arguments into the registers expected by
#  the internal routine and calls it. The register comments below assume the
#  System V AMD64 argument order (rdi, rsi, rdx, rcx, r8) - TODO confirm
#  against the entry macro, which is defined elsewhere.

#ifdef debug_karamul
ENTER(sn_karamul_buggy)
#else
ENTER(sn_karamul)
#endif

	movq   %rdx,   %rbx             # rbx <- &b
	movq   %rsi,   %rdx             # rdx <- la
	movq   %rdi,   %rsi             # rsi <- &a
	movq   %r8,    %rdi             # rdi <- &c (rcx already holds lb)
#ifdef debug_karamul
        call   .Lsn_fkaramul_buggy      # perform the multiplication
#else
        call   .Lsn_fkaramul
#endif
        RETURN_WITH_SP
#endif /* assembly_sn_karamul */

        # case where the assembly version is disabled or being debugged:
        # sn_fkaramul forwards to the C version
        #
        # The internal register convention (rsi = &a, rdx = la, rbx = &b,
        # rcx = lb, rdi = &c) is rearranged into the C argument order
        # a, la, b, lb, c and control is transferred with a tail jump, so the
        # C function returns directly to the caller of this label.
        # The target registers assume the System V AMD64 convention - TODO
        # confirm for other platforms.

#if !defined(assembly_sn_karamul) || defined(debug_karamul)
        ALIGN(32)
.Lsn_fkaramul:

	movq   %rdi,   %r8              # 5th argument <- &c
	movq   %rsi,   %rdi             # 1st argument <- &a
	movq   %rdx,   %rsi             # 2nd argument <- la
	movq   %rbx,   %rdx             # 3rd argument <- &b (rcx already holds lb)
        jmp    SUBR(sn_karamul)
        
#endif /* !defined(assembly_sn_karamul) || defined(debug_karamul) */


                                 # +----------+
                                 # |  Square  |
                                 # +----------+

# Internal Karatsuba squaring, register calling convention.
#
# input:
#   a = natural number of length la     rsi = &a, rdx = la
#   c = natural number of length 2*la   rdi = &c
# constraints: 0 < la
#
# output:
#   c <- a^2
#
# clobbered registers: all except r12,r13,r14,r15 (as stated by the author)
# NOTE(review): this routine finishes in .Lsn_kara_aux, whose code writes
# r12, so the claim about r12 looks stale - TODO confirm.
# NOTE(review): .Lsn_kara_aux is defined inside the karamul assembly section,
# so this routine presumably requires assembly_sn_karamul as well.

#ifdef assembly_sn_karasqr
        ALIGN(32)
#ifdef debug_karamul
.Lsn_fkarasqr_buggy:
#else
.Lsn_fkarasqr:
#endif

#undef L
#define L(x) .Lsn_fkarasqr_##x

        # small square => quadratic algorithm (tail jump, the argument
        # registers are passed through unchanged)
        cmpq   $karasqr_lim, %rdx
        jbe   .Lsn_fsqr_n2

        # local variables: same frame layout as the multiplication, so that
        # the recombination code can be shared (offsets from rsp, valid once
        # the seven pushes below are done)
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #undef  _q_
        #undef  _r_
        #undef  _x_
        #define _d_ 56(%rsp)
        #define _x_ 48(%rsp)
        #define _p_ 40(%rsp)
        #define _q_ 32(%rsp)
        #define _r_ 24(%rsp)
        #define _a_ 16(%rsp)
        #define _b_  8(%rsp)
        #define _c_   (%rsp)

        # split a = a0 + a1*BASE^p, a0 of p digits and a1 of q digits
        movq   %rdx,    %rbp
        shrq   $1,      %rbp
        adcq   $0,      %rbp            # rbp <- p = ceil(la/2)
        subq   %rbp,    %rdx            # rdx <- q = la - p
        movq   %rbp,    %rax
        shlq   $4,      %rax            # 16*p bytes: room for 2p digits on the stack
	ALLOCA
        pushq  $0                       # x <- 0 (a square is never negative, x stays 0)
        pushq  %rbp                     # save p
        pushq  %rdx                     # save q
        pushq  %rdx                     # save r (= q)
        pushq  %rsi                     # save &a
        pushq  %rsi                     # save &b (= &a)
        pushq  %rdi                     # save &c

        # compute |a0 - a1| into c (the sign returned by the call is ignored)
        movq   %rdx,    %rcx            # rcx <- q
        movq   %rbp,    %rdx            # rdx <- p
        leaq   (%rsi,%rdx,8), %rbx      # rbx <- &a1
        call   .Lsn_fasub               # c[0..] <- |a0 - a1| (p digits are read back below)

        # d <- (a0-a1)^2
        movq   _c_,     %rsi
        movq   _p_,     %rdx
        leaq   _d_,     %rdi
        call   .Lsn_fkarasqr

        # c <- a1^2 : a0^2
        # low half: c[0..2p-1] <- a0^2
        movq   _a_,     %rsi
        movq   _c_,     %rdi
        movq   _p_,     %rdx
        call   .Lsn_fkarasqr

        # high half: c[2p..2p+2q-1] <- a1^2
        movq    _c_,    %rdi
        movq   _a_,     %rsi
        movq   _q_,     %rdx
        movq   _p_,     %rax
        leaq   (%rsi,%rax,8), %rsi      # rsi <- &a1
        leaq   (%rdi,%rax,8), %rdi      # rdi <- &c1
        leaq   (%rdi,%rax,8), %rdi      # rdi <- &c[2p]
        call   .Lsn_fkarasqr

        jmp    .Lsn_kara_aux            # continue in karamul (recombination and return)
        
                              # +---------------+
                              # |  C interface  |
                              # +---------------+

#  void xn(karasqr)(chiffre *a, long la, chiffre *b)
#
#  input:
#  a = natural number of length la
#  b = natural number of length 2*la, not overlapping a
#  constraints: 0 < la
#
#  output:
#  b <- a^2
#
#  C-callable wrapper: moves the C arguments into the registers expected by
#  the internal routine and calls it. The register comments below assume the
#  System V AMD64 argument order (rdi, rsi, rdx) - TODO confirm against the
#  entry macro, which is defined elsewhere.

#ifdef debug_karamul
ENTER(sn_karasqr_buggy)
#else
ENTER(sn_karasqr)
#endif

	movq   %rdx,   %rax             # rax <- &b (destination), kept aside
	movq   %rsi,   %rdx             # rdx <- la
	movq   %rdi,   %rsi             # rsi <- &a
	movq   %rax,   %rdi             # rdi <- &b, the result address
#ifdef debug_karamul
        call   .Lsn_fkarasqr_buggy      # compute the square
#else
        call   .Lsn_fkarasqr
#endif
        RETURN_WITH_SP
#endif /* assembly_sn_karasqr */

        # case where the assembly version is disabled or being debugged:
        # sn_fkarasqr forwards to the C version
        #
        # The internal register convention (rsi = &a, rdx = la, rdi = &c) is
        # rearranged into the C argument order a, la, result and control is
        # transferred with a tail jump, so the C function returns directly to
        # the caller of this label.
        # The target registers assume the System V AMD64 convention - TODO
        # confirm for other platforms.
        
#if !defined(assembly_sn_karasqr) || defined(debug_karamul)
        ALIGN(32)
.Lsn_fkarasqr:

	movq   %rdi,   %rax             # rax <- &c, kept aside
	movq   %rsi,   %rdi             # 1st argument <- &a
	movq   %rdx,   %rsi             # 2nd argument <- la
	movq   %rax,   %rdx             # 3rd argument <- &c
        jmp    SUBR(sn_karasqr)
        
#endif /* !defined(assembly_sn_karasqr) || defined(debug_karamul) */


