// file kernel/n/x86/mmod.S: operations on residues modulo BASE^n + 1
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                     Arithmetic modulo BASE^n + 1                      |
 |                                                                       |
 +-----------------------------------------------------------------------*/

        # +---------------------------------------------------------+
        # |  Reduction modulo BASE^p + 1 and BASE^(2p) - BASE^p + 1 |
        # +---------------------------------------------------------+

# .Lsn_fmred: internal routine, arguments are passed in registers.
#
# input:
#  a = natural of length 3p           esi = &a, ecx = p
#  b = natural of length 3p+1         edi = &b
#  c = natural of length p            ebx = &c
#
# constraints:
#  p > 0, c disjoint from a and b. &a == &b is allowed.
#
# output:
#  b[0..2p-1] <- a mod BASE^(2p) - BASE^p + 1
#  b[2p..3p]  <- a mod BASE^p + 1
#  c <- undefined (x86 mode)
#  c unused (sse2 mode)
#
# modified registers:
#  eax,ebx,ecx,edx,esi,edi,ebp <- undefined
#
# notation used below: a0, a1, a2 are the three p-digit slices of a
# (least significant first) and b0, b1, b2 the matching slices of b.

#if defined(assembly_sn_mmul) || defined(assembly_sn_msqr)
#undef L
#define L(x) .Lsn_fmred_##x
        ALIGN_32
.Lsn_fmred:

#ifdef use_sse2

        leal  (%ecx,%ecx,2), %eax       # eax <- 3p
        leal  (%esi,%eax,4), %esi       # esi <- &a[3p]
        leal  (%edi,%eax,4), %edi       # edi <- &b[3p]
        negl   %eax                     # eax <- -3p  (index of slice 0)
        leal  (%eax,%ecx,1), %ebx       # ebx <- -2p  (index of slice 1)
        negl   %ecx                     # ecx <- -p
        movl   %ecx,    %edx            # edx <- -p   (index of slice 2)
        pxor   %mm3,    %mm3            # clear the three running carries
        pxor   %mm4,    %mm4
        pxor   %mm5,    %mm5

        # b0 <- a0 - a2, b1 <- a1 + a2, b2 <- a0 - a1 + a2
        # one digit of each slice per iteration, 64-bit accumulators
        ALIGN_4
1:
        movd  (%esi,%eax,4), %mm0       # mm0 <- a0[i]
        movd  (%esi,%ebx,4), %mm1       # mm1 <- a1[i]
        movd  (%esi,%edx,4), %mm2       # mm2 <- a2[i]
        paddq  %mm0,    %mm3            # mm3 <- carry(b0) + a0[i]
        paddq  %mm1,    %mm4            # mm4 <- carry(b1) + a1[i]
        paddq  %mm2,    %mm5            # mm5 <- carry(b2) + a2[i]
        psubq  %mm2,    %mm3            # mm3 <- carry(b0) + a0[i] - a2[i]
        paddq  %mm2,    %mm4            # mm4 <- carry(b1) + a1[i] + a2[i]
        paddq  %mm0,    %mm5            # mm5 <- carry(b2) + a0[i] + a2[i]
        psubq  %mm1,    %mm5            # mm5 <- carry(b2) + a0[i] - a1[i] + a2[i]
        movd   %mm3, (%edi,%eax,4)      # store b0[i]
        movd   %mm4, (%edi,%ebx,4)      # store b1[i]
        movd   %mm5, (%edi,%edx,4)      # store b2[i]
        incl   %eax
        incl   %ebx
        incl   %edx
        pshufw $0xfe, %mm3, %mm3        # new carry on b0 (high dword moved down)
        pshufw $0xfe, %mm4, %mm4        # new carry on b1
        pshufw $0xfe, %mm5, %mm5        # new carry on b2
        jne    1b                       # flags still come from incl edx

        # propagate the borrow leaving b0 into b1
        leal   (%edi,%ecx,4), %ebx      # ebx <- &b2
        movd   %mm3,    %eax
        sarl   $1,      %eax            # CF <- borrow (low bit of the carry word)
        jnb    2f
        movl   %ecx,    %edx
1:
        sbbl   $0, (%ebx,%edx,4)        # b1--
        jnb    2f
        incl   %edx
        jne    1b
        jmp    L(noret)                 # when the borrow crosses all of b1 then
        ALIGN_4                        # a1+a2=BASE^p and nothing is left to do
2:

        # recycle the carry leaving b1
        movd   %mm4,    %eax
        testl  %eax,    %eax
        jz     L(noret)
        leal   (%ebx,%ecx,4), %eax      # eax <- &b1

        # here carry = 1 -> add BASE^p - 1
        # note: b1 <= BASE^p - 2 at this point, so no new carry can
        # come out
        movl   %ecx,    %edx            # edx <- -p
1:
        subl   $1, (%eax,%edx,4)        # b0--
        jnb    2f
        incl   %edx
        jne    1b
        jmp    L(noret)                 # borrow crossed b0: it cancels the b1++
        ALIGN_4
2:
        movl   %ecx,    %edx            # edx <- -p
3:
        incl   (%ebx,%edx,4)            # b1++
        jne    L(noret)
        incl   %edx
        jmp    3b

        # recycle the carry leaving b2
        ALIGN_4
L(noret):
        movd   %mm5,    %eax
        incl   %eax                     # eax <- carry + 1 (0,1 or 2)
        jne    2f
        movl   %ecx,    %edx
1:
        incl   (%edi,%edx,4)            # carry was -1: b2++
        jne    2f
        incl   %edx
        jne    1b
        movl   $2,      %eax            # b2 wrapped to 0: final carry is 1
2:      
        shrl   $1,      %eax            # eax <- final carry (0 or 1)
        movl   %eax,   (%edi)           # store it in b2[p]

        emms    
        ret
        
#else /* use_sse2 */

        # local variables (offsets are valid after the five pushes below)
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _p_
        #undef  _r_
        #define _a_   16(%esp)
        #define _b_   12(%esp)
        #define _c_    8(%esp)
        #define _p_    4(%esp)
        #define _r_     (%esp)
        
        pushl  %esi                     # save &a
        pushl  %edi                     # save &b
        pushl  %ebx                     # save &c
        pushl  %ecx                     # save p
        pushl  $0                       # saved carries <- 0

        # c <- a0 - a1
        # (.Lsn_fsub_1 and .Lsn_fadd_1 are defined elsewhere; presumably they
        # return the carry in CF and leave esi, ebx, edi advanced by p digits
        # -- TODO confirm against their definition)
        movl   %ebx,    %edi            # edi <- &c
        leal   (%esi,%ecx,4), %ebx      # ebx <- &a1
        call   .Lsn_fsub_1
        rcll   $1,     _r_              # save the borrow
        
        # b1 <- a1 + a2
        movl   _p_,     %ecx
        movl   _b_,     %edi
        leal   (%edi,%ecx,4), %edi      # edi <- &b1
        call   .Lsn_fadd_1
        rcll   $1,     _r_              # save the carry (borrow of c moves to bit 1)

        # b0 <- a0 - a2
        movl   %esi,    %ebx            # ebx <- &a2
        movl   _a_,     %esi
        movl   _b_,     %edi
        movl   _p_,     %ecx
        call   .Lsn_fsub_1
        
        # propagate the borrow leaving b0 into b1
        movl   _p_,     %ecx
        leal   (%edi,%ecx,4), %ebx      # ebx <- &b2
        not    %ecx; incl %ecx          # ecx <- -p (not/incl keep CF, negl would not)
        jnb    2f
        movl   %ecx,     %edx
1:
        sbbl   $0, (%ebx,%edx,4)        # b1--
        jnb    2f
        incl   %edx
        jne    1b
        jmp    L(noret)                 # when the borrow crosses all of b1 then
        ALIGN_4                        # a1+a2=BASE^p and nothing is left to do
2:

        # recycle the carry leaving b1
        bt     $0,      _r_             # CF <- carry saved after b1 <- a1 + a2
        jnc    L(noret)

        # here carry = 1 -> add BASE^p - 1
        # note: b1 <= BASE^p - 2 at this point, so no new carry can
        # come out
        movl   %ecx,    %edx            # edx <- -p
1:
        subl   $1, (%edi,%edx,4)        # b0--
        jnb    2f
        incl   %edx
        jne    1b
        jmp    L(noret)                 # borrow crossed b0: it cancels the b1++
        ALIGN_4
2:
        movl   %ecx,    %edx            # edx <- -p
3:
        incl   (%ebx,%edx,4)            # b1++
        jne    L(noret)
        incl   %edx
        jmp    3b
        ALIGN_4
L(noret):

        # b2 <- a2 + (a0 - a1)
        negl   %ecx                     # ecx <- p
        movl   %ebx,    %edi            # edi <- &b2
        movl   _c_,     %ebx
        leal   (%esi,%ecx,4), %esi      # esi <- &a2
        call   .Lsn_fadd_1
        
        # recycle the carry leaving b2
        adcl   %ecx,    %ecx            # ecx <- 2*ecx + CF (assumes ecx = 0 on return -- TODO confirm)
        bt     $1,      _r_             # CF <- borrow saved after c <- a0 - a1
        sbbl   $0,      %ecx            # ecx <- carry (-1,0,1)
        jnb    2f
        movl   _p_,     %edx
        negl   %edx
        incl   %ecx                     # ecx <- 0
1:
        incl   (%edi,%edx,4)            # carry was -1: b2++
        jne    2f
        incl   %edx
        jne    1b
        incl   %ecx                     # b2 wrapped to 0: final carry is 1
2:      
        movl   %ecx,   (%edi)           # b2[p] <- final carry
        
        leal  20(%esp), %esp            # clean up the stack
        ret

#endif /* use_sse2 */
        
#endif /* defined(assembly_sn_mmul) || defined(assembly_sn_msqr) */
        

                   # +------------------------------------+
                   # |  Multiplication modulo BASE^n + 1  |
                   # +------------------------------------+

#  void xn(mmul)(chiffre *a, chiffre *b, long n)
#
#  input:
#  a = natural of length n+1
#  b = natural of length n+1, distinct from a
#
#  constraint: n > 0
#
#  output:
#  a <- (a*b) mod (BASE^n + 1), the most significant digit is 0 or 1
#  b <- b mod (BASE^n + 1)
#
#  arg1, arg2, arg3, SUBR, ENTER and RETURN_WITH_SP are macros defined
#  elsewhere; presumably arg1..arg3 address the three C arguments and
#  RETURN_WITH_SP undoes the prologue below -- TODO confirm.
#
#  NOTE(review): when a reduces to BASE^n the code jumps straight to the
#  negation path and b is left exactly as passed in (not reduced).

#ifdef assembly_sn_mmul
#undef L
#define L(x) .Lsn_mmul_##x

#ifdef debug_mmul
ENTER(sn_mmul_buggy)
#else
        ALIGN_32
        .globl SUBR(sn_mmul)
#if __ELF__
        .type  SUBR(sn_mmul),@function
#endif
SUBR(sn_mmul):
.Lsn_mmul:
        pushl  %ebp
        movl   %esp,%ebp
        pushl  %edi
        pushl  %esi
        pushl  %ebx
#endif
        
        movl   arg1,    %esi            # esi <- &a
        movl   arg2,    %ebx            # ebx <- &b
        movl   arg3,    %edx            # edx <- n

        # normalise a: a[0..n-1] <- a[0..n-1] - a[n], since BASE^n = -1.
        # The borrow is walked over a[1..n-1] only (n-1 digits), so a[n]
        # itself is never decremented and n = 1, n = 2 are handled like
        # any other length.
        movl   (%esi,%edx,4), %eax      # eax <- a[n]
        subl   %eax,    (%esi)          # a[0] -= a[n]
        jnb    L(a_ok)                  # no borrow -> done
        leal   -1(%edx), %ecx           # ecx <- n-1 = digits left above a[0]
        leal   4(%esi), %edi            # edi <- &a[1]
        jecxz  L(a_wrap)                # n = 1: the borrow leaves at once
1:
        subl   $1,      (%edi)          # propagate the borrow
        jnb    L(a_ok)
        leal   4(%edi), %edi
        loop   1b
L(a_wrap):
        movl   %edx,    %ecx            # the borrow went out: add BASE^n + 1,
        movl   %esi,    %edi            # i.e. add 1 to the wrapped low digits
2:
        incl   (%edi)
        jnz    L(a_ok)
        leal   4(%edi), %edi
        loop   2b
        
        # here a = BASE^n = -1, so the product is -b mod BASE^n + 1
        movl   (%ebx,%edx,4), %eax      # eax <- b[n]
        jmp    L(neg_b)                 # a <- -b mod BASE^n + 1

        # normalise b the same way; b[n] is cleared first
        ALIGN_4
L(a_ok):        
        movl   (%ebx,%edx,4), %eax      # eax <- b[n]
        movl   $0,      (%ebx,%edx,4)   # b[n] <- 0
        subl   %eax,    (%ebx)          # b[0] -= b[n]
        jnb    L(b_ok)                  # no borrow -> done
        leal   -1(%edx), %ecx           # ecx <- n-1 = digits left above b[0]
        leal   4(%ebx), %edi            # edi <- &b[1]
        jecxz  L(b_wrap)                # n = 1: the borrow leaves at once
1:
        subl   $1,      (%edi)          # propagate the borrow
        jnb    L(b_ok)
        leal   4(%edi), %edi
        loop   1b
L(b_wrap):
        movl   %edx,    %ecx            # the borrow went out: add BASE^n + 1
        movl   %ebx,    %edi
2:
        incl   (%edi)
        jnz    L(b_ok)
        leal   4(%edi), %edi
        loop   2b
        movl   $1,      (%edi)          # b[n] <- 1 (edi = &b[n] after n steps)
        movl   %esi,    %ebx            # ebx <- &a

        # here b = BASE^n = -1, so the product is -a mod BASE^n + 1
        movl   %ecx, %eax               # eax <- a[n] (= 0, ecx is 0 after loop)

        # common negation: a <- eax - x[0..n-1] where x is the operand at ebx
L(neg_b):
        leal   (%esi,%edx,4), %edi      # edi <- &a[n]
        leal   (%ebx,%edx,4), %ebx      # ebx <- &x[n]
        movl   %edx,    %ecx
        negl   %ecx                     # ecx <- -n
        xorl   %edx,    %edx            # edx <- 0, CF <- 0
        ALIGN_4
1:
        sbbl   (%ebx,%ecx,4),  %eax     # a <- x[n] - x[0..n-1]
        movl   %eax,    (%edi,%ecx,4)
        movl   %edx,    %eax            # next minuend digit is 0
        incl   %ecx                     # incl keeps CF for the next sbbl
        jne    1b
        movl   %eax,    (%edi)          # a[n] <- 0
        jnb    3f
2:
        incl   (%esi)                   # on borrow, add BASE^n + 1
        leal   4(%esi), %esi
        jz     2b
3:
        RETURN_WITH_SP

        # here a and b are normalised and fit in n digits
        # is n divisible by 3 and large enough?
        ALIGN_4
L(b_ok):
        cmpl   $mmul_lim, %edx
        jbe    L(simple_mul)            # n <= mmul_lim: plain product
        movl   %edx,    %eax            # eax <- n
        xorl   %edx,    %edx            # edx:eax <- n
        movl   $3,      %ecx
        divl   %ecx                     # eax <- n/3, edx <- n%3
        testl  %edx,    %edx
        jz     L(trois)                 # n = 3p: go to the split algorithm, eax = p
        movl   arg3,    %edx            # edx <- n

        # case n small or not divisible by 3: multiplication in N
L(simple_mul):
        movl   %edx,    %ecx            # ecx <- n
        leal   (,%edx,8), %eax
	ALLOCA                          # reserve 2n digits on the stack
        movl   %esp,    %edi            # edi <- &c
        pushl  %ecx                     # save n
        pushl  %esi                     # save &a
        call   .Lsn_ftoommul            # c <- a*b
        
        # entry point shared with msqr
        # expected stack on arrival: &a, n, then the 2n digits of c
.Lsn_mmul_aux_simple:
        
        popl   %edi                     # edi <- &a
        movl   (%esp),  %ecx            # ecx <- n
        movl   %ecx,    %edx            # edx <- n
        leal   4(%esp), %esi            # esi <- &c
        leal   (%esi,%ecx,4), %ebx      # ebx <- &c[n]
        call   .Lsn_fsub_1              # a[0..n-1] <- c[0..n-1] - c[n..2n-1]
        popl   %edx                     # edx <- n
        leal   (%esp,%edx,8), %esp      # clean up the stack (flags untouched)
        movl   %ecx,    (%edi)          # a[n] <- 0 (assumes ecx = 0, edi = &a[n] on return -- TODO confirm)
        jnb    2f                       # no borrow: finished
        not    %edx                     # edx <- -n-1
1:
        incl   %edx
        incl   (%edi,%edx,4)            # otherwise add BASE^n + 1
        jz     1b
2:
        RETURN_WITH_SP
        
        # case n divisible by 3: multiply modulo BASE^p + 1
        # and modulo BASE^(2p) - BASE^p + 1
        # entered with eax = p = n/3, esi = &a, ebx = &b
        
L(trois):

        # local variables (offsets are valid after the three pushes below)
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _p_
        #define _c_   12(%esp)
        #define _a_    8(%esp)
        #define _b_    4(%esp)
        #define _p_     (%esp)

	movl   %eax,      %ecx
        leal   (,%eax,8), %eax
        leal   (%eax,%eax,2), %eax      # eax <- 24p
	ALLOCA                          # reserve 6p digits on the stack
        pushl  %esi                     # save &a
        pushl  %ebx                     # save &b
        pushl  %ecx                     # save p

        # decompose a and b modulo BASE^(2p) - BASE^p + 1 and BASE^p + 1
        # a is reduced in place; b is reduced into c[0..3p]
        movl   %esi,   %edi
#ifndef use_sse2
        leal   _c_,    %ebx             # scratch for fmred: c[0..p-1]
#endif
        call   .Lsn_fmred
        movl   _b_,    %esi
        movl   _p_,    %ecx
        leal   _c_,    %edi
#ifndef use_sse2
        leal   (%edi,%ecx,8), %ebx      # scratch for fmred: c[2p..3p-1]
#endif
        call   .Lsn_fmred

        # a[2p..3p] <- (a*b) mod BASE^p + 1
        # recursive call with the C argument convention (a, b, n)
        movl   _a_,     %esi
        leal   _c_,     %edi
        movl   _p_,     %ecx
        leal   (%esi,%ecx,8), %esi      # esi <- &a[2p]
        leal   (%edi,%ecx,8), %edi      # edi <- &c[2p]
        pushl  %ecx
        pushl  %edi
        pushl  %esi
        call   .Lsn_mmul
        leal   12(%esp), %esp           # drop the three arguments
        
        # c[2p..6p-1] <- (a*b) mod (BASE^(2p) - BASE^p + 1), not reduced
        # (.Lsn_ftoommul is defined elsewhere; the register roles below are
        # the same as at the other call site -- TODO confirm its contract)
        movl   _a_,     %esi
        leal   _c_,     %ebx
        movl   _p_,     %ecx
        leal   (,%ecx,2), %ecx          # ecx <- 2p
        movl   %ecx,    %edx            # edx <- 2p
        leal   (%ebx,%ecx,4), %edi      # edi <- &c[2p]
        call   .Lsn_ftoommul
        # entry point shared with msqr
        # expects the stack frame built at the start of the n = 3p case:
        # p, one free slot, &a, then c; the 4p-digit product sits in
        # c[2p..6p-1], split below into slices c0, c1, c2, c3
.Lsn_mmul_aux_trois:

#ifdef use_sse2

        # a[0..2p-1] <- (a*b) mod (BASE^(2p) - BASE^p + 1)
        # a[2p..3p]  <- a[0..2p-1] - (a*b) mod (BASE^p + 1)
        movl   _p_,     %ecx
        movl   _a_,     %edi
        leal   _c_,     %esi
        leal   (%esi,%ecx,8), %esi      # esi <- &c[2p]
        movl   %ecx,    %ebx            # ebx <- p
        leal   (%ecx,%ecx,2),%edx       # edx <- 3p
        pxor   %mm0,    %mm0            # clear the three running carries
        pxor   %mm1,    %mm1
        pxor   %mm2,    %mm2
        

        ALIGN_4
1:
        movd   (%esi),  %mm3            # mm3 <- c0[i]
        movd   (%esi,%ebx,4),%mm4       # mm4 <- c1[i]
        movd   (%esi,%ebx,8),%mm5       # mm5 <- c2[i]
        movd   (%esi,%edx,4),%mm6       # mm6 <- c3[i]
        movd   (%edi,%ebx,8),%mm7       # mm7 <- a2[i]

        psubq  %mm5,    %mm3            # mm3 <- c0[i] - c2[i]
        paddq  %mm5,    %mm4            # mm4 <- c1[i] + c2[i]
        psubq  %mm6,    %mm3            # mm3 <- c0[i] - c2[i] - c3[i]
        paddq  %mm3,    %mm2            # mm2 <- carry(a2) + a0[i]
        paddq  %mm3,    %mm0            # mm0 <- carry(a0) + c0[i] - c2[i] - c3[i]
        psubq  %mm4,    %mm2            # mm2 <- carry(a2) + a0[i] - a1[i]
        paddq  %mm4,    %mm1            # mm1 <- carry(a1) + c1[i] + c2[i]
        psubq  %mm7,    %mm2            # mm2 <- carry(a2) + a0[i] - a1[i] - a2[i]
        
        movd   %mm0,    (%edi)          # store a0[i]
        movd   %mm1,    (%edi,%ebx,4)   # store a1[i]
        movd   %mm2,    (%edi,%ebx,8)   # store a2[i]
        pshufw $0xfe, %mm0, %mm0        # new carry for a0 (high dword moved down)
        pshufw $0xfe, %mm1, %mm1        # new carry for a1
        pshufw $0xfe, %mm2, %mm2        # new carry for a2
        leal   4(%esi), %esi
        leal   4(%edi), %edi
                loop   1b

        # propagate the carry of a0 (0,-1,-2) into a1
        # from here on edi = &a1
        leal   (%edi,%ebx,4),%edx       # edx <- &a2
        leal   (%edx,%ebx,4),%esi       # esi <- &a2[p]
        negl   %ebx                     # ebx <- -p
        movd   %mm0,    %eax
        negl   %eax                     # eax <- |carry|
        jz     2f
        subl   %eax,    (%edx,%ebx,4)   # subtract it from the first digit of a1
        jnb    2f
        leal   1(%ebx), %ecx
1:
        subl   $1,      (%edx,%ecx,4)   # propagate the borrow through the next digits
        jnb    2f
        incl   %ecx
        jne    1b
2:      

        # recycle the carry going into a2 (-1,0,1)
        # CF is still set here when the borrow above crossed all of a1
        movd   %mm1,    %eax
        sbbl   $0,      %eax            # eax <- carry
        jz     L(noret)
        jb     L(sub)

        # here carry = 1, add BASE^p - 1
        # note: a1 <= BASE^p - 2 so no new carry can come out
        subl   $3,     (%esi)           # carry(a2) -= 3
        movl   %ebx,    %ecx            # ecx <- -p
1:
        subl   $1, (%edi,%ecx,4)        # a0--
        jnb    2f
        incl   %ecx
        jne    1b
        jmp    L(noret)
        ALIGN_4
2:
        movl   %ebx,    %ecx            # ecx <- -p
3:
        incl   (%edx,%ecx,4)            # a1++
        jne    L(noret)
        incl   %ecx
        jmp    3b
        ALIGN_4

        # here carry = -1, subtract BASE^p - 1
        # note: a1 >= BASE^p - 2 so no new borrow can come out
        # NOTE(review): the a1-- loop below tests ZF (jne), not CF; this
        # matches a borrow test only as long as the bound above holds.
L(sub):
        addl   $3,     (%esi)           # carry(a2) += 3
        movl   %ebx,    %ecx            # ecx <- -p
1:
        incl   (%edi,%ecx,4)            # a0++
        jne    2f
        incl   %ecx
        jne    1b
        jmp    L(noret)
        ALIGN_4
2:
        movl   %ebx,    %ecx            # ecx <- -p
3:
        subl   $1, (%edx,%ecx,4)        # a1--
        jne    L(noret)
        incl   %ecx
        jmp    3b
        ALIGN_4
L(noret):

        # recycle the carry leaving a2 (between -8 and +3)
        movd    %mm2,   %eax
        subl   (%esi),  %eax            # eax <- carry
        movl    $0,    (%esi)           # a2[p] <- 0 (flags of subl are kept)
        js      2f
        subl    %eax,  (%edx)           # carry >= 0: a2 -= carry
        jnb     4f
        leal  1(%ebx),  %ecx
        movl    $-1,    %eax
1:
        addl    %eax,  (%esi,%ecx,4)    # next digit -= 1, CF set when no borrow
        jc      4f
        incl    %ecx
        jne     1b
                                        # borrow left a2: fall through with eax = -1
2:              
        subl    %eax,   (%edx)          # carry < 0: a2 += |carry|
        jc      4f                      # CF set here means no carry out of the digit
3:
        leal    4(%edx), %edx
        incl    (%edx)
        jz      3b
4:

# division by -3 modulo BASE^p + 1
# --------------------------------
#
# Write a for a2 and B for BASE. We have 0 <= a <= B^p and look for q
# such that a = -3q mod (B^p+1) with 0 <= q <= 2B^p - 1 (i.e. the most
# significant digit of q is 0 or 1). q exists because -3 is coprime to B.
#
# Let m = (B-1)/3 (an integer). The algorithm computes m*a and divides it
# by 1-B (division by increasing powers of B). This yields b between 0
# and B^p-1 and a signed integer x such that:
#
#     m*a = (1-B)*b + B^p*x
#
# The first two terms are divisible by m, hence so is the third one.
# Let x = m*y, which gives a = -3b + B^p*y. Since 0 <= a <= B^p and
# 0 <= b < B^p, we get 0 <= y <= 3.
#
#   y = 0: (a=0 in this case) then q = b (= 0).
#   y = 1: a = -3b +  B^p = -3(b + 1 +  m*(1+B+..+B^(p-1))) mod (B^p+1)
#   y = 2: a = -3b + 2B^p = -3(b + 2 + 2m*(1+B+..+B^(p-1))) mod (B^p+1)
#   y = 3: a = -3b + 3B^p = -3(b+1) mod (B^p+1).
#
# Computing b and y: write a = sum(a.i*B^i), b = sum(b.i*B^i) and
# compute the numbers b.i,c.i,d.i step by step with:
#
#     d.0 = 0
#     m*(a.i + d.i) = b.i + B*c.i    (Euclidean division)
#     d.(i+1) = (b.i + c.i)/m        (integer)
#
# Hence m*(a+d) = b + B*c + m*B^p*(a.p+d.p) and m*d = B*(b+c),
# so m*a = (1-B)*b + B^p*m*(a.p+d.p) as claimed, and y = a.p+d.p.
#
# Getting b.i and c.i from a.i and d.i is immediate. As for d.i, one
# shows by induction that d.i <= 3, and since m = 1 mod 4 it follows
# that d.i = (b.i + c.i) mod 4.

        movl   %ebx,    %ecx            # ecx <- -p
        movl   $0x55555555, %ebp        # ebp <- m
        movd   %ebp,    %mm7            # mm7 <- m
        movl   $3,      %eax
        movd   %eax,    %mm6            # mm6 <- mask for the reduction mod 4
        pxor   %mm0,    %mm0            # mm0 <- 0 (= d.0)

        ALIGN_4
1:
        movd   (%esi,%ecx,4), %mm1
        paddq   %mm0,   %mm1            # mm1 <- a.i+d.i
        pshufw  $0xfe,  %mm1, %mm2      # mm2 <- carry
        pmuludq %mm7,   %mm1            # mm1 <- m*low(a.i+d.i)
        movd    %mm1,  (%esi,%ecx,4)    # store b.i
        pshufw  $0xfe,  %mm1, %mm0      # low(mm0) <- c.i
        paddq   %mm1,   %mm0            # low(mm0) <- b.i+c.i
        pand    %mm6,   %mm0            # keep it mod 4
        paddq   %mm2,   %mm0            # mm0 <- d.(i+1) + carry of a.i+d.i
        incl    %ecx
        jne     1b

        movd   (%esi),  %mm1
        paddq   %mm0,   %mm1            # mm1 <- y
        movd    %mm1,   %eax
        testl   %eax,   %eax
        jz      4f                      # y = 0: no correction
        movl    %ebx,   %ecx            # ecx <- -p
        cmpl    $3,     %eax
        je      2f                      # y = 3: 1 must be added

        pmuludq %mm1,   %mm7            # y = 1 or 2: add y + m*y*(1+..+B^(p-1))
        ALIGN_4
1:      
        movd   (%esi,%ecx,4), %mm0
        paddq   %mm0,   %mm1            # mm1 starts as y, then holds the carry
        paddq   %mm7,   %mm1
        movd    %mm1,  (%esi,%ecx,4)
        pshufw  $0xfe, %mm1, %mm1
        incl    %ecx
        jne     1b
        movd    %mm1,  (%esi)           # a2[p] <- final carry
        jmp     4f
        ALIGN_4
2:
        movl   $0,     (%esi)           # a2[p] <- 0
3:      
        incl   (%esi,%ecx,4)            # y = 3: add 1
        leal  1(%ecx), %ecx             # leal keeps ZF from the incl
        jz     3b
4:      

        # a <- a - (BASE^p - 1)*a[2p..3p]
        leal   (%esi,%ebx,4), %edx      # edx <- &a2
        movl   %ebx, %ecx               # ecx <- -p
        pxor   %mm0,    %mm0            # clear the two running carries
        pxor   %mm1,    %mm1
        ALIGN_4
1:
        movd   (%edi,%ecx,4), %mm2      # mm2 <- a0[i]
        movd   (%edx,%ecx,4), %mm3      # mm3 <- a1[i]
        movd   (%esi,%ecx,4), %mm4      # mm4 <- a2[i]
        paddq  %mm2,    %mm0            # mm0 <- carry(a0) + a0[i]
        paddq  %mm3,    %mm1            # mm1 <- carry(a1) + a1[i]
        paddq  %mm4,    %mm0            # mm0 <- carry(a0) + a0[i] + a2[i]
        psubq  %mm4,    %mm1            # mm1 <- carry(a1) + a1[i] - a2[i]
        movd   %mm0,   (%edi,%ecx,4)    # store a0[i]
        movd   %mm1,   (%edx,%ecx,4)    # store a1[i]
        pshufw $0xfe,   %mm0, %mm0      # new carry on a0
        pshufw $0xfe,   %mm1, %mm1      # new carry on a1
        incl   %ecx
        jne    1b

        # propagate the negative carry into a2
        movl  (%esi),   %ebp            # ebp <- a2[p]
        movd   %mm1,    %eax
        subl   %ebp,    %eax            # eax <- carry(a1) - a2[p]
        jz     2f
        addl   %eax,   (%edx)
        jc     2f                       # CF set: the digit did not underflow
1:
        leal   4(%edx), %edx
        subl   $1,     (%edx)
        jb     1b
2:
        
        # propagate the positive carry into a1
        movd   %mm0,    %eax
        addl   %ebp,    %eax            # eax <- carry(a0) + a2[p]
        jz     2f
        addl   %eax,   (%edi)
        jnc    2f
1:      
        leal   4(%edi), %edi
        incl   (%edi)
        jz     1b
2:
        emms

#else /* use_sse2 */

        # plain x86 variant of the recombination
        # (.Lsn_fsub_1, .Lsn_fadd_1, .Lsn_fdec_1, .Lsn_finc and .Lsn_fdec are
        # defined elsewhere; the code relies on them returning the carry in
        # CF, ecx = 0 and advanced pointers -- TODO confirm their contracts)

        # a[0..2p-1] <- (a*b) mod (BASE^(2p) - BASE^p + 1)
        movl   _a_,     %edi
        leal   _c_,     %esi
        movl   _p_,     %ecx
        leal   (%esi,%ecx,8), %esi      # esi <- &c[2p]
        leal   (%esi,%ecx,8), %ebx      # ebx <- &c2
        call   .Lsn_fsub_1              # a0 <- c0 - c2
        adcl   %ecx,    %ecx            # ecx <- borrow to apply to a1
        movl   %ecx,    _b_             # save it in the slot of &b
        movl   _p_,     %ecx
        leal   (%esi,%ecx,4), %ebx      # ebx <- &c2
        call   .Lsn_fadd_1              # a1 <- c1 + c2
        adcl   %ecx,    %ecx
        movl   %ecx,    %edi            # edi <- carry going into a2
        movl   _a_,     %esi
        movl   _p_,     %ecx
        call   .Lsn_fdec_1              # a0 -= c3
        movl   _b_,     %edx            # fetch the saved borrow on a1
        sbbl   %edx,    (%esi)          # add it to the current borrow
        jnb    2f                       # and propagate it
        movl   _p_,     %ecx
        leal   (%esi,%ecx,4), %ebx
        decl   %ecx
        negl   %ecx                     # ecx <- 1-p
1:
        subl   $1,      (%ebx,%ecx,4)
        jnb    2f
        incl   %ecx
        jne    1b
        ALIGN_4
2:
        sbbl   $0,      %edi            # edi <- carry going into a2
        jz     7f
        movl   _p_,     %ecx
        not    %ecx
        incl   %ecx                     # ecx <- -p (not/incl keep CF for the jb)
        jb     5f
3:                                      # carry > 0: a0--, a1++
        subl   $1,      (%esi,%ecx,4)
        jnb    4f
        incl   %ecx
        jne    3b
        jmp    7f
        ALIGN_4
4:
        incl   (%esi)
        leal   4(%esi), %esi
        jz     4b
        jmp    7f
        ALIGN_4
5:                                      # carry < 0: a0++, a1--
        incl   (%esi,%ecx,4)
        jne    6f
        incl   %ecx
        jne    5b
        jmp    7f
        ALIGN_4
6:
        subl   $1,      (%esi)
        leal   4(%esi), %esi
        jb     6b
7:

        # a[2p..3p] <- (a[0..2p-1] - a[2p..3p]) mod (BASE^p + 1), normalised
        movl   _p_,     %ecx
        movl   _a_,     %esi
        leal   (%esi,%ecx,8), %ebx      # ebx <- &a2
        movl   %ebx,    %edi
        call   .Lsn_fsub_1              # a2 <- a0 - a2
        adcl   %ecx,    (%ebx)          # a2[p] += borrow
        movl   _p_,     %ecx
        movl   %esi,    %ebx            # ebx <- &a1
        leal   (%esi,%ecx,4), %esi      # esi <- &a2
        call   .Lsn_fdec_1              # a2 -= a1
        xchgl  %ecx,    (%esi)          # a2[p] <- 0, ecx <- old a2[p]
        adcl   $0,      %ecx            # ecx <- carry
        movl   _p_,     %edx
        negl   %edx
        addl   %ecx,    (%esi,%edx,4)   # reinject the carry at a2[0]
        jnc    2f
1:
        incl   %edx
        incl   (%esi,%edx,4)
        jz     1b
2:

        # a[2p..3p] <- -a[2p..3p]/3 mod BASE^p
        # see the explanations in the sse2 variant of this code
        movl   _p_,     %ecx
        negl   %ecx
        movl   $0x55555555, %ebp        # ebp <- (BASE-1)/3
        xorl   %eax,    %eax            # init remainder (also clears CF)
        ALIGN_4
1:
        adcl   (%esi,%ecx,4), %eax      # eax += a[i]
        rcll   $1,      %ebx            # save the carry
        mull   %ebp                     # divide by -3
        movl   %eax,    (%esi,%ecx,4)   # a[i] <- quotient
        addl   %edx,    %eax
        andl   $3,      %eax            # eax <- remainder
        bt     $0,      %ebx            # restore the carry
        incl   %ecx
        jne    1b
        adcl   (%esi),  %eax            # add the last digit of a2
        movl   %ecx,    (%esi)          # a2[p] <- 0
        jz     5f                       # nothing left over -> done
        
        movl   _p_,     %ecx
        negl   %ecx
        cmpl   $2,      %eax
        ja     4f                       # ret = 3 -> a2++
        jne    2f
        shll   $1,      %ebp            # ret = 1 or 2 => ebp <- ret*(BASE-1)/3
2:
        addl   %ebp,    %eax            # eax <- ret*(BASE+2)/3
        addl   %eax,    (%esi,%ecx,4)   # a2 += ret/3
        incl   %ecx
        ALIGN_4
3:
        adcl   %ebp,    (%esi,%ecx,4)
        incl   %ecx
        jne    3b
        adcl   %ecx,    (%esi)          # a2[p] += last carry
        jmp    5f
        ALIGN_4
4:
        incl   %ecx
        incl  -4(%esi,%ecx,4)           # ret = 3: a2++
        jz     4b
5:
        
        # a <- a - (BASE^p - 1)*a[2p..3p]
        movl   _a_,     %esi
        movl   _p_,     %ecx
        leal   (%esi,%ecx,8), %ebx      # ebx <- &a2
        leal   (,%ecx,2), %edx          # edx <- 2p
        incl   %ecx                     # ecx <- p+1
        call   .Lsn_finc                # a1:a0 += a2

        movl   %esi,    %ebx            # ebx <- &a2
        movl   _a_,     %esi
        movl   _p_,     %ecx
        leal   (%esi,%ecx,4), %esi      # esi <- &a1
        leal   1(,%ecx,2), %edx         # edx <- 2p+1
        incl   %ecx                     # ecx <- p+1
        pushf                           # save the carry of the addition
        call   .Lsn_fdec                # a2:a1 -= a2

        popf                            # propagate the carry of the addition
        jnc    2f
        movl   _p_,     %ecx
        incl   %ecx
        not    %ecx                     # ecx <- -p-2
1:
        incl   %ecx
        incl   (%esi,%ecx,4)
        jz     1b
        ALIGN_4
2:
        
#endif /* use_sse2 */

        # done: release the 6p digits of c and the three saved words
        movl   _p_,     %eax
        leal   (,%eax,2), %eax
        leal   3(%eax,%eax,2), %eax     # eax <- 6p+3
        leal   (%esp,%eax,4), %esp      # clean up the stack
        RETURN_WITH_SP

#endif /* assembly_sn_mmul */

        # case where the assembly version is disabled or being debugged:
        # .Lsn_mmul forwards to the C version
        # (the jump leaves the stack untouched, so the target sees the
        # arguments and return address of the original caller)

#if !defined(assembly_sn_mmul) || defined(debug_mmul)
        ALIGN_32
.Lsn_mmul:
        jmp   SUBR(sn_mmul)
#endif /* !defined(assembly_sn_mmul) || defined(debug_mmul) */



                        # +---------------------------+
                        # |  Square modulo BASE^n + 1 |
                        # +---------------------------+

#  void xn(msqr)(chiffre *a, long n)
#
#  input:
#  a = natural of length n+1
#
#  constraint: n > 0
#
#  output:
#  a <- a^2 mod (BASE^n + 1), the most significant digit is 0 or 1

#ifdef assembly_sn_msqr
#undef L
#define L(x) .Lsn_msqr_##x
#ifdef debug_mmul
ENTER(sn_msqr_buggy)
#else
        ALIGN_32
        .globl SUBR(sn_msqr)
#if __ELF__
        .type  SUBR(sn_msqr),@function
#endif
SUBR(sn_msqr):
.Lsn_msqr:
        pushl  %ebp
        movl   %esp,%ebp
        pushl  %edi
        pushl  %esi
        pushl  %ebx
#endif

        movl   arg1,    %esi            # esi <- &a
        movl   arg2,    %edx            # edx <- n

        # normalise a
        movl   (%esi,%edx,4), %eax      # eax <- a[n]
        subl   %eax,    (%esi)          # a[0..1] -= a[n]
        sbbl   $0,     4(%esi)
        jnb    L(a_ok)                  # pas de retenue -> fini
        leal   -2(%edx), %ecx           # ecx <- n-2
        leal   8(%esi), %edi            # edi <- &a[2]
1:
        sbbl   $0,      (%edi)          # propage la retenue
        jnb    L(a_ok)
        leal   4(%edi), %edi
        loop   1b
        movl   %edx,    %ecx            # si elle ressort, ajoute BASE^n + 1
        movl   %esi,    %edi
2:
        incl   (%edi)
        jnz    L(a_ok)
        leal   4(%edi), %edi
        loop   2b
        
        # ici a = BASE^n, donc le carr vaut 1
        movl   $0,      (%edi)
        movl   $1,      (%esi)
        RETURN_WITH_SP

        # ici a est normalis et tient sur n chiffres
        # n est-il divisible par 3 et suffisament grand ?
        ALIGN_4
L(a_ok):
        cmpl   $msqr_lim, %edx
        jbe    L(simple_sqr)
        movl   %edx,    %eax            # eax <- n
        xorl   %edx,    %edx            # edx:eax <- n
        movl   $3,      %ecx
        divl   %ecx                     # eax <- n/3, edx <- n%3
        testl  %edx,    %edx
        jz     L(trois)
        movl   arg2,    %edx            # edx <- n

        # cas n petit ou non divisible par 3 : carr dans N
L(simple_sqr):

        movl   %edx,    %ecx            # ecx <- n
        leal   (,%edx,8), %eax
	ALLOCA                          # rserve 2n chiffres dans la pile
        movl   %esp,    %edi            # edi <- &c
        pushl  %ecx                     # sauve n
        pushl  %esi                     # sauve &a
        call   .Lsn_ftoomsqr            # c <- a^2
        jmp    .Lsn_mmul_aux_simple     # continue avec mmul

        # cas n divisible par 3 : multiplie modulo BASE^p + 1
        # et modulo BASE^(2p) - BASE^p + 1

L(trois):

        # variables locales
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _p_
        #define _c_   12(%esp)
        #define _a_    8(%esp)
        #define _b_    4(%esp)
        #define _p_     (%esp)

	movl   %eax,      %ecx
        leal   (,%eax,8), %eax
        leal   (%eax,%eax,2), %eax      # eax <- 24p
	ALLOCA                          # rserve 6p chiffres dans la pile
        pushl  %esi                     # sauve &a
        pushl  %ebx                     # sauve &b
        pushl  %ecx                     # sauve p

        # dcompose a et b modulo BASE^(2p) - BASE^p + 1 et BASE^p + 1
        movl   %esi,   %edi
#ifndef use_sse2
        leal   _c_,    %ebx
#endif
        call   .Lsn_fmred

        # a[2p..3p] <- a^2 mod BASE^p + 1
        movl   _a_,     %esi
        leal   _c_,     %edi
        movl   _p_,     %ecx
        leal   (%esi,%ecx,8), %esi
        pushl  %ecx
        pushl  %esi
        call   .Lsn_msqr
        leal  8(%esp), %esp
        
        # c[2p..6p-1] <- a^2 mod (BASE^(2p) - BASE^p + 1), non rduit
        movl   _a_,     %esi
        leal   _c_,     %edi
        movl   _p_,     %edx
        leal   (,%edx,2), %edx          # ecx <- 2p
        leal   (%edi,%edx,4), %edi      # edi <- &c[2p]
        call   .Lsn_ftoomsqr

        jmp    .Lsn_mmul_aux_trois      # continue avec mmul
        
#endif /* assembly_sn_msqr */

        # case where the assembly version is disabled or being debugged:
        # .Lsn_msqr forwards to the C version
        # (the local label is still needed in that configuration because
        # other code in this file branches to .Lsn_msqr by name)

#if !defined(assembly_sn_msqr) || defined(debug_mmul)
        ALIGN_32
.Lsn_msqr:
        jmp   SUBR(sn_msqr)             # tail jump: the target returns straight to our caller
#endif /* !defined(assembly_sn_msqr) || defined(debug_mmul) */

                      # +-------------------------------+
                      # |  Butterfly modulo BASE^n + 1  |
                      # +-------------------------------+

# void xn(butterfly1)(chiffre *a, chiffre *b, long n, long q, int s)
#
#  input:
#  a = natural number of length n+1
#  b = natural number of length n+1, distinct from a
#  q = nonnegative integer
#  s = 0 or 1
#
#  constraints: n >= 3 and when q is odd, n must be even
#
#  output:
#  a <- a + (-1)^s * b * 2^(q/2) mod (BASE^n + 1)
#  b <- a - (-1)^s * b * 2^(q/2) mod (BASE^n + 1)
#
#  remark: 2^(1/2) = BASE^(3n/4)*(BASE^(n/2) + 1) mod (BASE^n + 1)
#
#  NOTE(review): this header names butterfly1 while the entry point
#  below is sn_butterfly - confirm which name is current.
#
#  outline of the code below:
#  a[n] is first forced into the range 2..BASE-3.  When q is odd, b is
#  multiplied by BASE^(n/2) + 1 and the shift count becomes q/2 + 24n.
#  The shift count is then split into m whole digits (reduced modulo n,
#  flipping s for each multiple of n) and k bits.  b is shifted by k bits
#  and normalized, x <- b*BASE^m is built on the stack, and finally
#  a and b receive a +/- x.

#ifdef assembly_sn_butterfly
#undef L
#define L(x) .Lsn_butterfly_##x
#ifdef debug_butterfly
ENTER(sn_butterfly_buggy)
#else
ENTER(sn_butterfly)
#endif

        # force 2 <= a[n] <= BASE-3 to absorb carries
        # (the unbounded carry/borrow loops further down rely on this)
        movl   arg1,    %esi            # esi <- &a
        movl   arg3,    %ecx            # ecx <- n
        movl   (%esi,%ecx,4), %eax      # eax <- a[n]
        addl   $2,      %eax
        jnc    2f                       # a[n] < BASE - 2 ?
        subl   $2,      (%esi,%ecx,4)   # otherwise subtract 2*(BASE^n + 1)
        subl   $2,      (%esi)
        jnb    4f
1:
        leal   4(%esi), %esi            # propagate the borrow
        sbbl   $0,      (%esi)
        jb     1b
        jmp    4f
        ALIGN_4
2:
        cmpl   $4,      %eax            # a[n] >= 2 ?  (eax holds a[n] + 2)
        jnb    4f
        movl   %eax,    (%esi,%ecx,4)   # otherwise add 2*(BASE^n + 1)
        addl   $2,      (%esi)
        jnc    4f
3:
        leal   4(%esi), %esi            # propagate the carry
        incl   (%esi)
        jz     3b
4:

        movl   arg4,    %eax
        shrl   $1,      %eax            # eax <- q/2, CF <- low bit of q
        jnc    L(sqrt_2_done)

        # q is odd, b must be multiplied by (BASE^(n/2) + 1)
        # the other factor of 2^(1/2), BASE^(3n/4), is folded into the
        # shift count: 3n/4 digits of 32 bits = 24*n bits

        leal   (%ecx,%ecx,2), %edx
        leal   (%eax,%edx,8), %eax      # eax <- q/2 + 24*n
        movl   %eax,    arg4            # q <- q/2 + 24*n
        
#ifdef use_sse2

        movl   arg2,    %esi
        leal   (%esi,%ecx,2), %edi      # edi <- &b[n/2]
        leal   (%esi,%ecx,4), %esi      # esi <- &b[n]
        shrl   $1,      %ecx
        negl   %ecx                     # ecx <- -n/2
        movl   %ecx,    %edx            # edx <- -n/2

        # initial carries:
        # -b[n] on the high part (mm1)
        # -floor(b[n]/2) on the low part (mm0)
        # b[n] <- ceil(b[n]/2) to avoid a carry leaving the top
        movd  (%esi),   %mm2            # mm2 <- b[n]
        pxor   %mm0,    %mm0
        pxor   %mm1,    %mm1
        movq   %mm2,    %mm3
        psubq  %mm2,    %mm1            # mm1 <- -b[n]
        psrlq  $1,      %mm2
        psubq  %mm2,    %mm3
        psubq  %mm2,    %mm0            # mm0 <- -floor(b[n]/2)
        movd   %mm3,   (%esi)           # b[n] <- ceil(b[n]/2)
        
        # one digit per pass; with ecx negative, edi indexes the low half
        # b[0..n/2-1] and esi indexes the high half b[n/2..n-1]
        ALIGN_4
1:
        movd  (%edi,%ecx,4), %mm2       # low <- low - high,
        movd  (%esi,%ecx,4), %mm3       # high <- high + low
        paddq  %mm2,    %mm0
        paddq  %mm2,    %mm1
        psubq  %mm3,    %mm0
        paddq  %mm3,    %mm1
        movd   %mm0,   (%edi,%ecx,4)
        movd   %mm1,   (%esi,%ecx,4)
        pshufw $0xfe, %mm0, %mm0        # next carry <- upper dword (low part)
        pshufw $0xfe, %mm1, %mm1        # next carry <- upper dword (high part)
        incl   %ecx
        jne    1b

        movd   %mm0,    %eax            # propagate the borrow leaving the low part
        testl  %eax,    %eax            # (test also clears CF for the sbb at 2:)
        jz      2f
1:
        subl   $1,     (%esi,%edx,4)    # through b[n/2..n-1]
        jnb    2f
        incl   %edx                     # inc leaves CF untouched
        jne    1b
2:
        movd   %mm1,    %eax            # b[n] += final carry
        sbbl   %ecx,    %eax            # ecx = 0 here, so this only subtracts CF
        addl   %eax,   (%esi)
        
        movl   arg3,    %ecx            # ecx <- n
        movl   arg4,    %eax            # eax <- q

#else /* use_sse2 */
        
        shrl   $1,      %ecx
        incl   %ecx                     # ecx <- n/2 + 1
        leal   (,%ecx,4), %eax
	ALLOCA                          # reserve n/2 + 1 digits on the stack
        movl   %esp,    %edi            # edi <- &x
        movl   arg2,    %esi
        movl   %esi,    %ebx            # ebx <- &b[0]
        leal   -4(%esi,%ecx,4), %esi    # esi <- &b[n/2]
        cld;   REP(movsl)               # x <- b[n/2..n]

        # high <- high + low
        # (the code after the call assumes the helper returns with ecx = 0,
        # esi = &b[n], edi = &b and CF = carry out - TODO confirm)
        movl   arg3,    %ecx
        shrl   $1,      %ecx
        leal   (%ebx,%ecx,4), %esi      # esi <- &b[n/2]
        movl   %ebx,    %edi            # edi <- &b
        pushl  %ebp                     # ebp is saved around the helper call
        call   .Lsn_finc_1              # b[n/2..n-1] += b[0..n/2-1]
        popl   %ebp
        adcl   %ecx,    (%esi)          # b[n] += carry
        jnc    2f
        decl   (%esi)                   # on carry, b[n] <- BASE-1
        movl   %edi,    %esi            # ... and b <- b - 1
1:
        subl   $1,      (%esi)
        leal   4(%esi), %esi
        jb     1b
2:
        
        # low <- low - high
        # (the stack cleanup after the call assumes the helper leaves ebx
        # just past the end of x, and esi just past b[n/2] - TODO confirm)
        movl   %edi,    %esi            # esi <- &b
        movl   %esp,    %ebx            # ebx <- &x
        movl   arg3,    %ecx
        shrl   $1,      %ecx
        incl   %ecx                     # ecx <- n/2 + 1
        pushl  %ebp
        call   .Lsn_fdec_1              # b[0..n/2] -= x
        popl   %ebp
        movl   %ebx,    %esp            # clean up the stack (mov keeps CF)
1:
        sbbl   $0,      (%esi)          # propagate the borrow
        leal   4(%esi), %esi
        jb     1b
        
        movl   arg3,    %ecx            # ecx <- n
        movl   arg4,    %eax            # eax <- q
        
#endif /* use_sse2 */
        
        # here eax = shift count in bits (q/2, plus 24n when q was odd)
        # and ecx = n
        ALIGN_4
L(sqrt_2_done):
        
        # split the shift into a digit count and a fraction of a digit
        movl   %eax,    %edx
        andl   $31,     %edx            # edx <- q % 32
        shrl   $5,      %eax            # eax <- q / 32 = m
        # reduce m modulo n; each multiple of n removed flips s,
        # since BASE^n = -1 modulo BASE^n + 1
        subl   %ecx,    %eax            # m -= n
        jb     2f
1:
        xorl   $1,      arg5            # s ^= 1
        subl   %ecx,    %eax            # m -= n
        jae    1b
2:
        addl   %ecx,    %eax            # m += n
        movl   %eax,    arg4            # save m in place of q

        # b <- b*2^k mod (BASE^n + 1), normalized
        movl   %edx,    %ecx            # ecx <- k
        movl   arg2,    %edi            # edi <- &b
        jecxz  L(norm)                  # when k = 0, just normalize b (edx = 0 then)
        
        # shift by k bits, from the top digit down, two digits per pass
        movl   arg3,    %edx            # edx <- n
        movl   (%edi,%edx,4), %ebx      # ebx <- b[n]
        movl   %ebx,    %eax            # eax <- b[n]
        movl   %ebx,    %esi            # esi <- b[n]
        incl   %edx
        shrl   $1,      %edx            # edx <- (n+1)/2, CF set when n is even
        jnc    2f                       # n odd: enter in the middle of the pass
        ALIGN_4
1:
        movl  -4(%edi,%edx,8), %ebx     # ebx <- b[2i-1]
        shldl  %cl, %ebx, %eax          # eax <- (carry:b[2i-1]) << k mod BASE
        movl   %eax, (%edi,%edx,8)      # store into b[2i]
2:
        movl  -8(%edi,%edx,8), %eax     # eax <- b[2i-2]
        shldl  %cl, %eax, %ebx          # ebx <- (carry:b[2i-2]) << k mod BASE
        movl   %ebx, -4(%edi,%edx,8)    # store into b[2i-1]
        decl   %edx
        jne    1b
        shll   %cl,     %eax            # eax <- b[0] << k mod BASE
        movl   %eax,    (%edi)          # store into b[0]
        shldl  %cl, %esi, %edx          # edx <- bits of the old b[n] shifted out of the top (edx was 0)
        
        # normalize b: subtract the digits at and above position n
        # (b[n] and the overflow in edx) from b[1]:b[0]
        # also reached directly when k = 0, with edx = 0
L(norm):
        movl   arg3,    %ecx            # ecx <- n
        leal   (%edi,%ecx,4), %esi      # esi <- &b[n]
        negl   %ecx
        leal   2(%ecx), %ecx            # ecx <- 2 - n
        movl   (%esi),  %eax            # eax <- b[n]
        subl   %eax,    (%edi)          # subtract from b[1]:b[0]
        sbbl   %edx,    4(%edi)
        jnb    L(b_ok)
1:
        sbbl   $0,      (%esi,%ecx,4)   # propagate the borrow through b[2..n-1]
        jnb    L(b_ok)
        incl   %ecx                     # inc leaves CF untouched
        jnz    1b
        movl   arg3,    %ecx            # borrow came out of the top: add BASE^n + 1
        movl   %edi,    %esi
2:
        incl   (%esi)
        jnz    L(b_ok)
        leal   4(%esi), %esi
        loop   2b

        # case b = BASE^n, i.e. b = -1: the products are -/+ BASE^m,
        # applied directly to copies of a
        movl   arg1,    %esi            # esi <- &a
        movl   arg3,    %ecx
        incl   %ecx                     # ecx <- n+1
        movl   arg4,    %edx
        subl   %ecx,    %edx            # edx <- m - n - 1
        cld;   REP(movsl)               # b <- a  (esi, edi end at &a[n+1], &b[n+1])
        leal   (%esi,%edx,4), %esi      # esi <- &a[m]
        leal   (%edi,%edx,4), %edi      # edi <- &b[m]
        testl  $1,      arg5            # when s = 1, swap the pointers
        jz     1f
        xchgl  %esi,    %edi
1:                                      # a -= BASE^m
        subl   $1,      (%esi)
        leal   4(%esi), %esi
        jb     1b
2:                                      # b += BASE^m
        incl   (%edi)
        leal   4(%edi), %edi
        jz     2b
        RETURN_WITH_BP

        # here b is known to fit in n digits
        # x <- b*BASE^m mod (BASE^n + 1) 
        # x is a buffer of n+1 digits built on the stack
        ALIGN_4
L(b_ok):
        cld
        movl   %edi,    %esi            # esi <- &b
        movl   arg3,    %ecx            # ecx <- n
        movl   arg4,    %edx            # edx <- m
        pushl  $0                       # x[n] <- 0
        leal   (,%ecx,4), %eax
	ALLOCA                          # reserve n more digits on the stack
        subl   %edx,    %ecx            # ecx <- n-m
        cmpl   %edx,    %ecx            # when m > n-m, flip the sign so that
        jb     L(reverse)               # the "rep movsl" copy
                                        # handles the longer part

        # here m <= n-m : x[m..n-1] <- b[0..n-m-1]
        leal   (%esp,%edx,4), %edi      # edi <- &x[m]
        REP(movsl)                      # ends with edi = &x[n], esi = &b[n-m], ecx = 0

        # x[0..m-1] <- 1 - b[n-m..n-1]
        xchgl  %edx,    %ecx            # ecx <- m, edx <- 0
        jecxz  L(x_ok)                  # m = 0: x is complete
        movl   $1,      %eax            # initial carry
        movl   %eax,    (%edi)          # x[n] <- 1 to absorb the borrow
        leal   (%esp,%ecx,4), %edi      # edi <- &x[m]
        leal   (%esi,%ecx,4), %esi      # esi <- &b[n]
        negl   %ecx

#ifdef use_sse2

        movd   %eax,    %mm1
        ALIGN_4
1:
        movd   (%esi,%ecx,4), %mm0
        psubq  %mm0,   %mm1             # mm1 <- carry - b[i+n-m]
        movd   %mm1,  (%edi,%ecx,4)     # store into x[i]
        pshufw $0xfe,  %mm1, %mm1       # new carry
        incl   %ecx
        jne    1b
        movd   %mm1,   %eax             # eax <- final carry
        shrl   $1,     %eax             # CF <- carry
        
#else /* use_sse2 */
        
        clc
        ALIGN_4
1:
        sbbl   (%esi,%ecx,4), %eax      # eax -= b[i+n-m]
        movl   %eax,    (%edi,%ecx,4)   # store into x[i]
        movl   %edx,    %eax            # reset the accumulator to 0 (mov keeps CF)
        incl   %ecx                     # inc leaves CF untouched
        jne    1b
        
#endif /* use_sse2 */
        
2:
        sbbl   %ecx,   (%edi)           # propagate the borrow into x[m..] (ecx = 0)
        leal   4(%edi), %edi
        jc     2b
        jmp    L(x_ok)

        # here m > n-m > 0
        ALIGN_4
L(reverse):
        xorl   $1,      arg5            # s <- 1 - s

        # x[0..m-1] <- b[n-m..n-1]
        leal   (%esi,%ecx,4), %esi      # esi <- &b[n-m]
        movl   %esp,    %edi            # edi <- &x
        xchgl  %ecx,    %edx            # ecx <- m, edx <- n-m
        REP(movsl)                      # ends with edi = &x[m], ecx = 0

        # x[m..n-1] <- -b[0..n-m-1]
        xchgl  %ecx,    %edx            # ecx <- n-m, edx <- 0
        movl   arg2,    %esi            # esi <- &b
        leal   (%esi,%ecx,4), %esi      # esi <- &b[n-m]
        leal   (%edi,%ecx,4), %edi      # edi <- &x[n]
        negl   %ecx

#ifdef use_sse2

        pxor   %mm1,    %mm1
        ALIGN_4
1:
        movd   (%esi,%ecx,4), %mm0
        psubq  %mm0,   %mm1
        movd   %mm1,  (%edi,%ecx,4)     # store into x[i]
        pshufw $0xfe,  %mm1, %mm1       # new carry
        incl   %ecx
        jne    1b
        movd   %mm1,   %eax             # eax <- final carry
        shrl   $1,     %eax             # CF <- carry
                
#else /* use_sse2 */
        
        clc
        ALIGN_4
1:
        movl   %edx,    %eax            # eax <- 0 (mov keeps CF)
        sbbl   (%esi,%ecx,4), %eax      # eax <- -b[i] - borrow
        movl   %eax,    (%edi,%ecx,4)   # store into x[m+i]
        incl   %ecx                     # inc leaves CF untouched
        jne    1b
        
#endif /* use_sse2 */
        
        movl   %esp,    %edi            # edi <- &x
2:
        adcl   %ecx,    (%edi)          # feed the carry back in at x[0] (ecx = 0)
        leal   4(%edi), %edi
        jc     2b
L(x_ok):
        
#ifdef use_sse2
        
        # a <- a + (-1)^s*x, b <- a - (-1)^s*x
        # (register contract of .Lsn_fadd_sub is not visible here; the two
        # destinations appear to be edx and edi - TODO confirm)
        movl   arg1,    %esi            # esi <- &a
        movl   %esp,    %ebx            # ebx <- &x
        movl   arg2,    %edi            # edi <- &b
        movl   %esi,    %edx            # edx <- &a
        movl   arg3,    %ecx
        incl   %ecx                     # ecx <- n+1
        testl  $1,      arg5
/*      cmovz  %edi,    %edx            # when s = 0, swap edi and edx
        cmovz  %esi,    %edi */
	.byte  0x0f, 0x44, 0xd7, 0x0f, 0x44, 0xfe  # hand-encoded form of the two cmovz above
        call   .Lsn_fadd_sub
        RETURN_WITH_BP
        
#else /* use_sse2 */
        
        # a <- a + (-1)^s*x, b <- a - (-1)^s*x
        # b is written first from the old a, then a is updated in place
        movl   arg1,    %esi            # esi <- &a
        movl   %esp,    %ebx            # ebx <- &x
        movl   arg2,    %edi            # edi <- &b
        movl   arg3,    %ecx
        incl   %ecx                     # ecx <- n+1
        testl  $1,      arg5
        jz     L(fsub)
        pushl  %ebp
        call   .Lsn_fadd_1              # when s is set, xn(add)(a,n+1,x,n+1,b)
        popl   %ebp
        movl   arg1,    %esi            # esi <- &a
        movl   %esp,    %ebx            # ebx <- &x
        movl   arg3,    %ecx
        incl   %ecx                     # ecx <- n+1
        pushl  %ebp
        call   .Lsn_fdec_1              # xn(dec)(a,n+1,x,n+1)
        popl   %ebp
        RETURN_WITH_BP

        ALIGN_4
L(fsub):
        pushl  %ebp
        call   .Lsn_fsub_1              # otherwise, xn(sub)(a,n+1,x,n+1,b)
        popl   %ebp
        movl   arg1,    %esi            # esi <- &a
        movl   %esp,    %ebx            # ebx <- &x
        movl   arg3,    %ecx
        incl   %ecx                     # ecx <- n+1
        pushl  %ebp
        call   .Lsn_finc_1              # xn(inc)(a,n+1,x,n+1)
        popl   %ebp
        RETURN_WITH_BP

#endif /* use_sse2 */
        
#endif /* assembly_sn_butterfly */

