// file kernel/n/x86/div_n2.S: O(n^2) division of natural integers
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                         Division quadratique                          |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                         # +-------------------------+
                         # |  Single-digit division  |
                         # +-------------------------+


# unsigned long xn(div_1)(chiffre *a, long la, unsigned long b, chiffre *c)
#
# input:
# a = natural number of length la >= 0
# b = long > 0
# c = natural number of length la, may be the same as a
#
# output:
# c <- floor(a/b)
# returns a mod b
        
#ifdef assembly_sn_div_1
#undef L
#define L(x) .Lsn_div_1_##x
ENTER(sn_div_1)

        # Schoolbook division of a by the single digit b, walking from
        # the most significant digit down to a[0]. edx carries the running
        # remainder from one digit to the next.
        xorl   %edx,    %edx            # remainder <- 0
        movl   arg2,    %ecx            # ecx <- la (digits still to process)
        movl   arg1,    %esi            # esi <- &a
        movl   arg4,    %edi            # edi <- &c
        movl   arg3,    %ebx            # ebx <- b
        testl  %ecx,    %ecx
        jz     L(done)                  # la = 0: nothing to divide
        ALIGN_4

L(digit):
        movl   -4(%esi,%ecx,4), %eax    # edx:eax <- remainder:a[ecx-1]
        divl   %ebx                     # eax <- quotient digit, edx <- remainder
        movl   %eax,   -4(%edi,%ecx,4)  # c[ecx-1] <- quotient digit
        decl   %ecx
        jnz    L(digit)

L(done):
        movl   %edx,    %eax            # return value <- final remainder
        RETURN_WITH_SP

#endif /* assembly_sn_div_1 */
        
# unsigned long xn(mod_1)(chiffre *a, long la, unsigned long b)
#
# input:
# a = natural number of length la >= 0
# b = long > 0
#
# output:
# returns a mod b
        
#ifdef assembly_sn_mod_1
#undef L
#define L(x) .Lsn_mod_1_##x
ENTER(sn_mod_1)

        # Same digit-by-digit division as the single-digit quotient
        # routine, except that the quotient digits are discarded: only the
        # running remainder in edx is kept.
        xorl   %edx,    %edx            # remainder <- 0
        movl   arg2,    %ecx            # ecx <- la (digits still to process)
        movl   arg1,    %esi            # esi <- &a
        movl   arg3,    %ebx            # ebx <- b
        testl  %ecx,    %ecx
        jz     L(done)                  # la = 0: the remainder is 0
        ALIGN_4

L(digit):
        movl   -4(%esi,%ecx,4), %eax    # edx:eax <- remainder:a[ecx-1]
        divl   %ebx                     # edx <- new remainder (eax is unused)
        decl   %ecx
        jnz    L(digit)

L(done):
        movl   %edx,    %eax            # return value <- final remainder
        RETURN_WITH_SP

#endif /* assembly_sn_mod_1 */
        

                         # +------------------------+
                         # |  Division quadratique  |
                         # +------------------------+

# input:
#   a = natural number of length la     esi = &a, edx = la-lb
#   b = natural number of length lb     ebx = &b, ecx = lb
#   c = natural number of length la-lb  edi = &c
#
# constraints:
# two <= lb < la, the most significant bit of b is set,
# a < BASE^(la-lb)*b
# a,b,c do not overlap
#
# output:
# a <- a mod b
# c <- floor(a/b)
#
# clobbered registers:
#   eax,ebx,ecx,edx,esi,edi,ebp <- undefined

#ifdef assembly_sn_div_n2
#undef L
#define L(x) .Lsn_fdiv_n2_##x
        ALIGN_32
#ifdef debug_div_n2
.Lsn_fdiv_n2_buggy:
#else
.Lsn_fdiv_n2:
#endif
#ifdef use_sse2

        # SSE2 variant of the quadratic division (register interface).
        # Each pass of the main loop yields one quotient digit: q is
        # estimated from a[la-1]:a[la-2] and b[lb-1], q*b is subtracted
        # from the top digits of a, and b is added back (with q decremented)
        # as long as the result is negative.
        #
        # register roles inside the main loop:
        #   esi = &a[la-1], ebx = &b[lb], edi = &c[la-lb], ecx = -lb
        #   ebp = b[lb-1], mm7 = BASE-1, mm6 = (BASE-1)^2
        #   mm0 = quotient digit q, mm3 = running carry, mm4 = top digit

        # local variables (on the stack): a countdown of the quotient
        # digits still to compute, starting at la-lb, and the value -lb
        #undef _la_
        #undef _lb_
        #define _la_  4(%esp)
        #define _lb_   (%esp)

        leal -4(%esi,%edx,4), %esi
        leal   (%esi,%ecx,4), %esi      # esi <- &a[la-1]
        leal   (%ebx,%ecx,4), %ebx      # ebx <- &b[lb]
        negl   %ecx                     # ecx <- -lb
        leal   (%edi,%edx,4), %edi      # edi <- &c[la-lb]
        pushl  %edx                     # save la-lb
        pushl  %ecx                     # save -lb

        movl   $-1,     %eax
        movd   %eax,    %mm7            # mm7 <- BASE-1
        movq   %mm7,    %mm6
        pmuludq %mm6,   %mm6            # mm6 <- (BASE-1)^2
        movl   -4(%ebx),%ebp            # ebp <- b[lb-1]
        
        # main loop
        ALIGN_4
L(main):

        # compute the approximate quotient, too large by at most 2 units
        movl   (%esi),  %edx            # edx:eax <- a[la-1]:a[la-2]
        movl   -4(%esi),%eax
        movd   %edx,    %mm4            # mm4 <- a[la-1]
        cmpl   %ebp,    %edx            # truncated division by b[lb-1]
        jb     1f
        movq   %mm7,    %mm0            # when q >= BASE, q <- BASE-1
        jmp    2f
        ALIGN_4
1:
        divl   %ebp
        movd   %eax,    %mm0            # mm0 <- q = quotient
2:

        # a <- a - q*b
        # trick taken from GMP (pentium4/sse2/submul_1.asm):
        # BASE^2-BASE is added before the shift and BASE-1 is subtracted
        # afterwards, so that psrlq always shifts a number in the range
        # from 0 to BASE^2-1. Overall, the initial carry is BASE-1,
        # (BASE-1)^2 is added at each iteration, and BASE-1 is subtracted
        # at the end.
        movq    %mm7,    %mm3           # carry <- BASE-1
        ALIGN_4
1:
        movd (%esi,%ecx,4), %mm1        # mm1 <- a[i]
        movd (%ebx,%ecx,4), %mm2        # mm2 <- b[i]
        paddq   %mm1,   %mm3            # mm3 <- carry + a[i]
        pmuludq %mm0,   %mm2            # mm2 <- q*b[i]
        paddq   %mm6,   %mm3            # mm3 <- carry + a[i] + (BASE-1)^2
        psubq   %mm2,   %mm3            # mm3 <- ... - q*b[i]
        movd    %mm3,  (%esi,%ecx,4)    # store a[i]
        incl    %ecx                    # ZF is set once ecx reaches 0
        psrlq   $32,    %mm3            # mm3 <- new carry + (BASE-1)
        jne     1b                      # tests ZF from incl (psrlq leaves the flags alone)
        psubq   %mm7,   %mm4            # mm4 <- a[la-1] - (BASE-1): cancels the carry bias
2:      
        paddq   %mm3,   %mm4            # last digit: mm4 += carry
        pextrw  $3, %mm4, %eax          # eax <- bits 48..63 of mm4
        testl  %eax,    %eax
        jz     L(next)                  # zero: the result is not negative

        # when < 0, decrement q and add b back
        paddq  %mm7,    %mm0            # q-- (adding BASE-1 decrements the low 32 bits)
        pxor   %mm3,    %mm3            # carry <- 0
        movl   _lb_,    %ecx            # ecx <- -lb
3:
        movd  (%esi,%ecx,4), %mm1       # mm1 <- a[i]
        movd  (%ebx,%ecx,4), %mm2       # mm2 <- b[i]
        paddq  %mm1,    %mm3
        paddq  %mm2,    %mm3            # mm3 <- carry + a[i] + b[i]
        movd   %mm3,   (%esi,%ecx,4)    # store a[i]
        incl   %ecx                     # ZF is set once ecx reaches 0
        pshufw $0xfe, %mm3, %mm3        # low dword of mm3 <- its high dword (carry out)
        jne    3b                       # tests ZF from incl (pshufw leaves the flags alone)
        jmp    2b                       # add the carry to the top digit and test the sign again

        # end of the main loop
        ALIGN_4
L(next):
        movl   %eax,   (%esi)           # a[la-1] <- 0 (eax is zero on this path)
        leal   -4(%esi),%esi            # a--
        leal   -4(%edi),%edi            # c--
        movl   _lb_,    %ecx            # ecx <- -lb for the next pass
        movd   %mm0,   (%edi)           # store q at the new position in c
        decl   _la_                     # one more quotient digit done
        jne    L(main)
        leal   8(%esp), %esp            # clean up the stack
        emms
        ret
        
#else /* use_sse2 */
                
        # Plain x86 variant of the quadratic division (register interface).
        # Each pass of the main loop yields one quotient digit: q is
        # estimated from a[la-1]:a[la-2] and b[lb-1], q*b is subtracted from
        # the top digits of a by an inner loop unrolled for a group of
        # 32 digits, and b is added back (with q decremented) while the
        # subtraction leaves a borrow. The unrolled loop is entered through
        # an indirect jump whose target depends on (-lb) % 32.
        #
        # stack frame, from the highest address down:
        #   b0  = b[0]
        #   la  = countdown of quotient digits, starts at la-lb
        #   lb  = -lb
        #   c   = saved pointer into c
        #   br  = entry address into the unrolled loop
        #   ctr = pass counter of the unrolled loop (negative, counts up to 0)

        # local variables
        #undef _c_
        #undef _la_
        #undef _lb_
        #undef _b0_
        #undef _br_
        #undef _ctr_
        #define _b0_  20(%esp)
        #define _la_  16(%esp)
        #define _lb_  12(%esp)
        #define _c_    8(%esp)
        #define _br_   4(%esp)
        #define _ctr_   (%esp)

        pushl  (%ebx)                   # save b0
        leal -4(%esi,%edx,4), %esi
        leal   (%esi,%ecx,4), %esi      # esi <- &a[la-1]
        leal   (%ebx,%ecx,4), %ebx      # ebx <- &b[lb]
        negl   %ecx                     # ecx <- -lb
        leal   (%edi,%edx,4), %edi      # edi <- &c[la-lb]
        pushl  %edx                     # save la-lb
        pushl  %ecx                     # save -lb
        leal -4(%esp),  %esp            # reserve room for c, br and ctr
        call  L(here)                   # pushes the address of the next label into the br slot
L(here):
        leal -4(%esp),  %esp            # ctr slot

        # precompute the jump target inside the inner loop:
        # br <- address of the unrolled loop + 17*(1 + (-lb) % 32)
        movl  %ecx,     %eax
        andl  $31,      %eax            # eax <- (-lb) % 32
        leal  (,%eax,8),%edx            # multiplication by 17 = 8*2 + 1
        leal  17+L(sub)-L(here)(%eax,%edx,2), %eax
        addl  %eax,     _br_

        # main loop
        # reached with esi = &a[la-1], ebx = &b[lb], edi = &c[la-lb]
        #               and ecx = -lb
        ALIGN_4
L(main):
        movl  %edi,     _c_             # c <- &c[la-lb]

        # compute the approximate quotient, too large by at most 2 units
        movl   (%esi),  %edx            # edx:eax <- a[la-1]:a[la-2]
        movl   -4(%esi),%eax
        movl   -4(%ebx),%ebp            # ebp <- b[lb-1]
        cmpl   %ebp,    %edx            # truncated division by b[lb-1]
        jb     1f
        movl   $-1,     %ebp            # when q >= BASE, q <- BASE-1
        jmp    2f
        ALIGN_4
1:
        divl   %ebp
        movl   %eax,    %ebp            # ebp <- q = quotient
2:
        movl   %ebp, -4(%edi)           # c[la-lb-1] <- q

        # a <- a - q*b
        andl   $-32,    %ecx            # ecx <- -lb rounded down to a multiple of 32
        leal -4(%esi,%ecx,4), %esi      # move a and b back by that many digits
        leal   (%ebx,%ecx,4), %ebx
        sarl   $5,      %ecx
        movl   %ecx,    _ctr_           # ctr <- -(number of 32-digit passes)
        
        movl   _b0_,    %eax
        mull   %ebp                     # edx:eax <- q*b[0]
        xorl   %edi,    %edi
        xorl   %ecx,    %ecx            # edi = ecx = 0, and CF = 0
        jmp    *_br_                    # enter the unrolled loop at the precomputed point
        
        # loop body to unroll; code size = 17 bytes per digit, two digits
        # per macro expansion (the jump target computed above relies on it)
        # enter the loop with edx:eax = carry, edi = ecx = 0, CF = 0
        # code inspired by GMP (k7/mul_basecase.asm)
        # notes:
        #   the two .byte sequences encode the instruction shown in the
        #   comment just above them with an explicit 8-bit displacement,
        #   which keeps the size fixed even when the displacement is zero;
        #   edi and ecx are cleared with movl, which leaves CF intact for
        #   the adcl that follows
#undef BODY
#define BODY(x,y) \
          adcl   %eax,    %edi           /* edi += current low part */;\
       /* movl   x(%ebx), %eax              eax <- b[2i+1]          */;\
	  .byte  0x8b, 0x43, x                                        ;\
          adcl   %edx,    %ecx           /* ecx <- current high part */;\
          mull   %ebp                    /* multiply by q           */;\
       /* subl   %edi,    x(%esi)           a[2i] <- previous low part */;\
	  .byte  0x29, 0x7e, x                                        ;\
          movl   $0,      %edi                                        ;\
          adcl   %eax,    %ecx           /* ecx += current low part */;\
          movl   y(%ebx), %eax           /* eax <- b[2i+2]          */;\
          adcl   %edx,    %edi           /* edi <- current high part */;\
          mull   %ebp                    /* multiply by q           */;\
          subl   %ecx,    y(%esi)        /* a[2i+1] <- previous low part */;\
          movl   $0,      %ecx

        # multiplication loop unrolled for a group of 32 digits
        ALIGN_4
L(sub):
        BODY(0,4);    BODY(8,12);    BODY(16,20);   BODY(24,28)
        BODY(32,36);  BODY(40,44);   BODY(48,52);   BODY(56,60)
        BODY(64,68);  BODY(72,76);   BODY(80,84);   BODY(88,92)
        BODY(96,100); BODY(104,108); BODY(112,116); BODY(120,124)

        incl   _ctr_                    # sets ZF on the last pass, leaves CF untouched
        leal   128(%esi), %esi          # a += 32
        leal   128(%ebx), %ebx          # b += 32
        jne    L(sub)
        
        adcl   %eax,    %edi            # subtract the last product
        adcl   %edx,    %ecx
        subl   %edi,   (%esi)
        sbbl   %ecx,  4(%esi)           # CF = 1 when q*b was larger than a

        # prepare the next iteration
        movl   _c_,     %edi
        leal   -4(%edi),%edi            # c--
        movl   _lb_,    %ecx            # ecx <- -lb
        jnb    L(next)                  # CF still comes from sbbl: no borrow, q is final

        # when < 0, decrement q and add b back
        ALIGN_4
L(add):
        decl   (%edi)                   # q--
        clc
        ALIGN_4
1:
        movl   (%ebx,%ecx,4), %eax      # eax <- b[i]
        adcl   %eax, 4(%esi,%ecx,4)     # a[i] += b[i] + carry
        incl   %ecx                     # sets ZF at the end, leaves CF untouched
        jne    1b
        adcl   %ecx, 4(%esi)            # ecx = 0 here: adds the carry to the top digit
        movl   _lb_, %ecx               # ecx <- -lb
        jnb    L(add)                   # no carry out: still negative, add b once more

        # end of the main loop
        ALIGN_4
L(next):
        decl   _la_                     # one more quotient digit done
        jne    L(main)
        leal   24(%esp), %esp           # clean up the stack
        ret
        
#endif /* use_sse2 */
        
                              # +---------------+
                              # |  Interface C  |
                              # +---------------+
        

# void xn(div_n2)(chiffre *a, long lc, chiffre *b, long lb, chiffre *c)
#
# input:
# a = natural number of length lc+lb
# b = natural number of length lb
# c = natural number of length lc
#
# constraints:
# lb >= 2, lc > 0, the most significant bit of b is set,
# a < BASE^lc*b
# a,b,c do not overlap
#
# output:
# a <- a mod b
# c <- floor(a/b)

#ifndef debug_div_n2
ENTER(sn_div_n2)
#else
ENTER(sn_div_n2_buggy)
#endif

        # C entry point: move the five stack arguments into the registers
        # expected by the internal division routine, then call it.
        movl   arg5,    %edi            # edi <- &c
        movl   arg4,    %ecx            # ecx <- lb
        movl   arg3,    %ebx            # ebx <- &b
        movl   arg2,    %edx            # edx <- lc = la-lb
        movl   arg1,    %esi            # esi <- &a
#ifndef debug_div_n2
        call   .Lsn_fdiv_n2             # perform the division
#else
        call   .Lsn_fdiv_n2_buggy       # perform the division
#endif
        RETURN_WITH_SP

#endif /* assembly_sn_div_n2 */

        # case where the assembly version is disabled or being debugged:
        # sn_fdiv_n2 forwards to the C version
        
#if !defined(assembly_sn_div_n2) || defined(debug_div_n2)
        ALIGN_32
.Lsn_fdiv_n2:

        # Register-interface wrapper around the C routine sn_div_n2.
        # The five arguments a, lc, b, lb, c are passed on the stack,
        # pushed from last to first, and removed by this wrapper after
        # the call returns.
        pushl  %edi                     # c
        pushl  %ecx                     # lb
        pushl  %ebx                     # b
        pushl  %edx                     # lc = la-lb
        pushl  %esi                     # a
        call   SUBR(sn_div_n2)
        addl   $20,     %esp            # drop the five arguments
        ret

#endif /* !defined(assembly_sn_div_n2) || defined(debug_div_n2) */

