// file kernel/n/x86-64/gcd_n2.S: O(n^2) greatest common divisor
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                           PGCD quadratique                            |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                        # +------------------------+
                        # |     Two-digit GCD      |
                        # +------------------------+

# void xn(gcd_2)(chiffre *x)
#
# input:
#   x = array of 8 digits [a0,a1,b0,b1,p,s,q,r], pointer received in %rdi
#   a = a0 + BASE*a1, b = b0 + BASE*b1
#
# constraint: 0 < b < a < BASE*b
#
# Expands the fraction a/b as a continued fraction for as long as the
# coefficients fit in one digit.
#
# output:
#   [a0,a1,b0,b1] <- unspecified (this routine never stores to them)
#   [p,s,q,r]     <- coefficients of the combinations performed
#
# registers:
#   a = %rbx:%rax, b = %rdx:%rsi, p,q,r,s = %r8,%r9,%r10,%r11,
#   %cl = shift counter, %rdi = x.
#   %rbx is callee-saved under the System V AMD64 ABI, so it is pushed on
#   entry and popped just before the return.  The other registers written
#   here (rax, rcx, rdx, rsi, r8-r11) are caller-saved.
#
# NOTE(review): QUICKENTER is defined outside this file; since the routine
# ends with a bare ret it presumably emits only the entry label - confirm.

#ifdef assembly_sn_gcd_2
#undef L
#define L(x) .Lsn_gcd_2_##x
QUICKENTER(sn_gcd_2)

#undef  _a0_
#undef  _a1_
#undef  _b0_
#undef  _b1_
#undef  _p_
#undef  _q_
#undef  _r_
#undef  _s_
#define _a0_ %rax
#define _a1_ %rbx
#define _b0_ %rsi
#define _b1_ %rdx
#define _p_ %r8
#define _q_ %r9
#define _r_ %r10
#define _s_ %r11

        pushq  %rbx                    # preserve callee-saved register used for a1

        movq   (%rdi), _a0_
        movq  8(%rdi), _a1_
        movq 16(%rdi), _b0_
        movq 24(%rdi), _b1_

        # [p,q,r,s] <- Id
        movq   $1,     _p_
        movq   $0,     _q_
        movq   $0,     _r_
        movq   $1,     _s_
        movb   $-1,    %cl             # shift counter, bumped to 0 on first pass

        # here a > b, p > r, q >= s-1
        # shift b,p,r left while a >= 2b
L(loop):
        subq   _b0_,   _a0_
        sbbq   _b1_,   _a1_
L(shift_1):
        subq   _b0_,   _a0_
        sbbq   _b1_,   _a1_
        inc    %cl                     # inc leaves CF alone: jb still sees the borrow
        jb     L(div_1)
        shlq   $1,     _b0_
        rclq   $1,     _b1_
        shlq   $1,     _r_
        rclq   $1,     _p_
        jnc    L(shift_1)
        rcrq   $1,     _p_             # p overflowed: undo the last shift of p:r
        rcrq   $1,     _r_
        jmp    L(unshift_1)

        # here 2^i*b <= a < 2^(i+1)*b, p > r, q >= s-1
        # compute a/b by subtractions and shifts
L(div_1):
        addq   _b0_,   _a0_            # restore a after the failed trial subtraction
        adcq   _b1_,   _a1_
L(loop_10):
        addq   _r_,    _s_
        addq   _p_,    _q_
        jnc    L(loop_12)
        subq   _r_,    _s_             # q overflowed: undo the additions
        subq   _p_,    _q_
L(unshift_1):
        shrq   %cl,    _p_             # divide p and r by 2^i
        shrq   %cl,    _r_
        jmp    L(done)
L(loop_11):
        shrq   $1,     _b1_
        rcrq   $1,     _b0_
        shrq   $1,     _p_
        shrq   $1,     _r_
        subq   _b0_,   _a0_
        sbbq   _b1_,   _a1_
        jnb    L(loop_10)
        addq   _b0_,   _a0_            # borrow: restore a
        adcq   _b1_,   _a1_
L(loop_12):
        dec    %cl
        jns    L(loop_11)
        testq  _a0_,   _a0_            # a = 0 means finished
        jne    1f
        testq  _a1_,   _a1_
        jz     L(done)
1:

        # here b > a, p > r, q >= s
        # shift a,s,q left while b >= 2a
        subq   _a0_,   _b0_
        sbbq   _a1_,   _b1_
L(shift_2):
        subq   _a0_,   _b0_
        sbbq   _a1_,   _b1_
        inc    %cl                     # inc leaves CF alone: jb still sees the borrow
        jb     L(div_2)
        shlq   $1,     _a0_
        rclq   $1,     _a1_
        shlq   $1,     _s_
        rclq   $1,     _q_
        jnc    L(shift_2)
        rcrq   $1,     _q_             # q overflowed: undo the last shift of q:s
        rcrq   $1,     _s_
        jmp    L(unshift_2)

        # here 2^i*a <= b < 2^(i+1)*a, p > r, q >= s
        # compute b/a by subtractions and shifts
L(div_2):
        addq   _a0_,   _b0_            # restore b after the failed trial subtraction
        adcq   _a1_,   _b1_
L(loop_20):
        addq   _s_,    _r_
        addq   _q_,    _p_
        jnc    L(loop_22)
        subq   _s_,    _r_             # p overflowed: undo the additions
        subq   _q_,    _p_
L(unshift_2):
        shrq   %cl,    _s_             # divide q and s by 2^i
        shrq   %cl,    _q_
        jmp    L(done)
L(loop_21):
        shrq   $1,     _a1_
        rcrq   $1,     _a0_
        shrq   $1,     _s_
        shrq   $1,     _q_
        subq   _a0_,   _b0_
        sbbq   _a1_,   _b1_
        jnb    L(loop_20)
        addq   _a0_,   _b0_            # borrow: restore b
        adcq   _a1_,   _b1_
L(loop_22):
        dec    %cl
        jns    L(loop_21)
        testq  _b0_,   _b0_            # b = 0 means finished
        jne    L(loop)
        testq  _b1_,   _b1_
        jne    L(loop)

L(done):
        movq   _p_,    32(%rdi)        # store the coefficients in x[4..7]
        movq   _s_,    40(%rdi)
        movq   _q_,    48(%rdi)
        movq   _r_,    56(%rdi)
        popq   %rbx                    # restore callee-saved register
        ret

#endif /* assembly_sn_gcd_2 */


                      # +-----------------------------+
                      # |     Two-digit half-GCD      |
                      # +-----------------------------+

# void xn(hgcd_2)(chiffre *x)
#
# input:
#   x = array of 8 digits [a0,a1,b0,b1,p,s,q,r], pointer received in %rdi
#   a = a0 + BASE*a1, b = b0 + BASE*b1
#
# constraint: 0 < b < a
#
# Expands the fractions a/(b+1) and (a+1)/b as continued fractions for as
# long as the quotients agree and the coefficients fit in one digit.
#
# output:
#   [a0,a1,b0,b1] <- unspecified (this routine never stores to them)
#   [p,s,q,r]     <- coefficients of the combinations performed
#
# registers:
#   a = %rbx:%rax, b = %rdx:%rsi, p,q,r,s = %r8,%r9,%r10,%r11,
#   %cl = shift counter, %rdi = x.
#   %rbx is callee-saved under the System V AMD64 ABI, so it is pushed on
#   entry and popped just before the return.  The other registers written
#   here (rax, rcx, rdx, rsi, r8-r11) are caller-saved.
#
# NOTE(review): QUICKENTER is defined outside this file; since the routine
# ends with a bare ret it presumably emits only the entry label - confirm.

#ifdef assembly_sn_hgcd_2
#undef L
#define L(x) .Lsn_hgcd_2_##x
QUICKENTER(sn_hgcd_2)

#undef  _a0_
#undef  _a1_
#undef  _b0_
#undef  _b1_
#undef  _p_
#undef  _q_
#undef  _r_
#undef  _s_
#define _a0_ %rax
#define _a1_ %rbx
#define _b0_ %rsi
#define _b1_ %rdx
#define _p_ %r8
#define _q_ %r9
#define _r_ %r10
#define _s_ %r11

        pushq  %rbx                    # preserve callee-saved register used for a1

        movq   (%rdi), _a0_
        movq  8(%rdi), _a1_
        movq 16(%rdi), _b0_
        movq 24(%rdi), _b1_

        # [p,q,r,s] <- Id
        movq   $1,     _p_
        movq   $0,     _q_
        movq   $0,     _r_
        movq   $1,     _s_
        movb   $-1,    %cl             # shift counter, bumped to 0 on first pass

        addq   _p_,    _b0_            # b <- b+p
        adcq   $0,     _b1_
        subq   _b0_,   _a0_            # a <- (a-q) - (b+p), with q = 0 here
        sbbq   _b1_,   _a1_

        # here a-q >= b+p, p > r, q >= s-1
        # shift b,p,r left while a-q >= 2(b+p)
L(shift_1):
        subq   _b0_,   _a0_
        sbbq   _b1_,   _a1_
        inc    %cl                     # inc leaves CF alone: jb still sees the borrow
        jb     L(div_1)
        shlq   $1,     _b0_
        rclq   $1,     _b1_
        shlq   $1,     _r_
        rclq   $1,     _p_
        jnc    L(shift_1)
        rcrq   $1,     _p_             # p overflowed: undo the last shift of p:r
        rcrq   $1,     _r_
        jmp    L(unshift_1)

        # here 2^i*(b+p) <= a-q < 2^(i+1)*(b+p)
        # compute (a-q)/(b+p) by subtractions and shifts
L(div_1):
        addq   _b0_,   _a0_            # restore a after the failed trial subtraction
        adcq   _b1_,   _a1_
L(loop_10):
        addq   _r_,    _s_
        addq   _p_,    _q_
        jnc    L(loop_12)
        subq   _r_,    _s_             # q overflowed: undo the additions
        subq   _p_,    _q_
L(unshift_1):
        shrq   %cl,    _p_             # divide p and r by 2^i
        shrq   %cl,    _r_
        jmp    L(done)
L(loop_11):
        shrq   $1,     _b1_
        rcrq   $1,     _b0_
        shrq   $1,     _p_
        shrq   $1,     _r_
        subq   _b0_,   _a0_
        sbbq   _b1_,   _a1_
        jnb    L(loop_10)
        addq   _b0_,   _a0_            # borrow: restore a
        adcq   _b1_,   _a1_
L(loop_12):
        dec    %cl
        jns    L(loop_11)

        # end of the division of a-q by b+p
        # the registers hold b+p and a-q; turn them into b-r and a+s
        subq   _p_,     _b0_            # (b+p) - p = b
        sbbq   $0,      _b1_
        subq   _r_,     _b0_            # b <- b-r
        sbbq   $0,      _b1_
        addq   _s_,     _a0_            # (a-q) + s
        adcq   $0,      _a1_
        addq   _q_,     _a0_            # a <- a+s
        adcq   $0,      _a1_
        subq   _a0_,    _b0_            # b <- (b-r) - (a+s)
        sbbq   _a1_,    _b1_
        jb     L(done)                  # negative: finished

        # here b-r >= a+s, p > r, q >= s
        # shift a,s,q left while b-r >= 2(a+s)
L(shift_2):
        subq   _a0_,   _b0_
        sbbq   _a1_,   _b1_
        inc    %cl                     # inc leaves CF alone: jb still sees the borrow
        jb     L(div_2)
        shlq   $1,     _a0_
        rclq   $1,     _a1_
        shlq   $1,     _s_
        rclq   $1,     _q_
        jnc    L(shift_2)
        rcrq   $1,     _q_             # q overflowed: undo the last shift of q:s
        rcrq   $1,     _s_
        jmp    L(unshift_2)

        # here 2^i*(a+s) <= b-r < 2^(i+1)*(a+s), p > r, q >= s
        # compute (b-r)/(a+s) by subtractions and shifts
L(div_2):
        addq   _a0_,   _b0_            # restore b after the failed trial subtraction
        adcq   _a1_,   _b1_
L(loop_20):
        addq   _s_,    _r_
        addq   _q_,    _p_
        jnc    L(loop_22)
        subq   _s_,    _r_             # p overflowed: undo the additions
        subq   _q_,    _p_
L(unshift_2):
        shrq   %cl,    _s_             # divide q and s by 2^i
        shrq   %cl,    _q_
        jmp    L(done)
L(loop_21):
        shrq   $1,     _a1_
        rcrq   $1,     _a0_
        shrq   $1,     _s_
        shrq   $1,     _q_
        subq   _a0_,   _b0_
        sbbq   _a1_,   _b1_
        jnb    L(loop_20)
        addq   _a0_,   _b0_            # borrow: restore b
        adcq   _a1_,   _b1_
L(loop_22):
        dec    %cl
        jns    L(loop_21)

        # end of the division of b-r by a+s
        # the registers hold a+s and b-r; turn them into a-q and b+p
        subq   _s_,     _a0_            # (a+s) - s = a
        sbbq   $0,      _a1_
        subq   _q_,     _a0_            # a <- a-q
        sbbq   $0,      _a1_
        addq   _p_,     _b0_            # (b-r) + p
        adcq   $0,      _b1_
        addq   _r_,     _b0_            # b <- b+p
        adcq   $0,      _b1_
        subq   _b0_,    _a0_            # a <- (a-q) - (b+p)
        sbbq   _b1_,    _a1_
        jnb    L(shift_1)               # non-negative: next round; negative: finished

L(done):
        movq   _p_,    32(%rdi)         # store the coefficients in x[4..7]
        movq   _s_,    40(%rdi)
        movq   _q_,    48(%rdi)
        movq   _r_,    56(%rdi)
        popq   %rbx                     # restore callee-saved register
        ret

#endif /* assembly_sn_hgcd_2 */

