dnl  x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.

dnl  Copyright 2007, 2008, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C		c/l
C AMD K8,K9	18
C AMD K10	18
C Intel P4	68
C Intel core2	34
C Intel corei	30.5
C Intel atom	73
C VIA nano	33


C INPUT PARAMETERS
C Symbolic names for the argument registers as mpn_divrem_2 uses them (the
C roles below are the ones the function body gives to each register):
C   qp        base of the quotient area; one limb is stored per loop iteration
C   fn        count compared against the quotient index; for indices below fn
C             a zero limb is used instead of a limb fetched from up
C   up_param  base of the numerator limbs; also receives the 2-limb remainder
C   un_param  number of numerator limbs
C   dp        base of the two divisor limbs, dp[0] = d0 (low), dp[1] = d1 (high)
define(`qp',		`%rdi')
define(`fn',		`%rsi')
define(`up_param',	`%rdx')
define(`un_param',	`%rcx')
define(`dp',		`%r8')

C NOTE(review): ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros supplied by
C the included m4 configuration; presumably they declare the supported ABIs
C and open a 16-byte aligned text section -- confirm in the m4 definitions.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
C mpn_divrem_2 (qp, fn, up, un, dp)
C
C Divide the un-limb number at up, extended with fn zero limbs at the low
C end, by the two-limb divisor d1:d0 = dp[1]:dp[0].
C
C   Quotient:   limbs 0 .. un+fn-3 are stored at qp, one per loop iteration.
C   Remainder:  two limbs, written back to up[0] (low) and up[1] (high).
C   Return:     rax = most significant quotient limb, 0 or 1.
C
C Requirements visible from the code: un >= 2, since up[un-1] and up[un-2]
C are read unconditionally.  The file header states that the divisor is
C normalized; nothing here checks that.
C
C ABI notes: all six callee-saved registers used (r15 r14 r13 r12 rbp rbx)
C are pushed on entry and popped on exit.  FUNC_ENTRY, FUNC_EXIT, IFDOS,
C IFSTD and CALL come from the included m4 configuration.
C NOTE(review): under DOS64, FUNC_ENTRY is assumed to move the first four
C arguments into the STD64 registers named by qp/fn/up_param/un_param --
C confirm against the macro definition.
PROLOGUE(mpn_divrem_2)
	FUNC_ENTRY(4)
C DOS64 only: the fifth argument (dp) arrives on the stack.
IFDOS(`	mov	56(%rsp), %r8	')
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	lea	-24(up_param,un_param,8), %r12	C r12 = &up[un-3]
	mov	fn, %r13
	push	%rbp
	mov	qp, %rbp
	push	%rbx
	mov	8(dp), %r11		C d1
	mov	16(%r12), %rbx		C n2 = up[un-1]
	mov	(dp), %r8		C d0 (dp is r8, so dp is dead from here)
	mov	8(%r12), %r10		C n1 = up[un-2]

C Most significant quotient limb: r15 = 1 and n2:n1 -= d1:d0 when
C n2:n1 >= d1:d0, otherwise r15 = 0 and n2:n1 is left alone.
	xor	R32(%r15), R32(%r15)
	cmp	%rbx, %r11
	ja	L(2)			C d1 > n2, no subtraction
	setb	%dl			C dl = (d1 < n2)
	cmp	%r10, %r8
	setbe	%al			C al = (d0 <= n1)
	orb	%al, %dl		C "orb" form to placate Sun tools
	je	L(2)			C d1 = n2 and d0 > n1, no subtraction
	inc	R32(%r15)
	sub	%r8, %r10
	sbb	%r11, %rbx
L(2):
	lea	-3(un_param,%r13), %r14	C un + fn - 3 = index of top quotient limb
	test	%r14, %r14
	js	L(end)			C no further quotient limbs to produce

C Fetch an inverse of d1 from mpn_invert_limb.  d0, n1 and d1 live in
C caller-saved registers, so they are kept on the stack across the call.
C After the three pushes rsp is 16-byte aligned.  The Win64 convention
C additionally requires the caller to reserve 32 bytes of shadow space that
C the callee may overwrite; without it that area would overlap the three
C values pushed here and the saved rbx.
	push	%r8
	push	%r10
	push	%r11
IFSTD(`	mov	%r11, %rdi	')
IFDOS(`	mov	%r11, %rcx	')
IFDOS(`	sub	$32, %rsp	')
	CALL(	mpn_invert_limb)
IFDOS(`	add	$32, %rsp	')
	pop	%r11
	pop	%r10
	pop	%r8

C Adjust the returned inverse (rax) so that it also accounts for d0.
C rcx:r9 is a two-word running value built from d1*inv, d0 and the high
C half of d0*inv; rdi is decremented, and d1 subtracted from rcx:r9, for
C as long as rcx stays non-negative.
C NOTE(review): this looks like the usual conversion of a one-limb inverse
C into the inverse used for 3-by-2 limb division -- confirm against the
C definition of mpn_invert_limb.
	mov	%r11, %rdx
	mov	%rax, %rdi		C rdi = candidate inverse
	imul	%rax, %rdx		C low half of d1 * inverse
	mov	%rdx, %r9
	mul	%r8			C rdx:rax = d0 * inverse
	xor	R32(%rcx), R32(%rcx)
	add	%r8, %r9
	adc	$-1, %rcx		C rcx = carry - 1
	add	%rdx, %r9
	adc	$0, %rcx
	js	2f
1:	dec	%rdi
	sub	%r11, %r9
	sbb	$0, %rcx
	jns	1b
2:

	lea	(%rbp,%r14,8), %rbp	C rbp = &qp[un+fn-3]
	mov	%r11, %rsi
	neg	%rsi			C -d1

C Register use in the loop:
C   rbx  n2, high limb of the running remainder
C   r10  n1, low limb of the running remainder
C   rdi  dinv, the adjusted inverse
C   rsi  -d1
C   r8   d0
C   r11  d1
C   rbp  address where the next quotient limb is stored (moves downwards)
C   r12  address of the next numerator limb to fetch (moves downwards)
C   r13  fn
C   r14  index of the quotient limb being produced (counts down to 0)
C   r15  most significant quotient limb (the return value)
C   r9   quotient limb candidate q;  rcx = q0;  rax, rdx = scratch
C
C Each iteration divides the three limbs n2:n1:n0 by d1:d0, where n0 is the
C next limb of up, or zero once the index has dropped below fn.  Bracketed
C numbers are the scheduling annotations carried over from the original
C comments (ncp presumably means "not on the critical path").

	ALIGN(16)
L(top):	mov	%rdi, %rax		C dinv			[ncp]
	mul	%rbx			C rdx:rax = dinv * n2	[0, 17]
	mov	%r10, %rcx		C
	add	%rax, %rcx		C q0 = low + n1		[4]
	adc	%rbx, %rdx		C q = high + n2 + cy	[5]
	mov	%rdx, %r9		C q			[6]
	imul	%rsi, %rdx		C -d1 * q		[6]
	mov	%r8, %rax		C d0			[ncp]
	lea	(%rdx, %r10), %rbx	C r1 = n1 - d1*q	[10]
	xor	R32(%r10), R32(%r10)	C n0 = 0 unless loaded below
	mul	%r9			C rdx:rax = d0 * q	[7]
	cmp	%r14, %r13		C
	jg	L(19)			C fn > index: keep n0 = 0
	mov	(%r12), %r10		C n0 = next limb of up
	sub	$8, %r12		C
L(19):	sub	%r8, %r10		C r1:r0 -= d1:d0	[ncp]
	sbb	%r11, %rbx		C			[11]
	sub	%rax, %r10		C r1:r0 -= d0 * q	[11]
	sbb	%rdx, %rbx		C			[12]
	xor	R32(%rax), R32(%rax)	C
	xor	R32(%rdx), R32(%rdx)	C
	cmp	%rcx, %rbx		C CF = (r1 < q0)	[13]
	cmovnc	%r8, %rax		C r1 >= q0: add back d0	[14]
	cmovnc	%r11, %rdx		C r1 >= q0: add back d1	[14]
	adc	$0, %r9			C q += CF		[14]
	nop
	add	%rax, %r10		C r1:r0 += d1:d0 or 0	[15]
	adc	%rdx, %rbx		C			[16]
	cmp	%r11, %rbx		C
	jae	L(fix)			C r1 >= d1: may need one more step
L(bck):	mov	%r9, (%rbp)		C store quotient limb
	sub	$8, %rbp		C
	dec	%r14
	jns	L(top)

C r12 has moved down by one limb per numerator limb fetched, so when the
C loop has run to completion (or was skipped because un = 2 and fn = 0)
C 8(r12) and 16(r12) address up[0] and up[1].
L(end):	mov	%r10, 8(%r12)		C remainder, low limb
	mov	%rbx, 16(%r12)		C remainder, high limb
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	mov	%r15, %rax		C return most significant quotient limb
	pop	%r15
	FUNC_EXIT()
	ret

C Out-of-line fixup, entered with flags from "cmp d1, r1" and r1 >= d1:
C when r1:r0 >= d1:d0, increment q and subtract the divisor once more.
L(fix):	seta	%dl			C dl = (r1 > d1)
	cmp	%r8, %r10
	setae	%al			C al = (r0 >= d0)
	orb	%dl, %al		C "orb" form to placate Sun tools
	je	L(bck)			C r1 = d1 and r0 < d0: nothing to fix
	inc	%r9
	sub	%r8, %r10
	sbb	%r11, %rbx
	jmp	L(bck)
EPILOGUE()