dnl  x86 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.

dnl  Copyright 2007, 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C		norm	frac
C 486
C P5
C P6-13		29.2
C P6-15		*26
C K6
C K7		22
C K8		*19
C P4-f1
C P4-f2		*65
C P4-f3
C P4-f4		*72

C A star means numbers not updated for the latest version of the code.


C TODO
C  * Perhaps keep ecx or esi in stack slot, freeing up a reg for q0.
C  * The loop has not been carefully tuned.  We should at the very least do
C    some local insn swapping.
C  * The code outside the main loop is what gcc generated.  Clean up!
C  * Clean up stack slot usage.

C INPUT PARAMETERS
C qp
C fn
C up_param
C un_param
C dp


C eax ebx ecx edx esi edi ebp
C         cnt         qp

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_divrem_2)
	push	%ebp
	push	%edi
	push	%esi
	push	%ebx
	sub	$36, %esp
	mov	68(%esp), %ecx		C un
	mov	72(%esp), %esi		C dp
	movl	$0, 32(%esp)
	lea	0(,%ecx,4), %edi
	add	64(%esp), %edi		C up
	mov	(%esi), %ebx
	mov	4(%esi), %eax
	mov	%ebx, 20(%esp)
	sub	$12, %edi
	mov	%eax, 24(%esp)
	mov	%edi, 12(%esp)
	mov	8(%edi), %ebx
	mov	4(%edi), %ebp
	cmp	%eax, %ebx
	jb	L(8)
	seta	%dl
	cmp	20(%esp), %ebp
	setae	%al
	orb	%dl, %al		C "orb" form to placate Sun tools
	jne	L(35)
L(8):
	mov	60(%esp), %esi		C fn
	lea	-3(%esi,%ecx), %edi
	test	%edi, %edi
	js	L(9)
	mov	24(%esp), %edx
	mov	$-1, %esi
	mov	%esi, %eax
	mov	%esi, %ecx
	not	%edx
	divl	24(%esp)
	mov	%eax, %esi
	imul	24(%esp), %eax
	mov	%eax, (%esp)
	mov	%esi, %eax
	mull	20(%esp)
	mov	(%esp), %eax
	add	20(%esp), %eax
	adc	$0, %ecx
	add	%eax, %edx
	adc	$0, %ecx
	mov	%ecx, %eax
	js	L(32)
L(36):	dec	%esi
	sub	24(%esp), %edx
	sbb	$0, %eax
	jns	L(36)
L(32):
	mov	%esi, 16(%esp)		C di
	mov	%edi, %ecx		C un
	mov	12(%esp), %esi		C up
	mov	24(%esp), %eax
	neg	%eax
	mov	%eax, 4(%esp)		C -d1
	ALIGN(16)
	nop

C eax ebx ecx edx esi edi ebp  0    4   8   12  16  20  24  28  32   56  60
C     n2  un      up      n1   q0  -d1          di  d0  d1      msl  qp  fn

L(loop):
	mov	16(%esp), %eax		C di
	mul	%ebx
	add	%ebp, %eax
	mov	%eax, (%esp)		C q0
	adc	%ebx, %edx
	mov	%edx, %edi		C q
	imul	4(%esp), %edx
	mov	20(%esp), %eax
	lea	(%edx, %ebp), %ebx	C n1 -= ...
	mul	%edi
	xor	%ebp, %ebp
	cmp	60(%esp), %ecx
	jl	L(19)
	mov	(%esi), %ebp
	sub	$4, %esi
L(19):	sub	20(%esp), %ebp
	sbb	24(%esp), %ebx
	sub	%eax, %ebp
	sbb	%edx, %ebx
	mov	20(%esp), %eax		C d1
	inc	%edi
	xor	%edx, %edx
	cmp	(%esp), %ebx
	adc	$-1, %edx		C mask
	add	%edx, %edi		C q--
	and	%edx, %eax		C d0 or 0
	and	24(%esp), %edx		C d1 or 0
	add	%eax, %ebp
	adc	%edx, %ebx
	cmp	24(%esp), %ebx
	jae	L(fix)
L(bck):	mov	56(%esp), %edx
	mov	%edi, (%edx, %ecx, 4)
	dec	%ecx
	jns	L(loop)

L(9):	mov	64(%esp), %esi		C up
	mov	%ebp, (%esi)
	mov	%ebx, 4(%esi)
	mov	32(%esp), %eax
	add	$36, %esp
	pop	%ebx
	pop	%esi
	pop	%edi
	pop	%ebp
	ret

L(fix):	seta	%dl
	cmp	20(%esp), %ebp
	setae	%al
	orb	%dl, %al		C "orb" form to placate Sun tools
	je	L(bck)
	inc	%edi
	sub	20(%esp), %ebp
	sbb	24(%esp), %ebx
	jmp	L(bck)

L(35):	sub	20(%esp), %ebp
	sbb	24(%esp), %ebx
	movl	$1, 32(%esp)
	jmp	L(8)
EPILOGUE()