dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V*2^k +- U.

dnl  Copyright 2006, 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 3.1	< 3.85 for lshift + add_n
C AMD K10	 3.1	< 3.85 for lshift + add_n
C Intel P4	14.6	> 7.33 for lshift + add_n
C Intel core2	 3.87	> 3.27 for lshift + add_n
C Intel NHM	 4	> 3.75 for lshift + add_n
C Intel SBR	(5.8)	> 3.46 for lshift + add_n
C Intel atom	(7.75)	< 8.75 for lshift + add_n
C VIA nano	 4.7	< 6.25 for lshift + add_n

C This was written quickly and not optimized at all.  Surely one could get
C closer to 3 c/l or perhaps even under 3 c/l.  Ideas:
C  1) Use indexing to save the 3 LEA instructions
C  2) Write reasonable feed-in code
C  3) Be more clever about register usage
C  4) Unroll more; the CL negation and carry save/restore cost much now
C  5) Reschedule

C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')
define(`cnt',	`%r8')

ifdef(`OPERATION_addlsh_n',`
	define(ADCSBB,	`adc')
	define(func,	mpn_addlsh_n)
')
ifdef(`OPERATION_rsblsh_n',`
	define(ADCSBB,	`sbb')
	define(func,	mpn_rsblsh_n)
')

MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%r12
	push	%r13
	push	%r14
	push	%rbp
	push	%rbx

	mov	n, %rax
	xor	R32(%rbx), R32(%rbx)	C clear carry save register
	mov	R32(%r8), R32(%rcx)	C shift count
	xor	R32(%rbp), R32(%rbp)	C limb carry

	mov	R32(%rax), R32(%r11)
	and	$3, R32(%r11)		C n mod 4
	je	L(4)
	sub	$1, R32(%r11)

L(012):	mov	(vp), %r8		C handle n mod 4 limbs, one at a time
	mov	%r8, %r12
	shl	R8(%rcx), %r8		C low part of vp[0] << cnt
	or	%rbp, %r8		C merge bits shifted in from below
	neg	R8(%rcx)
	mov	%r12, %rbp
	shr	R8(%rcx), %rbp		C bits shifted out, for next limb
	neg	R8(%rcx)
	add	R32(%rbx), R32(%rbx)	C restore carry flag
	ADCSBB	(up), %r8
	mov	%r8, (rp)
	sbb	R32(%rbx), R32(%rbx)	C save carry flag
	lea	8(up), up
	lea	8(vp), vp
	lea	8(rp), rp
	sub	$1, R32(%r11)
	jnc	L(012)

L(4):	sub	$4, %rax
	jc	L(end)

	ALIGN(16)
L(top):	mov	(vp), %r8		C main loop, 4 limbs per iteration
	mov	%r8, %r12
	mov	8(vp), %r9
	mov	%r9, %r13
	mov	16(vp), %r10
	mov	%r10, %r14
	mov	24(vp), %r11
	shl	R8(%rcx), %r8
	shl	R8(%rcx), %r9
	shl	R8(%rcx), %r10
	or	%rbp, %r8
	mov	%r11, %rbp
	shl	R8(%rcx), %r11

	neg	R8(%rcx)

	shr	R8(%rcx), %r12
	shr	R8(%rcx), %r13
	shr	R8(%rcx), %r14
	shr	R8(%rcx), %rbp		C used next iteration

	or	%r12, %r9
	or	%r13, %r10
	or	%r14, %r11

	neg	R8(%rcx)

	add	R32(%rbx), R32(%rbx)	C restore carry flag
	ADCSBB	(up), %r8
	ADCSBB	8(up), %r9
	ADCSBB	16(up), %r10
	ADCSBB	24(up), %r11
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)
	sbb	R32(%rbx), R32(%rbx)	C save carry flag

	lea	32(up), up
	lea	32(vp), vp
	lea	32(rp), rp
	sub	$4, %rax
	jnc	L(top)

L(end):	add	R32(%rbx), R32(%rbx)	C restore carry flag
	ADCSBB	$0, %rbp		C adjust high limb by carry/borrow
	mov	%rbp, %rax		C return bits shifted out at the top
	pop	%rbx
	pop	%rbp
	pop	%r14
	pop	%r13
	pop	%r12
	FUNC_EXIT()
	ret
EPILOGUE()
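
dnl  For reference, a hedged C sketch of what both entry points compute,
dnl  kept inside dnl comments so it never reaches the assembler.  This is
dnl  illustrative only, not GMP's test code: the helper name ref_aorrlsh_n
dnl  is made up here, and it assumes nail-less limbs (GMP_NAIL_BITS == 0)
dnl  and 0 < cnt < GMP_NUMB_BITS, as the asm above does.
dnl
dnl	#include <gmp.h>
dnl
dnl	/* add != 0: {rp,n} = {up,n} + ({vp,n} << cnt), return carry out.
dnl	   add == 0: {rp,n} = ({vp,n} << cnt) - {up,n}, return the bits
dnl	   shifted out at the top minus the final borrow, matching the
dnl	   ADCSBB $0, %rbp / mov %rbp, %rax sequence at L(end).  */
dnl	mp_limb_t
dnl	ref_aorrlsh_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
dnl	               mp_size_t n, unsigned int cnt, int add)
dnl	{
dnl	  mp_limb_t sh = 0, cy = 0;	/* shift-in limb, carry/borrow */
dnl	  mp_size_t i;
dnl	  for (i = 0; i < n; i++)
dnl	    {
dnl	      mp_limb_t v = (vp[i] << cnt) | sh;   /* low part of vp[i]*2^cnt */
dnl	      mp_limb_t t, c1, c2;
dnl	      sh = vp[i] >> (GMP_NUMB_BITS - cnt); /* bits for the next limb */
dnl	      if (add)
dnl	        {
dnl	          t = v + up[i];   c1 = t < v;     /* v + u, detect carry */
dnl	          rp[i] = t + cy;  c2 = rp[i] < t; /* add incoming carry */
dnl	        }
dnl	      else
dnl	        {
dnl	          t = v - up[i];   c1 = v < up[i]; /* v - u, detect borrow */
dnl	          rp[i] = t - cy;  c2 = t < cy;    /* subtract incoming borrow */
dnl	        }
dnl	      cy = c1 | c2;                        /* at most one can be set */
dnl	    }
dnl	  return add ? sh + cy : sh - cy;          /* like ADCSBB $0, %rbp */
dnl	}
dnl
dnl  The 0/-1 mask trick in the asm (sbb %ebx,%ebx to save CF, add
dnl  %ebx,%ebx to restore it) plays the role of the cy variable here.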