dnl  S/390-64 mpn_sqr_basecase.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb
C z900		 ?
C z990		23
C z9		 ?
C z10		 ?
C z196		 ?

C TODO
C  * Clean up.
C  * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail.
C    This will ask for basecase handling of n = 3.
C  * Update counters and pointers more straightforwardly, possibly lowering
C    register usage.
C  * Should we use this allocation-free style for more sqr_basecase asm
C    implementations?  The only disadvantage is that it requires R != U.
C  * Replace loops by faster code.  The mul_1 and addmul_1 loops could be sped
C    up by about 10%.  The sqr_diag_addlsh1 loop could probably be sped up even
C    more.

C INPUT PARAMETERS
define(`rp',	`%r2')
define(`up',	`%r3')
define(`n',	`%r4')

define(`zero',	`%r8')
define(`rp_saved',	`%r9')
define(`up_saved',	`%r13')
define(`n_saved',	`%r14')

ASM_START()
PROLOGUE(mpn_sqr_basecase)
	aghi	n, -2
	jhe	L(ge2)

C n = 1
	lg	%r5, 0(up)
	mlgr	%r4, %r5
	stg	%r5, 0(rp)
	stg	%r4, 8(rp)
	br	%r14

L(ge2):	jne	L(gen)

C n = 2
	stmg	%r6, %r8, 48(%r15)
	lghi	zero, 0

	lg	%r5, 0(up)
	mlgr	%r4, %r5		C u0 * u0
	lg	%r1, 8(up)
	mlgr	%r0, %r1		C u1 * u1
	stg	%r5, 0(rp)

	lg	%r7, 0(up)
	mlg	%r6, 8(up)		C u0 * u1
	algr	%r7, %r7
	alcgr	%r6, %r6
	alcgr	%r0, zero

	algr	%r4, %r7
	alcgr	%r1, %r6
	alcgr	%r0, zero
	stg	%r4, 8(rp)
	stg	%r1, 16(rp)
	stg	%r0, 24(rp)

	lmg	%r6, %r8, 48(%r15)
	br	%r14

L(gen):
C mul_1 =======================================================================

	stmg	%r6, %r14, 48(%r15)
	lghi	zero, 0
	lgr	up_saved, up
	lgr	rp_saved, rp
	lgr	n_saved, n

	lg	%r6, 0(up)
	lg	%r11, 8(up)
	lghi	%r12, 16		C init index register
	mlgr	%r10, %r6
	lgr	%r5, n
	stg	%r11, 8(rp)
	cr	%r15, %r15		C clear carry flag

L(tm):	lg	%r1, 0(%r12,up)
	mlgr	%r0, %r6
	alcgr	%r1, %r10
	lgr	%r10, %r0		C copy high part to carry limb
	stg	%r1, 0(%r12,rp)
	la	%r12, 8(%r12)
	brctg	%r5, L(tm)

	alcgr	%r0, zero
	stg	%r0, 0(%r12,rp)

C addmul_1 loop ===============================================================

	aghi	n, -1
	je	L(outer_end)
L(outer_loop):

	la	rp, 16(rp)		C rp += 2
	la	up, 8(up)		C up += 1
	lg	%r6, 0(up)
	lg	%r11, 8(up)
	lghi	%r12, 16		C init index register
	mlgr	%r10, %r6
	lgr	%r5, n
	alg	%r11, 8(rp)
	stg	%r11, 8(rp)

L(tam):	lg	%r1, 0(%r12,up)
	lg	%r7, 0(%r12,rp)
	mlgr	%r0, %r6
	alcgr	%r1, %r7
	alcgr	%r0, zero
	algr	%r1, %r10
	lgr	%r10, %r0
	stg	%r1, 0(%r12,rp)
	la	%r12, 8(%r12)
	brctg	%r5, L(tam)

	alcgr	%r0, zero
	stg	%r0, 0(%r12,rp)

	brctg	n, L(outer_loop)
L(outer_end):

	lg	%r6, 8(up)
	lg	%r1, 16(up)
	lgr	%r7, %r0		C Same as: lg %r7, 24(,rp)
	mlgr	%r0, %r6
	algr	%r1, %r7
	alcgr	%r0, zero
	stg	%r1, 24(rp)
	stg	%r0, 32(rp)

C sqr_diag_addlsh1 ============================================================

define(`up', `up_saved')
define(`rp', `rp_saved')
	la	n, 1(n_saved)

	lg	%r1, 0(up)
	mlgr	%r0, %r1
	stg	%r1, 0(rp)
C	clr	%r15, %r15		C clear carry (already clear per above)

L(top):	lg	%r11, 8(up)
	la	up, 8(up)
	lg	%r6, 8(rp)
	lg	%r7, 16(rp)
	mlgr	%r10, %r11
	alcgr	%r6, %r6
	alcgr	%r7, %r7
	alcgr	%r10, zero		C propagate carry to high product limb
	algr	%r6, %r0
	alcgr	%r7, %r11
	stmg	%r6, %r7, 8(rp)
	la	rp, 16(rp)
	lgr	%r0, %r10		C copy carry limb
	brctg	n, L(top)

	alcgr	%r0, zero
	stg	%r0, 8(rp)

	lmg	%r6, %r14, 48(%r15)
	br	%r14
EPILOGUE()