dnl  S/390-32 mpn_sqr_basecase.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb
C z900		 ?
C z990		23
C z9		 ?
C z10		 ?
C z196		 ?

C TODO
C  * Clean up.
C  * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail.
C    This will ask for basecase handling of n = 3.
C  * Update counters and pointers more straightforwardly, possibly lowering
C    register usage.
C  * Should we use this allocation-free style for more sqr_basecase asm
C    implementations?  The only disadvantage is that it requires R != U.
C  * Replace loops by faster code.  The mul_1 and addmul_1 loops could be sped
C    up by about 10%.  The sqr_diag_addlsh1 loop could probably be sped up even
C    more.

C INPUT PARAMETERS
define(`rp',	`%r2')
define(`up',	`%r3')
define(`n',	`%r4')

define(`zero',	`%r8')
define(`rp_saved',	`%r9')
define(`up_saved',	`%r13')
define(`n_saved',	`%r14')

ASM_START()
PROLOGUE(mpn_sqr_basecase)
	ahi	n, -2
	jhe	L(ge2)

C n = 1
	l	%r5, 0(up)
	mlr	%r4, %r5
	st	%r5, 0(rp)
	st	%r4, 4(rp)
	br	%r14

L(ge2):	jne	L(gen)

C n = 2
	stm	%r6, %r8, 24(%r15)
	lhi	zero, 0

	l	%r5, 0(up)
	mlr	%r4, %r5		C u0 * u0
	l	%r1, 4(up)
	mlr	%r0, %r1		C u1 * u1
	st	%r5, 0(rp)

	l	%r7, 0(up)
	ml	%r6, 4(up)		C u0 * u1
	alr	%r7, %r7
	alcr	%r6, %r6
	alcr	%r0, zero

	alr	%r4, %r7
	alcr	%r1, %r6
	alcr	%r0, zero
	st	%r4, 4(rp)
	st	%r1, 8(rp)
	st	%r0, 12(rp)

	lm	%r6, %r8, 24(%r15)
	br	%r14

L(gen):
C mul_1 =======================================================================

	stm	%r6, %r14, 24(%r15)
	lhi	zero, 0
	lr	up_saved, up
	lr	rp_saved, rp
	lr	n_saved, n

	l	%r6, 0(up)
	l	%r11, 4(up)
	lhi	%r12, 8		C init index register
	mlr	%r10, %r6
	lr	%r5, n
	st	%r11, 4(rp)
	cr	%r15, %r15		C clear carry flag

L(tm):	l	%r1, 0(%r12,up)
	mlr	%r0, %r6
	alcr	%r1, %r10
	lr	%r10, %r0		C copy high part to carry limb
	st	%r1, 0(%r12,rp)
	la	%r12, 4(%r12)
	brct	%r5, L(tm)

	alcr	%r0, zero
	st	%r0, 0(%r12,rp)

C addmul_1 loop ===============================================================

	ahi	n, -1
	je	L(outer_end)
L(outer_loop):

	la	rp, 8(rp)		C rp += 2
	la	up, 4(up)		C up += 1
	l	%r6, 0(up)
	l	%r11, 4(up)
	lhi	%r12, 8		C init index register
	mlr	%r10, %r6
	lr	%r5, n
	al	%r11, 4(rp)
	st	%r11, 4(rp)

L(tam):	l	%r1, 0(%r12,up)
	l	%r7, 0(%r12,rp)
	mlr	%r0, %r6
	alcr	%r1, %r7
	alcr	%r0, zero
	alr	%r1, %r10
	lr	%r10, %r0
	st	%r1, 0(%r12,rp)
	la	%r12, 4(%r12)
	brct	%r5, L(tam)

	alcr	%r0, zero
	st	%r0, 0(%r12,rp)

	brct	n, L(outer_loop)
L(outer_end):

	l	%r6, 4(up)
	l	%r1, 8(up)
	lr	%r7, %r0		C Same as: l %r7, 12(,rp)
	mlr	%r0, %r6
	alr	%r1, %r7
	alcr	%r0, zero
	st	%r1, 12(rp)
	st	%r0, 16(rp)

C sqr_dia_addlsh1 ============================================================

define(`up', `up_saved')
define(`rp', `rp_saved')
	la	n, 1(n_saved)

	l	%r1, 0(up)
	mlr	%r0, %r1
	st	%r1, 0(rp)
C	clr	%r15, %r15		C clear carry (already clear per above)

L(top):	l	%r11, 4(up)
	la	up, 4(up)
	l	%r6, 4(rp)
	l	%r7, 8(rp)
	mlr	%r10, %r11
	alcr	%r6, %r6
	alcr	%r7, %r7
	alcr	%r10, zero		C propagate carry to high product limb
	alr	%r6, %r0
	alcr	%r7, %r11
	stm	%r6, %r7, 4(rp)
	la	rp, 8(rp)
	lr	%r0, %r10		C copy carry limb
	brct	n, L(top)

	alcr	%r0, zero
	st	%r0, 4(rp)

	lm	%r6, %r14, 24(%r15)
	br	%r14
EPILOGUE()