dnl  IA-64 mpn_tabselect.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:      ?
C Itanium 2:    2.5

C NOTES
C  * Using software pipelining could trivially yield 2 c/l without unrolling,
C    or 1+epsilon with unrolling.  (This code was modelled after the powerpc64
C    code, for simplicity.)

C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
C
C Copy entry number which (counting from 0) out of a table of nents entries
C of n limbs each, stored back to back at tp, into the n limbs at rp.
C
C Every entry is read and rp is rewritten once per entry, as
C     rp[i] = (entry[i] AND mask) OR (rp[i] AND NOT mask)
C where mask is all ones for the selected entry and zero for every other
C one, so the sequence of loads, stores and branches does not depend on which.
C
C Requires n >= 1 and nents >= 1: n = 0 would enter the 2-limb loop with a
C wrapped-around count, and nents = 0 would never end the entry countdown.
C If which is outside 0..nents-1 no pass selects and rp keeps its contents.
C
C Register use:
C   r2        incoming ar.lc, put back before returning
C   r6        (n-2) >> 1, the br.cloop count for the 2-limb loop
C   r9        low bit of n
C   r16-r19   limbs in flight
C   p6, p7    current entry is / is not the selected one
C   p8        n is even
C   p9        more entries remain
C   p10       n = 1

define(`rp',     `r32')
define(`tp',     `r33')
define(`n',      `r34')
define(`nents',  `r35')
define(`which',  `r36')

define(`mask',   `r8')

define(`rp1',     `r32')
define(`tp1',     `r33')
define(`rp2',     `r14')
define(`tp2',     `r15')

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_tabselect)
	.prologue
C ar.lc is preserved across calls and is overwritten below on every outer
C pass, so keep the incoming value in scratch register r2.
.mii;	nop	0
	.save	ar.lc, r2
	mov.i	r2 = ar.lc		C			I0
	nop	0
	;;
	.body
C With the 32-bit ABI, widen the pointers and zero-extend the sizes.
ifdef(`HAVE_ABI_32',`
.mmi;	addp4	rp = 0, rp		C			M I
	addp4	tp = 0, tp		C			M I
	zxt4	n = n			C			I
.mii;	nop	0
	zxt4	nents = nents		C			I
	zxt4	which = which		C			I
	;;
')
C rp2 and tp2 run one limb ahead of rp1 and tp1 for the 2-limb loop.
.mmi;	add	rp2 = 8, rp1
	add	tp2 = 8, tp1
	add	r6 = -2, n
	;;
.mmi;	cmp.eq	p10, p0 = 1, n		C p10: n = 1, only the odd limb exists
	and	r9 = 1, n		C low bit of n, turned into p8 below
	shr.u	r6 = r6, 1		C inner loop count
	;;
.mmi;	cmp.eq	p8, p0 = 0, r9		C p8: n is even
	sub	which = nents, which	C matches the nents countdown at entry which
	shl	n = n, 3		C n is a byte count from here on
	;;

C One pass per table entry; nents counts down to 1.
L(outer):
.mmi;	cmp.eq	p6, p7 = which, nents	C are we at the selected table entry?
	nop	0
	mov	ar.lc = r6		C I0
	;;
.mmb;
 (p6)	mov	mask = -1
 (p7)	mov	mask = 0
 (p8)	br.dptk	L(top)			C branch to loop entry if n even
	;;

C n is odd: handle a single limb first and keep rp2, tp2 one limb ahead.
.mmi;	ld8	r16 = [tp1], 8
	add	tp2 = 8, tp2
	nop	0
	;;
.mmi;	ld8	r18 = [rp1]
	and	r16 = r16, mask
	nop	0
	;;
.mmi;	andcm	r18 = r18, mask		C r18 AND NOT mask
	;;
	or	r16 = r16, r18
	nop	0
	;;
.mmb;	st8	[rp1] = r16, 8
	add	rp2 = 8, rp2
(p10)	br.dpnt	L(end)

C Two limbs per iteration, ar.lc + 1 iterations.
	ALIGN(32)
L(top):
.mmi;	ld8	r16 = [tp1], 16
	ld8	r17 = [tp2], 16
	nop	0
	;;
.mmi;	ld8	r18 = [rp1]
	and	r16 = r16, mask
	nop	0
.mmi;	ld8	r19 = [rp2]
	and	r17 = r17, mask
	nop	0
	;;
.mmi;	andcm	r18 = r18, mask
	andcm	r19 = r19, mask
	nop	0
	;;
.mmi;	or	r16 = r16, r18
	or	r17 = r17, r19
	nop	0
	;;
.mmb;	st8	[rp1] = r16, 16
	st8	[rp2] = r17, 16
	br.cloop.dptk	L(top)
	;;
C tp1 and tp2 are left alone, so they now address the next entry.
L(end):
.mmi;	sub	rp1 = rp1, n		C move rp back to beginning
	sub	rp2 = rp2, n		C move rp back to beginning
	cmp.ne	p9, p0 = 1, nents	C p9: this was not the last entry
.mmb;	add	nents = -1, nents
	nop	0
 (p9)	br.dptk	L(outer)
	;;

.mib;	nop	0
	mov.i	ar.lc = r2		C I0, put back the ar.lc saved on entry
	br.ret.sptk.many b0
EPILOGUE()