/* mpn_toom42_mulmid -- toom42 middle product

   Contributed by David Harvey.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2011 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"


/*
  Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}.

  Neither ap nor bp may overlap rp.

  Must have n >= 4.

  Amount of scratch space required is given by mpn_toom42_mulmid_itch().

  FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact
  requirements should be clarified.
*/
void
mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n,
                   mp_ptr scratch)
{
  mp_limb_t cy, e[12], zh, zl;
  mp_size_t m;
  int neg;

  ASSERT (n >= 4);
  ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
  ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));

  ap += n & 1;   /* handle odd row and diagonal later */
  m = n / 2;

  /* (e0h:e0l) etc are correction terms, in 2's complement */
#define e0l (e[0])
#define e0h (e[1])
#define e1l (e[2])
#define e1h (e[3])
#define e2l (e[4])
#define e2h (e[5])
#define e3l (e[6])
#define e3h (e[7])
#define e4l (e[8])
#define e4h (e[9])
#define e5l (e[10])
#define e5h (e[11])

#define s (scratch + 2)
#define t (rp + m + 2)
#define p0 rp
#define p1 scratch
#define p2 (rp + m)
#define next_scratch (scratch + 3*m + 1)

  /*
            rp                            scratch
  |---------|-----------|    |---------|---------|----------|
  0         m         2m+2   0         m         2m        3m+1
            <----p2---->       <-------------s------------->
  <----p0----><---t---->     <----p1---->
  */

  /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */
  cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0);
  cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l,
		       bp + m, bp, m, cy);
  mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy);

  /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */
  if (mpn_cmp (bp + m, bp, m) < 0)
    {
      ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l,
				      ap + m - 1, ap + 2*m - 1, m, 0));
      neg = 1;
    }
  else
    {
      ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l,
				      ap + m - 1, ap + 2*m - 1, m, 0));
      neg = 0;
    }

  /* recursive middle products. The picture is:

      b[2m-1]   A   A   A   B   B   B   -   -   -   -   -
      ...       -   A   A   A   B   B   B   -   -   -   -
      b[m]      -   -   A   A   A   B   B   B   -   -   -
      b[m-1]    -   -   -   C   C   C   D   D   D   -   -
      ...       -   -   -   -   C   C   C   D   D   D   -
      b[0]      -   -   -   -   -   C   C   C   D   D   D
               a[0]   ...  a[m]  ...  a[2m]    ...    a[4m-2]
  */

  if (m < MULMID_TOOM42_THRESHOLD)
    {
      /* A + B */
      mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m);
      /* accumulate high limbs of p0 into e1 */
      ADDC_LIMB (cy, e1l, e1l, p0[m]);
      e1h += p0[m + 1] + cy;
      /* (-1)^neg * (B - C)   (overwrites first m limbs of s) */
      mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m);
      /* C + D   (overwrites t) */
      mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m);
    }
  else
    {
      /* as above, but use toom42 instead */
      mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch);
      ADDC_LIMB (cy, e1l, e1l, p0[m]);
      e1h += p0[m + 1] + cy;
      mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch);
      mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch);
    }

  /* apply error terms */

  /* -e0 at rp[0] */
  SUBC_LIMB (cy, rp[0], rp[0], e0l);
  SUBC_LIMB (cy, rp[1], rp[1], e0h + cy);
  if (UNLIKELY (cy))
    {
      cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1;
      SUBC_LIMB (cy, e1l, e1l, cy);
      e1h -= cy;
    }

  /* z = e1 - e2 + high(p0) */
  SUBC_LIMB (cy, zl, e1l, e2l);
  zh = e1h - e2h - cy;

  /* z at rp[m] */
  ADDC_LIMB (cy, rp[m], rp[m], zl);
  zh = (zh + cy) & GMP_NUMB_MASK;
  ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh);
  cy -= (zh >> (GMP_NUMB_BITS - 1));
  if (UNLIKELY (cy))
    {
      if (cy == 1)
	mpn_add_1 (rp + m + 2, rp + m + 2, m, 1);
      else /* cy == -1 */
	mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1);
    }

  /* e3 at rp[2*m] */
  ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l);
  rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK;

  /* e4 at p1[0] */
  ADDC_LIMB (cy, p1[0], p1[0], e4l);
  ADDC_LIMB (cy, p1[1], p1[1], e4h + cy);
  if (UNLIKELY (cy))
    mpn_add_1 (p1 + 2, p1 + 2, m, 1);

  /* -e5 at p1[m] */
  SUBC_LIMB (cy, p1[m], p1[m], e5l);
  p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK;

  /* adjustment if p1 ends up negative */
  cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1));

  /* add (-1)^neg * (p1 - B^m * p1) to output */
  if (neg)
    {
      mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy);
      mpn_add (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
      mpn_sub_n (rp + m, rp + m, p1, m + 2);            /* B + D */
    }
  else
    {
      mpn_add_1 (rp + m + 2, rp + m + 2, m, cy);
      mpn_sub (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
      mpn_add_n (rp + m, rp + m, p1, m + 2);            /* B + D */
    }

  /* odd row and diagonal */
  if (n & 1)
    {
      /*
        Products marked E are already done. We need to do products marked O.

        OOOOO----
        -EEEEO---
        --EEEEO--
        ---EEEEO-
        ----EEEEO
       */

      /* first row of O's */
      cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]);
      ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy);

      /* O's on diagonal */
      /* FIXME: should probably define an interface "mpn_mulmid_diag_1"
         that can handle the sum below. Currently we're relying on
         mulmid_basecase being pretty fast for a diagonal sum like this,
	 which is true at least for the K8 asm verion, but surely false
	 for the generic version. */
      mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1);
      mpn_add_n (rp + n - 1, rp + n - 1, e, 3);
    }
}