#include "atlas_misc.h"
#include "atlas_level1.h"
#include "atlas_reflvl2.h"
#include "atlas_reflevel2.h"
#include "atlas_lvl2.h"
#if defined(ATL_INL1)
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),syr_L1.h))
   #define ATL_syr Mjoin(PATL,syr_L1)
#elif defined(ATL_INL2)
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),syr_L2.h))
   #define ATL_syr Mjoin(PATL,syr_L2)
#else
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),syr.h))
   #define ATL_syr Mjoin(PATL,syr)
#endif

#ifdef ATL_NXTUNE
   extern int ATL_KERN_NX;
   #define ATL_S1NX ATL_KERN_NX
#else
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),syrNX.h))
   #ifndef ATL_S1NX
      #define ATL_S1NX 128
   #endif
#endif
/*
 * Updates the upper triangle of an N x N diagonal block, A += x * xt^T.
 * The leading corner (at most ATL_S1NX columns, kept a multiple of
 * ATL_s1U_NU unless clipped to N) goes to the reference SYR.  Every
 * following ATL_s1U_NU-wide column panel is split into the rectangle above
 * the diagonal (GER kernel) and the small triangle on the diagonal
 * (ATL_SYR1U_nu).  Columns left over when N is not a multiple of
 * ATL_s1U_NU are finished by the generic GER kernel + reference upper SYR.
 */
void Mjoin(PATL,syr_kU)
(
   ATL_r1kern_t gerk0,          /* func ptr to selected GER kernel */
   ATL_CINT FNU,                /* non-0: kern does not handle N%NU != 0 */
   ATL_CINT N,                  /* size of prob to solve */
   const TYPE alpha,            /* alpha */
   const TYPE *x,               /* vector X -- may have alpha applied */
   const TYPE *xt,              /* X^T */
   TYPE *A,                     /* symmetric matrix, A = A + x*xt */
   ATL_CINT lda                 /* row stride of A */
)
{
   ATL_r1kern_t kern=gerk0;
   ATL_CINT ncap = (N/ATL_s1U_NU)*ATL_s1U_NU;  /* N rounded down to NU mult */
   ATL_INT nref, col, nleft;

/*
 * Size of the corner given to the reference code
 */
   if (ATL_S1NX >= ATL_s1U_NU)
      nref = (ATL_S1NX/ATL_s1U_NU)*ATL_s1U_NU;
   else
      nref = ATL_s1U_NU;
   if (nref > N)
      nref = N;
   Mjoin(PATL,refsyr)(AtlasUpper, nref, alpha, xt, 1, A, lda);
/*
 * Full NU-wide column panels: col rows lie above the diagonal triangle
 */
   col = nref;
   while (col < ncap)
   {
      #if ATL_MIN_RESTRICTED_M > 0
         kern = (col < ATL_MIN_RESTRICTED_M) ? ATL_GENGERK : gerk0;
      #endif
      kern(col, ATL_s1U_NU, x, xt+col, A+col*lda, lda);
      ATL_SYR1U_nu(A+col*(lda+1), lda, x+col, xt+col);
      col += ATL_s1U_NU;
   }
/*
 * Remaining columns, if any
 */
   nleft = N - col;
   if (nleft != 0)
   {
      ATL_GENGERK(col, nleft, x, xt+col, A+col*lda, lda);
      Mjoin(PATL,refsyrU)(nleft, alpha, xt+col, 1, A+col*(lda+1), lda);
   }
}

/*
 * Updates the lower triangle of an N x N diagonal block, A += x * xt^T.
 * The first nker columns (a multiple of ATL_s1L_NU) are processed in
 * NU-wide panels: ATL_SYR1L_nu handles the triangle on the diagonal and a
 * GER kernel handles the rectangle beneath it.  The trailing nref x nref
 * corner (roughly ATL_S1NX, grown so that nker is a multiple of NU) is
 * left to the reference SYR.
 */
void Mjoin(PATL,syr_kL)
(
   ATL_r1kern_t gerk0,          /* func ptr to selected GER kernel */
   ATL_CINT N,                  /* size of prob to solve */
   const TYPE alpha,            /* alpha */
   const TYPE *x,               /* vector X -- may have alpha applied */
   const TYPE *xt,              /* X^T */
   TYPE *A,                     /* symmetric matrix, A = A + x*xt */
   ATL_CINT lda                 /* row stride of A */
)
{
   ATL_r1kern_t kern=gerk0;
   ATL_INT nref=Mmin(ATL_S1NX,N), nker, done, below;

/*
 * Round the kernel-handled part down to a multiple of NU; whatever is
 * trimmed off is absorbed by the reference-handled corner
 */
   nker = ((N-nref)/ATL_s1L_NU)*ATL_s1L_NU;
   nref = N - nker;
   below = N - ATL_s1L_NU;      /* rows under the first diagonal triangle */
   for (done=0; done < nker; done += ATL_s1L_NU)
   {
      ATL_SYR1L_nu(A, lda, x, xt);
      #if ATL_MIN_RESTRICTED_M > 0
         kern = (below < ATL_MIN_RESTRICTED_M) ? ATL_GENGERK : gerk0;
      #endif
      kern(below, ATL_s1L_NU, x+ATL_s1L_NU, xt, A+ATL_s1L_NU, lda);
/*
 *    Step down the diagonal to the next panel
 */
      below -= ATL_s1L_NU;
      A += ATL_s1L_NU*(lda+1);
      x += ATL_s1L_NU;
      xt += ATL_s1L_NU;
   }
   Mjoin(PATL,refsyr)(AtlasLower, nref, alpha, xt, 1, A, lda);
}

/*
 * MY_GERK: rank-1 update of an m_ x n_ off-diagonal block using the
 * variables FNU, minM, minN, nu and gerk of the enclosing scope.
 * If FNU is set, gerk is only given a column count that is a multiple of
 * nu (and only when n_ >= minN and m_ >= minM); the remaining nr columns
 * go to ATL_GENGERK.  Otherwise gerk gets the whole block.
 * Wrapped in do { } while (0) so that "MY_GERK(...);" is a single statement
 * and is safe as the unbraced body of an if/else.
 */
#define MY_GERK(m_, n_, x_, xt_, A_, lda_) \
do \
{ \
   if (FNU) \
   { \
      ATL_CINT nnu = ((n_) >= minN && (m_) >= minM) ? ((n_)/nu)*nu : 0, \
               nr = (n_)-nnu; \
      if (nnu) \
         gerk(m_, nnu, x_, xt_, A_, lda_); \
      if (nr) \
         ATL_GENGERK(m_, nr, x_, (xt_)+nnu, (A_)+nnu*(lda_), lda_); \
   } \
   else \
      gerk(m_, n_, x_, xt_, A_, lda_); \
} while (0)
/*
 * Symmetric rank-1 update driver: updates the Uplo triangle of the N x N
 * matrix A with alpha * X * X^T.
 *
 * Uplo  : AtlasUpper or AtlasLower, the triangle of A that is updated
 * N     : order of A / length of X; N < 1 is a no-op
 * alpha : scalar; alpha == 0 is a no-op
 * X     : input vector with stride incX
 * A,lda : matrix and its leading dimension
 *
 * Problems with N < 50, and any call where workspace cannot be allocated,
 * are handed to the reference implementation.  Otherwise A is walked in
 * blocks along the diagonal: diagonal blocks go to syr_kU/syr_kL and the
 * off-diagonal blocks to the selected GER kernel via MY_GERK.  x is the
 * vector with alpha applied (copied one block at a time when a copy is
 * needed); xt is the unscaled vector.
 */
void ATL_syr(const enum ATLAS_UPLO Uplo, ATL_CINT N, const TYPE alpha,
               const TYPE *X, ATL_CINT incX, TYPE *A, ATL_CINT lda)
{
   size_t t1, t2;
   void *vp=NULL;
   TYPE *x, *xt, *xx=(TYPE*)X;
   ATL_r1kern_t gerk, gerk0;
   ATL_INT MB, NB, mb, nb, Nmb, i, n, incx=incX, CacheElts;
   int mu, nu, minM, minN, alignX, alignXt, FNU;
   int COPYX=0, COPYXt=0, ALIGNX2A=0;
   const int ALPHA_IS_ONE=(alpha == ATL_rone);

   if (N < 1 || (alpha == ATL_rzero))
      return;
/*
 * For very small problems, avoid overhead of func calls & data copy
 */
   if (N < 50)
   {
      Mjoin(PATL,refsyr)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
/*
 * Determine the GER kernel to use, and its parameters
 */
   ATL_GetPartS1(A, lda, mb, nb);
   if (!mb || !nb || mb > N || nb > N)
   {
/*
 *    No usable partition: treat the whole problem as one block
 */
      MB = N-ATL_s1L_NU;
      NB = N-ATL_s1L_NU;
      mb = nb = N;
   }
   else
   {
      MB = mb;
      NB = nb;
   }
   gerk = ATL_GetR1Kern(MB, NB, A, lda, &mu, &nu, &minM, &minN, &alignX,
                        &ALIGNX2A, &alignXt, &FNU, &CacheElts);
/*
 * Determine if we need to copy the vectors
 */
   COPYX = (incX != 1);
   if (!COPYX)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's % with Cachelen match that of A if you want A & X to have
 *    the same alignment
 */
      if (ALIGNX2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
   COPYXt = (incX != 1);
   if (!COPYXt && alignXt)  /* alignment might still force a copy */
   {
/*
 *    Test against Xt's own requirement; alignXt is known non-zero here,
 *    whereas alignX may be 0 and must not be used as a divisor
 */
      t1 = (size_t) X;
      COPYXt = ((t1/alignXt)*alignXt != t1);
   }
   MB = (mb > N || mb < 1) ? N : mb;
   if (nb > N || nb < 1)
      nb = N;
/*
 * See if X and Xt can legally be the same vector
 */
   if (ALPHA_IS_ONE && (!alignXt || alignX == alignXt))
   {
      if (COPYX)
      {
         vp = malloc(ATL_MulBySize(N)+ATL_Cachelen);
         if (!vp)
         {
            Mjoin(PATL,refsyr)(Uplo, N, alpha, X, incX, A, lda);
            return;
         }
         x = xt = ALIGNX2A ? ATL_Align2Ptr(vp, A) : ATL_AlignPtr(vp);
         Mjoin(PATL,copy)(N, X, incX, x, 1);
         COPYX = 0;   /* whole vector already copied; no per-block copy */
      }
      else
         x = xt = (TYPE*) X;
   }
   else if (incX == 1 && !COPYXt)          /* apply alpha to X, orig vec Xt */
   {
      COPYX = 1;
      xt = (TYPE*) X;
      vp = malloc(ATL_MulBySize(MB)+ATL_Cachelen);
      if (!vp)
      {
         Mjoin(PATL,refsyr)(Uplo, N, alpha, X, incX, A, lda);
         return;
      }
      x = ALIGNX2A ? ATL_Align2Ptr(vp, A) : ATL_AlignPtr(vp);
   }
   else                         /* must copy both X & Xt, apply alpha to x */
   {
      COPYX = 1;
      vp = malloc(ATL_MulBySize(MB+N)+2*ATL_Cachelen);
      if (!vp)
      {
         Mjoin(PATL,refsyr)(Uplo, N, alpha, X, incX, A, lda);
         return;
      }
      x = ALIGNX2A ? ATL_Align2Ptr(vp, A) : ATL_AlignPtr(vp);
      xt = x + MB;
      xt = ATL_AlignPtr(xt);
      Mjoin(PATL,copy)(N, X, incX, xt, 1);
/*
 *    Set it up so that we copy from contiguous vector, not original X
 */
      xx = xt;
      incx = 1;
   }
/*
 * Nmb = number of rows/cols covered by full mb-sized blocks, chosen so
 * that the remainder block N-Nmb always has between 1 and mb elements
 */
   Nmb = ((N-1)/mb)*mb;
   if (Uplo == AtlasUpper)
   {
/*
 *    Full blocks first: diagonal block, then the block row to its right
 */
      for (i=0; i < Nmb; i += mb)
      {
         n = N-i-MB;
         if (COPYX)
            Mjoin(PATL,cpsc)(MB, alpha, xx+i*incx, incx, x, 1);
         Mjoin(PATL,syr_kU)(gerk, FNU, MB, alpha, x, xt+i, A+i*(lda+1), lda);
         MY_GERK(MB, n, x, xt+i+MB, A+(MB+i)*lda+i, lda);
         if (!COPYX)
            x += MB;
      }
/*
 *    Remainder block at the bottom-right corner
 */
      mb = N - Nmb;
      if (COPYX)
         Mjoin(PATL,cpsc)(mb, alpha, xx+Nmb*incx, incx, x, 1);
      Mjoin(PATL,syr_kU)(gerk, FNU, mb, alpha, x, xt+Nmb, A+Nmb*(lda+1), lda);
   }
   else         /* Uplo == AtlasLower */
   {
/*
 *    Remainder block comes first (top-left corner), full blocks follow
 */
      mb = N - Nmb;
      #if ATL_MIN_RESTRICTED_M > 0
/*
 *       Start from the kernel just selected (gerk0 has no value yet).
 *       NOTE(review): if the remainder block is below minM this makes the
 *       generic kernel the choice for all later blocks too; confirm that
 *       is intended rather than keeping the tuned kernel in gerk0.
 */
         gerk0 = gerk = (mb >= minM) ? gerk : ATL_GENGERK;
      #endif
      if (COPYX)
         Mjoin(PATL,cpsc)(mb, alpha, xx, incx, x, 1);
      Mjoin(PATL,syr_kL)(gerk, mb, alpha, x, xt, A, lda);
      for (i=mb; i < N; i += MB)
      {
         #if ATL_MIN_RESTRICTED_M > 0
            gerk = (i >= minN) ? gerk0 : ATL_GENGERK;
         #endif
         if (COPYX)
            Mjoin(PATL,cpsc)(MB, alpha, xx+i*incx, incx, x, 1);
         else
            x += mb;            /* skip the block handled last iteration */
         MY_GERK(MB, i, x, xt, A+i, lda);
         Mjoin(PATL,syr_kL)(gerk, MB, alpha, x, xt+i, A+i*(lda+1), lda);
         mb = MB;
      }
   }

   free(vp);                    /* vp is NULL when no workspace was needed */
}
