// -*- C++ -*-
// ACL:license
// ----------------------------------------------------------------------
// This software and ancillary information (herein called "SOFTWARE")
// called POOMA (Parallel Object-Oriented Methods and Applications) is
// made available under the terms described here.  The SOFTWARE has been
// approved for release with associated LA-CC Number LA-CC-98-65.
// 
// Unless otherwise indicated, this SOFTWARE has been authored by an
// employee or employees of the University of California, operator of the
// Los Alamos National Laboratory under Contract No. W-7405-ENG-36 with
// the U.S. Department of Energy.  The U.S. Government has rights to use,
// reproduce, and distribute this SOFTWARE. The public may copy, distribute,
// prepare derivative works and publicly display this SOFTWARE without 
// charge, provided that this Notice and any statement of authorship are 
// reproduced on all copies.  Neither the Government nor the University 
// makes any warranty, express or implied, or assumes any liability or 
// responsibility for the use of this SOFTWARE.
// 
// If SOFTWARE is modified to produce derivative works, such modified
// SOFTWARE should be clearly marked, so as not to confuse it with the
// version available from LANL.
// 
// For more information about POOMA, send e-mail to pooma@acl.lanl.gov,
// or visit the POOMA web page at http://www.acl.lanl.gov/pooma/.
// ----------------------------------------------------------------------
// ACL:license

#ifndef POOMA_EVALUATOR_REDUCTIONEVALUATOR_H
#define POOMA_EVALUATOR_REDUCTIONEVALUATOR_H

//-----------------------------------------------------------------------------
// Class: 
//   ReductionEvaluator<InlineKernelTag>
//   ReductionEvaluator<CompressibleKernelTag>
//   CompressibleReduce<T, Op>
//-----------------------------------------------------------------------------


//-----------------------------------------------------------------------------
// Overview:
//
// ReductionEvaluator<InlineKernelTag> reduces expressions by inlining a 
// simple loop. ReductionEvaluator<CompressibleKernelTag> can optionally take
// advantage of compression.
//-----------------------------------------------------------------------------

//-----------------------------------------------------------------------------
// Includes:
//-----------------------------------------------------------------------------

#include "Engine/EngineFunctor.h"
#include "Evaluator/CompressibleEngines.h"
#include "Evaluator/KernelTags.h"
#include "PETE/OperatorTags.h"
#include "Utilities/WrappedInt.h"
#include "Utilities/PAssert.h"

//-----------------------------------------------------------------------------
// Forward Declarations:
//-----------------------------------------------------------------------------

// Primary template: declared here but never defined. The evaluation
// strategy is chosen by the kernel tag, and each supported tag supplies
// its own explicit specialization (InlineKernelTag and
// CompressibleKernelTag in this file).
template<class KernelTag>
struct ReductionEvaluator;

//-----------------------------------------------------------------------------
// The point of this class is to input an expression with the
// 'evaluate' member function and reduce it by looping over the
// whole domain.
//
// This is the simplest possible reduction. It makes the simplifying
// assumption that the expression passed in can handle random access to 
// all of its elements efficiently.
//-----------------------------------------------------------------------------

template<>
struct ReductionEvaluator<InlineKernelTag>
{
  //---------------------------------------------------------------------------
  // Input an expression and cause it to be reduced.
  // All this template function does is extract the domain from the
  // expression and forward to the overload below that matches the
  // dimension of that domain.
  //
  //   ret - receives the result of the reduction.
  //   op  - functor applied as op(ret, value) to fold one value into ret.
  //   e   - expression to reduce; must provide Domain_t, domain() and
  //         read(...) taking one int index per dimension.

  template<class T, class Op, class Expr>
  inline static void evaluate(T &ret, const Op &op, const Expr &e)
  {
    typedef typename Expr::Domain_t Domain_t;
    evaluate(ret, op, e, e.domain(),
      WrappedInt<Domain_t::dimensions>());
  }

  //---------------------------------------------------------------------------
  // These are the functions the function above calls.
  // They take an extra argument which is a tag class templated on
  // the dimension of the domain.
  //
  // This parameter lets us specialize the function based on
  // that dimension.
  //
  // These functions are all inline for efficiency. That means that if
  // they are being used at the user level we will get the optimization
  // of recognizing multiple uses of a single Array on the right hand
  // side.
  //
  // There are seven specializations here, for dimension 1 through 7.
  // Rather than use template metaprograms for these seven cases we
  // simply enumerate them explicitly.  This is done to reduce the
  // burden on the compiler, which would otherwise have to jump through
  // a bunch of hoops to get the code that is here.
  //
  // Each specialization seeds ret with the element at index (0, ..., 0)
  // and then folds in every remaining element with op(ret, value) using
  // one nested loop per dimension, dimension 0 varying fastest. Because
  // the seed element must not be folded in twice, the very first pass
  // over dimension 0 starts at index 1; all later passes start at 0.
  // Loop bounds come from length() of the appropriate dimension.
  //
  // NOTE: These loops assume that the domain passed in is a unit-stride
  // domain starting at 0.  Assertions are made to make sure this is true.
  //
  // NOTE: Element (0, ..., 0) is read unconditionally, so the domain is
  // expected to be non-empty; this is not checked here.
  
  template<class T, class Op, class Expr, class Domain>
  inline static void evaluate(T &ret, const Op &op, const Expr &e,
    const Domain &domain, WrappedInt<1>)
  {
    CTAssert(Domain::unitStride == 1);
    PAssert(domain[0].first() == 0);
    int e0 = domain[0].length();

    ret = e.read(0);
    for (int i0 = 1; i0 < e0; ++i0)
      op(ret, e.read(i0));
  }

  template<class T, class Op, class Expr, class Domain>
  inline static void evaluate(T &ret, const Op &op, const Expr &e,
    const Domain &domain, WrappedInt<2>)
  {
    CTAssert(Domain::unitStride == 1);
    PAssert(domain[0].first() == 0);
    PAssert(domain[1].first() == 0);
    int e0 = domain[0].length();
    int e1 = domain[1].length();

    // Skip the seed element on the first pass over dimension 0 only.
    int start0 = 1;
    
    ret = e.read(0, 0);
    for (int i1 = 0; i1 < e1; ++i1)
      {
        for (int i0 = start0; i0 < e0; ++i0)
          op(ret, e.read(i0, i1));
        start0 = 0;
      }
  }
  
  template<class T, class Op, class Expr, class Domain>
  inline static void evaluate(T &ret, const Op &op, const Expr &e,
    const Domain &domain, WrappedInt<3>)
  {
    CTAssert(Domain::unitStride == 1);
    PAssert(domain[0].first() == 0);
    PAssert(domain[1].first() == 0);
    PAssert(domain[2].first() == 0);
    int e0 = domain[0].length();
    int e1 = domain[1].length();
    int e2 = domain[2].length();

    // Skip the seed element on the first pass over dimension 0 only.
    int start0 = 1;
    
    ret = e.read(0, 0, 0);
    for (int i2 = 0; i2 < e2; ++i2)
      for (int i1 = 0; i1 < e1; ++i1)
        {
          for (int i0 = start0; i0 < e0; ++i0)
            op(ret, e.read(i0, i1, i2));
          start0 = 0;
        }
  }

  template<class T, class Op, class Expr, class Domain>
  inline static void evaluate(T &ret, const Op &op, const Expr &e,
    const Domain &domain, WrappedInt<4>)
  {
    CTAssert(Domain::unitStride == 1);
    PAssert(domain[0].first() == 0);
    PAssert(domain[1].first() == 0);
    PAssert(domain[2].first() == 0);
    PAssert(domain[3].first() == 0);
    int e0 = domain[0].length();
    int e1 = domain[1].length();
    int e2 = domain[2].length();
    int e3 = domain[3].length();

    // Skip the seed element on the first pass over dimension 0 only.
    int start0 = 1;
    
    ret = e.read(0, 0, 0, 0);
    for (int i3 = 0; i3 < e3; ++i3)
      for (int i2 = 0; i2 < e2; ++i2)
        for (int i1 = 0; i1 < e1; ++i1)
          {
            for (int i0 = start0; i0 < e0; ++i0)
              op(ret, e.read(i0, i1, i2, i3));
            start0 = 0;
          }
  }

  template<class T, class Op, class Expr, class Domain>
  inline static void evaluate(T &ret, const Op &op, const Expr &e,
    const Domain &domain, WrappedInt<5>)
  {
    CTAssert(Domain::unitStride == 1);
    PAssert(domain[0].first() == 0);
    PAssert(domain[1].first() == 0);
    PAssert(domain[2].first() == 0);
    PAssert(domain[3].first() == 0);
    PAssert(domain[4].first() == 0);
    int e0 = domain[0].length();
    int e1 = domain[1].length();
    int e2 = domain[2].length();
    int e3 = domain[3].length();
    int e4 = domain[4].length();

    // Skip the seed element on the first pass over dimension 0 only.
    int start0 = 1;
    
    ret = e.read(0, 0, 0, 0, 0);
    for (int i4 = 0; i4 < e4; ++i4)
      for (int i3 = 0; i3 < e3; ++i3)
        for (int i2 = 0; i2 < e2; ++i2)
          for (int i1 = 0; i1 < e1; ++i1)
            {
              for (int i0 = start0; i0 < e0; ++i0)
                op(ret, e.read(i0, i1, i2, i3, i4));
              start0 = 0;
            }
  }

  template<class T, class Op, class Expr, class Domain>
  inline static void evaluate(T &ret, const Op &op, const Expr &e,
    const Domain &domain, WrappedInt<6>)
  {
    CTAssert(Domain::unitStride == 1);
    PAssert(domain[0].first() == 0);
    PAssert(domain[1].first() == 0);
    PAssert(domain[2].first() == 0);
    PAssert(domain[3].first() == 0);
    PAssert(domain[4].first() == 0);
    PAssert(domain[5].first() == 0);
    int e0 = domain[0].length();
    int e1 = domain[1].length();
    int e2 = domain[2].length();
    int e3 = domain[3].length();
    int e4 = domain[4].length();
    int e5 = domain[5].length();

    // Skip the seed element on the first pass over dimension 0 only.
    int start0 = 1;
    
    ret = e.read(0, 0, 0, 0, 0, 0);
    for (int i5 = 0; i5 < e5; ++i5)
      for (int i4 = 0; i4 < e4; ++i4)
        for (int i3 = 0; i3 < e3; ++i3)
          for (int i2 = 0; i2 < e2; ++i2)
            for (int i1 = 0; i1 < e1; ++i1)
              {
                // All six indices must be passed; omitting i5 would not
                // address the element this iteration is meant to visit.
                for (int i0 = start0; i0 < e0; ++i0)
                  op(ret, e.read(i0, i1, i2, i3, i4, i5));
                start0 = 0;
              }
  }

  template<class T, class Op, class Expr, class Domain>
  inline static void evaluate(T &ret, const Op &op, const Expr &e,
    const Domain &domain, WrappedInt<7>)
  {
    CTAssert(Domain::unitStride == 1);
    PAssert(domain[0].first() == 0);
    PAssert(domain[1].first() == 0);
    PAssert(domain[2].first() == 0);
    PAssert(domain[3].first() == 0);
    PAssert(domain[4].first() == 0);
    PAssert(domain[5].first() == 0);
    PAssert(domain[6].first() == 0);
    int e0 = domain[0].length();
    int e1 = domain[1].length();
    int e2 = domain[2].length();
    int e3 = domain[3].length();
    int e4 = domain[4].length();
    int e5 = domain[5].length();
    int e6 = domain[6].length();

    // Skip the seed element on the first pass over dimension 0 only.
    int start0 = 1;
    
    ret = e.read(0, 0, 0, 0, 0, 0, 0);
    for (int i6 = 0; i6 < e6; ++i6)
      for (int i5 = 0; i5 < e5; ++i5)
        for (int i4 = 0; i4 < e4; ++i4)
          for (int i3 = 0; i3 < e3; ++i3)
            for (int i2 = 0; i2 < e2; ++i2)
              for (int i1 = 0; i1 < e1; ++i1)
                {
                  // All seven indices must be passed; omitting i5 and i6
                  // would not address the element this iteration is
                  // meant to visit.
                  for (int i0 = start0; i0 < e0; ++i0)
                    op(ret, e.read(i0, i1, i2, i3, i4, i5, i6));
                  start0 = 0;
                }
  }
};


//-----------------------------------------------------------------------------
// This class handles the evaluation of a reduction from a single compressed
// value. The current possibilities are:
//   o sum:    N * val
//   o prod:   val^N
//   o min:    val
//   o max:    val
//   o any:    val
//   o all:    val
//   o bitOr:  val
//   o bitAnd: val
//-----------------------------------------------------------------------------

template<class T, class Op>
struct CompressibleReduce
{
  // Generic case, used for any operator that has no dedicated
  // specialization: the result is the compressed value itself,
  // converted to the result type. The operator object and the element
  // count are accepted only to keep the signature uniform; both are
  // ignored.
  //
  //   ret - receives the converted value.
  //   val - the single compressed value.
  template<class Value>
  static inline void evaluate(T &ret, const Op & /* op */,
    const Value &val, int /* count */)
  {
    ret = static_cast<T>(val);
  }
};

template<class T>
struct CompressibleReduce<T, OpAddAssign>
{
  // Sum case: the result is n * val, converted to the result type.
  // The product is formed in the type of (int * Value) before the
  // conversion. The operator object is ignored.
  //
  //   ret - receives the result.
  //   val - the single compressed value.
  //   n   - multiplier applied to val (the caller in this file passes
  //         the size of the domain).
  template<class Value>
  static inline void evaluate(T &ret, const OpAddAssign & /* op */,
    const Value &val, int n)
  {
    ret = static_cast<T>(n * val);
  }
};

template<class T>
struct CompressibleReduce<T, OpMultiplyAssign>
{
  // Product case: the result is val multiplied by itself until n factors
  // have been used. ret starts as val converted to the result type and
  // is then multiplied by the converted value n - 1 more times, so for
  // n <= 1 the result is simply the converted value. The operator object
  // is ignored.
  //
  //   ret - receives the result.
  //   val - the single compressed value.
  //   n   - number of factors (the caller in this file passes the size
  //         of the domain).
  template<class Value>
  static inline void evaluate(T &ret, const OpMultiplyAssign & /* op */,
    const Value &val, int n)
  {
    ret = static_cast<T>(val);
    for (int remaining = n - 1; remaining > 0; --remaining)
      ret *= static_cast<T>(val);
  }
};


//-----------------------------------------------------------------------------
// The point of this class is to input an expression with the
// 'evaluate' member function and reduce it, optionally taking advantage of
// compression.
//-----------------------------------------------------------------------------

template<>
struct ReductionEvaluator<CompressibleKernelTag>
{
  //---------------------------------------------------------------------------
  // Input an expression and cause it to be reduced.
  // This class relies on another class, CompressibleReduce<T, Op> to
  // perform the correct reduction based on the operator if the expression
  // is compressed. If it is not, we simply use 
  // ReductionEvaluator<InlineKernelTag>.

  template<class T, class Op, class Expr>
  inline static void evaluate(T &ret, const Op &op, const Expr &e)
  {
    if (engineFunctor(e, Compressed()))
      {
        CompressibleReduce<T, Op>::
          evaluate(ret, op, engineFunctor(e, CompressedRead()), 
            e.domain().size());
      }
    else
      {
        ReductionEvaluator<InlineKernelTag>::evaluate(ret, op, e);
      }
  }
};

#endif // POOMA_EVALUATOR_REDUCTIONEVALUATOR_H

// ACL:rcsinfo
// ----------------------------------------------------------------------
// $RCSfile: ReductionEvaluator.h,v $   $Author: swhaney $
// $Revision: 1.3 $   $Date: 2000/04/18 22:32:57 $
// ----------------------------------------------------------------------
// ACL:rcsinfo
