/*
 *   A simple example of reduction across the processor array
 *   (c) 1995, Howard E. Motteler
 *
 *   The hand-coded example using the X-net is O(sqrt(n)), for n
 *   PEs, and can be sped up by a factor of two, fairly easily,
 *   but in either case is much slower than the library routine.
 *   
 *   This example also shows how to compute cumulative dpu time.
 */

#include <mpl.h>
#include <stdio.h>

/*
 *   Routines defined outside this file.  The notes below describe
 *   only how main() uses them; confirm exact semantics against
 *   the MPL library documentation.
 */

/*  called immediately before each timed loop  */
extern void dpuTimerStart();
/*  tick count, read after each timed loop  */
extern unsigned long dpuTimerTicks();
/*  scale factor multiplied by the tick count to get the printed
 *  time (presumably seconds per tick -- confirm)
 */
extern double dpuTimerConst();
/*  called with a plural value; its int result is printed as the sum  */
extern int reduceAdd32();

/*  number of reduce operations performed in each timed loop  */
#define NOPS 1000

main () {

   int i, j, k;
   plural s, a;

   printf("nproc = %d\n", nproc);
   printf("nyproc (PE rows) = %d\n", nyproc);
   printf("nxproc (PE cols) = %d\n", nxproc);

   printf("\ntesting hand-coded X-net reduce . . .\n");

   dpuTimerStart();

   for (k=0; k<NOPS; k++) {

      a = 1;

      /*   reduce columns; 
       *   active sets are successive rows
       */
      for (i=nyproc-2; i>=0; i--) 	/* row counter */

	 /*  make a PE row active and add elements 
	  *  from the connected set:
	  */
	 if (iyproc == i) a += xnetS[1].a;   
      
      for (j=nxproc-2; j>=0; j--) 	/* col counter */
      
	 /*  make a single PE active and add elemnts
	  *  from the connected set:
	  */
	 if (iyproc == 0 && ixproc == j) a += xnetE[1].a;   

      }

   printf("sum is %d\n", proc[0].a);
   printf("time for %d reduce ops was %g\n",  
           NOPS, dpuTimerTicks() * dpuTimerConst());


   printf("\ntesting reduce library routine . . .\n");

   dpuTimerStart();

   for (k=0; k<NOPS; k++) {
      a = 1;
      j = reduceAdd32(a);
      }

   printf("sum is %d\n", j);
   printf("time for %d reduce ops was %g\n",  
           NOPS, dpuTimerTicks() * dpuTimerConst());

   }

