//********************************************************************************
//
// IterSolvers: A collection of Iterative Solvers
// Written by James Sandham
// 3 March 2015
//
//********************************************************************************

//********************************************************************************
//
// IterSolvers is free software; you can redistribute it and/or modify it under the
// terms of the GNU Lesser General Public License (as published by the Free
// Software Foundation) version 2.1 dated February 1999.
//
//********************************************************************************

#include<stdio.h>
#include<stdlib.h>
#include<mpi.h>
#include"pAMG.h"
#include"math.h"

//********************************************************************************
//
// pAMG: Parallel Classical Algebraic Multigrid
//
//********************************************************************************

// DEBUG: when non-zero, the `#if(DEBUG)` sections in post_cpoint and
// weight_matrix compile in their diagnostic "ERROR: ..." printf checks.
#define DEBUG 1
// FAST_ERROR: not referenced in the visible part of this file.
// NOTE(review): presumably selects a cheaper error estimate in the solve phase - confirm.
#define FAST_ERROR 0
// MAX_VCYCLES: not referenced in the visible part of this file.
// NOTE(review): presumably the cap on V-cycles in the (unimplemented) solve phase - confirm.
#define MAX_VCYCLES 100


//-------------------------------------------------------------------------------
// element of a sortable array: a value paired with the index it came from
//-------------------------------------------------------------------------------
// One (value, id) record. pre_cpoint fills an array of these from lambda[]
// and sorts it with qsort/compare_structs so the original index survives the sort.
struct array{
  int value;     // sort key (pre_cpoint stores lambda[i] here)
  unsigned id;   // index i the value had before sorting
};


//-------------------------------------------------------------------------------
// pAMG main function
//-------------------------------------------------------------------------------
// Driver for the parallel classical AMG solver.
//
//   r, c, v : CSR row pointers, column indices and values of the rows of A
//             held by this process (copied into level 0 by amg_init)
//   x, b    : solution and right-hand side; neither is touched yet because
//             the solution phase below is still empty
//   n, m    : number of rows of the global matrix / of the local slice
//   theta   : strength-of-connection threshold forwarded to amg_setup
//   tol     : tolerance; unused until the solution phase exists
//   id, np  : forwarded to the setup routines (used there as process rank
//             and process count)
//
// Every array allocated here, or stored into the per-level tables by the
// routines called from here, is released before returning.
void pamg(int r[], int c[], double v[], double x[], double b[], int n, int m, double theta, double tol, int id, int np)
{
  int n1 = 2;                            // number of smoothing steps (reserved for phase 2)
  int n2 = 2;                            // number of smoothing steps (reserved for phase 2)
  const int maxLevel = 1;                // maximum number of coarse levels
  (void)n1; (void)n2; (void)x; (void)b; (void)tol;

  //size every table by its ELEMENT type, not by the size of the pointer variable
  int *nSizes = malloc((maxLevel+1)*sizeof(*nSizes)); // nSizes[l]: global number of rows at level l
  int *mSizes = malloc((maxLevel+1)*sizeof(*mSizes)); // mSizes[l]: rows of level l held by proc id

  int **ar = malloc((maxLevel+1)*sizeof(*ar));
  int **ac = malloc((maxLevel+1)*sizeof(*ac));
  double **av = malloc((maxLevel+1)*sizeof(*av));
  double **ad = malloc((maxLevel+1)*sizeof(*ad));

  int **wr = malloc(maxLevel*sizeof(*wr));
  int **wc = malloc(maxLevel*sizeof(*wc));
  double **wv = malloc(maxLevel*sizeof(*wv));

  if(!nSizes || !mSizes || !ar || !ac || !av || !ad || !wr || !wc || !wv){
    fprintf(stderr,"pamg: out of memory\n");
    goto free_tables;
  }

  //start with empty tables so that levels which are never built can still be
  //passed to free() safely during cleanup
  for(int i=0;i<maxLevel+1;i++){
    nSizes[i] = 0; mSizes[i] = 0;
    ar[i] = NULL; ac[i] = NULL; av[i] = NULL; ad[i] = NULL;
  }
  for(int i=0;i<maxLevel;i++){
    wr[i] = NULL; wc[i] = NULL; wv[i] = NULL;
  }

  //initialize first level to the original A matrix
  amg_init(r,c,v,ar,ac,av,ad,nSizes,mSizes,n,m,id,np);

  //Phase 1: pAMG setup
  int numLevels = amg_setup(ar,ac,av,ad,wr,wc,wv,nSizes,mSizes,maxLevel,theta,id,np);
  (void)numLevels;  //will drive the V-cycle once phase 2 is implemented

  //Phase 2: pAMG solution (not implemented yet)


  //delete temporary arrays that were allocated on the heap. The loops run over
  //the full table capacity rather than the value returned by amg_setup, so
  //cleanup is complete and safe whatever that value is.
  for(int i=0;i<maxLevel+1;i++){
    free(ar[i]);
    free(ac[i]);
    free(av[i]);
    free(ad[i]);
  }
  for(int i=0;i<maxLevel;i++){
    free(wr[i]);
    free(wc[i]);
    free(wv[i]);
  }

free_tables:
  free(ar);
  free(ac);
  free(av);
  free(ad);
  free(wr);
  free(wc);
  free(wv);
  free(nSizes);
  free(mSizes);
}





//-------------------------------------------------------------------------------
// AMG initialize  function
//-------------------------------------------------------------------------------
// Copies the local CSR slice (r, c, v) into level 0 of the hierarchy and
// extracts its diagonal.
//
//   r, c, v        : CSR row pointers (m+1 entries), column indices and values
//   ar, ac, av, ad : per-level tables; entry 0 is allocated and filled here
//                    (the caller owns and frees the new arrays)
//   nSizes, mSizes : entry 0 receives n and m
//   n, m           : global / local number of rows
//   id             : rank of this process; np is unused
//
// Column indices are treated as GLOBAL, matching strength_matrix, so the
// diagonal of local row i is looked up at column i + id*m. This assumes every
// process holds m consecutive rows, as the strength routines do. Rows without
// a stored diagonal get ad[0][i] = 0.0 instead of being left uninitialised.
void amg_init(int r[], int c[], double v[], int *ar[], int *ac[], double *av[], double *ad[], int nSizes[], int mSizes[], int n, int m, int id, int np)
{
  const int nnz = r[m];        //number of stored entries in the local slice
  const int rowOffset = id*m;  //global index of local row 0

  (void)np;

  nSizes[0] = n; mSizes[0] = m;
  ar[0] = malloc((m+1)*sizeof(*ar[0]));
  ac[0] = malloc(nnz*sizeof(*ac[0]));
  av[0] = malloc(nnz*sizeof(*av[0]));
  ad[0] = malloc(m*sizeof(*ad[0]));
  for(int i=0;i<m+1;i++){ar[0][i] = r[i];}
  for(int i=0;i<nnz;i++){ac[0][i] = c[i];}
  for(int i=0;i<nnz;i++){av[0][i] = v[i];}

  //find diagonal entries of A matrix
  for(int i=0;i<m;i++){
    ad[0][i] = 0.0;
    for(int j=r[i];j<r[i+1];j++){
      if(c[j]==i+rowOffset){
        ad[0][i] = v[j];
        break;
      }
    }
  }
}





//-------------------------------------------------------------------------------
// AMG setup phase
//-------------------------------------------------------------------------------
// AMG setup phase: for each level, builds the strength matrix S and its
// transpose, selects C/F points in two passes, builds the interpolation
// matrix W for that level and calls the Galerkin product routine.
//
//   ar, ac, av, ad : per-level CSR arrays and diagonals of A (level 0 filled
//                    by amg_init)
//   wr, wc, wv     : per-level CSR arrays of W; entry i is allocated by
//                    weight_matrix
//   nSizes, mSizes : global / local row counts per level
//   level          : maximum number of coarse levels to build
//   theta          : strength-of-connection threshold
//   id, np         : forwarded to the helper routines
//
// Returns the number of levels for which W was built.
int amg_setup(int *ar[], int *ac[], double *av[], double *ad[], int *wr[], int *wc[], double *wv[], int nSizes[], int mSizes[], int level, double theta, int id, int np)
{
  int i = 0;
  while(i<level && nSizes[i]>1){
    const int mptr_size = mSizes[i]+1;  //size of the local row pointer array ar[i]
    const int nrows = mptr_size-1;      //number of local rows at this level

    //determine size of strength matrix
    int ssize = strength_matrix_size(ar[i],ac[i],av[i],mptr_size,theta,id,np);

    //allocate zero-initialised temporary arrays. Every array is sized AND
    //cleared by the local row count / strength-matrix size; at least one
    //element is requested so that NULL always means failure.
    const size_t rowCount = (nrows>0) ? (size_t)nrows : 1;
    const size_t strCount = (ssize>0) ? (size_t)ssize : 1;
    int *lambda = calloc(rowCount,sizeof(*lambda));
    int *cfpoints = calloc(rowCount,sizeof(*cfpoints));
    int *srow = calloc(rowCount+1,sizeof(*srow));
    int *scol = calloc(strCount,sizeof(*scol));
    double *sval = calloc(strCount,sizeof(*sval));
    int *strow = calloc(rowCount+1,sizeof(*strow));
    int *stcol = calloc(strCount,sizeof(*stcol));
    double *stval = calloc(strCount,sizeof(*stval));
    for(size_t j=0;j<strCount;j++){sval[j==0 && !sval ? 0 : j] = (sval ? 0.0 : 0.0); if(!sval){break;}}
    for(size_t j=0;j<strCount;j++){if(!stval){break;} stval[j] = 0.0;}

    if(!lambda || !cfpoints || !srow || !scol || !sval || !strow || !stcol || !stval){
      fprintf(stderr,"amg_setup: out of memory at level %d\n",i);
      free(cfpoints);
      free(lambda);
      free(srow);
      free(scol);
      free(sval);
      free(strow);
      free(stcol);
      free(stval);
      break;
    }

    //compute strength matrix S
    strength_matrix(ar[i],ac[i],av[i],srow,scol,sval,lambda,mptr_size,theta,id,np);

    //compute strength transpose matrix S^T
    strength_transpose_matrix(srow,scol,sval,strow,stcol,stval,lambda,mptr_size,theta,id,np);

    //determine c-points and f-points (first pass)
    pre_cpoint(srow,scol,strow,stcol,lambda,cfpoints,mptr_size,id,np);

    //determine c-points and f-points (second pass)
    post_cpoint(srow,scol,cfpoints,mptr_size,id,np);

    //compute interpolation matrix W
    int numCPoints = weight_matrix(ar[i],ac[i],av[i],ad[i],srow,scol,sval,wr,wc,wv,cfpoints,mptr_size,i,id,np);

    printf("%d",numCPoints);

    //perform Galerkin product Ac = W'*A*W
    galerkin_prod2(ar,ac,av,ad,wr,wc,wv,mptr_size,numCPoints,i,id,np);

    //the coarse matrix is not assembled yet (see galerkin_prod2), so the next
    //level is recorded as empty, which also terminates this loop
    nSizes[i+1] = 0;
    mSizes[i+1] = 0; //numCPoints;  //size of the A matrix at the next level

    i++;

    //delete temporary arrays (including the transpose arrays)
    free(cfpoints);
    free(lambda);
    free(srow);
    free(scol);
    free(sval);
    free(strow);
    free(stcol);
    free(stval);
  }

  return i;
}




//-------------------------------------------------------------------------------
// function for finding strength matrix size
//-------------------------------------------------------------------------------
// Counts the entries the strength matrix S will hold for the local CSR slice
// (r, c, v), so the caller can size S before strength_matrix fills it.
//
// Only columns inside [id*(mptr_size-1), (id+1)*(mptr_size-1)) are examined,
// and the diagonal column i + id*(mptr_size-1) is skipped. An entry is counted
// when -v exceeds theta times the largest -v among the examined entries of
// its row. np is unused.
int strength_matrix_size(int r[], int c[], double v[], int mptr_size, double theta, int id, int np)
{
  const int nrows = mptr_size-1;
  const int first = id*nrows;      //first column examined
  const int last = first+nrows;    //one past the last column examined
  int count = 0;

  for(int row=0;row<nrows;row++){
    const int diag = row+first;

    //largest negative off-diagonal magnitude in this row
    double largest = 0.0;
    for(int k=r[row];k<r[row+1];k++){
      const int col = c[k];
      if(col<first || col>=last || col==diag){continue;}
      //-v[k] > largest >= 0 implies v[k] < 0, so -v[k] is its magnitude
      if(-v[k]>largest){largest = -v[k];}
    }

    //count the connections that beat the scaled threshold
    const double threshold = largest*theta;
    for(int k=r[row];k<r[row+1];k++){
      const int col = c[k];
      if(col<first || col>=last || col==diag){continue;}
      if(-v[k]>threshold){count++;}
    }
  }

  return count;
}





//-------------------------------------------------------------------------------
// function for finding strength matrix and lambda array
//-------------------------------------------------------------------------------
// Fills the strength matrix S (CSR arrays sr, sc, sv) for the local slice
// (r, c, v) and accumulates lambda, the number of strong connections pointing
// at each local column.
//
// The selection rule is the one used by strength_matrix_size: columns inside
// [id*(mptr_size-1), (id+1)*(mptr_size-1)), diagonal excluded, -v greater than
// theta times the row's largest -v. Column indices are stored in sc relative
// to the start of that range. sr[0] is read, not written, and lambda is only
// incremented, so the caller must pass both already initialised.
// Processes 0 and 1 print lambda for debugging. np is unused.
void strength_matrix(int r[], int c[], double v[], int sr[], int sc[], double sv[], int lambda[], int mptr_size, double theta, int id, int np)
{
  const int nrows = mptr_size-1;
  const int first = id*nrows;      //first column examined
  const int last = first+nrows;    //one past the last column examined
  int filled = 0;                  //entries written to sc/sv so far

  for(int row=0;row<nrows;row++){
    const int diag = row+first;

    //largest negative off-diagonal magnitude in this row
    double largest = 0.0;
    for(int k=r[row];k<r[row+1];k++){
      const int col = c[k];
      if(col<first || col>=last || col==diag){continue;}
      if(-v[k]>largest){largest = -v[k];}
    }

    //store every connection that beats the scaled threshold
    const double threshold = largest*theta;
    int rowEnd = sr[row];
    for(int k=r[row];k<r[row+1];k++){
      const int col = c[k];
      if(col<first || col>=last || col==diag){continue;}
      if(-v[k]>threshold){
        const int local = col-first;
        sc[filled] = local;
        sv[filled] = v[k];
        lambda[local]++;
        filled++;
        rowEnd++;
      }
    }
    sr[row+1] = rowEnd;
  }

  //debug output of lambda on the first two processes
  if(id==0 || id==1){
    for(int i=0;i<nrows;i++){printf("%d",lambda[i]);}
    printf("\n");
  }
}




//-------------------------------------------------------------------------------
// function for finding strength transpose matrix
//-------------------------------------------------------------------------------
// Builds the transpose S^T (CSR arrays str, stc, stv) of the strength matrix
// S (CSR arrays sr, sc, sv).
//
//   lambda    : lambda[k] = number of entries of S in column k (as produced
//               by strength_matrix); becomes the row lengths of S^T
//   mptr_size : number of entries in the row pointer arrays (rows + 1)
//   id        : processes 0 and 1 print the result for debugging
//   theta, np : unused
//
// str[0] is set to 0 here, so the caller no longer has to pre-zero str.
void strength_transpose_matrix(int sr[], int sc[], double sv[], int str[], int stc[], double stv[], int lambda[], int mptr_size, double theta, int id, int np)
{
  const int nrows = mptr_size-1;

  (void)theta; (void)np;

  //row pointers of S^T are the running sum of the column counts of S
  if(mptr_size>0){str[0] = 0;}
  for(int i=1;i<mptr_size;i++){str[i]=lambda[i-1]+str[i-1];}

  //filled[k] = entries already written to row k of S^T; sized by the element
  //type and zero-initialised (at least one element so NULL means failure)
  unsigned *filled = calloc((nrows>0) ? (size_t)nrows : 1,sizeof(*filled));
  if(filled==NULL){
    fprintf(stderr,"strength_transpose_matrix: out of memory\n");
    return;
  }

  //scatter every entry (i, sc[j]) of S to position (sc[j], i) of S^T
  for(int i=0;i<nrows;i++){
    for(int j=sr[i];j<sr[i+1];j++){
      const int col = sc[j];
      const int dst = str[col]+(int)filled[col];
      stc[dst] = i;
      stv[dst] = sv[j];
      filled[col]++;
    }
  }

  free(filled);

  //debug output on the first two processes
  if(id==0 || id==1){
    for(int i=0;i<mptr_size;i++){printf("%d",str[i]);}
    printf("\n");
    for(int i=0;i<sr[mptr_size-1];i++){printf("%d",stc[i]);}
    printf("\n");
    for(int i=0;i<sr[mptr_size-1];i++){printf("%f",stv[i]);}
    printf("\n");
  }
}






//-------------------------------------------------------------------------------
// function for finding c-points and f-points (first pass)
//-------------------------------------------------------------------------------
// First pass of the C/F splitting.
//
//   sr, sc    : CSR row pointers / columns of the strength matrix S
//   str, stc  : CSR row pointers / columns of S^T
//   lambda    : per-node weight; modified in place (assigned nodes are
//               overwritten with -999, neighbours of new F-points are
//               incremented)
//   cfpoints  : output; entries of selected C-points are set to 1, all other
//               entries are left as passed in
//   mptr_size : number of nodes + 1
//   id        : processes 0..3 print cfpoints for debugging; np is unused
//
// Repeatedly picks the candidate with the largest lambda as a C-point, marks
// every unassigned node listed in that node's row of S^T as assigned (an
// F-point) and raises lambda of the unassigned nodes those F-points connect
// to in S. Returns immediately when there are no nodes.
void pre_cpoint(int sr[], int sc[], int str[], int stc[], int lambda[], int cfpoints[], int mptr_size, int id, int np)
{
  const int nrows = mptr_size-1;
  const int ASSIGNED = -999;   //sentinel written into lambda for assigned nodes

  (void)np;

  //nothing to split; also avoids reading sortedLambda[0] of an empty array
  if(nrows<=0){return;}

  unsigned locInSortedLambda = 0;
  unsigned numOfNodesToCheck = 0;
  //size both arrays by their ELEMENT type, not by the size of the pointer
  unsigned *nodesToCheck = malloc(nrows*sizeof(*nodesToCheck));
  struct array *sortedLambda = malloc(nrows*sizeof(*sortedLambda));
  if(nodesToCheck==NULL || sortedLambda==NULL){
    fprintf(stderr,"pre_cpoint: out of memory\n");
    free(nodesToCheck);
    free(sortedLambda);
    return;
  }
  for(int i=0;i<nrows;i++){nodesToCheck[i] = 0;}

  //copy lambda into struct array and then sort (descending by value)
  for(int i=0;i<nrows;i++){
    sortedLambda[i].value = lambda[i];
    sortedLambda[i].id = i;
  }
  qsort(sortedLambda, nrows, sizeof(sortedLambda[0]), compare_structs);
  nodesToCheck[0] = sortedLambda[0].id;
  numOfNodesToCheck++;

  int num_nodes_not_assign = nrows;
  while(num_nodes_not_assign>0)
  {
    int max_value = ASSIGNED;
    unsigned max_index = 0;

    //skip sorted entries that have been assigned already; slot 0 of the
    //candidate list always holds the first not-yet-assigned sorted node
    while(locInSortedLambda+1<(unsigned)nrows && lambda[sortedLambda[locInSortedLambda].id]==ASSIGNED){
      locInSortedLambda++;
    }
    nodesToCheck[0] = sortedLambda[locInSortedLambda].id;

    //pick the candidate with the largest current lambda
    for(unsigned i=0;i<numOfNodesToCheck;i++){
      if(lambda[nodesToCheck[i]]>max_value){
        max_value = lambda[nodesToCheck[i]];
        max_index = nodesToCheck[i];
      }
    }
    numOfNodesToCheck = 1;

    cfpoints[max_index] = 1;
    lambda[max_index] = ASSIGNED;
    num_nodes_not_assign--;

    //row max_index of S^T lists the rows of S that hold a nonzero in column
    //max_index; make all of those connections f-points and update lambda.
    //stc is read directly because nothing in this loop modifies it.
    for(int p=str[max_index];p<str[max_index+1];p++){
      const int frow = stc[p];
      if(lambda[frow]!=ASSIGNED){
        lambda[frow] = ASSIGNED;
        num_nodes_not_assign--;
        for(int j=sr[frow];j<sr[frow+1];j++){
          if(lambda[sc[j]]!=ASSIGNED){
            lambda[sc[j]]++;
            //remember the node as a candidate for the next round (once only)
            int flag = 0;
            for(unsigned k=0;k<numOfNodesToCheck;k++){
              if(nodesToCheck[k]==(unsigned)sc[j]){
                flag = 1;
                break;
              }
            }
            if(flag==0){
              nodesToCheck[numOfNodesToCheck] = sc[j];
              numOfNodesToCheck++;
            }
          }
        }
      }
    }
  }
  free(nodesToCheck);
  free(sortedLambda);

  //debug output on the first four processes
  if(id>=0 && id<=3){
    for(int i=0;i<nrows;i++){printf("%d",cfpoints[i]);}
    printf("\n");
  }
}





//-------------------------------------------------------------------------------
// function for finding c-points and f-points (second pass)
//-------------------------------------------------------------------------------
// Second pass of the C/F splitting: promotes F-points to C-points where two
// strongly connected F-points are found to share no C-point.
//
//   sr, sc    : CSR row pointers / columns of the strength matrix S
//   cfpoints  : in/out; 1 marks a C-point, 0 an F-point
//   mptr_size : number of nodes + 1
//   id, np    : unused
//
// For every F-point i the C-points in row i of S are collected. Each F-point
// in that row is then compared, with a two-pointer merge, against that list;
// when no common C-point is found the neighbour is promoted and appended to
// the list. NOTE(review): the merge assumes both lists are in ascending order,
// but promoted points are appended at the end - confirm this is intended.
void post_cpoint(int sr[], int sc[], int cfpoints[], int mptr_size, int id, int np)
{
  const int nrows = mptr_size-1;

  (void)id; (void)np;

  int max_nstrc = 0;  //max number of strong connections in any row
  for(int i=0;i<nrows;i++){
    if(max_nstrc<sr[i+1]-sr[i]){max_nstrc = sr[i+1]-sr[i];}
  }

  //scratch list of the c-points of one row; sized by the element type, with
  //at least one element so that NULL always means failure
  int *scpoints = malloc(((max_nstrc>0) ? (size_t)max_nstrc : 1)*sizeof(*scpoints));
  if(scpoints==NULL){
    fprintf(stderr,"post_cpoint: out of memory\n");
    return;
  }

  //perform second pass adding c-points where necessary
  for(int i=0;i<nrows;i++){
    if(cfpoints[i]==0){                //i is an fpoint
      int scindex = 0;                 //number of c-points in row i
      for(int j=sr[i];j<sr[i+1];j++){
        if(cfpoints[sc[j]]==1){
          scpoints[scindex] = sc[j];
          scindex++;
        }
      }

      #if(DEBUG)
        if(scindex==0){printf("ERROR: no cpoint for the f-point ");}
      #endif

      for(int j=sr[i];j<sr[i+1];j++){
        if(cfpoints[sc[j]]==0){  //sc[j] is an fpoint
          const int nbr = sc[j];
          const int nbrLen = sr[nbr+1]-sr[nbr];
          int ind1 = 0, ind2 = 0, flag = 1;
          //look for a c-point of row i among the strong connections of nbr
          while(ind1<scindex && ind2<nbrLen){
            const int mine = scpoints[ind1];
            const int theirs = sc[sr[nbr]+ind2];
            if(mine==theirs){
              flag = 0;
              break;
            }
            else if(mine<theirs){
              ind1++;
            }
            else{
              ind2++;
            }
          }
          if(flag){
            cfpoints[nbr] = 1; // nbr was an fpoint, but now is a cpoint
            scpoints[scindex] = nbr;
            scindex++;
          }
        }
      }
    }
  }
  free(scpoints);
}





//-------------------------------------------------------------------------------
// function for finding interpolation weight matrix
//-------------------------------------------------------------------------------
// Builds the interpolation (weight) matrix W for one level in CSR form.
//
//   r, c, v    : CSR arrays of the local slice of A; column indices in c are
//                global, as in strength_matrix
//   d          : diagonal of A for the local rows
//   sr, sc, sv : CSR arrays of the strength matrix S (local column indices)
//   wr, wc, wv : per-level tables; entry [level] is allocated and filled here
//                and is owned by the caller afterwards
//   cfpoints   : in/out; on entry 1 = C-point, 0 = F-point. On return each
//                C-point holds (its coarse index + 1), F-points stay 0
//   mptr_size  : number of local rows + 1
//   id         : rank; id*(mptr_size-1) converts local to global indices;
//                processes 0 and 1 print W for debugging. np is unused.
//
// Returns the number of C-points (the number of columns of W).
//
// Index conventions used below: comparisons against c[] use GLOBAL indices
// (local + offset), while lookups into r[] and cfpoints[] use LOCAL indices.
// Matrix values taken from sv[] are used unmodified.
int weight_matrix(int r[], int c[], double v[], double d[], int sr[], int sc[], double sv[], int *wr[], int *wc[], double *wv[], unsigned cfpoints[], int mptr_size, int level, int id, int np)
{
  const int nrows = mptr_size-1;
  const int offset = id*nrows;   //global index of local row/column 0

  (void)np;

  //determine the number of c-points and f-points
  int cnum = 0;
  int fnum = 0;
  for(int i=0;i<nrows;i++){cnum = cnum + cfpoints[i];}
  fnum = nrows-cnum;

  //determine the size of the interpolation matrix W: one entry per c-point
  //row plus one entry per strong c-point of every f-point row
  int wsize=cnum;
  for(int i=0;i<nrows;i++){
    if(cfpoints[i]==0){
      for(int j=sr[i];j<sr[i+1];j++){
        if(cfpoints[sc[j]]==1){wsize++;}
      }
    }
  }

  //initialize interpolation matrix W
  wr[level] = malloc(mptr_size*sizeof(*wr[level]));
  wc[level] = malloc(wsize*sizeof(*wc[level]));
  wv[level] = malloc(wsize*sizeof(*wv[level]));
  for(int j=0;j<mptr_size;j++){wr[level][j]=0;}
  for(int j=0;j<wsize;j++){wc[level][j]=-1;}
  for(int j=0;j<wsize;j++){wv[level][j]=0.0;}

  //modify cfpoints array so that nonzeros now correspond to the cpoint location
  int loc = 0;
  for(int i=0;i<nrows;i++){
    if(cfpoints[i]==1){
      cfpoints[i] = cfpoints[i] + loc;
      loc++;
    }
  }

  //find beta array: for every f-point row, the sum of the off-diagonal
  //entries of A that are not strong connections
  int ind1 = 0, ind2 = 0, ii = 0;
  double *beta = malloc(fnum*sizeof(*beta));
  for(int i=0;i<fnum;i++){beta[i] = 0.0;}
  for(int i=0;i<nrows;i++){
    if(cfpoints[i]==0){
      const int diag = i+offset;  //global column of the diagonal of row i
      ind1 = 0;
      ind2 = 0;
      while(ind1<(r[i+1]-r[i]) && ind2<(sr[i+1]-sr[i])){
        if(c[r[i]+ind1]==sc[sr[i]+ind2]+offset){
          ind1++;
          ind2++;
        }
        else if(c[r[i]+ind1]<sc[sr[i]+ind2]+offset){
          if(c[r[i]+ind1]!=diag){
            beta[ii] = beta[ii] + v[r[i]+ind1];
          }
          ind1++;
        }
        else{
          //not reached when both rows are sorted; guarantees termination
          ind2++;
        }
      }
      while(ind1<(r[i+1]-r[i])){
        if(c[r[i]+ind1]!=diag){
          beta[ii] = beta[ii] + v[r[i]+ind1];
        }
        ind1++;
      }
      ii++;
    }
  }

  //create interpolation matrix W
  double aii = 0.0, aij = 0.0, temp = 0.0;
  int index = 0, rindex = 0;
  ind1 = 0;   //from here on: coarse index of the next c-point
  ind2 = 0;   //from here on: position of the current f-point in beta
  for(int i=0;i<nrows;i++){
    if(cfpoints[i]>=1){
      //c-point: interpolates from itself with weight one
      wc[level][index] = ind1;
      wv[level][index] = 1.0;
      ind1++;
      index++;
      rindex++;
      wr[level][rindex] = wr[level][rindex-1] + 1;
    }
    else{
      //determine diagonal element a_ii
      aii = d[i];

      //find all strong c-points and f-points in the row i
      const int rowLen = sr[i+1]-sr[i];
      int ind3 = 0, ind4 = 0;
      int scnum = 0;
      int sfnum = 0;
      int *scpts = malloc(rowLen*sizeof(*scpts));      //global columns of strong c-points
      int *sfpts = malloc(rowLen*sizeof(*sfpts));      //LOCAL rows of strong f-points
      int *scind = malloc(rowLen*sizeof(*scind));      //coarse indices of strong c-points
      double *scval = malloc(rowLen*sizeof(*scval));   //a_ij of strong c-points
      double *sfval = malloc(rowLen*sizeof(*sfval));   //a_ij of strong f-points
      for(int j=0;j<rowLen;j++){
        scpts[j] = -1;
        sfpts[j] = -1;
        scind[j] = -1;
        scval[j] = 0.0;
        sfval[j] = 0.0;
      }
      for(int j=sr[i];j<sr[i+1];j++){
        if(cfpoints[sc[j]]>=1){
          scpts[scnum] = sc[j]+offset;   //compared against c[] below
          scval[scnum] = sv[j];
          scind[scnum] = cfpoints[sc[j]]-1;
          scnum++;
        }
        else{
          sfpts[sfnum] = sc[j];          //used to index r[] below
          sfval[sfnum] = sv[j];
          sfnum++;
        }
      }

      #if(DEBUG)
        if(scnum==0){printf("ERROR: no cpoints in row ");}
      #endif

      if(sfnum==0){
        //loop all strong c-points
        for(int k=0;k<scnum;k++){
          aij = scval[k];
          wc[level][index] = scind[k];
          wv[level][index] = -(aij)/(aii + beta[ind2]);
          index++;
        }
      }
      else{
        //loop thru all the strong f-points to find alpha array: alpha[k] is
        //the sum of the entries in the row of strong f-point k that lie in
        //the columns of the strong c-points of row i
        double *alpha = malloc(sfnum*sizeof(*alpha));
        for(int k=0;k<sfnum;k++){alpha[k] = 0.0;}
        for(int k=0;k<sfnum;k++){
          ind3 = 0;
          ind4 = 0;
          while(ind3<scnum && ind4<(r[sfpts[k]+1]-r[sfpts[k]])){
            if(scpts[ind3]==c[r[sfpts[k]]+ind4]){
              alpha[k] = alpha[k] + v[r[sfpts[k]]+ind4];
              ind3++;
              ind4++;
            }
            else if(scpts[ind3]<c[r[sfpts[k]]+ind4]){
              ind3++;
            }
            else{
              ind4++;
            }
          }
        }

        //loop all strong c-points
        for(int k=0;k<scnum;k++){
          aij = scval[k];
          temp = 0.0;
          for(int l=0;l<sfnum;l++){
            for(int m=r[sfpts[l]];m<r[sfpts[l]+1];m++){
              if(c[m]==scpts[k]){
                #if(DEBUG)
                  if(alpha[l]==0.0){printf("ERROR: alpha is zero");}
                #endif
                temp = temp + sfval[l]*v[m]/alpha[l];
                break;
              }
            }
          }
          wc[level][index] = scind[k];
          wv[level][index] = -(aij + temp)/(aii + beta[ind2]);
          index++;
        }

        free(alpha);
      }
      ind2++;
      rindex++;
      wr[level][rindex] = wr[level][rindex-1] + scnum;

      free(scpts);
      free(sfpts);
      free(scind);
      free(scval);
      free(sfval);
    }
  }

  free(beta);

  //debug output on the first two processes
  if(id==0 || id==1){
    for(int i=0;i<mptr_size;i++){printf("%d",wr[level][i]);}
    printf("\n");
    for(int i=0;i<wr[level][mptr_size-1];i++){printf("%d",wc[level][i]);}
    printf("\n");
    for(int i=0;i<wr[level][mptr_size-1];i++){printf("%f",wv[level][i]);}
    printf("\n");
  }

  return cnum;
}





//-------------------------------------------------------------------------------
// function for performing Galerkin product: W'*A*W
//-------------------------------------------------------------------------------
// Galerkin product W'*A*W for one level (work in progress).
//
//   ar, ac, av, ad : per-level CSR arrays / diagonals of A
//   wr, wc, wv     : per-level CSR arrays of W; entry [level] is read
//   mptr_size      : number of rows of A and W at this level + 1
//   m              : number of columns of W (number of C-points)
//   level          : level whose W is used
//   id, np         : unused
//
// Only the transpose W' is currently built (CSR arrays wpr, wpc, wpv), and it
// is discarded again before returning. The code that forms T = A*W and
// B = W'*T and stores the result at level+1 is still commented out, so no
// caller-visible array is modified.
void galerkin_prod2(int *ar[], int *ac[], double *av[], double *ad[], int *wr[], int *wc[], double *wv[], int mptr_size, int m, int level, int id, int np)
{
  int n = mptr_size-1; //number of rows in A and W.
  const int nnzW = wr[level][n]; //number of stored entries of W

  (void)ar; (void)ac; (void)av; (void)ad; (void)id; (void)np;

  //all arrays are sized by their ELEMENT type; at least one element is
  //requested so that NULL always means failure
  int *temp1 = malloc(((m>0) ? (size_t)m : 1)*sizeof(*temp1));

  int *wpr = malloc(((size_t)((m>0) ? m : 0)+1)*sizeof(*wpr));
  int *wpc = malloc(((nnzW>0) ? (size_t)nnzW : 1)*sizeof(*wpc));
  double *wpv = malloc(((nnzW>0) ? (size_t)nnzW : 1)*sizeof(*wpv));
  if(!temp1 || !wpr || !wpc || !wpv){
    fprintf(stderr,"galerkin_prod2: out of memory\n");
    free(temp1);
    free(wpr);
    free(wpc);
    free(wpv);
    return;
  }

  //initialize the per-column counters to zero
  for(int i=0;i<m;i++){temp1[i]=0;}

  //first determine how many non-zeros exist in each column of W
  for(int i=0;i<nnzW;i++){temp1[wc[level][i]]++;}

  //find W' in CRS format from W
  wpr[0] = 0;
  for(int i=1;i<m+1;i++){wpr[i] = wpr[i-1] + temp1[i-1];}
  //temp1[i] now becomes the next free slot in row i of W' (starts at wpr[i]);
  //written as a plain copy so that m==0 does not touch temp1 at all
  for(int i=0;i<m;i++){temp1[i] = wpr[i];}
  for(int i=0;i<n;i++){
    for(int j=wr[level][i];j<wr[level][i+1];j++){
      wpc[temp1[wc[level][j]]] = i;
      wpv[temp1[wc[level][j]]] = wv[level][j];
      temp1[wc[level][j]]++;
    }
  }

  //Now determine how many non-zeros exist in the matrix product of A*W
  //int nnz = 0;
  //for(int i=0;i<m;i++){temp1[i] = 0;}
  //for(int i=0;i<n;i++){ //loop through each row of A
  //  for(int j=ar[level][i];j<ar[level][i+1];j++){
  //    for(int k=wr[level][ac[level][j]];k<wr[level][ac[level][j]+1];k++){
  //      if(temp1[wc[level][k]]!=-(i+1)){
  //        nnz++;
  //        temp1[wc[level][k]]=-(i+1);
  //      }
  //    }
  //  }
  //}

  //create temporary arrays for storing the result of A*W and initialize to zeros
  //int *tr = malloc((n+1)*sizeof(*tr));  //new int[n+1];
  //int *tc = malloc(nnz*sizeof(*tc));  //new int[nnz];
  //double *tv = malloc(nnz*sizeof(*tv));  //new double[nnz];
  //for(int i=0;i<nnz;i++){
  //  tc[i] = 0;
  //  tv[i] = 0.0;
  //}

  //now compute the matrix product A*W and store the result in tr, tc and tv
  //tr[0] = 0;
  //int indx = 0, start = 0;
  //for(int i=0;i<n;i++){ //loop through each row of A
  //  for(int j=ar[level][i];j<ar[level][i+1];j++){
  //    for(int k=wr[level][ac[level][j]];k<wr[level][ac[level][j]+1];k++){
  //      int found = 0;
  //      for(int l=start;l<indx;l++){
  //        if(tc[l]==wc[level][k]){
  //          tv[l] = tv[l] + av[level][j]*wv[level][k];
  //          found = 1;
  //          break;
  //        }
  //      }
  //      if(found==0){
  //        tc[indx] = wc[level][k];
  //        tv[indx] = av[level][j]*wv[level][k];
  //        indx++;
  //      }
  //    }
  //  }
  //  //sort tc[start:indx-1]
  //  sort(tc,tv,start,indx);
  //  start = indx;
  //  tr[i+1] = indx;
  //}

  //We now have the product T=A*W given by the arrays: tr, tc, & tv. Need to perform the
  //product B=W'T. Now determine how many non-zeros exist in the matrix product of W'T
  //nnz = 0;
  //for(int i=0;i<m;i++){temp1[i] = 0;}
  //for(int i=0;i<m;i++){ //loop through each row of W'
  //  for(int j=wpr[i];j<wpr[i+1];j++){
  //    for(int k=tr[wpc[j]];k<tr[wpc[j]+1];k++){
  //      if(temp1[tc[k]]!=-(i+1)){
  //        nnz++;
  //        temp1[tc[k]]=-(i+1);
  //      }
  //    }
  //  }
  //}

  //create arrays for storing the result of W'*A*W and initialize to zeros
  //ar[level+1] = malloc((m+1)*sizeof(*ar[level+1]));  //new int[m+1];
  //ac[level+1] = malloc(nnz*sizeof(*ac[level+1]));  //new int[nnz];
  //av[level+1] = malloc(nnz*sizeof(*av[level+1]));  //new double[nnz];
  //ad[level+1] = malloc(m*sizeof(*ad[level+1]));  //new double[m];
  //for(int i=0;i<nnz;i++){
  //  ac[level+1][i] = 0;
  //  av[level+1][i] = 0.0;
  //}

  //now compute the matrix product W'T and store the result in ac and av
  //ar[level+1][0] = 0;
  //indx = 0, start = 0;
  //for(int i=0;i<m;i++){ //loop through each row of W'
  //  for(int j=wpr[i];j<wpr[i+1];j++){
  //    for(int k=tr[wpc[j]];k<tr[wpc[j]+1];k++){
  //      int found = 0;
  //      for(int l=start;l<indx;l++){
  //        if(ac[level+1][l]==tc[k]){
  //          av[level+1][l] = av[level+1][l] + wpv[j]*tv[k];
  //          found = 1;
  //          break;
  //        }
  //      }
  //      if(found==0){
  //        ac[level+1][indx] = tc[k];
  //        av[level+1][indx] = wpv[j]*tv[k];
  //        indx++;
  //      }
  //    }
  //  }
  //  //sort tc[start:indx-1]
  //  sort(ac[level+1],av[level+1],start,indx);
  //  start = indx;
  //  ar[level+1][i+1] = indx;
  //}

  //find diagonal entries of A matrix
  //for(int i=0;i<m;i++){
  //  for(int j=ar[level+1][i];j<ar[level+1][i+1];j++){
  //    if(ac[level+1][j]==i){
  //      ad[level+1][i] = av[level+1][j];
  //      break;
  //    }
  //  }
  //}

  free(temp1);
  free(wpr);
  free(wpc);
  free(wpv);
  //free(tr);
  //free(tc);
  //free(tv);
}





//-------------------------------------------------------------------------------
// sort function used in galerkin_prod2
//-------------------------------------------------------------------------------
// Selection sort of the half-open range [start, end) of array1 into ascending
// order; array2 is permuted in lockstep so that array2[k] stays paired with
// array1[k]. Elements outside the range are not touched.
//
// Deliberately NOT declared `inline`: in C99/C11 a file-scope `inline`
// definition without `extern` or `static` provides no external definition, so
// any call the compiler chooses not to inline fails at link time with an
// undefined reference to `sort`.
void sort(int array1[], double array2[], int start, int end)
{
  for(int i=start;i<end;i++){
    //find the smallest remaining key
    int index = i;
    for(int j=i+1;j<end;j++){
      if(array1[index]>array1[j]){
        index = j;
      }
    }
    //swap it (and its companion value) into position i
    int temp1 = array1[i];
    double temp2 = array2[i];
    array1[i] = array1[index];
    array2[i] = array2[index];
    array1[index] = temp1;
    array2[index] = temp2;
  }
}





//-------------------------------------------------------------------------------
// compare function for sorting structure array
//-------------------------------------------------------------------------------
int compare_structs(const void *a, const void *b){
    struct array *struct_a = (struct array *) a;
    struct array *struct_b = (struct array *) b;

    if (struct_a->value < struct_b->value) return 1;
    else if (struct_a->value == struct_b->value) return 0;
    else return -1;
}












  //MPI_Status status;
  //int ierr;
  //ierr = MPI_Send(&lambda[i*np],mptr_size-1,MPI_INT,i+1,1,MPI_COMM_WORLD);
  //ierr = MPI_Recv(&temp[0],mptr_size-1,MPI_INT,i+1,1,MPI_COMM_WORLD,&status);
  //ierr = MPI_Recv(&temp[0],mptr_size-1,MPI_INT,i-1,1,MPI_COMM_WORLD,&status);
  //ierr = MPI_Send(&lambda[i*np],mptr_size-1,MPI_INT,i-1,1,MPI_COMM_WORLD);

  //int idSize = mptr_size-1;
  //int *id = new int[idSize];
  //for(int i=0;i<idSize;i++){id[i] = i;}
  //
  //int num_nodes_not_assign = nptr_size-1;
  //while(num_nodes_not_assign>0)
  //{
  //
  //}

