#include "sort_module.h"

#define STACK_SIZE 10000

/* macro's needed for non-recursive qsort */
#define Push(nl,nr) ++stackPtr; stack[stackPtr].l=nl; stack[stackPtr].r=nr
#define Pop(nl,nr) nl=stack[stackPtr].l; nr=stack[stackPtr].r; --stackPtr
#define Empty (stackPtr<0)

#define swap(x,y) {uint8_t *tt=data[x].key;\
                   uint64_t t1=data[x].len;\
   data[x].key=data[y].key;\
   data[x].len=data[y].len;\
   data[y].key=tt;\
   data[y].len=t1;\
   }

/* heap element used in non-recursive qsort */
typedef struct {
   int l,r;
} StackElt;

/* in-place iterative qsort, tuned to reduced instruction usage while maximizing cache usage */
void tuned_qsort(ptr_struct *data, const uint64_t N)
{
  StackElt stack[STACK_SIZE];
  ptr_struct v;
  const int64_t M=25;
  int64_t l,r,i,j;
  int64_t stackPtr=-1;

  Push(0,N-1);

  while (!Empty) {
    Pop(l,r);

    if ((r-l) <= M) {
      for (i=r-1; i>=l; --i) {
        if (sncmp( data[i].key, data[i+1].key, data[i].len, data[i+1].len) > 0) {
          v=data[i];
          j = i+1;

          do {
            data[j-1]=data[j];
            ++j;
          } while ((j <= r) && (sncmp(data[j].key, v.key, data[j].len, v.len) < 0 ));
          data[j-1] = v;
        }
      }
      continue;
    }
    swap((l+r)>>1,l+1);

    if (sncmp( data[l+1].key, data[r].key, data[l+1].len, data[r].len) > 0) swap(l+1,r);
    if (sncmp( data[l].key, data[r].key, data[l].len, data[r].len) > 0)   swap(l,r);
    if (sncmp( data[l+1].key, data[l].key, data[l+1].len, data[l].len) > 0) swap(l+1,l);
    i = l+1;
    j = r;

    v = data[l];

    while(1)
    {
      do ++i; while (sncmp( data[i].key, v.key, data[i].len, v.len) < 0);
      do --j; while (sncmp( data[j].key, v.key, data[j].len, v.len) > 0);
      if (j<i) break;
      swap(i,j);

      do ++i; while (sncmp( data[i].key, v.key, data[i].len, v.len) < 0);
      do --j; while (sncmp( data[j].key, v.key, data[j].len, v.len) > 0);
      if (j<i) break;
      swap(i,j);

      do ++i; while (sncmp( data[i].key, v.key, data[i].len, v.len) < 0);
      do --j; while (sncmp( data[j].key, v.key, data[j].len, v.len) > 0);
      if (j<i) break;
      swap(i,j);

      do ++i; while (sncmp( data[i].key, v.key, data[i].len, v.len) < 0);
      do --j; while (sncmp( data[j].key, v.key, data[j].len, v.len) > 0);
      if (j<i) break;
      swap(i,j);

      do ++i; while (sncmp( data[i].key, v.key, data[i].len, v.len) < 0);
      do --j; while (sncmp( data[j].key, v.key, data[j].len, v.len) > 0);
      if (j<i) break;
      swap(i,j);

      do ++i; while (sncmp( data[i].key, v.key, data[i].len, v.len) < 0);
      do --j; while (sncmp( data[j].key, v.key, data[j].len, v.len) > 0);
      if (j<i) break;
      swap(i,j);

      do ++i; while (sncmp( data[i].key, v.key, data[i].len, v.len) < 0);
      do --j; while (sncmp( data[j].key, v.key, data[j].len, v.len) > 0);
      if (j<i) break;
      swap(i,j);

      do ++i; while (sncmp( data[i].key, v.key, data[i].len, v.len) < 0);
      do --j; while (sncmp( data[j].key, v.key, data[j].len, v.len) > 0);
      if (j<i) break;
      swap(i,j);
    }
    swap(l,j);

    if ((j-l) < (r-i+1)) {
      Push(i-1,r);
      Push(l,j);
    } else {
      Push(l,j);
      Push(i-1,r);
    }
  }
}

