/*
 * buddy.c
 * Buddy memory allocator.
 *
 */

#include	"results.h"
#include	"general.h"
#include	"lock.h"
#include	"queue.h"
#include	"sync.h"
#include	"memalloc.h"
#include	"init.h"
#include	"arch_init.h"

/*
 * Flags and counts read by meminit() (under WITH_MONITORING) to decide
 * how many memory pools to create.
 */
extern shared int	monitoring_enabled;
extern shared int 	steering_enabled;
extern shared int 	adv_steer_enabled;
extern shared int       number_of_lms;
/* Number of bytes of dynamic memory meminit() reserves for each pool. */
extern shared long	dynamic_memory_per_processor;
/* Index at which memory_alloc() starts its N_ANYWHERE pool scan. */
volatile	private	int	next_proc;
/* Table of per-node free-list structures; allocated and filled by meminit(). */
extern	mem_free_list_t	*memory_pool;
/* Points at the processor count used to bound pool scans in this file. */
extern	int	*num_of_procs;

/* memcpy() is used by memory_realloc() to move a block's contents. */
extern void *memcpy();
/* child_exit() is called by memory_realloc() when handed a non-allocated block. */
extern void child_exit ARGS((int));

/*
 * ddequeue
 * Unlink and return the element that follows the list head `dqueue`.
 * No emptiness check is made: when the head is linked only to itself,
 * the head itself is returned.
 */
bucket_t
ddequeue(dqueue)
bucket_t	dqueue;
{
    bucket_t	first = dqueue->next;

    dremove(first);
    return first;
}

/*
 * dremove
 * Unlink `dq` from the doubly-linked list it is on by joining its two
 * neighbours to each other.  The link fields of `dq` itself are left
 * untouched.
 */
void
dremove(dq)
bucket_t	dq;
{
    bucket_t	before = dq->prev;
    bucket_t	after  = dq->next;

    before->next = after;
    after->prev  = before;
}

/*
 * init_dqueue
 * Make `dq` an empty circular list: both links point back at `dq`.
 */
void
init_dqueue(dq)
bucket_t	dq;
{
    dq->prev = dq;
    dq->next = dq;
}

/*
 * denqueue
 * Insert `dqitem` immediately after the list head `dqueue`, i.e. at the
 * front of the list.
 */
void
denqueue(dqueue, dqitem)
bucket_t	dqueue,dqitem;
{
    bucket_t	old_first = dqueue->next;

    /* Hook the new item in front of the previous first element ... */
    dqitem->next    = old_first;
    old_first->prev = dqitem;
    /* ... and behind the head. */
    dqueue->next    = dqitem;
    dqitem->prev    = dqueue;
}


RESULT
memory_alloc(ptr, size, node)
memory_t *ptr;
int	size,node;
{
    int			tmp,chunk,mem_node = -1,avail_chunk,i;
    mem_free_list_t	mflist = NULL;
    unsigned	long	mem_block,chsize;
    mem_header_t	mheader;
    bucket_t		tbucket, split_buddy;

    if((tmp = (size & 0x03)) != 0)
      size += 4-tmp;	/* Align to long	*/
    size += sizeof(struct mem_header);
    tmp = MIN_CHUNK_SIZE;
    chunk = 0;
    while (tmp < size){
	tmp <<= 1;
	chunk++;
    }
    DBG2("Min chsiz is %d, size is %d\n",MIN_CHUNK_SIZE, size);

    /*
     * We have to allocate a block of 2**(chunk+MIN_CHUNK) bytes,
     * which corresponds to the bucket "chunk"
     */
    if(chunk > config.memory_exponent - 4) return(T_TOO_BIG_MEMORY_MODULE);

    if(node == N_ANYWHERE){
	for(i=0, tmp=next_proc; i< *num_of_procs; i++){
	    if(memory_pool[tmp]->biggest_chunk >= chunk){/* We can't be sure*/
		mflist = memory_pool[tmp];		 /* until we lock it*/
		internal_mutex_lock(&mflist->memlock);
		if(mflist->biggest_chunk >= chunk){
		    if (tmp == next_proc && ++next_proc == *num_of_procs)
		      next_proc = 0;
		    mem_node = tmp;
		    break;
		} else {
		    internal_mutex_unlock(&mflist->memlock);
		}
	    }
	    DBG3("chunk is %d, mem[%d].chunk is %d\n", 
		 chunk, tmp, memory_pool[tmp]->biggest_chunk);
	    
	    if (++tmp == *num_of_procs)
	      tmp = 0;
	}
	if(i == *num_of_procs)	/* No such big memory module found	*/
	  return(T_NOMEMORY);
    } else {
	if(node == N_CURRENT || node >= *num_of_procs || node <= N_LESS)
	  node = virtual_processor();
	mflist = memory_pool[node];
	internal_mutex_lock(&mflist->memlock);
	if(mflist->biggest_chunk < chunk){
	    internal_mutex_unlock(&mflist->memlock);
	    return(T_NOMEMORY);
	}
	mem_node = node;
    }
    /*
     * At this point, mem_node is the node on which the memory is going
     * to be allocated and mflist points to the memory structure for this 
     * node. IT IS LOCKED
     */
    DBG2("mem_node is %02x, mflist is %lx\n",mem_node,(long)mflist);

    tbucket = &mflist->buckets[0];
    for( avail_chunk = chunk; 
	tbucket[avail_chunk].next == &tbucket[avail_chunk];
	avail_chunk++) {
	;
    }

    mem_block = (unsigned long) ddequeue(&tbucket[avail_chunk]);
    chsize = 1 << (avail_chunk + MIN_CHUNK);
    DBG4("avail_chunk is %d, chunk is %d, mem_block is %lx, chsize %ld\n",
	 avail_chunk,chunk,(long)mem_block,chsize);

    if (avail_chunk != chunk)
      while (avail_chunk != chunk){
	  chsize >>= 1;
	  avail_chunk--;
	  split_buddy = (bucket_t)(mem_block + chsize);
	  split_buddy->chlen = avail_chunk;
	  DBG2("putting %lx in bucket %d\n",(long)split_buddy, avail_chunk);
	  denqueue(&tbucket[avail_chunk], split_buddy);
      }
    mflist->total_mem_free -= chsize;
    if(tbucket[mflist->biggest_chunk].next == &tbucket[mflist->biggest_chunk]){
	tmp = mflist->biggest_chunk;
	tmp--;
	while(tbucket[tmp].next == &tbucket[tmp] && tmp >= 0)
	  tmp--;
	mflist->biggest_chunk = tmp;
    }
    mheader = (mem_header_t)(mem_block);
    mheader->tag = M_USED;
    mheader->processor = mem_node;
    mheader->bucket_size = chunk;
    internal_mutex_unlock(&mflist->memlock);
    *ptr = (memory_t)(&mheader[1]);
    return(T_SUCCEED);
}

/*
 * memory_free
 * Return a block obtained from memory_alloc() to the pool recorded in its
 * header, merging it with its buddy for as long as the buddy is free and
 * of the same size.
 *
 * mptr - address previously handed out through memory_alloc()'s `ptr`;
 *        the block header is located sizeof(struct mem_header) bytes
 *        before it.
 *
 * No check is made here that the block is currently marked M_USED.
 */
void
memory_free(mptr)
memory_t	mptr;
{
    mem_header_t		mheadr,buddy;
    mem_free_list_t		mflist;
    unsigned 	long	chsize;
    short				chunk;

    mheadr = (mem_header_t)((long)mptr - sizeof(struct mem_header));
    mflist = memory_pool[mheadr->processor];
    internal_mutex_lock(&mflist->memlock);
    chunk = mheadr->bucket_size;
    /* Shift an unsigned long so that large exponents do not overflow int. */
    chsize = 1UL << (chunk + MIN_CHUNK);
    mflist->total_mem_free += chsize;
    DBG4("mheadr is %lx, mflist is %lx, chunk = %d, chsize = %ld\n",
	 (long)mheadr, (long)mflist, chunk, chsize);
    while(chunk < mflist->chunk_limit){
	DBG3("mheadr is %lx, base is %lx, AND is %lx\n",(long)mheadr,mflist->base,
	     (((long)mheadr - mflist->base) & chsize));

	/*
	 * The chsize bit of the block's offset from the pool base tells
	 * whether this block is the upper or the lower half of its pair;
	 * the buddy is the other half.
	 */
	if(((long)mheadr - mflist->base) & chsize) {
	    buddy = (mem_header_t)((long)mheadr - chsize);
	} else {
	    buddy = (mem_header_t)((long)mheadr + chsize);
	}
	DBG4( "mheadr=%lx, buddy=%lx, chunk=%d, chsize=%ld\n",
	     (long)mheadr, (long)buddy, chunk, chsize);
	DBG1(" buddy_len=%d\n",((bucket_t)buddy)->chlen);

	/* Merge only with a buddy that is free and exactly our size. */
	if((buddy->tag & 0x01) == M_FREE && ((bucket_t)buddy)->chlen == chunk){
	    chsize <<= 1;
	    dremove((bucket_t)buddy);
	    chunk++;
	    /* The merged block starts at the lower of the two addresses. */
	    if((unsigned long)buddy < (unsigned long)mheadr)
	      mheadr = buddy;
	} else {
	    break;
	}
    }
    if(chunk > mflist->biggest_chunk)
      mflist->biggest_chunk = chunk;
    ((bucket_t)mheadr)->chlen = chunk;
    denqueue(&mflist->buckets[chunk], (bucket_t) mheadr);
    /*
     * This automatically will satisfy the "tag & 0x01" condition, since
     * all pointers point to even addresses and on the tag a pointer is
     * going to be stored.
     */
    internal_mutex_unlock(&mflist->memlock);
}

/*
 * memory_realloc
 * Make sure the block at *ptr can hold `size` bytes.
 *
 * ptr  - in/out: address of the caller's block pointer.  It is replaced
 *        by the new block's address when the data has to be moved and is
 *        left untouched otherwise (including on failure).
 * size - new requested size in bytes.
 *
 * If the existing chunk is already large enough nothing is done.
 * Otherwise a new block is allocated on the same node as the old one, the
 * old block's whole usable area is copied over and the old block is freed.
 *
 * Returns T_SUCCEED, or the failure code of memory_alloc() when a larger
 * block cannot be obtained.  If the block is not marked M_USED a
 * diagnostic is written to stderr and child_exit(1) is called.
 */
RESULT
memory_realloc(ptr, size)
memory_t	*ptr;
int			size;
{
    int			tmp;
    int			request = size;	/* caller's size, before padding */
    memory_t		mptr = *ptr;
    memory_t		new_mptr;
    mem_header_t	mheadr;
    unsigned 	long	chsize;
    short		chunk;
    RESULT		result;

    mheadr = (mem_header_t)((long)mptr - sizeof(struct mem_header));
    if (mheadr->tag != M_USED) {
	/* Report the block itself, not the address of the caller's pointer. */
	fprintf(stderr, 
		"Attempt to realloc an non-alloc'd block %lx, size %d\n",
		(long) mptr, size);
	fprintf(stderr, "Unable to perform realloc.  Program exitting.\n");
	child_exit(1);
    }
    chunk = mheadr->bucket_size;
    /* Shift an unsigned long so that large exponents do not overflow int. */
    chsize = 1UL << (chunk + MIN_CHUNK);

    /* Work out the space the request needs, the same way memory_alloc() does. */
    if((tmp = (size & 0x03)) != 0)
      size += 4-tmp;	/* Align to long	*/
    size += sizeof(struct mem_header);

    /*
    **  In general, the chsize of the old block is going to be the smallest
    **  power of two that is greater than the original size.
    **  This means that this test is often true for large objects growing 
    **  just a little bit...
    */
    if (size <= chsize)
      return(T_SUCCEED);

    /*
    **  else we must get more memory...
    **  A more efficient approach would be to see if the next
    **  block is free for the grabbing, but we take the low energy
    **  approach, allocating more memory and doing a memcpy.
    **
    **  memory_alloc() does its own alignment and adds the header itself,
    **  so it is given the caller's size rather than the padded one;
    **  passing the padded size would count the header twice and could
    **  select a chunk twice as large as necessary.
    */
    result = memory_alloc(&new_mptr, request, mheadr->processor);
    if (result != T_SUCCEED)
      return(result);

    /* Copy the old block's whole usable area; the new block is larger. */
    memcpy(new_mptr, mptr, chsize - sizeof(struct mem_header));
    memory_free(mptr);
    *ptr = new_mptr;
    return(T_SUCCEED);
}

/*
 * memory_node
 * Return the `processor` field stored in the header that sits
 * sizeof(struct mem_header) bytes before `mptr`.
 */
int
memory_node(mptr)
memory_t	mptr;
{
    return ((mem_header_t)((long)mptr - sizeof(struct mem_header)))->processor;
}

/*
 * deb_buddy_mem
 * Dump the memory structure of one pool, or of pools 0 .. *num_of_procs-1
 * when `procno` is -1, by calling deb_mem_struct() for each.
 */
void
deb_buddy_mem(procno)
int		procno;
{
    int		node;

    if (procno != -1) {
	deb_mem_struct(procno);
	return;
    }

    for (node = 0; node < *num_of_procs; node++)
	deb_mem_struct(node);
}

/*
 * deb_mem_struct
 * Print the state of pool `procno` on standard output: the address of its
 * free-list structure, its lock word, free byte count, base address and
 * biggest chunk, followed by the addresses queued on every bucket
 * (WRAP_AROUND addresses per output line).
 *
 * The pool's lock is held while the fields and the bucket lists are read.
 */
void
deb_mem_struct(procno)
int		procno;
{
#define		WRAP_AROUND	8
    mem_free_list_t	mflist;
    int			i,counter;
    bucket_t		mptr,mfirst;

    mflist = memory_pool[procno];
#ifdef	MACH
    printf("Processor %02x memory struct (%lx (ph %lx)) is:\n",
	   procno, (long)mflist, getphysaddr(mflist));
#else
    printf("Processor %02x memory struct (%lx) is:\n",procno,(long)mflist);
#endif
    internal_mutex_lock(&mflist->memlock);

#ifdef	MACH
    printf(
      "lock=%04x, tot_mem_free= %ld, base = %lx (ph %lx), biggest chunk = %d\n",
	   mflist->memlock, mflist->total_mem_free, mflist->base, 
	   getphysaddr(mflist->base), mflist->biggest_chunk);
#else
    printf("lock=%04x, tot_mem_free= %ld, base = %lx, biggest chunk = %d\n",
	   (unsigned) mflist->memlock, mflist->total_mem_free, 
	   mflist->base, mflist->biggest_chunk);
#endif

    /*
     * Bucket indices run from 0 to chunk_limit inclusive: meminit()
     * initialises that whole range and queues the initial block on
     * buckets[chunk_limit].  The top bucket must therefore be dumped too.
     */
    for (i=0; i<=mflist->chunk_limit; i++){
	printf("%2d--> ",i);
	counter = 0;
	mfirst = &mflist->buckets[i];
	/* Walk the circular list until we are back at the bucket head. */
	for (mptr = mfirst->next; mptr != mfirst; mptr = mptr->next){
	    printf("%lx ",(long)mptr);
	    if( ++counter == WRAP_AROUND){
		counter = 0;
		printf("\n      ");
	    }
	}
	printf("\n");
    }
    internal_mutex_unlock(&mflist->memlock);
}

RESULT
meminit()
{
   mem_free_list_t	mtmp;
   int			tmp, j, size, node, aligned_header;
   int count;
   int chunk_limit = config.memory_exponent - MIN_CHUNK;

#ifdef WITH_MONITORING
   if (monitoring_enabled) {
       count = *num_of_procs + number_of_lms;
       if (steering_enabled)	/* reg steering */
	 {
	   count++;
	 }
       if (adv_steer_enabled)	/* adv steering */
	 {
	   count++;
	 }
   } else {
       count = *num_of_procs;
   }
#else 
   count = *num_of_procs;
#endif
   size		= count * sizeof(mem_free_list_t);
   memory_pool	= (mem_free_list_t *)allocate_and_share(size,-1);
   size = sizeof(struct mem_free_list) + chunk_limit * sizeof(struct bucket);
   tmp = size & 0x03;		/* Make sure that size is multiple of 4 */
   if(tmp != 0)			/* Becasue we want the memory_alloc()	*/
      size += 4-tmp;		/* to return long alligned memory (attadd32)*/
   aligned_header = size;
   size += dynamic_memory_per_processor; /* This is multiple of 4 */
   size += 8;  /* fudge so we can shift align to %8=4 later */
   for (node = 0; node < count; node++){
      mtmp =(mem_free_list_t)allocate_and_share(size,node);
      memory_pool[node] = mtmp;
      mtmp->base = (unsigned long )((unsigned long)mtmp + aligned_header);

      /*
      ** We want memory_alloc() to return items aligned on 8 byte boundaries.
      ** The buddy allocator breaks the memory it's given into 16 byte
      ** (or larger) blocks starting at it's base address.  If a block is 
      ** allocated, the first 4 bytes are reserved as a header and the 
      ** remainder is returned.  Thus if we want to return stuff that is 8 byte
      ** aligned, the base should be an 8 byte boundary minus 4.
      */
      tmp = mtmp->base & 0x07;
      if (tmp != 4) {
	  DBG2("Realigning, tmp was %d, base was %lx\n", tmp, mtmp->base);
	  mtmp->base += (12-tmp)%8;
      }
      DBG3("Node %d, memory pool from %lx to %lx\n", node, (long)mtmp->base, 
	   (long)mtmp+size-1);
      LOCK_INIT(mtmp->memlock);
      mtmp->total_mem_free = dynamic_memory_per_processor;
      mtmp->chunk_limit = mtmp->biggest_chunk = chunk_limit;

      for(j=0; j<=chunk_limit; j++) {
	  init_dqueue(&mtmp->buckets[j]);
      }
      denqueue(&mtmp->buckets[mtmp->biggest_chunk], (bucket_t) mtmp->base);
      ((bucket_t)(mtmp->base))->chlen = mtmp->biggest_chunk;
      next_proc = 0;
   }

   return(T_SUCCEED);
}
