/*
 * Linux-os.c. 
 * From FreeBSD-os.c
 * From osf1-os.c,v 1.1 94/03/27 15:30:51 hallgren Exp $
 *
 * OS-dependent routines.  This file (along with os.h) exports an
 * OS-independent interface to the operating system VM facilities.
 * Surprisingly, this interface looks a lot like the Mach interface
 * (but simpler in some places).  For some operating systems, a subset
 * of these functions will have to be emulated.
 *
 * This is the OSF1 version.  By Sean Hallgren.
 * Much hacked by Paul Werkowski
 * Morphed from the FreeBSD file by Peter Van Eynde (July 1996)
 * GENCGC support by Douglas Crosher, 1996, 1997.
 * Alpha support by Julian Dolby, 1999.
 *
 * $Header: /home/CVS-cmucl/src/lisp/Linux-os.c,v 1.12 2000/10/24 13:32:30 dtc Exp $
 *
 */

#include <stdio.h>
#include <sys/param.h>
#include <sys/file.h>
#include <errno.h>
#include "./signal.h"
#include "os.h"
#include "arch.h"
#include "globals.h"
#include "interrupt.h"
#include "lispregs.h"
#include "internals.h"
#include <sys/socket.h>
#include <sys/utsname.h>

#include <sys/types.h>
#include <signal.h>
/* #include <sys/sysinfo.h> */
#include <sys/time.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <netdb.h>

#include "validate.h"
/* VM page size in bytes; assigned from getpagesize() in os_init(). */
size_t os_vm_page_size;

/*
 * Debug trace helper: when t is non-zero, calls fprintf with the
 * parenthesised argument list a, e.g.
 *
 *     DPRINTF(1, (stderr, "x=%d\n", x));
 *
 * The body is wrapped in do { } while (0) so that "DPRINTF(...);" is a
 * single statement and may stand unbraced in front of an else.
 */
#define DPRINTF(t,a) do { if (t) fprintf a; } while (0)

#if defined GENCGC
#include "gencgc.h"
#endif

/*
 * Only built for kernels >= 2.1.0 or __GNU_LIBRARY__ >= 6:
 * PVE_stub_errno holds a copy of errno, and update_errno() refreshes
 * that copy from the current value of errno.
 */
#if ((LINUX_VERSION_CODE >= linuxversion(2,1,0)) || (__GNU_LIBRARY__ >= 6))
int PVE_stub_errno;

void update_errno (void)
{
  PVE_stub_errno = errno;
}
#endif

/* Set by os_init(): non-zero when the kernel is taken to support mmap tuning. */
static int has_mmap_tuning = 0;


/*
 * One-time OS initialisation.
 *
 * Inspects the running kernel's release string to set has_mmap_tuning,
 * stores the result of getpagesize() in os_vm_page_size and, on i386,
 * programs the FPU control word.
 */
void os_init(void)
{
  struct utsname name;

  /* We need this for mmap */
  if (uname(&name) == -1)
    {
      /*
       * Never inspect an uninitialised release string.  An empty one
       * falls into the "older than 2" branch below, which only prints a
       * warning and carries on.
       */
      perror("uname");
      name.release[0] = '\0';
    }

  if (name.release[0] < '2')
    {
      printf("Linux version should be later than 2.0.0!\n");
      printf("Dazed and confused but trying to continue...\n");
      has_mmap_tuning=0;
    }
  else 
    {
      /*
       * The leading "1 ||" forces this test to succeed, so every
       * release that reaches here is treated as having mmap tuning and
       * the else arm below is currently unreachable.
       */
      if (1 || ((name.release[0]) > '2') ||
          (((name.release[0]) == '2') && ((name.release[2]) >= '1')))
        {
          DPRINTF(0,(stderr,"has mman tuning\n"));
          has_mmap_tuning=1;
        }
      else
        {
          printf("Linux version 2.2.X detected, use -lazy mode...\n");
          lazy_memory_allocation = 1;
        }
    }
  
  os_vm_page_size = getpagesize();

#ifdef i386
  setfpucw(0x1372|4|8|16|32); /* No interrupts */
#endif
}

#ifdef i386
/*
 * Read one general-purpose register out of an i386 signal context.
 *
 * offset selects the register: 0 eax, 2 ecx, 4 edx, 6 ebx, 8 esp,
 * 10 ebp, 12 esi, 14 edi.  Any other offset yields 0.
 * The context structure's tag differs between library/kernel versions,
 * hence the two alternative signatures.
 */
#if (LINUX_VERSION_CODE >= linuxversion(2,1,0)) || (__GNU_LIBRARY__ >= 6)
int sc_reg(struct sigcontext *c, int offset)
#else
int sc_reg(struct sigcontext_struct *c, int offset)
#endif
{
  int value = 0;

  switch (offset)
    {
    case  0: value = c->eax; break;
    case  2: value = c->ecx; break;
    case  4: value = c->edx; break;
    case  6: value = c->ebx; break;
    case  8: value = c->esp; break;
    case 10: value = c->ebp; break;
    case 12: value = c->esi; break;
    case 14: value = c->edi; break;
    default: break;
    }

  return value;
}
#endif

/*
 * Context-save hook.  This implementation has an empty body and does
 * nothing.
 */
void os_save_context(void)
{
  /*
   * Called from interrupt handlers so C stuff knows things set in Lisp.
   */
}

/*
 * Context-restore hook.  This implementation has an empty body and does
 * nothing.
 */
void os_set_context(void)
{
}

/*
 * Range test: non-zero when address a lies in the half-open interval
 * [sbeg, sbeg + slen), zero otherwise.  All comparisons are done on
 * char pointers, so slen is a byte count.
 */
static boolean in_range_p(os_vm_address_t a, lispobj sbeg, size_t slen)
{
  char *first = (char *) sbeg;
  char *probe = (char *) a;

  if (probe < first)
    return 0;

  return probe < first + slen;
}

/*
 * Map len bytes of memory at *addr with the given mmap flags and
 * OS_VM_PROT_ALL protection (no backing file: fd -1, offset 0).
 *
 * When lazy allocation is enabled and *addr lies in dynamic space, the
 * pages covered by the request are first flagged with
 * PAGE_LAZY_ALLOCATE_MASK in page_table, provided the table exists and
 * the start page is known.
 *
 * On return *addr holds whatever mmap returned.  Returns 0 on success.
 * Returns 1, after printing advice and the mmap error on stderr, when
 * mmap fails or when a non-NULL requested address was not honoured.
 */
int do_mmap(os_vm_address_t *addr, os_vm_size_t len, int flags)
{
  /* We _must_ have the memory where we want it... */
  os_vm_address_t old_addr = *addr;
  int mmap_errno;

  DPRINTF(0,(stderr,"do_mmap: %p->%p len: 0x%x 0x%x\n",*addr,(*addr)+len, len, flags));
  
  if ((lazy_memory_allocation == 1) &&
      in_range_p((os_vm_address_t) *addr, 
		 DYNAMIC_0_SPACE_START, dynamic_space_size))
    {
      int page;
      int start_page = find_page_index( (void *) *addr);
      int number_of_pages = len / 4096;

      DPRINTF(0, (stderr, "do_mmap in dynamic space: %p len: 0x%x\n", *addr, len));
      DPRINTF(0, (stderr, "start-page: 0x%x number-of-pages: 0x%x\n", start_page, number_of_pages));

      /* Mark the pages as allocated, unless it is too early to do so. */
      if (page_table == NULL)
        {
          DPRINTF(0,(stderr,"\npage_table is NULL! Too early to do anything\n"));
        }
      else if (start_page == -1)
        {
          DPRINTF(0,(stderr, "Start_page is -1! XXX YYY XXX\n"));
        }
      else
        {
          for(page = 0; page < number_of_pages; page++)
            page_table[page+ start_page].flags 
              |= PAGE_LAZY_ALLOCATE_MASK;
        }
    }
  
  *addr = mmap(*addr, len, OS_VM_PROT_ALL, flags, -1, 0);
  /* Remember why mmap failed: the fprintf below may overwrite errno. */
  mmap_errno = errno;

  if ((old_addr != NULL && *addr != old_addr) || 
      *addr == (os_vm_address_t) -1)
    {
      /*
       * The message is built from adjacent literals; a string literal
       * may not contain raw newlines.  The emitted text is unchanged.
       */
      fprintf(stderr,
              "Error in allocating memory\n"
              "\n"
              "CMUCL asks the kernel to make a lot of memory potentially available.\n"
              "Truely a lot of memory, actually it asks for all memory a process\n"
              "can allocate. \n"
              "\n"
              "Note that due to the high demands placed on the kernel,\n"
              "I am only sure CMUCL works with 2.2.X or higher kernels.\n"
              "\n"
              "Now you have two choices:\n"
              " - Accept this and lift the kernel and other limits by doing:\n"
              " as root:\n"
              " echo 1 > /proc/sys/vm/overcommit_memory\n"
              " as the user:\n"
              " ulimit -d unlimited \n"
              " ulimit -v unlimited \n"
              " ulimit -m unlimited \n"
              "\n"
              "This might also be caused by using 'enterprise' or 'big memory' kernels\n"
              "that restrict the amount of memory a process can use. Try to use the'\n"
              "-dynamic-space-size flag to limit the amount of memory we reserve.\n");
      /*
       * Advice that is deliberately not printed:
       *  - Try to use the lazy-allocation routines. They are pretty
       *    experimental and might interact badly with some kernels. To
       *    do this start lisp with the "-lazy" flag, like:
       *      lisp -lazy
       */
      errno = mmap_errno;
      perror("mmap");
      return 1;
    }
  return 0;
}



/*
 * Make len bytes at addr usable by mapping anonymous private memory
 * there through do_mmap().  Returns the mapped address; the process
 * exits with status 42 if do_mmap() reports failure.
 *
 * With lazy allocation enabled, requests that start exactly at the
 * read-only, static or binding-stack space, or that fall inside dynamic
 * space, are not mapped at all and addr is returned unchanged.  A
 * request at the control-stack start is reduced to a single 4096-byte
 * MAP_GROWSDOWN page at the top of the requested range.  Without lazy
 * allocation MAP_NORESERVE is added instead.
 */
os_vm_address_t os_validate(os_vm_address_t addr, os_vm_size_t len)
{
  int flags = MAP_PRIVATE | MAP_ANONYMOUS;

  if (lazy_memory_allocation == 1)
    {
      const unsigned long where = (unsigned long) addr;

      if (where == 0L)
        {
          DPRINTF(0,(stderr,"\n\nI was asked to validate NULL? addr: %p len: %x", addr, len));
        }
      else if (where == READ_ONLY_SPACE_START)
        {
          DPRINTF(0,(stderr,
                     "\n\nIt's readonly space... ignoring request for memory\n"));
          return addr;
        }
      else if (where == STATIC_SPACE_START)
        {
          DPRINTF(0,(stderr,
                     "\n\nIt's static space... ignoring request for memory\n"));
          return addr;
        }
      else if (where == BINDING_STACK_START)
        {
          DPRINTF(0,(stderr,
                     "\n\nIt's the binding stack... ignoring request for memory\n"));
          return addr;
        }
      else if (where == CONTROL_STACK_START)
        {
          DPRINTF(0,(stderr,"\n\nIt's the control stack %p->%p %x\n",addr,
                     (os_vm_address_t) (((unsigned long) addr + len)), flags));
          /* Map only the topmost page and let it grow downwards. */
          flags |= MAP_GROWSDOWN;
          addr = (os_vm_address_t) (((unsigned long) addr + len - 4096) & ~0xFFF);
          len = 4096;
        }

      if (in_range_p(addr, DYNAMIC_0_SPACE_START, dynamic_space_size))
        {
          DPRINTF(0,(stderr,
                     "\n\nIt's  in dynamic 0 space...ignoring request for memory\n"));
          return addr;
        }
    }
  else
    {
      flags |= MAP_NORESERVE;
    }

  /* Try to avoid turning on overcommit globally */

  /* A caller-chosen address must be honoured exactly. */
  flags |= addr ? MAP_FIXED : MAP_VARIABLE;

  DPRINTF(0, (stderr, "os_validate %p ->  %p 0x%x => \n", addr, addr+len, flags));

  if (do_mmap(&addr, len, flags))
    {
      DPRINTF(0, (stderr, "mmap failed!\n"));
      exit(42);

      return NULL;
    }

  DPRINTF(0, (stderr, "mmap worked, returned: %p\n", addr));
  return addr;
}

/*
 * Unmap len bytes at addr.  The process exits with status 42 if munmap
 * fails.
 *
 * With lazy allocation enabled and addr inside dynamic space, the
 * PAGE_LAZY_ALLOCATE_MASK flag is first cleared on every 4096-byte page
 * of the range, provided the start page is known and page_table exists.
 */
void os_invalidate(os_vm_address_t addr, os_vm_size_t len)
{
  DPRINTF(0, (stderr, "os_invalidate %p -> %p\n", addr, addr+len));

  if (lazy_memory_allocation == 1
      && in_range_p(addr, DYNAMIC_0_SPACE_START, dynamic_space_size))
    {
      int first = find_page_index((void *) addr);
      int count = len / 4096;
      int i;

      DPRINTF(0, (stderr, "\n\nos_invalidate in dynamic space: %p len: 0x%x\n", addr, len));
      DPRINTF(0, (stderr, "start-page: 0x%x number-of-pages: 0x%x\n", first, count));

      if (first == -1)
        {
          DPRINTF(0,(stderr, "Start_page is -1!\n"));
        }
      else if (page_table == NULL)
        {
          fprintf(stderr,"\npage_table is NULL!\n");
        }
      else
        {
          /* Forget that these pages were ever mapped in. */
          for (i = first; i < first + count; i++)
            page_table[i].flags &= ~PAGE_LAZY_ALLOCATE_MASK;
        }
    }

  if (munmap(addr, len) == -1)
    {
      perror("munmap");
      exit(42);
    }
}

/*
 * Map len bytes of file fd, starting at file offset offset, privately
 * at the fixed address addr with OS_VM_PROT_ALL protection.  Returns
 * the mapped address; the process exits with status 42 if mmap fails.
 *
 * With lazy allocation enabled and addr inside dynamic space, len is
 * first rounded down to a multiple of 4096 and the covered pages are
 * flagged PAGE_LAZY_ALLOCATE_MASK.  In that case an unknown start page
 * exits with status 42 and a missing page_table exits with status 2.
 */
os_vm_address_t os_map(int fd, int offset, os_vm_address_t addr,
		       os_vm_size_t len)
{
  DPRINTF(0,(stderr,"os map: fd: %i offset: 0x%x addr: %p -> %p\n", fd, offset, addr, addr+len));

  if (lazy_memory_allocation == 1
      && in_range_p(addr, DYNAMIC_0_SPACE_START, dynamic_space_size))
    {
      int first = find_page_index((void *) addr);
      int count = len / 4096;
      int i;

      len = count * 4096;
      DPRINTF(0, (stderr, "os_map in dynamic space: %p for 0x%x\n", addr, len));
      DPRINTF(0, (stderr, "start-page: 0x%x number-of-pages: 0x%x\n", first, count));

      if (first == -1)
        {
          DPRINTF(0,(stderr, "Start_page is -1!\n"));
          exit(42);
        }

      if (page_table == NULL)
        {
          fprintf(stderr,"\npage_table is NULL! XXX\n");
          exit(2);
        }

      /* Record that these pages are about to be backed by a mapping. */
      for (i = first; i < first + count; i++)
        page_table[i].flags |= PAGE_LAZY_ALLOCATE_MASK;
    }

  addr = mmap(addr, len, OS_VM_PROT_ALL,
              MAP_PRIVATE | MAP_FILE | MAP_FIXED,
              fd, (off_t) offset);

  DPRINTF(0,(stderr,"osmap: 0x%lx -> 0x%lx\n",
             (unsigned long) addr,
             (unsigned long) (addr+len)));

  if (addr == (os_vm_address_t) -1)
    {
      perror("mmap");
      exit(42);
    }

  return addr;
}

/*
 * Instruction-cache flush hook.  The body is empty in this file, so the
 * call has no effect and both arguments are ignored.
 */
void os_flush_icache(os_vm_address_t address, os_vm_size_t length)
{
}

/*
 * Change the protection of length bytes at address to prot via
 * mprotect.
 *
 * A failing mprotect is silently ignored when lazy allocation is on,
 * page_table exists, address is in dynamic space and the page is not
 * flagged as lazily allocated (i.e. it was never mapped in).  In that
 * same situation an unknown page index exits with status 42.  Every
 * other mprotect failure is reported with perror and exits with
 * status 15.
 */
void os_protect(os_vm_address_t address, os_vm_size_t length,
		os_vm_prot_t prot)
{
  /* make certain the page is already mapped! */
  int ret;

  DPRINTF(0,(stderr,"Os_protect addr: 0x%x length: 0x%x prot: 0x%x\n",
             address, length, prot));
  ret = mprotect(address, length, prot);
  DPRINTF(0,(stderr,"resulted in 0x%x\n",ret));

  if (ret != -1)
    return;

  if (lazy_memory_allocation == 1
      && in_range_p(address, DYNAMIC_0_SPACE_START, dynamic_space_size)
      && page_table != NULL)
    {
      int page_index = find_page_index((void *) address);

      if (page_index == -1)
        {
          DPRINTF(0,(stderr, "page_index is -1!\n"));
          exit(42);
        }

      /* Nothing is mapped there yet, so there is nothing to protect. */
      if (PAGE_LAZY_ALLOCATE(page_index) == 0)
        {
          DPRINTF(0,(stderr,"\n\nignoring mprotect of %p (index: 0x%x) NON-EXISTING lenght 0x%x to %p prot: 0x%x\n",
                     address, page_index, length , address+length,  prot));
          return;
        }
    }

  DPRINTF(0, (stderr, "\n\nos protect at %p length 0x%x prot: %x resulted in %x\n", address, length, prot, ret));
  DPRINTF(0, (stderr, "XXX: lazy: %x == 1  in-range: %x index: %x == 1  lazy: %x == 0\n",lazy_memory_allocation,
              in_range_p(address, DYNAMIC_0_SPACE_START, dynamic_space_size),
              find_page_index((void *) address),
              PAGE_LAZY_ALLOCATE(find_page_index((void *) address))));
  DPRINTF(0, (stderr, "os_protect in dynamic space: %p\n", address));
  DPRINTF(0, (stderr, "length: 0x%x prot: 0x%x\n",length,prot));

  perror("mprotect");
  exit(15);
}




/*
 * Return TRUE when addr falls inside one of the Lisp spaces checked
 * below (read-only, static, dynamic 0, dynamic 1, control stack or
 * binding stack), FALSE otherwise.  addr is tested exactly as given.
 */
boolean valid_addr(os_vm_address_t addr)
{
  if (   in_range_p(addr, READ_ONLY_SPACE_START, READ_ONLY_SPACE_SIZE)
         || in_range_p(addr, STATIC_SPACE_START   , STATIC_SPACE_SIZE   )
         || in_range_p(addr, DYNAMIC_0_SPACE_START, DYNAMIC_SPACE_SIZE  )
         || in_range_p(addr, DYNAMIC_1_SPACE_START, DYNAMIC_SPACE_SIZE  )
         || in_range_p(addr, CONTROL_STACK_START  , CONTROL_STACK_SIZE  )
         || in_range_p(addr, BINDING_STACK_START  , BINDING_STACK_SIZE  ))
    return TRUE;

  return FALSE;
}



#if defined GENCGC
/*
 * SIGSEGV handler for GENCGC builds.
 *
 * The faulting address is read from the cr2 field of the signal context
 * and looked up with find_page_index().
 *
 * If the lookup yields a page (index != -1):
 *   - with lazy allocation on and the page not yet flagged as lazily
 *     allocated, a 4K anonymous page is mapped at the page-aligned fault
 *     address (exit status 42 on failure), the page is flagged, it is
 *     re-protected read/execute if it is marked write protected, and
 *     the handler returns;
 *   - otherwise a diagnostic is printed and interrupt_handle_now() is
 *     called when the page is not marked write protected, after which
 *     the page is made fully accessible, its write-protected flag is
 *     cleared and its write-protect-cleared flag is set.
 *
 * If the lookup yields -1 the signal is passed to
 * interrupt_handle_now().
 */
void sigsegv_handler(HANDLER_ARGS)
{
  GET_CONTEXT

    /* NOTE(review): the fault address is held in an int -- assumes a 32-bit target; confirm. */
    int  fault_addr = ((struct sigcontext_struct *) (&contextstruct))->cr2;
  int  page_index = find_page_index((void *) fault_addr);

  /* First we see if it is because of the lazy-allocation magic... */
  DPRINTF(0,(stderr,"\n\nsigsegv handler: fault addr: 0x%x page index: 0x%x\n",fault_addr,page_index));

  /* Disabled code: on-demand mapping for the read-only, static, binding-stack and control-stack spaces. */
#if 0  
  if ((lazy_memory_allocation == 1))
    {
      if (in_range_p((os_vm_address_t) fault_addr, 
		     READ_ONLY_SPACE_START, READ_ONLY_SPACE_SIZE))
	{
	  DPRINTF(0,(stderr,"mapping read-only page in at %x\n",fault_addr));
	  DPRINTF(0,(stderr,"from %x to %x\n", (fault_addr & (~ 0xFFF)), (fault_addr & (~ 0xFFF)) + 4096));

	  fault_addr &= ~0xFFF;
	  if (do_mmap((os_vm_address_t *) &fault_addr, 4 * 1024 * 1024, //4096, 
		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED))
	    perror("map failed");

	  return;
	}
      if (in_range_p((os_vm_address_t) fault_addr, 
                     STATIC_SPACE_START, STATIC_SPACE_SIZE))
        {
          DPRINTF(0,(stderr,"mapping static page in at %x\n",fault_addr));
          DPRINTF(0,(stderr,"from %x to %x\n", (fault_addr & (~ 0xFFF)), (fault_addr & (~ 0xFFF)) + 4096));
          fault_addr &=  ~0xFFF;
          if (do_mmap((os_vm_address_t *) &fault_addr,  4 * 1024 * 1024, //4096, 
                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED))
            perror("map failed");	  

          return;
        }
      if (in_range_p((os_vm_address_t) fault_addr, 
                     BINDING_STACK_START, BINDING_STACK_SIZE))
        {
          DPRINTF(0,(stderr,"mapping binding stack page in at %x\n",
                     fault_addr));
          DPRINTF(0,(stderr,"from %x to %x\n", (fault_addr & (~ 0xFFF)), (fault_addr & (~ 0xFFF)) + 4096));
          fault_addr &=  ~0xFFF;
          if (do_mmap((os_vm_address_t *) &fault_addr,  4 * 1024 * 1024, //4096, 
                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED))
            perror("map failed");	  

          return;
        }
      if (in_range_p((os_vm_address_t) fault_addr, 
                     CONTROL_STACK_START, CONTROL_STACK_SIZE))
        {
          DPRINTF(0,(stderr,"mapping control stack page in at %x\n",
                     fault_addr));
          DPRINTF(0,(stderr,"from %x to %x\n", (fault_addr & (~ 0xFFF)), (fault_addr & (~ 0xFFF)) + 4096));
          fault_addr &=  ~0xFFF;
          if (do_mmap((os_vm_address_t *) &fault_addr,  4 * 1024 * 1024, //4096, 
                      MAP_GROWSDOWN | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED))
            perror("map failed");	  

          return;
        }
    }
#endif

  // Check if the fault is within the dynamic space. 
  if (page_index != -1) {
    // Un-protect the page 
    
    /* Lazy allocation: the page has not been mapped in yet, so map it now. */
    if ((lazy_memory_allocation == 1) &&
        (PAGE_LAZY_ALLOCATE(page_index) == 0))
      {
        int address;
        
        DPRINTF(0,(stderr,"\n\nmapping dynamic space page in at 0x%x index: %i\n\n",
                   fault_addr,PAGE_WRITE_PROTECTED(page_index)));
        // system("cat /proc/$PPID/maps"); /* PVE */

        address = fault_addr;
        
        /* Round down to the start of the 4K page containing the fault. */
        address &= ~0xFFF;
        DPRINTF(0,(stderr,"\nMAPPING from 0x%x to 0x%x\n", address, address+4*1024));
        if (do_mmap((os_vm_address_t *) &address,  4 * 1024,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED))
          {
            perror("map failed");	  
            exit(42);
          }
        
        DPRINTF(0,(stderr,"\nGot memory at: 0x%x\n", address));
        page_table[page_index].flags |= PAGE_LAZY_ALLOCATE_MASK;

        /* The fresh mapping is fully accessible; re-apply write protection if the page table asks for it. */
        if (PAGE_WRITE_PROTECTED(page_index))
          {
            DPRINTF(0,(stderr,"\nWrite protecting page 0x%x after the fact\n",page_index));
            os_protect(page_address(page_index), 4096, OS_VM_PROT_READ | OS_VM_PROT_EXECUTE);
          }
        return;
      }
    else    
      {
        // The page should have been marked write protected 
        if (!PAGE_WRITE_PROTECTED(page_index))
          {
            fprintf(stderr, 
                    "*** Sigsegv in page not marked as write protected: fault: %x lazy? %i flags: %x lazy?%x\n",
                    fault_addr,lazy_memory_allocation,
                    page_table[page_index].flags,
                    page_table[page_index].flags & PAGE_LAZY_ALLOCATE_MASK);
            /* NOTE(review): there is no return here, so the unprotect code below still runs once interrupt_handle_now() returns -- confirm this is intended. */
            interrupt_handle_now(signal, contextstruct);
          }
      }

  /* Still inside the "page_index != -1" block: make the page fully accessible and record that protection was removed. */
  DPRINTF(0,(stderr,"Unprotecting page 0x%x\n", page_index));
  os_protect(page_address(page_index), 4096, OS_VM_PROT_ALL);
  page_table[page_index].flags &= ~PAGE_WRITE_PROTECTED_MASK;
  page_table[page_index].flags |= PAGE_WRITE_PROTECT_CLEARED_MASK;
  DPRINTF(0,(stderr,"Returning to try again...\n"));
  return;
  }
  
  /* No page was found for the fault address: trace the context (disabled) and hand the signal on. */
  DPRINTF(0,(stderr,"\nsigsegv fall through!: eip: 0x%lx fault: 0x%x\n",context->eip, fault_addr));
  DPRINTF(0,(stderr,"\neax 0x%lx ebx 0x%lx ecx 0x%lx edx %lx", context->eax, context->ebx, context->ecx, context->edx));
  DPRINTF(0,(stderr,"\ngs 0x%x fs 0x%x es 0x%x ds 0x%x cs %x ss 0x%x",context->gs,context->fs,context->es,context->ds,context->cs,context->ss));
  DPRINTF(0,(stderr,"\n__gsh 0x%x __fsh 0x%x __esh 0x%x __dsh 0x%x __csh 0x%x __ssh 0x%x",context->__gsh,context->__fsh,context->__esh,context->__dsh,context->__csh,context->__ssh));
  DPRINTF(0,(stderr,"\nedi 0x%lx esi 0x%lx ebp 0x%lx esp %lx eip 0x%lx",context->edi,context->esi,context->ebp,context->esp,context->eip));
  DPRINTF(0,(stderr,"\ntrapno 0x%lx err 0x%lx",context->trapno,context->err));  
  DPRINTF(0,(stderr,"\neflags 0x%lx esp_at_signal 0x%lx oldmask 0x%lx cr2 0x%lx",context->eflags,context->esp_at_signal,context->oldmask,context->cr2));
  
  interrupt_handle_now(signal, contextstruct);
}

#else
/*
 * SIGSEGV handler used when GENCGC is not compiled in.
 *
 * On i386 the signal is simply passed to interrupt_handle_now().
 *
 * On other targets the bad address is obtained from
 * arch_get_bad_addr() and then:
 *   - if bit 63 of the reg_ALLOC register is set, that bit is cleared
 *     and interrupt_handle_pending() is called;
 *   - if the address lies between the top of the control stack and the
 *     start of the binding stack, the control frame pointer is walked
 *     back until it holds a value inside the control stack and
 *     ldb_monitor() is entered;
 *   - otherwise interrupt_maybe_gc() gets the signal first and
 *     interrupt_handle_now() handles it if that returns zero.
 */
static void sigsegv_handler(HANDLER_ARGS)
{
  os_vm_address_t addr;

#ifdef i386
  GET_CONTEXT
#endif

    DPRINTF(0, (stderr, "sigsegv\n"));
#ifdef i386
  interrupt_handle_now(signal, contextstruct);
#else
#define CONTROL_STACK_TOP (((char*) CONTROL_STACK_START) + CONTROL_STACK_SIZE)

  addr = arch_get_bad_addr(signal,code,context);

  /*
   * The mask must be built in a type at least 64 bits wide: shifting a
   * plain int by 63 is undefined.  This assumes a 64-bit long, which is
   * what a 64-bit register image implies.
   */
  if (addr != NULL && context->sc_regs[reg_ALLOC] & (1UL << 63)) {
    context->sc_regs[reg_ALLOC] -= (1UL << 63);
    interrupt_handle_pending(context);
  } else if (addr > CONTROL_STACK_TOP && addr < BINDING_STACK_START) {
    DPRINTF(0,(stderr, "Possible stack overflow at 0x%08lX!\n", addr));
    /* try to fix control frame pointer */
    while (!(CONTROL_STACK_START <= *current_control_frame_pointer &&
	     *current_control_frame_pointer <= CONTROL_STACK_TOP))
      /* Step back by one lispobj; a cast expression is not an lvalue,
         so the byte arithmetic is spelt out as an assignment. */
      current_control_frame_pointer =
        (void *) ((char *) current_control_frame_pointer - sizeof(lispobj));
    ldb_monitor();
  } else if (!interrupt_maybe_gc(signal, code, context))
    interrupt_handle_now(signal, code, context);
#endif
}
#endif

/*
 * SIGBUS handler: forwards the signal to interrupt_handle_now().  The
 * i386 build first runs GET_CONTEXT and passes contextstruct; other
 * builds pass code and context.
 */
static void sigbus_handler(HANDLER_ARGS)
{
#ifdef i386
  GET_CONTEXT

  DPRINTF(0, (stderr, "sigbus:\n")); /* there is no sigbus in linux??? */
  interrupt_handle_now(signal, contextstruct);
#else
  DPRINTF(0, (stderr, "sigbus:\n")); /* there is no sigbus in linux??? */
  interrupt_handle_now(signal, code, context);
#endif
}

/*
 * Register this file's low-level signal handlers: sigsegv_handler for
 * SIGSEGV and sigbus_handler for SIGBUS.
 */
void os_install_interrupt_handlers(void)
{
  interrupt_install_low_level_handler(SIGSEGV, sigsegv_handler);
  interrupt_install_low_level_handler(SIGBUS, sigbus_handler);
}
