==== lguest25/dat.h ====
typedef struct BIOS32si	BIOS32si;
typedef struct BIOS32ci	BIOS32ci;
typedef struct Conf	Conf;
typedef struct Confmem	Confmem;
typedef struct FPsave	FPsave;
typedef struct ISAConf	ISAConf;
typedef struct Label	Label;
typedef struct Lock	Lock;
typedef struct MMU	MMU;
typedef struct Mach	Mach;
typedef struct Notsave	Notsave;
typedef struct PCArch	PCArch;
typedef struct Pcidev	Pcidev;
typedef struct PCMmap	PCMmap;
typedef struct PCMslot	PCMslot;
typedef struct Page	Page;
typedef struct PMMU	PMMU;
typedef struct Proc	Proc;
typedef struct Segdesc	Segdesc;
typedef vlong		Tval;
typedef struct Ureg	Ureg;
typedef struct Vctl	Vctl;

#pragma incomplete BIOS32si
#pragma incomplete Pcidev
#pragma incomplete Ureg

#define MAXSYSARG	5	/* for mount(fd, afd, mpt, flag, arg) */

/*
 *  parameters for sysproc.c
 */
#define AOUT_MAGIC	(I_MAGIC)

struct Lock
{
	ulong	key;
	ulong	sr;
	ulong	pc;
	Proc	*p;
	Mach	*m;
	ushort	isilock;
	long	lockcycles;
};

struct Label
{
	ulong	sp;
	ulong	pc;
};

/*
 * FPsave.status
 */
enum
{
	/* this is a state */
	FPinit=		0,
	FPactive=	1,
	FPinactive=	2,

	/* the following is a bit that can be or'd into the state */
	FPillegal=	0x100,
};

struct FPsave
{
	ushort	control;
	ushort	r1;
	ushort	status;
	ushort	r2;
	ushort	tag;
	ushort	r3;
	ulong	pc;
	ushort	selector;
	ushort	r4;
	ulong	operand;
	ushort	oselector;
	ushort	r5;
	uchar	regs[80];	/* floating point registers */
};

struct Confmem
{
	ulong	base;
	ulong	npage;
	ulong	kbase;
	ulong	klimit;
};

struct Conf
{
	ulong	nmach;		/* processors */
	ulong	nproc;		/* processes */
	ulong	monitor;	/* has monitor? */
	Confmem	mem[4];		/* physical memory */
	ulong	npage;		/* total physical pages of memory */
	ulong	upages;		/* user page pool */
	ulong	nimage;		/* number of page cache image headers */
	ulong	nswap;		/* number of swap pages */
	int	nswppo;		/* max # of pageouts per segment pass */
	ulong	base0;		/* base of bank 0 */
	ulong	base1;		/* base of bank 1 */
	ulong	copymode;	/* 0 is copy on write, 1 is copy on reference */
	ulong	ialloc;		/* max interrupt time allocation in bytes */
	ulong	pipeqsize;	/* size in bytes of pipe queues */
	int	nuart;		/* number of uart devices */
};

/*
 *  MMU stuff in proc
 */
#define NCOLOR 1
struct PMMU
{
	Page*	mmupdb;		/* page directory base */
	Page*	mmufree;	/* unused page table pages */
	Page*	mmuused;	/* used page table pages */
	Page*	kmaptable;	/* page table used by kmap */
	uint	lastkmap;	/* last entry used by kmap */
	int	nkmap;		/* number of current kmaps */
};

/*
 *  things saved in the Proc structure during a notify
 */
struct Notsave
{
	ulong	svflags;
	ulong	svcs;
	ulong	svss;
};

#include "../port/portdat.h"

typedef struct {
	ulong	link;		/* link (old TSS selector) */
	ulong	esp0;		/* privilege level 0 stack pointer */
	ulong	ss0;		/* privilege level 0 stack selector */
	ulong	esp1;		/* privilege level 1 stack pointer */
	ulong	ss1;		/* privilege level 1 stack selector */
	ulong	esp2;		/* privilege level 2 stack pointer */
	ulong	ss2;		/* privilege level 2 stack selector */
	ulong	xcr3;		/* page directory base register - not used because we don't use trap gates */
	ulong	eip;		/* instruction pointer */
	ulong	eflags;		/* flags register */
	ulong	eax;		/* general registers */
	ulong	ecx;
	ulong	edx;
	ulong	ebx;
	ulong	esp;
	ulong	ebp;
	ulong	esi;
	ulong	edi;
	ulong	es;		/* segment selectors */
	ulong	cs;
	ulong	ss;
	ulong	ds;
	ulong	fs;
	ulong	gs;
	ulong	ldt;		/* selector for task's LDT */
	ulong	iomap;		/* I/O map base address + T-bit */
} Tss;

struct Segdesc
{
	ulong	d0;
	ulong	d1;
};

struct Mach
{ int machno; /* physical id of processor (KNOWN TO ASSEMBLY) */ ulong splpc; /* pc of last caller to splhi */ ulong* pdb; /* page directory base for this processor (va) */ Tss* tss; /* tss for this processor */ Segdesc *gdt; /* gdt for this processor */ Proc* proc; /* current process on this processor */ Proc* externup; /* extern register Proc *up */ Page* pdbpool; int pdbcnt; ulong ticks; /* of the clock since boot time */ Label sched; /* scheduler wakeup */ Lock alarmlock; /* access to alarm list */ void* alarm; /* alarms bound to this clock */ int inclockintr; Proc* readied; /* for runproc */ ulong schedticks; /* next forced context switch */ int tlbfault; int tlbpurge; int pfault; int cs; int syscall; int load; int intr; int flushmmu; /* make current proc flush it's mmu state */ int ilockdepth; Perf perf; /* performance counters */ ulong spuriousintr; int lastintr; int loopconst; Lock apictimerlock; int cpumhz; uvlong cyclefreq; /* Frequency of user readable cycle counter */ uvlong cpuhz; int cpuidax; int cpuiddx; char cpuidid[16]; char* cpuidtype; int havetsc; int havepge; uvlong tscticks; int pdballoc; int pdbfree; vlong mtrrcap; vlong mtrrdef; vlong mtrrfix[11]; vlong mtrrvar[32]; /* 256 max. */ int stack[1]; }; /* * KMap the structure doesn't exist, but the functions do. */ typedef struct KMap KMap; #define VA(k) ((void*)(k)) KMap* kmap(Page*); void kunmap(KMap*); struct { Lock; int machs; /* bitmap of active CPUs */ int exiting; /* shutdown */ int ispanic; /* shutdown in response to a panic */ int thunderbirdsarego; /* lets the added processors continue to schedinit */ }active; /* * routines for things outside the PC model, like power management */ struct PCArch { char* id; int (*ident)(void); /* this should be in the model */ void (*reset)(void); /* this should be in the model */ int (*serialpower)(int); /* 1 == on, 0 == off */ int (*modempower)(int); /* 1 == on, 0 == off */ void (*intrinit)(void); int (*intrenable)(Vctl*); int (*intrvecno)(int); int (*intrdisable)(int); void (*introff)(void); void (*intron)(void); void (*clockenable)(void); uvlong (*fastclock)(uvlong*); void (*timerset)(uvlong); }; /* cpuid instruction result register bits */ enum { /* dx */ Fpuonchip = 1<<0, // Pse = 1<<3, /* page size extensions */ Tsc = 1<<4, /* time-stamp counter */ Cpumsr = 1<<5, /* model-specific registers, rdmsr/wrmsr */ Pae = 1<<6, /* physical-addr extensions */ Mce = 1<<7, /* machine-check exception */ Cmpxchg8b = 1<<8, Cpuapic = 1<<9, Mtrr = 1<<12, /* memory-type range regs. */ Pge = 1<<13, /* page global extension */ // Pse2 = 1<<17, /* more page size extensions */ Clflush = 1<<19, Mmx = 1<<23, Sse = 1<<25, /* thus sfence instr. */ Sse2 = 1<<26, /* thus mfence & lfence instr.s */ }; /* * a parsed plan9.ini line */ #define NISAOPT 8 struct ISAConf { char *type; ulong port; int irq; ulong dma; ulong mem; ulong size; ulong freq; int nopt; char *opt[NISAOPT]; }; extern PCArch *arch; /* PC architecture */ /* * Each processor sees its own Mach structure at address MACHADDR. * However, the Mach structures must also be available via the per-processor * MMU information array machp, mainly for disambiguation and access to * the clock which is only maintained by the bootstrap processor (0). 
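 * (So code on any processor can read the bootstrap processor's clock as
 * MACHP(0)->ticks, while the m and up definitions below always refer to
 * the processor the code is currently running on.)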
 */
Mach*	machp[MAXMACH];

#define	MACHP(n)	(machp[n])

extern Mach	*m;
#define up	(((Mach*)MACHADDR)->externup)

/*
 *  hardware info about a device
 */
typedef struct {
	ulong	port;
	int	size;
} Devport;

struct DevConf
{
	ulong	intnum;			/* interrupt number */
	char	*type;			/* card type, malloced */
	int	nports;			/* Number of ports */
	Devport	*ports;			/* The ports themselves */
};

typedef struct BIOS32ci {		/* BIOS32 Calling Interface */
	u32int	eax;
	u32int	ebx;
	u32int	ecx;
	u32int	edx;
	u32int	esi;
	u32int	edi;
} BIOS32ci;

==== lguest25/devarch.c ====
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "ureg.h"
#include "../port/error.h"
#include "lguest.h"

#pragma profile 0

typedef struct IOMap IOMap;
struct IOMap
{
	IOMap	*next;
	int	reserved;
	char	tag[13];
	ulong	start;
	ulong	end;
};

static struct
{
	Lock;
	IOMap	*m;
	IOMap	*free;
	IOMap	maps[32];	// some initial free maps
	QLock	ql;		// lock for reading map
} iomap;

enum {
	Qdir = 0,
	Qioalloc = 1,
	Qiob,
	Qiow,
	Qiol,
	Qbase,

	Qmax = 16,
};

typedef long Rdwrfn(Chan*, void*, long, vlong);

static Rdwrfn *readfn[Qmax];
static Rdwrfn *writefn[Qmax];

static Dirtab archdir[Qmax] = {
	".",		{ Qdir, 0, QTDIR },	0,	0555,
	"ioalloc",	{ Qioalloc, 0 },	0,	0444,
	"iob",		{ Qiob, 0 },		0,	0660,
	"iow",		{ Qiow, 0 },		0,	0660,
	"iol",		{ Qiol, 0 },		0,	0660,
};
Lock archwlock;	/* the lock is only for changing archdir */
int narchdir = Qbase;
int (*_pcmspecial)(char*, ISAConf*);
void (*_pcmspecialclose)(int);

static int doi8253set = 0;

/* arguably this should be in here, not in main.c */
extern struct lguest_data lguest_data;

/*
 *   Add a file to the #P listing.  Once added, you can't delete it.
 *   You can't add a file with the same name as one already there,
 *   and you get a pointer to the Dirtab entry so you can do things
 *   like change the Qid version.  Changing the Qid path is disallowed.
 */
Dirtab*
addarchfile(char *name, int perm, Rdwrfn *rdfn, Rdwrfn *wrfn)
{
	int i;
	Dirtab d;
	Dirtab *dp;

	memset(&d, 0, sizeof d);
	strcpy(d.name, name);
	d.perm = perm;

	lock(&archwlock);
	if(narchdir >= Qmax){
		unlock(&archwlock);
		return nil;
	}

	for(i=0; i<narchdir; i++)
		if(strcmp(archdir[i].name, name) == 0){
			unlock(&archwlock);
			return nil;
		}

	archdir[narchdir] = d;
	readfn[narchdir] = rdfn;
	writefn[narchdir] = wrfn;
	dp = &archdir[narchdir++];
	unlock(&archwlock);

	return dp;
}

void
ioinit(void)
{
	int i;

	for(i = 0; i < nelem(iomap.maps)-1; i++)
		iomap.maps[i].next = &iomap.maps[i+1];
	iomap.maps[i].next = nil;
	iomap.free = iomap.maps;
}

//
//	reserve a range to be ioalloced later.
//	this is in particular useful for exchangable cards, such
//	as pcmcia and cardbus cards.
//
int
ioreserve(int, int size, int align, char *tag)
{
	IOMap *m, **l;
	int i, port;

	lock(&iomap);
	// find a free port above 0x400 and below 0x1000
	port = 0x400;
	for(l = &iomap.m; *l; l = &(*l)->next){
		m = *l;
		if (m->start < 0x400)
			continue;
		i = m->start - port;
		if(i > size)
			break;
		if(align > 0)
			port = ((port+align-1)/align)*align;
		else
			port = m->end;
	}
	if(*l == nil){
		unlock(&iomap);
		return -1;
	}
	m = iomap.free;
	if(m == nil){
		print("ioalloc: out of maps");
		unlock(&iomap);
		return port;
	}
	iomap.free = m->next;
	m->next = *l;
	m->start = port;
	m->end = port + size;
	m->reserved = 1;
	strncpy(m->tag, tag, sizeof(m->tag));
	m->tag[sizeof(m->tag)-1] = 0;
	*l = m;

	archdir[0].qid.vers++;

	unlock(&iomap);
	return m->start;
}

//
//	alloc some io port space and remember who it was
//	alloced to.  if port < 0, find a free region.
//
int
ioalloc(int port, int size, int align, char *tag)
{
	IOMap *m, **l;
	int i;

	lock(&iomap);
	if(port < 0){
		// find a free port above 0x400 and below 0x1000
		port = 0x400;
		for(l = &iomap.m; *l; l = &(*l)->next){
			m = *l;
			if (m->start < 0x400)
				continue;
			i = m->start - port;
			if(i > size)
				break;
			if(align > 0)
				port = ((port+align-1)/align)*align;
			else
				port = m->end;
		}
		if(*l == nil){
			unlock(&iomap);
			return -1;
		}
	} else {
		// Only 64KB I/O space on the x86.
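		// (Valid ports are 0..0xFFFF, so port+size may not exceed 0x10000;
		// a range that exactly matches an earlier ioreserve() is granted,
		// while any other overlap fails below.)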
if((port+size) > 0x10000){ unlock(&iomap); return -1; } // see if the space clashes with previously allocated ports for(l = &iomap.m; *l; l = &(*l)->next){ m = *l; if(m->end <= port) continue; if(m->reserved && m->start == port && m->end == port + size) { m->reserved = 0; unlock(&iomap); return m->start; } if(m->start >= port+size) break; unlock(&iomap); return -1; } } m = iomap.free; if(m == nil){ print("ioalloc: out of maps"); unlock(&iomap); return port; } iomap.free = m->next; m->next = *l; m->start = port; m->end = port + size; strncpy(m->tag, tag, sizeof(m->tag)); m->tag[sizeof(m->tag)-1] = 0; *l = m; archdir[0].qid.vers++; unlock(&iomap); return m->start; } void iofree(int port) { IOMap *m, **l; lock(&iomap); for(l = &iomap.m; *l; l = &(*l)->next){ if((*l)->start == port){ m = *l; *l = m->next; m->next = iomap.free; iomap.free = m; break; } if((*l)->start > port) break; } archdir[0].qid.vers++; unlock(&iomap); } int iounused(int start, int end) { IOMap *m; for(m = iomap.m; m; m = m->next){ if(start >= m->start && start < m->end || start <= m->start && end > m->start) return 0; } return 1; } static void checkport(int start, int end) { /* standard vga regs are OK */ if(start >= 0x2b0 && end <= 0x2df+1) return; if(start >= 0x3c0 && end <= 0x3da+1) return; if(iounused(start, end)) return; error(Eperm); } static Chan* archattach(char* spec) { return devattach('P', spec); } Walkqid* archwalk(Chan* c, Chan *nc, char** name, int nname) { return devwalk(c, nc, name, nname, archdir, narchdir, devgen); } static int archstat(Chan* c, uchar* dp, int n) { return devstat(c, dp, n, archdir, narchdir, devgen); } static Chan* archopen(Chan* c, int omode) { return devopen(c, omode, archdir, narchdir, devgen); } static void archclose(Chan*) { } enum { Linelen= 31, }; static long archread(Chan *c, void *a, long n, vlong offset) { char *buf, *p; int port; ushort *sp; ulong *lp; IOMap *m; Rdwrfn *fn; switch((ulong)c->qid.path){ case Qdir: return devdirread(c, a, n, archdir, narchdir, devgen); case Qiob: port = offset; checkport(offset, offset+n); for(p = a; port < offset+n; port++) *p++ = inb(port); return n; case Qiow: if(n & 1) error(Ebadarg); checkport(offset, offset+n); sp = a; for(port = offset; port < offset+n; port += 2) *sp++ = ins(port); return n; case Qiol: if(n & 3) error(Ebadarg); checkport(offset, offset+n); lp = a; for(port = offset; port < offset+n; port += 4) *lp++ = inl(port); return n; case Qioalloc: break; default: if(c->qid.path < narchdir && (fn = readfn[c->qid.path])) return fn(c, a, n, offset); error(Eperm); break; } if((buf = malloc(n)) == nil) error(Enomem); p = buf; n = n/Linelen; offset = offset/Linelen; lock(&iomap); for(m = iomap.m; n > 0 && m != nil; m = m->next){ if(offset-- > 0) continue; sprint(p, "%8lux %8lux %-12.12s\n", m->start, m->end-1, m->tag); p += Linelen; n--; } unlock(&iomap); n = p - buf; memmove(a, buf, n); free(buf); return n; } static long archwrite(Chan *c, void *a, long n, vlong offset) { char *p; int port; ushort *sp; ulong *lp; Rdwrfn *fn; switch((ulong)c->qid.path){ case Qiob: p = a; checkport(offset, offset+n); for(port = offset; port < offset+n; port++) outb(port, *p++); return n; case Qiow: if(n & 1) error(Ebadarg); checkport(offset, offset+n); sp = a; for(port = offset; port < offset+n; port += 2) outs(port, *sp++); return n; case Qiol: if(n & 3) error(Ebadarg); checkport(offset, offset+n); lp = a; for(port = offset; port < offset+n; port += 4) outl(port, *lp++); return n; default: if(c->qid.path < narchdir && (fn = writefn[c->qid.path])) return 
fn(c, a, n, offset); error(Eperm); break; } return 0; } Dev archdevtab = { 'P', "arch", devreset, devinit, devshutdown, archattach, archwalk, archstat, archopen, devcreate, archclose, archread, devbread, archwrite, devbwrite, devremove, devwstat, }; /* * the following is a generic version of the * architecture specific stuff */ static int unimplemented(int) { return 0; } static void nop(void) { } /* * 386 has no compare-and-swap instruction. * Run it with interrupts turned off instead. */ static int cmpswap386(long *addr, long old, long new) { int r, s; s = splhi(); if(r = (*addr == old)) *addr = new; splx(s); return r; } /* * On a uniprocessor, you'd think that coherence could be nop, * but it can't. We still need a barrier when using coherence() in * device drivers. * * On VMware, it's safe (and a huge win) to set this to nop. * Aux/vmware does this via the #P/archctl file. */ void (*coherence)(void) = nop; int (*cmpswap)(long*, long, long) = cmpswap386; uvlong lguestfastticks(uvlong *hz){ uvlong x; if (hz) *hz = 1024*1024*1024*4ULL; // cycles(&x); x = lguest_get_ns(); return x; } ulong µs(void) { uvlong x; x = lguest_get_ns() / 4096; return x; } Lock lgintrlock; void lguesttimerset(uvlong){ } void lgintrinit(void) { } int lgintrisr(int) { return 0; } int lgintrenable(Vctl* v) { int irq, irqbit; /* * Given an IRQ, enable the corresponding interrupt in the lgintr * and return the vector to be used. The lgintr is set to use a fixed * range of vectors starting at VectorPIC. */ irq = v->irq; if(irq < 0 || irq > 63){ print("lgintrenable: irq %d out of range\n", irq); return -1; } /* drop 32 from the irq */ irq -= 32; irqbit = 1< 63){ print("lgintrenable: irq %d out of range\n", irq); return -1; } /* drop 32 from the irq */ irq -= 32; irqbit = 1<\n"); intrenable(IrqCLOCK, lgclock, 0, BUSUNKNOWN, "clock"); /* now kick off the timer */ lgrestart(); } PCArch* arch; extern PCArch* knownarch[]; PCArch archlguest = { .id= "archlguest", .ident= 0, //.reset= panic, .serialpower= unimplemented, .modempower= unimplemented, .fastclock= lguestfastticks, .timerset= lguesttimerset, .intrinit= lgintrinit, .intrenable= lgintrenable, .intrvecno= lgintrvecno, .intrdisable= lgintrdisable, .intron= lgintron, .introff= lgintroff, .clockenable= lgclockenable, /* .intrinit= unimplemented, .intrenable= unimplemented, .intrvecno= unimplemented, .intrdisable= unimplemented, .intron= unimplemented, .introff= unimplemented, */ }; typedef struct X86type X86type; struct X86type { int family; int model; int aalcycles; char* name; }; static X86type x86intel[] = { { 4, 0, 22, "486DX", }, /* known chips */ { 4, 1, 22, "486DX50", }, { 4, 2, 22, "486SX", }, { 4, 3, 22, "486DX2", }, { 4, 4, 22, "486SL", }, { 4, 5, 22, "486SX2", }, { 4, 7, 22, "DX2WB", }, /* P24D */ { 4, 8, 22, "DX4", }, /* P24C */ { 4, 9, 22, "DX4WB", }, /* P24CT */ { 5, 0, 23, "P5", }, { 5, 1, 23, "P5", }, { 5, 2, 23, "P54C", }, { 5, 3, 23, "P24T", }, { 5, 4, 23, "P55C MMX", }, { 5, 7, 23, "P54C VRT", }, { 6, 1, 16, "PentiumPro", },/* trial and error */ { 6, 3, 16, "PentiumII", }, { 6, 5, 16, "PentiumII/Xeon", }, { 6, 6, 16, "Celeron", }, { 6, 7, 16, "PentiumIII/Xeon", }, { 6, 8, 16, "PentiumIII/Xeon", }, { 6, 0xB, 16, "PentiumIII/Xeon", }, { 0xF, 1, 16, "P4", }, /* P4 */ { 0xF, 2, 16, "PentiumIV/Xeon", }, { 3, -1, 32, "386", }, /* family defaults */ { 4, -1, 22, "486", }, { 5, -1, 23, "P5", }, { 6, -1, 16, "P6", }, { 0xF, -1, 16, "P4", }, /* P4 */ { -1, -1, 16, "unknown", }, /* total default */ }; /* * The AMD processors all implement the CPUID instruction. 
* The later ones also return the processor name via functions * 0x80000002, 0x80000003 and 0x80000004 in registers AX, BX, CX * and DX: * K5 "AMD-K5(tm) Processor" * K6 "AMD-K6tm w/ multimedia extensions" * K6 3D "AMD-K6(tm) 3D processor" * K6 3D+ ? */ static X86type x86amd[] = { { 5, 0, 23, "AMD-K5", }, /* guesswork */ { 5, 1, 23, "AMD-K5", }, /* guesswork */ { 5, 2, 23, "AMD-K5", }, /* guesswork */ { 5, 3, 23, "AMD-K5", }, /* guesswork */ { 5, 6, 11, "AMD-K6", }, /* trial and error */ { 5, 7, 11, "AMD-K6", }, /* trial and error */ { 5, 8, 11, "AMD-K6-2", }, /* trial and error */ { 5, 9, 11, "AMD-K6-III", },/* trial and error */ { 6, 1, 11, "AMD-Athlon", },/* trial and error */ { 6, 2, 11, "AMD-Athlon", },/* trial and error */ { 4, -1, 22, "Am486", }, /* guesswork */ { 5, -1, 23, "AMD-K5/K6", }, /* guesswork */ { 6, -1, 11, "AMD-Athlon", },/* guesswork */ { 0xF, -1, 11, "AMD64", }, /* guesswork */ { -1, -1, 11, "unknown", }, /* total default */ }; /* * WinChip 240MHz */ static X86type x86winchip[] = { {5, 4, 23, "Winchip",}, /* guesswork */ {6, 7, 23, "Via C3 Samuel 2 or Ezra",}, {6, 8, 23, "Via C3 Ezra-T",}, {6, 9, 23, "Via C3 Eden-N",}, { -1, -1, 23, "unknown", }, /* total default */ }; /* * SiS 55x */ static X86type x86sis[] = { {5, 0, 23, "SiS 55x",}, /* guesswork */ { -1, -1, 23, "unknown", }, /* total default */ }; static X86type *cputype; static void simplecycles(uvlong*); void (*cycles)(uvlong*) = simplecycles; void _cycles(uvlong*); /* in l.s */ void delay(int millisecs) { millisecs *= m->loopconst; if(millisecs <= 0) millisecs = 1; aamloop(millisecs); } void microdelay(int microsecs) { microsecs *= m->loopconst; microsecs /= 1000; if(microsecs <= 0) microsecs = 1; aamloop(microsecs); } /* * performance measurement ticks. must be low overhead. * doesn't have to count over a second. 
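 * (callers only take differences between two nearby readings, so the
 * truncation of the 64-bit source value to a ulong below is harmless)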
*/ ulong perfticks(void) { uvlong x; if(m->havetsc) cycles(&x); else x = lguest_get_ns(); return x; } static void simplecycles(uvlong*x) { *x = m->ticks; } void cpuidprint(void) { int i; char buf[128]; i = sprint(buf, "cpu%d: %dMHz ", m->machno, m->cpumhz); if(m->cpuidid[0]) i += sprint(buf+i, "%12.12s ", m->cpuidid); sprint(buf+i, "%s (cpuid: AX 0x%4.4uX DX 0x%4.4uX)\n", m->cpuidtype, m->cpuidax, m->cpuiddx); print(buf); } /* * figure out: * - cpu type * - whether or not we have a TSC (cycle counter) * - whether or not it supports page size extensions * (if so turn it on) * - whether or not it supports machine check exceptions * (if so turn it on) * - whether or not it supports the page global flag * (if so turn it on) */ int cpuidentify(void) { char *p; int family, model, nomce; X86type *t, *tab; ulong cr4; vlong mca, mct; cpuid(m->cpuidid, &m->cpuidax, &m->cpuiddx); iprint("cpu id is %s\n", m->cpuidid); if(strncmp(m->cpuidid, "AuthenticAMD", 12) == 0) tab = x86amd; else if(strncmp(m->cpuidid, "CentaurHauls", 12) == 0) tab = x86winchip; else if(strncmp(m->cpuidid, "SiS SiS SiS ", 12) == 0) tab = x86sis; else tab = x86intel; family = X86FAMILY(m->cpuidax); model = X86MODEL(m->cpuidax); for(t=tab; t->name; t++) if((t->family == family && t->model == model) || (t->family == family && t->model == -1) || (t->family == -1)) break; iprint("t->name is %p\n", t->name); m->cpuidtype = t->name; /* * if there is one, set tsc to a known value */ if(0 && m->cpuiddx & 0x10){ m->havetsc = 1; cycles = _cycles; if(m->cpuiddx & 0x20) wrmsr(0x10, 0); } /* * use i8253 to guess our cpu speed */ // guesscpuhz(t->aalcycles); m->cpumhz = 1000; m->havetsc = 0; cycles = _cycles; /* * If machine check exception, page size extensions or page global bit * are supported enable them in CR4 and clear any other set extensions. * If machine check was enabled clear out any lingering status. */ if(0 && m->cpuiddx & 0x2088){ cr4 = 0; if(m->cpuiddx & 0x08) cr4 |= 0x10; /* page size extensions */ if(p = getconf("*nomce")) nomce = strtoul(p, 0, 0); else nomce = 0; if((m->cpuiddx & 0x80) && !nomce){ cr4 |= 0x40; /* machine check enable */ if(family == 5){ rdmsr(0x00, &mca); rdmsr(0x01, &mct); } } /* * Detect whether the chip supports the global bit * in page directory and page table entries. When set * in a particular entry, it means ``don't bother removing * this from the TLB when CR3 changes.'' * * We flag all kernel pages with this bit. Doing so lessens the * overhead of switching processes on bare hardware, * even more so on VMware. See mmu.c:/^memglobal. * * For future reference, should we ever need to do a * full TLB flush, it can be accomplished by clearing * the PGE bit in CR4, writing to CR3, and then * restoring the PGE bit. 
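 * A minimal sketch of that sequence, using the getcr3/getcr4/putcr3/
 * putcr4 helpers declared in fns.h (no such routine exists in this
 * port; 0x80 is the CR4 PGE bit used below):
 *
 *	cr4 = getcr4();
 *	putcr4(cr4 & ~0x80);	turn PGE off; global entries become flushable
 *	putcr3(getcr3());	reload CR3, flushing the whole TLB
 *	putcr4(cr4);		restore PGE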
*/ if(m->cpuiddx & 0x2000){ cr4 |= 0x80; /* page global enable bit */ m->havepge = 1; } putcr4(cr4); if(m->cpuiddx & 0x80) rdmsr(0x01, &mct); } iprint("cpu type is %p, family %#x\n", t, t->family); cputype = t; return t->family; } static long cputyperead(Chan*, void *a, long n, vlong offset) { char str[32]; ulong mhz; mhz = (m->cpuhz+999999)/1000000; snprint(str, sizeof(str), "%s %lud\n", cputype->name, mhz); return readstr(offset, a, n, str); } static long lgread(Chan*, void *a, long nn, vlong offset){ struct lguest_data *lg = &lguest_data; char buf[256]; int n; n = snprint(buf, sizeof buf, "blocked %#ulx walltime %#8.8ulx %#8.8ulx\n",lg->blocked_interrupts, lg->time.seconds, lg->time.nanoseconds); n += snprint(buf+n, sizeof buf-n, "reserve_mem %#8.8ulx tsc_khz %lud noirq_start %p noirq_end %p syscall_vec %d\n", lg->reserve_mem, lg->tsc_khz, (void *)lg->noirq_start, (void *)lg->noirq_end, lg->syscall_vec); buf[n] = 0; return readstr(offset, a, nn, buf); } static long archctlread(Chan*, void *a, long nn, vlong offset) { char buf[256]; int n; extern ulong fcallcount[], intrcount[]; n = snprint(buf, sizeof buf, "cpu %s %lud%s\n", cputype->name, (ulong)(m->cpuhz+999999)/1000000, m->havepge ? " pge" : ""); n += snprint(buf+n, sizeof buf-n, "pge %s\n", getcr4()&0x80 ? "on" : "off"); n += snprint(buf+n, sizeof buf-n, "coherence "); if(coherence == mb386) n += snprint(buf+n, sizeof buf-n, "mb386\n"); else if(coherence == mb586) n += snprint(buf+n, sizeof buf-n, "mb586\n"); else if(coherence == nop) n += snprint(buf+n, sizeof buf-n, "nop\n"); else n += snprint(buf+n, sizeof buf-n, "0x%p\n", coherence); n += snprint(buf+n, sizeof buf-n, "cmpswap "); if(cmpswap == cmpswap386) n += snprint(buf+n, sizeof buf-n, "cmpswap386\n"); else if(cmpswap == cmpswap486) n += snprint(buf+n, sizeof buf-n, "cmpswap486\n"); else n += snprint(buf+n, sizeof buf-n, "0x%p\n", cmpswap); n += snprint(buf+n, sizeof buf-n, "i8253set %s\n", doi8253set ? "on" : "off"); n += snprint(buf+n, sizeof buf-n, "fcallcount %ld irqcount %ld\n", fcallcount[0], intrcount[0]); buf[n] = 0; return readstr(offset, a, nn, buf); } enum { CMpge, CMcoherence, CMi8253set, }; static Cmdtab archctlmsg[] = { CMpge, "pge", 2, CMcoherence, "coherence", 2, CMi8253set, "i8253set", 2, }; static long archctlwrite(Chan*, void *a, long n, vlong) { Cmdbuf *cb; Cmdtab *ct; cb = parsecmd(a, n); if(waserror()){ free(cb); nexterror(); } ct = lookupcmd(cb, archctlmsg, nelem(archctlmsg)); switch(ct->index){ case CMpge: if(!m->havepge) error("processor does not support pge"); if(strcmp(cb->f[1], "on") == 0) putcr4(getcr4() | 0x80); else if(strcmp(cb->f[1], "off") == 0) putcr4(getcr4() & ~0x80); else cmderror(cb, "invalid pge ctl"); break; case CMcoherence: if(strcmp(cb->f[1], "mb386") == 0) coherence = mb386; else if(strcmp(cb->f[1], "mb586") == 0){ if(X86FAMILY(m->cpuidax) < 5) error("invalid coherence ctl on this cpu family"); coherence = mb586; } else if(strcmp(cb->f[1], "nop") == 0){ /* only safe on vmware */ if(conf.nmach > 1) error("cannot disable coherence on a multiprocessor"); coherence = nop; }else cmderror(cb, "invalid coherence ctl"); break; case CMi8253set: if(strcmp(cb->f[1], "on") == 0) doi8253set = 1; else if(strcmp(cb->f[1], "off") == 0){ doi8253set = 0; (*arch->timerset)(0); }else cmderror(cb, "invalid i2853set ctl"); break; } free(cb); poperror(); return n; } void archinit(void) { arch = &archlguest; /* * Decide whether to use copy-on-reference (386 and mp). * We get another chance to set it in mpinit() for a * multiprocessor. 
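 * (conf.copymode: 0 is copy on write, 1 is copy on reference; see the
 * Conf declaration in dat.h.)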
*/ if(X86FAMILY(m->cpuidax) == 3) conf.copymode = 1; if(X86FAMILY(m->cpuidax) >= 4) cmpswap = cmpswap486; if(X86FAMILY(m->cpuidax) >= 5) coherence = mb586; addarchfile("cputype", 0444, cputyperead, nil); addarchfile("archctl", 0664, archctlread, archctlwrite); addarchfile("lguest", 0444, lgread, nil); } /* * call either the pcmcia or pccard device setup */ int pcmspecial(char *idstr, ISAConf *isa) { return (_pcmspecial != nil)? _pcmspecial(idstr, isa): -1; } /* * call either the pcmcia or pccard device teardown */ void pcmspecialclose(int a) { if (_pcmspecialclose != nil) _pcmspecialclose(a); } /* * return value and speed of timer set in arch->clockenable */ uvlong fastticks(uvlong *hz) { return (*arch->fastclock)(hz); } /* * set next timer interrupt */ void timerset(Tval x) { if(doi8253set) (*arch->timerset)(x); } lguest25/devrtc.c 664 0 0 4202 11022045253 12773ustar00bootesbootes#include "u.h" #include "../port/lib.h" #include "mem.h" #include "dat.h" #include "fns.h" #include "../port/error.h" #include "lguest.h" /* * lguest real time clock */ #pragma profile 0 enum{ Qdir = 0, Qrtc, }; Dirtab rtcdir[]={ ".", {Qdir, 0, QTDIR}, 0, 0555, "rtc", {Qrtc, 0}, 0, 0664, }; static Chan* rtcattach(char* spec) { return devattach('r', spec); } static Walkqid* rtcwalk(Chan* c, Chan *nc, char** name, int nname) { return devwalk(c, nc, name, nname, rtcdir, nelem(rtcdir), devgen); } static int rtcstat(Chan* c, uchar* dp, int n) { return devstat(c, dp, n, rtcdir, nelem(rtcdir), devgen); } static Chan* rtcopen(Chan* c, int omode) { omode = openmode(omode); switch((ulong)c->qid.path){ case Qrtc: if(strcmp(up->user, eve)!=0 && omode!=OREAD) error(Eperm); break; } return devopen(c, omode, rtcdir, nelem(rtcdir), devgen); } static void rtcclose(Chan*) { } static Lock nvrtlock; long rtctime(void) { int i; long t, ot; ilock(&nvrtlock); /* loop till we get two reads in a row the same */ t = lguest_get_wallclock(); for(i = 0; i < 100; i++){ ot = t; t = lguest_get_wallclock(); if(ot == t) break; } if(i == 100) print("we are boofheads\n"); iunlock(&nvrtlock); return t; } static long rtcread(Chan* c, void* buf, long n, vlong off) { ulong t; ulong offset = off; if(c->qid.type & QTDIR) return devdirread(c, buf, n, rtcdir, nelem(rtcdir), devgen); switch((ulong)c->qid.path){ case Qrtc: t = rtctime(); n = readnum(offset, buf, n, t, 12); return n; } error(Ebadarg); return 0; } /* at some point we can set a fixed offset here. But I don't see any reason * (yet) to support this. 
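 * (if it were supported, a write would presumably store an offset from
 * lguest_get_wallclock() for rtctime() to add back in; the locking
 * skeleton for that is already in place below)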
 */
static long
rtcwrite(Chan* c, void*, long n, vlong off)
{
	ulong offset = off;

	error(Eperm);
	if(offset!=0)
		error(Ebadarg);

	switch((ulong)c->qid.path){
	case Qrtc:
		/*
		 *  write the clock
		 */
		ilock(&nvrtlock);
		iunlock(&nvrtlock);
		return n;
	}
	error(Ebadarg);
	return 0;
}

Dev rtcdevtab = {
	'r',
	"rtc",

	devreset,
	devinit,
	devshutdown,
	rtcattach,
	rtcwalk,
	rtcstat,
	rtcopen,
	devcreate,
	rtcclose,
	rtcread,
	devbread,
	rtcwrite,
	devbwrite,
	devremove,
	devwstat,
};

==== lguest25/diskpart ====
#!/boot/rc -m /boot/rcmain
/boot/echo "diskpart here ready to serve"
/boot/fdisk -p '#S/sd00/data'
/boot/fdisk -p '#S/sd00/data' > '#S/sd00/ctl'
/boot/prep -p '#S/sd00/plan9'
/boot/prep -p '#S/sd00/plan9' > '#S/sd00/ctl'
/boot/ls -l '#S/sd00'
/boot/echo "diskpart ends"

==== lguest25/etherif.h ====
enum {
	MaxEther	= 48,
	Ntypes		= 8,
};

typedef struct Ether Ether;
struct Ether {
	ISAConf;			/* hardware info */

	int	ctlrno;
	int	tbdf;			/* type+busno+devno+funcno */
	int	minmtu;
	int	maxmtu;
	uchar	ea[Eaddrlen];

	void	(*attach)(Ether*);	/* filled in by reset routine */
	void	(*detach)(Ether*);
	void	(*transmit)(Ether*);
	void	(*interrupt)(Ureg*, void*);
	long	(*ifstat)(Ether*, void*, long, ulong);
	long	(*ctl)(Ether*, void*, long); /* custom ctl messages */
	void	(*power)(Ether*, int);	/* power on/off */
	void	(*shutdown)(Ether*);	/* shutdown hardware before reboot */
	void	*ctlr;

	Queue*	oq;

	Netif;
};

extern Block* etheriq(Ether*, Block*, int);
extern void addethercard(char*, int(*)(Ether*));
extern ulong ethercrc(uchar*, int);
extern int parseether(uchar*, char*);

#define NEXT(x, l)	(((x)+1)%(l))
#define PREV(x, l)	(((x) == 0) ? (l)-1: (x)-1)
#define	HOWMANY(x, y)	(((x)+((y)-1))/(y))
#define ROUNDUP(x, y)	(HOWMANY((x), (y))*(y))

==== lguest25/etherlg.c ====
/*
 * lguest ethernet, from:
 * Realtek 8139 (but not the 8129).
 */
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "../port/error.h"
#include "../port/netif.h"

#pragma profile 0

#include "etherif.h"
#include "lguest.h"

enum {
	NRDMA		= 1,	/* number RDMAs in flight */

	/* The feature bitmap for virtio net */
	VIRTIO_NET_F_CSUM = 1,	/* Can handle pkts w/ partial csum */
	VIRTIO_NET_F_MAC= (1<<5), /* Host has given MAC address.
*/ VIRTIO_NET_F_GSO = (1<<6), /* Can handle pkts w/ any GSO type */ /* features and where they live */ config_mac = 0, /* packet flags */ VIRTIO_NET_HDR_F_NEEDS_CSUM = 1, // Use csum_start, csum_offset /* GSO flags */ VIRTIO_NET_HDR_GSO_NONE = 0, // Not a GSO frame VIRTIO_NET_HDR_GSO_TCPV4 = 1, // GSO frame, IPv4 TCP (TSO) VIRTIO_NET_HDR_GSO_UDP = 3, // GSO frame, IPv4 UDP (UFO) VIRTIO_NET_HDR_GSO_TCPV6 = 4, // GSO frame, IPv6 TCP VIRTIO_NET_HDR_GSO_ECN = 0x80, // TCP has ECN set /* header structure */ hsize = 80, /* byte offsets for xdr */ hflags = 0, hgso = 1, hdr_len = 2, /* Ethernet + IP + tcp/udp hdrs */ gso_size = 4, /* Bytes to append to gso_hdr_len per frame */ hcsum_start = 6, /* Position to start checksumming from */ hcsum_off = 8, /* Offset after that to place checksum */ }; struct netheader { unsigned char h[10]; }; typedef struct Ctlr Ctlr; typedef struct Ctlr { int instance; Ctlr* next; int active; int id; int devno; QLock alock; /* attach */ Lock ilock; /* init */ void* alloc; /* base of per-Ctlr allocated data */ Block *bp[NRDMA]; Lock tlock; /* transmit */ } Ctlr; int nether = 0; static Ctlr* ctlrhead; static Ctlr* ctlrtail; static void lginterrupt(Ureg*, void* arg); extern struct lguest_device_desc *lgd; static void lgpromiscuous(void*, int ) { } static void lgmulticast(void*, uchar*, int) { } static long lgifstat(Ether*, void* a, long n, ulong offset) { int i; int l = 0; char *p; struct Ctlr *ctlr = ctlrhead; if((p = malloc(READSTR)) == nil) error(Enomem); for(i = 0; i < nether; i++, ctlr= ctlr->next){ l += snprint(p+l, READSTR-l, "ether%d: devno %d", i, ctlr->devno); /* for(j = 0; j < lgv[i].lgd->num_vq; j++){ l += snprint(p+l, READSTR-l, "[%d irq %d]", j, lgvirq(i, j)); } */ l += snprint(p+l, READSTR-l, "\n"); } n = readstr(offset, a, n, p); free(p); return n; } static int lgreset(Ether*e) { iprint("LG RESET: e->ctlrno %d\n", e->ctlrno); /* FIXME: be sure to probe the ethers, right now we just do 1 */ if (e->ctlrno) return -1; return 0; } static void lghalt(Ctlr*) { } void setmac(u8 *) { iprint("SETMAC: NO\n"); } static void lginit(Ether* edev) { Ctlr *ctlr = edev->ctlr; ilock(&ctlr->ilock); iprint("LGINIT\n"); lghalt(ctlr); /* * MAC Address. */ /* todo: ask jmk why we're doing this in two places */ setmac(edev->ea); /* * Interrupts. */ // intrenable(NETINTR, lginterrupt, edev, BUSUNKNOWN, "lgether"); /* * Enable receiver/transmitter. * Nothing to do. */ iunlock(&ctlr->ilock); } void fillslot(Ether* edev) { int lgvaddbuf(int dev, int ring, void *v[], int len[], int out, int in, void *tag); int lgvnumfree(int devno, int ring); Ctlr *ctlr; Block *bp; void *v[2]; int len[2]; /* fix this later. */ unsigned char *nh; ctlr = edev->ctlr; while(lgvnumfree(ctlr->devno, 0)){ bp = iallocb(ETHERMAXTU + 16); /* the header goes in the first 16 bytes. 16 to hopefully align the * receive area in a reasonable way */ if (! 
bp) panic("alloc bp in fillslot: no memory\n"); bp->flag |= Bipck | Budpck | Btcpck | Bpktck; /* now set up the dma for it */ nh = bp->wp; bp->wp += 16; bp->rp = bp->wp; v[0] = nh; len[0] = 10; v[1] = bp->wp; len[1] = ETHERMAXTU; lgvaddbuf(ctlr->devno, 0, v, len, 0, 2, bp); } } static void lgattach(Ether* edev) { int lgvirq(int devno, int ring); Ctlr *ctlr; int irq; iprint("LGATTACH edev %p ctlr %p\n", edev, edev->ctlr); ctlr = edev->ctlr; qlock(&ctlr->alock); if(ctlr->alloc == nil){ ctlr->alloc = xspanalloc(BY2PG, BY2PG, 0); lginit(edev); } qunlock(&ctlr->alock); /* enable interrupts */ /* receive is ring 0 */ irq = lgvirq(ctlr->devno, 0); print("ETHERIN: devno %d ring %d irq %d\n", ctlr->devno, 0, irq+32); intrenable(irq+32, lginterrupt, edev, BUSUNKNOWN, "rxether"); irq = lgvirq(ctlr->devno, 1); print("ETHEROUT: devno %d ring %d irq %d\n", ctlr->devno, 0, irq+32); intrenable(irq+32, lginterrupt, edev, BUSUNKNOWN, "txether"); /* now queue up some receive bps */ fillslot(edev); } /* we're talking to Linux. So every interface is ever so slightly different. It's amazing */ static void lgtxstart(Ether* edev) { int lgvaddbuf(int dev, int ring, void *v[], int len[], int out, int in, void *tag); /* gcc and kenc disagree about structs. Screw it. */ unsigned char nh[10]; void *v[2]; int len[2]; int size; Block *bp; Ctlr *ctlr; //iprint("lgtxstart\n"); ctlr = edev->ctlr; while(bp = qget(edev->oq)){ memset(nh, 0, sizeof(nh)); nh[0] = VIRTIO_NET_HDR_F_NEEDS_CSUM; nh[1] = VIRTIO_NET_HDR_GSO_NONE; v[0] = nh; len[0] = 10; size = BLEN(bp); v[1] = bp->rp; len[1] = size; /* non blocking IO. Basically, we'll get interrupted and do a getbuf, which is the bp, and free the bp */ lgvaddbuf(ctlr->devno, 1, v, len, 2, 0, bp); } } #ifdef NOT hflags = 0, hgso = 1, hdr_len = 2, /* Ethernet + IP + tcp/udp hdrs */ gso_size = 4, /* Bytes to append to gso_hdr_len per frame */ hcsum_start = 6, /* Position to start checksumming from */ hcsum_off = 8, /* Offset after that to place checksum */ }; struct netheader { unsigned char h[10]; }; #endif static void lgtransmit(Ether* edev) { Ctlr *ctlr; //print("lgtransmit...\n"); ctlr = edev->ctlr; ilock(&ctlr->tlock); lgtxstart(edev); iunlock(&ctlr->tlock); //iprint("lgtransmit done\n"); } static void lginterrupt(Ureg*, void* arg) { void *lgvgetbuf(int dev, int ring, int *plen); Ether *edev = arg; Ctlr *ctlr; Block *bp; int len; // iprint("L"); ctlr = edev->ctlr; /* suck up stuff while there's stuff to suck. */ while (bp = lgvgetbuf(ctlr->devno, 1, &len)) { // iprint("IF%d %p ", len, bp); free(bp); } // iprint("l"); while (bp = lgvgetbuf(ctlr->devno, 0, &len)) { bp->wp += len; // iprint("IR%d %p ", len, bp); // dumphex("RP", bp->rp, BLEN(bp)); etheriq(edev, bp, 1); fillslot(edev); } } static int lgpnp(Ether* edev) { int lgvfeature(int devno, unsigned int feature); void lgvdumpconfig(int devno); int findlgv(char *name); void lgvconfig(int devno, unsigned char *config, int off ,int len); Ctlr *ctlr = nil; uchar ea[Eaddrlen]; int i, devno; char name[32]; /* just loop through finding #Z/ether%d until no more */ /* * Make a list of all ethernet controllers * if not already done. */ if(ctlrhead == nil){ sprint(name, "net0"); for (i = 0, devno = findlgv(name); devno > -1; devno = findlgv(name)) { nether++; ctlr = malloc(sizeof(Ctlr)); ctlr->devno = devno; ctlr->id = i; lgvdumpconfig(devno); if(ctlrhead != nil) ctlrtail->next = ctlr; else ctlrhead = ctlr; ctlrtail = ctlr; iprint("found one enet ctlr\n"); sprint(name, "net%d", nether); } } if (edev->ctlrno >= nether) return -1; if (! 
ctlr)
		return -1;
	edev->ctlr = ctlr;
	edev->port = 0;
	edev->irq = NETINTR;
	edev->tbdf = BUSUNKNOWN;

	/*
	 * Check if the adapter's station address is to be overridden.
	 * If not, read it from the device and set in edev->ea.
	 */
	memset(ea, 0, Eaddrlen);
	if(memcmp(ea, edev->ea, Eaddrlen) == 0){
		lgvconfig(ctlr->devno, edev->ea, config_mac, sizeof(edev->ea));
		/* but if they did not give us one, make one up. Just add one to lowest octet for now */
		if (! lgvfeature(ctlr->devno, VIRTIO_NET_F_MAC)) {
			iprint("Jiggering address\n");
			edev->ea[5] += 1;
		}
	}

	edev->attach = lgattach;
	edev->transmit = lgtransmit;
	edev->interrupt = lginterrupt;
	edev->ifstat = lgifstat;

	edev->arg = edev;
	edev->promiscuous = lgpromiscuous;
	edev->multicast = lgmulticast;
//	edev->shutdown = lgshutdown;

	return 0;
}

void
etherlglink(void)
{
	addethercard("lg", lgpnp);
}

==== lguest25/ethermii.h ====
typedef struct Mii Mii;
typedef struct MiiPhy MiiPhy;

enum {					/* registers */
	Bmcr		= 0x00,		/* Basic Mode Control */
	Bmsr		= 0x01,		/* Basic Mode Status */
	Phyidr1		= 0x02,		/* PHY Identifier #1 */
	Phyidr2		= 0x03,		/* PHY Identifier #2 */
	Anar		= 0x04,		/* Auto-Negotiation Advertisement */
	Anlpar		= 0x05,		/* AN Link Partner Ability */
	Aner		= 0x06,		/* AN Expansion */
	Annptr		= 0x07,		/* AN Next Page TX */
	Annprr		= 0x08,		/* AN Next Page RX */
	Mscr		= 0x09,		/* MASTER-SLAVE Control */
	Mssr		= 0x0A,		/* MASTER-SLAVE Status */
	Esr		= 0x0F,		/* Extended Status */

	NMiiPhyr	= 32,
	NMiiPhy		= 32,
};

enum {					/* Bmcr */
	BmcrSs1		= 0x0040,	/* Speed Select[1] */
	BmcrCte		= 0x0080,	/* Collision Test Enable */
	BmcrDm		= 0x0100,	/* Duplex Mode */
	BmcrRan		= 0x0200,	/* Restart Auto-Negotiation */
	BmcrI		= 0x0400,	/* Isolate */
	BmcrPd		= 0x0800,	/* Power Down */
	BmcrAne		= 0x1000,	/* Auto-Negotiation Enable */
	BmcrSs0		= 0x2000,	/* Speed Select[0] */
	BmcrLe		= 0x4000,	/* Loopback Enable */
	BmcrR		= 0x8000,	/* Reset */
};

enum {					/* Bmsr */
	BmsrEc		= 0x0001,	/* Extended Capability */
	BmsrJd		= 0x0002,	/* Jabber Detect */
	BmsrLs		= 0x0004,	/* Link Status */
	BmsrAna		= 0x0008,	/* Auto-Negotiation Ability */
	BmsrRf		= 0x0010,	/* Remote Fault */
	BmsrAnc		= 0x0020,	/* Auto-Negotiation Complete */
	BmsrPs		= 0x0040,	/* Preamble Suppression Capable */
	BmsrEs		= 0x0100,	/* Extended Status */
	Bmsr100T2HD	= 0x0200,	/* 100BASE-T2 HD Capable */
	Bmsr100T2FD	= 0x0400,	/* 100BASE-T2 FD Capable */
	Bmsr10THD	= 0x0800,	/* 10BASE-T HD Capable */
	Bmsr10TFD	= 0x1000,	/* 10BASE-T FD Capable */
	Bmsr100TXHD	= 0x2000,	/* 100BASE-TX HD Capable */
	Bmsr100TXFD	= 0x4000,	/* 100BASE-TX FD Capable */
	Bmsr100T4	= 0x8000,	/* 100BASE-T4 Capable */
};

enum {					/* Anar/Anlpar */
	Ana10HD		= 0x0020,	/* Advertise 10BASE-T */
	Ana10FD		= 0x0040,	/* Advertise 10BASE-T FD */
	AnaTXHD		= 0x0080,	/* Advertise 100BASE-TX */
	AnaTXFD		= 0x0100,	/* Advertise 100BASE-TX FD */
	AnaT4		= 0x0200,	/* Advertise 100BASE-T4 */
	AnaP		= 0x0400,	/* Pause */
	AnaAP		= 0x0800,	/* Asymmetrical Pause */
	AnaRf		= 0x2000,	/* Remote Fault */
	AnaAck		= 0x4000,	/* Acknowledge */
	AnaNp		= 0x8000,	/* Next Page Indication */
};

enum {					/* Mscr */
	Mscr1000THD	= 0x0100,	/* Advertise 1000BASE-T HD */
	Mscr1000TFD	= 0x0200,	/* Advertise 1000BASE-T FD */
};

enum {					/* Mssr */
	Mssr1000THD	= 0x0400,	/* Link Partner 1000BASE-T HD able */
	Mssr1000TFD	= 0x0800,	/* Link Partner 1000BASE-T FD able */
};

enum {					/* Esr */
	Esr1000THD	= 0x1000,	/* 1000BASE-T HD Capable */
	Esr1000TFD	= 0x2000,	/* 1000BASE-T FD Capable */
	Esr1000XHD	= 0x4000,	/* 1000BASE-X HD Capable */
	Esr1000XFD	= 0x8000,	/* 1000BASE-X FD Capable */
};

typedef struct Mii {
	Lock;
	int	nphy;
	int	mask;
	MiiPhy*	phy[NMiiPhy];
	MiiPhy*	curphy;

	void*	ctlr;
	int	(*mir)(Mii*, int, int);
	int	(*miw)(Mii*, int, int, int);
} Mii;

typedef struct MiiPhy {
	Mii*	mii;
	int	oui;
	int	phyno;

	int	anar;
	int	fc;
	int	mscr;

	int	link;
	int	speed;
	int	fd;
	int	rfc;
	int	tfc;
};

extern int mii(Mii*, int);
extern int miiane(Mii*, int, int, int);
extern int miimir(Mii*, int);
extern int miimiw(Mii*, int, int);
extern int miireset(Mii*);
extern int miistatus(Mii*);

==== lguest25/fns.h ====
#include "../port/portfns.h"

void	aamloop(int);
Dirtab*	addarchfile(char*, int, long(*)(Chan*,void*,long,vlong), long(*)(Chan*,void*,long,vlong));
void	archinit(void);
int	bios32call(BIOS32ci*, u16int[3]);
int	bios32ci(BIOS32si*, BIOS32ci*);
void	bios32close(BIOS32si*);
BIOS32si* bios32open(char*);
void	bootargs(void*);
ulong	cankaddr(ulong);
void	clockintr(Ureg*, void*);
int	(*cmpswap)(long*, long, long);
int	cmpswap486(long*, long, long);
void	(*coherence)(void);
void	cpuid(char*, int*, int*);
int	cpuidentify(void);
void	cpuidprint(void);
void	(*cycles)(uvlong*);
void	delay(int);
int	dmacount(int);
int	dmadone(int);
void	dmaend(int);
int	dmainit(int, int);
long	dmasetup(int, void*, long, int);
#define	evenaddr(x)		/* x86 doesn't care */
void	fpclear(void);
void	fpenv(FPsave*);
void	fpinit(void);
void	fpoff(void);
void	fprestore(FPsave*);
void	fpsave(FPsave*);
ulong	fpstatus(void);
ulong	getcr0(void);
ulong	getcr2(void);
ulong	getcr3(void);
ulong	getcr4(void);
char*	getconf(char*);
void	guesscpuhz(int);
void	halt(void);
int	i8042auxcmd(int);
int	i8042auxcmds(uchar*, int);
void	i8042auxenable(void (*)(int, int));
void	i8042reset(void);
void	i8250console(void);
void*	i8250alloc(int, int, int);
void	i8250mouse(char*, int (*)(Queue*, int), int);
void	i8250setmouseputc(char*, int (*)(Queue*, int));
void	i8253enable(void);
void	i8253init(void);
void	i8253link(void);
uvlong	i8253read(uvlong*);
void	i8253timerset(uvlong);
int	i8259disable(int);
int	i8259enable(Vctl*);
void	i8259init(void);
int	i8259isr(int);
void	i8259on(void);
void	i8259off(void);
int	i8259vecno(int);
void	idle(void);
void	idlehands(void);
int	inb(int);
void	insb(int, void*, int);
ushort	ins(int);
void	inss(int, void*, int);
ulong	inl(int);
void	insl(int, void*, int);
int	intrdisable(int, void (*)(Ureg *, void *), void*, int, char*);
void	intrenable(int, void (*)(Ureg*, void*), void*, int, char*);
void	introff(void);
void	intron(void);
void	invlpg(ulong);
void	iofree(int);
void	ioinit(void);
int	iounused(int, int);
int	ioalloc(int, int, int, char*);
int	ioreserve(int, int, int, char*);
int	iprint(char*, ...);
int	isaconfig(char*, int, ISAConf*);
void*	kaddr(ulong);
void	kbdenable(void);
void	kbdinit(void);
#define	kmapinval()
void	lgdt(ushort[3]);
void	lidt(ushort[3]);
void	links(void);
void	ltr(ulong);
void	mach0init(void);
void	mathinit(void);
void	mb386(void);
void	mb586(void);
void	meminit(void);
void	memorysummary(void);
void	mfence(void);
#define	mmuflushtlb(pdb) putcr3(pdb)
void	mmuinit(void);
ulong*	mmuwalk(ulong*, ulong, int, int);
uchar	nvramread(int);
void	nvramwrite(int, uchar);
void	outb(int, int);
void	outsb(int, void*, int);
void	outs(int, ushort);
void	outss(int, void*, int);
void	outl(int, ulong);
void	outsl(int, void*, int);
ulong	paddr(void*);
ulong	pcibarsize(Pcidev*, int);
void	pcibussize(Pcidev*, ulong*, ulong*);
int	pcicfgr8(Pcidev*, int);
int	pcicfgr16(Pcidev*, int);
int	pcicfgr32(Pcidev*, int);
void	pcicfgw8(Pcidev*, int, int);
void	pcicfgw16(Pcidev*, int, int);
void	pcicfgw32(Pcidev*, int, int);
void	pciclrbme(Pcidev*);
void	pciclrioe(Pcidev*);
void	pciclrmwi(Pcidev*);
int	pcigetpms(Pcidev*);
void	pcihinv(Pcidev*);
uchar	pciipin(Pcidev*, uchar);
Pcidev* pcimatch(Pcidev*, int, int);
Pcidev* pcimatchtbdf(int);
void	pcireset(void);
int	pciscan(int, Pcidev**);
void	pcisetbme(Pcidev*);
void	pcisetioe(Pcidev*);
void	pcisetmwi(Pcidev*);
int	pcisetpms(Pcidev*, int);
void	pcmcisread(PCMslot*);
int	pcmcistuple(int, int, int, void*, int);
PCMmap*	pcmmap(int, ulong, int, int);
int	pcmspecial(char*, ISAConf*);
int	(*_pcmspecial)(char *, ISAConf *);
void	pcmspecialclose(int);
void	(*_pcmspecialclose)(int);
void	pcmunmap(int, PCMmap*);
int	pdbmap(ulong*, ulong, ulong, int);
void	procrestore(Proc*);
void	procsave(Proc*);
void	procsetup(Proc*);
void	putcr3(ulong);
void	putcr4(ulong);
void*	rampage(void);
void	rdmsr(int, vlong*);
void	realmode(Ureg*);
void	screeninit(void);
void	(*screenputs)(char*, int);
void	syncclock(void);
void*	tmpmap(Page*);
void	tmpunmap(void*);
void	touser(void*);
void	trapenable(int, void (*)(Ureg*, void*), void*, char*);
void	trapinit(void);
void	trapinit0(void);
int	tas(void*);
uvlong	tscticks(uvlong*);
ulong	umbmalloc(ulong, int, int);
void	umbfree(ulong, int);
ulong	umbrwmalloc(ulong, int, int);
void	umbrwfree(ulong, int);
ulong	upaalloc(int, int);
void	upafree(ulong, int);
void	upareserve(ulong, int);
#define	userureg(ur) (((ur)->cs & 0xFFFF) == UESEL)
void	vectortable(void);
void*	vmap(ulong, int);
int	vmapsync(ulong);
void	vunmap(void*, int);
void	wrmsr(int, vlong);
int	xchgw(ushort*, int);
void	dumphex(char *, unsigned char *, int);

#define	waserror()	(up->nerrlab++, setlabel(&up->errlab[up->nerrlab-1]))
#define	KADDR(a)	kaddr(a)
#define	PADDR(a)	paddr((void*)(a))
#define	dcflush(a, b)

==== lguest25/io.h ====
#define X86STEPPING(x)	((x) & 0x0F)
#define X86MODEL(x)	(((x)>>4) & 0x0F)
#define X86FAMILY(x)	(((x)>>8) & 0x0F)

enum {
	VectorNMI	= 2,		/* non-maskable interrupt */
	VectorBPT	= 3,		/* breakpoint */
	VectorUD	= 6,		/* invalid opcode exception */
	VectorCNA	= 7,		/* coprocessor not available */
	Vector2F	= 8,		/* double fault */
	VectorCSO	= 9,		/* coprocessor segment overrun */
	VectorPF	= 14,		/* page fault */
	Vector15	= 15,		/* reserved */
	VectorCERR	= 16,		/* coprocessor error */

	VectorPIC	= 32,		/* external i8259 interrupts */
	IrqCLOCK	= 32,
	IrqKBD		= 1,
	IrqUART1	= 3,
	IrqUART0	= 4,
	IrqPCMCIA	= 5,
	IrqFLOPPY	= 6,
	IrqLPT		= 7,
	IrqIRQ7		= 7,
	IrqAUX		= 12,		/* PS/2 port */
	IrqIRQ13	= 13,		/* coprocessor on 386 */
	IrqATA0		= 14,
	IrqATA1		= 15,
	MaxIrqPIC	= 15,

	VectorLAPIC	= VectorPIC+16,	/* local APIC interrupts */
	IrqLINT0	= 16,		/* LINT[01] must be offsets 0 and 1 */
	IrqLINT1	= 17,
	IrqTIMER	= 18,
	IrqERROR	= 19,
	IrqPCINT	= 20,
	IrqSPURIOUS	= 31,		/* must have bits [3-0] == 0x0F */
	MaxIrqLAPIC	= 31,

	VectorSYSCALL	= 64,
	VectorAPIC	= 65,		/* external APIC interrupts */
	MaxVectorAPIC	= 255,
};
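/*
 * A note on how this port appears to use these vectors (a sketch, not a
 * routine in the source): lguest hands out small virtqueue irq numbers,
 * and the drivers above map guest irq n to vector VectorPIC+n, e.g.
 * etherlg.c calls intrenable(irq+32, ...) and lgintrenable() in
 * devarch.c subtracts the 32 again before talking to the host:
 *
 *	int
 *	lgvectorno(int lgirq)
 *	{
 *		return VectorPIC + lgirq;	// VectorPIC == 32
 *	}
 */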
typedef struct Vctl { Vctl* next; /* handlers on this vector */ char name[KNAMELEN]; /* of driver */ int isintr; /* interrupt or fault/trap */ int irq; int tbdf; int (*isr)(int); /* get isr bit for this irq */ int (*eoi)(int); /* eoi */ void (*f)(Ureg*, void*); /* handler to call */ void* a; /* argument to call it with */ } Vctl; enum { BusCBUS = 0, /* Corollary CBUS */ BusCBUSII, /* Corollary CBUS II */ BusEISA, /* Extended ISA */ BusFUTURE, /* IEEE Futurebus */ BusINTERN, /* Internal bus */ BusISA, /* Industry Standard Architecture */ BusMBI, /* Multibus I */ BusMBII, /* Multibus II */ BusMCA, /* Micro Channel Architecture */ BusMPI, /* MPI */ BusMPSA, /* MPSA */ BusNUBUS, /* Apple Macintosh NuBus */ BusPCI, /* Peripheral Component Interconnect */ BusPCMCIA, /* PC Memory Card International Association */ BusTC, /* DEC TurboChannel */ BusVL, /* VESA Local bus */ BusVME, /* VMEbus */ BusXPRESS, /* Express System Bus */ }; #define MKBUS(t,b,d,f) (((t)<<24)|(((b)&0xFF)<<16)|(((d)&0x1F)<<11)|(((f)&0x07)<<8)) #define BUSFNO(tbdf) (((tbdf)>>8)&0x07) #define BUSDNO(tbdf) (((tbdf)>>11)&0x1F) #define BUSBNO(tbdf) (((tbdf)>>16)&0xFF) #define BUSTYPE(tbdf) ((tbdf)>>24) #define BUSBDF(tbdf) ((tbdf)&0x00FFFF00) #define BUSUNKNOWN (-1) enum { MaxEISA = 16, CfgEISA = 0xC80, }; /* * PCI support code. */ enum { /* type 0 and type 1 pre-defined header */ PciVID = 0x00, /* vendor ID */ PciDID = 0x02, /* device ID */ PciPCR = 0x04, /* command */ PciPSR = 0x06, /* status */ PciRID = 0x08, /* revision ID */ PciCCRp = 0x09, /* programming interface class code */ PciCCRu = 0x0A, /* sub-class code */ PciCCRb = 0x0B, /* base class code */ PciCLS = 0x0C, /* cache line size */ PciLTR = 0x0D, /* latency timer */ PciHDT = 0x0E, /* header type */ PciBST = 0x0F, /* BIST */ PciBAR0 = 0x10, /* base address */ PciBAR1 = 0x14, PciINTL = 0x3C, /* interrupt line */ PciINTP = 0x3D, /* interrupt pin */ }; enum { /* type 0 pre-defined header */ PciCIS = 0x28, /* cardbus CIS pointer */ PciSVID = 0x2C, /* subsystem vendor ID */ PciSID = 0x2E, /* cardbus CIS pointer */ PciEBAR0 = 0x30, /* expansion ROM base address */ PciMGNT = 0x3E, /* burst period length */ PciMLT = 0x3F, /* maximum latency between bursts */ }; enum { /* type 1 pre-defined header */ PciPBN = 0x18, /* primary bus number */ PciSBN = 0x19, /* secondary bus number */ PciUBN = 0x1A, /* subordinate bus number */ PciSLTR = 0x1B, /* secondary latency timer */ PciIBR = 0x1C, /* I/O base */ PciILR = 0x1D, /* I/O limit */ PciSPSR = 0x1E, /* secondary status */ PciMBR = 0x20, /* memory base */ PciMLR = 0x22, /* memory limit */ PciPMBR = 0x24, /* prefetchable memory base */ PciPMLR = 0x26, /* prefetchable memory limit */ PciPUBR = 0x28, /* prefetchable base upper 32 bits */ PciPULR = 0x2C, /* prefetchable limit upper 32 bits */ PciIUBR = 0x30, /* I/O base upper 16 bits */ PciIULR = 0x32, /* I/O limit upper 16 bits */ PciEBAR1 = 0x28, /* expansion ROM base address */ PciBCR = 0x3E, /* bridge control register */ }; enum { /* type 2 pre-defined header */ PciCBExCA = 0x10, PciCBSPSR = 0x16, PciCBPBN = 0x18, /* primary bus number */ PciCBSBN = 0x19, /* secondary bus number */ PciCBUBN = 0x1A, /* subordinate bus number */ PciCBSLTR = 0x1B, /* secondary latency timer */ PciCBMBR0 = 0x1C, PciCBMLR0 = 0x20, PciCBMBR1 = 0x24, PciCBMLR1 = 0x28, PciCBIBR0 = 0x2C, /* I/O base */ PciCBILR0 = 0x30, /* I/O limit */ PciCBIBR1 = 0x34, /* I/O base */ PciCBILR1 = 0x38, /* I/O limit */ PciCBSVID = 0x40, /* subsystem vendor ID */ PciCBSID = 0x42, /* subsystem ID */ PciCBLMBAR = 0x44, /* legacy 
mode base address */
};

typedef struct Pcisiz Pcisiz;
struct Pcisiz
{
	Pcidev*	dev;
	int	siz;
	int	bar;
};

typedef struct Pcidev Pcidev;
struct Pcidev
{
	int	tbdf;			/* type+bus+device+function */
	ushort	vid;			/* vendor ID */
	ushort	did;			/* device ID */

	ushort	pcr;

	uchar	rid;
	uchar	ccrp;
	uchar	ccru;
	uchar	ccrb;
	uchar	cls;
	uchar	ltr;

	struct {
		ulong	bar;		/* base address */
		int	size;
	} mem[6];

	struct {
		ulong	bar;
		int	size;
	} rom;
	uchar	intl;			/* interrupt line */

	Pcidev*	list;
	Pcidev*	link;			/* next device on this bno */

	Pcidev*	bridge;			/* down a bus */
	struct {
		ulong	bar;
		int	size;
	} ioa, mema;

	int	pmrb;			/* power management register block */
};

#define PCIWINDOW	0
#define PCIWADDR(va)	(PADDR(va)+PCIWINDOW)
#define ISAWINDOW	0
#define ISAWADDR(va)	(PADDR(va)+ISAWINDOW)

/* SMBus transactions */
enum
{
	SMBquick,		/* sends address only */

	/* write */
	SMBsend,		/* sends address and cmd */
	SMBbytewrite,		/* sends address and cmd and 1 byte */
	SMBwordwrite,		/* sends address and cmd and 2 bytes */

	/* read */
	SMBrecv,		/* sends address, recvs 1 byte */
	SMBbyteread,		/* sends address and cmd, recv's byte */
	SMBwordread,		/* sends address and cmd, recv's 2 bytes */
};

typedef struct SMBus SMBus;
struct SMBus {
	QLock;			/* mutex */
	Rendez	r;		/* rendezvous point for completion interrupts */
	void	*arg;		/* implementation dependent */
	ulong	base;		/* port or memory base of smbus */
	int	busy;
	void	(*transact)(SMBus*, int, int, int, uchar*);
};

/*
 * PCMCIA support code.
 */
typedef struct PCMslot		PCMslot;
typedef struct PCMconftab	PCMconftab;

/*
 * Map between ISA memory space and PCMCIA card memory space.
 */
struct PCMmap {
	ulong	ca;		/* card address */
	ulong	cea;		/* card end address */
	ulong	isa;		/* ISA address */
	int	len;		/* length of the ISA area */
	int	attr;		/* attribute memory */
	int	ref;
};

/* configuration table entry */
struct PCMconftab
{
	int	index;
	ushort	irqs;		/* legal irqs */
	uchar	irqtype;
	uchar	bit16;		/* true for 16 bit access */
	struct {
		ulong	start;
		ulong	len;
	} io[16];
	int	nio;
	uchar	vpp1;
	uchar	vpp2;
	uchar	memwait;
	ulong	maxwait;
	ulong	readywait;
	ulong	otherwait;
};

/* a card slot */
struct PCMslot
{
	Lock;

	int	ref;

	void	*cp;		/* controller for this slot */
	long	memlen;		/* memory length */
	uchar	base;		/* index register base */
	uchar	slotno;		/* slot number */

	/* status */
	uchar	special;	/* in use for a special device */
	uchar	already;	/* already inited */
	uchar	occupied;
	uchar	battery;
	uchar	wrprot;
	uchar	powered;
	uchar	configed;
	uchar	enabled;
	uchar	busy;

	/* cis info */
	ulong	msec;		/* time of last slotinfo call */
	char	verstr[512];	/* version string */
	int	ncfg;		/* number of configurations */
	struct {
		ushort	cpresent;	/* config registers present */
		ulong	caddr;		/* relative address of config registers */
	} cfg[8];
	int	nctab;		/* number of config table entries */
	PCMconftab	ctab[8];
	PCMconftab	*def;	/* default conftab */

	/* memory maps */
	Lock	mlock;		/* lock down the maps */
	int	time;
	PCMmap	mmap[4];	/* maps, last is always for the kernel */
};

==== lguest25/l.s ====
#include "mem.h"
#include "/sys/src/boot/pc/x16.h"

#define LHCALL_FLUSH_ASYNC	0
#define LHCALL_LGUEST_INIT	1
#define LHCALL_CRASH		2
#define LHCALL_LOAD_GDT		3
#define LHCALL_NEW_PGTABLE	4
#define LHCALL_FLUSH_TLB	5
#define LHCALL_LOAD_IDT_ENTRY	6
#define LHCALL_SET_STACK	7
#define LHCALL_TS		8
#define LHCALL_SET_CLOCKEVENT	9
#define LHCALL_HALT		10
#define LHCALL_BIND_DMA		12
#define LHCALL_SET_PTE		14
#define LHCALL_SET_PMD
15 #define LHCALL_LOAD_TLS 16 #define LHCALL_NOTIFY 17 /* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ #define LGUEST_SHUTDOWN_POWEROFF 1 #define LGUEST_SHUTDOWN_RESTART 2 #define LGUEST_TRAP_ENTRY 0x1F #undef DELAY #define PADDR(a) ((a) & ~KZERO) #define KADDR(a) (KZERO|(a)) /* * Some machine instructions not handled by 8[al]. */ #define OP16 BYTE $0x66 #define DELAY BYTE $0xEB; BYTE $0x00 /* JMP .+2 */ #define CPUID BYTE $0x0F; BYTE $0xA2 /* CPUID, argument in AX */ #define WRMSR BYTE $0x0F; BYTE $0x30 /* WRMSR, argument in AX/DX (lo/hi) */ #define RDTSC BYTE $0x0F; BYTE $0x31 /* RDTSC, result in AX/DX (lo/hi) */ #define RDMSR BYTE $0x0F; BYTE $0x32 /* RDMSR, result in AX/DX (lo/hi) */ #define HLT BYTE $0xF4 #define INVLPG BYTE $0x0F; BYTE $0x01; BYTE $0x39 /* INVLPG (%ecx) */ /* * Macros for calculating offsets within the page directory base * and page tables. Note that these are assembler-specific hence * the '<<2'. */ #define PDO(a) (((((a))>>22) & 0x03FF)<<2) #define PTO(a) (((((a))>>12) & 0x03FF)<<2) /* * For backwards compatiblity with 9load - should go away when 9load is changed * 9load currently sets up the mmu, however the first 16MB of memory is identity * mapped, so behave as if the mmu was not setup */ TEXT _startKADDR(SB), 1, $0 MOVL $_startPADDR(SB), AX ANDL $~KZERO, AX JMP* AX /* * Must be 4-byte aligned. */ TEXT _multibootheader(SB), 1, $0 LONG $0x1BADB002 /* magic */ LONG $0x00010003 /* flags */ LONG $-(0x1BADB002 + 0x00010003) /* checksum */ LONG $_multibootheader-KZERO(SB) /* header_addr */ LONG $_startKADDR-KZERO(SB) /* load_addr */ LONG $edata-KZERO(SB) /* load_end_addr */ LONG $end-KZERO(SB) /* bss_end_addr */ LONG $_startKADDR-KZERO(SB) /* entry_addr */ LONG $0 /* mode_type */ LONG $0 /* width */ LONG $0 /* height */ LONG $0 /* depth */ /* * In protected mode with paging turned off and segment registers setup * to linear map all memory. Entered via a jump to PADDR(entry), * the physical address of the virtual kernel entry point of KADDR(entry). * Make the basic page tables for processor 0. Six pages are needed for * the basic set: * a page directory; * page tables for mapping the first 8MB of physical memory to KZERO; * a page for the GDT; * virtual and physical pages for mapping the Mach structure. * The remaining PTEs will be allocated later when memory is sized. * An identity mmu map is also needed for the switch to virtual mode. * This identity mapping is removed once the MMU is going and the JMP has * been made to virtual memory. */ TEXT _startPADDR(SB), 1, $0 /* Here begins the code for running under lguest. * A few things: memory is zero'd, we have gdt/paging, * but not set up where we need it. * we do the init, DO NOT do the gdt's, do the paging setup in assembly * so the CPUMACH -> CPU0MACH mapping is right, then go to C */ /* set up the gdt so we have sane plan 9 style gdts. */ /* the very first thing we have to do is tell lguest where our * lguest_data is */ MOVL $LHCALL_LGUEST_INIT, AX MOVL $lguest_data - KZERO(SB), DX INT $LGUEST_TRAP_ENTRY /* NOTE: 12 is KESEG. 
NOT THE SAME AS PC */ /* JMP $(12<<3):$mode32bit(SB) /**/ BYTE $0xEA LONG $mode32bit-KZERO(SB) WORD $(12<<3) /* * gdt to get us to 32-bit/segmented/unpaged mode */ TEXT tgdt(SB), 1, $0 /* null descriptor */ LONG $0 LONG $0 /* data segment descriptor for 4 gigabytes (PL 0) */ LONG $(0xFFFF) LONG $(SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(0)|SEGDATA|SEGW) /* exec segment descriptor for 4 gigabytes (PL 0) */ LONG $(0xFFFF) LONG $(SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(0)|SEGEXEC|SEGR) /* * pointer to initial gdt * Note the -KZERO which puts the physical address in the gdtptr. * that's needed as we start executing in physical addresses. */ TEXT tgdtptr(SB), 1, $0 WORD $(3*8) LONG $tgdt-KZERO(SB) TEXT m0rgdtptr(SB), 1, $0 WORD $(NGDT*8-1) LONG $(CPU0GDT-KZERO) TEXT m0gdtptr(SB), 1, $0 WORD $(NGDT*8-1) LONG $CPU0GDT TEXT m0idtptr(SB), 1, $0 WORD $(256*8-1) LONG $IDTADDR TEXT mode32bit(SB), 1, $0 /* At this point, the GDT setup is done. */ MOVL $PADDR(CPU0PDB), DI /* clear 4 pages for the tables etc. */ XORL AX, AX MOVL $(4*BY2PG), CX SHRL $2, CX CLD REP; STOSL MOVL $PADDR(CPU0PDB), AX ADDL $PDO(KZERO), AX /* page directory offset for KZERO */ MOVL $PADDR(CPU0PTE), (AX) /* PTE's for KZERO */ MOVL $(PTEWRITE|PTEVALID), BX /* page permissions */ ORL BX, (AX) ADDL $4, AX MOVL $PADDR(CPU0PTE1), (AX) /* PTE's for KZERO+4MB */ MOVL $(PTEWRITE|PTEVALID), BX /* page permissions */ ORL BX, (AX) MOVL $PADDR(CPU0PTE), AX /* first page of page table */ MOVL $1024, CX /* 1024 pages in 4MB */ _setpte: MOVL BX, (AX) ADDL $(1<machno */ ADDL $(MACHSIZE-4), SP /* initialise stack */ /* * Need to do one final thing to ensure a clean machine environment, * clear the EFLAGS register, which can only be done once there is a stack. */ MOVL $0, AX PUSHL AX POPFL CALL main(SB) /* * Save registers. */ TEXT saveregs(SB), 1, $0 /* appease 8l */ SUBL $32, SP POPL AX POPL AX POPL AX POPL AX POPL AX POPL AX POPL AX POPL AX PUSHL AX PUSHL BX PUSHL CX PUSHL DX PUSHL BP PUSHL DI PUSHL SI PUSHFL XCHGL 32(SP), AX /* swap return PC and saved flags */ XCHGL 0(SP), AX XCHGL 32(SP), AX RET TEXT restoreregs(SB), 1, $0 /* appease 8l */ PUSHL AX PUSHL AX PUSHL AX PUSHL AX PUSHL AX PUSHL AX PUSHL AX PUSHL AX ADDL $32, SP XCHGL 32(SP), AX /* swap return PC and saved flags */ XCHGL 0(SP), AX XCHGL 32(SP), AX POPFL POPL SI POPL DI POPL BP POPL DX POPL CX POPL BX POPL AX RET /* * Assumed to be in protected mode at time of call. * Switch to real mode, execute an interrupt, and * then switch back to protected mode. 
* * Assumes: * * - no device interrupts are going to come in * - 0-16MB is identity mapped in page tables * - realmode() has copied us down from 0x100000 to 0x8000 * - can use code segment 0x0800 in real mode * to get at l.s code * - l.s code is less than 1 page */ #define RELOC (RMCODE-KTZERO) TEXT realmodeidtptr(SB), 1, $0 WORD $(4*256-1) LONG $0 TEXT realmode0(SB), 1, $0 CALL saveregs(SB) /* switch to low code address */ LEAL physcode-KZERO(SB), AX JMP *AX TEXT physcode(SB), 1, $0 /* switch to low stack */ MOVL SP, AX MOVL $0x7C00, SP PUSHL AX /* change gdt to physical pointer */ MOVL m0rgdtptr-KZERO(SB), GDTR /* load IDT with real-mode version*/ MOVL realmodeidtptr-KZERO(SB), IDTR /* edit INT $0x00 instruction below */ MOVL $(RMUADDR-KZERO+48), AX /* &rmu.trap */ MOVL (AX), AX MOVB AX, realmodeintrinst+(-KZERO+1+RELOC)(SB) /* disable paging */ MOVL CR0, AX ANDL $0x7FFFFFFF, AX MOVL AX, CR0 /* JMP .+2 to clear prefetch queue*/ BYTE $0xEB; BYTE $0x00 /* jump to 16-bit code segment */ /* JMPFAR SELECTOR(KESEG16, SELGDT, 0):$again16bit(SB) /**/ BYTE $0xEA LONG $again16bit-KZERO(SB) WORD $SELECTOR(KESEG16, SELGDT, 0) TEXT again16bit(SB), 1, $0 /* * Now in 16-bit compatibility mode. * These are 32-bit instructions being interpreted * as 16-bit instructions. I'm being lazy and * not using the macros because I know when * the 16- and 32-bit instructions look the same * or close enough. */ /* disable protected mode and jump to real mode cs */ OPSIZE; MOVL CR0, AX OPSIZE; XORL BX, BX OPSIZE; INCL BX OPSIZE; XORL BX, AX OPSIZE; MOVL AX, CR0 /* JMPFAR 0x0800:now16real */ BYTE $0xEA WORD $now16real-KZERO(SB) WORD $0x0800 TEXT now16real(SB), 1, $0 /* copy the registers for the bios call */ LWI(0x0000, rAX) MOVW AX,SS LWI(RMUADDR, rBP) /* offsets are in Ureg */ LXW(44, xBP, rAX) MOVW AX, DS LXW(40, xBP, rAX) MOVW AX, ES OPSIZE; LXW(0, xBP, rDI) OPSIZE; LXW(4, xBP, rSI) OPSIZE; LXW(16, xBP, rBX) OPSIZE; LXW(20, xBP, rDX) OPSIZE; LXW(24, xBP, rCX) OPSIZE; LXW(28, xBP, rAX) CLC TEXT realmodeintrinst(SB), 1, $0 INT $0x00 /* save the registers after the call */ LWI(0x7bfc, rSP) OPSIZE; PUSHFL OPSIZE; PUSHL AX LWI(0, rAX) MOVW AX,SS LWI(RMUADDR, rBP) OPSIZE; SXW(rDI, 0, xBP) OPSIZE; SXW(rSI, 4, xBP) OPSIZE; SXW(rBX, 16, xBP) OPSIZE; SXW(rDX, 20, xBP) OPSIZE; SXW(rCX, 24, xBP) OPSIZE; POPL AX OPSIZE; SXW(rAX, 28, xBP) MOVW DS, AX OPSIZE; SXW(rAX, 44, xBP) MOVW ES, AX OPSIZE; SXW(rAX, 40, xBP) OPSIZE; POPL AX OPSIZE; SXW(rAX, 64, xBP) /* flags */ /* re-enter protected mode and jump to 32-bit code */ OPSIZE; MOVL $1, AX OPSIZE; MOVL AX, CR0 /* JMPFAR SELECTOR(KESEG, SELGDT, 0):$again32bit(SB) /**/ OPSIZE BYTE $0xEA LONG $again32bit-KZERO(SB) WORD $SELECTOR(KESEG, SELGDT, 0) TEXT again32bit(SB), 1, $0 MOVW $SELECTOR(KDSEG, SELGDT, 0),AX MOVW AX,DS MOVW AX,SS MOVW AX,ES MOVW AX,FS MOVW AX,GS /* enable paging and jump to kzero-address code */ MOVL CR0, AX ORL $0x80010000, AX /* PG|WP */ MOVL AX, CR0 LEAL again32kzero(SB), AX JMP* AX TEXT again32kzero(SB), 1, $0 /* breathe a sigh of relief - back in 32-bit protected mode */ /* switch to old stack */ PUSHL AX /* match popl below for 8l */ MOVL $0x7BFC, SP POPL SP /* restore idt */ MOVL m0idtptr(SB),IDTR /* restore gdt */ MOVL m0gdtptr(SB), GDTR CALL restoreregs(SB) RET /* * BIOS32. 
*/ TEXT bios32call(SB), 1, $0 MOVL ci+0(FP), BP MOVL 0(BP), AX MOVL 4(BP), BX MOVL 8(BP), CX MOVL 12(BP), DX MOVL 16(BP), SI MOVL 20(BP), DI PUSHL BP MOVL 12(SP), BP /* ptr */ BYTE $0xFF; BYTE $0x5D; BYTE $0x00 /* CALL FAR 0(BP) */ POPL BP MOVL DI, 20(BP) MOVL SI, 16(BP) MOVL DX, 12(BP) MOVL CX, 8(BP) MOVL BX, 4(BP) MOVL AX, 0(BP) XORL AX, AX JCC _bios32xxret INCL AX _bios32xxret: RET /* * Port I/O. * in[bsl] input a byte|short|long * ins[bsl] input a string of bytes|shorts|longs * out[bsl] output a byte|short|long * outs[bsl] output a string of bytes|shorts|longs */ TEXT inb(SB), 1, $0 MOVL port+0(FP), DX XORL AX, AX INB RET TEXT insb(SB), 1, $0 MOVL port+0(FP), DX MOVL address+4(FP), DI MOVL count+8(FP), CX CLD REP; INSB RET TEXT ins(SB), 1, $0 MOVL port+0(FP), DX XORL AX, AX OP16; INL RET TEXT inss(SB), 1, $0 MOVL port+0(FP), DX MOVL address+4(FP), DI MOVL count+8(FP), CX CLD REP; OP16; INSL RET TEXT inl(SB), 1, $0 MOVL port+0(FP), DX INL RET TEXT insl(SB), 1, $0 MOVL port+0(FP), DX MOVL address+4(FP), DI MOVL count+8(FP), CX CLD REP; INSL RET TEXT outb(SB), 1, $0 MOVL port+0(FP), DX MOVL byte+4(FP), AX OUTB RET TEXT outsb(SB), 1, $0 MOVL port+0(FP), DX MOVL address+4(FP), SI MOVL count+8(FP), CX CLD REP; OUTSB RET TEXT outs(SB), 1, $0 MOVL port+0(FP), DX MOVL short+4(FP), AX OP16; OUTL RET TEXT outss(SB), 1, $0 MOVL port+0(FP), DX MOVL address+4(FP), SI MOVL count+8(FP), CX CLD REP; OP16; OUTSL RET TEXT outl(SB), 1, $0 MOVL port+0(FP), DX MOVL long+4(FP), AX OUTL RET TEXT outsl(SB), 1, $0 MOVL port+0(FP), DX MOVL address+4(FP), SI MOVL count+8(FP), CX CLD REP; OUTSL RET /* * Read/write various system registers. * CR4 and the 'model specific registers' should only be read/written * after it has been determined the processor supports them */ TEXT invlpg(SB), 1, $0 /* 486+ only */ MOVL va+0(FP), CX INVLPG RET TEXT _cycles(SB), 1, $0 /* time stamp counter */ RDTSC MOVL vlong+0(FP), CX /* &vlong */ MOVL AX, 0(CX) /* lo */ MOVL DX, 4(CX) /* hi */ RET /* * stub for: * time stamp counter; low-order 32 bits of 64-bit cycle counter * Runs at fasthz/4 cycles per second (m->clkin>>3) */ TEXT lcycles(SB), 1,$0 RDTSC RET TEXT rdmsr(SB), 1, $0 /* model-specific register */ MOVL index+0(FP), CX RDMSR MOVL vlong+4(FP), CX /* &vlong */ MOVL AX, 0(CX) /* lo */ MOVL DX, 4(CX) /* hi */ RET TEXT wrmsr(SB), 1, $0 MOVL index+0(FP), CX MOVL lo+4(FP), AX MOVL hi+8(FP), DX WRMSR RET /* * Try to determine the CPU type which requires fiddling with EFLAGS. * If the Id bit can be toggled then the CPUID instruction can be used * to determine CPU identity and features. First have to check if it's * a 386 (Ac bit can't be set). If it's not a 386 and the Id bit can't be * toggled then it's an older 486 of some kind. * * cpuid(id[], &ax, &dx); */ TEXT cpuid(SB), 1, $0 MOVL $0x240000, AX PUSHL AX POPFL /* set Id|Ac */ PUSHFL POPL BX /* retrieve value */ MOVL $0, AX PUSHL AX POPFL /* clear Id|Ac, EFLAGS initialised */ PUSHFL POPL AX /* retrieve value */ XORL BX, AX TESTL $0x040000, AX /* Ac */ JZ _cpu386 /* can't set this bit on 386 */ TESTL $0x200000, AX /* Id */ JZ _cpu486 /* can't toggle this bit on some 486 */ MOVL $0, AX CPUID MOVL id+0(FP), BP MOVL BX, 0(BP) /* "Genu" "Auth" "Cyri" */ MOVL DX, 4(BP) /* "ineI" "enti" "xIns" */ MOVL CX, 8(BP) /* "ntel" "cAMD" "tead" */ MOVL $1, AX CPUID JMP _cpuid _cpu486: MOVL $0x400, AX MOVL $0, DX JMP _cpuid _cpu386: MOVL $0x300, AX MOVL $0, DX _cpuid: MOVL ax+4(FP), BP MOVL AX, 0(BP) MOVL dx+8(FP), BP MOVL DX, 0(BP) RET /* * Basic timing loop to determine CPU frequency. 
*/ TEXT aamloop(SB), 1, $0 MOVL count+0(FP), CX _aamloop: AAM LOOP _aamloop RET /* * Floating point. * Note: the encodings for the FCLEX, FINIT, FSAVE, FSTCW, FSENV and FSTSW * instructions do NOT have the WAIT prefix byte (i.e. they act like their * FNxxx variations) so WAIT instructions must be explicitly placed in the * code as necessary. */ #define FPOFF(l) ;\ MOVL CR0, AX ;\ ANDL $0xC, AX /* EM, TS */ ;\ CMPL AX, $0x8 ;\ JEQ l ;\ WAIT ;\ l: ;\ MOVL CR0, AX ;\ ANDL $~0x4, AX /* EM=0 */ ;\ ORL $0x28, AX /* NE=1, TS=1 */ ;\ MOVL AX, CR0 #define FPON ;\ MOVL CR0, AX ;\ ANDL $~0xC, AX /* EM=0, TS=0 */ ;\ MOVL AX, CR0 TEXT fpoff(SB), 1, $0 /* disable */ // FPOFF(l1) RET TEXT fpinit(SB), 1, $0 /* enable and init */ FPON FINIT WAIT /* setfcr(FPPDBL|FPRNR|FPINVAL|FPZDIV|FPOVFL) */ /* note that low 6 bits are masks, not enables, on this chip */ PUSHW $0x0232 FLDCW 0(SP) POPW AX WAIT RET TEXT fpsave(SB), 1, $0 /* save state and disable */ MOVL p+0(FP), AX FSAVE 0(AX) /* no WAIT */ FPOFF(l2) RET TEXT fprestore(SB), 1, $0 /* enable and restore state */ FPON MOVL p+0(FP), AX FRSTOR 0(AX) WAIT RET TEXT fpstatus(SB), 1, $0 /* get floating point status */ FSTSW AX RET TEXT fpenv(SB), 1, $0 /* save state without waiting */ MOVL p+0(FP), AX FSTENV 0(AX) RET TEXT fpclear(SB), 1, $0 /* clear pending exceptions */ FPON FCLEX /* no WAIT */ FPOFF(l3) RET /* * Test-And-Set */ TEXT tas(SB), 1, $0 MOVL $0xDEADDEAD, AX MOVL lock+0(FP), BX XCHGL AX, (BX) /* lock->key */ RET TEXT _xinc(SB), 1, $0 /* void _xinc(long*); */ MOVL l+0(FP), AX LOCK; INCL 0(AX) RET TEXT _xdec(SB), 1, $0 /* long _xdec(long*); */ MOVL l+0(FP), BX XORL AX, AX LOCK; DECL 0(BX) JLT _xdeclt JGT _xdecgt RET _xdecgt: INCL AX RET _xdeclt: DECL AX RET TEXT mb386(SB), 1, $0 POPL AX /* return PC */ PUSHFL PUSHL CS PUSHL AX IRETL TEXT mb586(SB), 1, $0 XORL AX, AX CPUID RET TEXT sfence(SB), 1, $0 BYTE $0x0f BYTE $0xae BYTE $0xf8 RET TEXT lfence(SB), 1, $0 BYTE $0x0f BYTE $0xae BYTE $0xe8 RET TEXT mfence(SB), 1, $0 BYTE $0x0f BYTE $0xae BYTE $0xf0 RET TEXT xchgw(SB), 1, $0 MOVL v+4(FP), AX MOVL p+0(FP), BX XCHGW AX, (BX) RET TEXT cmpswap486(SB), 1, $0 MOVL addr+0(FP), BX MOVL old+4(FP), AX MOVL new+8(FP), CX LOCK BYTE $0x0F; BYTE $0xB1; BYTE $0x0B /* CMPXCHGL CX, (BX) */ JNZ didnt MOVL $1, AX RET didnt: XORL AX,AX RET TEXT mul64fract(SB), 1, $0 /* * Multiply two 64-bit number s and keep the middle 64 bits from the 128-bit result * See ../port/tod.c for motivation. */ MOVL r+0(FP), CX XORL BX, BX /* BX = 0 */ MOVL a+8(FP), AX MULL b+16(FP) /* a1*b1 */ MOVL AX, 4(CX) /* r2 = lo(a1*b1) */ MOVL a+8(FP), AX MULL b+12(FP) /* a1*b0 */ MOVL AX, 0(CX) /* r1 = lo(a1*b0) */ ADDL DX, 4(CX) /* r2 += hi(a1*b0) */ MOVL a+4(FP), AX MULL b+16(FP) /* a0*b1 */ ADDL AX, 0(CX) /* r1 += lo(a0*b1) */ ADCL DX, 4(CX) /* r2 += hi(a0*b1) + carry */ MOVL a+4(FP), AX MULL b+12(FP) /* a0*b0 */ ADDL DX, 0(CX) /* r1 += hi(a0*b0) */ ADCL BX, 4(CX) /* r2 += carry */ RET /* * label consists of a stack pointer and a PC */ TEXT gotolabel(SB), 1, $0 MOVL label+0(FP), AX MOVL 0(AX), SP /* restore sp */ MOVL 4(AX), AX /* put return pc on the stack */ MOVL AX, 0(SP) MOVL $1, AX /* return 1 */ RET TEXT setlabel(SB), 1, $0 MOVL label+0(FP), AX MOVL SP, 0(AX) /* store sp */ MOVL 0(SP), BX /* store return pc */ MOVL BX, 4(AX) MOVL $0, AX /* return 0 */ RET /* * Interrupt/exception handling. 
* Each entry in the vector table calls either _strayintr or _strayintrx depending * on whether an error code has been automatically pushed onto the stack * (_strayintrx) or not, in which case a dummy entry must be pushed before retrieving * the trap type from the vector table entry and placing it on the stack as part * of the Ureg structure. * The size of each entry in the vector table (6 bytes) is known in trapinit(). */ TEXT _strayintr(SB), 1, $0 PUSHL AX /* save AX */ MOVL 4(SP), AX /* return PC from vectortable(SB) */ JMP intrcommon TEXT _strayintrx(SB), 1, $0 XCHGL AX, (SP) /* swap AX with vectortable CALL PC */ intrcommon: PUSHL DS /* save DS */ PUSHL $(KDSEL) POPL DS /* fix up DS */ MOVBLZX (AX), AX /* trap type -> AX */ XCHGL AX, 4(SP) /* exchange trap type with saved AX */ PUSHL ES /* save ES */ PUSHL $(KDSEL) POPL ES /* fix up ES */ PUSHL FS /* save the rest of the Ureg struct */ PUSHL GS PUSHAL PUSHL SP /* Ureg* argument to trap */ CALL trap(SB) POPL AX POPAL POPL GS POPL FS POPL ES POPL DS ADDL $8, SP /* pop error code and trap type */ /* the lguest iret. This is about 1e6 times simpler than Xen ... */ /* just tell the kernel not to interrupt you in the shower. */ TEXT lguest_iret(SB), $0 PUSHL AX MOVL 12(SP), AX TEXT lguest_noirq_start(SB), $0 /* rusty very wisely put irq_enabled as the first struct member */ /* talk to jmk about how to do externs */ MOVL AX, lguest_data+0(SB) POPL AX IRETL TEXT lguest_noirq_end(SB), $0 TEXT forkret(SB), $0 POPL AX POPAL POPL GS POPL FS POPL ES POPL DS ADDL $8, SP /* pop error code and trap type */ IRETL /* LGUEST stuff. */ /* lguest hypercall. Always has call # and 3 parameters */ /* these saves of regs are not needed ... */ TEXT hcall(SB), $0 /* PUSHL AX PUSHL BX PUSHL CX PUSHL DX*/ MOVL ARG3+12(FP), CX MOVL ARG2+8(FP), BX MOVL ARG1+4(FP), DX MOVL C+0(FP), AX INT $0x1F /* POPL DX POPL CX POPL BX POPL AX*/ RET TEXT native_cpuid+0(SB),0,$0 MOVL data+4(FP), BP MOVL function+0(FP), AX CPUID MOVL AX, 0(BP) MOVL BX, 4(BP) MOVL CX, 8(BP) MOVL DX, 12(BP) RET TEXT vectortable(SB), 1, $0 CALL _strayintr(SB); BYTE $0x00 /* divide error */ CALL _strayintr(SB); BYTE $0x01 /* debug exception */ CALL _strayintr(SB); BYTE $0x02 /* NMI interrupt */ CALL _strayintr(SB); BYTE $0x03 /* breakpoint */ CALL _strayintr(SB); BYTE $0x04 /* overflow */ CALL _strayintr(SB); BYTE $0x05 /* bound */ CALL _strayintr(SB); BYTE $0x06 /* invalid opcode */ CALL _strayintr(SB); BYTE $0x07 /* no coprocessor available */ CALL _strayintrx(SB); BYTE $0x08 /* double fault */ CALL _strayintr(SB); BYTE $0x09 /* coprocessor segment overflow */ CALL _strayintrx(SB); BYTE $0x0A /* invalid TSS */ CALL _strayintrx(SB); BYTE $0x0B /* segment not available */ CALL _strayintrx(SB); BYTE $0x0C /* stack exception */ CALL _strayintrx(SB); BYTE $0x0D /* general protection error */ CALL _strayintrx(SB); BYTE $0x0E /* page fault */ CALL _strayintr(SB); BYTE $0x0F /* */ CALL _strayintr(SB); BYTE $0x10 /* coprocessor error */ CALL _strayintrx(SB); BYTE $0x11 /* alignment check */ CALL _strayintr(SB); BYTE $0x12 /* machine check */ CALL _strayintr(SB); BYTE $0x13 CALL _strayintr(SB); BYTE $0x14 CALL _strayintr(SB); BYTE $0x15 CALL _strayintr(SB); BYTE $0x16 CALL _strayintr(SB); BYTE $0x17 CALL _strayintr(SB); BYTE $0x18 CALL _strayintr(SB); BYTE $0x19 CALL _strayintr(SB); BYTE $0x1A CALL _strayintr(SB); BYTE $0x1B CALL _strayintr(SB); BYTE $0x1C CALL _strayintr(SB); BYTE $0x1D CALL _strayintr(SB); BYTE $0x1E CALL _strayintr(SB); BYTE $0x1F CALL _strayintr(SB); BYTE $0x20 /* VectorLAPIC */ CALL 
_strayintr(SB); BYTE $0x21 CALL _strayintr(SB); BYTE $0x22 CALL _strayintr(SB); BYTE $0x23 CALL _strayintr(SB); BYTE $0x24 CALL _strayintr(SB); BYTE $0x25 CALL _strayintr(SB); BYTE $0x26 CALL _strayintr(SB); BYTE $0x27 CALL _strayintr(SB); BYTE $0x28 CALL _strayintr(SB); BYTE $0x29 CALL _strayintr(SB); BYTE $0x2A CALL _strayintr(SB); BYTE $0x2B CALL _strayintr(SB); BYTE $0x2C CALL _strayintr(SB); BYTE $0x2D CALL _strayintr(SB); BYTE $0x2E CALL _strayintr(SB); BYTE $0x2F CALL _strayintr(SB); BYTE $0x30 CALL _strayintr(SB); BYTE $0x31 CALL _strayintr(SB); BYTE $0x32 CALL _strayintr(SB); BYTE $0x33 CALL _strayintr(SB); BYTE $0x34 CALL _strayintr(SB); BYTE $0x35 CALL _strayintr(SB); BYTE $0x36 CALL _strayintr(SB); BYTE $0x37 CALL _strayintr(SB); BYTE $0x38 CALL _strayintr(SB); BYTE $0x39 CALL _strayintr(SB); BYTE $0x3A CALL _strayintr(SB); BYTE $0x3B CALL _strayintr(SB); BYTE $0x3C CALL _strayintr(SB); BYTE $0x3D CALL _strayintr(SB); BYTE $0x3E CALL _strayintr(SB); BYTE $0x3F CALL _syscallintr(SB); BYTE $0x40 /* VectorSYSCALL */ CALL _strayintr(SB); BYTE $0x41 CALL _strayintr(SB); BYTE $0x42 CALL _strayintr(SB); BYTE $0x43 CALL _strayintr(SB); BYTE $0x44 CALL _strayintr(SB); BYTE $0x45 CALL _strayintr(SB); BYTE $0x46 CALL _strayintr(SB); BYTE $0x47 CALL _strayintr(SB); BYTE $0x48 CALL _strayintr(SB); BYTE $0x49 CALL _strayintr(SB); BYTE $0x4A CALL _strayintr(SB); BYTE $0x4B CALL _strayintr(SB); BYTE $0x4C CALL _strayintr(SB); BYTE $0x4D CALL _strayintr(SB); BYTE $0x4E CALL _strayintr(SB); BYTE $0x4F CALL _strayintr(SB); BYTE $0x50 CALL _strayintr(SB); BYTE $0x51 CALL _strayintr(SB); BYTE $0x52 CALL _strayintr(SB); BYTE $0x53 CALL _strayintr(SB); BYTE $0x54 CALL _strayintr(SB); BYTE $0x55 CALL _strayintr(SB); BYTE $0x56 CALL _strayintr(SB); BYTE $0x57 CALL _strayintr(SB); BYTE $0x58 CALL _strayintr(SB); BYTE $0x59 CALL _strayintr(SB); BYTE $0x5A CALL _strayintr(SB); BYTE $0x5B CALL _strayintr(SB); BYTE $0x5C CALL _strayintr(SB); BYTE $0x5D CALL _strayintr(SB); BYTE $0x5E CALL _strayintr(SB); BYTE $0x5F CALL _strayintr(SB); BYTE $0x60 CALL _strayintr(SB); BYTE $0x61 CALL _strayintr(SB); BYTE $0x62 CALL _strayintr(SB); BYTE $0x63 CALL _strayintr(SB); BYTE $0x64 CALL _strayintr(SB); BYTE $0x65 CALL _strayintr(SB); BYTE $0x66 CALL _strayintr(SB); BYTE $0x67 CALL _strayintr(SB); BYTE $0x68 CALL _strayintr(SB); BYTE $0x69 CALL _strayintr(SB); BYTE $0x6A CALL _strayintr(SB); BYTE $0x6B CALL _strayintr(SB); BYTE $0x6C CALL _strayintr(SB); BYTE $0x6D CALL _strayintr(SB); BYTE $0x6E CALL _strayintr(SB); BYTE $0x6F CALL _strayintr(SB); BYTE $0x70 CALL _strayintr(SB); BYTE $0x71 CALL _strayintr(SB); BYTE $0x72 CALL _strayintr(SB); BYTE $0x73 CALL _strayintr(SB); BYTE $0x74 CALL _strayintr(SB); BYTE $0x75 CALL _strayintr(SB); BYTE $0x76 CALL _strayintr(SB); BYTE $0x77 CALL _strayintr(SB); BYTE $0x78 CALL _strayintr(SB); BYTE $0x79 CALL _strayintr(SB); BYTE $0x7A CALL _strayintr(SB); BYTE $0x7B CALL _strayintr(SB); BYTE $0x7C CALL _strayintr(SB); BYTE $0x7D CALL _strayintr(SB); BYTE $0x7E CALL _strayintr(SB); BYTE $0x7F CALL _strayintr(SB); BYTE $0x80 /* Vector[A]PIC */ CALL _strayintr(SB); BYTE $0x81 CALL _strayintr(SB); BYTE $0x82 CALL _strayintr(SB); BYTE $0x83 CALL _strayintr(SB); BYTE $0x84 CALL _strayintr(SB); BYTE $0x85 CALL _strayintr(SB); BYTE $0x86 CALL _strayintr(SB); BYTE $0x87 CALL _strayintr(SB); BYTE $0x88 CALL _strayintr(SB); BYTE $0x89 CALL _strayintr(SB); BYTE $0x8A CALL _strayintr(SB); BYTE $0x8B CALL _strayintr(SB); BYTE $0x8C CALL _strayintr(SB); BYTE $0x8D CALL _strayintr(SB); BYTE 
$0x8E CALL _strayintr(SB); BYTE $0x8F CALL _strayintr(SB); BYTE $0x90 CALL _strayintr(SB); BYTE $0x91 CALL _strayintr(SB); BYTE $0x92 CALL _strayintr(SB); BYTE $0x93 CALL _strayintr(SB); BYTE $0x94 CALL _strayintr(SB); BYTE $0x95 CALL _strayintr(SB); BYTE $0x96 CALL _strayintr(SB); BYTE $0x97 CALL _strayintr(SB); BYTE $0x98 CALL _strayintr(SB); BYTE $0x99 CALL _strayintr(SB); BYTE $0x9A CALL _strayintr(SB); BYTE $0x9B CALL _strayintr(SB); BYTE $0x9C CALL _strayintr(SB); BYTE $0x9D CALL _strayintr(SB); BYTE $0x9E CALL _strayintr(SB); BYTE $0x9F CALL _strayintr(SB); BYTE $0xA0 CALL _strayintr(SB); BYTE $0xA1 CALL _strayintr(SB); BYTE $0xA2 CALL _strayintr(SB); BYTE $0xA3 CALL _strayintr(SB); BYTE $0xA4 CALL _strayintr(SB); BYTE $0xA5 CALL _strayintr(SB); BYTE $0xA6 CALL _strayintr(SB); BYTE $0xA7 CALL _strayintr(SB); BYTE $0xA8 CALL _strayintr(SB); BYTE $0xA9 CALL _strayintr(SB); BYTE $0xAA CALL _strayintr(SB); BYTE $0xAB CALL _strayintr(SB); BYTE $0xAC CALL _strayintr(SB); BYTE $0xAD CALL _strayintr(SB); BYTE $0xAE CALL _strayintr(SB); BYTE $0xAF CALL _strayintr(SB); BYTE $0xB0 CALL _strayintr(SB); BYTE $0xB1 CALL _strayintr(SB); BYTE $0xB2 CALL _strayintr(SB); BYTE $0xB3 CALL _strayintr(SB); BYTE $0xB4 CALL _strayintr(SB); BYTE $0xB5 CALL _strayintr(SB); BYTE $0xB6 CALL _strayintr(SB); BYTE $0xB7 CALL _strayintr(SB); BYTE $0xB8 CALL _strayintr(SB); BYTE $0xB9 CALL _strayintr(SB); BYTE $0xBA CALL _strayintr(SB); BYTE $0xBB CALL _strayintr(SB); BYTE $0xBC CALL _strayintr(SB); BYTE $0xBD CALL _strayintr(SB); BYTE $0xBE CALL _strayintr(SB); BYTE $0xBF CALL _strayintr(SB); BYTE $0xC0 CALL _strayintr(SB); BYTE $0xC1 CALL _strayintr(SB); BYTE $0xC2 CALL _strayintr(SB); BYTE $0xC3 CALL _strayintr(SB); BYTE $0xC4 CALL _strayintr(SB); BYTE $0xC5 CALL _strayintr(SB); BYTE $0xC6 CALL _strayintr(SB); BYTE $0xC7 CALL _strayintr(SB); BYTE $0xC8 CALL _strayintr(SB); BYTE $0xC9 CALL _strayintr(SB); BYTE $0xCA CALL _strayintr(SB); BYTE $0xCB CALL _strayintr(SB); BYTE $0xCC CALL _strayintr(SB); BYTE $0xCD CALL _strayintr(SB); BYTE $0xCE CALL _strayintr(SB); BYTE $0xCF CALL _strayintr(SB); BYTE $0xD0 CALL _strayintr(SB); BYTE $0xD1 CALL _strayintr(SB); BYTE $0xD2 CALL _strayintr(SB); BYTE $0xD3 CALL _strayintr(SB); BYTE $0xD4 CALL _strayintr(SB); BYTE $0xD5 CALL _strayintr(SB); BYTE $0xD6 CALL _strayintr(SB); BYTE $0xD7 CALL _strayintr(SB); BYTE $0xD8 CALL _strayintr(SB); BYTE $0xD9 CALL _strayintr(SB); BYTE $0xDA CALL _strayintr(SB); BYTE $0xDB CALL _strayintr(SB); BYTE $0xDC CALL _strayintr(SB); BYTE $0xDD CALL _strayintr(SB); BYTE $0xDE CALL _strayintr(SB); BYTE $0xDF CALL _strayintr(SB); BYTE $0xE0 CALL _strayintr(SB); BYTE $0xE1 CALL _strayintr(SB); BYTE $0xE2 CALL _strayintr(SB); BYTE $0xE3 CALL _strayintr(SB); BYTE $0xE4 CALL _strayintr(SB); BYTE $0xE5 CALL _strayintr(SB); BYTE $0xE6 CALL _strayintr(SB); BYTE $0xE7 CALL _strayintr(SB); BYTE $0xE8 CALL _strayintr(SB); BYTE $0xE9 CALL _strayintr(SB); BYTE $0xEA CALL _strayintr(SB); BYTE $0xEB CALL _strayintr(SB); BYTE $0xEC CALL _strayintr(SB); BYTE $0xED CALL _strayintr(SB); BYTE $0xEE CALL _strayintr(SB); BYTE $0xEF CALL _strayintr(SB); BYTE $0xF0 CALL _strayintr(SB); BYTE $0xF1 CALL _strayintr(SB); BYTE $0xF2 CALL _strayintr(SB); BYTE $0xF3 CALL _strayintr(SB); BYTE $0xF4 CALL _strayintr(SB); BYTE $0xF5 CALL _strayintr(SB); BYTE $0xF6 CALL _strayintr(SB); BYTE $0xF7 CALL _strayintr(SB); BYTE $0xF8 CALL _strayintr(SB); BYTE $0xF9 CALL _strayintr(SB); BYTE $0xFA CALL _strayintr(SB); BYTE $0xFB CALL _strayintr(SB); BYTE $0xFC CALL _strayintr(SB); BYTE 
$0xFD CALL _strayintr(SB); BYTE $0xFE CALL _strayintr(SB); BYTE $0xFF
lguest25/lg.h 664 0 0 23563 10774531324 12162ustar00bootesbootes
#ifndef _LGUEST_H #define _LGUEST_H #include #define GDT_ENTRY_LGUEST_CS 10 #define GDT_ENTRY_LGUEST_DS 11 #define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) #define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) #ifndef __ASSEMBLY__ #include #include #include #include #include #include #include #include #include #include "irq_vectors.h" #define GUEST_PL 1 struct lguest_regs { /* Manually saved part. */ unsigned long ebx, ecx, edx; unsigned long esi, edi, ebp; unsigned long gs; unsigned long eax; unsigned long fs, ds, es; unsigned long trapnum, errcode; /* Trap pushed part */ unsigned long eip; unsigned long cs; unsigned long eflags; unsigned long esp; unsigned long ss; }; void free_pagetables(void); int init_pagetables(struct page **switcher_page, unsigned int pages); /* Full 4G segment descriptors, suitable for CS and DS. */ #define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) #define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) struct lguest_dma_info { struct list_head list; union futex_key key; unsigned long dmas; u16 next_dma; u16 num_dmas; u16 guestid; u8 interrupt; /* 0 when not registered */ }; /*H:310 The page-table code owes a great debt of gratitude to Andi Kleen. He * reviewed the original code which used "u32" for all page table entries, and * insisted that it would be far clearer with explicit typing. I thought it * was overkill, but he was right: it is much clearer than it was before. * * We have separate types for the Guest's ptes & pgds and the shadow ptes & * pgds. There's already a Linux type for these (pte_t and pgd_t) but they * change depending on kernel config options (PAE). */ /* Each entry is identical: lower 12 bits of flags and upper 20 bits for the * "page frame number" (0 == first physical page, etc). They are different * types so the compiler will warn us if we mix them improperly. */ typedef union { struct { unsigned flags:12, pfn:20; }; struct { unsigned long val; } raw; } spgd_t; typedef union { struct { unsigned flags:12, pfn:20; }; struct { unsigned long val; } raw; } spte_t; typedef union { struct { unsigned flags:12, pfn:20; }; struct { unsigned long val; } raw; } gpgd_t; typedef union { struct { unsigned flags:12, pfn:20; }; struct { unsigned long val; } raw; } gpte_t; /* We have two convenient macros to convert a "raw" value as handed to us by * the Guest into the correct Guest PGD or PTE type. */ #define mkgpte(_val) ((gpte_t){.raw.val = _val}) #define mkgpgd(_val) ((gpgd_t){.raw.val = _val}) /*:*/ struct pgdir { unsigned long cr3; spgd_t *pgdir; }; /* This is the TSS defined by the hardware.
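 * Each field whose name ends in h (__blh, __ss0h and so on) is explicit
 * padding: the hardware reserves a full 32 bits for every TSS slot, but
 * selectors are only 16, so the unused halves are spelled out by hand.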
*/ struct i386_hw_tss { unsigned short back_link,__blh; unsigned long esp0; unsigned short ss0,__ss0h; unsigned long esp1; unsigned short ss1,__ss1h; unsigned long esp2; unsigned short ss2,__ss2h; unsigned long __cr3; unsigned long eip; unsigned long eflags; unsigned long eax,ecx,edx,ebx; unsigned long esp; unsigned long ebp; unsigned long esi; unsigned long edi; unsigned short es, __esh; unsigned short cs, __csh; unsigned short ss, __ssh; unsigned short ds, __dsh; unsigned short fs, __fsh; unsigned short gs, __gsh; unsigned short ldt, __ldth; unsigned short trace, io_bitmap_base; } __attribute__((packed)); /* This is a guest-specific page (mapped ro) into the guest. */ struct lguest_ro_state { /* Host information we need to restore when we switch back. */ u32 host_cr3; struct Xgt_desc_struct host_idt_desc; struct Xgt_desc_struct host_gdt_desc; u32 host_sp; /* Fields which are used when guest is running. */ struct Xgt_desc_struct guest_idt_desc; struct Xgt_desc_struct guest_gdt_desc; struct i386_hw_tss guest_tss; struct desc_struct guest_idt[IDT_ENTRIES]; struct desc_struct guest_gdt[GDT_ENTRIES]; }; /* We have two pages shared with guests, per cpu. */ struct lguest_pages { /* This is the stack page mapped rw in guest */ char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; struct lguest_regs regs; /* This is the host state & guest descriptor page, ro in guest */ struct lguest_ro_state state; } __attribute__((aligned(PAGE_SIZE))); #define CHANGED_IDT 1 #define CHANGED_GDT 2 #define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */ #define CHANGED_ALL 3 /* The private info the thread maintains about the guest. */ struct lguest { /* At end of a page shared mapped over lguest_pages in guest. */ unsigned long regs_page; struct lguest_regs *regs; struct lguest_data __user *lguest_data; struct task_struct *tsk; struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ u16 guestid; u32 pfn_limit; u32 page_offset; u32 cr2; int halted; int ts; u32 last_timer; u32 next_hcall; u32 esp1; u8 ss1; /* Bitmap of what has changed: see CHANGED_* above. */ int changed; struct lguest_pages *last_pages; /* We keep a small number of these. */ u32 pgdidx; struct pgdir pgdirs[4]; /* Cached wakeup: we hold a reference to this task. */ struct task_struct *wake; unsigned long noirq_start, noirq_end; int dma_is_pending; unsigned long pending_dma; /* struct lguest_dma */ unsigned long pending_key; /* address they're sending to */ unsigned int stack_pages; struct lguest_dma_info dma[LGUEST_MAX_DMA]; /* Dead? */ const char *dead; /* The GDT entries copied into lguest_ro_state when running. */ struct desc_struct gdt[GDT_ENTRIES]; /* The IDT entries: some copied into lguest_ro_state when running. 
*/ struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS]; struct desc_struct syscall_idt; /* Pending virtual interrupts */ DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); }; extern struct lguest lguests[]; extern struct mutex lguest_lock; /* core.c: */ u32 lgread_u32(struct lguest *lg, unsigned long addr); void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val); void lgread(struct lguest *lg, void *buf, unsigned long addr, unsigned len); void lgwrite(struct lguest *lg, unsigned long, const void *buf, unsigned len); int find_free_guest(void); int lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len); int run_guest(struct lguest *lg, unsigned long __user *user); /* interrupts_and_traps.c: */ void maybe_do_interrupt(struct lguest *lg); int deliver_trap(struct lguest *lg, unsigned int num); void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi); void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages); void pin_stack_pages(struct lguest *lg); void setup_default_idt_entries(struct lguest_ro_state *state, const unsigned long *def); void copy_traps(const struct lguest *lg, struct desc_struct *idt, const unsigned long *def); /* segments.c: */ void setup_default_gdt_entries(struct lguest_ro_state *state); void setup_guest_gdt(struct lguest *lg); void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num); void guest_load_tls(struct lguest *lg, unsigned long tls_array); void copy_gdt(const struct lguest *lg, struct desc_struct *gdt); void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt); /* page_tables.c: */ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); void free_guest_pagetable(struct lguest *lg); void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i); void guest_pagetable_clear_all(struct lguest *lg); void guest_pagetable_flush_user(struct lguest *lg); void guest_set_pte(struct lguest *lg, unsigned long cr3, unsigned long vaddr, gpte_t val); void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); int demand_page(struct lguest *info, unsigned long cr2, int errcode); void pin_page(struct lguest *lg, unsigned long vaddr); /* lguest_user.c: */ int lguest_device_init(void); void lguest_device_remove(void); /* io.c: */ void lguest_io_init(void); int bind_dma(struct lguest *lg, unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt); void send_dma(struct lguest *info, unsigned long key, unsigned long udma); void release_all_dma(struct lguest *lg); unsigned long get_dma_buffer(struct lguest *lg, unsigned long key, unsigned long *interrupt); void set_wakeup_process(struct lguest *lg, struct task_struct *p); /* hypercalls.c: */ void do_hypercalls(struct lguest *lg); /*L:035 * Let's step aside for the moment, to study one important routine that's used * widely in the Host code. * * There are many cases where the Guest does something invalid, like pass crap * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite * acceptable to simply terminate the Guest and give the Launcher a nicely * formatted reason. It's also simpler for the Guest itself, which doesn't * need to check most hypercalls for "success"; if you're still running, it * succeeded. * * Once this is called, the Guest will never run again, so most Host code can * call this then continue as if nothing had happened. 
This means many * functions don't have to explicitly return an error code, which keeps the * code simple. * * It also means that this can be called more than once: only the first one is * remembered. The only trick is that we still need to kill the Guest even if * we can't allocate memory to store the reason. Linux has a neat way of * packing error codes into invalid pointers, so we use that here. * * Like any macro which uses an "if", it is safely wrapped in a run-once "do { * } while(0)". */ #define kill_guest(lg, fmt...) \ do { \ if (!(lg)->dead) { \ (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \ if (!(lg)->dead) \ (lg)->dead = ERR_PTR(-ENOMEM); \ } \ } while(0) /* (End of aside) :*/ static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) { return vaddr - lg->page_offset; } #endif /* __ASSEMBLY__ */ #endif /* _LGUEST_H */
lguest25/lgcpu.rc 775 0 0 1622 11001461215 13003ustar00bootesbootes
#!/boot/rc -m /boot/rcmain /boot/bind /boot /bin cputype=386 objtype=$cputype service=cpu user=glenda rootdir=/root rootspec='' echo -n $user > /dev/hostowner auth=() fs=() bind '#c' /dev bind '#d' /fd bind -c '#e' /env bind '#p' /proc bind -c '#s' /srv bind -a '#S' /dev bind -a '#I' /net #echo -n $user > /dev/hostowner #i=`{sed '' /net/ipifc/clone} #echo bind loopback /dev/null > /net/ipifc/$i/ctl #echo add 127.0.0.1 255.0.0.0 127.0.0.0 > /net/ipifc/$i/ctl #fdisk -p /dev/sd00/data > /dev/sd00/ctl #prep -p /dev/sd00/plan9 > /dev/sd00/ctl #ls /dev/sd00 echo -n tcp!127.1!17034 > /env/venti /boot/echo "HI THERE!" #venti -c /dev/sd00/arenas -a tcp!127.1!17034 #fossil -c 'srv -p fscons' -c 'srv -A boot' -f /dev/sd00/fossil #mount -c /srv/boot /root #bind -bc /root / #ip/ipconfig ether /net/ether0 192.168.19.2 255.255.255.0 #factotum #/386/init while(/boot/echo Hello Squidboy) . -i '#d/0'
lguest25/lgkbd.c 664 0 0 412 11022045270 12545ustar00bootesbootes
#include "u.h" #include "../port/lib.h" #include "mem.h" #include "dat.h" #include "fns.h" #include "io.h" #include "../port/error.h" #pragma profile 0 void kbdenable(void) { kbdq = qopen(4*1024, 0, 0, 0); if(kbdq == nil) panic("kbdinit"); qnoblock(kbdq, 1); }
lguest25/lgtod.c 664 0 0 6612 11022045275 12630ustar00bootesbootes
#include "u.h" #include "../port/lib.h" #include "mem.h" #include "dat.h" #include "fns.h" #include "../port/error.h" #include "lguest.h" #pragma profile 0 /* I'm really tired of trying to make port/tod.c work right, and it makes * no sense to try -- we have a ns clock tied to tod, so let's use it. * also the clock on lguest is 4*1024*1024*1024, not 1000000000 * tod.c coughs and dies on this, it seems. */ /* * Compute nanosecond epoch time from the fastest ticking clock * on the system. Converting the time to nanoseconds requires * the following formula * * t = (((1000000000<<31)/f)*ticks)>>31 * * where * * 'f' is the clock frequency * 'ticks' are clock ticks * * to avoid too much calculation in todget(), we calculate * * mult = (1000000000<<32)/f * * each time f is set. f is normally set by a user level * program writing to /dev/fastclock. mul64fract will then * take that fractional multiplier and a 64 bit integer and * return the resulting integer product. * * We assume that the cpu's of a multiprocessor are synchronized. * This assumption needs to be questioned with each new architecture.
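 *
 * (The magic numbers in todget() below: the lguest clock effectively
 * ticks 2^32 times a second, so ns = ticks*1000000000/2^32, and
 * x = x/4398*1024 approximates that ratio -- 2^32/10^9 = 4.29497,
 * 4398/1024 = 4.29492 -- while keeping the intermediates in 64 bits.)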
*/ /* frequency of the tod clock */ #define TODFREQ 1000000000ULL #define MicroFREQ 1000000ULL struct { int init; // true if initialized ulong cnt; Lock; uvlong multiplier; // ns = off + (multiplier*ticks)>>31 uvlong divider; // ticks = (divider*(ns-off))>>31 uvlong umultiplier; // µs = (µmultiplier*ticks)>>31 uvlong udivider; // ticks = (µdivider*µs)>>31 vlong hz; // frequency of fast clock vlong last; // last reading of fast clock vlong off; // offset from epoch to last vlong lasttime; // last return value from todget vlong delta; // add 'delta' each slow clock tick from sstart to send ulong sstart; // ... ulong send; // ... } tod; void todinit(void) { if(tod.init) return; iprint("todinit ... "); ilock(&tod); iprint("ilock &tod ok\n"); tod.last = fastticks((uvlong*)&tod.hz); iunlock(&tod); tod.init = 1; } void todsetfreq(vlong ) { } /* * Set the time of day struct -- we don't allow this on lguest. nop. */ void todset(vlong , vlong, int ) { if(!tod.init) todinit(); } /* * get time of day */ vlong todget(vlong *ticksp) { uvlong x; if(!tod.init) todinit(); // we don't want time to pass twixt the measuring of fastticks // and grabbing tod.last. Also none of the vlongs are atomic so // we have to look at them inside the lock. ilock(&tod); tod.cnt++; x = fastticks(nil); // time can't go backwards if(x < tod.lasttime) x = tod.lasttime; else tod.lasttime = x; iunlock(&tod); /* what a mess this is. */ /* scale lguest time to real nanoseconds. */ x /= 4398; x *= 1024; /* this is now scaled to ns. */ if(ticksp != nil) *ticksp = x; //iprint("todget %#ulx:%#ulx\n", (long)(x>>32), (long)x); return x; } long seconds(void) { int i; i = lguest_get_wallclock(); return i; } uvlong fastticks2us(uvlong ticks) { uvlong res; if(!tod.init) todinit(); res = ticks / 1000; return res; } uvlong us2fastticks(uvlong us) { uvlong res; if(!tod.init) todinit(); res = us * 1000; return res; } /* * convert milliseconds to fast ticks */ uvlong ms2fastticks(ulong ms) { if(!tod.init) todinit(); return ms * 1000000ULL; } /* * convert nanoseconds to fast ticks */ uvlong ns2fastticks(uvlong ns) { return ns; } /* * convert fast ticks to ns */ uvlong fastticks2ns(uvlong ticks) { return ticks; }
lguest25/lguest 664 0 0 1616 10774531324 12610ustar00bootesbootes
dev root cons arch env pipe proc mnt srv dup ssl tls cap kprof ether netif ip arp chandial ip ipv6 ipaux iproute netlog nullmedium pktmedium ptclbsum386 inferno uart rtc sd link ethermedium netdevmedium loopbackmedium etherlg misc uartlg sdlg ip il tcp udp ipifc icmp icmp6 gre port int cpuserver = 0; boot boot #S/sd00/data tcp il local bootdir # bootlguest.out boot /386/bin/bind /386/bin/echo /386/bin/date /386/bin/ls /386/bin/dd /386/bin/mount /386/bin/ps /386/bin/cat /386/bin/rc /386/bin/sleep /386/bin/sed /386/bin/test /386/bin/auth/factotum /386/bin/ip/ipconfig /386/bin/ip/ping /386/bin/import /386/bin/ip/traceroute /386/bin/aux/listen1 /386/bin/disk/prep /386/bin/disk/fdisk /386/bin/fossil/fossil /386/bin/venti/venti /rc/lib/rcmain /386/bin/9660srv kfs /386/bin/ns /386/bin/ramfs /386/bin/netstat diskpart lgcpu.rc boot
lguest25/lguest.h 664 0 0 20714 11000314671 13041ustar00bootesbootes
/* * Copyright (C) 2006, Rusty Russell IBM Corporation.
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* mods for Plan 9 by Ron Minnich, but the GPL stays I think. */ /* lguest structs */ /* these were automatically created by the program buildlgh.c in this directory */ #define GUEST_PL 1 #define IDT_ENTRIES 256 #define GDT_ENTRIES 32 typedef unsigned long u32; typedef unsigned short u16; typedef unsigned char u8; struct Xgt_desc_struct { u8 data[8]; }; struct desc_struct { u32 a,b; }; /* futex_key has no meaning to Plan 9 ...*/ struct futex_key { u32 pgoff; void *v; u32 offset; }; struct lguest_regs { /* Manually saved part. */ u32 ebx, ecx, edx; u32 esi, edi, ebp; u32 gs; u32 eax; u32 fs, ds, es; u32 trapnum, errcode; /* Trap pushed part */ u32 eip; u32 cs; u32 eflags; u32 esp; u32 ss; }; /* Full 4G segment descriptors, suitable for CS and DS. */ #define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) #define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) struct list_head { struct list_head *next, *prev; }; struct lguest_dma_info { struct list_head list; struct futex_key key; u32 dmas; u16 next_dma; u16 num_dmas; u16 guestid; u8 interrupt; /* 0 when not registered */ }; struct pgdir { unsigned long cr3; u32 *pgdir; }; struct tss_struct { u8 data[104];}; /* This is a guest-specific page (mapped ro) into the guest. */ struct lguest_ro_state { /* Host information we need to restore when we switch back. */ u32 host_cr3; struct Xgt_desc_struct host_idt_desc; struct Xgt_desc_struct host_gdt_desc; u32 host_sp; /* Fields which are used when guest is running. */ struct Xgt_desc_struct guest_idt_desc; struct Xgt_desc_struct guest_gdt_desc; struct tss_struct guest_tss; struct desc_struct guest_idt[IDT_ENTRIES]; struct desc_struct guest_gdt[GDT_ENTRIES]; }; /* We have two pages shared with guests, per cpu. 
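 * Within the first page the register block sits at the very top: spare[]
 * is sized as BY2PG minus sizeof(struct lguest_regs) precisely so that
 * regs ends at the page boundary and the whole struct stays page-sized.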
*/ struct lguest_pages { /* This is the stack page mapped rw in guest */ char spare[BY2PG - sizeof(struct lguest_regs)]; struct lguest_regs regs; /* This is the host state & guest descriptor page, ro in guest */ struct lguest_ro_state state; }; #define LHCALL_FLUSH_ASYNC 0 #define LHCALL_LGUEST_INIT 1 #define LHCALL_CRASH 2 #define LHCALL_LOAD_GDT 3 #define LHCALL_NEW_PGTABLE 4 #define LHCALL_FLUSH_TLB 5 #define LHCALL_LOAD_IDT_ENTRY 6 #define LHCALL_SET_STACK 7 #define LHCALL_TS 8 #define LHCALL_SET_CLOCKEVENT 9 #define LHCALL_HALT 10 #define LHCALL_BIND_DMA 12 #define LHCALL_SET_PTE 14 #define LHCALL_SET_PMD 15 #define LHCALL_LOAD_TLS 16 #define LHCALL_NOTIFY 17 /* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ #define LGUEST_SHUTDOWN_POWEROFF 1 #define LGUEST_SHUTDOWN_RESTART 2 #define LGUEST_TRAP_ENTRY 0x1F #define LG_CLOCK_MIN_DELTA 100UL #define LG_CLOCK_MAX_DELTA ULONG_MAX unsigned long hcall(unsigned long call, unsigned long arg1, unsigned long arg2, unsigned long arg3); /*:*/ void async_hcall(unsigned long call, unsigned long arg1, unsigned long arg2, unsigned long arg3); /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) #define LHCALL_RING_SIZE 64 struct hcall_ring { u32 eax, edx, ebx, ecx; }; struct timespec { u32 seconds, nanoseconds; }; /*G:032 The second method of communicating with the Host is to via "struct * lguest_data". The Guest's very first hypercall is to tell the Host where * this is, and then the Guest and Host both publish information in it. :*/ #define NR_IRQS 32 struct lguest_data { /* 512 == enabled (same as eflags in normal hardware). The Guest * changes interrupts so often that a hypercall is too slow. */ unsigned int irq_enabled; /* Fine-grained interrupt disabling by the Guest */ u32 blocked_interrupts[LGUEST_IRQS/32]; /* The Host writes the virtual address of the last page fault here, * which saves the Guest a hypercall. CR2 is the native register where * this address would normally be found. */ unsigned long cr2; /* Wallclock time set by the Host. */ struct timespec time; /* Async hypercall ring. Instead of directly making hypercalls, we can * place them in here for processing the next time the Host wants. * This batching can be quite efficient. */ /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ u8 hcall_status[LHCALL_RING_SIZE]; /* The actual registers for the hypercalls. */ struct hcall_ring hcalls[LHCALL_RING_SIZE]; /* Fields initialized by the Host at boot: */ /* Memory not to try to access */ unsigned long reserve_mem; /* KHz for the TSC clock. */ u32 tsc_khz; /* Page where the top-level pagetable is */ unsigned long pgdir; /* Fields initialized by the Guest at boot: */ /* Instruction range to suppress interrupts even if enabled */ unsigned long noirq_start, noirq_end; /* Address above which page tables are all identical. */ unsigned long kernel_address; /* The vector to try to use for system calls (0x40 or 0x80). */ unsigned int syscall_vec; }; extern void lguest_noirq_start(void); extern void lguest_noirq_end(void); extern struct lguest_data lguest_data; /* defines for I/O etc. */ /* Everything the "lguest" userspace program needs to know. */ struct lguest_device_desc { /* The device type: console, network, disk etc. Type 0 terminates. */ u8 type; /* The number of virtqueues (first in config array) */ u8 num_vq; /* The number of bytes of feature bits. Multiply by 2: one for host * features and one for Guest acknowledgements. 
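 * (Putting the pieces together: config[] holds the num_vq virtqueue
 * descriptions first, then 2*feature_len bytes of feature bits, then
 * config_len bytes of device-specific configuration.)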
*/ u8 feature_len; /* The number of bytes of the config array after virtqueues. */ u8 config_len; /* A status byte, written by the Guest. */ u8 status; u8 config[]; }; /* layout: These come first in the config array, there are num_vq of them */ struct lguest_vqconfig { /* The number of entries in the virtio_ring */ u16 num; /* The interrupt we get when something happens. */ u16 irq; /* The page number of the virtio ring for this device. */ u32 pfn; }; enum lguest_req { LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */ LHREQ_GETDMA, /* + unused (old lguest) */ LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ }; /* and a few bits from Linux itself. */ #define PARAVIRT_LAZY_NONE 0 #define PARAVIRT_LAZY_MMU 1 #define PARAVIRT_LAZY_CPU 2 #define PARAVIRT_LAZY_FLUSH 3 #define X86_EFLAGS_IF 0x00000200 /* e820 structs as Linux sets them up */ #define E820MAP 0x2d0 /* our map */ #define E820MAX 128 /* number of entries in E820MAP */ #define E820NR 0x1e8 /* # entries in E820MAP */ #define E820_RAM 1 #define E820_RESERVED 2 #define E820_ACPI 3 #define E820_NVS 4 #define HIGH_MEMORY (1024*1024) struct e820map { int nr_map; struct e820entry { unsigned long long addr; /* start of memory segment */ unsigned long long size; /* size of memory segment */ unsigned long type; /* type of memory segment */ } map[E820MAX]; }; /* prototypes */ void lguest_send_dma(unsigned long key, struct lguest_dma *dma); void lguest_load_esp0(u32 stack); void lguest_flush_tlb_kernel(void); void lguest_flush_tlb_single(u32 addr); void lguest_set_pmd(u32 *pmdp, u32 pmdval); void lguest_set_pte(u32 *ptep, u32 pteval); void lguest_set_pte_at(ulong pdb, u32 addr, u32 *ptep, u32 pteval); int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, unsigned int num, u8 irq); unsigned long lguest_get_wallclock(void); uvlong lguest_get_ns(void); void lguest_interval_timer(u32); int setupdma(void *buf, int len, struct lguest_dma *dma); /* interrupt numbers -- stupid hardcodes */ /* FIX ME SOON */ #define CONSINTR 33 #define BLOCKINTR 34 #define NETINTR 40 lguest25/lguestcpu 664 0 0 1615 11035171357 13314ustar00bootesbootesdev root cons arch env pipe proc mnt srv dup ssl tls cap lgv ether netif ip arp chandial ip ipv6 ipaux iproute netlog nullmedium pktmedium ptclbsum386 inferno uart rtc sd link ethermedium netdevmedium loopbackmedium etherlg misc uartlg sdlg ip tcp udp ipifc icmp icmp6 gre port int cpuserver = 1; boot cpu boot #S/sd00/data tcp il local bootdir bootlguestcpu.out boot /386/bin/bind /386/bin/echo /386/bin/date /386/bin/ls /386/bin/dd /386/bin/mount /386/bin/ps /386/bin/cat /386/bin/rc /386/bin/sleep /386/bin/sed /386/bin/test /386/bin/auth/factotum /386/bin/ip/ipconfig /386/bin/ip/ping /386/bin/import /386/bin/ip/traceroute /386/bin/aux/listen1 /386/bin/disk/prep /386/bin/disk/fdisk /386/bin/fossil/fossil /386/bin/venti/venti /rc/lib/rcmain /386/bin/9660srv kfs /386/bin/ns /386/bin/ramfs /386/bin/netstat diskpart # lgcpu.rc bootlguest25/lguesthcall.c 664 0 0 72410 11033174457 14054ustar00bootesbootes/*P:010 * A hypervisor allows multiple Operating Systems to run on a single machine. * To quote David Wheeler: "Any problem in computer science can be solved with * another layer of indirection." * * We keep things simple in two ways. First, we start with a normal Linux * kernel and insert a module (lg.ko) which allows us to run other Linux * kernels the same way we'd run processes. We call the first kernel the Host, * and the others the Guests. 
The program which sets up and configures Guests * (such as the example in Documentation/lguest/lguest.c) is called the * Launcher. * * Secondly, we only run specially modified Guests, not normal kernels. When * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows * how to be a Guest. This means that you can use the same kernel you boot * normally (ie. as a Host) as a Guest. * * These Guests know that they cannot do privileged operations, such as disable * interrupts, and that they have to ask the Host to do such things explicitly. * This file consists of all the replacements for such low-level native * hardware operations: these special Guest versions call the Host. * * So how does the kernel know it's a Guest? The very first instructions the * 32-bit kernel runs are at startup_32 in arch/i386/kernel/head.S. This tests * if we're fully privileged: if we're not, we know we're under some kind of * hypervisor. We end up here, where we replace the native functions in * "struct paravirt_ops" with our Guest versions. :*/ #pragma profile 0 /* * Copyright (C) 2006, Rusty Russell IBM Corporation. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /*G:010 Welcome to the Guest! * * The Guest in our tale is a simple creature: identical to the Host but * behaving in simplified but equivalent ways. In particular, the Guest is the * same kernel as the Host (or at least, built from the same source code). :*/ #include "u.h" #include "../port/lib.h" #include "mem.h" #include "dat.h" #include "fns.h" #include "io.h" #include "ureg.h" #include "pool.h" #include "lguest.h" extern void lguest_iret(void); extern struct lguest_data lguest_data; struct lguest_device_desc *lguest_devices; /*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first * real optimization trick! * * When lazy_mode is set, it means we're allowed to defer all hypercalls and do * them as a batch when lazy_mode is eventually turned off. Because hypercalls * are reasonably expensive, batching them up makes sense. For example, a * large mmap might update dozens of page table entries: that code calls * lguest_lazy_mode(PARAVIRT_LAZY_MMU), does the dozen updates, then calls * lguest_lazy_mode(PARAVIRT_LAZY_NONE). * * So, when we're in lazy mode, we call async_hypercall() to store the call for * future processing. When lazy mode is turned off we issue a hypercall to * flush the stored calls. */ static int lazy_mode; /* Note: not SMP-safe! 
*/ static void lguest_lazy_mode(int mode) { lazy_mode = mode; if (mode == PARAVIRT_LAZY_NONE) hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); } static void lazy_hcall(unsigned long call, unsigned long arg1, unsigned long arg2, unsigned long arg3) { if (lazy_mode == PARAVIRT_LAZY_NONE) hcall(call, arg1, arg2, arg3); else async_hcall(call, arg1, arg2, arg3); } /* async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run through next time we * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall * arguments, and a "hcall_status" word which is 0 if the call is ready to go, * and 255 once the Host has finished with it. * * If we come around to a slot which hasn't been finished, then the table is * full and we just make the hypercall directly. This has the nice side * effect of causing the Host to run all the stored calls in the ring buffer * which empties it for next time! */ void async_hcall(unsigned long call, unsigned long arg1, unsigned long arg2, unsigned long arg3) { /* Note: This code assumes we're uniprocessor. */ static unsigned int next_call; int s; /* Disable interrupts if not already disabled: we don't want an * interrupt handler making a hypercall while we're already doing * one! */ s = splhi(); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ hcall(call, arg1, arg2, arg3); } else { lguest_data.hcalls[next_call].eax = call; lguest_data.hcalls[next_call].edx = arg1; lguest_data.hcalls[next_call].ebx = arg2; lguest_data.hcalls[next_call].ecx = arg3; /* Arguments must all be written before we mark it to go */ coherence(); lguest_data.hcall_status[next_call] = 0; if (++next_call == LHCALL_RING_SIZE) next_call = 0; } splx(s); } /*:*/ void *lguest_map(unsigned long phys_addr, unsigned long pages) { dumpstack(); panic("called lguest_map with %#lx, pages %ld", phys_addr, pages); return nil; } void lguest_unmap(void *addr) { dumpstack(); panic("called lguest_unmap with %p", addr); } /*G:033 * Here are our first native-instruction replacements: four functions for * interrupt control. * * The simplest way of implementing these would be to have "turn interrupts * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: * these are by far the most commonly called functions of those we override. * * So instead we keep an "irq_enabled" field inside our "struct lguest_data", * which the Guest can update with a single instruction. The Host knows to * check there when it wants to deliver an interrupt. */ /* save_flags() is expected to return the processor state (ie. "eflags"). The * eflags word contains all kinds of stuff, but in practice Linux only cares * about the interrupt flag. Our "save_flags()" just returns that. */ int islo(void) { return lguest_data.irq_enabled; } /* "restore_flags" just sets the flags back to the value given. */ void splx(int flags) { lguest_data.irq_enabled = flags; } /* Interrupts go off... */ int splhi(void) { int old = lguest_data.irq_enabled; lguest_data.irq_enabled = 0; return old; } /* Interrupts go on... */ int spllo(void) { int old = lguest_data.irq_enabled; /* we use the Linux define here since it may change in lguest at some point */ lguest_data.irq_enabled = X86_EFLAGS_IF; return old; } /*:*/ /*M:003 Note that we don't check for outstanding interrupts when we re-enable * them (or when we unmask an interrupt).
This seems to work for the moment, * since interrupts are rare and we'll just get the interrupt on the next timer * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way * would be to put the "irq_enabled" field in a page by itself, and have the * Host write-protect it when an interrupt comes in when irqs are disabled. * There will then be a page fault as soon as interrupts are re-enabled. :*/ static inline void write_dt_entry(void *dt, int entry, u32 entry_a, u32 entry_b) { u32 *lp = (u32 *)((char *)dt + entry*8); *lp = entry_a; *(lp+1) = entry_b; } /*G:034 * The Interrupt Descriptor Table (IDT). * * The IDT tells the processor what to do when an interrupt comes in. Each * entry in the table is a 64-bit descriptor: this holds the privilege level, * address of the handler, and... well, who cares? The Guest just asks the * Host to make the change anyway, because the Host controls the real IDT. */ void lguest_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) { /* Keep the local copy up to date. */ write_dt_entry(dt, entrynum, low, high); /* Tell Host about this new entry. */ hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); } /* Changing to a different IDT is very rare: we keep the IDT up-to-date every * time it is written, so we can simply loop through all entries and tell the * Host about them. */ void lidt(ushort *idtp) { unsigned int i; /* size is the first short */ u16 size = idtp[0]; /* descriptor is in the long after the first short */ u32 address = *(u32 *)&idtp[1]; struct desc_struct *idt = (void *)address; /* only do 64, no matter how many it says. lguest ignores the others * and, if you do 128 by mistake, it will overwrite the plan 9 system call * entry at 64. We have to fix lguest to take system call # as a parameter */ size = size > 64 ? 64 : size; for (i = 0; i < size; i++){ iprint("Vec %d low %p, high %p\n", i, (void *)idt[i].a, (void *)idt[i].b); hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); } /* now do the system call entry */ i = 0x40; hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); } /* * The Global Descriptor Table. * * The Intel architecture defines another table, called the Global Descriptor * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt" * instruction, and then several other instructions refer to entries in the * table. There are three entries which the Switcher needs, so the Host simply * controls the entire thing and the Guest asks it to make changes using the * LOAD_GDT hypercall. * * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY * hypercall and use that repeatedly to load a new IDT. I don't think it * really matters, but wouldn't it be nice if they were the same? */ void lgdt(ushort *gdtp) { /* size is the first short */ u16 size = gdtp[0]; /* descriptor is in the long after the first short */ u32 address = *(u32 *)&gdtp[1]; struct desc_struct *gdt = (struct desc_struct *) address; iprint("lgdt at %p, entries at %p\n", gdtp, gdt); if (((size+1)/8) != GDT_ENTRIES) panic("bad gdt in lguest_load_gdt: need %d entries, got %d\n", GDT_ENTRIES, (size+1)/8); // BUG_ON((desc->size+1)/8 != GDT_ENTRIES); iprint("Load the gdt at %p\n", gdt); hcall(LHCALL_LOAD_GDT, paddr(gdt), GDT_ENTRIES, 0); } /* For a single GDT entry which changes, we do the lazy thing: alter our GDT, * then tell the Host to reload the entire thing. This operation is so rare * that this naive implementation is reasonable. 
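 */

/* A hedged aside, not part of this source: the two words that
 * write_dt_entry() stores follow the i386 descriptor layout, and
 * FULL_SEGMENT's magic pair (0x0000ffff, 0x00cf9300) decodes as a flat
 * 4GB ring-0 writable data segment. Built by hand from base, limit,
 * access and flags it would look like this (mkdesc is hypothetical): */
static void
mkdesc(u32 base, u32 limit, u32 access, u32 flags, u32 *lo, u32 *hi)
{
	/* word 0: limit bits 15:0 in the low half, base bits 15:0 above */
	*lo = (limit & 0xFFFF) | (base << 16);
	/* word 1: base 23:16, access byte, limit 19:16, flags, base 31:24 */
	*hi = ((base >> 16) & 0xFF) | (access << 8) | (limit & 0xF0000)
		| ((flags & 0xF) << 20) | (base & 0xFF000000);
}

/* mkdesc(0, 0xFFFFF, 0x93, 0xC, &lo, &hi) reproduces FULL_SEGMENT;
 * back to the code proper: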
*/ static void lguest_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) { write_dt_entry(dt, entrynum, low, high); hcall(LHCALL_LOAD_GDT, paddr(dt), GDT_ENTRIES, 0); } /*:*/ /*G:038 That's enough excitement for now, back to ploughing through each of * the paravirt_ops (we're about 1/3 of the way through). * * This is the Local Descriptor Table, another weird Intel thingy. Linux only * uses this for some strange applications like Wine. We don't do anything * here, so they'll get an informative and friendly Segmentation Fault. */ static void lguest_set_ldt(const void *, unsigned) { dumpstack(); iprint("lguest_set_ldt: ignored\n"); } /* This loads a GDT entry into the "Task Register": that entry points to a * structure called the Task State Segment. Some comments scattered though the * kernel code indicate that this used for task switching in ages past, along * with blood sacrifice and astrology. * * Now there's nothing interesting in here that we don't get told elsewhere. * But the native version uses the "ltr" instruction, which makes the Host * complain to the Guest about a Segmentation Fault and it'll oops. So we * override the native version with a do-nothing version. */ void ltr(ulong) { } /* The "cpuid" instruction is a way of querying both the CPU identity * (manufacturer, model, etc) and its features. It was introduced before the * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you * might imagine, after a decade and a half this treatment, it is now a giant * ball of hair. Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry * has been translated into 4 languages. I am not making this up! * * We could get funky here and identify ourselves as "GenuineLguest", but * instead we just use the real "cpuid" instruction. Then I pretty much turned * off feature bits until the Guest booted. (Don't say that: you'll damage * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is * hardly future proof.) Noone's listening! They don't like you anyway, * parenthetic weirdo! * * Replacing the cpuid so we can turn features off is great for the kernel, but * anyone (including userspace) can just use the raw "cpuid" instruction and * the Host won't even notice since it isn't privileged. So we try not to get * too worked up about it. */ void lguest_cpuid(u32 *eax, u32 *ebx,u32 *ecx, u32 *edx) { int function = *eax; u32 data[4]; void native_cpuid(u32, u32 *); native_cpuid(*eax, data); *eax = data[0]; *ebx = data[1]; *ecx = data[2]; *edx = data[3]; switch (function) { case 1: /* Basic feature request. */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ *ecx &= 0x00002201; /* Similarly: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ *edx &= 0x07808101; /* The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls * flush_tlb_user() for both user and kernel mappings unless * the Page Global Enable (PGE) feature bit is set. */ *edx |= 0x00002000; break; case 0x80000000: /* Futureproof this a little: if they ask how much extended * processor information there is, limit it to known fields. */ if (*eax > 0x80000008) *eax = 0x80000008; break; } } /* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother * it. 
The Host needs to know when the Guest wants to change them, so we have * a whole series of functions like read_cr0() and write_cr0(). * * We start with CR0. CR0 allows you to turn on and off all kinds of basic * features, but Linux only really cares about one: the horrifically-named Task * Switched (TS) bit at bit 3 (ie. 8) * * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if * the floating point unit is used. Which allows us to restore FPU state * lazily after a task switch, and Linux uses that gratefully, but wouldn't a * name like "FPUTRAP bit" be a little less cryptic? * * We store cr0 (and cr3) locally, because the Host never changes it. The * Guest sometimes wants to read it and we'd prefer not to bother the Host * unnecessarily. */ static unsigned long current_cr0, current_cr3; void putcr0(unsigned long val) { /* 8 == TS bit. */ lazy_hcall(LHCALL_TS, val & 8, 0, 0); current_cr0 = val; } unsigned long getcr0(void) { return current_cr0; } /* Intel provided a special instruction to clear the TS bit for people too cool * to use write_cr0() to do it. This "clts" instruction is faster, because all * the vowels have been optimized out. */ void lguest_clts(void) { lazy_hcall(LHCALL_TS, 0, 0, 0); current_cr0 &= ~8U; } /* CR2 is the virtual address of the last page fault, which the Guest only ever * reads. The Host kindly writes this into our "struct lguest_data", so we * just read it out of there. */ unsigned long getcr2(void) { return lguest_data.cr2; } /* CR3 is the current toplevel pagetable page: the principle is the same as * cr0. Keep a local copy, and tell the Host when it changes. */ void putcr3(unsigned long cr3) { // iprint("putcr3 %#lx\n", cr3); // iprint("* of that is %#lx\n", *(unsigned long *)kaddr(cr3+0xc00)); /*lazy_*/hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0); // iprint("back\n"); current_cr3 = cr3; // iprint("* of that is %#lx\n", *(unsigned long *)kaddr(cr3+0xc00)); // iprint("return from putcr3\n"); } unsigned long getcr3(void) { return current_cr3; } /* CR4 is used to enable and disable PGE, but we don't care. */ unsigned long getcr4(void) { return 0; } void putcr4(unsigned long) { } /* * Page Table Handling. * * Now would be a good time to take a rest and grab a coffee or similarly * relaxing stimulant. The easy parts are behind us, and the trek gradually * winds uphill from here. * * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU * maps virtual addresses to physical addresses using "page tables". We could * use one huge index of 1 million entries: each address is 4 bytes, so that's * 1024 pages just to hold the page tables. But since most virtual addresses * are unused, we use a two level index which saves space. The CR3 register * contains the physical address of the top level "page directory" page, which * contains physical addresses of up to 1024 second-level pages. Each of these * second level pages contains up to 1024 physical addresses of actual pages, * or Page Table Entries (PTEs). * * Here's a diagram, where arrows indicate physical addresses: * * CR3 ---> +---------+ * | --------->+---------+ * | | | PADDR1 | * Top-level | | PADDR2 | * (PMD) page | | | * | | Lower-level | * | | (PTE) page | * | | | | * .... .... * * So to convert a virtual address to a physical address, we look up the top * level, which points us to the second level, which gives us the physical * address of that page. 
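 * In C, the lookup the CPU performs amounts to this (a sketch using the
 * PDX, PTX and PPN macros from mem.h, ignoring 4MB superpages and the
 * PTEVALID checks):
 *
 *	pde = ((ulong*)KADDR(getcr3()))[PDX(va)];	top level entry
 *	pte = ((ulong*)KADDR(PPN(pde)))[PTX(va)];	second level entry
 *	pa  = PPN(pte) | (va & (BY2PG-1));		physical address
 *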
If the top level entry was not present, or the second * level entry was not present, then the virtual address is invalid (we * say "the page was not mapped"). * * Put another way, a 32-bit virtual address is divided up like so: * * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>| * Index into top Index into second Offset within page * page directory page pagetable page * * The kernel spends a lot of time changing both the top-level page directory * and lower-level pagetable pages. The Guest doesn't know physical addresses, * so while it maintains these page tables exactly like normal, it also needs * to keep the Host informed whenever it makes a change: the Host will create * the real page tables based on the Guests'. */ /* The Guest calls this to set a second-level entry (pte), ie. to map a page * into a process' address space. We set the entry then tell the Host the * toplevel and address this corresponds to. The Guest uses one pagetable per * process, so we need to tell the Host which one we're changing (mm->pgd). */ void lguest_set_pte_at(ulong pdb, u32 addr, u32 *ptep, u32 pteval) { *ptep = pteval; lazy_hcall(LHCALL_SET_PTE, paddr((void *)pdb), addr, pteval&0xfff); } /* The Guest calls this to set a top-level entry. Again, we set the entry then * tell the Host which top-level page we changed, and the index of the entry we * changed. */ void lguest_set_pmd(u32 *pmdp, u32 pmdval) { *pmdp = pmdval; lazy_hcall(LHCALL_SET_PMD, PPN(paddr(pmdp)), (paddr(pmdp)&(BY2PG-1))/4, 0); } /* There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more. This is useless for us, since we don't * know which pagetable is changing or what address, so we just tell the Host * to forget all of them. Fortunately, this is very rare. * * ... except in early boot when the kernel sets up the initial pagetables, * which makes booting astonishingly slow. So we don't even tell the Host * anything changed until we've done the first page table switch. */ void lguest_set_pte(u32 *ptep, u32 pteval) { *ptep = pteval; /* Don't bother with hypercall before initial setup. */ if (current_cr3) lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); } /* Unfortunately for Lguest, the paravirt_ops for page tables were based on * native page table operations. On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). * * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only * called when a valid entry is written, not when it's removed (ie. marked not * present). Instead, this is where we come when the Guest wants to remove a * page table entry: we tell the Host to set that entry to 0 (ie. the present * bit is zero). */ void lguest_flush_tlb_single(u32 addr) { /* Simply set it to zero: if it was not, it will fault back in. */ lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0); } /* This is what happens after the Guest has removed a large number of entries. * This tells the Host that any of the page table entries for userspace might * have changed, ie. virtual addresses below PAGE_OFFSET. */ void lguest_flush_tlb_user(void) { lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0); } /* This is called when the kernel page tables have changed. 
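 * ("Kernel" here means mappings at or above KZERO; compare
 * lguest_flush_tlb_user() above, which only admits to changing mappings
 * below it. The difference is just the first hypercall argument: 1 flushes
 * everything, 0 lets the Host keep the kernel entries.)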
That's not very * common (unless the Guest is using highmem, which makes the Guest extremely * slow), so it's worth separating this from the user flushing above. */ void lguest_flush_tlb_kernel(void) { lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); } /* * The Unadvanced Programmable Interrupt Controller. * * This is an attempt to implement the simplest possible interrupt controller. * I spent some time looking though routines like set_irq_chip_and_handler, * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and * I *think* this is as simple as it gets. * * We can tell the Host what interrupts we want blocked ready for using the * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as * simple as setting a bit. We don't actually "ack" interrupts as such, we * just mask and unmask them. I wonder if we should be cleverer? */ void disable_lguest_irq(unsigned int irq) { /* probably some asm goo. Fooey */ /* we may pull over the xen stuff for this at some point. */ // set_bit(irq, lguest_data.blocked_interrupts); lguest_data.blocked_interrupts[irq/32] |= 1 << (irq % 32); } void enable_lguest_irq(unsigned int irq) { // clear_bit(irq, lguest_data.blocked_interrupts); lguest_data.blocked_interrupts[irq/32] &= ~(1 << (irq % 32)); } #ifdef NOT /* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware * interrupt (except 128, which is used for system calls), and then tells the * Linux infrastructure that each interrupt is controlled by our level-based * lguest interrupt controller. */ void lguest_init_IRQ(void) { unsigned int i; for (i = 0; i < LGUEST_IRQS; i++) { int vector = FIRST_EXTERNAL_VECTOR + i; if (vector != SYSCALL_VECTOR) { set_intr_gate(vector, interrupt[i]); set_irq_chip_and_handler(i, &lguest_irq_controller, handle_level_irq); } } /* This call is required to set up for 4k stacks, where we have * separate stacks for hard and soft interrupts. */ irq_ctx_init(smp_processor_id()); } #endif /* now begins the fun with Plan 9 time. how many ticks again? */ /* we may need to look at the Xen port. This is a nightmare in Plan 9 */ /* * Time. * * It would be far better for everyone if the Guest had its own clock, but * until then it must ask the Host for the time. */ unsigned long lguest_get_wallclock(void) { return lguest_data.time.seconds; } /* get the nanoseconds */ /* let's pretend we don't get interupted here. */ uvlong lguest_get_ns(void) { uvlong ret = lguest_data.time.seconds; ret <<= 32; ret |= lguest_data.time.nanoseconds; return ret; } /* * Miscellaneous bits and pieces. * * Here is an oddball collection of functions which the Guest needs for things * to work. They're pretty simple. */ void lguest_interval_timer(u32 nanoseconds){ lazy_hcall(LHCALL_SET_CLOCKEVENT, nanoseconds, 0, 0); } /* The Guest needs to tell the host what stack it expects traps to use. For * native hardware, this is part of the Task State Segment mentioned above in * lguest_load_tr_desc(), but to help hypervisors there's this special call. * * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data * segment), the privilege level (we're privilege level 1, the Host is 0 and * will not tolerate us trying to use that), the stack pointer, and the number * of pages in the stack. */ void lguest_load_esp0(u32 stack) { //iprint("load stack %p\n", (void *)stack); lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); lazy_hcall(LHCALL_SET_STACK, (KDSEG<<3)|1, stack, KSTACK/BY2PG); } /* Let's just say, I wouldn't do debugging under a Guest. 
*/ void lguest_set_debugreg(int, unsigned long) { /* FIXME: Implement */ } /* There are times when the kernel wants to make sure that no memory writes are * caught in the cache (that they've all reached real hardware devices). This * doesn't matter for the Guest which has virtual hardware. * * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush * (clflush) instruction is available and the kernel uses that. Otherwise, it * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. * Unlike clflush, wbinvd can only be run at privilege level 0. So we can * ignore clflush, but replace wbinvd. */ void lguest_wbinvd(void) { } /* If the Guest expects to have an Advanced Programmable Interrupt Controller, * we play dumb by ignoring writes and returning 0 for reads. So it's no * longer Programmable nor Controlling anything, and I don't think 8 lines of * code qualifies for Advanced. It will also never interrupt anything. It * does, however, allow us to get through the Linux boot code. */ #ifdef CONFIG_X86_LOCAL_APIC void lguest_apic_write(unsigned long reg, unsigned long v) { } unsigned long lguest_apic_read(unsigned long reg) { return 0; } #endif /* STOP! Until an interrupt comes in. */ void halt(void) { hcall(LHCALL_HALT, 0, 0, 0); } /* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a * message out when we're crashing as well as elegant termination like powering * off. * * Note that the Host always prefers that the Guest speak in physical addresses * rather than virtual addresses, so we use paddr() here. */ void lguest_power_off(void) { hcall(LHCALL_CRASH, paddr("Power down"), 0, 0); } /* * Panicing. * * Don't. But if you did, this is what happens. */ int lguest_panic( void *p) { hcall(LHCALL_CRASH, paddr(p), 0, 0); return -1; } /* Setting up memory is fairly easy. */ /* The Linux bootloader header contains an "e820" memory map: the * Launcher populated the first entry with our memory limit. */ /* add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type);*/ /*G:030 Once we get to lguest_init(), we know we're a Guest. The paravirt_ops * structure in the kernel provides a single point for (almost) every routine * we have to override to avoid privileged instructions. */ /* FIXME: use the boot info passed in, sasuming it is good */ void lguest_init(void */* boot info -- use this someday */) { #ifdef NOT /* Copy boot parameters first: the Launcher put the physical location * in %esi, and head.S converted that to a virtual address and handed * it to us. */ memcpy(boot_params, boot, PARAM_SIZE); /* The boot parameters also tell us where the command-line is: save * that, too. */ memcpy(boot_command_line, __va(*(unsigned long *)(boot_params + NEW_CL_POINTER)), COMMAND_LINE_SIZE); #endif /* Now is a good time to look at the implementations of these functions * before returning to the rest of lguest_init(). :*/ /*G:070 Now we've seen all the paravirt_ops, we return to * lguest_init() where the rest of the fairly chaotic boot setup * occurs. * * The Host expects our first hypercall to tell it where our "struct * lguest_data" is, so we do that first. */ hcall(LHCALL_LGUEST_INIT, paddr(&lguest_data), 0, 0); /* The native boot code sets up initial page tables immediately after * the kernel itself, and sets init_pg_tables_end so they're not * clobbered. The Launcher places our initial pagetables somewhere at * the top of our physical memory, so we don't need extra space: set * init_pg_tables_end to the end of the kernel. 
*/ /* not on plan 9, sadly ... */ // init_pg_tables_end = paddr(pg0); /* Set up the Percpu Data Area (going away in 2.6.22: yay!) */ // load_gdt(&early_gdt_descr); // asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); /* The Host uses the top of the Guest's virtual address space for the * Host<->Guest Switcher, and it tells us how much it needs in * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ // reserve_top_address(lguest_data.reserve_mem); /* This is messy CPU setup stuff which the native boot code does before * start_kernel, so we have to do, too: */ // cpu_detect(&new_cpu_data); /* Need this before paging_init. */ // set_bit(X86_FEATURE_PGE, new_cpu_data.x86_capability); /* Math is always hard! */ // new_cpu_data.hard_math = 1; /* Turn off stuff which a Guest never has to deal with */ } /* * This marks the end of stage II of our journey, The Guest. * * It is now time for us to explore the nooks and crannies of the three Guest * devices and complete our understanding of the Guest in "make Drivers". */ lguest25/main.c 664 0 0 43470 11033021127 12456ustar00bootesbootes#include "u.h" #include "../port/lib.h" #include "mem.h" #include "dat.h" #include "fns.h" #include "io.h" #include "ureg.h" #include "init.h" #include "pool.h" #include "reboot.h" #include "lguest.h" #pragma profile 0 Mach *m; void lgconswrite(char *n, int len); /* lguest naughty bits. lguest data struct */ struct lguest_data lguest_data = { /* what the hell is the compiler doing here? Warning which makes no sense */ .noirq_start = (unsigned long) lguest_noirq_start, .noirq_end = (unsigned long) lguest_noirq_end, .blocked_interrupts = { 1 }, /* Block timer interrupts */ .syscall_vec = VectorSYSCALL, }; u32 *lguest_ptep = &lguest_data.pgdir; //struct lguest_device_desc *lguest_devices; /* * Where configuration info is left for the loaded programme. * This will turn into a structure as more is done by the boot loader * (e.g. why parse the .ini file twice?). * There are 3584 bytes available at CONFADDR. */ //#define BOOTLINE ((char*)CONFADDR) #define BOOTLINE ((char*)(KZERO + 4096)) #define BOOTLINELEN 64 #define BOOTARGS ((char*)(CONFADDR+BOOTLINELEN)) #define BOOTARGSLEN (4096) #define MAXCONF 64 char bootdisk[KNAMELEN]; Conf conf; char *confname[MAXCONF]; char *confval[MAXCONF]; int nconf; uchar *sp; /* user stack of init proc */ int delaylink; static char bootcopy[BOOTARGSLEN]; void dumphex(char *name, unsigned char *s, int len){ int i, j; unsigned char *b; iprint("%s: ", name); for(i = 0, b = s; i
* and Change ';' to \n */ p = cp; for(q = cp; *q; q++){ if(*q == '\r') continue; if(*q == ';') *q = '\n'; if(*q == '\t') *q = ' '; *p++ = *q; } *p = 0; n = getfields(cp, line, MAXCONF, 1, "\n"); for(i = 0; i < n; i++){ if(*line[i] == '#') continue; cp = strchr(line[i], '='); if(cp == nil) continue; *cp++ = '\0'; confname[nconf] = line[i]; confval[nconf] = cp; nconf++; } } extern void mmuinit0(void); extern void (*i8237alloc)(void); unsigned long x = 123456; extern Uart lguart; void main(ulong physboot) { void lgconsole(void); int i; void *boot = KADDR(physboot); // hcall(LHCALL_CRASH, paddr("time to die"), 0, 0); hcall(LHCALL_NOTIFY, paddr("hello squidboy"), 0, 0); for(i = 0; i < LHCALL_RING_SIZE; i++) lguest_data.hcall_status[i] = 0xff; if (x != 123456) hcall(LHCALL_CRASH, paddr("data is not aligned"), 0, 0); // hcall(LHCALL_CRASH, paddr("time to die"), 0, 0); lgconswrite("mach0init\n", 10); mach0init(); lgconswrite("options\n", 8); options(4096 + (char *)boot); lgconswrite("LGU\n", 4); lgconsole(); lgconswrite("qfmti\n", 6); quotefmtinstall(); lgconswrite("p9\n", 3); print(" bootargs %s\n", bootargs); print("physboot %#lx (%s)\n", physboot, kaddr(physboot+4096)); print("\nPlan 9\n"); // panic("test panic"); // hcall(LHCALL_CRASH, paddr("DIE NOW"), 0, 0); trapinit0(); iprint("trapinit0() ..."); mmuinit0(); iprint("mmuinit0()..."); cpuidentify(); iprint("cpuid ...\n"); meminit();iprint(" meminit();\n"); confinit();iprint(" confinit();\n"); archinit();iprint(" archinit();\n"); xinit();iprint(" xinit();\n"); trapinit();iprint(" trapinit();\n"); print("call printinit\n"); printinit();iprint(" printinit();\n"); cpuidprint();iprint(" cpuidprint();\n"); mmuinit();iprint(" mmuinit();\n"); iprint("arch->intrinit is %p\n", arch->intrinit); if(arch->intrinit) /* launches other processors on an mp */ arch->intrinit(); timersinit();iprint(" timersinit();\n"); mathinit();iprint(" mathinit();\n"); if(arch->clockenable) arch->clockenable(); procinit0();iprint(" procinit0();\n"); initseg();iprint(" initseg();\n"); if(delaylink){ bootlinks(); }else links(); conf.monitor = 1; chandevreset();iprint(" chandevreset();\n"); /* the serialoq is totally pointless and it causes a panic I don't understand */ /* the whole thing needs rework. */ serialoq = nil; pageinit();iprint(" pageinit();\n"); swapinit();iprint(" swapinit();\n"); // consintrenable(); iprint(" consintrenable\n"); kbdenable(); userinit();iprint(" userinit();\n"); active.thunderbirdsarego = 1;iprint(" active.thunderbirdsarego = 1;\n"); iprint("consuart %p lguart %p consuart->console %d\n", consuart, &lguart, consuart->console); schedinit();iprint(" schedinit();\n"); } void mach0init(void) { conf.nmach = 1; MACHP(0) = (Mach*)CPU0MACH; m->pdb = (ulong*)CPU0PDB; m->gdt = (Segdesc*)CPU0GDT; machinit(); active.machs = 1; active.exiting = 0; } void machinit(void) { int machno; ulong *pdb; Segdesc *gdt; lgconswrite("machinit\n", 9); machno = m->machno; pdb = m->pdb; gdt = m->gdt; memset(m, 0, sizeof(Mach)); m->machno = machno; m->pdb = pdb; m->gdt = gdt; m->perf.period = 1; lgconswrite("machinitb\n", 10); /* * For polled uart output at boot, need * a default delay constant. 100000 should * be enough for a while. Cpuidentify will * calculate the real value later. */ m->loopconst = 100000; } void init0(void) { int i; char buf[2*KNAMELEN]; up->nerrlab = 0; iprint("init0\n"); spllo(); /* * These are o.k. because rootinit is null. * Then early kproc's will have a root and dot. 
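 * While we're in init0: the ksetenv loop below is where the name=value
 * pairs that options() pulled out of the config finally surface. For
 * example (a made-up configuration line), console=0 is exported both as a
 * normal environment variable and as a kernel-config copy, while a name
 * starting with '*' skips the plain ksetenv and only gets the second,
 * kernel-config copy.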
*/ up->slash = namec("#/", Atodir, 0, 0); pathclose(up->slash->path); up->slash->path = newpath("/"); up->dot = cclone(up->slash); iprint("init0 before chandevinit\n"); chandevinit(); if(!waserror()){ snprint(buf, sizeof(buf), "%s %s", arch->id, conffile); ksetenv("terminal", buf, 0); ksetenv("cputype", "386", 0); if(cpuserver) ksetenv("service", "cpu", 0); else ksetenv("service", "terminal", 0); for(i = 0; i < nconf; i++){ if(confname[i][0] != '*') ksetenv(confname[i], confval[i], 0); ksetenv(confname[i], confval[i], 1); } poperror(); } kproc("alarm", alarmkproc, 0); iprint("touser\n"); touser(sp); } void userinit(void) { void *v; Proc *p; Segment *s; Page *pg; p = newproc(); iprint("newproc done\n"); p->pgrp = newpgrp(); iprint("newpgrp\n"); p->egrp = smalloc(sizeof(Egrp)); p->egrp->ref = 1; p->fgrp = dupfgrp(nil); p->rgrp = newrgrp(); p->procmode = 0640; iprint("kstrdup eve\n"); kstrdup(&eve, ""); kstrdup(&p->text, "*init*"); kstrdup(&p->user, eve); p->fpstate = FPinit; iprint("fpoff?\n"); fpoff(); iprint("fpoff done\n"); /* * Kernel Stack * * N.B. make sure there's enough space for syscall to check * for valid args and * 4 bytes for gotolabel's return PC */ p->sched.pc = (ulong)init0; p->sched.sp = (ulong)p->kstack+KSTACK-(sizeof(Sargs)+BY2WD); /* * User Stack * * N.B. cannot call newpage() with clear=1, because pc kmap * requires up != nil. use tmpmap instead. */ iprint("user satck\n"); s = newseg(SG_STACK, USTKTOP-USTKSIZE, USTKSIZE/BY2PG); p->seg[SSEG] = s; pg = newpage(0, 0, USTKTOP-BY2PG); iprint("tmpmap .. this will break right?\n"); v = tmpmap(pg); print("tmpmap, v is %p\n", v); memset(v, 0, BY2PG); segpage(s, pg); bootargs(v); tmpunmap(v); /* * Text */ s = newseg(SG_TEXT, UTZERO, 1); s->flushme++; p->seg[TSEG] = s; pg = newpage(0, 0, UTZERO); memset(pg->cachectl, PG_TXTFLUSH, sizeof(pg->cachectl)); segpage(s, pg); v = tmpmap(pg); memset(v, 0, BY2PG); memmove(v, initcode, sizeof initcode); tmpunmap(v); ready(p); } uchar * pusharg(char *p) { int n; n = strlen(p)+1; sp -= n; memmove(sp, p, n); return sp; } void bootargs(void *base) { int i, ac; uchar *av[32]; uchar **lsp; char *cp = BOOTLINE; char buf[64]; sp = (uchar*)base + BY2PG - MAXSYSARG*BY2WD; ac = 0; av[ac++] = pusharg("/386/9dos"); /* when boot is changed to only use rc, this code can go away */ cp[BOOTLINELEN-1] = 0; buf[0] = 0; if(strncmp(cp, "fd", 2) == 0){ sprint(buf, "local!#f/fd%lddisk", strtol(cp+2, 0, 0)); av[ac++] = pusharg(buf); } else if(strncmp(cp, "sd", 2) == 0){ sprint(buf, "local!#S/sd%c%c/fs", *(cp+2), *(cp+3)); av[ac++] = pusharg(buf); } else if(strncmp(cp, "ether", 5) == 0) av[ac++] = pusharg("-n"); /* 4 byte word align stack */ sp = (uchar*)((ulong)sp & ~3); /* build argc, argv on stack */ sp -= (ac+1)*sizeof(sp); lsp = (uchar**)sp; for(i = 0; i < ac; i++) *lsp++ = av[i] + ((USTKTOP - BY2PG) - (ulong)base); *lsp = 0; sp += (USTKTOP - BY2PG) - (ulong)base - sizeof(ulong); } char* getconf(char *name) { int i; for(i = 0; i < nconf; i++) if(cistrcmp(confname[i], name) == 0) return confval[i]; return 0; } static void writeconf(void) { char *p, *q; int n; p = getconfenv(); if(waserror()) { free(p); nexterror(); } /* convert to name=value\n format */ for(q=p; *q; q++) { q += strlen(q); *q = '='; q += strlen(q); *q = '\n'; } n = q - p + 1; if(n >= BOOTARGSLEN) error("kernel configuration too large"); memset(BOOTLINE, 0, BOOTLINELEN); memmove(BOOTARGS, p, n); poperror(); free(p); } void confinit(void) { char *p; int i, userpcnt; ulong kpages; if(p = getconf("*kernelpercent")) userpcnt = 100 - strtol(p, 0, 0); 
else userpcnt = 0; conf.npage = 0; for(i=0; i<nelem(conf.mem); i++) conf.npage += conf.mem[i].npage; conf.nproc = 100 + ((conf.npage*BY2PG)/MB)*5; if(cpuserver) conf.nproc *= 3; if(conf.nproc > 2000) conf.nproc = 2000; conf.nimage = 200; conf.nswap = conf.nproc*80; conf.nswppo = 4096; if(cpuserver) { if(userpcnt < 10) userpcnt = 70; kpages = conf.npage - (conf.npage*userpcnt)/100; /* * Hack for the big boys. Only good while physmem < 4GB. * Give the kernel fixed max + enough to allocate the * page pool. * This is an overestimate as conf.upages < conf.npages. * The patch of nimage is a band-aid, scanning the whole * page list in imagereclaim just takes too long. */ if(kpages > (64*MB + conf.npage*sizeof(Page))/BY2PG){ kpages = (64*MB + conf.npage*sizeof(Page))/BY2PG; conf.nimage = 2000; kpages += (conf.nproc*KSTACK)/BY2PG; } } else { if(userpcnt < 10) { if(conf.npage*BY2PG < 16*MB) userpcnt = 40; else userpcnt = 60; } kpages = conf.npage - (conf.npage*userpcnt)/100; /* * Make sure terminals with low memory get at least * 4MB on the first Image chunk allocation. */ if(conf.npage*BY2PG < 16*MB) imagmem->minarena = 4*1024*1024; } /* * can't go past the end of virtual memory * (ulong)-KZERO is 2^32 - KZERO */ if(kpages > ((ulong)-KZERO)/BY2PG) kpages = ((ulong)-KZERO)/BY2PG; conf.upages = conf.npage - kpages; conf.ialloc = (kpages/2)*BY2PG; /* * Guess how much is taken by the large permanent * data structures. Mntcache and Mntrpc are not accounted for * (probably ~300KB). */ kpages *= BY2PG; kpages -= conf.upages*sizeof(Page) + conf.nproc*sizeof(Proc) + conf.nimage*sizeof(Image) + conf.nswap + conf.nswppo*sizeof(Page); mainmem->maxsize = kpages; if(!cpuserver){ /* * give terminals lots of image memory, too; the dynamic * allocation will balance the load properly, hopefully. * be careful with 32-bit overflow. */ imagmem->maxsize = kpages; } } static char* mathmsg[] = { nil, /* handled below */ "denormalized operand", "division by zero", "numeric overflow", "numeric underflow", "precision loss", }; static void mathnote(void) { int i; ulong status; char *msg, note[ERRMAX]; status = up->fpsave.status; /* * Some attention should probably be paid here to the * exception masks and error summary. */ msg = "unknown exception"; for(i = 1; i <= 5; i++){ if(!((1<<i) & status)) continue; msg = mathmsg[i]; break; } snprint(note, sizeof note, "sys: fp: %s fppc=0x%lux status=0x%lux", msg, up->fpsave.pc, status); postnote(up, 1, note, NDebug); } /* * math coprocessor error */ static void matherror(Ureg *ur, void*) { /* * a write cycle to port 0xF0 clears the interrupt latch attached * to the error# line from the 387 */ if(!(m->cpuiddx & 0x01)) outb(0xF0, 0xFF); /* * save floating point state to check out error */ fpenv(&up->fpsave); mathnote(); if((ur->pc & 0xf0000000) == KZERO) panic("fp: status %ux fppc=0x%lux pc=0x%lux", up->fpsave.status, up->fpsave.pc, ur->pc); } /* * math coprocessor emulation fault */ static void mathemu(Ureg *ureg, void*) { if(up->fpstate & FPillegal){ /* someone did floating point in a note handler */ postnote(up, 1, "sys: floating point in note handler", NDebug); return; } switch(up->fpstate){ case FPinit: fpinit(); up->fpstate = FPactive; break; case FPinactive: /* * Before restoring the state, check for any pending * exceptions, there's no way to restore the state without * generating an unmasked exception. * More attention should probably be paid here to the * exception masks and error summary.
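 * To make the test below concrete: the low seven status bits are the
 * pending-exception flags, and a 1 bit in the control word means "masked",
 * so (status & ~control) & 0x07F is non-zero exactly when an exception is
 * pending that would fire the moment the state was restored. Eg. (a made-up
 * case) status 0x04 with divide-by-zero unmasked in control trips it.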
*/ if((up->fpsave.status & ~up->fpsave.control) & 0x07F){ mathnote(); break; } fprestore(&up->fpsave); up->fpstate = FPactive; break; case FPactive: panic("math emu pid %ld %s pc 0x%lux", up->pid, up->text, ureg->pc); break; } } /* * math coprocessor segment overrun */ static void mathover(Ureg*, void*) { pexit("math overrun", 0); } void mathinit(void) { trapenable(VectorCERR, matherror, 0, "matherror"); if(X86FAMILY(m->cpuidax) == 3) intrenable(IrqIRQ13, matherror, 0, BUSUNKNOWN, "matherror"); trapenable(VectorCNA, mathemu, 0, "mathemu"); trapenable(VectorCSO, mathover, 0, "mathover"); } /* * set up floating point for a new process */ void procsetup(Proc*p) { p->fpstate = FPinit; fpoff(); } void procrestore(Proc *p) { uvlong t; if(p->kp) return; cycles(&t); p->pcycles -= t; } /* * Save the mach dependent part of the process state. */ void procsave(Proc *p) { uvlong t; cycles(&t); p->pcycles += t; if(p->fpstate == FPactive){ if(p->state == Moribund) fpclear(); else{ /* * Fpsave() stores without handling pending * unmasked exeptions. Postnote() can't be called * here as sleep() already has up->rlock, so * the handling of pending exceptions is delayed * until the process runs again and generates an * emulation fault to activate the FPU. */ fpsave(&p->fpsave); } p->fpstate = FPinactive; } /* * While this processor is in the scheduler, the process could run * on another processor and exit, returning the page tables to * the free list where they could be reallocated and overwritten. * When this processor eventually has to get an entry from the * trashed page tables it will crash. * * If there's only one processor, this can't happen. * You might think it would be a win not to do this in that case, * especially on VMware, but it turns out not to matter. */ mmuflushtlb(PADDR(m->pdb)); } static void shutdown(int ispanic) { int ms, once; lock(&active); if(ispanic) active.ispanic = ispanic; else if(m->machno == 0 && (active.machs & (1<machno)) == 0) active.ispanic = 0; once = active.machs & (1<machno); active.machs &= ~(1<machno); active.exiting = 1; unlock(&active); if(once) iprint("cpu%d: exiting\n", m->machno); spllo(); for(ms = 5*1000; ms > 0; ms -= TK2MS(2)){ delay(TK2MS(2)); if(active.machs == 0 && consactive() == 0) break; } if(getconf("*debug")) delay(5*60*1000); if(active.ispanic){ if(!cpuserver) for(;;) halt(); delay(10000); }else delay(1000); } void reboot(void *entry, void *code, ulong size) { void (*f)(ulong, ulong, ulong); ulong *pdb; writeconf(); shutdown(0); /* * should be the only processor running now */ print("shutting down...\n"); delay(200); splhi(); /* turn off buffered serial console */ serialoq = nil; /* shutdown devices */ chandevshutdown(); /* * Modify the machine page table to directly map the low 4MB of memory * This allows the reboot code to turn off the page mapping */ pdb = m->pdb; pdb[PDX(0)] = pdb[PDX(KZERO)]; mmuflushtlb(PADDR(pdb)); /* setup reboot trampoline function */ f = (void*)REBOOTADDR; memmove(f, rebootcode, sizeof(rebootcode)); print("rebooting...\n"); /* off we go - never to return */ (*f)(PADDR(entry), PADDR(code), size); } void exit(int ispanic) { shutdown(ispanic); arch->reset(); } int isaconfig(char *class, int ctlrno, ISAConf *isa) { char cc[32], *p; int i; snprint(cc, sizeof cc, "%s%d", class, ctlrno); p = getconf(cc); if(p == nil) return 0; isa->type = ""; isa->nopt = tokenize(p, isa->opt, NISAOPT); for(i = 0; i < isa->nopt; i++){ p = isa->opt[i]; if(cistrncmp(p, "type=", 5) == 0) isa->type = p + 5; else if(cistrncmp(p, "port=", 5) == 0) isa->port = 
strtoul(p+5, &p, 0); else if(cistrncmp(p, "irq=", 4) == 0) isa->irq = strtoul(p+4, &p, 0); else if(cistrncmp(p, "dma=", 4) == 0) isa->dma = strtoul(p+4, &p, 0); else if(cistrncmp(p, "mem=", 4) == 0) isa->mem = strtoul(p+4, &p, 0); else if(cistrncmp(p, "size=", 5) == 0) isa->size = strtoul(p+5, &p, 0); else if(cistrncmp(p, "freq=", 5) == 0) isa->freq = strtoul(p+5, &p, 0); } return 1; } int cistrcmp(char *a, char *b) { int ac, bc; for(;;){ ac = *a++; bc = *b++; if(ac >= 'A' && ac <= 'Z') ac = 'a' + (ac - 'A'); if(bc >= 'A' && bc <= 'Z') bc = 'a' + (bc - 'A'); ac -= bc; if(ac) return ac; if(bc == 0) break; } return 0; } int cistrncmp(char *a, char *b, int n) { unsigned ac, bc; while(n > 0){ ac = *a++; bc = *b++; n--; if(ac >= 'A' && ac <= 'Z') ac = 'a' + (ac - 'A'); if(bc >= 'A' && bc <= 'Z') bc = 'a' + (bc - 'A'); ac -= bc; if(ac) return ac; if(bc == 0) break; } return 0; } /* * put the processor in the halt state if we've no processes to run. * an interrupt will get us going again. */ void idlehands(void) { if(conf.nmach == 1) halt(); } _profin() { } _profout() { } lguest25/mem.h 664 0 0 12532 11024566604 12326ustar00bootesbootes/* * Memory and machine-specific definitions. Used in C and assembler. */ /* * Sizes */ #define BI2BY 8 /* bits per byte */ #define BI2WD 32 /* bits per word */ #define BY2WD 4 /* bytes per word */ #define BY2V 8 /* bytes per double word */ #define BY2PG 4096 /* bytes per page */ #define WD2PG (BY2PG/BY2WD) /* words per page */ #define BY2XPG (4096*1024) /* bytes per big page */ #define PGSHIFT 12 /* log(BY2PG) */ #define ROUND(s, sz) (((s)+((sz)-1))&~((sz)-1)) #define PGROUND(s) ROUND(s, BY2PG) #define BLOCKALIGN 8 #define MAXMACH 8 /* max # cpus system can run */ #define KSTACK 4096 /* Size of kernel stack */ /* * Time */ #define HZ (100) /* clock frequency */ #define MS2HZ (1000/HZ) /* millisec per clock tick */ #define TK2SEC(t) ((t)/HZ) /* ticks to seconds */ /* * Address spaces */ /* let's try C, not F, to make lguest happier? 
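 * (Arithmetic check: with KZERO at 0xC0000000 the kernel's 1-1 window is
 * (ulong)-KZERO = 0x40000000 bytes, so the Guest can use up to 1GB of
 * physical memory; the old 0xF0000000 value would cap it at 256MB. This is
 * the same (ulong)-KZERO that confinit() uses to clamp kpages.)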
*/ #define KZERO 0xC0000000 /* base of kernel address space */ #define KTZERO (KZERO+0x100000) /* first address in kernel text - 9load sits below */ #define VPT (KZERO-VPTSIZE) #define VPTSIZE BY2XPG #define NVPT (VPTSIZE/BY2WD) #define KMAP (VPT-KMAPSIZE) #define KMAPSIZE BY2XPG #define VMAP (KMAP-VMAPSIZE) #define VMAPSIZE (0x10000000-VPTSIZE-KMAPSIZE) #define UZERO 0 /* base of user address space */ #define UTZERO (UZERO+BY2PG) /* first address in user text */ #define USTKTOP (VMAP-BY2PG) /* byte just beyond user stack */ #define USTKSIZE (16*1024*1024) /* size of user stack */ #define TSTKTOP (USTKTOP-USTKSIZE) /* end of new stack in sysexec */ #define TSTKSIZ 100 /* * Fundamental addresses - bottom 64kB saved for return to real mode */ #define CONFADDR (KZERO+0x1200) /* info passed from boot loader */ #define TMPADDR (KZERO+0x2000) /* used for temporary mappings */ #define APBOOTSTRAP (KZERO+0x3000) /* AP bootstrap code */ #define RMUADDR (KZERO+0x7C00) /* real mode Ureg */ #define RMCODE (KZERO+0x8000) /* copy of first page of KTEXT */ #define RMBUF (KZERO+0x9000) /* buffer for user space - known to vga */ #define IDTADDR (KZERO+0x10800) /* idt */ #define REBOOTADDR (0x11000) /* reboot code - physical address */ #define CPU0PDB (KZERO+0x12000) /* bootstrap processor PDB */ #define CPU0PTE (KZERO+0x13000) /* bootstrap processor PTE's for 0-4MB */ #define CPU0GDT (KZERO+0x14000) /* bootstrap processor GDT */ #define MACHADDR (KZERO+0x15000) /* as seen by current processor */ #define CPU0MACH (KZERO+0x16000) /* Mach for bootstrap processor */ /* HACK */ #undef CPU0MACH #define CPU0MACH (KZERO+0x15000) /* Mach for bootstrap processor */ #define MACHSIZE BY2PG #define CPU0PTE1 (KZERO+0x17000) /* bootstrap processor PTE's for 4MB-8MB */ #define CPU0END (CPU0PTE1+BY2PG) /* * N.B. ramscan knows that CPU0MACH+BY2PG is the end of reserved data * N.B. _startPADDR knows that CPU0PDB is the first reserved page * and that there are 5 of them. 
*/ /* * known x86 segments (in GDT) and their selectors */ #define NULLSEG 0 /* null segment */ #define KDSEG 13 /* kernel data/stack */ #define KESEG 12 /* kernel executable */ #define UDSEG 15 /* user data/stack */ #define UESEG 14 /* user executable */ #define TSSSEG 16 /* task segment */ #define APMCSEG 6 /* APM code segment */ #define APMCSEG16 7 /* APM 16-bit code segment */ #define APMDSEG 8 /* APM data segment */ #define KESEG16 9 /* kernel executable 16-bit */ #define NGDT 10 /* number of GDT entries required */ /* #define APM40SEG 8 /* APM segment 0x40 */ #define SELGDT (0<<2) /* selector is in gdt */ #define SELLDT (1<<2) /* selector is in ldt */ #define SELECTOR(i, t, p) (((i)<<3) | (t) | (p)) #define NULLSEL SELECTOR(NULLSEG, SELGDT, 0) #define KDSEL SELECTOR(KDSEG, SELGDT, 1) #define KESEL SELECTOR(KESEG, SELGDT, 1) #define UESEL SELECTOR(UESEG, SELGDT, 3) #define UDSEL SELECTOR(UDSEG, SELGDT, 3) #define TSSSEL SELECTOR(TSSSEG, SELGDT, 1) #define APMCSEL SELECTOR(APMCSEG, SELGDT, 1) #define APMCSEL16 SELECTOR(APMCSEG16, SELGDT, 1) #define APMDSEL SELECTOR(APMDSEG, SELGDT, 1) /* #define APM40SEL SELECTOR(APM40SEG, SELGDT, 1) */ /* * fields in segment descriptors */ #define SEGDATA (0x10<<8) /* data/stack segment */ #define SEGEXEC (0x18<<8) /* executable segment */ #define SEGTSS (0x9<<8) /* TSS segment */ #define SEGCG (0x0C<<8) /* call gate */ #define SEGIG (0x0E<<8) /* interrupt gate */ #define SEGTG (0x0F<<8) /* trap gate */ #define SEGTYPE (0x1F<<8) #define SEGP (1<<15) /* segment present */ #define SEGPL(x) ((x)<<13) /* priority level */ #define SEGB (1<<22) /* granularity 1==4k (for expand-down) */ #define SEGG (1<<23) /* granularity 1==4k (for other) */ #define SEGE (1<<10) /* expand down */ #define SEGW (1<<9) /* writable (for data/stack) */ #define SEGR (1<<9) /* readable (for code) */ #define SEGD (1<<22) /* default 1==32bit (for code) */ /* * virtual MMU */ #define PTEMAPMEM (1024*1024) #define PTEPERTAB (PTEMAPMEM/BY2PG) #define SEGMAPSIZE 1984 #define SSEGMAPSIZE 16 #define PPN(x) ((x)&~(BY2PG-1)) /* * physical MMU */ #define PTEVALID (1<<0) #define PTEWT (1<<3) #define PTEUNCACHED (1<<4) #define PTEWRITE (1<<1) #define PTERONLY (0<<1) #define PTEKERNEL (0<<2) #define PTEUSER (1<<2) #define PTESIZE (1<<7) #define PTEGLOBAL (1<<8) /* * Macros for calculating offsets within the page directory base * and page tables. */ #define PDX(va) ((((ulong)(va))>>22) & 0x03FF) #define PTX(va) ((((ulong)(va))>>12) & 0x03FF) #define getpgcolor(a) 0 lguest25/memory.c 664 0 0 32364 11024601564 13053ustar00bootesbootes/* * Size memory and create the kernel page-tables on the fly while doing so. * Called from main(), this code should only be run by the bootstrap processor. */ #include "u.h" #include "../port/lib.h" #include "mem.h" #include "dat.h" #include "fns.h" #include "io.h" #include "ureg.h" #include "lguest.h" #pragma profile 0 #define MEMDEBUG 1 enum { MemUPA = 0, /* unbacked physical address */ MemRAM = 1, /* physical memory */ MemUMB = 2, /* upper memory block (<16MB) */ MemReserved = 3, NMemType = 4, KB = 1024, MemMin = 8*MB, /* minimum physical memory (<=4MB) */ MemMax = (3*1024+768)*MB, /* maximum physical memory to check */ NMemBase = 10, }; typedef struct Map Map; struct Map { ulong size; ulong addr; }; typedef struct RMap RMap; struct RMap { char* name; Map* map; Map* mapend; Lock; }; /* * Memory allocation tracking.
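 * An RMap is just a sorted array of free (addr, size) runs behind a lock.
 * Code in this file takes and returns memory like this (a sketch;
 * mapalloc() yields 0 when nothing fits):
 *
 *	ulong pa = mapalloc(&rmapram, 0, 4*BY2PG, BY2PG);
 *	if(pa == 0)
 *		...no memory...
 *	...use KADDR(pa)...
 *	mapfree(&rmapram, pa, 4*BY2PG);
 *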
*/ static Map mapupa[16]; static RMap rmapupa = { "unallocated unbacked physical memory", mapupa, &mapupa[nelem(mapupa)-1], }; static Map xmapupa[16]; static RMap xrmapupa = { "unbacked physical memory", xmapupa, &xmapupa[nelem(xmapupa)-1], }; static Map mapram[16]; static RMap rmapram = { "physical memory", mapram, &mapram[nelem(mapram)-1], }; static Map mapumb[64]; static RMap rmapumb = { "upper memory block", mapumb, &mapumb[nelem(mapumb)-1], }; static Map mapumbrw[16]; static RMap rmapumbrw = { "UMB device memory", mapumbrw, &mapumbrw[nelem(mapumbrw)-1], }; /* the lguest device structs live at last page above memory. We will * map them in here. Jmk approves. */ struct lguest_device_desc *lgd; void mapprint(RMap *rmap) { Map *mp; print("%s\n", rmap->name); for(mp = rmap->map; mp->size; mp++) print("\t%8.8luX %8.8luX (%lud)\n", mp->addr, mp->addr+mp->size, mp->size); } void memdebug(void) { mapprint(&rmapram); mapprint(&rmapumb); mapprint(&rmapumbrw); mapprint(&rmapupa); } void mapfree(RMap* rmap, ulong addr, ulong size) { Map *mp; ulong t; if(size <= 0) return; lock(rmap); for(mp = rmap->map; mp->addr <= addr && mp->size; mp++) ; if(mp > rmap->map && (mp-1)->addr+(mp-1)->size == addr){ (mp-1)->size += size; if(addr+size == mp->addr){ (mp-1)->size += mp->size; while(mp->size){ mp++; (mp-1)->addr = mp->addr; (mp-1)->size = mp->size; } } } else{ if(addr+size == mp->addr && mp->size){ mp->addr -= size; mp->size += size; } else do{ if(mp >= rmap->mapend){ print("mapfree: %s: losing 0x%luX, %ld\n", rmap->name, addr, size); break; } t = mp->addr; mp->addr = addr; addr = t; t = mp->size; mp->size = size; mp++; }while(size = t); } unlock(rmap); } ulong mapalloc(RMap* rmap, ulong addr, int size, int align) { Map *mp; ulong maddr, oaddr; lock(rmap); for(mp = rmap->map; mp->size; mp++){ maddr = mp->addr; if(addr){ /* * A specific address range has been given: * if the current map entry is greater then * the address is not in the map; * if the current map entry does not overlap * the beginning of the requested range then * continue on to the next map entry; * if the current map entry does not entirely * contain the requested range then the range * is not in the map. */ if(maddr > addr) break; if(mp->size < addr - maddr) /* maddr+mp->size < addr, but no overflow */ continue; if(addr - maddr > mp->size - size) /* addr+size > maddr+mp->size, but no overflow */ break; maddr = addr; } if(align > 0) maddr = ((maddr+align-1)/align)*align; if(mp->addr+mp->size-maddr < size) continue; oaddr = mp->addr; mp->addr = maddr+size; mp->size -= maddr-oaddr+size; if(mp->size == 0){ do{ mp++; (mp-1)->addr = mp->addr; }while((mp-1)->size = mp->size); } unlock(rmap); if(oaddr != maddr) mapfree(rmap, oaddr, maddr-oaddr); return maddr; } unlock(rmap); return 0; } /* * Allocate from the ram map directly to make page tables. * Called by mmuwalk during e820scan. 
*/ void* rampage(void) { ulong m; m = mapalloc(&rmapram, 0, BY2PG, BY2PG); if(m == 0) return nil; return KADDR(m); } static void umbexclude(void) { int size; ulong addr; char *op, *p, *rptr; if((p = getconf("umbexclude")) == nil) return; while(p && *p != '\0' && *p != '\n'){ op = p; addr = strtoul(p, &rptr, 0); if(rptr == nil || rptr == p || *rptr != '-'){ print("umbexclude: invalid argument <%s>\n", op); break; } p = rptr+1; size = strtoul(p, &rptr, 0) - addr + 1; if(size <= 0){ print("umbexclude: bad range <%s>\n", op); break; } if(rptr != nil && *rptr == ',') *rptr++ = '\0'; p = rptr; mapalloc(&rmapumb, addr, size, 0); } } static void map(ulong base, ulong len, int type); static void lguestscan(void) { ulong flags, base; /* Linux standard is to have an entry, not a map, at 2d0 */ struct e820map *e820map = (struct e820map *)kaddr(E820MAP-4); ulong addr, size; /**/ iprint("map is at %p\n", e820map); iprint("# in map is %d\n", e820map->nr_map); iprint("addr is at %p, size is at %p\n", &e820map->map[0].addr, &e820map->map[0].size); /**/ addr = e820map->map[0].addr; size = e820map->map[0].size; if (! size) panic("memory size is 0"); iprint("map %p %p \n", (void *)addr, (void *)size); map(addr, size, MemRAM); /* now we have to map in the last page. This will contain * device info. We don't want it in any memory map, though. */ /* it really is writeable! */ flags = PTEWRITE|PTEVALID; base = size; lgd = (void *)(base + KZERO); iprint("pdbmap %p %p %p %p\n", (void *)(m->pdb), (void *) (base|flags), (void *)lgd, (void *)BY2PG); pdbmap(m->pdb, base|flags, (ulong) lgd, BY2PG); } static void umbscan(void) { uchar *p; /* * Scan the Upper Memory Blocks (0xA0000->0xF0000) for pieces * which aren't used; they can be used later for devices which * want to allocate some virtual address space. * Check for two things: * 1) device BIOS ROM. This should start with a two-byte header * of 0x55 0xAA, followed by a byte giving the size of the ROM * in 512-byte chunks. These ROM's must start on a 2KB boundary. * 2) device memory. This is read-write. * There are some assumptions: there's VGA memory at 0xA0000 and * the VGA BIOS ROM is at 0xC0000. Also, if there's no ROM signature * at 0xE0000 then the whole 64KB up to 0xF0000 is theoretically up * for grabs; check anyway. */ p = KADDR(0xD0000); while(p < (uchar*)KADDR(0xE0000)){ /* * Test for 0x55 0xAA before poking obtrusively, * some machines (e.g. Thinkpad X20) seem to map * something dynamic here (cardbus?) causing weird * problems if it is changed. */ if(p[0] == 0x55 && p[1] == 0xAA){ p += p[2]*512; continue; } p[0] = 0xCC; p[2*KB-1] = 0xCC; if(p[0] != 0xCC || p[2*KB-1] != 0xCC){ p[0] = 0x55; p[1] = 0xAA; p[2] = 4; if(p[0] == 0x55 && p[1] == 0xAA){ p += p[2]*512; continue; } if(p[0] == 0xFF && p[1] == 0xFF) mapfree(&rmapumb, PADDR(p), 2*KB); } else mapfree(&rmapumbrw, PADDR(p), 2*KB); p += 2*KB; } p = KADDR(0xE0000); if(p[0] != 0x55 || p[1] != 0xAA){ p[0] = 0xCC; p[64*KB-1] = 0xCC; if(p[0] != 0xCC && p[64*KB-1] != 0xCC) mapfree(&rmapumb, PADDR(p), 64*KB); } umbexclude(); } static void lowraminit(void) { ulong n, pa, x; uchar *bda; /* * Initialise the memory bank information for conventional memory * (i.e. less than 640KB). The base is the first location after the * bootstrap processor MMU information and the limit is obtained from * the BIOS data area. 
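 * (Specifically, the 16-bit word at 0x413 in the BIOS data area counts
 * usable base memory in KB, typically 640; the code below reads it as
 * (bda[0x14]<<8)|bda[0x13] and frees everything from the end of the
 * reserved MMU pages up to that line.)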
x = PADDR(CPU0MACH+BY2PG); bda = (uchar*)KADDR(0x400); n = ((bda[0x14]<<8)|bda[0x13])*KB-x; mapfree(&rmapram, x, n); memset(KADDR(x), 0, n); /* keep us honest */ x = PADDR(PGROUND((ulong)end)); pa = MemMin; mapfree(&rmapram, x, pa-x); memset(KADDR(x), 0, pa-x); /* keep us honest */ } static void ramscan(ulong) { } /* * BIOS Int 0x15 E820 memory map. */ enum { SMAP = ('S'<<24)|('M'<<16)|('A'<<8)|'P', Ememory = 1, Ereserved = 2, Carry = 1, }; typedef struct Emap Emap; struct Emap { uvlong base; uvlong len; ulong type; }; static Emap emap[16]; int nemap; static char *etypes[] = { "type=0", "memory", "reserved", "acpi reclaim", "acpi nvs", }; static int emapcmp(const void *va, const void *vb) { Emap *a, *b; a = (Emap*)va; b = (Emap*)vb; if(a->base < b->base) return -1; if(a->base > b->base) return 1; if(a->len < b->len) return -1; if(a->len > b->len) return 1; return a->type - b->type; } static void map(ulong base, ulong len, int type) { ulong e, n; ulong *table, flags, maxkpa; /* * Split any call crossing MemMin to make below simpler. */ if(base < MemMin && len > MemMin-base){ n = MemMin - base; map(base, n, type); map(MemMin, len-n, type); } /* * Let lowraminit and umbscan hash out the low 4MB. */ if(base < MemMin) return; /* * Any non-memory below 16*MB is used as upper mem blocks. */ if(type == MemUPA && base < 16*MB && base+len > 16*MB){ map(base, 16*MB-base, MemUMB); map(16*MB, len-(16*MB-base), MemUPA); return; } /* * Memory below CPU0END is reserved for the kernel * and already mapped. */ if(base < PADDR(CPU0END)){ n = PADDR(CPU0END) - base; if(len <= n) return; map(PADDR(CPU0END), len-n, type); return; } /* * Memory between KTZERO and end is the kernel itself * and is already mapped. */ if(base < PADDR(KTZERO) && base+len > PADDR(KTZERO)){ map(base, PADDR(KTZERO)-base, type); return; } if(PADDR(KTZERO) < base && base < PADDR(PGROUND((ulong)end))){ n = PADDR(PGROUND((ulong)end)); if(len <= n) return; map(PADDR(PGROUND((ulong)end)), len-n, type); return; } /* * Now we have a simple case. */ // print("map %.8lux %.8lux %d\n", base, base+len, type); switch(type){ case MemRAM: mapfree(&rmapram, base, len); flags = PTEWRITE|PTEVALID; break; case MemUMB: mapfree(&rmapumb, base, len); flags = PTEWRITE|PTEUNCACHED|PTEVALID; break; case MemUPA: mapfree(&rmapupa, base, len); flags = 0; break; default: case MemReserved: flags = 0; break; } /* * bottom 4MB is already mapped - just twiddle flags. * (not currently used - see above) */ if(base < 4*MB){ table = KADDR(PPN(m->pdb[PDX(base)])); e = base+len; base = PPN(base); for(; base<e; base += BY2PG) table[PTX(base)] |= flags; return; } if(flags){ maxkpa = -KZERO; if(base >= maxkpa) return; if(len > maxkpa-base) len = maxkpa - base; pdbmap(m->pdb, base|flags, base+KZERO, len); } } static int e820scan(void) { return 0; } void meminit(void) { int i; Map *mp; Confmem *cm; ulong lost; ulong pa, x, n; /* get the bits from end to MemMin -- we need them for pte pages * which are allocated before xalloc is working. * we had to add this back in 6/13/2008*/ /* * Initialise the memory bank information for memory below lguest */ x = PADDR(CPU0END); n = 0x100000-x; mapfree(&rmapram, x, n); memset(KADDR(x), 0, n); /* keep us honest */ x = PADDR(PGROUND((ulong)end)); pa = MemMin; if(x > pa) panic("kernel too big"); mapfree(&rmapram, x, pa-x); memset(KADDR(x), 0, pa-x); /* keep us honest */ /* life is easier now. Get the e820 map from lguest, and use it */ lguestscan(); /* * Set the conf entries describing banks of allocatable memory.
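 * (Each run left in rmapram becomes one Confmem bank: base from the map
 * entry's addr, npage from size/BY2PG; any banks beyond what conf.mem can
 * hold are added up in "lost".)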
*/ for(i=0; ibase = mp->addr; cm->npage = mp->size/BY2PG; //iprint("%d: base %p npage %d\n", i, cm->base, cm->npage); } lost = 0; for(; i$p$stem.gz # pcflop and pccd need all the space they can get 9pcflop.gz: 9pcflop strip -o /fd/1 9pcflop | gzip -9 >9pcflop.gz 9pccd.gz: 9pccd strip -o /fd/1 9pccd | gzip -9 >9pccd.gz install:V: $p$CONF $p$CONF.gz cp $p$CONF $p$CONF.gz /$objtype/ for(i in $EXTRACOPIES) import $i / /n/$i && cp $p$CONF $p$CONF.gz /n/$i/$objtype/ <../boot/bootmkfile <../port/portmkfile <|../port/mkbootrules $CONF <../pc/pcmkfile $ETHER: etherif.h ../port/netif.h ether8003.$O ether8390.$O: ether8390.h $VGA mouse.$O: screen.h devfloppy.$O: floppy.h archmp.$O mp.$O: apbootstrap.h apic.$O archmp.$O mp.$O: mp.h $SDEV: ../port/sd.h sd53c8xx.$O: sd53c8xx.i main.$O: init.h reboot.h wavelan.$O: wavelan.c ../pc/wavelan.c ../pc/wavelan.h etherwavelan.$O: etherwavelan.c ../pc/wavelan.h devusb.$O usbuhci.$O usbohci.$O: usb.h trap.$O: /sys/include/tos.h uartaxp.$O: uartaxp.i init.h: ../port/initcode.c ../pc/init9.c $CC ../port/initcode.c $CC ../pc/init9.c $LD -l -R1 -o init.out init9.$O initcode.$O /386/lib/libc.a strip init.out {echo 'uchar initcode[]={' cat init.out | xd -1x | sed -e 's/^[0-9a-f]+ //' -e 's/ ([0-9a-f][0-9a-f])/0x\1,/g' echo '};'} > init.h reboot.h: rebootcode.s $AS rebootcode.s $LD -l -s -T0x11000 -R4 -o reboot.out rebootcode.$O {echo 'uchar rebootcode[]={' xd -1x reboot.out | sed -e '1,2d' -e 's/^[0-9a-f]+ //' -e 's/ ([0-9a-f][0-9a-f])/0x\1,/g' echo '};'} > reboot.h apbootstrap.h: apbootstrap.s mem.h $AS $prereq $LD -o apbootstrap.out -T$APBOOTSTRAP -R4 -l -s apbootstrap.$O {echo 'uchar apbootstrap[]={' xd -1x apbootstrap.out | sed -e '1,2d' -e 's/^[0-9a-f]+ //' -e 's/ ([0-9a-f][0-9a-f])/0x\1,/g' echo '};'} > $target sd53c8xx.i: sd53c8xx.n aux/na $prereq > $target uartaxp.i: a100p.cp {echo 'static uchar uartaxpcp[] = {' xd -1x $prereq | sed -e 's/^[0-9a-f]+ //' -e '/^$/d' -e 's/ ([0-9a-f][0-9a-f])/0x\1,/g' echo '};' } > $target acid:V: 8c -a -w -I. i8253.c>acid %.checkether:VQ: for (i in ether*.c){ x=`{echo $i | sed 's/\.c//'} if(! ~ $x ether8390 && ! grep -s '^ '^$x^'([ ]|$)' $stem) echo $x not included in $stem } exit 0 %.checkvga:VQ: for (i in vga*.c){ x=`{echo $i | sed 's/\.c//'} if(! ~ $x vga vgax vgasavage && ! grep -s '^ '^$x^'([ ]|$)' $stem) echo $x not included in $stem } exit 0 checkdist:VQ: for(i in pcdisk pcflop) for(j in checkvga checkether) mk $i.$j %.clean:V: rm -f $stem.c [9bz]$stem [9bz]$stem.gz boot$stem.* reboot.h apbootstrap.h init.h # testing 9load:D: /usr/rsc/boot/$O.load 9pcload cat $prereq >$target 9load.flp: 9load disk/format -b /386/pbs -df $target $prereq $p$CONF.flp: /386/9load plan9.ini $p$CONF.gz disk/format -b /386/pbs -df $target $prereq , "reserved", "acpi reclaim", "acpi nvs", }; static int emapcmp(const void *va, const void *vb) { Emap *a, *b; a = (Emap*)va; b = (Emap*)vb; if(a->base < b->base) return -1; if(a->base > b->base) return 1; if(a->len < b->len) return -1; if(a->len > b->len) return 1; return a->type - b->type; } static void map(ulong base, ulong len, int type) { ulong e, n; ulong *table, flags, maxkpa; /* * Split any call crossing MemMin to lguest25/mmu.c 664 0 0 67637 11033167171 12355ustar00bootesbootes/* * Memory mappings. Life was easier when 2G of memory was enough. * * The kernel memory starts at KZERO, with the text loaded at KZERO+1M * (9load sits under 1M during the load). The memory from KZERO to the * top of memory is mapped 1-1 with physical memory, starting at physical * address 0. 
All kernel memory and data structures (i.e., the entries stored * into conf.mem) must sit in this physical range: if KZERO is at 0xF0000000, * then the kernel can only have 256MB of memory for itself. * * The 256M below KZERO comprises three parts. The lowest 4M is the * virtual page table, a virtual address representation of the current * page table tree. The second 4M is used for temporary per-process * mappings managed by kmap and kunmap. The remaining 248M is used * for global (shared by all procs and all processors) device memory * mappings and managed by vmap and vunmap. The total amount (256M) * could probably be reduced somewhat if desired. The largest device * mapping is that of the video card, and even though modern video cards * have embarrassing amounts of memory, the video drivers only use one * frame buffer worth (at most 16M). Each is described in more detail below. * * The VPT is a 4M frame constructed by inserting the pdb into itself. * This short-circuits one level of the page tables, with the result that * the contents of second-level page tables can be accessed at VPT. * We use the VPT to edit the page tables (see mmu) after inserting them * into the page directory. It is a convenient mechanism for mapping what * might be otherwise-inaccessible pages. The idea was borrowed from * the Exokernel. * * The VPT doesn't solve all our problems, because we still need to * prepare page directories before we can install them. For that, we * use tmpmap/tmpunmap, which map a single page at TMPADDR. */ #pragma profile 0 #include "u.h" #include "../port/lib.h" #include "mem.h" #include "dat.h" #include "fns.h" #include "io.h" #include "lguest.h" /* * Simple segment descriptors with no translation. */ #define DATASEGM(p) { 0xFFFF, SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(p)|SEGDATA|SEGW } #define EXECSEGM(p) { 0xFFFF, SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR } #define EXEC16SEGM(p) { 0xFFFF, SEGG|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR } #define TSSSEGM(b,p) { ((b)<<16)|sizeof(Tss),\ ((b)&0xFF000000)|(((b)>>16)&0xFF)|SEGTSS|SEGPL(p)|SEGP } /* this was NGDT but lguest demands GDT_ENTRIES */ Segdesc gdt[GDT_ENTRIES] = { [NULLSEG] { 0, 0}, /* null descriptor */ [KDSEG] DATASEGM(1), /* kernel data/stack */ [KESEG] EXECSEGM(1), /* kernel code */ [UDSEG] DATASEGM(3), /* user data/stack */ [UESEG] EXECSEGM(3), /* user code */ [TSSSEG] TSSSEGM(0,1), /* tss segment */ //[KESEG16] EXEC16SEGM(0), /* kernel code 16-bit */ }; static int didmmuinit; static void taskswitch(ulong, ulong); static void memglobal(void); #define vpt ((ulong*)VPT) #define VPTX(va) (((ulong)(va))>>12) #define vpd (vpt+VPTX(VPT)) void mmuinit0(void) { #ifdef NOT u32 *pcr3, *pte; int i; /* do the bits here that were supposed to have been done in assembly */ pcr3 = (u32 *) CPU0PDB; memset(pcr3, 0, CPU0GDT-CPU0PDB); /* do the PMDs */ pcr3[KZERO>>22] = paddr((void *)CPU0PTE) | PTEWRITE|PTEVALID; // iprint("Set %p to %#x\n", &pcr3[KZERO>>22], paddr((void *)CPU0PTE) | PTEWRITE|PTEVALID); pte = (u32 *) CPU0PTE; for(i = 0; i < 1024; i++){ // iprint("Set %p to %#x\n", &pte[i], i*4096 | PTEWRITE|PTEVALID); pte[i] = i*4096 | PTEWRITE|PTEVALID; } /* for(i = 0; i < 1024; i++) if (pcr3[i]) iprint("mmuinit0 pdb[%#x] is %#x\n", i, pcr3[i]);*/ /* don't do this just yet ... 
* / pte[(MACHADDR>>12)&0x3ff] = paddr((void *)CPU0MACH)|PTEWRITE|PTEVALID; /**/ /* for(i = 0; i < 1024; i++) if (pte[i]) iprint("mmuinit0 pte[%#x] is %#x\n", i, pte[i]);*/ #endif memmove(m->gdt, gdt, sizeof gdt); } void fixmach(){ unsigned long pcr3 = getcr3(); /* it's in the very first place */ u32 *cr3 = (u32 *) (KZERO + pcr3); u32 *pmd, *pte; pmd = (u32 *)cr3[PTX(CPU0MACH)]; pte = (u32 *)pmd[PTX(CPU0MACH)]; *pte = CPU0MACH|PTEVALID|PTEWRITE; hcall(LHCALL_SET_PTE,pcr3, MACHADDR|PTEVALID|PTEWRITE, CPU0MACH); } void mmuinit(void) { ulong x, *p; ushort ptr[3]; didmmuinit = 1; memglobal(); m->pdb[PDX(VPT)] = PADDR(m->pdb)|PTEWRITE|PTEVALID; m->tss = malloc(sizeof(Tss)); memset(m->tss, 0, sizeof(Tss)); m->tss->iomap = 0xDFFF<<16; /* * We used to keep the GDT in the Mach structure, but it * turns out that that slows down access to the rest of the * page. Since the Mach structure is accessed quite often, * it pays off anywhere from a factor of 1.25 to 2 on real * hardware to separate them (the AMDs are more sensitive * than Intels in this regard). Under VMware it pays off * a factor of about 10 to 100. */ memmove(m->gdt, gdt, sizeof gdt); x = (ulong)m->tss; m->gdt[TSSSEG].d0 = (x<<16)|sizeof(Tss); m->gdt[TSSSEG].d1 = (x&0xFF000000)|((x>>16)&0xFF)|SEGTSS|SEGPL(0)|SEGP; ptr[0] = sizeof(gdt)-1; x = (ulong)m->gdt; ptr[1] = x & 0xFFFF; ptr[2] = (x>>16) & 0xFFFF; lgdt(ptr); iprint("lgdt ...\n"); ptr[0] = sizeof(Segdesc)*256-1; x = IDTADDR; ptr[1] = x & 0xFFFF; ptr[2] = (x>>16) & 0xFFFF; lidt(ptr); iprint("lidt ...\n"); /* make kernel text unwritable */ /* This is mostly meaningless at this point but let's see if it works */ for(x = KTZERO; x < (ulong)etext; x += BY2PG){ p = mmuwalk(m->pdb, x, 2, 0); if(p == nil) panic("mmuinit"); *p &= ~PTEWRITE; /* make it writeable for probes */ //*p |= PTEWRITE; } iprint("write protected the kernel text\n"); iprint("First taskswitch... m->pdb is %p, PADDR(m->pdb) is %p\n", (void *)m->pdb, (void *) PADDR(m->pdb)); // iprint("Try to trash kernel text \n"); // *(unsigned char *)KTZERO = 0; taskswitch(PADDR(m->pdb), (ulong)m + BY2PG - 4); iprint(" ltr..."); ltr(TSSSEL); iprint("DONE mmuinit\n"); } /* * On processors that support it, we set the PTEGLOBAL bit in * page table and page directory entries that map kernel memory. * Doing this tells the processor not to bother flushing them * from the TLB when doing the TLB flush associated with a * context switch (write to CR3). Since kernel memory mappings * are never removed, this is safe. (If we ever remove kernel memory * mappings, we can do a full flush by turning off the PGE bit in CR4, * writing to CR3, and then turning the PGE bit back on.) * * See also mmukmap below. * * Processor support for the PTEGLOBAL bit is enabled in devarch.c. */ static void memglobal(void) { int i, j; ulong *pde, *pte; /* only need to do this once, on bootstrap processor */ if(m->machno != 0) return; iprint("m->havepge is %d\n", m->havepge); if(!m->havepge) return; pde = m->pdb; for(i=PDX(KZERO); i<1024; i++){ if(pde[i] & PTEVALID){ pde[i] |= PTEGLOBAL; if(!(pde[i] & PTESIZE)){ pte = KADDR(pde[i]&~(BY2PG-1)); for(j=0; j<1024; j++) if(pte[j] & PTEVALID) pte[j] |= PTEGLOBAL; } } } } /* * Flush all the user-space and device-mapping mmu info * for this process, because something has been deleted. * It will be paged back in on demand. */ void flushmmu(void) { int s; //iprint("flushmmu\n"); s = splhi(); up->newtlb = 1; mmuswitch(up); splx(s); //iprint("flushmmu done\n"); } /* * Flush a single page mapping from the tlb. 
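 */

/*
 * A small sketch (added; not in the original) of the recursive VPT
 * lookup described in the header comment: once pdb[PDX(VPT)] points at
 * the pdb itself, the pte for any va is one array reference away.
 * SKVPT is a stand-in for the real VPT constant from mem.h.
 */
#define SKVPT		0xF8000000UL	/* hypothetical VPT base */
#define SKVPTX(va)	(((unsigned long)(va))>>12)	/* one pte per 4K page */

static unsigned long*
sketchpte(unsigned long va)
{
	unsigned long *skvpt = (unsigned long*)SKVPT;

	/* pte for va; the pde for va is skvpd[va>>22], where
	 * skvpd = skvpt + SKVPTX(SKVPT): the directory maps itself */
	return &skvpt[SKVPTX(va)];
}

/*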
*/ void flushpg(ulong va) { lguest_flush_tlb_single(va); } /* * Allocate a new page for a page directory. * We keep a small cache of pre-initialized * page directories in each mach. */ static Page* mmupdballoc(void) { int s; Page *page; ulong *pdb; s = splhi(); m->pdballoc++; if(m->pdbpool == 0){ spllo(); page = newpage(0, 0, 0); page->va = (ulong)vpd; splhi(); pdb = tmpmap(page); memmove(pdb, m->pdb, BY2PG); pdb[PDX(VPT)] = page->pa|PTEWRITE|PTEVALID; /* set up VPT */ tmpunmap(pdb); }else{ page = m->pdbpool; m->pdbpool = page->next; m->pdbcnt--; } splx(s); return page; } static void mmupdbfree(Proc *proc, Page *p) { if(islo()) panic("mmupdbfree: islo"); m->pdbfree++; if(m->pdbcnt >= 10){ p->next = proc->mmufree; proc->mmufree = p; }else{ p->next = m->pdbpool; m->pdbpool = p; m->pdbcnt++; } } /* * A user-space memory segment has been deleted, or the * process is exiting. Clear all the pde entries for user-space * memory mappings and device mappings. Any entries that * are needed will be paged back in as necessary. */ static void mmuptefree(Proc* proc) { int s; ulong *pdb; Page **last, *page; if(proc->mmupdb == nil || proc->mmuused == nil) return; s = splhi(); pdb = tmpmap(proc->mmupdb); last = &proc->mmuused; for(page = *last; page; page = page->next){ pdb[page->daddr] = 0; last = &page->next; } tmpunmap(pdb); splx(s); *last = proc->mmufree; proc->mmufree = proc->mmuused; proc->mmuused = 0; } static void taskswitch(ulong pdb, ulong stack) { Tss *tss; tss = m->tss; tss->ss0 = KDSEL; tss->esp0 = stack; tss->ss1 = KDSEL; tss->esp1 = stack; tss->ss2 = KDSEL; tss->esp2 = stack; //iprint("taskswitch before load esp0 pdb %p stack %p \n", (void *) pdb, (void *)stack); lguest_load_esp0(stack); //iprint("ts after load esp0 ... now putcr3 %p\n", pdb); // iprint("* of that is %#lx\n", *(unsigned long *)kaddr(0x12000+0xc00)); putcr3(pdb); // iprint("* of that is %#lx\n", *(unsigned long *)kaddr(0x12000+0xc00)); //iprint("ts returens\n"); } void mmuswitch(Proc* proc) { ulong *pdb; if(proc->newtlb){ mmuptefree(proc); proc->newtlb = 0; } //iprint("mmuswitch to %p ...called from %#p\n", proc, (void *)getcallerpc(&proc)); // iprint("* of that is %#lx\n", *(unsigned long *)kaddr(0x12000+0xc00)); if(proc->mmupdb){ pdb = tmpmap(proc->mmupdb); pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)]; /* no need to do an HCALL as it will be caught by the taskswitch */ tmpunmap(pdb); taskswitch(proc->mmupdb->pa, (ulong)(proc->kstack+KSTACK)); //iprint("Use proc pdb\n"); }else taskswitch(PADDR(m->pdb), (ulong)(proc->kstack+KSTACK)); // iprint("* of that is %#lx\n", *(unsigned long *)kaddr(0x12000+0xc00)); //iprint("mmuswitch done return to %#p\n", (void *)getcallerpc(&proc)); } /* * Release any pages allocated for a page directory base or page-tables * for this process: * switch to the prototype pdb for this processor (m->pdb); * call mmuptefree() to place all pages used for page-tables (proc->mmuused) * onto the process' free list (proc->mmufree). This has the side-effect of * cleaning any user entries in the pdb (proc->mmupdb); * if there's a pdb put it in the cache of pre-initialised pdb's * for this processor (m->pdbpool) or on the process' free list; * finally, place any pages freed back into the free pool (palloc). * This routine is only called from schedinit() with palloc locked. 
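 */

/*
 * Sketch (added) of the bounded per-processor cache that mmupdballoc
 * and mmupdbfree above implement: frees go back into the local pool
 * until it holds 10 entries, then overflow onto the process free
 * list. The types here are stand-ins, not the real kernel ones.
 */
typedef struct SkPage SkPage;
struct SkPage {
	SkPage	*next;
};

enum { SkPoolmax = 10 };	/* same cap mmupdbfree uses */

static SkPage *skpool;
static int skpoolcnt;

static SkPage*
skcachedfree(SkPage *p, SkPage *overflow)
{
	if(skpoolcnt >= SkPoolmax){
		/* pool full: chain p onto the caller's own free list */
		p->next = overflow;
		return p;
	}
	p->next = skpool;
	skpool = p;
	skpoolcnt++;
	return overflow;
}

/*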
*/ void mmurelease(Proc* proc) { Page *page, *next; ulong *pdb; if(islo()) panic("mmurelease: islo"); taskswitch(PADDR(m->pdb), (ulong)m + BY2PG); if(proc->kmaptable){ if(proc->mmupdb == nil) panic("mmurelease: no mmupdb"); if(--proc->kmaptable->ref) panic("mmurelease: kmap ref %d\n", proc->kmaptable->ref); if(proc->nkmap) panic("mmurelease: nkmap %d\n", proc->nkmap); /* * remove kmaptable from pdb before putting pdb up for reuse. */ pdb = tmpmap(proc->mmupdb); if(PPN(pdb[PDX(KMAP)]) != proc->kmaptable->pa) panic("mmurelease: bad kmap pde %#.8lux kmap %#.8lux", pdb[PDX(KMAP)], proc->kmaptable->pa); pdb[PDX(KMAP)] = 0; tmpunmap(pdb); /* * move kmaptable to free list. */ pagechainhead(proc->kmaptable); proc->kmaptable = 0; } if(proc->mmupdb){ mmuptefree(proc); mmupdbfree(proc, proc->mmupdb); proc->mmupdb = 0; } for(page = proc->mmufree; page; page = next){ next = page->next; if(--page->ref) panic("mmurelease: page->ref %d\n", page->ref); pagechainhead(page); } if(proc->mmufree && palloc.r.p) wakeup(&palloc.r); proc->mmufree = 0; } /* * Allocate and install pdb for the current process. */ static void upallocpdb(void) { int s; ulong *pdb; Page *page; // iprint("upallocpdb: alread is %p\n", up->mmupdb); if(up->mmupdb != nil) return; page = mmupdballoc(); s = splhi(); if(up->mmupdb != nil){ /* * Perhaps we got an interrupt while * mmupdballoc was sleeping and that * interrupt allocated an mmupdb? * Seems unlikely. */ mmupdbfree(up, page); splx(s); return; } pdb = tmpmap(page); pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)]; tmpunmap(pdb); up->mmupdb = page; //iprint("@@@@@@@@Alloc'ed pdb (%p), so set cr3 to it\n", (void *)up->mmupdb->pa); putcr3(up->mmupdb->pa); splx(s); } /* * Update the mmu in response to a user fault. pa may have PTEWRITE set. */ void putmmu(ulong va, ulong pa, Page*) { int old, s; Page *page; int lguestpdb = 0; u32 *pdb = m->pdb; //iprint("putmmu, va %p, pa %p, pid %d, args %s\n", (void *)va, (void *)pa, //up->pid, up->args); /* simple test to see if we need to tell lguest. */ if(up->mmupdb && (getcr3() == up->mmupdb->pa)){ // iprint("putmmu: va %p pa %p tell LG\n", (void *)va, (void *)pa); lguestpdb = 1; } if(up->mmupdb == nil) upallocpdb(); /* * We should be able to get through this with interrupts * turned on (if we get interrupted we'll just pick up * where we left off) but we get many faults accessing * vpt[] near the end of this function, and they always happen * after the process has been switched out and then * switched back, usually many times in a row (perhaps * it cannot switch back successfully for some reason). * * In any event, I'm tired of searching for this bug. * Turn off interrupts during putmmu even though * we shouldn't need to. 
- rsc */ s = splhi(); if(!(vpd[PDX(va)]&PTEVALID)){ if(up->mmufree == 0){ spllo(); page = newpage(0, 0, 0); splhi(); } else{ page = up->mmufree; up->mmufree = page->next; } vpd[PDX(va)] = PPN(page->pa)|PTEUSER|PTEWRITE|PTEVALID; //iprint("Setting page at &vpd[PDX(va)] %p\n", vpd[PDX(va)]); if (lguestpdb) lguest_set_pte_at((ulong)pdb, VPT+PDX(va)*BY2PG, &vpd[PDX(va)], PPN(page->pa)|PTEUSER|PTEWRITE|PTEVALID); /* page is now mapped into the VPT - clear it */ memset((void*)(VPT+PDX(va)*BY2PG), 0, BY2PG); page->daddr = PDX(va); page->next = up->mmuused; up->mmuused = page; } old = vpt[VPTX(va)]; vpt[VPTX(va)] = pa|PTEUSER|PTEVALID; //iprint("old %#lx new %#lx\n", old, vpt[VPTX(va)]); if (lguestpdb) lguest_set_pte_at((ulong)pdb, va, &vpt[VPTX(va)], pa|PTEUSER|PTEVALID); if((old&PTEVALID) && lguestpdb) //{iprint("flushpg(%p)\n", va); flushpg((ulong) &vpt[VPTX(va)]); //} if(getcr3() != up->mmupdb->pa) print("bad cr3 %.8lux %.8lux\n", getcr3(), up->mmupdb->pa); splx(s); } /* * Double-check the user MMU. * Error checking only. */ void checkmmu(ulong va, ulong pa) { if(up->mmupdb == 0) return; if(!(vpd[PDX(va)]&PTEVALID) || !(vpt[VPTX(va)]&PTEVALID)) return; if(PPN(vpt[VPTX(va)]) != pa) print("%ld %s: va=0x%08lux pa=0x%08lux pte=0x%08lux\n", up->pid, up->text, va, pa, vpt[VPTX(va)]); } /* * Walk the page-table pointed to by pdb and return a pointer * to the entry for virtual address va at the requested level. * If the entry is invalid and create isn't requested then bail * out early. Otherwise, for the 2nd level walk, allocate a new * page-table page and register it in the 1st level. This is used * only to edit kernel mappings, which use pages from kernel memory, * so it's okay to use KADDR to look at the tables. */ ulong* mmuwalk(ulong* pdb, ulong va, int level, int create) { ulong *table; void *map; table = &pdb[PDX(va)]; if(!(*table & PTEVALID) && create == 0) return 0; switch(level){ default: return 0; case 1: return table; case 2: if(*table & PTESIZE) panic("mmuwalk2: va %luX entry %luX\n", va, *table); if(!(*table & PTEVALID)){ /* * Have to call low-level allocator from * memory.c if we haven't set up the xalloc * tables yet. */ if(didmmuinit) map = xspanalloc(BY2PG, BY2PG, 0); else map = rampage(); if(map == nil) panic("mmuwalk xspanalloc failed"); *table = PADDR(map)|PTEWRITE|PTEVALID; } table = KADDR(PPN(*table)); return &table[PTX(va)]; } } /* * Device mappings are shared by all procs and processors and * live in the virtual range VMAP to VMAP+VMAPSIZE. The master * copy of the mappings is stored in mach0->pdb, and they are * paged in from there as necessary by vmapsync during faults. */ static Lock vmaplock; static int findhole(ulong *a, int n, int count); static ulong vmapalloc(ulong size); static void pdbunmap(ulong*, ulong, int); /* * Add a device mapping to the vmap range. */ void* vmap(ulong pa, int size) { int osize; ulong o, va; //iprint("vmap %#lx %d\n", pa, size); /* * might be asking for less than a page. 
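 * (Added worked example, assuming 4K pages: a request for
 * pa=0xFEE00040, size=0x20 has o=0x40, so the code below maps the
 * whole page at pa=0xFEE00000 with size rounded up to 0x1000 and
 * returns va+0x40, preserving the caller's byte offset.)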
*/ osize = size; o = pa & (BY2PG-1); pa -= o; size += o; size = ROUND(size, BY2PG); if(pa == 0){ print("vmap pa=0 pc=%#.8lux\n", getcallerpc(&pa)); return nil; } ilock(&vmaplock); if((va = vmapalloc(size)) == 0 || pdbmap(MACHP(0)->pdb, pa|PTEUNCACHED|PTEWRITE, va, size) < 0){ iunlock(&vmaplock); return 0; } iunlock(&vmaplock); /* avoid trap on local processor for(i=0; i %#.8lux\n", pa+o, osize, va+o); return (void*)(va + o); } static int findhole(ulong *a, int n, int count) { int have, i; //iprint("findhole a %p, n %d count %d\n", a, n, count); have = 0; for(i=0; i= count) return i+1 - have; } return -1; } /* * Look for free space in the vmap. */ static ulong vmapalloc(ulong size) { int i, n, o; ulong *vpdb; int vpdbsize; vpdb = &MACHP(0)->pdb[PDX(VMAP)]; vpdbsize = VMAPSIZE/(4*MB); if(size >= 4*MB){ n = (size+4*MB-1) / (4*MB); if((o = findhole(vpdb, vpdbsize, n)) != -1) return VMAP + o*4*MB; return 0; } n = (size+BY2PG-1) / BY2PG; for(i=0; i VMAP+VMAPSIZE) panic("vunmap va=%#.8lux size=%#x pc=%#.8lux\n", va, size, getcallerpc(&va)); pdbunmap(MACHP(0)->pdb, va, size); /* * Flush mapping from all the tlbs and copied pdbs. * This can be (and is) slow, since it is called only rarely. */ for(i=0; istate == Dead) continue; if(p != up) p->newtlb = 1; } for(i=0; iflushmmu = 1; } flushmmu(); for(i=0; imachno)) && nm->flushmmu) ; } //iprint("vunmap"); } /* * Add kernel mappings for pa -> va for a section of size bytes. */ int pdbmap(ulong *pdb, ulong pa, ulong va, int size) { int pse; ulong pgsz, *pte, *table; ulong flag, off; flag = pa&0xFFF; pa &= ~0xFFF; if((MACHP(0)->cpuiddx & 0x08) && (getcr4() & 0x10)) pse = 1; else pse = 0; //iprint("pdbmap: pa %p, va %p, size %d, pse %d\n", (void *)pa, (void *)va, size, pse); for(off=0; off= 4MB and processor can do it. */ if(pse && (pa+off)%(4*MB) == 0 && (va+off)%(4*MB) == 0 && (size-off) >= 4*MB){ *table = (pa+off)|flag|PTESIZE|PTEVALID; lguest_set_pmd(table, (pa+off)|flag|PTESIZE|PTEVALID); pgsz = 4*MB; }else{ pte = mmuwalk(pdb, va+off, 2, 1); if(*pte&PTEVALID) panic("vmap: va=%#.8lux pa=%#.8lux pte=%#.8lux", va+off, pa+off, *pte); *pte = (pa+off)|flag|PTEVALID; lguest_set_pte_at((ulong) pdb, va+off, pte, (pa+off)|flag|PTEVALID); pgsz = BY2PG; } } return 0; } /* * Remove mappings. Must already exist, for sanity. * Only used for kernel mappings, so okay to use KADDR. */ static void pdbunmap(ulong *pdb, ulong va, int size) { ulong vae; ulong *table; int lguestpdb = (paddr(pdb) == getcr3()); vae = va+size; while(va < vae){ table = &pdb[PDX(va)]; if(!(*table & PTEVALID)){ panic("vunmap: not mapped"); /* va = (va+4*MB-1) & ~(4*MB-1); continue; */ } if(*table & PTESIZE){ if (lguestpdb) lguest_set_pte_at((ulong) pdb, VPT+PDX(va)*BY2PG, table, 0); else *table = 0; va = (va+4*MB-1) & ~(4*MB-1); continue; } table = KADDR(PPN(*table)); if(!(table[PTX(va)] & PTEVALID)) panic("vunmap: not mapped"); if (lguestpdb) lguest_set_pte_at((ulong)pdb, va, &table[PTX(va)], 0); else table[PTX(va)] = 0; va += BY2PG; } } /* * Handle a fault by bringing vmap up to date. * Only copy pdb entries and they never go away, * so no locking needed. */ int vmapsync(ulong va) { ulong entry, *table; if(va < VMAP || va >= VMAP+VMAPSIZE) return 0; entry = MACHP(0)->pdb[PDX(va)]; if(!(entry&PTEVALID)) return 0; if(!(entry&PTESIZE)){ /* make sure entry will help the fault */ table = KADDR(PPN(entry)); if(!(table[PTX(va)]&PTEVALID)) return 0; } vpd[PDX(va)] = entry; lguest_set_pte(&vpd[PDX(va)], entry); /* * TLB doesn't cache negative results, so no flush needed. 
*/ return 1; } /* * KMap is used to map individual pages into virtual memory. * It is rare to have more than a few KMaps at a time (in the * absence of interrupts, only two at a time are ever used, * but interrupts can stack). The mappings are local to a process, * so we can use the same range of virtual address space for * all processes without any coordination. */ #define kpt (vpt+VPTX(KMAP)) #define NKPT (KMAPSIZE/BY2PG) KMap* kmap(Page *page) { int i, o, s; if(up == nil) panic("kmap: up=0 pc=%#.8lux", getcallerpc(&page)); if(up->mmupdb == nil) upallocpdb(); if(up->nkmap < 0) panic("kmap %lud %s: nkmap=%d", up->pid, up->text, up->nkmap); /* * Splhi shouldn't be necessary here, but paranoia reigns. * See comment in putmmu above. */ s = splhi(); up->nkmap++; if(!(vpd[PDX(KMAP)]&PTEVALID)){ /* allocate page directory */ if(KMAPSIZE > BY2XPG) panic("bad kmapsize"); if(up->kmaptable != nil) panic("kmaptable"); spllo(); up->kmaptable = newpage(0, 0, 0); splhi(); vpd[PDX(KMAP)] = up->kmaptable->pa|PTEWRITE|PTEVALID; lguest_set_pte(&vpd[PDX(KMAP)], up->kmaptable->pa|PTEWRITE|PTEVALID); flushpg((ulong)kpt); memset(kpt, 0, BY2PG); kpt[0] = page->pa|PTEWRITE|PTEVALID; lguest_set_pte(&kpt[0], page->pa|PTEWRITE|PTEVALID); up->lastkmap = 0; splx(s); return (KMap*)KMAP; } if(up->kmaptable == nil) panic("no kmaptable"); o = up->lastkmap+1; for(i=0; ipa|PTEWRITE|PTEVALID; lguest_set_pte(&kpt[o], page->pa|PTEWRITE|PTEVALID); up->lastkmap = o; splx(s); return (KMap*)(KMAP+o*BY2PG); } } panic("out of kmap"); return nil; } void kunmap(KMap *k) { ulong va; va = (ulong)k; if(up->mmupdb == nil || !(vpd[PDX(KMAP)]&PTEVALID)) panic("kunmap: no kmaps"); if(va < KMAP || va >= KMAP+KMAPSIZE) panic("kunmap: bad address %#.8lux pc=%#.8lux", va, getcallerpc(&k)); if(!(vpt[VPTX(va)]&PTEVALID)) panic("kunmap: not mapped %#.8lux pc=%#.8lux", va, getcallerpc(&k)); up->nkmap--; if(up->nkmap < 0) panic("kunmap %lud %s: nkmap=%d", up->pid, up->text, up->nkmap); vpt[VPTX(va)] = 0; lguest_set_pte(&vpt[VPTX(va)], 0); flushpg(va); } /* * Temporary one-page mapping used to edit page directories. * * The fasttmp #define controls whether the code optimizes * the case where the page is already mapped in the physical * memory window. */ #define fasttmp 1 void* tmpmap(Page *p) { ulong i; ulong *entry; if(islo()) panic("tmpaddr: islo"); if(fasttmp && p->pa < -KZERO) return KADDR(p->pa); /* * PDX(TMPADDR) == PDX(MACHADDR), so this * entry is private to the processor and shared * between up->mmupdb (if any) and m->pdb. */ entry = &vpt[VPTX(TMPADDR)]; if(!(*entry&PTEVALID)){ for(i=KZERO; i<=CPU0MACH; i+=BY2PG) print("%.8lux: *%.8lux=%.8lux (vpt=%.8lux index=%.8lux)\n", i, &vpt[VPTX(i)], vpt[VPTX(i)], vpt, VPTX(i)); panic("tmpmap: no entry"); } if(PPN(*entry) != PPN(TMPADDR-KZERO)) panic("tmpmap: already mapped entry=%#.8lux", *entry); *entry = p->pa|PTEWRITE|PTEVALID; lguest_set_pte(entry, p->pa|PTEWRITE|PTEVALID); flushpg(TMPADDR); return (void*)TMPADDR; } void tmpunmap(void *v) { ulong *entry; if(islo()) panic("tmpaddr: islo"); if(fasttmp && (ulong)v >= KZERO && v != (void*)TMPADDR) return; if(v != (void*)TMPADDR) panic("tmpunmap: bad address"); entry = &vpt[VPTX(TMPADDR)]; if(!(*entry&PTEVALID) || PPN(*entry) == PPN(PADDR(TMPADDR))) panic("tmpmap: not mapped entry=%#.8lux", *entry); *entry = PPN(TMPADDR-KZERO)|PTEWRITE|PTEVALID; lguest_set_pte(entry, PPN(TMPADDR-KZERO)|PTEWRITE|PTEVALID); flushpg(TMPADDR); } /* * These could go back to being macros once the kernel is debugged, * but the extra checking is nice to have. 
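 */

/*
 * Sketch (added) of the next-fit slot scan kmap above performs over
 * its page-table page: start one past the last slot handed out and
 * wrap around. SkNKPT stands in for KMAPSIZE/BY2PG.
 */
enum { SkNKPT = 1024 };		/* hypothetical slot count */

static int
sketchkmapslot(unsigned long *kpt, int lastkmap)
{
	int i, o;

	for(i = 0; i < SkNKPT; i++){
		o = (lastkmap + 1 + i) % SkNKPT;
		if(kpt[o] == 0)
			return o;	/* caller installs the pte here */
	}
	return -1;			/* out of kmap */
}

/*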
*/ void* kaddr(ulong pa) { if(pa > (ulong)-KZERO) panic("kaddr: pa=%#.8lux", pa); return (void*)(pa+KZERO); } ulong paddr(void *v) { ulong va; va = (ulong)v; if(va < KZERO) panic("paddr: va=%#.8lux pc=%#.8lux", va, getcallerpc(&v)); return va-KZERO; } /* * More debugging. */ void countpagerefs(ulong *ref, int print) { int i, n; Mach *mm; Page *pg; Proc *p; n = 0; for(i=0; immupdb){ if(print){ if(ref[pagenumber(p->mmupdb)]) iprint("page %#.8lux is proc %d (pid %lud) pdb\n", p->mmupdb->pa, i, p->pid); continue; } if(ref[pagenumber(p->mmupdb)]++ == 0) n++; else iprint("page %#.8lux is proc %d (pid %lud) pdb but has other refs!\n", p->mmupdb->pa, i, p->pid); } if(p->kmaptable){ if(print){ if(ref[pagenumber(p->kmaptable)]) iprint("page %#.8lux is proc %d (pid %lud) kmaptable\n", p->kmaptable->pa, i, p->pid); continue; } if(ref[pagenumber(p->kmaptable)]++ == 0) n++; else iprint("page %#.8lux is proc %d (pid %lud) kmaptable but has other refs!\n", p->kmaptable->pa, i, p->pid); } for(pg=p->mmuused; pg; pg=pg->next){ if(print){ if(ref[pagenumber(pg)]) iprint("page %#.8lux is on proc %d (pid %lud) mmuused\n", pg->pa, i, p->pid); continue; } if(ref[pagenumber(pg)]++ == 0) n++; else iprint("page %#.8lux is on proc %d (pid %lud) mmuused but has other refs!\n", pg->pa, i, p->pid); } for(pg=p->mmufree; pg; pg=pg->next){ if(print){ if(ref[pagenumber(pg)]) iprint("page %#.8lux is on proc %d (pid %lud) mmufree\n", pg->pa, i, p->pid); continue; } if(ref[pagenumber(pg)]++ == 0) n++; else iprint("page %#.8lux is on proc %d (pid %lud) mmufree but has other refs!\n", pg->pa, i, p->pid); } } if(!print) iprint("%d pages in proc mmu\n", n); n = 0; for(i=0; ipdbpool; pg; pg=pg->next){ if(print){ if(ref[pagenumber(pg)]) iprint("page %#.8lux is in cpu%d pdbpool\n", pg->pa, i); continue; } if(ref[pagenumber(pg)]++ == 0) n++; else iprint("page %#.8lux is in cpu%d pdbpool but has other refs!\n", pg->pa, i); } } if(!print){ iprint("%d pages in mach pdbpools\n", n); for(i=0; ipdballoc, MACHP(i)->pdbfree); } } void checkfault(ulong, ulong) { } /* * Return the number of bytes that can be accessed via KADDR(pa). * If pa is not a valid argument to KADDR, return 0. */ ulong cankaddr(ulong pa) { if(pa >= -KZERO) return 0; return -KZERO - pa; } h the trouble. * not going to be very much contention. */ return 0; } /* * Remove a devicelguest25/pcmkfile 664 0 0 145 10774531325 13054ustar00bootesbootesPCFILES=`{../port/mkfilelist ../pc} ^($PCFILES)\.$O:R: '../pc/\1.c' $CC $CFLAGS -.I. ../pc/$stem1.c lguest25/plan9l.s 664 0 0 1655 11033024324 12732ustar00bootesbootes#include "mem.h" /* * This must match io.h. */ #define VectorSYSCALL 0x40 /* * Used to get to the first process: * set up an interrupt return frame and IRET to user level. */ TEXT touser(SB), $0 PUSHL $(UDSEL) /* old ss */ MOVL sp+0(FP), AX /* old sp */ PUSHL AX MOVL $0x200, lguest_data+0(SB) MOVL $0x200, AX /* interrupt enable flag */ PUSHL AX /* old flags */ PUSHL $(UESEL) /* old cs */ PUSHL $(UTZERO+32) /* old pc */ MOVL $(UDSEL), AX MOVW AX, DS MOVW AX, ES MOVW AX, GS MOVW AX, FS IRETL /* * This is merely _strayintr from l.s optimised to vector * to syscall() without going through trap(). 
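 * (Added note: the IRETL at the end pops pc, cs, flags, sp and ss in
 * that order, which is exactly the frame touser above pushes by hand
 * in reverse to fake an interrupt return into user mode.)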
*/ TEXT _syscallintr(SB), $0 PUSHL $VectorSYSCALL /* trap type */ PUSHL DS PUSHL ES PUSHL FS PUSHL GS PUSHAL MOVL $(KDSEL), AX MOVW AX, DS MOVW AX, ES PUSHL SP CALL syscall(SB) POPL AX POPAL POPL GS POPL FS POPL ES POPL DS ADDL $8, SP /* pop error code and trap type */ IRETL lguest25/ptclbsum386.s 664 0 0 2725 11022044637 13633ustar00bootesbootesTEXT ptclbsum(SB), $1 MOVL addr+0(FP), SI MOVL len+4(FP), CX XORL AX, AX /* sum */ TESTL $1, SI /* byte aligned? */ MOVL SI, DI JEQ _2align DECL CX JLT _return MOVB 0x00(SI), AH INCL SI _2align: TESTL $2, SI /* word aligned? */ JEQ _32loop CMPL CX, $2 /* less than 2 bytes? */ JLT _1dreg SUBL $2, CX XORL BX, BX MOVW 0x00(SI), BX ADDL BX, AX ADCL $0, AX LEAL 2(SI), SI _32loop: CMPL CX, $0x20 JLT _8loop MOVL CX, BP SHRL $5, BP ANDL $0x1F, CX _32loopx: MOVL 0x00(SI), BX MOVL 0x1C(SI), DX ADCL BX, AX MOVL 0x04(SI), BX ADCL DX, AX MOVL 0x10(SI), DX ADCL BX, AX MOVL 0x08(SI), BX ADCL DX, AX MOVL 0x14(SI), DX ADCL BX, AX MOVL 0x0C(SI), BX ADCL DX, AX MOVL 0x18(SI), DX ADCL BX, AX LEAL 0x20(SI), SI ADCL DX, AX DECL BP JNE _32loopx ADCL $0, AX _8loop: CMPL CX, $0x08 JLT _2loop MOVL CX, BP SHRL $3, BP ANDL $0x07, CX _8loopx: MOVL 0x00(SI), BX ADCL BX, AX MOVL 0x04(SI), DX ADCL DX, AX LEAL 0x08(SI), SI DECL BP JNE _8loopx ADCL $0, AX _2loop: CMPL CX, $0x02 JLT _1dreg MOVL CX, BP SHRL $1, BP ANDL $0x01, CX _2loopx: MOVWLZX 0x00(SI), BX ADCL BX, AX LEAL 0x02(SI), SI DECL BP JNE _2loopx ADCL $0, AX _1dreg: TESTL $1, CX /* 1 byte left? */ JEQ _fold XORL BX, BX MOVB 0x00(SI), BX ADDL BX, AX ADCL $0, AX _fold: MOVL AX, BX SHRL $16, BX JEQ _swab ANDL $0xFFFF, AX ADDL BX, AX JMP _fold _swab: TESTL $1, addr+0(FP) /*TESTL $1, DI*/ JNE _return XCHGB AH, AL _return: RET lguest25/rebootcode.s 664 0 0 1734 11022044616 13662ustar00bootesbootes#include "mem.h" /* * Turn off the MMU, then memmove the new kernel to its correct location * in physical memory. Then jump to the start of the kernel. */ TEXT main(SB),$1 MOVL p1+0(FP), DI /* destination */ MOVL DI, AX /* entry point */ MOVL p2+4(FP), SI /* source */ MOVL n+8(FP), CX /* byte count */ /* * disable paging */ MOVL CR0, DX ANDL $~0x80000000, DX /* ~(PG) */ MOVL DX, CR0 MOVL $0, DX MOVL DX, CR3 /* * the source and destination may overlap. * determine whether to copy forward or backwards */ CMPL SI, DI JGT _forward MOVL SI, DX ADDL CX, DX CMPL DX, DI JGT _back _forward: CLD REP; MOVSB JMP _startkernel _back: ADDL CX, DI ADDL CX, SI SUBL $1, DI SUBL $1, SI STD REP; MOVSB JMP _startkernel /* * JMP to kernel entry point. Note the true kernel entry point is * the virtual address KZERO|AX, but this must wait until * the MMU is enabled by the kernel in l.s */ _startkernel: ORL AX, AX /* NOP: avoid link bug */ JMP* AX lguest25/sdlg.c 644 0 0 14112 11022515155 12457ustar00bootesbootes/* * lguest block storage, derived from: * Xen block storage device frontend * * The present implementation follows the principle of * "what's the simplest thing that could possibly work?". * We can think about performance later. * We can think about dynamically attaching and removing devices later.
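 */

/*
 * Sketch (added) of the three-part request lgbio below hands to the
 * ring: a read-only header, the data pages, and a one-byte status the
 * host writes back. The names here are stand-ins for the structs
 * defined further down in this file.
 */
struct skblkreq {
	unsigned int type;		/* SDIN (read) or SDOUT (write) */
	unsigned int ioprio;
	unsigned long long sector;	/* 512-byte offset */
};

/*
 * v[0]/len[0]: &skblkreq, sizeof	host reads (out)
 * v[1]/len[1]: data, nbytes		host reads or writes
 * v[2]/len[2]: &status, 1 byte		host writes (in)
 * A write is queued with out=2,in=1; a read with out=1,in=2.
 */

/*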
*/ #pragma profile 0 #include "u.h" #include "../port/lib.h" #include "mem.h" #include "dat.h" #include "fns.h" #include "io.h" #include "ureg.h" #include "../port/error.h" #include "../port/sd.h" #include "lguest.h" extern SDifc sdlgifc; #define LOG(a) enum { Ndevs = 32, idstart = '0', BufPageSize = 16 * BY2PG, /* if we make MaxWriteSize large, then things go to hell. Not sure why. * but we very quickly corrupt the disk. I *think* that reads and writes * are interfering, but how knows? */ MaxWriteSize = 1 * BY2PG, MaxReadSize = 16 * BY2PG, /* features */ VIRTIO_BLK_F_BARRIER = 1, /* Does host support barriers? */ VIRTIO_BLK_F_SIZE_MAX = 2, /* Indicates maximum segment size */ VIRTIO_BLK_F_SEG_MAX = 4, /* Indicates maximum # of segments */ /* configuration information */ SDSize = 0, /* disk size */ SDMSS = 8, /* max segment size if VIRTIO_BLK_F_SIZE_MAX*/ SDMaxSet = 12, /* max # segments if VIRTIO_BLK_F_SEG_MAX */ /* which Queue is Which */ SDIN = 0, SDOUT, }; /* This is the first element of the read scatter-gather list. */ struct virtio_blk_outhdr { /* SDIN or SDOUT* */ u32 type; /* io priority. */ u32 ioprio; /* Sector (ie. 512 byte offset) */ uvlong sector; }; /* This is the first element of the write scatter-gather list */ struct virtio_blk_inhdr { unsigned char status; }; typedef struct Ctlr Ctlr; struct Ctlr { int online; /* we can't just copy it here, as there are state bits * which host and guest OS share */ int devno; /* at some point make this a linked list of qio. For now, synchronous */ ulong secsize; uvlong sectors; Lock dmalock; QLock iolock; int iodone; Rendez wiodone; }; /* */ static SDev* lgpnp(void) { void lgvdumpconfig(int devno); int findlgv(char *name); int lgvconfig(int devno, unsigned char *config, int off ,int len); SDev *sdev[Ndevs]; unsigned char idno = idstart; int i, lgno, devno; int numsd = 0; Ctlr *ctlr; char name[32]; /* just loop through finding #Z/block%d until no more */ //iprint("block dev lgpnp Ndevs %d\n", Ndevs); sprint(name, "block0"); for (i = 0, lgno = 0, devno = findlgv(name); devno > -1; devno = findlgv(name)) { numsd++; //iprint("Found block at %d\n", devno); sdev[i] = mallocz(sizeof(SDev), 1); sdev[i]->idno = idno; sdev[i]->nunit = 1; sdev[i]->ifc = &sdlgifc; sdev[i]->ctlr = (Ctlr*)mallocz(sizeof(Ctlr), 1); ctlr = sdev[i]->ctlr; ctlr->devno = devno; //iprint("ctlr->lbp is %p\n", ctlr->lbp); lgvdumpconfig(devno); /* get the size */ lgvconfig(devno, (unsigned char *)&ctlr->sectors, 0, sizeof(ctlr->sectors)); //iprint("sectors is %lld\n", ctlr->sectors); ctlr->secsize = 512; if (i > 0) sdev[i]->next = sdev[i-1]; idno++; lgno++; sprint(name, "block%d", lgno); } if (numsd > 0){ //iprint("block lgpnp returns %p\n", sdev[0]); return sdev[0]; } return nil; } static int lgverify(SDunit *unit) { if (unit->subno > unit->dev->nunit) return 0; unit->inquiry[0] = 0; // XXX how do we know if it's a CD? 
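	/*
	 * (Added note: these bytes follow the SCSI INQUIRY layout sd
	 * expects: byte 0 is the device type (0 = direct access),
	 * bytes 2 and 3 are version and response format, byte 4 is
	 * the additional length, and byte 8 on is the product string.)
	 */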
unit->inquiry[2] = 2; unit->inquiry[3] = 2; unit->inquiry[4] = sizeof(unit->inquiry)-4; strcpy((char*)&unit->inquiry[8], "Lguest block device"); return 1; } static int wiodone(void *a) { void *lgvgetbuf(int dev, int ring, int *plen); Ctlr *ctlr = (Ctlr *)a; int len; //iprint("wiodone ctlr %p dev %d\n", ctlr, ctlr->devno); lgvgetbuf(ctlr->devno, 0, &len); //iprint("WI%d", len); return ((Ctlr*)a)->iodone != 0; } static void sdlgintr(Ureg *, void *a) { Ctlr *ctlr = a; //iprint("SI"); ilock(&ctlr->dmalock); // XXX conservative ctlr->iodone = 1; iunlock(&ctlr->dmalock); if (ctlr->iodone) wakeup(&ctlr->wiodone); } static Ctlr *kickctlr; static void kickme(void) { Ctlr *ctlr = kickctlr; if (ctlr) { sdlgintr(0, ctlr); } } static int lgonline(SDunit *unit) { int lgvirq(int devno, int ring); Ctlr *ctlr; int irq; //iprint("lgonline return 1\n"); ctlr = unit->dev->ctlr; unit->sectors = ctlr->sectors; unit->secsize = ctlr->secsize; if (ctlr->online == 0) { irq = lgvirq(ctlr->devno, 0); print("devno %d ring %d irq %d\n", ctlr->devno, 0, irq+32); intrenable(irq+32, sdlgintr, ctlr, BUSUNKNOWN, "lgsd"); addclock0link(kickme, 10000); ctlr->online = 1; } return 1; } static int lgrio(SDreq*) { return -1; } static long lgbio(SDunit* unit, int, int write, void* data, long nb, uvlong bno) { int lgvaddbuf(int dev, int ring, void *v[], int len[], int out, int in, void *tag); void *v[3]; int len[3]; Ctlr *ctlr; long total = nb * unit->secsize, amt = 0; struct virtio_blk_outhdr hdr; struct virtio_blk_inhdr status; ctlr = unit->dev->ctlr; LOG(iprint("lgbio %c %lux %ld %ld total %d\n", write? 'w' : 'r', (ulong)data, nb, bno, total);) qlock(&ctlr->iolock); while(amt < total) { ctlr->iodone = 0; v[0] = &hdr; len[0] = sizeof(hdr); v[1] = data; len[1] = total; v[2] = &status; /* how amusing. gcc doesn't give the same answer as plan 9 for this */ len[2] = sizeof(status); len[2] = 1; if (write){ hdr.type = SDOUT; hdr.sector = bno; //iprint("addxmitbuf\n"); lgvaddbuf(ctlr->devno, 0, v, len, 2, 1, data); //iprint("added xmitbuf\n"); } else { hdr.type = SDIN; hdr.sector = bno; //iprint("addrecvfbuf\n"); lgvaddbuf(ctlr->devno, 0, v, len, 1, 2, data); //iprint("added recdvbuf\n"); } //iprint("sleep %p\n", ctlr); sleep(&ctlr->wiodone, wiodone, ctlr); if (ctlr->iodone < 0) break; //dumphex("L", ctlr->bufpage, len); amt += total; } qunlock(&ctlr->iolock); return total; } static void lgclear(SDev *) { } SDifc sdlgifc = { "lg", /* name */ lgpnp, /* pnp */ 0, /* legacy */ 0, /* enable */ 0, /* disable */ lgverify, /* verify */ lgonline, /* online */ lgrio, /* rio */ 0, /* rctl */ 0, /* wctl */ lgbio, /* bio */ 0, /* probe */ lgclear, /* clear */ 0, /* stat */ }; 664 0 0 2725 11022044637 13633ustar00bootesbooteslguest25/trap.c 664 0 0 57057 11035213606 12515ustar00bootesbootes#include "u.h" #include "tos.h" #include "../port/lib.h" #include "mem.h" #include "dat.h" #include "fns.h" #include "io.h" #include "ureg.h" #include "../port/error.h" #include "lguest.h" #include #pragma profile 0 static int trapinited; void noted(Ureg*, ulong); static void debugbpt(Ureg*, void*); static void fault386(Ureg*, void*); static void doublefault(Ureg*, void*); static void unexpected(Ureg*, void*); static void _dumpstack(Ureg*); static Lock vctllock; static Vctl *vctl[256]; enum { Ntimevec = 20 /* number of time buckets for each intr */ }; ulong intrtimes[256][Ntimevec]; void set_lguest_trap(int irq, void *a) { unsigned long lo, hi; /* A present interrupt gate. 
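 * (Added note: 0x8e00 is P=1, DPL=0, type 0xE, i.e. a 32-bit
 * interrupt gate. lo carries the code segment selector in its top
 * half and handler offset bits 0-15 in its bottom half; hi carries
 * offset bits 16-31 plus these flags, matching the x86 IDT entry
 * split that the LHCALL_LOAD_IDT_ENTRY hypercall below expects.)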
*/ unsigned int flags = 0x8e00; lo = (KESEG<<16) | (((unsigned long)a) & 0xffff); hi = (((unsigned long)a) & 0xffff0000) | flags; hcall(LHCALL_LOAD_IDT_ENTRY, irq, lo, hi); } void intrenable(int irq, void (*f)(Ureg*, void*), void* a, int tbdf, char *name) { int vno; Vctl *v; if(f == nil){ print("intrenable: nil handler for %d, tbdf 0x%uX for %s\n", irq, tbdf, name); return; } v = xalloc(sizeof(Vctl)); v->isintr = 1; v->irq = irq; v->tbdf = tbdf; v->f = f; v->a = a; strncpy(v->name, name, KNAMELEN-1); v->name[KNAMELEN-1] = 0; ilock(&vctllock); vno = arch->intrenable(v); if(vno == -1){ iunlock(&vctllock); print("intrenable: couldn't enable irq %d, tbdf 0x%uX for %s\n", irq, tbdf, v->name); xfree(v); return; } if(vctl[vno]){ if(vctl[vno]->isr != v->isr || vctl[vno]->eoi != v->eoi) panic("intrenable: handler: %s %s %luX %luX %luX %luX\n", vctl[vno]->name, v->name, vctl[vno]->isr, v->isr, vctl[vno]->eoi, v->eoi); v->next = vctl[vno]; } vctl[vno] = v; iunlock(&vctllock); } int intrdisable(int irq, void (*f)(Ureg *, void *), void *a, int tbdf, char *name) { Vctl **pv, *v; int vno; /* * For now, none of this will work with the APIC code, * there is no mapping between irq and vector as the IRQ * is pretty meaningless. */ if(arch->intrvecno == nil) return -1; vno = arch->intrvecno(irq); ilock(&vctllock); pv = &vctl[vno]; while (*pv && ((*pv)->irq != irq || (*pv)->tbdf != tbdf || (*pv)->f != f || (*pv)->a != a || strcmp((*pv)->name, name))) pv = &((*pv)->next); assert(*pv); v = *pv; *pv = (*pv)->next; /* Link out the entry */ if(vctl[vno] == nil && arch->intrdisable != nil) arch->intrdisable(irq); iunlock(&vctllock); xfree(v); return 0; } static long irqallocread(Chan*, void *vbuf, long n, vlong offset) { char *buf, *p, str[2*(11+1)+KNAMELEN+1+1]; int m, vno; long oldn; Vctl *v; if(n < 0 || offset < 0) error(Ebadarg); oldn = n; buf = vbuf; for(vno=0; vnonext){ m = snprint(str, sizeof str, "%11d %11d %.*s\n", vno, v->irq, KNAMELEN, v->name); if(m <= offset) /* if do not want this, skip entry */ offset -= m; else{ /* skip offset bytes */ m -= offset; p = str+offset; offset = 0; /* write at most max(n,m) bytes */ if(m > n) m = n; memmove(buf, p, m); n -= m; buf += m; if(n == 0) return oldn; } } } return oldn - n; } void trapenable(int vno, void (*f)(Ureg*, void*), void* a, char *name) { Vctl *v; if(vno < 0 || vno >= VectorPIC) panic("trapenable: vno %d\n", vno); v = xalloc(sizeof(Vctl)); v->tbdf = BUSUNKNOWN; v->f = f; v->a = a; strncpy(v->name, name, KNAMELEN); v->name[KNAMELEN-1] = 0; ilock(&vctllock); if(vctl[vno]) v->next = vctl[vno]->next; vctl[vno] = v; iunlock(&vctllock); } static void nmienable(void) { int x; /* * Hack: should be locked with NVRAM access. */ outb(0x70, 0x80); /* NMI latch clear */ outb(0x70, 0); x = inb(0x61) & 0x07; /* Enable NMI */ outb(0x61, 0x08|x); outb(0x61, x); } /* * Minimal trap setup. Just enough so that we can panic * on traps (bugs) during kernel initialization. * Called very early - malloc is not yet available. */ void trapinit0(void) { int d1, v; ulong vaddr; Segdesc *idt; idt = (Segdesc*)IDTADDR; vaddr = (ulong)vectortable; for(v = 0; v < 256; v++){ d1 = (vaddr & 0xFFFF0000)|SEGP; switch(v){ case VectorBPT: d1 |= SEGPL(3)|SEGIG; break; case VectorSYSCALL: d1 |= SEGPL(3)|SEGTG; break; default: d1 |= SEGPL(0)|SEGIG; break; } idt[v].d0 = (vaddr & 0xFFFF)|(KESEL<<16); idt[v].d1 = d1; vaddr += 6; /* now tell LG ... 
the local copy is only a shadow */ // iprint("Set vec %d to %p:%p\n", v, (void *) idt[v].d0, idt[v].d1); // hcall(LHCALL_LOAD_IDT_ENTRY, v, idt[v].d0, idt[v].d1); } } void trapinit(void) { /* * Special traps. * Syscall() is called directly without going through trap(). */ trapenable(VectorBPT, debugbpt, 0, "debugpt"); trapenable(VectorPF, fault386, 0, "fault386"); trapenable(Vector2F, doublefault, 0, "doublefault"); trapenable(Vector15, unexpected, 0, "unexpected"); nmienable(); addarchfile("irqalloc", 0444, irqallocread, nil); trapinited = 1; } static char* excname[32] = { "divide error", "debug exception", "nonmaskable interrupt", "breakpoint", "overflow", "bounds check", "invalid opcode", "coprocessor not available", "double fault", "coprocessor segment overrun", "invalid TSS", "segment not present", "stack exception", "general protection violation", "page fault", "15 (reserved)", "coprocessor error", "alignment check", "machine check", "19 (reserved)", "20 (reserved)", "21 (reserved)", "22 (reserved)", "23 (reserved)", "24 (reserved)", "25 (reserved)", "26 (reserved)", "27 (reserved)", "28 (reserved)", "29 (reserved)", "30 (reserved)", "31 (reserved)", }; /* * keep histogram of interrupt service times */ void intrtime(Mach*, int vno) { ulong diff; ulong x; x = perfticks(); diff = x - m->perf.intrts; m->perf.intrts = x; m->perf.inintr += diff; if(up == nil && m->perf.inidle > diff) m->perf.inidle -= diff; diff /= m->cpumhz*100; // quantum = 100µsec if(diff >= Ntimevec) diff = Ntimevec-1; intrtimes[vno][diff]++; } /* go to user space */ void kexit(Ureg*) { uvlong t; Tos *tos; /* precise time accounting, kernel exit */ tos = (Tos*)(USTKTOP-sizeof(Tos)); cycles(&t); tos->kcycles += t - up->kentry; tos->pcycles = up->pcycles; tos->pid = up->pid; } /* * All traps come here. It is slower to have all traps call trap() * rather than directly vectoring the handler. However, this avoids a * lot of code duplication and possible bugs. The only exception is * VectorSYSCALL. * Trap is called with interrupts disabled via interrupt-gates. */ void trap(Ureg* ureg) { int clockintr, i, vno, user; char buf[ERRMAX]; Vctl *ctl, *v; Mach *mach; void lgconswrite(char *n, int len); //lgconswrite("T",1); //lguest_panic("trap"); if(!trapinited){ /* fault386 can give a better error message */ if(ureg->trap == VectorPF) fault386(ureg, nil); panic("trap %lud: not ready", ureg->trap); } m->perf.intrts = perfticks(); user = (ureg->cs & 0xFFFF) == UESEL; if(user){ up->dbgreg = ureg; cycles(&up->kentry); } clockintr = 0; vno = ureg->trap; //iprint("Trap, vno %d\n", vno); if(ctl = vctl[vno]){ if(ctl->isintr){ m->intr++; if(vno >= VectorPIC && vno != VectorSYSCALL) m->lastintr = ctl->irq; } if(ctl->isr) ctl->isr(vno); for(v = ctl; v != nil; v = v->next){ if(v->f) v->f(ureg, v->a); //iprint("done clock\n"); } //iprint("handled it early\n"); if(ctl->eoi) ctl->eoi(vno); if(ctl->isintr){ intrtime(m, vno); if(ctl->irq == IrqCLOCK || ctl->irq == IrqTIMER) { //iprint("CLOCK"); clockintr = 1; } if(up && !clockintr) preempted(); } } else if(vno <= nelem(excname) && user){ spllo(); sprint(buf, "sys: trap: %s", excname[vno]); postnote(up, 1, buf, NDebug); } else if(vno >= VectorPIC && vno != VectorSYSCALL){ /* * An unknown interrupt. * Check for a default IRQ7. This can happen when * the IRQ input goes away before the acknowledge. * In this case, a 'default IRQ7' is generated, but * the corresponding bit in the ISR isn't set. * In fact, just ignore all such interrupts. 
*/ /* call all interrupt routines, just in case */ for(i = VectorPIC; i <= MaxIrqLAPIC; i++){ ctl = vctl[i]; if(ctl == nil) continue; if(!ctl->isintr) continue; for(v = ctl; v != nil; v = v->next){ if(v->f) v->f(ureg, v->a); } /* should we do this? */ if(ctl->eoi) ctl->eoi(i); } /* clear the interrupt */ /* not really needed on lguest?*/ if(0)print("cpu%d: spurious interrupt %d, last %d\n", m->machno, vno, m->lastintr); if(0)if(conf.nmach > 1){ for(i = 0; i < 32; i++){ if(!(active.machs & (1<machno == mach->machno) continue; print(" cpu%d: last %d", mach->machno, mach->lastintr); } print("\n"); } m->spuriousintr++; if(user) kexit(ureg); return; } else{ if(vno == VectorNMI){ nmienable(); if(m->machno != 0){ print("cpu%d: PC %8.8luX\n", m->machno, ureg->pc); for(;;); } } dumpregs(ureg); if(!user){ ureg->sp = (ulong)&ureg->sp; _dumpstack(ureg); } if(vno < nelem(excname)) panic("%s", excname[vno]); panic("unknown trap/intr: %d\n", vno); } //iprint("trap before splhi\n"); splhi(); /* delaysched set because we held a lock or because our quantum ended */ //iprint("up->delaysched %d clockintr %d\n", up->delaysched, clockintr); if(up && up->delaysched && clockintr){ sched(); splhi(); } //iprint("User %d proctl %d node %d\n", user, up->procctl, up->nnote); if(user){ if(up->procctl || up->nnote) notify(ureg); kexit(ureg); } // iprint("trap returns\n"); } /* * dump registers */ void dumpregs2(Ureg* ureg) { if(up) print("cpu%d: registers for %s %lud\n", m->machno, up->text, up->pid); else print("cpu%d: registers for kernel\n", m->machno); print("FLAGS=%luX TRAP=%luX ECODE=%luX PC=%luX", ureg->flags, ureg->trap, ureg->ecode, ureg->pc); print(" SS=%4.4luX USP=%luX\n", ureg->ss & 0xFFFF, ureg->usp); print(" AX %8.8luX BX %8.8luX CX %8.8luX DX %8.8luX\n", ureg->ax, ureg->bx, ureg->cx, ureg->dx); print(" SI %8.8luX DI %8.8luX BP %8.8luX\n", ureg->si, ureg->di, ureg->bp); print(" CS %4.4luX DS %4.4luX ES %4.4luX FS %4.4luX GS %4.4luX\n", ureg->cs & 0xFFFF, ureg->ds & 0xFFFF, ureg->es & 0xFFFF, ureg->fs & 0xFFFF, ureg->gs & 0xFFFF); panic("aw shit"); } void dumpregs(Ureg* ureg) { vlong mca, mct; dumpregs2(ureg); /* * Processor control registers. * If machine check exception, time stamp counter, page size extensions * or enhanced virtual 8086 mode extensions are supported, there is a * CR4. If there is a CR4 and machine check extensions, read the machine * check address and machine check type registers if RDMSR supported. */ print(" CR0 %8.8lux CR2 %8.8lux CR3 %8.8lux", getcr0(), getcr2(), getcr3()); if(m->cpuiddx & 0x9A){ print(" CR4 %8.8lux", getcr4()); if((m->cpuiddx & 0xA0) == 0xA0){ rdmsr(0x00, &mca); rdmsr(0x01, &mct); print("\n MCA %8.8llux MCT %8.8llux", mca, mct); } } print("\n ur %lux up %lux\n", ureg, up); } /* * Fill in enough of Ureg to get a stack trace, and call a function. * Used by debugging interface rdb. 
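 */

/*
 * Sketch (added) of the stack trick callwithureg below relies on: on
 * a 386 the caller's return PC sits one word below the first
 * argument, so taking the argument's address recovers both a pc and
 * an sp without any assembly. This mirrors how getcallerpc can be
 * written; the real one lives elsewhere in the kernel.
 */
#define sketchcallerpc(firstarg)	(((unsigned long*)(firstarg))[-1])
#define sketchcallersp(firstarg)	((unsigned long)(firstarg))

/*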
*/ void callwithureg(void (*fn)(Ureg*)) { Ureg ureg; ureg.pc = getcallerpc(&fn); ureg.sp = (ulong)&fn; fn(&ureg); } static void _dumpstack(Ureg *ureg) { ulong l, v, i, estack; extern ulong etext; int x; /* if(getconf("*nodumpstack")){ iprint("dumpstack disabled\n"); return; }*/ iprint("dumpstack\n"); x = 0; x += iprint("ktrace /kernel/path %.8lux %.8lux <pc, ureg->sp); i = 0; if(up && (ulong)&l >= (ulong)up->kstack && (ulong)&l <= (ulong)up->kstack+KSTACK) estack = (ulong)up->kstack+KSTACK; else if((ulong)&l >= (ulong)m->stack && (ulong)&l <= (ulong)m+BY2PG) estack = (ulong)m+MACHSIZE; else return; x += iprint("estackx %.8lux\n", estack); for(l=(ulong)&l; lpc--; sprint(buf, "sys: breakpoint"); postnote(up, 1, buf, NDebug); } static void doublefault(Ureg*, void*) { panic("double fault"); } static void unexpected(Ureg* ureg, void*) { print("unexpected trap %lud; ignoring\n", ureg->trap); } extern void checkpages(void); extern void checkfault(ulong, ulong); static void fault386(Ureg* ureg, void*) { ulong addr; int read, user, n, insyscall; char buf[ERRMAX]; addr = getcr2(); read = !(ureg->ecode & 2); user = (ureg->cs & 0xFFFF) == UESEL; if(!user){ if(vmapsync(addr)) return; if(addr >= USTKTOP) panic("kernel fault: bad address pc=0x%.8lux addr=0x%.8lux", ureg->pc, addr); if(up == nil) panic("kernel fault: no user process pc=0x%.8lux addr=0x%.8lux", ureg->pc, addr); } if(up == nil) panic("user fault: up=0 pc=0x%.8lux addr=0x%.8lux", ureg->pc, addr); insyscall = up->insyscall; up->insyscall = 1; n = fault(addr, read); if(n < 0){ if(!user){ dumpregs(ureg); panic("fault: 0x%lux\n", addr); } checkpages(); checkfault(addr, ureg->pc); sprint(buf, "sys: trap: fault %s addr=0x%lux", read ? "read" : "write", addr); postnote(up, 1, buf, NDebug); } up->insyscall = insyscall; } /* * system calls */ #include "../port/systab.h" struct f { struct f *next, *prev; int frompid, topid; int size; unsigned long dat[4]; }; struct f f = {.next = nil, .prev = nil, .size = 0}, temp; RWlock fl; Rendez fr; static void *findf(void *) { struct f *it; rlock(&fl); for(it = f.next; it; it = it->next) { //print("Check list pid %d against my pid %d\n", it->topid, up->pid); if (it->topid == up->pid) break; } runlock(&fl); //print("Find finds %p\n", it); return it; } static int isf(void *) { return (findf(nil) != nil); } long sysr1(ulong *arg) { unsigned long wsize, rsize, topid; Ureg *ureg = (Ureg *)arg; struct f *newf, *foundf; wsize = ureg->si&0xff; rsize = (ureg->si>>8)&0xff; topid = ureg->di; //print("sysr1: topid %d, rsize %d, wsize %d\n", topid, rsize, wsize); if (wsize) { newf = mallocz(sizeof(*newf), 1); newf->size = wsize; newf->topid = topid; newf->frompid = up->pid; newf->dat[0] = ureg->bp; newf->dat[1] = ureg->bx; newf->dat[2] = ureg->cx; newf->dat[3] = ureg->dx; wlock(&fl); if (f.next) f.next->prev = newf; newf->next = f.next; f.next = newf; newf->prev = &f; wunlock(&fl); wakeup(&fr); /* hmm */ sched(); //print("sysr1: added %p\n", newf); } /* shaky. but not really. nobody but us will be dq'ing stuff for us. */ if (rsize){ for(foundf = findf(nil); ! 
foundf; foundf = findf(nil)) sleep(&fr, isf, nil); wlock(&fl); if (foundf->next) foundf->next->prev = foundf->prev; foundf->prev->next = foundf->next; wunlock(&fl); //print("sysr1: dq %p\n", foundf); /* fill out return */ ureg->si &= ~(0xff<<8); ureg->si |= foundf->size <<8; ureg->di = foundf->frompid; ureg->bp = foundf->dat[0]; ureg->bx = foundf->dat[1]; ureg->cx = foundf->dat[2]; ureg->dx = foundf->dat[3]; free(foundf); } return ureg->si; } /* * Syscall is called directly from assembler without going through trap(). */ void syscall(Ureg* ureg) { char *e; ulong sp; long ret; int i, s; ulong scallnr; //iprint("S"); if((ureg->cs & 0xFFFF) != UESEL) panic("syscall: cs 0x%4.4luX\n", ureg->cs); cycles(&up->kentry); m->syscall++; up->insyscall = 1; up->pc = ureg->pc; up->dbgreg = ureg; if(up->procctl == Proc_tracesyscall){ up->procctl = Proc_stopme; procctl(up); } scallnr = ureg->ax; up->scallnr = scallnr; if(scallnr == RFORK && up->fpstate == FPactive){ fpsave(&up->fpsave); up->fpstate = FPinactive; } spllo(); sp = ureg->usp; up->nerrlab = 0; ret = -1; if(!waserror()){ if(scallnr >= nsyscall || systab[scallnr] == 0){ pprint("bad sys call number %d pc %lux\n", scallnr, ureg->pc); postnote(up, 1, "sys: bad sys call", NDebug); error(Ebadarg); } if (scallnr != SYSR1) if(sp<(USTKTOP-BY2PG) || sp>(USTKTOP-sizeof(Sargs)-BY2WD)) validaddr(sp, sizeof(Sargs)+BY2WD, 0); up->s = *((Sargs*)(sp+BY2WD)); up->psstate = sysctab[scallnr]; if (scallnr != SYSR1) ret = systab[scallnr](up->s.args); else ret = sysr1((ulong *)ureg); poperror(); }else{ /* failure: save the error buffer for errstr */ e = up->syserrstr; up->syserrstr = up->errstr; up->errstr = e; if(0 && up->pid == 1) print("syscall %lud error %s\n", scallnr, up->syserrstr); } if(up->nerrlab){ print("bad errstack [%lud]: %d extra\n", scallnr, up->nerrlab); for(i = 0; i < NERR; i++) print("sp=%lux pc=%lux\n", up->errlab[i].sp, up->errlab[i].pc); panic("error stack"); } /* * Put return value in frame. On the x86 the syscall is * just another trap and the return value from syscall is * ignored. On other machines the return value is put into * the results register by caller of syscall. */ ureg->ax = ret; if(up->procctl == Proc_tracesyscall){ up->procctl = Proc_stopme; s = splhi(); procctl(up); splx(s); } up->insyscall = 0; up->psstate = 0; if(scallnr == NOTED) noted(ureg, *(ulong*)(sp+BY2WD)); if(scallnr!=RFORK && (up->procctl || up->nnote)){ splhi(); notify(ureg); } /* if we delayed sched because we held a lock, sched now */ if(up->delaysched) sched(); kexit(ureg); } /* * Call user, if necessary, with note. * Pass user the Ureg struct and the note on his stack. 
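 */

/*
 * (Added) Layout of the user stack notify() below builds, from high
 * addresses down, before vectoring to up->notify:
 *
 *	saved Ureg			(up->ureg points here)
 *	old up->ureg			(the word "under" the Ureg)
 *	note string, ERRMAX bytes
 *	arg 2: pointer to the string
 *	arg 1: the Ureg pointer
 *	arg 0: fake return pc (0)	<- new usp
 *
 * The handler thus runs as handler(ureg, msg) and is expected to
 * finish by calling noted(), since the return pc is 0.
 */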
*/ int notify(Ureg* ureg) { int l; ulong s, sp; Note *n; if(up->procctl) procctl(up); if(up->nnote == 0) return 0; if(up->fpstate == FPactive){ fpsave(&up->fpsave); up->fpstate = FPinactive; } up->fpstate |= FPillegal; s = spllo(); qlock(&up->debug); up->notepending = 0; n = &up->note[0]; if(strncmp(n->msg, "sys:", 4) == 0){ l = strlen(n->msg); if(l > ERRMAX-15) /* " pc=0x12345678\0" */ l = ERRMAX-15; sprint(n->msg+l, " pc=0x%.8lux", ureg->pc); } if(n->flag!=NUser && (up->notified || up->notify==0)){ if(n->flag == NDebug) pprint("suicide: %s\n", n->msg); qunlock(&up->debug); pexit(n->msg, n->flag!=NDebug); } if(up->notified){ qunlock(&up->debug); splhi(); return 0; } if(!up->notify){ qunlock(&up->debug); pexit(n->msg, n->flag!=NDebug); } sp = ureg->usp; sp -= sizeof(Ureg); if(!okaddr((ulong)up->notify, 1, 0) || !okaddr(sp-ERRMAX-4*BY2WD, sizeof(Ureg)+ERRMAX+4*BY2WD, 1)){ pprint("suicide: bad address in notify\n"); qunlock(&up->debug); pexit("Suicide", 0); } up->ureg = (void*)sp; memmove((Ureg*)sp, ureg, sizeof(Ureg)); *(Ureg**)(sp-BY2WD) = up->ureg; /* word under Ureg is old up->ureg */ up->ureg = (void*)sp; sp -= BY2WD+ERRMAX; memmove((char*)sp, up->note[0].msg, ERRMAX); sp -= 3*BY2WD; *(ulong*)(sp+2*BY2WD) = sp+3*BY2WD; /* arg 2 is string */ *(ulong*)(sp+1*BY2WD) = (ulong)up->ureg; /* arg 1 is ureg* */ *(ulong*)(sp+0*BY2WD) = 0; /* arg 0 is pc */ ureg->usp = sp; ureg->pc = (ulong)up->notify; up->notified = 1; up->nnote--; memmove(&up->lastnote, &up->note[0], sizeof(Note)); memmove(&up->note[0], &up->note[1], up->nnote*sizeof(Note)); qunlock(&up->debug); splx(s); return 1; } /* * Return user to state before notify() */ void noted(Ureg* ureg, ulong arg0) { Ureg *nureg; ulong oureg, sp; qlock(&up->debug); if(arg0!=NRSTR && !up->notified) { qunlock(&up->debug); pprint("call to noted() when not notified\n"); pexit("Suicide", 0); } up->notified = 0; nureg = up->ureg; /* pointer to user returned Ureg struct */ up->fpstate &= ~FPillegal; /* sanity clause */ oureg = (ulong)nureg; if(!okaddr((ulong)oureg-BY2WD, BY2WD+sizeof(Ureg), 0)){ pprint("bad ureg in noted or call to noted when not notified\n"); qunlock(&up->debug); pexit("Suicide", 0); } /* * Check the segment selectors are all valid, otherwise * a fault will be taken on attempting to return to the * user process. * Take care with the comparisons as different processor * generations push segment descriptors in different ways. 
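 * (Added note: the & 0xFFFF masks matter because a 32-bit push of a
 * segment register may leave junk in the top half of the saved word.
 * The 0xCD5 mask a few lines down is CF|PF|AF|ZF|SF|DF|OF, i.e. the
 * only eflags bits a user is allowed to hand back; IF, IOPL and the
 * rest stay as the kernel saved them.)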
*/ if((nureg->cs & 0xFFFF) != UESEL || (nureg->ss & 0xFFFF) != UDSEL || (nureg->ds & 0xFFFF) != UDSEL || (nureg->es & 0xFFFF) != UDSEL || (nureg->fs & 0xFFFF) != UDSEL || (nureg->gs & 0xFFFF) != UDSEL){ pprint("bad segment selector in noted\n"); qunlock(&up->debug); pexit("Suicide", 0); } /* don't let user change system flags */ nureg->flags = (ureg->flags & ~0xCD5) | (nureg->flags & 0xCD5); memmove(ureg, nureg, sizeof(Ureg)); switch(arg0){ case NCONT: case NRSTR: if(!okaddr(nureg->pc, 1, 0) || !okaddr(nureg->usp, BY2WD, 0)){ qunlock(&up->debug); pprint("suicide: trap in noted\n"); pexit("Suicide", 0); } up->ureg = (Ureg*)(*(ulong*)(oureg-BY2WD)); qunlock(&up->debug); break; case NSAVE: if(!okaddr(nureg->pc, BY2WD, 0) || !okaddr(nureg->usp, BY2WD, 0)){ qunlock(&up->debug); pprint("suicide: trap in noted\n"); pexit("Suicide", 0); } qunlock(&up->debug); sp = oureg-4*BY2WD-ERRMAX; splhi(); ureg->sp = sp; ((ulong*)sp)[1] = oureg; /* arg 1 0(FP) is ureg* */ ((ulong*)sp)[0] = 0; /* arg 0 is pc */ break; default: pprint("unknown noted arg 0x%lux\n", arg0); up->lastnote.flag = NDebug; /* fall through */ case NDFLT: if(up->lastnote.flag == NDebug){ qunlock(&up->debug); pprint("suicide: %s\n", up->lastnote.msg); } else qunlock(&up->debug); pexit(up->lastnote.msg, up->lastnote.flag!=NDebug); } } long execregs(ulong entry, ulong ssize, ulong nargs) { ulong *sp; Ureg *ureg; up->fpstate = FPinit; fpoff(); sp = (ulong*)(USTKTOP - ssize); *--sp = nargs; ureg = up->dbgreg; ureg->usp = (ulong)sp; ureg->pc = entry; return USTKTOP-sizeof(Tos); /* address of kernel/user shared data */ } /* * return the userpc the last exception happened at */ ulong userpc(void) { Ureg *ureg; ureg = (Ureg*)up->dbgreg; return ureg->pc; } /* This routine must save the values of registers the user is not permitted * to write from devproc and then restore the saved values before returning. */ void setregisters(Ureg* ureg, char* pureg, char* uva, int n) { ulong flags; ulong cs; ulong ss; flags = ureg->flags; cs = ureg->cs; ss = ureg->ss; memmove(pureg, uva, n); ureg->flags = (ureg->flags & 0x00FF) | (flags & 0xFF00); ureg->cs = cs; ureg->ss = ss; } static void linkproc(void) { spllo(); up->kpfun(up->kparg); pexit("kproc dying", 0); } void kprocchild(Proc* p, void (*func)(void*), void* arg) { /* * gotolabel() needs a word on the stack in * which to place the return PC used to jump * to linkproc(). 
p->sched.pc = (ulong)linkproc; p->sched.sp = (ulong)p->kstack+KSTACK-BY2WD; p->kpfun = func; p->kparg = arg; } void forkchild(Proc *p, Ureg *ureg) { Ureg *cureg; /* * Add 2*BY2WD to the stack to account for * - the return PC * - trap's argument (ur) */ p->sched.sp = (ulong)p->kstack+KSTACK-(sizeof(Ureg)+2*BY2WD); p->sched.pc = (ulong)forkret; cureg = (Ureg*)(p->sched.sp+2*BY2WD); memmove(cureg, ureg, sizeof(Ureg)); /* return value of syscall in child */ cureg->ax = 0; /* Things from bottom of syscall which were never executed */ p->psstate = 0; p->insyscall = 0; } /* Give enough context in the ureg to produce a kernel stack for * a sleeping process */ void setkernur(Ureg* ureg, Proc* p) { ureg->pc = p->sched.pc; ureg->sp = p->sched.sp+4; } ulong dbgpc(Proc *p) { Ureg *ureg; ureg = p->dbgreg; if(ureg == 0) return 0; return ureg->pc; } lguest25/uartlg.c 664 0 0 6177 11022045357 13024ustar00bootesbootes#include "u.h" #include "../port/lib.h" #include "mem.h" #include "dat.h" #include "fns.h" #include "../port/error.h" #include "lguest.h" #include #pragma profile 0 /* The Lguest uart. Pretty simple. And now kludgy. */ extern PhysUart lgphysuart; /*static */Uart lguart = { .regs = 0, .name = "lgCOM1", .freq = 1000000, .phys = &lgphysuart, .special= 0, .next = nil, }; extern int console; void lgconswrite(char *str, int len) { void lgvconsout(char *a, int len); if (console < 0) hcall(LHCALL_NOTIFY, paddr(str), 0, 0); if (console >-1){ lgvconsout(str, len); } } static char dmabuf[1024]; #ifdef NOT /* TODO: find the right place to put the dmabuf pointer! */ static void lgintr(Ureg *, void *arg){ int i; Uart *uart = arg; /* XXX should be p = uart->something->dmabuf or some such */ char *p = dmabuf; /* for(i = 0; i < consin.used_len; i++){ lgconswrite(p, 1); uartrecv(uart, *p++); }*/ } #endif static Uart* lgpnp(void) { return &lguart; } unsigned char *in = nil; unsigned char inbuf[1024]; int inlen = 0; void consinintr(void) { int lgvgetconsbuf(unsigned char *v, int len); int lgvaddconsbuf(unsigned char *v, int len); unsigned char *p = inbuf; // iprint("CONSININTR\n"); // iprint("consin %d\n", inlen); inlen = lgvgetconsbuf(inbuf, sizeof(inbuf)); // iprint("consin %d\n", inlen); for(; inlen > 0; inlen--){ lgconswrite((char *)p, 1); uartrecv(consuart, *p++); } inlen = lgvaddconsbuf(inbuf, sizeof(inbuf)); // iprint("consin %d\n", inlen); //iprint("EXIT CONSININTR"); } void lgenable(Uart *uart, int){ void lgvconsin(void *, int, char *, void *f); //(void *f)(void)); uart->console = 1; /* set up to receive data into our one single buffer.
* further lguest input will block until we resubmit this one */ lgvconsin(inbuf, sizeof(inbuf), "consinintr", consinintr); } static void lgdisable(Uart*) { } static long lgstatus(Uart*, void* buf, long n, long offset) { char *p; p = malloc(READSTR); snprint(p, READSTR,"one kickass uart\n"); n = readstr(offset, buf, n, p); free(p); return n; } static void lgputc(Uart *, int c) { void lgvconsout(char *a, int len); /* need to use static -- lguest is asynchronous to us */ static char cc[1]; cc[0] = c; lgconswrite(cc, 1); } static int lgnop(Uart *, int){ return 0; } static void lgnopv(Uart *, int){ } static int lgnogetcyet(Uart *){ iprint("NO GETC YET\n"); return 0; } PhysUart lgphysuart = { .name = "lguestuart", .pnp = lgpnp, .enable = lgenable, .disable = lgdisable, .kick = nil, .dobreak = nil, .baud = lgnop, .bits = lgnop, .stop = lgnop, .parity = lgnop, .modemctl = nil, .rts = nil, .dtr = nil, .status = lgstatus, .fifo = nil, .getc = lgnogetcyet, .putc = lgputc, }; void lgconsole(void) { int findlgv(char *name); Uart *uart; uart = &lguart; uartctl(uart, "b9600 l8 pn s1"); /* if(*cmd != '\0') uartctl(uart, cmd); */ consuart = uart; uart->console = 1; iprint("findlgv is %d\n", findlgv("cons")); /*if only. lgconswrite("lgpnp\n", 7); iprint("lgpnp, open cons\n"); c = namec("#Z/cons", Aopen, ORDWR, 0); iprint("lgpnp, c %p\n", c); if (! c) return; console = c; */ } * Pass user the Ureg struct and the note on his stack. */ int notify(Ureg* ureg) { int l; ulong s, sp; Note *n; if(up->procctl) procctl(up); if(up->nnote == 0) return 0; if(up->fpstate == FPactive){ fpsave(&up->fpsave); up->fpstate = FPinactive; } up->fpstate |= FPillegal; s = spllo(); qlock(&up->debug); up->notepending = 0; n = &up->note[0]; if(strncmp(n->lguest25/devlgv.c 664 0 0 40676 11022045246 13034ustar00bootesbootes#include "u.h" #include "../port/lib.h" #include "mem.h" #include "dat.h" #include "fns.h" #include "../port/error.h" #include "io.h" #include "lguest.h" #include "virtio_ring.h" #pragma profile 0 #ifdef ABIGCOMMENT /* so, how does this ring stuff line up anyways? At start time, there is an lguest device page, mapped "just after" memory, * i.e. at the page right after top of mem. This page contains an array of device descriptors. * They like this: /* The device type: console, network, disk etc. Type 0 terminates. */ u8 type; /* The number of virtqueues (first in config array) */ u8 num_vq; /* The number of bytes of feature bits. Multiply by 2: one for host * features and one for Guest acknowledgements. */ u8 feature_len; /* The number of bytes of the config array after virtqueues. */ u8 config_len; /* A status byte, written by the Guest. */ u8 status; u8 config[]; * That config thing is a variable-sized area, with fairly arbitrary layout. The layout by convention is devices, then feature_len, * then config_len. The devices area is num_vq * sizeof the ring descriptor, which is a 2-byte # pages, a 2-byte irq #, and a * 4-byte page frame number. You have to map in that page frame number for the # pages. This gets you access to the * actual virtio ring. Recall too that there are num_vq of these. * * Now we are down to it. What is this bunch of pages we mapped in? * I *think* it is a vring. Time to try to find out. Yes it is, from lguest launcher: /* Initialize the vring. */ vring_init(&vq->vring, num_descs, p, getpagesize()); * So the ring is basically set up for us. To do IO, we just use the add_buf stuff, and get_buf, and it ought to work. 
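 * (Added worked example of the layout just described: a descriptor
 * with num_vq=2, feature_len=1, config_len=8 has config[0..15] as two
 * 8-byte virtqueue records - u16 page count, u16 irq, u32 pfn each -
 * then config[16] host features, config[17] guest acks, and
 * config[18..25] as the device-specific bytes, e.g. the 64-bit
 * sector count sdlg.c reads at config offset 0.)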
#define START_USE(vq) \
	do { if ((vq)->in_use) panic("in_use = %i\n", (vq)->in_use); (vq)->in_use = __LINE__; mb(); } while(0)
#define END_USE(vq) \
	do { BUG_ON(!(vq)->in_use); (vq)->in_use = 0; mb(); } while(0)

/*
 * lguest virtio driver
 */
enum{
	Qdir = 0,
	Qlgv,
	Qmax = 32,
	QTNET = 1,
	QTBLOCK,
	QTCONS,
	FirstDev = 2,
};

struct lguest_device {
	Lock;
	int open;
	int devno;
	struct lguest_device_desc *lgd;
	unsigned char *features;
	unsigned char *config;
	int featurelen, configlen;
	/* we only use 2 max now, so I worrieth not */
	struct vqring ring[32];
};

struct devinfo {
	char *format;
	int unit;
};

struct devinfo devs[] = {
	[QTNET]		= {"net%d", 0},
	[QTBLOCK]	= {"block%d", 0},	/* there really should be only one */
	[QTCONS]	= {"cons", 0},
};

Dirtab lgvdir[Qmax] = {
	".",	{Qdir, 0, QTDIR},	0,	0555,
	"lgv",	{Qlgv, 0},		0,	0664,
};

struct lguest_device lgv[Qmax];
static int ndev = FirstDev;
int console = -1;

/* from the linux stuff: addbuf and getbuf */
/* clean this up as it is GPL! */
int
vring_add_buf(struct vqring *vqring, void *v[], int *len,
	unsigned int out, unsigned int in, void *data)
{
	unsigned int i, avail, head, index = 0;
	int prev = 0;

	if (data == nil)
		panic("vring_add_buf: data is nil");
	if (out + in > vqring->vring.num)
		panic("vring_add_buf: out %d + in %d is > vq->vring.num %d",
			out, in, vqring->vring.num);
	if (out + in == 0)
		panic("vring_add_buf: out %d + in %d == 0", out, in);

	if (vqring->num_free < out + in) {
		iprint("Can't add buf len %ud - avail = %ud\n",
			out + in, vqring->num_free);
		/* We notify *even if* VRING_USED_F_NO_NOTIFY is set here. */
		//vq->notify(&vq->vq);
		return 0;
	}

	/* We're about to use some buffers from the free list. */
	vqring->num_free -= out + in;

	head = vqring->free_head;
	for (i = vqring->free_head; out; i = vqring->vring.desc[i].next, out--) {
		vqring->vring.desc[i].flags = VRING_DESC_F_NEXT;
		vqring->vring.desc[i].addr = paddr(v[index]);
		vqring->vring.desc[i].len = len[index];
		prev = i;
		index++;
	}
	//iprint("after loop i is %d\n", i);
	for (; in; i = vqring->vring.desc[i].next, in--) {
		vqring->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
		vqring->vring.desc[i].addr = paddr(v[index]);
		vqring->vring.desc[i].len = len[index];
		prev = i;
		index++;
	}
	/* Last one doesn't continue. */
	vqring->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;

	/* Update free pointer */
	vqring->free_head = i;

	/* Set token. */
	vqring->data[head] = data;

	/* Put entry in available array (but don't update avail->idx until they
	 * do sync).  FIXME: avoid modulus here? */
	avail = (vqring->vring.avail->idx + vqring->num_added++) % vqring->vring.num;
	vqring->vring.avail->ring[avail] = head;

	return 0;
}
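/*
 * Illustrative sketch (not in the original file): the calling pattern for
 * the three ring primitives.  A producer chains one read-only ("out")
 * buffer, kicks the host, then polls for completion.  The function name
 * ringexample and the polling loop are inventions for the example; real
 * callers (see lgvconsout below) follow the same add/kick/get shape.
 */
static void
ringexample(struct vqring *r)
{
	void vring_kick(struct vqring*);
	void *vring_get_buf(struct vqring*, unsigned int*);
	static char msg[] = "hello from the guest\n";
	void *v[1];
	int len[1];
	unsigned int donelen;

	v[0] = msg;
	len[0] = sizeof msg - 1;
	vring_add_buf(r, v, len, 1, 0, msg);	/* 1 out, 0 in; msg doubles as the token */
	vring_kick(r);				/* publish avail->idx and notify the host */
	while (vring_get_buf(r, &donelen) == nil)
		;				/* poll until the host marks it used */
}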
void
vring_kick(struct vqring *vqring)
{
	/* Descriptors and available array need to be set before we expose the
	 * new available array entries. */
	coherence();

	vqring->vring.avail->idx += vqring->num_added;
	vqring->num_added = 0;

	/* Need to update avail index before checking if we should notify */
	coherence();

	//iprint("vqring: num %d irq %d pages %p ppages %p\n", vqring->num, vqring->irq, vqring->pages, (void *)vqring->ppages);
	if (!(vqring->vring.used->flags & VRING_USED_F_NO_NOTIFY))
		/* Prod other side to tell it about changes. */
		hcall(LHCALL_NOTIFY, vqring->ppages, 0, 0);
}

void
detach_buf(struct vqring *vqring, unsigned int head)
{
	unsigned int i;

	/* Clear data ptr. */
	vqring->data[head] = nil;

	/* Put back on free list: find end */
	i = head;
	while (vqring->vring.desc[i].flags & VRING_DESC_F_NEXT) {
		i = vqring->vring.desc[i].next;
		vqring->num_free++;
	}

	vqring->vring.desc[i].next = vqring->free_head;
	vqring->free_head = head;

	/* Plus final descriptor */
	vqring->num_free++;
}

int
more_used(struct vqring *vqring)
{
	return vqring->last_used_idx != vqring->vring.used->idx;
}

void *
vring_get_buf(struct vqring *vqring, unsigned int *len)
{
	void *ret;
	unsigned int i;

	if (!more_used(vqring)) {
		// iprint("No more buffers in queue\n");
		return nil;
	}

	i = vqring->vring.used->ring[vqring->last_used_idx % vqring->vring.num].id;
	*len = vqring->vring.used->ring[vqring->last_used_idx % vqring->vring.num].len;

	if (i >= vqring->vring.num) {
		panic("id %d out of range\n", i);
		return nil;
	}
	if (!vqring->data[i]) {
		panic("id %d is not a head!\n", i);
		return nil;
	}

	/* detach_buf clears data, so grab it now. */
	ret = vqring->data[i];
	detach_buf(vqring, i);
	vqring->last_used_idx++;
	return ret;
}

/* The standard layout for the ring is a continuous chunk of memory which looks
 * like this.  We assume num is a power of 2.
 *
 * struct vring
 * {
 *	// The actual descriptors (16 bytes each)
 *	struct vring_desc desc[num];
 *
 *	// A ring of available descriptor heads with free-running index.
 *	__u16 avail_flags;
 *	__u16 avail_idx;
 *	__u16 available[num];
 *
 *	// Padding to the next page boundary.
 *	char pad[];
 *
 *	// A ring of used descriptor heads with free-running index.
 *	__u16 used_flags;
 *	__u16 used_idx;
 *	struct vring_used_elem used[num];
 * };
 */
static void
vring_init(struct vring *vr, unsigned int num, void *p, unsigned long pagesize)
{
	vr->num = num;
	vr->desc = p;
	vr->avail = (void *)((unsigned char *)p + num*sizeof(struct vring_desc));
	vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + pagesize-1)
				& ~(pagesize - 1));
}

static unsigned
vring_size(unsigned int num, unsigned long pagesize)
{
	return ((sizeof(struct vring_desc) * num + sizeof(u16) * (2 + num)
		 + pagesize - 1) & ~(pagesize - 1))
		+ sizeof(u16) * 2 + sizeof(struct vring_used_elem) * num;
}
/* end Linux */
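/*
 * Worked example (added for illustration, not in the original): the
 * vring_size() arithmetic above for a 128-entry ring on 4096-byte pages.
 * desc[] is 128*16 = 2048 bytes; avail flags+idx+ring[] add 2*(2+128) =
 * 260 bytes; rounding 2308 up to a page boundary gives 4096; the used
 * ring adds 2*2 + 8*128 = 1028, so vring_size(128, 4096) is 5124 bytes,
 * i.e. two pages once the ring itself is page-aligned.
 */
static void
checkringsize(void)
{
	if (vring_size(128, 4096) != 5124)
		panic("vring_size arithmetic changed");
}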
/* config and feature management */

struct lguest_device *
devtolg(int devno)
{
	struct lguest_device *dev;

	/* valid device? */
	if ((devno < FirstDev) || (devno >= ndev))
		panic("devtolg: bad dev %d", devno);
	dev = &lgv[devno];
	return dev;
}

struct vqring *
lgtoring(struct lguest_device *dev, int ring)
{
	struct lguest_device_desc *lg = dev->lgd;
	struct vqring *vring;

	if (ring >= lg->num_vq)
		panic("Bad ring %d dev %p\n", ring, dev);
	vring = &dev->ring[ring];
	return vring;
}

struct vqring *
devtoring(int dev, int ring)
{
	struct lguest_device *pdev;
	struct vqring *vring;

	pdev = devtolg(dev);
	vring = lgtoring(pdev, ring);
	return vring;
}

/* these ought to be replaced, maybe, with some common function.  Ah well. */
int
lgvirq(int devno, int ring)
{
	struct vqring *vring = devtoring(devno, ring);
	return vring->irq;
}

int
lgvnumfree(int devno, int ring)
{
	struct vqring *vring = devtoring(devno, ring);
	return vring->num_free;
}

int
lgvfeaturelen(int devno)
{
	struct lguest_device *dev;

	dev = devtolg(devno);
	return dev->featurelen;
}

int
lgvconfiglen(int devno)
{
	struct lguest_device *dev;

	dev = devtolg(devno);
	return dev->configlen;
}

/* this really needs to be a file io thing at some point.  Perhaps each dev
 * should be a directory with a features file.  We're still figuring this out */
int
lgvfeature(int devno, unsigned int feature)
{
	struct lguest_device *dev;
	int ret;

	dev = devtolg(devno);
	/* recall that it is a bit number, and featurelen counts bytes */
	if (feature/8 >= dev->featurelen)
		panic("Feature out of bounds: %x on %d\n", feature, devno);
	ret = dev->features[feature/8] & (1 << (feature % 8));
	return ret;
}

void
lgvconfig(int devno, unsigned char *config, int off, int len)
{
	struct lguest_device *dev;

	dev = devtolg(devno);
	/* config offsets are byte offsets */
	if ((off + len) > dev->configlen)
		panic("Config out of bounds: %d/%d on %d\n", off, len, devno);
	memmove(config, &dev->config[off], len);
}

void
lgvdumpconfig(int devno)
{
	void dumphex(char *name, unsigned char *s, int len);
	struct lguest_device *dev;

	dev = devtolg(devno);
	dumphex("Config", dev->config, dev->configlen);
}
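/*
 * Illustrative sketch (not in the original file): how a driver would use
 * the accessors above.  A block driver, say, could read the 64-bit sector
 * count that virtio-block keeps at offset 0 of its config space and test
 * a feature bit.  The name blockprobe, the devno argument, and the
 * virtio-block config layout are assumptions for the example.
 */
static void
blockprobe(int devno)
{
	uvlong capacity;

	if (lgvconfiglen(devno) >= sizeof capacity) {
		lgvconfig(devno, (unsigned char *)&capacity, 0, sizeof capacity);
		iprint("block%d: %llud sectors\n", devno, capacity);
	}
	if (lgvfeaturelen(devno) > 0 && lgvfeature(devno, 0))
		iprint("block%d: host offers feature bit 0\n", devno);
}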
int
lgvconsout(char *a, int len)
{
	unsigned int getbuflen = 0;

	vring_add_buf(&lgv[console].ring[1], &a, &len, 1, 0, a);
	vring_kick(&lgv[console].ring[1]);
	vring_get_buf(&lgv[console].ring[1], &getbuflen);
	return getbuflen;
}

int
lgvaddrecvbuf(int dev, int ring, void *v[], int len[], int nbuf, void *tag)
{
	vring_add_buf(&lgv[dev].ring[ring], v, len, 0, nbuf, tag);
	vring_kick(&lgv[dev].ring[ring]);
	return 0;
}

int
lgvaddxmitbuf(int dev, int ring, void *v[], int len[], int nbuf, void *tag)
{
//	int i;
//	iprint("xmit dev %d ring %d nbuf %d: ", dev, ring, nbuf);
//	for(i = 0; i < nbuf; i++) iprint("%p/%d", v[i], len[i]);
//	iprint("\n");
	vring_add_buf(&lgv[dev].ring[ring], v, len, nbuf, 0, tag);
	vring_kick(&lgv[dev].ring[ring]);
	return 0;
}

int
lgvaddbuf(int dev, int ring, void *v[], int len[], int out, int in, void *tag)
{
//	int i;
//	iprint("xmit dev %d ring %d out %d in %d: ", dev, ring, out, in);
//	for(i = 0; i < out + in; i++) iprint("%p/%d,", v[i], len[i]);
//	iprint("\n");
	vring_add_buf(&lgv[dev].ring[ring], v, len, out, in, tag);
	vring_kick(&lgv[dev].ring[ring]);
	return 0;
}

int
lgvaddconsbuf(unsigned char *v, int len)
{
	return lgvaddrecvbuf(console, 0, &v, &len, 1, v);
}

void *
lgvgetbuf(int dev, int ring, int *plen)
{
	void *vring_get_buf(struct vqring *vqring, unsigned int *len);
	void *ret;

	*plen = 0;
	ret = vring_get_buf(&lgv[dev].ring[ring], (unsigned int *)plen);
	return ret;
}

int
lgvgetconsbuf(unsigned char *, int)
{
	int plen = 0;

	lgvgetbuf(console, 0, &plen);
	return plen;
}

void
lgvconsin(void *a, int len, char *name, void *f)
{
	iprint("lgv consin a %p len %d f %p\n", a, len, f);
	if (f)
		intrenable(lgv[console].ring[0].irq + 32, f, a, BUSUNKNOWN, name);
	vring_add_buf(&lgv[console].ring[0], &a, &len, 0, 1, a);
	vring_kick(&lgv[console].ring[0]);
}

int
findlgv(char *name)
{
	int ret = -1;
	int i;

	for(i = 0; i < ndev; i++) {
		if (!strcmp(name, lgvdir[i].name)){
			iprint("findlgv %s, @%d\n", name, i);
			ret = i;
			break;
		}
	}
	return ret;
}

static Chan*
lgvattach(char* spec)
{
	return devattach('Z', spec);
}

static Walkqid*
lgvwalk(Chan* c, Chan *nc, char** name, int nname)
{
	return devwalk(c, nc, name, nname, lgvdir, ndev, devgen);
}

static int
lgvstat(Chan* c, uchar* dp, int n)
{
	return devstat(c, dp, n, lgvdir, ndev, devgen);
}

static void
lgvintr(Ureg *, void *)
{
//	hcall(LHCALL_CRASH, paddr("data is not aligned"), 0, 0);
//	panic("lgvintr");
	hcall(LHCALL_NOTIFY, paddr("lgvintr\n"), 0, 0);
}

static Chan*
lgvopen(Chan* c, int omode)
{
	return devopen(c, omode, lgvdir, ndev, devgen);
}

static void
lgvclose(Chan*)
{
}

static long
lgvread(Chan* c, void* buf, long n, vlong off)
{
	ulong offset = off;

	if(c->qid.type & QTDIR)
		return devdirread(c, buf, n, lgvdir, ndev, devgen);

	switch((ulong)c->qid.path){
	case Qlgv:{
		int l;
		int i, j;
		char *p;

		if((p = malloc(READSTR)) == nil)
			error(Enomem);
		for(l = 0, i = FirstDev; i < ndev; i++){
			l += snprint(p+l, READSTR-l, "%s: numq %d status %d",
				lgvdir[i].name, lgv[i].lgd->num_vq, lgv[i].lgd->status);
			for(j = 0; j < lgv[i].lgd->num_vq; j++)
				l += snprint(p+l, READSTR-l, "[%d irq %d]", j, lgvirq(i, j));
			l += snprint(p+l, READSTR-l, "\n");
		}
		n = readstr(offset, buf, n, p);
		free(p);
		return n;
	}
	}
	error(Ebadarg);
	return 0;
}

/* Not ready yet to make the write go where it should. */
static long
lgvwrite(Chan*, void*, long, vlong)
{
	error(Eperm);
	return 0;
}

void
configdesc(struct vqring *ring)
{
	int k;

	/* chain every descriptor onto the initial free list */
	for (k = 0; k < ring->num-1; k++)
		ring->vring.desc[k].next = k+1;
}

void
configring(struct vqring *ring, unsigned char *v, char *)
{
	unsigned long pfn;

	memmove(&ring->num, &v[0], 2);
	memmove(&ring->irq, &v[2], 2);
	memmove(&pfn, &v[4], 4);
	ring->ppages = pfn << PGSHIFT;
	/* 16 bytes per entry.  So it is ring->num * BY2PG / 16 */
	ring->pages = vmap(pfn << PGSHIFT, ring->num*BY2PG/16);
	vring_init(&ring->vring, ring->num, ring->pages, BY2PG);
	iprint("ring %p: num %d irq %d pages %p\n",
		ring, ring->num, ring->irq, ring->pages);
	ring->q = qopen(4096, 0, 0, 0);
	ring->num_free = ring->num;
	ring->free_head = 0;
	configdesc(ring);
}
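/*
 * Illustrative sketch (not in the original file): the ring layout (see
 * the vring comment earlier) assumes num is a power of 2, and configring()
 * trusts the launcher to honor that.  A paranoid version could verify the
 * values it just read; the function name checkring and its notion of
 * "sane" are assumptions, not existing driver code.
 */
static void
checkring(struct vqring *ring)
{
	if (ring->num <= 0 || (ring->num & (ring->num - 1)) != 0)
		panic("checkring: num %d is not a power of 2", ring->num);
	if (ring->num > nelem(ring->data))
		panic("checkring: num %d overflows the %d-entry token array",
			ring->num, nelem(ring->data));
}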
struct lguest_device_desc *
configldev(struct lguest_device_desc *l, struct lguest_device *lg, char *name)
{
	unsigned char *v;
	int j;
	unsigned char *cp;

	iprint("Dev %d, %d, %d, %d, %d, ...\n",
		l->type, l->num_vq, l->feature_len, l->config_len, l->status);
	for(j = 0, v = l->config; j < l->num_vq; j++)
		configring(&lg->ring[j], &v[j*8], name);
	l->status = 1;
	/* set up feature and config pointers */
	cp = (unsigned char *)l;
	cp += 5 + l->num_vq*8;
	lg->features = cp;
	lg->featurelen = l->feature_len;
	cp += l->feature_len*2;
	lg->config = cp;
	lg->configlen = l->config_len;
	cp += l->config_len;
	l = (struct lguest_device_desc *)cp;
	return l;
}

void
lgdir(struct lguest_device_desc *l, int path, Dirtab *d)
{
	/* set up the name */
	sprint(d->name, devs[l->type].format, devs[l->type].unit);
	d->qid.path = path;
	d->qid.vers = 0;
	d->qid.type = l->type;
	d->length = 0;
	d->perm = 0644;
}

void
lgvreset(void)
{
	void dumphex(char *name, unsigned char *s, int len);
	extern struct lguest_device_desc *lgd;
	struct lguest_device_desc *l = lgd, *nextl;
	int i;

	iprint("lgv reset\n");
	dumphex("lgd", (uchar *)lgd, 256);
	/* let's dump some devices */
	{
		struct lguest_device_desc *dl = lgd;
		unsigned char *cp;
		int j;

		for(j = 0; j < 10 && dl->type; j++){
			iprint("Dev %d, %d, %d, %d, %d, ...\n",
				dl->type, dl->num_vq, dl->feature_len, dl->config_len, dl->status);
			cp = (unsigned char *)dl;
			cp += 5 + dl->feature_len*2 + dl->config_len + dl->num_vq*8;
			dl = (struct lguest_device_desc *)cp;
		}
	}
	for(i = FirstDev; i < Qmax && l->type; i++, l = nextl, ndev++){
		lgv[i].lgd = l;
		nextl = configldev(l, &lgv[i], lgvdir[i].name);
		lgdir(l, i, &lgvdir[i]);
		if (l->type == QTCONS) {
			char *a = "============================== hi there ========================\n";
			iprint("Found a console! try output!\n");
			console = i;
			lgvconsout(a, strlen(a));
			lgvconsout(a, strlen(a));
			lgvconsout(a, strlen(a));
		}
	}
	iprint("lgv reset done\n");
}

Dev lgvdevtab = {
	'Z',
	"lgv",

	lgvreset,
	devinit,
	devshutdown,
	lgvattach,
	lgvwalk,
	lgvstat,
	lgvopen,
	devcreate,
	lgvclose,
	lgvread,
	devbread,
	lgvwrite,
	devbwrite,
	devremove,
	devwstat,
};
lguest25/virtio_ring.h 664 0 0 10061 11001234710 14057ustar00bootesbootes
/* An interface for efficient virtio implementation, currently for use by KVM
 * and lguest, but hopefully others soon.  Do NOT change this since it will
 * break existing servers and clients.
 *
 * This header is BSD licensed so anyone can use the definitions to implement
 * compatible drivers/servers.
 *
 * Copyright Rusty Russell IBM Corporation 2007. */

/* This marks a buffer as continuing via the next field. */
#define VRING_DESC_F_NEXT	1
/* This marks a buffer as write-only (otherwise read-only). */
#define VRING_DESC_F_WRITE	2

/* The Host uses this in used->flags to advise the Guest: don't kick me when
 * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
 * will still kick if it's out of buffers. */
#define VRING_USED_F_NO_NOTIFY	1
/* The Guest uses this in avail->flags to advise the Host: don't interrupt me
 * when you consume a buffer.  It's unreliable, so it's simply an
 * optimization.
 */
#define VRING_AVAIL_F_NO_INTERRUPT	1

typedef uvlong u64;

/* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
struct vring_desc {
	/* Address (guest-physical). */
	u64 addr;
	/* Length. */
	u32 len;
	/* The flags as indicated above. */
	u16 flags;
	/* We chain unused descriptors via this, too */
	u16 next;
};

struct vring_avail {
	u16 flags;
	u16 idx;
	u16 ring[];
};

/* u32 is used here for ids for padding reasons. */
struct vring_used_elem {
	/* Index of start of used descriptor chain. */
	u32 id;
	/* Total length of the descriptor chain which was used (written to) */
	u32 len;
};

struct vring_used {
	u16 flags;
	u16 idx;
	struct vring_used_elem ring[];
};

struct vring {
	unsigned int num;
	struct vring_desc *desc;
	struct vring_avail *avail;
	struct vring_used *used;
};

/* The standard layout for the ring is a continuous chunk of memory which looks
 * like this.  We assume num is a power of 2.
 *
 * struct vring
 * {
 *	// The actual descriptors (16 bytes each)
 *	struct vring_desc desc[num];
 *
 *	// A ring of available descriptor heads with free-running index.
 *	u16 avail_flags;
 *	u16 avail_idx;
 *	u16 available[num];
 *
 *	// Padding to the next page boundary.
 *	char pad[];
 *
 *	// A ring of used descriptor heads with free-running index.
 *	u16 used_flags;
 *	u16 used_idx;
 *	struct vring_used_elem used[num];
 * };
 */

/* normalized from the lguest passed-in data */
struct vqring {
	int num;
	int irq;
	ulong ppages;		/* physical page address */
	void *pages;
	struct vring vring;

	/* Other side has made a mess, don't try any more. */
	int broken;

	/* Number of free buffers */
	unsigned int num_free;
	/* Head of free buffer list. */
	unsigned int free_head;
	/* Number we've added since last sync. */
	unsigned int num_added;

	/* Last used index we've seen. */
	u16 last_used_idx;

	/* How to notify other side.  FIXME: commonalize hcalls! */
	void (*notify)(struct virtqueue *vq);

#ifdef DEBUG
	/* They're supposed to lock for us. */
	unsigned int in_use;
#endif

	/* Tokens for callbacks. */
	unsigned char *data[128];

	/* and one queue for each ring.  Good idea?  Who knows? */
	/* need to model this after lgether, which was zero-copy.  Later. */
	Queue *q;
};

#ifdef NO
static inline void vring_init(struct vring *vr, unsigned int num, void *p,
			      unsigned long pagesize)
{
	vr->num = num;
	vr->desc = p;
	vr->avail = p + num*sizeof(struct vring_desc);
	vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + pagesize-1)
				& ~(pagesize - 1));
}

static inline unsigned vring_size(unsigned int num, unsigned long pagesize)
{
	return ((sizeof(struct vring_desc) * num + sizeof(u16) * (2 + num)
		 + pagesize - 1) & ~(pagesize - 1))
		+ sizeof(u16) * 2 + sizeof(struct vring_used_elem) * num;
}

#ifdef __KERNEL__
#include <linux/irqreturn.h>
struct virtio_device;
struct virtqueue;

struct virtqueue *vring_new_virtqueue(unsigned int num,
				      struct virtio_device *vdev, void *pages,
				      void (*notify)(struct virtqueue *vq),
				      void (*callback)(struct virtqueue *vq));
void vring_del_virtqueue(struct virtqueue *vq);
irqreturn_t vring_interrupt(int irq, void *_vq);
#endif
#endif
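/*
 * Illustrative sketch (not part of the original header): the avail and
 * used indices above are free-running u16 counters.  Consumers compare
 * them with != rather than < (see more_used() in devlgv.c), so 16-bit
 * wraparound is harmless as long as fewer than 65536 entries are
 * outstanding.  The helper below is an assumption about usage, showing
 * that plain u16 subtraction yields the pending count even across a wrap.
 */
static int
pendingcount(u16 used_idx, u16 last_used_idx)
{
	/* e.g. used_idx == 1, last_used_idx == 0xfffe gives 3 */
	return (u16)(used_idx - last_used_idx);
}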