/*
 * intel 10gbe pcie driver
 * copyright © 2007, coraid, inc.
 */
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "../port/error.h"
#include "../port/netif.h"
#include "etherif.h"

/*
 * // comments note conflicts with 82563-style drivers
 *
 * all register names below are indices into Ctlr.reg, an array of
 * 32-bit words; hence the byte offset divided by 4.
 */
enum{
	/* general */
	Ctrl		= 0x00000/4,	/* Device Control */
	Status		= 0x00008/4,	/* Device Status */
	Ctrlext		= 0x00018/4,	/* Extended Device Control */
	Esdp		= 0x00020/4,	// extended sdp control
	Esodp		= 0x00028/4,	// extended od sdp control
	Ledctl		= 0x00200/4,	// led control
	Tcptimer	= 0x0004c/4,	// tcp timer
	Ecc		= 0x110b0/4,	// errata ecc control magic

	/* nvm */
	Eec		= 0x10010/4,	// eeprom/flash control
	Eerd		= 0x10014/4,	// eeprom read
	Fla		= 0x1001c/4,	// flash access
	Flop		= 0x1013c/4,	// flash opcode
	Grc		= 0x10200/4,	// general rx control

	/* interrupt */
	Icr		= 0x00800/4,	// interrupt cause read
	Ics		= 0x00808/4,	// " set
	Ims		= 0x00880/4,	// " mask read/set
	Imc		= 0x00888/4,	// " mask clear
	Iac		= 0x00810/4,	// " auto clear
	Iam		= 0x00890/4,	// " auto mask enable
	Itr		= 0x00820/4,	// " throttling rate (0-19)
	Ivar		= 0x00900/4,	// " vector allocation regs.

	/* msi interrupt */
	Msixt		= 0x0000/4,	// msix table (bar3)
	Msipba		= 0x2000/4,	// msix pending bit array (bar3)
	Pbacl		= 0x11068/4,	// pba clear
	Gpie		= 0x00898/4,	// general purpose int enable

	/* flow control */
	Pfctop		= 0x03008/4,	// priority flow ctl type opcode
	Fcttv		= 0x03200/4,	// " transmit timer value (0-3)
	Fcrtl		= 0x03220/4,	// " rx threshold low (0-7) +8n
	Fcrth		= 0x03260/4,	// " rx threshold high (0-7) +8n
	Rcrtv		= 0x032a0/4,	// " refresh value threshold
	Tfcs		= 0x0ce00/4,	// " tx status

	/* rx dma */
	Rbal		= 0x01000/4,	// rx desc base low (0-63) +0x40n
	Rbah		= 0x01004/4,	// " high
	Rdlen		= 0x01008/4,	// " length
	Rdh		= 0x01010/4,	// " head
	Rdt		= 0x01018/4,	// " tail
	Rxdctl		= 0x01028/4,	// " control

	Srrctl		= 0x02100/4,	// split and replication rx ctl.
	Dcarxctl	= 0x02200/4,	// rx dca control
	Rdrxctl		= 0x02f00/4,	// rx dma control
	Rxpbsize	= 0x03c00/4,	// rx packet buffer size
	Rxctl		= 0x03000/4,	// rx control
	Dropen		= 0x03d04/4,	// drop enable control

	/* rx */
	Rxcsum		= 0x05000/4,	// rx checksum control
	Rfctl		= 0x04008/4,	// rx filter control; NOTE(review): offset differs from its 0x05xxx neighbours — confirm against datasheet
	Mta		= 0x05200/4,	// multicast table array (0-127)
	Ral		= 0x05400/4,	// rx address low
	Rah		= 0x05404/4,
	Psrtype		= 0x05480/4,	// packet split rx type.
	Vfta		= 0x0a000/4,	// vlan filter table array.
	Fctrl		= 0x05080/4,	// filter control
	Vlnctrl		= 0x05088/4,	// vlan control
	Msctctrl	= 0x05090/4,	// multicast control
	Mrqc		= 0x05818/4,	// multiple rx queues cmd
	Vmdctl		= 0x0581c/4,	// vmdq control
	Imir		= 0x05a80/4,	// immediate irq rx (0-7)
	Imirext		= 0x05aa0/4,	// immediate irq rx ext
	Imirvp		= 0x05ac0/4,	// immediate irq vlan priority
	Reta		= 0x05c00/4,	// redirection table
	Rssrk		= 0x05c80/4,	// rss random key

	/* tx */
	Tdbal		= 0x06000/4,	// tx desc base low +0x40n
	Tdbah		= 0x06004/4,	// " high
	Tdlen		= 0x06008/4,	// " len
	Tdh		= 0x06010/4,	// " head
	Tdt		= 0x06018/4,	// " tail
	Txdctl		= 0x06028/4,	// " control
	Tdwbal		= 0x06038/4,	// " write-back address low
	Tdwbah		= 0x0603c/4,

	Dtxctl		= 0x07e00/4,	// tx dma control
	Tdcatxctrl	= 0x07200/4,	// tx dca register (0-15)
	Tipg		= 0x0cb00/4,	// tx inter-packet gap
	Txpbsize	= 0x0cc00/4,	// tx packet-buffer size (0-15)

	/* mac */
	Hlreg0		= 0x04240/4,	// highlander control reg 0
	Hlreg1		= 0x04244/4,	// highlander control reg 1 (ro)
	Msca		= 0x0425c/4,	// mdi signal cmd & addr
	Msrwd		= 0x04260/4,	// mdi single rw data
	Mhadd		= 0x04268/4,	// mac addr high & max frame
	Pcss1		= 0x04288/4,	// xgxs status 1
	Pcss2		= 0x0428c/4,
	Xpcss		= 0x04290/4,	// 10gb-x pcs status
	Serdesc		= 0x04298/4,	// serdes control
	Macs		= 0x0429c/4,	// fifo control & report
	Autoc		= 0x042a0/4,	// autodetect control & status
	Links		= 0x042a4/4,	// link status
	Autoc2		= 0x042a8/4,
};

/* bit definitions for the registers above */
enum{
	/* Ctrl */
	Rst		= 1<<26,	/* full nic reset */

	/* Txdctl */
	Ten		= 1<<25,

	/* Fctrl */
	Bam		= 1<<10,	/* broadcast accept mode */
	Upe		= 1<<9,		/* unicast promiscuous */
	Mpe		= 1<<8,		/* multicast promiscuous */

	/* Rxdctl */
	Pthresh		= 0,		/* prefetch threshold shift in bits */
	Hthresh		= 8,		/* host buffer minimum threshold " */
	Wthresh		= 16,		/* writeback threshold */
	Renable		= 1<<25,

	/* Rxctl */
	Rxen		= 1<<0,
	Dmbyps		= 1<<1,

	/* Rdrxctl */
	Rdmt½		= 0,
	Rdmt¼		= 1,
	Rdmt⅛		= 2,

	/* Rxcsum */
	Ippcse		= 1<<12,	/* ip payload checksum enable */

	/* Eerd */
	EEstart		= 1<<0,		/* Start Read */
	EEdone		= 1<<1,		/* Read done */

	/* interrupts */
	Irx0		= 1<<0,		/* driver defined */
	Itx0		= 1<<1,		/* driver defined */
	Lsc		= 1<<20,	/* link status change */

	/* Links */
	Lnkup		= 1<<30,
	Lnkspd		= 1<<29,

	/* Hlreg0 */
	Jumboen		= 1<<2,
};

/* one hardware statistics counter: byte offset of the register and a label */
typedef struct{
	uint	reg;
	char	*name;
}Stat;

Stat stattab[] = {
	0x4000,	"crc error",
	0x4004,	"illegal byte",
	0x4008,	"short packet",
	0x3fa0,	"missed pkt0",
	0x4034,	"mac local flt",
	0x4038,	"mac rmt flt",
	0x4040,	"rx length err",
	0x3f60,	"xon tx",
	0xcf60,	"xon rx",
	0x3f68,	"xoff tx",
	0xcf68,	"xoff rx",
	0x405c,	"rx 040",
	0x4060,	"rx 07f",
	0x4064,	"rx 100",
	0x4068,	"rx 200",
	0x406c,	"rx 3ff",
	0x4070,	"rx big",
	0x4074,	"rx ok",
	0x4078,	"rx bcast",
	0x3fc0,	"rx no buf0",
	0x40a4,	"rx runt",
	0x40a8,	"rx frag",
	0x40ac,	"rx ovrsz",
	0x40b0,	"rx jab",
	0x40d0,	"rx pkt",
	0x40d4,	"tx pkt",
	0x40d8,	"tx 040",
	0x40dc,	"tx 07f",
	0x40e0,	"tx 100",
	0x40e4,	"tx 200",
	0x40e8,	"tx 3ff",
	0x40ec,	"tx big",
	0x40f4,	"tx bcast",
	0x4120,	"xsum err",
};

/* status */
enum{
	Pif	= 1<<7,	/* past exact filter (sic) */
	Ipcs	= 1<<6,	/* ip checksum calculated */
	L4cs	= 1<<5,	/* layer 4 checksum */
	Tcpcs	= 1<<4,	/* tcp checksum calculated */
	Vp	= 1<<3,	/* 802.1q packet matched vet */
	Ixsm	= 1<<2,	/* ignore checksum */
	Reop	= 1<<1,	/* end of packet */
	Rdd	= 1<<0,	/* descriptor done */
};

/* receive descriptor */
typedef struct{
	u32int	addr[2];
	ushort	length;
	ushort	cksum;
	uchar	status;
	uchar	errors;
	ushort	vlan;
}Rd;

enum{
	/* Td cmd */
	Rs	= 1<<3,
	Ic	= 1<<2,
	Ifcs	= 1<<1,
	Teop	= 1<<0,

	/* Td status */
	Tdd	= 1<<0,
};

/* transmit descriptor */
typedef struct{
	u32int	addr[2];
	ushort	length;
	uchar	cso;
	uchar	cmd;
	uchar	status;
	uchar	css;
	ushort	vlan;
}Td;

enum{
	Factive		= 1<<0,
	Fstarted	= 1<<1,
};

/* per-controller state */
typedef struct{
	Pcidev	*p;
	u32int	*reg;		/* mapped bar 0 */
	u32int	*reg3;		/* mapped bar 3 */
	uchar	flag;
	int	nrd, ntd, nrb, rbsz;
	QLock	slock, alock, tlock;
	Rendez	lrendez, trendez, rrendez;
	uint	im, lim, rim, tim;	/* im: shadow of enabled interrupts; ?im: wakeup flags for the kprocs */
	Lock	imlock;
	char	*alloc;		/* single allocation holding both rings and both Block* arrays */

	Rd	*rdba;
	Block	**rb;
	uint	rdt, rdfree;

	Td	*tdba;
	uint	tdh, tdt;
	Block	**tb;

	uchar	ra[Eaddrlen];
	u32int	mta[128];	/* shadow of the 32-bit Mta registers; must hold bits 0-31 */
	ulong	stats[nelem(stattab)];
	uint	speeds[3];
}Ctlr;

/* tweakable parameters */
enum{
	Rbsz	= 12*1024,
	Nrd	= 256,
	Ntd	= 256,
	Nrb	= 256,
};

static	Ctlr	*ctlrtab[4];
static	int	nctlr;
static	Lock	rblock;
static	Block	*rbpool;

/*
 * accumulate every hardware counter listed in stattab
 * into c->stats, under c->slock.
 */
static void
readstats(Ctlr *c)
{
	int i;

	qlock(&c->slock);
	for(i = 0; i < nelem(c->stats); i++)
		c->stats[i] += c->reg[stattab[i].reg>>2];
	qunlock(&c->slock);
}

/* mbps reported for each index computed in lproc */
static int speedtab[] = {
	0,
	1000,
	10000,
};

/*
 * format the non-zero statistics, link speed transition counts and
 * receive ring indices into a temporary buffer and copy the part
 * selected by offset/n to a.  returns the byte count from readstr.
 */
static long
ifstat(Ether *e, void *a, long n, ulong offset)
{
	Ctlr *c;
	char *s, *p, *q;
	uint i, *t;

	c = e->ctlr;
	p = s = malloc(2*READSTR);
	if(s == 0)
		error(Enomem);
	q = p+2*READSTR;

	readstats(c);
	for(i = 0; i<nelem(stattab); i++)
		if(c->stats[i]>0)
			p = seprint(p, q, "%.10s  %uld\n", stattab[i].name, c->stats[i]);
	t = c->speeds;
	p = seprint(p, q, "speeds: 0:%d 1000:%d 10000:%d\n", t[0], t[1], t[2]);
	/* head is printed under "rdh" and tail under "rdt" */
	seprint(p, q, "rdfree %d rdh %d rdt %d\n", c->rdfree, c->reg[Rdh], c->reg[Rdt]);
	n = readstr(offset, a, n, s);
	free(s);

	return n;
}

/* enable the interrupt bits in i: update the shadow mask and write Ims */
static void
im(Ctlr *c, int i)
{
	ilock(&c->imlock);
	c->im |= i;
	c->reg[Ims] = c->im;
	iunlock(&c->imlock);
}

/* sleep condition for lproc */
static int
lim(void *v)
{
	return ((Ctlr*)v)->lim != 0;
}

/*
 * link kproc: sample Links, publish link state and speed to the
 * Ether, re-enable the link status change interrupt and sleep
 * until interrupt() sets c->lim.
 */
static void
lproc(void *v)
{
	Ether *e;
	Ctlr *c;
	int r, i;

	e = v;
	c = e->ctlr;
	for(;;){
		r = c->reg[Links];
		e->link = (r&Lnkup) != 0;
		i = 0;
		if(e->link)
			i = 1+((r&Lnkspd) != 0);
		c->speeds[i]++;
		e->mbps = speedtab[i];
		c->lim = 0;
		im(c, Lsc);
		sleep(&c->lrendez, lim, c);
		c->lim = 0;
	}
}

/* no control messages are supported; always raises Ebadarg */
static long
ctl(Ether *, void *, long)
{
	error(Ebadarg);
	return -1;
}

/* pop one block from the shared receive buffer pool; 0 if the pool is empty */
static Block*
rballoc(void)
{
	Block *b;

	ilock(&rblock);
	if(b = rbpool){
		rbpool = b->next;
		b->next = 0;
	}
	iunlock(&rblock);
	return b;
}

/*
 * Block free hook installed by attach: reset the read/write pointers
 * to the page-rounded base and push the block back on rbpool.
 */
void
rbfree(Block *b)
{
	b->rp = b->wp = (uchar*)PGROUND((uintptr)b->base);
	ilock(&rblock);
	b->next = rbpool;
	rbpool = b;
	iunlock(&rblock);
}

/* successor of ring index x; m is ring size minus one (size is a power of two) */
#define Next(x, m)	(((x)+1) & (m))

/*
 * free the blocks of descriptors following tdh for as long as the
 * hardware has marked them done (Tdd); returns the new head index.
 */
static int
cleanup(Ctlr *c, int tdh)
{
	Block *b;
	uint m, n;

	m = c->ntd-1;
	while(c->tdba[n = Next(tdh, m)].status&Tdd){
		tdh = n;
		b = c->tb[tdh];
		c->tb[tdh] = 0;
		freeb(b);
		c->tdba[tdh].status = 0;
	}
	return tdh;
}

/*
 * move up to 8 packets from e->oq onto the transmit ring.  if the
 * lock is busy or the ring is full, enable the tx interrupt so that
 * tproc retries later.
 */
void
transmit(Ether *e)
{
	Ctlr *c;
	Block *b;
	Td *t;
	uint i, m, tdt, tdh;

	c = e->ctlr;
//	qlock(&c->tlock);
	if(!canqlock(&c->tlock)){
		im(c, Itx0);
		return;
	}
	tdh = c->tdh = cleanup(c, c->tdh);
	tdt = c->tdt;
	m = c->ntd-1;
	for(i = 0; i<8; i++){
		if(Next(tdt, m) == tdh){
			im(c, Itx0);
			break;
		}
		if(!(b = qget(e->oq)))
			break;
		t = c->tdba+tdt;
		t->addr[0] = PCIWADDR(b->rp);
		t->length = BLEN(b);
		t->cmd = Rs|Ifcs|Teop;
		c->tb[tdt] = b;
		tdt = Next(tdt, m);
	}
	if(i){
		c->tdt = tdt;
		c->reg[Tdt] = tdt;
	}
	qunlock(&c->tlock);
}

/* sleep condition for tproc */
static int
tim(void *c)
{
	return ((Ctlr*)c)->tim != 0;
}

/* transmit kproc: wait for interrupt() to set c->tim, then call transmit */
static void
tproc(void *v)
{
	Ether *e;
	Ctlr *c;

	e = v;
	c = e->ctlr;
	for(;;){
		sleep(&c->trendez, tim, c);	/* transmit kicks us */
		c->tim = 0;
		transmit(e);
	}
}

/*
 * disable the receiver, release any blocks still attached to the
 * ring, program the filter, buffer size and ring registers, then
 * enable the queue and the receiver.
 */
static void
rxinit(Ctlr *c)
{
	Block *b;
	int i;

	c->reg[Rxctl] &= ~Rxen;
	for(i = 0; i<c->nrd; i++){
		b = c->rb[i];
		c->rb[i] = 0;
		if(b)
			freeb(b);
	}
	c->rdfree = 0;

	c->reg[Fctrl] |= Bam;
	/* NOTE(review): Ipcs is declared with the descriptor status bits; Ippcse is the bit declared for Rxcsum — confirm which was intended */
	c->reg[Rxcsum] |= Ipcs;
	c->reg[Srrctl] = (c->rbsz+1023)/1024;
	c->reg[Mhadd] = c->rbsz<<16;
	c->reg[Hlreg0] |= Jumboen;

	c->reg[Rbal] = PCIWADDR(c->rdba);
	c->reg[Rbah] = 0;
	c->reg[Rdlen] = c->nrd*sizeof(Rd);
	c->reg[Rdh] = 0;
	c->reg[Rdt] = c->rdt = 0;

	c->reg[Rdrxctl] = Rdmt¼;
	c->reg[Rxdctl] = 8<<Wthresh | 8<<Pthresh | 4<<Hthresh | Renable;
	c->reg[Rxctl] |= Rxen|Dmbyps;
}

/*
 * attach fresh buffers to receive descriptors from c->rdt up to
 * (but not including) the slot before rdh; write the new tail to
 * the hardware if anything was added.
 */
static void
replenish(Ctlr *c, uint rdh)
{
	Rd *r;
	Block *b;
	int rdt, m, i;

	m = c->nrd-1;
	i = 0;
	for(rdt = c->rdt; Next(rdt, m) != rdh; rdt = Next(rdt, m)){
		r = c->rdba+rdt;
		if(!(b = rballoc())){
			print("82598: no buffers\n");
			break;
		}
		c->rb[rdt] = b;
		r->addr[0] = PCIWADDR(b->rp);
		r->status = 0;
		c->rdfree++;
		i++;
	}
	if(i)
		c->reg[Rdt] = c->rdt = rdt;
}

/* sleep condition for rproc */
static int
rim(void *v)
{
	return ((Ctlr*)v)->rim != 0;
}

static uchar zeroea[Eaddrlen];

/*
 * receive kproc: refill the ring, enable the rx interrupt and sleep;
 * on wakeup hand every completed descriptor's block to etheriq,
 * setting the checksum flags the hardware reported.
 */
void
rproc(void *v)
{
	Ether *e;
	Ctlr *c;
	Block *b;
	Rd *r;
	uint m, rdh;

	e = v;
	c = e->ctlr;
	m = c->nrd-1;
	rdh = 0;
loop:
	replenish(c, rdh);
	im(c, Irx0);
	sleep(&c->rrendez, rim, c);
loop1:
	c->rim = 0;
	if(c->nrd-c->rdfree >= 16)
		replenish(c, rdh);
	r = c->rdba+rdh;
	if(!(r->status&Rdd))
		goto loop;		/* nothing more to do; rearm and sleep */
	b = c->rb[rdh];
	c->rb[rdh] = 0;
	b->wp += r->length;
	b->lim = b->wp;		/* lie like a dog */
	if(!(r->status&Ixsm)){
		if(r->status&Ipcs)
			b->flag |= Bipck;
		if(r->status&Tcpcs)
			b->flag |= Btcpck|Budpck;
		b->checksum = r->cksum;
	}
//	r->status = 0;
	etheriq(e, b, 1);
	c->rdfree--;
	rdh = Next(rdh, m);
	goto loop1;
}

/* set or clear unicast and multicast promiscuous mode in Fctrl */
static void
promiscuous(void *a, int on)
{
	Ether *e;
	Ctlr *c;

	e = a;
	c = e->ctlr;
	if(on)
		c->reg[Fctrl] |= Upe|Mpe;
	else
		c->reg[Fctrl] &= ~(Upe|Mpe);
}

/*
 * set or clear the multicast table bit selected by the last two
 * bytes of ea: ea[5]>>1 picks the 32-bit Mta word and
 * (ea[5]&1)<<4 | ea[4]>>4 picks the bit (0-31) within it.
 */
static void
multicast(void *a, uchar *ea, int on)
{
	Ether *e;
	Ctlr *c;
	int b, i;
	u32int bit;

	e = a;
	c = e->ctlr;
	i = ea[5]>>1;
	b = (ea[5]&1)<<4|ea[4]>>4;
	bit = (u32int)1<<b;	/* unsigned: b may be 31 */
	if(on)
		c->mta[i] |= bit;
	else
		c->mta[i] &= ~bit;
	c->reg[Mta+i] = c->mta[i];
}

/*
 * mask all interrupts and reset the nic.  returns -1 if the Rst bit
 * has not cleared after 100 polls 1ms apart, 0 otherwise.  on success
 * also clears state the reset leaves behind.
 */
static int
detach(Ctlr *c)
{
	int i;

	c->reg[Imc] = ~0;
	c->reg[Ctrl] |= Rst;
	for(i = 0; i < 100; i++){
		delay(1);
		if((c->reg[Ctrl]&Rst) == 0)
			goto good;
	}
	return -1;
good:
	/* errata */
	delay(50);
	c->reg[Ecc] &= ~(1<<21|1<<18|1<<9|1<<6);

	/*
	 * not cleared by reset; kill it manually.
	 * the Ral/Rah pairs are 8 bytes apart, i.e. 2 words per entry.
	 */
	for(i = 1; i<16; i++)
		c->reg[Rah+2*i] &= ~(1<<31);
	for(i = 0; i<128; i++)
		c->reg[Mta+i] = 0;
	/* NOTE(review): starts at 1, leaving Vfta[0] untouched — confirm this is intended */
	for(i = 1; i<640; i++)
		c->reg[Vfta+i] = 0;
	return 0;
}

/* Ether shutdown hook: reset the controller */
static void
shutdown(Ether *e)
{
	detach(e->ctlr);
}

/*
 * read eeprom word i through Eerd.
 * ≤ 20ms
 * NOTE(review): spins without a timeout until EEdone is set.
 */
static ushort
eeread(Ctlr *c, int i)
{
	c->reg[Eerd] = EEstart|i<<2;
	while((c->reg[Eerd]&EEdone) == 0)
		;
	return c->reg[Eerd]>>16;
}

/*
 * validate the eeprom (signature bits in word 0, then a 16-bit sum
 * over the first 0x40 words plus the sections pointed to by words
 * 3-0xe that must equal 0xbaba) and load the mac address into c->ra.
 * returns 0 on success, -1 on failure.
 */
static int
eeload(Ctlr *c)
{
	ushort u, v, p, l, i, j;

	if((eeread(c, 0)&0xc0) != 0x40)
		return -1;
	u = 0;
	for(i = 0; i < 0x40; i++)
		u += eeread(c, i);
	for(i = 3; i < 0xf; i++){
		p = eeread(c, i);
		l = eeread(c, p++);
		if((int)p+l+1 > 0xffff)
			continue;
		for(j = p; j < p+l; j++)
			u += eeread(c, j);
	}
	if(u != 0xbaba)
		return -1;
	/* Status bit 3 selects which pointer word (10 or 9) locates the address */
	if(c->reg[Status]&1<<3)
		u = eeread(c, 10);
	else
		u = eeread(c, 9);
	u++;
	for(i = 0; i<Eaddrlen;){
		v = eeread(c, u+i/2);
		c->ra[i++] = v;
		c->ra[i++] = v>>8;
	}
	c->ra[5] += (c->reg[Status]&0xc)>>2;
	return 0;
}

/*
 * reset the controller, load the mac address from eeprom into the
 * first receive address register, zero the statistics and set up
 * flow control, interrupt mapping and throttling.
 * returns 0 on success, -1 on failure.
 */
static int
reset(Ctlr *c)
{
	uchar *p;
	int i;

	if(detach(c)){
		print("82598: reset timeout\n");
		return -1;
	}
	if(eeload(c)){
		print("82598: eeprom failure\n");
		return -1;
	}
	p = c->ra;
	c->reg[Ral] = p[3]<<24|p[2]<<16|p[1]<<8|p[0];
	c->reg[Rah] = p[5]<<8|p[4]|1<<31;

	/* read the counters once, then discard what was accumulated */
	readstats(c);
	for(i = 0; i<nelem(c->stats); i++)
		c->stats[i] = 0;

	c->reg[Ctrlext] |= 1<<16;
	/* make some guesses for flow control */
	c->reg[Fcrtl] = 0x10000|1<<31;
	c->reg[Fcrth] = 0x40000|1<<31;
	c->reg[Rcrtv] = 0x6000;

	/* configure interrupt mapping (don't ask) */
	c->reg[Ivar+0] = 0|1<<7;
	c->reg[Ivar+64/4] = 1|1<<7;
//	c->reg[Ivar+97/4] = (2|1<<7)<<8*(97%4);

	/* interrupt throttling goes here. */
	for(i = Itr; i<Itr+20; i++)
		c->reg[i] = 128;		/* ¼µs intervals */
	c->reg[Itr+Itx0] = 256;
	return 0;
}

/*
 * set the tx thresholds, release any blocks still on the ring,
 * clear the descriptors, program the ring registers and enable
 * the transmit queue.
 */
static void
txinit(Ctlr *c)
{
	Block *b;
	int i;

	c->reg[Txdctl] = 16<<Wthresh | 16<<Pthresh;
	for(i = 0; i<c->ntd; i++){
		b = c->tb[i];
		c->tb[i] = 0;
		if(b)
			freeb(b);
	}
	memset(c->tdba, 0, c->ntd*sizeof(Td));
	c->reg[Tdbal] = PCIWADDR(c->tdba);
	c->reg[Tdbah] = 0;
	c->reg[Tdlen] = c->ntd*sizeof(Td);
	c->reg[Tdh] = 0;
	c->reg[Tdt] = 0;
	/* cleanup() inspects the slot after tdh, so start one behind slot 0 */
	c->tdh = c->ntd-1;
	c->tdt = 0;
	c->reg[Txdctl] |= Ten;
}

/*
 * Ether attach hook.  on first call allocates the descriptor rings
 * and block arrays, fills the receive buffer pool, initialises both
 * rings and starts the link, receive and transmit kprocs.  later
 * calls return immediately.
 */
static void
attach(Ether *e)
{
	Block *b;
	Ctlr *c;
	int t;
	char buf[KNAMELEN];

	c = e->ctlr;
	qlock(&c->alock);
	if(c->alloc){
		qunlock(&c->alock);
		return;
	}

	c->nrd = Nrd;
	c->ntd = Ntd;
	/* both rings, each with slack for 256-byte alignment, then the Block* arrays */
	t = c->nrd*sizeof *c->rdba+255;
	t += c->ntd*sizeof *c->tdba+255;
	t += (c->ntd+c->nrd)*sizeof(Block*);
	c->alloc = malloc(t);
	qunlock(&c->alock);
	if(c->alloc == 0)
		error(Enomem);

	c->rdba = (Rd*)ROUNDUP((uintptr)c->alloc, 256);
	c->tdba = (Td*)ROUNDUP((uintptr)(c->rdba+c->nrd), 256);
	c->rb = (Block**)(c->tdba+c->ntd);
	c->tb = (Block**)(c->rb+c->nrd);

	if(waserror()){
		/* drain the pool; clearing b->free makes freeb really release the block */
		while(b = rballoc()){
			b->free = 0;
			freeb(b);
		}
		free(c->alloc);
		c->alloc = 0;
		nexterror();
	}
	for(c->nrb = 0; c->nrb < 2*Nrb; c->nrb++){
		if(!(b = allocb(c->rbsz+BY2PG)))
			error(Enomem);
		b->free = rbfree;
		freeb(b);		/* goes through rbfree, i.e. onto rbpool */
	}
	poperror();

	rxinit(c);
	txinit(c);

	sprint(buf, "#l%dl", e->ctlrno);
	kproc(buf, lproc, e);
	sprint(buf, "#l%dr", e->ctlrno);
	kproc(buf, rproc, e);
	sprint(buf, "#l%dt", e->ctlrno);
	kproc(buf, tproc, e);
}

/*
 * interrupt handler: mask everything, then for each enabled cause
 * that is pending set the matching wakeup flag, wake its kproc and
 * leave that cause masked; the kproc re-enables it through im().
 */
static void
interrupt(Ureg*, void *v)
{
	Ether *e;
	Ctlr *c;
	int icr, im;

	e = v;
	c = e->ctlr;
	ilock(&c->imlock);
	c->reg[Imc] = ~0;
	im = c->im;
	while(icr = c->reg[Icr]&c->im){
		if(icr&Lsc){
			im &= ~Lsc;
			c->lim = icr&Lsc;
			wakeup(&c->lrendez);
		}
		if(icr&Irx0){
			im &= ~Irx0;
			c->rim = icr&Irx0;
			wakeup(&c->rrendez);
		}
		if(icr&Itx0){
			im &= ~Itx0;
			c->tim = icr&Itx0;
			wakeup(&c->trendez);
		}
	}
	c->reg[Ims] = c->im = im;
	iunlock(&c->imlock);
}

/*
 * find supported intel devices on the pci bus, map bars 0 and 3,
 * reset each controller and record the ones that come up in ctlrtab.
 */
static void
scan(void)
{
	Pcidev *p;
	ulong io, io3;
	void *mem, *mem3;
	Ctlr *c;

	p = 0;
	while(p = pcimatch(p, 0x8086, 0)){
		switch(p->did){
		case 0x10c6:		/* 82598 af dual port */
		case 0x10c7:		/* 82598 af single port */
		case 0x10b6:		/* 82598 backplane */
		case 0x10dd:		/* 82598 at cx4 */
			break;
		default:
			continue;
		}
		if(nctlr == nelem(ctlrtab)){
			print("i82598: too many controllers\n");
			return;
		}
		io = p->mem[0].bar&~0xf;
		mem = vmap(io, p->mem[0].size);
		if(mem == 0){
			print("i82598: cant map %#p\n", p->mem[0].bar);
			continue;
		}
		io3 = p->mem[3].bar&~0xf;
		mem3 = vmap(io3, p->mem[3].size);
		if(mem3 == 0){
			print("i82598: cant map %#p\n", p->mem[3].bar);
			vunmap(mem, p->mem[0].size);
			continue;
		}
		c = malloc(sizeof *c);
		if(c == 0){
			print("i82598: no memory for controller\n");
			vunmap(mem, p->mem[0].size);
			vunmap(mem3, p->mem[3].size);
			continue;
		}
		c->p = p;
		c->reg = (u32int*)mem;
		c->reg3 = (u32int*)mem3;
		c->rbsz = Rbsz;
		if(reset(c)){
			print("i82598: cant reset\n");
			free(c);
			vunmap(mem, p->mem[0].size);
			vunmap(mem3, p->mem[3].size);
			continue;
		}
		pcisetbme(p);
		ctlrtab[nctlr++] = c;
	}
}

/*
 * bind e to the first inactive controller (or the one whose register
 * base matches e->port) and fill in the Ether fields and hooks.
 * returns 0 on success, -1 if no controller is available.
 */
static int
pnp(Ether *e)
{
	Ctlr *c;
	int i;

	if(nctlr == 0)
		scan();
	for(i = 0; i<nctlr; i++){
		c = ctlrtab[i];
		if(c == 0 || c->flag&Factive)
			continue;
		if(e->port == 0 || e->port == (ulong)c->reg)
			goto found;
	}
	return -1;
found:
	c->flag |= Factive;
	e->ctlr = c;
	e->port = (uintptr)c->reg;
	e->irq = c->p->intl;
	e->tbdf = c->p->tbdf;
	e->mbps = 10000;
	e->maxmtu = c->rbsz;
	memmove(e->ea, c->ra, Eaddrlen);

	e->arg = e;
	e->attach = attach;
	e->ctl = ctl;
	e->ifstat = ifstat;
	e->interrupt = interrupt;
	e->multicast = multicast;
	e->promiscuous = promiscuous;
	e->shutdown = shutdown;
	e->transmit = transmit;

	return 0;
}

/* register the driver under the name "i82598" */
void
ether82598link(void)
{
	addethercard("i82598", pnp);
}