/sys/src/cmd/ndb/dnresolve.c

% cat >/sys/lib/dist/changes/1176838212.1.txt << EOF
 • Add better comments
 • Disable code that was good in theory, but not in practice
 • Reduce timeout from 60s to 30s
EOF

 [geoff] --rw-rw-r-- M 64 glenda sys 30062 Apr 17 15:19 sys/src/cmd/ndb/dnresolve.c

/n/sourcesdump/2007/0417/plan9/sys/src/cmd/ndb/dnresolve.c:540,546 - /n/sourcesdump/2007/0418/plan9/sys/src/cmd/ndb/dnresolve.c:540,549
  	else if (readn(fd, lenbuf, 2) != 2) {
  		dnslog("readnet: short read of tcp size from %I",
  			qp->tcpip);
- 		/* probably a time-out; demote the ns */
+ 		/*
+ 		 * probably a time-out; demote the ns.
+ 		 * actually, the problem may be the query, not the ns.
+ 		 */
  		addslug(qp->tcpip);
  	} else {
  		len = lenbuf[0]<<8 | lenbuf[1];

/n/sourcesdump/2007/0417/plan9/sys/src/cmd/ndb/dnresolve.c:1028,1040 - /n/sourcesdump/2007/0418/plan9/sys/src/cmd/ndb/dnresolve.c:1031,1043
  	}
  	procsetname("recursive query for %s %s", qp->dp->name,
  		rrname(qp->type, buf, sizeof buf));
- 	qunlock(&qp->dp->querylck);
+ //	qunlock(&qp->dp->querylck);
  	queryinit(&nquery, qp->dp, qp->type, qp->req);
  	nquery.nsrp = tp;
  	rv = netquery(&nquery, depth+1);
- 	qlock(&qp->dp->querylck);
+ //	qlock(&qp->dp->querylck);
  	rrfreelist(tp);
  	memset(&nquery, 0, sizeof nquery); /* prevent accidents */
  	return rv;

/n/sourcesdump/2007/0417/plan9/sys/src/cmd/ndb/dnresolve.c:1246,1253 - /n/sourcesdump/2007/0418/plan9/sys/src/cmd/ndb/dnresolve.c:1249,1256
  	}
  	if(fd >= 0) {
  		qp->req->aborttime = time(nil) + (patient? Maxreqtm: Maxreqtm/2);
- 		/* tune; was (patient? 15: 10) */
  		qp->udpfd = fd;
+ 		/* tune; was (patient? 15: 10) */
  		rv = netquery1(qp, depth, ibuf, obuf, (patient? 10: 5), inns);
  		close(fd);
  	} else

/n/sourcesdump/2007/0417/plan9/sys/src/cmd/ndb/dnresolve.c:1277,1289 - /n/sourcesdump/2007/0418/plan9/sys/src/cmd/ndb/dnresolve.c:1280,1299
  	 */
  	/* don't lock before call to slave so only children can block */
- 	lock = qp->req->isslave != 0;
- 	if(lock) {
+ 	if (0)
+ 		lock = qp->req->isslave != 0;
+ 	if(0 && lock) {
  		procsetname("query lock wait for %s", qp->dp->name);
- 		/* don't make concurrent queries for this name */
+ 		/*
+ 		 * don't make concurrent queries for this name.
+ 		 *
+ 		 * this seemed like a good idea, to avoid swamping
+ 		 * an overloaded ns, but in practice, dns processes
+ 		 * pile up quickly and dns becomes unresponsive for a while.
+ 		 */
  		qlock(&qp->dp->querylck);
- 		procsetname("netquery: %s", qp->dp->name);
  	}
+ 	procsetname("netquery: %s", qp->dp->name);
  	/* prepare server RR's for incremental lookup */
  	for(rp = qp->nsrp; rp; rp = rp->next)

/n/sourcesdump/2007/0417/plan9/sys/src/cmd/ndb/dnresolve.c:1323,1331 - /n/sourcesdump/2007/0418/plan9/sys/src/cmd/ndb/dnresolve.c:1333,1340
  //	if (rv == 0) /* could ask /net.alt/dns directly */
  //		askoutdns(qp->dp, qp->type);
- 	if(lock) {
+ 	if(0 && lock)
  		qunlock(&qp->dp->querylck);
- 	}
  	return rv;
  }

/n/sourcesdump/2007/0417/plan9/sys/src/cmd/ndb/dnresolve.c:1339,1345 - /n/sourcesdump/2007/0418/plan9/sys/src/cmd/ndb/dnresolve.c:1348,1354
  	memset(&req, 0, sizeof req);
  	req.isslave = 1;
- 	req.aborttime = now + Maxreqtm*2; /* be patient */
+ 	req.aborttime = now + Maxreqtm;
  	queryinit(&query, dnlookup(root, Cin, 1), Tns, &req);
  	query.nsrp = dblookup(root, Cin, Tns, 0, 0);
  	rv = netquery(&query, 0);