libxml/ 775 0 0 0 12437702313 103455ustar00stevesyslibxml/README 664 0 0 204 12055115600 11172ustar00stevesys if you have graphviz installed you can generate a state transition diagram for the parser by running mk doc.ps -Steve (Nov 2012) libxml/doc.c 664 0 0 1176 12050142137 11254ustar00stevesys/* * Generate a state transition diagram from the state tables, * we skip error transitions as they just confuse the drawing. */ #include #include #include #include "state-machine.h" void main(void) { int os, s, a, t; print("digraph xml_parser {\n"); print(" rankdir=LR;\n"); print(" size=\"11,9\"\n"); print(" node [shape = circle];\n"); for(t = 0; t < NumToks; t++) for(os = 0; os < NumStates; os++){ s = statab[os][t]; a = acttab[os][t]; if(a != Aerr) print(" %s -> %s [ label = \"%s / %s\" ];\n", stastr[os], stastr[s], tokstr[t], actstr[a]); } print("}\n"); exits(nil); } libxml/libxml.man 644 0 0 10742 12412351645 12354ustar00stevesys.TH LIBXML 2 .SH NAME xml_attr, xml_elem, xml_find, xml_free, xml_look, xml_new, xml_parse, xml_print, xml_value \- DOM model XML library .SH SYNOPSIS .de PB .PP .ft L .nf .. .PB #include #include #include .PB enum { Fcrushwhite = 1, Fstripnamespace = 2, }; .PB struct Xml{ Elem *root; /* root of tree */ char *doctype; /* DOCTYPE structured comment, or nil */ ... 
}; .PB struct Elem { Elem *next; /* next element at this hierarchy level */ Elem *child; /* first child of this node */ Elem *parent; /* parent of this node */ Attr *attrs; /* linked list of attributes */ char *name; /* element name */ char *pcdata; /* pcdata following this element */ int line; /* Line number (for errors) */ }; .PB struct Attr { Attr *next; /* next attribute */ Elem *parent; /* parent element */ char *name; /* attribute's name */ char *value; /* attribute's value */ }; .PB .PD 0 .ta +\w'\fL 'u +\w'\fL 'u +6n +4n Attr* xml_attr(Xml *xp, Attr **root, Elem *parent, char *name, char *value) .PB Elem* xml_elem(Xml *xp, Elem **root, Elem *parent, char *name) .PB Elem* xml_find(Xml *xp, Elem *ep, char *path) .PB Elem* xml_look(Elem *ep, char *path, char *attr, char *value) .PB Xml* xml_new(int blksize) .PB Xml* xml_parse(int fd, int blksize, int flags) .PB char* xml_value(Elem *ep, char *name) .PB void xml_free(Xml *xp) .PB void xml_print(Xml *xp, int fd) .SH DESCRIPTION .PP .I LibXml is a library for manipulating an XML document in memory (known as the DOM model). Each element may have a number of children, each of which has a number of attributes; each attribute has a single value. All elements contain a pointer to their parent element, the root element having a nil parent pointer. .I Pcdata (free form text) found between elements is attached to the element which contains it. The line number where each element was found is stored to allow unambiguous error messages during later analysis. .PP Strings are held either individually in .IR malloc (2)'ed memory or aggregated in blocks. These blocks are used to create a binary tree for common strings such as element and attribute names. Uncommon strings such as values and pcdata are kept in a simple, unmanaged heap. These steps can vastly reduce the memory footprint of the parsed file and the time needed to free the XML data. .PP .I xml_parse reads the given file and builds an in-memory tree. 
.I Blocksize controls the granularity of allocation of the string heap described above, 8192 is typically used; a value of zero disables the string heap and uses traditional .IR malloc (2) calls. The .I flags field allows some control over the parser; it is a bitwise .B or of the following values: .TP 10 .B Fcrushwhite Each run of whitespace in PCdata is replaced by a single space, and leading and trailing whitespace is removed. .TP .B Fstripnamespace Remove leading namespace strings from all element and attribute names; this effectively ignores namespaces which can lead to parsing ambiguities, though in practice it has not been a problem—yet. .PP Xml trees may also be built up by calling .I xml_new to create the XML tree, followed by .I xml_elem and .I xml_attr to create individual elements and attributes respectively. .I xml_elem takes the address of the root of an element list to which the new element should be appended, the address of the parent node the new element should reference, and the name of the node to create; it returns the address of the created element. .PP .I xml_attr attaches an attribute to an existing element. It takes a list pointer and parent pointer like .IR xml_elem , but requires both an attribute name and value, and returns the address of the new attribute. .PP .I xml_look descends through the tree rooted at .I ep using the path specified in .IR path . It then returns the matching element if .I attr is nil, or continues to search for a matching element. If .I attr and .I value are not nil, the search will continue for an element which contains this attribute and value pair. .PP .I xml_value searches the given element's attribute list and returns the value of the attribute found or nil if that attribute is not found. .PP .I xml_print writes the XML hierarchy rooted at \fIxp\fR as text to the given file descriptor. .PP .I xml_free frees all memory used by the given .B Xml tree. .SH SOURCE /sys/src/libxml .SH "SEE ALSO" .IR xb (1), .IR xml2 (1). 
.SH BUGS Namespaces should be handled properly. .PP A SAX model parser will probably be needed sometime (e.g. for Ebooks). .PP UTF-16 headers should be respected but UTF-16 files seems rare. libxml/libxml.ms 664 0 0 2740 12055120424 12171ustar00stevesys The beginings of a parer on the heap structure used for libxml. here are the raw stats from parsing a few files, using strdup/malloc -vs- using the heap+tree. --------------------------------- file size = 214492 old stats 8.out: mem used=2203664 8.out: mem not free'd=1840 nfree=62409 new stats 8.out: mem used=346816 8.out: mem not free'd=0 nfree=293 old time : 0.15u 0.00s 0.17r 8.out -M /usr/steve/work/iqp/Menus/Conv/menu/IQPChannelCombined.xml new time : 0.09u 0.00s 0.11r 8.out -M /usr/steve/work/iqp/Menus/Conv/menu/IQPChannelCombined.xml larch% cat $home/wip/opc/docx/snell/*.xml | time 8.out -M > /dev/null 8.out: mem used=5767752 8.out: mem not free'd=4632 nfree=134574 0.34u 0.01s 0.37r 8.out -M larch% time cat $home/wip/opc/docx/snell/*.xml | time xb -M > /dev/null 0.00u 0.01s 0.28r cat /usr/steve/wip/opc/docx/snell/ipad.xml /usr/steve/wip/opc/docx/snell/license.xml /usr/steve/wip/opc/docx/snell/mike.xml /usr/steve/wip/opc/docx/snell/pat.xml ... 
xb: mem used=1205824 xb: mem not free'd=4294966856 nfree=14482 0.28u 0.00s 0.31r xb -M ================== larch% cat $home/wip/opc/docx/snell/*.xml | time xb -M > /dev/null xb: mem used=1205824 xb: mem not free'd=4294966856 nfree=14482 0.30u 0.01s 0.32r xb -M ================== larch% cat $home/wip/opc/docx/snell/*.xml | time 8.out -M > /dev/null 8.out: mem used=5767752 8.out: mem not free'd=4632 nfree=134574 0.34u 0.01s 0.37r 8.out -M larch% cat $home/wip/opc/docx/snell/*.xml | wc 9 32819 626208 libxml/mkfile 644 0 0 1257 12412624644 11545ustar00stevesys doc.ps install: cp xml.h /sys/include/xml.h cp libxml.man /sys/man/2/libxml dist: clean 9fs sources; cd ..; tar cv $PKG > /n/sources/contrib/$user/$PKG.tgz pkg:V: clean tar cv `{pwd} | bzip2 > $home/pkg/$PKG.tbz libxml/state-machine.h 644 0 0 4022 12050142140 13217ustar00stevesysenum { /* Lexer Tokens */ Twhite = 0, Topen, Tname, Tclose, Tequal, Tendblk, Tnulblk, NumToks }; enum { /* Parser states */ Slost = 0, Sopened = 1, Snamed = 2, Sattred = 3, Sequed = 4, Sendblk = 5, Sclosed = 6, NumStates }; enum { /* Parser Actions */ Aerr = 0, Anop = 1, Aelem = 2, Apcdata = 3, Aattr = 4, Avalue = 5, Aup = 6, Adown = 7, Acheck = 8, NumActions }; static char * tokstr[] = { /* lexer token names for debug */ [Twhite] "white", [Topen] "open", [Tname] "name", [Tclose] "close", [Tequal] "equal", [Tendblk] "endblk", [Tnulblk] "nulblk" }; static char * stastr[] = { /* parser state names for debug */ [Slost] "lost", [Sopened] "opened", [Snamed] "named", [Sattred] "attred", [Sequed] "equed", [Sendblk] "endblk", [Sclosed] "closed", }; static char * actstr[] = { /* parser action names for debug */ [Aerr] "error", [Anop] "nop", [Apcdata] "pcdata", [Aattr] "attr", [Avalue] "value", [Aelem] "elem", [Aup] "up", [Adown] "down", [Acheck] "check" }; static int statab[7][7] = { /* Parser state transition table */ /* Twhite Topen Tname Tclose Tequal Tendblk Tnulblk */ [Slost] { Slost, Sopened,Slost, Slost, Slost, Slost, Slost }, [Sopened] { 0, 
0, Snamed, 0, 0, 0, 0 }, [Snamed] { Snamed, 0, Sattred,Sendblk,0, Slost, Slost }, [Sattred] { Sattred, 0, 0, 0, Sequed, 0, 0 }, [Sequed] { Sequed, 0, Snamed, 0, 0, 0, 0 }, [Sendblk] { 0, 0, Sclosed,0, 0, 0, 0 }, [Sclosed] { 0, 0, 0, Slost, 0, 0, 0 }, }; static int acttab[7][7] = { /* Parser action table */ /* Twhite Topen Tname Tclose Tequal Tendblk Tnulblk */ [Slost] { Apcdata, Anop, Apcdata, Apcdata, Apcdata, Aup, Apcdata }, [Sopened] { 0, 0, Aelem, 0, 0, 0, 0 }, [Snamed] { Anop, 0, Aattr, Adown, 0, Anop, Anop }, [Sattred] { Anop, 0, 0, 0, Anop, 0, 0 }, [Sequed] { Anop, 0, Avalue, 0, 0, 0, 0 }, [Sendblk] { 0, 0, Acheck, 0, 0, 0, 0 }, [Sclosed] { 0, 0, 0, Anop, 0, 0, 0 }, }; ; /* parent of this node */ Attr *attrs; /* linked list of atributes */ char *name; /* element name */ char *pcdata; /* pcdata following this element */ int line; /* Line number (for errors) */ }; .PB struct Attr { Attr *next; /* next atribute */ Elem *parent; /* parent element */ char *name; /* atributes name */ char *value; /* atributes value */ }; .PB .PD 0 .ta +\w'\fL 'u +\w'\fL 'u +6n +4n Attr* xml_attr(Xml *xp, Attr **root, Elem *parent, char *name, char *libxml/xml.h 664 0 0 2724 12412351742 11322ustar00stevesys #pragma lib "libxml.a" typedef struct Xml Xml; typedef struct Attr Attr; typedef struct Elem Elem; typedef struct xml_tree xml_tree; typedef struct xml_block xml_block; #pragma incomplete xml_tree #pragma incomplete xml_block enum { Fcrushwhite = 1, Fstripnamespace = 2, }; struct Xml { Elem *root; /* root of tree */ char *doctype; /* DOCTYPE structured comment, or nil */ struct { xml_tree *root; xml_block *active; int blksiz; } alloc; }; struct Elem { Elem *next; /* next element at this hierarchy level */ Elem *child; /* first child of this node */ Elem *parent; /* parent of this node */ Attr *attrs; /* linked list of atributes */ char *name; /* element name */ char *pcdata; /* pcdata following this element */ int line; /* Line number (for errors) */ }; struct Attr { Attr *next; 
/* next atribute */ Elem *parent; /* parent element */ char *name; /* atributes name (NULL for coments) */ char *value; /* atributes value */ }; extern int xml_debug; Attr *xml_attr(Xml *, Attr **, Elem *, char *, char *); Elem *xml_elem(Xml *, Elem **, Elem *, char *); Elem *xml_find(Xml *, Elem *, char *); void xml_free(Xml *); char *xml_strdup(Xml *, char *, int); void *xml_calloc(Xml *, long, long); void *xml_malloc(Xml *, int); void _xml_heapstats(void); void _xml_heapfree(Xml *); Elem *xml_look(Elem *, char *, char *, char *); Xml *xml_new(int); Xml *xml_parse(int, int, int); void xml_print(Xml *, int); char *xml_value(Elem *, char *); been a problem—yet. .PP Xml trees may alsolibxml/xml_attr.c 640 0 0 1106 12412352307 12330ustar00stevesys#include #include #include "xml.h" Attr * xml_attr(Xml *xp, Attr **root, Elem *parent, char *name, char *value) { Attr *ap, *t; USED(xp); if((ap = xml_calloc(xp, sizeof(Attr), 1)) == nil) sysfatal("no memory - %r\n"); if(*root == nil){ *root = ap; } else{ for (t = *root; t->next; t = t->next) continue; t->next = ap; } ap->parent = parent; if(name) if((ap->name = xml_strdup(xp, name, 1)) == nil) sysfatal("no memory - %r\n"); if(value) if((ap->value = xml_strdup(xp, value, 0)) == nil) sysfatal("no memory - %r\n"); return ap; } _print writes the XML hierarchy rooted at \fIep\fR as text to the given file descriptor. .PP .I xml_free frees all memory used by the given .B Xml tree. .SH SOURCE /sys/src/libxml .SH "SEE ALSO" .IR xb (1), .IR xml2 (1). .SH BUGS Namespaces should be handled properly. .PP A SAX model parser will probably be needed sometime (e.g. for Ebooks). .PP UTF-16 headers should be respected but UTF-16 files seems rare. libxml/xml_elem.c 640 0 0 723 12412352300 12255ustar00stevesys#include #include #include "xml.h" Elem * xml_elem(Xml *xp, Elem **root, Elem *parent, char *name) { Elem *ep, *t; USED(xp); if((ep = xml_calloc(xp, sizeof(Elem), 1)) == nil) sysfatal("no memory - %r\n"); if(! 
*root){ *root = ep; } else{ for (t = *root; t->next; t = t->next) continue; t->next = ep; } ep->parent = parent; if(name) if((ep->name = xml_strdup(xp, name, 1)) == nil) sysfatal("no memory - %r\n"); return ep; } libxml/xml_find.c 640 0 0 1065 12412352273 12304ustar00stevesys#include #include #include "xml.h" /* * search for element, starting at ep. */ Elem * xml_find(Xml *xp, Elem *ep, char *path) { char *p; Elem *t; USED(xp); if (path == nil) return nil; if (*path == '/') path++; if ((p = strchr(path, '/')) == nil) if((p = strchr(path, 0)) == nil) return nil; // shut up lint ! for(; ep; ep = ep->next) if (strncmp(ep->name, path, p-path) == 0){ if (*p == 0) return ep; if (! ep->child) continue; if ((t = xml_find(xp, ep->child, p)) != nil) return t; } return nil; } libxml/xml_heap.c 644 0 0 4275 12412352266 12315ustar00stevesys#include #include #include "xml.h" #include "memalloc.h" #define Roundup(x, g) (((x) + (unsigned)(g-1)) & ~((unsigned)(g-1))) struct xml_tree { xml_tree *left; xml_tree *right; char *str; int hits; }; struct xml_block { xml_block *next; char *free; char *end; }; static int Strdups, Commons, Unique, Memblocks; static void * getmem(Xml *xp, int len) { int sz; xml_block *b; char *ret; len = Roundup(len, sizeof(long long)); sz = xp->alloc.blksiz; /* shorthand */ b = xp->alloc.active; if(len > sz) sysfatal("xml heap: object larget than blocksize (%d > %d)\n", len, sz); if(xp->alloc.active == nil || b->free + len >= b->end){ Memblocks++; b = mallocz(sizeof(xml_block) + sz, 0); b->free = (char *)&b[1]; b->end = (char *)&b->free[sz]; b->next = xp->alloc.active; xp->alloc.active = b; } ret = b->free; b->free += len; return ret; } static xml_tree * lookadd(Xml *xp, xml_tree *t, char *str, xml_tree **match) { int n; if(t == nil){ Unique++; t = getmem(xp, sizeof(xml_tree) + strlen(str)+1); t->left = nil; t->right = nil; t->str = (char *)&t[1]; strcpy(t->str, str); *match = t; t->hits = 1; return t; } if((n = strcmp(str, t->str)) == 0){ *match = t; 
t->hits++; } if(n < 0) t->left = lookadd(xp, t->left, str, match); if(n > 0) t->right = lookadd(xp, t->right, str, match); return t; } static void heapfree(Xml *xp) { xml_block *b, *n; for(b = xp->alloc.active; b; b = n){ n = b->next; if(xml_debug) memset(b, 0x7e, xp->alloc.blksiz); free(b); } } static void dumpstats(Xml *) { fprint(2, "total=%d common=%d -> unique=%d rare=%d memblocks=%d\n", Strdups, Commons, Unique, Strdups - Commons, Memblocks); } static char * dostrdup(Xml *xp, char *str, int iscommon) { char *s; xml_tree *t; Strdups++; if(iscommon){ Commons++; xp->alloc.root = lookadd(xp, xp->alloc.root, str, &t); return t->str; } s = getmem(xp, strlen(str)+1); return strcpy(s, str); } static void * docalloc(Xml *xp, long n, long m) { void *v; v = getmem(xp, n * m); memset(v, 0, n * m); return v; } static void dofree(Xml *xp) { heapfree(xp); free(xp); } Memalloc _xheapalloc = { dostrdup, docalloc, dofree, dumpstats }; lost }, [Sopened] { 0, 0, Snamed, 0, 0, 0, 0 }, [Snamed] { Snamed, 0, Sattred,Sendblk,0, Slost, Slost }, [Sattred] { Sattred, 0, 0, 0, Sequed, 0, 0 }, [Sequed] { Sequed, 0, Snamed, 0, 0, 0, 0 }, [Sendblk] { 0, 0, Sclosed,0, 0, 0, 0 }, [Sclosed] { 0, 0, 0, Slost, 0, 0, 0 libxml/xml_look.c 664 0 0 1772 12412352261 12340ustar00stevesys#include #include #include #include "xml.h" /* * search for element, starting at ep. * if attr!=nil the elem must have an attribute attr * if value!=nil then the elem must have attr=value */ Elem * xml_look(Elem *ep, char *path, char *attr, char *value) { char *p; Elem *t; Attr *ap; if (path == nil) return nil; if (*path == '/') path++; if ((p = strchr(path, '/')) == nil) if((p = strchr(path, 0)) == nil) return nil; // shut up lint ! 
for(; ep; ep = ep->next) if (strncmp(ep->name, path, p-path) == 0){ if (*p == '/'){ if (ep->child) if ((t = xml_look(ep->child, p, attr, value)) != nil) return t; continue; } if (attr == nil) return ep; for (ap = ep->attrs; ap; ap = ap->next) if (strcmp(ap->name, attr) == 0){ if (value == nil) return ep; if (strcmp(ap->value, value) == 0) return ep; } if (ep->child) if ((t = xml_look(ep->child, p, attr, value)) != nil) return t; } return nil; } libxml/xml_malloc.c 640 0 0 5757 12412352246 12647ustar00stevesys#include #include #include "xml.h" static void freestr(char *s, uchar fill) { if(xml_debug) memset(s, fill, strlen(s)); free(s); } static void freemem(void *v, uchar fill, int len) { if(xml_debug) memset(v, fill, len); free(v); } static void xfree(Elem *ep) { Elem *te; Attr *ap, *ta; while(ep){ if(ep->name) freestr(ep->name, 0xff); if(ep->pcdata) freestr(ep->pcdata, 0xfe); ap = ep->attrs; while(ap){ if(ap->name) freestr(ap->name, 0xfd); if(ap->value) freestr(ap->value, 0xfc); ta = ap->next; if(ap) freemem(ap, 0xfb, sizeof(Attr)); ap = ta; } if(ep->child) xfree(ep->child); te = ep->next; freemem(ep, 0xfa, sizeof(Elem)); ep = te; } } static void freetree(Xml *xp) { xfree(xp->root); free(xp->doctype); } /********************/ #define Roundup(x, g) (((x) + (unsigned)(g-1)) & ~((unsigned)(g-1))) struct xml_tree { xml_tree *left; xml_tree *right; char *str; int hits; }; struct xml_block { xml_block *next; char *free; char *end; }; static int Strdups, Commons, Unique, Memblocks; static void * getmem(Xml *xp, int len) { int sz; xml_block *b; char *ret; len = Roundup(len, sizeof(long long)); sz = xp->alloc.blksiz; /* shorthand */ b = xp->alloc.active; if(len > sz) sysfatal("xml heap: object larget than blocksize (%d > %d)\n", len, sz); if(xp->alloc.active == nil || b->free + len >= b->end){ Memblocks++; b = mallocz(sizeof(xml_block) + sz, 0); b->free = (char *)&b[1]; b->end = (char *)&b->free[sz]; b->next = xp->alloc.active; xp->alloc.active = b; } ret = b->free; 
b->free += len; return ret; } static xml_tree * lookadd(Xml *xp, xml_tree *t, char *str, xml_tree **match) { int n; if(t == nil){ Unique++; t = getmem(xp, sizeof(xml_tree) + strlen(str)+1); t->left = nil; t->right = nil; t->str = (char *)&t[1]; strcpy(t->str, str); *match = t; t->hits = 1; return t; } if((n = strcmp(str, t->str)) == 0){ *match = t; t->hits++; } if(n < 0) t->left = lookadd(xp, t->left, str, match); if(n > 0) t->right = lookadd(xp, t->right, str, match); return t; } static void heapfree(Xml *xp) { xml_block *b, *n; for(b = xp->alloc.active; b; b = n){ n = b->next; if(xml_debug) memset(b, 0x7e, xp->alloc.blksiz); free(b); } } static void dumpstats(Xml *) { fprint(2, "total=%d common=%d -> unique=%d rare=%d memblocks=%d\n", Strdups, Commons, Unique, Strdups - Commons, Memblocks); } char * xml_strdup(Xml *xp, char *str, int iscommon) { char *s; xml_tree *t; if(xp->alloc.blksiz <= 0) return strdup(str); Strdups++; if(iscommon){ Commons++; xp->alloc.root = lookadd(xp, xp->alloc.root, str, &t); return t->str; } s = getmem(xp, strlen(str)+1); return strcpy(s, str); } void * xml_calloc(Xml *xp, long n, long m) { void *v; if(xp->alloc.blksiz <= 0) return calloc(n, m); v = getmem(xp, n * m); memset(v, 0, n * m); return v; } void xml_free(Xml *xp) { if(xp->alloc.blksiz <= 0) freetree(xp); else heapfree(xp); free(xp); } libxml/xml_new.c 644 0 0 302 12412352234 12127ustar00stevesys#include #include #include "xml.h" Xml * xml_new(int blksize) { Xml *xp; xp = mallocz(sizeof(Xml), 1); xp->alloc.blksiz = blksize; if(xp == nil) return nil; return xp; } libxml/xml_parse.c 640 0 0 24415 12500122523 12511ustar00stevesys#include #include #include #include #include "xml.h" #include "state-machine.h" int xml_debug = 0; enum { Grain = 16 }; #define isname1(c) (isalpha((c)) || c == '_') /* FIxml_ME: not enforced yet */ #define isnameN(r) (isalpharune((r)) || isdigitrune((r)) || r == L'_' || r == L'-' || r == L'.' 
|| r == L':') #define Roundup(x, g) (((x) + (unsigned)(g-1)) & ~((unsigned)(g-1))) enum { Ntext = 1024, /* longest name or atribute value possible */ Nref = 32 /* longest entity reference name */ }; typedef struct { int line; /* Line number (for errors) */ Biobuf *bp; /* input stream */ int flags; /* misc flags, see xml.h */ Xml *xml; int failed; } State; typedef struct { char *buf; int sz; } Lexbuf; static int trimwhite(char *s) { char *p; for(p = s; *p; p++) if(! isspace(*p)) return 0; /* trim any whitespace into a single space */ s[0] = ' '; s[1] = 0; return 1; } static void growstr(State *st, Lexbuf *lb, char *str) { int b, s, sz; if(str == nil || *str == 0) return; if((st->flags & Fcrushwhite) && trimwhite(str) && (lb->buf == nil || lb->buf[0] == 0)) return; b = 0; if(lb->buf) b = strlen(lb->buf); s = strlen(str); sz = Roundup(b+s+1, Grain); if(sz >= lb->sz){ lb->buf = realloc(lb->buf, sz); if(lb->buf == nil) sysfatal("No memory, wanted %d bytes\n", sz); lb->sz = sz; } strcpy(lb->buf+b, str); } static void growrune(State *st, Lexbuf *lb, Rune r) { int n; char str[UTFmax+1]; n = runetochar(str, &r); str[n] = 0; growstr(st, lb, str); } static void stripns(char *str) { char *p; if((p = strrchr(str, ':')) == nil) return; strcpy(str, p+1); } static void failed(State *st, char *fmt, ...) 
{ int n; va_list arg; char err[ERRMAX]; st->failed = 1; va_start(arg, fmt); n = snprint(err, sizeof(err), "%d ", st->line); vsnprint(err+n, sizeof(err)-n, fmt, arg); va_end(arg); werrstr("%s", err); } static void unget(State *st, int c) { if(c == '\n') st->line--; Bungetrune(st->bp); } static long get(State *st) { long r; r = Bgetrune(st->bp); if(r == Runeerror){ failed(st, "bad UTF-8 sequence"); r = L' '; } if(r == L'\n') st->line++; if(xml_debug){ if(xml_debug == 3) fprint(2, "%C", (Rune)r); if(xml_debug == 1 && r == -1) fprint(2, "EOF\n"); } return r; } static struct { char *name; Rune rune; } Entities[] = { { "amp", L'&' }, { "lt", L'<' }, { "gt", L'>' }, { "apos", L'\'' }, { "quot", L'"' }, { "nbsp", 0xa0 }, /* no-break space */ }; static long entityref(State *st) { int n, i, l; long r; Rune x; char *p, buf[Nref]; l = 0; p = buf; while((r = get(st)) != -1 && r != L';' && ! isspacerune(r)) if(l < (sizeof(buf) - UTFmax)){ x = r; n = runetochar(p, &x); p += n; l += n; } *p = 0; if(r == -1) return -1; /* false positive */ if(r != L';'){ fprint(2, "%d: unquoted '&' - ignored\n", st->line); for(i = --l; i >= 0; i--) unget(st, buf[i]); return L'&'; } if(buf[0] == '#'){ if(buf[1] == 'x' || buf[1] == 'X') return strtol(buf+2, 0, 16); return strtol(buf+1, 0, 10); } for(i = 0; i < nelem(Entities); i++) if(memcmp(Entities[i].name, buf, l) == 0) return Entities[i].rune; fprint(2, "%d: '&%s;' unknown/unsupported entity reference\n", st->line, buf); return L'?'; } static int match(State *st, Rune *s) { long r; Rune *p; r = -1; for(p = s; *p; p++) if((r = get(st)) != *p) break; if(r == -1) return -1; /* EOF */ if(*p == 0) return 0; /* match */ unget(st, r); for(p--; p >= s; p--) unget(st, *p); return 1; /* no match */ } static int comment(State *st) { long r; int startline; startline = st->line; do{ if(get(st) == -1) break; }while(match(st, L"--") == 1); r = get(st); if(r == -1){ failed(st, "EOF in comment (re: line %d)", startline); return -1; } if(r != L'>'){ failed(st, 
"'--' illegal in a comment (re: line %d)", startline); return Twhite; } return Twhite; } static int doctype(State *st, Lexbuf *lb) { long r; char *p; int startline, depth; startline = st->line; /* trim leading whitespace */ while((r = get(st)) != -1 && isspacerune(r)) continue; unget(st, r); if(lb->buf) lb->buf[0] = 0; depth = 1; while((r = get(st)) != -1){ switch(r){ case L'<': depth++; break; case L'>': depth--; break; } if(depth <= 0) break; growrune(st, lb, r); } if(r == -1){ failed(st, "EOF in DOCTYPE (re: line %d)", startline); return -1; } /* trim trailing whitespace */ p = strrchr(lb->buf, 0); for(p--; p >= lb->buf && isspace(*p); p--) *p = 0; st->xml->doctype = xml_strdup(st->xml, lb->buf, 0); return Twhite; } static int cdata(State *st, Lexbuf *lb) { long r; int startline; startline = st->line; do{ if((r = get(st)) == -1) break; if(r == L'&') if((r = entityref(st)) == -1) break; growrune(st, lb, r); }while(match(st, L"]]>") == 1); if(r == -1){ failed(st, "EOF in CDATA (re: line %d)", startline); return -1; } return Tname; } /* * byte order mark. 
* This is pointless for utf8 which has defined byte order, * and we don't support utf16 or utf32 but some xml seems to * prepend them to utf8 so we need to find them and skip them */ static int bom(State *st) { long r; if((r = get(st)) == -1) return -1; if(r != 0xbb){ unget(st, 0xef); unget(st, r); return -1; } if((r = get(st)) == -1){ unget(st, 0xef); unget(st, 0xbb); return -1; } if(r != 0xbf){ unget(st, 0xef); unget(st, 0xbb); unget(st, r); } return 0; } static int xlex(State *st, Lexbuf *lb, int s) { long r; Rune q; while((r = get(st)) != -1){ if(r == 0xef) if(bom(st) == 0) continue; if(r == L'<'){ r = get(st); switch(r){ case L'?': while((r = get(st)) != -1 && r != L'>') continue; if(r == -1) return -1; return Twhite; case L'!': if(match(st, L"--") == 0) return comment(st); if(match(st, L"DOCTYPE ") == 0) return doctype(st, lb); if(match(st, L"[CDATA[") == 0) return cdata(st, lb); failed(st, "': return Tclose; case '/': r = get(st); if(r == '>') return Tnulblk; unget(st, r); continue; case '\'': case '"': /* attribute value */ q = r; while((r = get(st)) != -1 && r != q){ if(r == L'&') if((r = entityref(st)) == -1) break; growrune(st, lb, r); } if(r == -1) return -1; return Tname; case '\n': case '\r': case ' ': case '\v': case '\f': case '\t': do growrune(st, lb, r); while((r = get(st)) != -1 && isspacerune(r)); if(r == -1) return -1; unget(st, r); return Twhite; default: /* attribute name */ do growrune(st, lb, r); while((r = get(st)) != -1 && isnameN(r)); if(r == -1) return -1; unget(st, r); return Tname; } } do{ if(r == L'&') if((r = entityref(st)) == -1) break; growrune(st, lb, r); }while((r = get(st)) != -1 && r != '<'); if(r == -1) return -1; unget(st, r); return Tname; } return -1; } static Elem * _xml_parse(State *st, Elem *parent, int depth) { Attr *ap; Lexbuf lexbuf, *lb; Lexbuf pcdata, *pc; Elem *root, *ep; int os, s, t, a; ap = nil; ep = nil; s = Slost; root = nil; lb = &lexbuf; memset(lb, 0, sizeof(Lexbuf)); pc = &pcdata; memset(pc, 0, 
sizeof(Lexbuf)); while((t = xlex(st, lb, s)) != -1){ os = s; s = statab[os][t]; a = acttab[os][t]; if(xml_debug == 2) fprint(2, "depth=%d token=%s action=%s state=%s->%s str='%s'\n", depth, tokstr[t], actstr[a], stastr[os], stastr[s], lb->buf); switch(a){ case Aelem: if(xml_debug == 1) fprint(2, "%-3d %*.selem name='%s'\n", st->line, depth, "", lb->buf); if(!isname1(lb->buf[0])) failed(st, "'%s' is an illegal element name", lb->buf); if(st->flags & Fstripnamespace) stripns(lb->buf); ep = xml_elem(st->xml, &root, parent, lb->buf); assert(ep != nil); ep->line = st->line; break; case Apcdata: if(parent) growstr(st, pc, lb->buf); break; case Aattr: assert(ep != nil); if(xml_debug == 1) fprint(2, "%-3d %*.sattr name='%s'\n", st->line, depth, "", lb->buf); if(!isname1(lb->buf[0])) failed(st, "'%s' is an illegal attribute name", lb->buf); if(st->flags & Fstripnamespace) stripns(lb->buf); ap = xml_attr(st->xml, &(ep->attrs), ep, lb->buf, nil); assert(ap != nil); break; case Avalue: assert(ep != nil); assert(ap != nil); ap->value = xml_strdup(st->xml, lb->buf, 0); ap = nil; if(xml_debug == 1) fprint(2, "%*.sattr value=%s\n", depth, "", lb->buf); break; case Adown: assert(ep != nil); if(xml_debug == 1) fprint(2, "%*.sdown name=%s\n", depth, "", ep->name); ep->child = _xml_parse(st, ep, depth+1); if(xml_debug == 1 && ep->pcdata) fprint(2, "%*.s name=%s pcdata len=%ld\n", depth, "", ep->name, (ep->pcdata)? 
strlen(ep->pcdata): 0L); break; case Aup: if(pc->buf){ parent->pcdata = xml_strdup(st->xml, pc->buf, 0); free(pc->buf); } free(lb->buf); return root; /* NOTREACHED */ break; case Acheck: assert(ep != nil); if(st->flags & Fstripnamespace) stripns(lb->buf); if(ep->name && strcmp(lb->buf, ep->name) != 0) failed(st, " found, expecting match for <%s> (re: line %d) - nesting error", lb->buf, ep->name, ep->line); break; case Anop: break; case Aerr: failed(st, "%s syntax error", lb->buf); break; default: sysfatal("xml_parse: %d - internal error, unknown action\n", a); break; } if(lb->buf) lb->buf[0] = 0; } if(t == -1 && depth != 0) failed(st, "unexpected EOF (depth=%d)", depth); if(pc->buf){ parent->pcdata = xml_strdup(st->xml, pc->buf, 0); free(pc->buf); } free(lb->buf); return root; } Xml * xml_parse(int fd, int blksize, int flags) { State s; Biobuf bio; Xml *x; memset(&s, 0, sizeof(s)); s.line = 1; Binit(&bio, fd, OREAD); s.bp = &bio; s.flags = flags; x = xml_new(blksize); s.xml = x; x->root = _xml_parse(&s, nil, 0); if(s.failed){ if(x) xml_free(x); x = nil; } Bterm(&bio); return x; } libxml/xml_print.c 640 0 0 3054 12412352177 12523ustar00stevesys#include #include #include #include #include "xml.h" static void prval(Biobuf *bp, char *s) { char *p; Rune r; p = s; while(*p){ p += chartorune(&r, p); switch(r){ case L'&': Bprint(bp, "&"); break; case L'<': Bprint(bp, "<"); break; case L'>': Bprint(bp, ">"); break; case L'"': Bprint(bp, """); break; case L'\'': Bprint(bp, "'"); break; default: if(r >= L' ') Bprint(bp, "%C", r); else Bprint(bp, "&#x%04x;", r); break; } } } static void _xml_print(Biobuf *bp, Elem *ep, int in) { Attr *ap; enum {indent = 4}; for(; ep; ep = ep->next){ Bprint(bp, "%*s<%s", in, "", ep->name); for (ap = ep->attrs; ap; ap = ap->next){ Bprint(bp, " %s=\'", ap->name); prval(bp, ap->value); Bprint(bp, "\'"); } if(ep->child){ if(ep->pcdata){ Bprint(bp, ">\n%*s\n", in+indent, ""); prval(bp, ep->pcdata); } else Bprint(bp, ">\n"); _xml_print(bp, ep->child, 
in+indent); Bprint(bp, "%*s\n", in, "", ep->name); } else{ if(ep->pcdata){ Bprint(bp, ">\n%*s", in+indent, ""); prval(bp, ep->pcdata); Bprint(bp, "\n%*s\n", in, "", ep->name); } else Bprint(bp, "/>\n"); } } } void xml_print(Xml *xp, int fd) { Biobuf bout; Binit(&bout, fd, OWRITE); if(xp->doctype){ Bprint(&bout, "\n"); Bprint(&bout, "\n", xp->doctype); } else Bprint(&bout, "\n"); _xml_print(&bout, xp->root, 0); Bterm(&bout); } ref]; l = 0; p = buf; while((r = get(st)) != -1 && r != L';' && ! isspacerune(r)) if(l < (sizeof(buf) - UTFmax)){ x = r; n = runetochar(p, &x); p += n; l += n; } *p = 0; if(r == -1) return -1; /* false positive */ if(r != L';'){ fprint(2, "%d: unquoted '&' - ignored\n", st->line); for(i = --l; i >= 0; i--) unget(st, buf[i]); return L'&'; } if(buf[0] == '#'){ if(buf[1] == 'x' || buf[1] == 'X') return strtol(buf+2, 0, 16);libxml/xml_value.c 664 0 0 475 12412613423 12467ustar00stevesys#include #include #include "xml.h" char * xml_value(Elem *ep, char *name) { Attr *ap; /* * This enables the common idiom: xml_value(xml_look(), name); */ if (ep == nil) return nil; for(ap = ep->attrs; ap; ap = ap->next) if(strcmp(ap->name, name) == 0) return ap->value; return nil; }