To: 9fans@cse.psu.edu Reply-To: erik quanstrom From: erik quanstrom Mime-Version: 1.0 Content-Type: multipart/mixed; boundary=x Subject: minimal changes to make byron's rc utf-8 compatible. --x Content-Type: text/plain; charset=utf-8 9fans, i realize that this is not the ideal forum but it seems that rc-fans@hawkwind.utcs.toronto.ca is defunct and tim goodwin hasn't (yet) answered my email... this is all you need to make byron's rc utf-8 compatible. you should be able to do this ; α = 1; echo $α and this ; ~ α [αβ] && ~ α . && ~ β [α-γ] && echo works with this patch. i have not tested this with 4-byte utf-8 sequences, due to the fact that p9p doesn't support 32bit unicode. but, barring a typo they should work. erik --x Content-Type: text/plain; charset=utf8; filename="match.c" /* match.c: pattern matching routines */ #include "rc.h" static int rangematch(const char*, const char*); enum { RANGE_FAIL = -1, RANGE_ERROR = -2 }; /* match() matches a single pattern against a single string. */ /* utf-8 support copyright © 2005 erik quanstrom with the same licencing terms as the rest of rc. since rc doesn't really do utf-8, we are going to pretend, relying on the properties of utf-8. we know that we can 1. get away with byte-wise comparisons as long as we are not insisting that the next byte is the next character. ranges and the ? match operator need to be utf-8-aware. 2. we can compare 2 utf-8 characters without converting to unicode (PITA) by comparing length (longer is greater) and then bytewise. all we require is utf8len. 
*/ static int utf8len(const char* ss){ const unsigned char* s = (unsigned char*)ss; int c; c=*s; if (c<0x80){ return 1; } if (0x80 == (c&0xc0) || 0xc0 == (c&0xe0)){ return 2; } if ((c & 0xf0) == 0xe0){ return 3; } if ((c & 0xf8) == 0xf0){ return 4; } return 1; /* bad */ } static int utf8cmp(const char* s1, int l1, const char* s2, int l2){ int l; int t1; int t2; int i; l = l2-l1; if (l){ return l; } for(i=0; i=utf8cmp(p, l, c, cl)){ matched = 1; } } else if (cl == l) { for(i=0; i != l; i++){ if (p[i] != c[i]){ break; } } matched |= i==l; } } if (matched ^ neg) return p - orig + 1; /* skip the right-bracket */ return RANGE_FAIL; } --x Content-Type: text/plain; charset=utf-8; filename="lex.c /* lex.c: rc's lexical analyzer */ #include "rc.h" #include "parse.h" /* Special characters (i.e., "non-word") in rc: \t \n # ; & | ^ $ = ~ ` ' { } @ ! ( ) < > \ The lexical analyzer is fairly straightforward. The only really unclean part concerns backslash continuation and "double backslashes". A backslash followed by a newline is treated as a space, otherwise backslash is not a special character (i.e., it can be part of a word). This introduces a host of unwanted special cases. In our case, \ cannot be a word character, since we wish to read in all word characters in a tight loop. Note: to save the trouble of declaring these arrays with TRUEs and FALSEs, I am assuming that FALSE = 0, TRUE = 1. (and so is it declared in rc.h) */ #define BUFSIZE ((size_t) 1000) /* malloc hates power of 2 buffers? */ #define BUFMAX (8 * BUFSIZE) /* How big the buffer can get before we re-allocate the space at BUFSIZE again. Premature optimization? Maybe. 
*/

/* what kind of token yylex() produced last; kept in the file-scope variable w */
typedef enum wordstates {
	NW, RW, KW /* "nonword", "realword", "keyword" */
} wordstates;

static void getpair(int);

int lineno;

/*
   nw and dnw are indexed by byte value (yylex() and quotep() index them with an
   unsigned char). A nonzero entry marks a byte that cannot be part of a word, so
   it ends the word being scanned. nw is the normal table; dnw is the one used
   right after a '$', while a variable name is being scanned. Every entry from
   0x80 up is 0 in both tables, so the bytes of a multi-byte utf-8 character are
   always word bytes.
*/

/* does not check for valid utf-8; alternative is changing gchar() to return a Rune */
const char nw[] = {
	/* 0x00: NUL, \t and \n */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
	/* 0x10 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20: space ! # $ & ' ( ) */
	1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
	/* 0x30: ; < = > */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	/* 0x40: @ */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50: backslash and ^ */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
	/* 0x60: ` */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70: { | } ~ */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	/* 0x80-0xff: never special */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

const char dnw[] = {
	/* 0x00-0x1f: every control character ends a variable name */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20: only the star is allowed */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
	/* 0x30: digits are allowed */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
	/* 0x40: upper-case letters are allowed */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50: ... and the underscore */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	/* 0x60: lower-case letters are allowed */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
	/* 0x80-0xff: allowed, so that variable names may contain utf-8 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

static size_t bufsize = BUFSIZE;	/* current size of realbuf; doubled by yylex() as needed */
static char *realbuf = NULL;		/* text of the word being scanned; allocated by inityy() */
static bool newline = FALSE;		/* a newline token was returned; prompt before reading on */
static bool errset = FALSE;		/* set by scanerror(); makes the next yylex() return '\n' */
static bool prerror = FALSE;		/* set by scanerror(); makes yyerror() stay quiet once */
static wordstates w = NW;		/* kind of the previous token */
/* descriptors found by getpair() */
static int fd_left,
fd_right;

/*
   checkfreecaret: used inside yylex() just before a token that may directly follow
   a word. If the previous token was a word or keyword (w != NW), reset w, hand the
   current character back with ugchar() and make yylex() return '^' instead --
   presumably so that the character is scanned again on the next call.
*/
#define checkfreecaret {if (w != NW) { w = NW; ugchar(c); return '^'; }}

/* sentinel values stored in fd_left and fd_right by getpair() */
enum filedescriptors {
	UNSET = -9, CLOSED = -1
};

/*
   does this string require quoting?
   Returns TRUE as soon as a byte of s is marked in the nonword table (dnw if
   dollar is set, nw otherwise), FALSE if the end of s is reached first.
*/
extern bool quotep(char *s, bool dollar) {
	unsigned char c;
	const char *meta;
	meta = dollar ? dnw : nw;
	while ((c = *s++))
		if (meta[c])
			return TRUE;
	return FALSE;
}

/*
   yylex: return the next token for the parser.

   Words, keywords and quoted strings are collected in realbuf, which is doubled
   with erealloc() whenever it fills up. For a WORD, yylval.word.w is a copy of the
   text made with ncpy(), yylval.word.m is NULL or a per-byte map marking each '?',
   '[' and '*' of an unquoted word, and yylval.word.q tells whether the word came
   from a quoted string. Pipes and redirections fill in yylval.pipe, yylval.redir or
   yylval.dup from the descriptors parsed by getpair().

   The file-scope variable w records what kind of token came last (NW, RW or KW);
   checkfreecaret and the '(' case depend on it. The static flag dollar is set after
   a '$' so that the following call scans with the dnw table instead of nw.

   The labels read and bs sit inside the word-scanning code and are also entered
   from the backslash case of the switch further down, so the order of the
   statements here matters.
*/
extern int yylex() {
	static bool dollar = FALSE;
	bool saw_meta = FALSE;
	int c;
	size_t i;			/* The purpose of all these local assignments is to	*/
	const char *meta;		/* allow optimizing compilers like gcc to load these	*/
	char *buf = realbuf;		/* values into registers. On a sparc this is a		*/
	YYSTYPE *y = &yylval;		/* win, in code size *and* execution time		*/
	/* a scanerror() on the previous call: end the broken line with a newline token */
	if (errset) {
		errset = FALSE;
		return '\n';
	}
	/* rc variable-names may contain only alnum, '*' and '_', so use dnw if we are scanning one. */
	meta = (dollar ? dnw : nw);
	dollar = FALSE;
	if (newline) {
		--lineno; /* slight space optimization; print_prompt2() always increments lineno */
		print_prompt2();
		newline = FALSE;
	}
	/* skip blanks; white space also ends any run of adjacent words */
top:	while ((c = gchar()) == ' ' || c == '\t')
		w = NW;
	if (c == EOF)
		return END;
	if (!meta[(unsigned char) c]) {	/* it's a word or keyword. */
		checkfreecaret;
		w = RW;
		i = 0;
	read:	do {
			buf[i++] = c;
			if (c == '?' || c == '[' || c == '*')
				saw_meta = TRUE;
			/* grow before the next store so that buf[i] is always valid */
			if (i >= bufsize)
				buf = realbuf = erealloc(buf, bufsize *= 2);
		} while ((c = gchar()) != EOF && !meta[(unsigned char) c]);
		while (c == '\\') {
			if ((c = gchar()) == '\n') {
				print_prompt2();
				c = ' '; /* Pretend a space was read */
				break;
			} else {
	bs:			if (meta != dnw) { /* all words but varnames may have a bslash */
					buf[i++] = '\\';
					if (i >= bufsize)
						buf = realbuf = erealloc(buf, bufsize *= 2);
					if (!meta[(unsigned char) c])
						goto read;
				} else {
					ugchar(c);
					c = '\\';
					break;
				}
			}
		}
		ugchar(c);
		buf[i] = '\0';
		/* assume a keyword until all the comparisons below have failed */
		w = KW;
		if (i == 2) {
			if (*buf == 'i' && buf[1] == 'f') return IF;
			if (*buf == 'f' && buf[1] == 'n') return FN;
			if (*buf == 'i' && buf[1] == 'n') return IN;
		}
		if (streq(buf, "for")) return FOR;
		if (streq(buf, "else")) return ELSE;
		if (streq(buf, "switch")) return SWITCH;
		if (streq(buf, "while")) return WHILE;
		if (streq(buf, "case")) return CASE;
		w = RW;
		y->word.w = ncpy(buf);
		if (saw_meta) {
			char *r, *s;
			/* one flag per byte of the word: 1 where the byte is '?', '[' or '*' */
			y->word.m = nalloc(strlen(buf) + 1);
			for (r = buf, s = y->word.m; *r != '\0'; r++, s++)
				*s = (*r == '?' || *r == '[' || *r == '*');
		} else {
			y->word.m = NULL;
		}
		y->word.q = FALSE;
		return WORD;
	}
	/* these may follow a word directly, in which case a '^' token comes first */
	if (c == '`' || c == '!' || c == '@' || c == '~' || c == '$' || c == '\'') {
		checkfreecaret;
		if (c == '!' || c == '@' || c == '~')
			w = KW;
	}
	switch (c) {
	case '!':
		return BANG;
	case '@':
		return SUBSHELL;
	case '~':
		return TWIDDLE;
	case '`':
		c = gchar();
		if (c == '`')
			return BACKBACK;
		ugchar(c);
		return '`';
	case '$':
		dollar = TRUE;
		c = gchar();
		if (c == '#')
			return COUNT;
		if (c == '^')
			return FLAT;
		ugchar(c);
		return '$';
	case '\'':
		w = RW;
		i = 0;
		/* double ' to quote it, like this: 'how''s it going?' */
		while ((c = gchar()) != '\'' || (c = gchar()) == '\'') {
			buf[i++] = c;
			if (c == '\n')
				print_prompt2();
			if (c == EOF) {
				w = NW;
				scanerror("eof in quoted string");
				return HUH;
			}
			if (i >= bufsize)
				buf = realbuf = erealloc(buf, bufsize *= 2);
		}
		/* c is the character after the closing quote; give it back */
		ugchar(c);
		buf[i] = '\0';
		y->word.w = ncpy(buf);
		y->word.m = NULL;
		y->word.q = TRUE;
		return WORD;
	case '\\':
		if ((c = gchar()) == '\n') {
			print_prompt2();
			goto top; /* Pretend it was just another space. */
		}
		/* a word that starts with a backslash: join the word code at bs */
		ugchar(c);
		c = '\\';
		checkfreecaret;
		c = gchar();
		i = 0;
		goto bs;
	case '(':
		if (w == RW) /* SUB's happen only after real words, not keywords, so if () and while () work */
			c = SUB;
		w = NW;
		return c;
	case '#':
		while ((c = gchar()) != '\n') /* skip comment until newline */
			if (c == EOF)
				return END;
		/* FALLTHROUGH */
	case '\n':
		lineno++;
		newline = TRUE;
		/* FALLTHROUGH */
	case ';':
	case '^':
	case ')':
	case '=':
	case '{': case '}':
		w = NW;
		return c;
	case '&':
		w = NW;
		c = gchar();
		if (c == '&')
			return ANDAND;
		ugchar(c);
		return '&';
	case '|':
		w = NW;
		c = gchar();
		if (c == '|')
			return OROR;
		getpair(c);
		if (errset)
			return HUH;
		if ((y->pipe.left = fd_left) == UNSET)
			y->pipe.left = 1;				/* default to fd 1 */
		if ((y->pipe.right = fd_right) == UNSET)
			y->pipe.right = 0;				/* default to fd 0 */
		if (y->pipe.right == CLOSED) {
			scanerror("expected digit after '='");		/* can't close a pipe */
			return HUH;
		}
		return PIPE;
	case '>':
		c = gchar();
		if (c == '>') {
			c = gchar();
			y->redir.type = rAppend;
		} else
			y->redir.type = rCreate;
		y->redir.fd = 1;
		goto common;
	case '<':
		c = gchar();
		if (c == '<') {
			c = gchar();
			if (c == '<') {
				c = gchar();
				y->redir.type = rHerestring;
			} else {
				y->redir.type = rHeredoc;
			}
		} else
			y->redir.type = rFrom;
		y->redir.fd = 0;
	common:
		/* shared tail of '>' and '<': c is the character after the operator */
		w = NW;
		getpair(c);
		if (errset)
			return HUH;
		if (fd_right == UNSET) { /* redirection, not dup */
			if (fd_left != UNSET) {
				y->redir.fd = fd_left;
				return SREDIR;
			}
			return (y->redir.type == rFrom || y->redir.type == rCreate) ? REDIR : SREDIR;
		} else { /* dup; recast yylval */
			y->dup.type = y->redir.type;
			y->dup.left = fd_left;
			y->dup.right = fd_right;
			return DUP;
		}
	default:
		w = NW;
		return c; /* don't know what it is, let yacc barf on it */
	}
}

/*
   yyerror: print the parse error message s with fprint(2, ...).
   If scanerror() has already reported the problem (prerror is set), only clear the
   flag. When not interactive the message also carries the line number and the
   offending token: the word buffer if a word or keyword was scanned last, otherwise
   a description of lastchar.
*/
extern void yyerror(const char *s) {
	char *tok;
	if (prerror) { /* don't print "syntax error" if there's a more informative scanerror */
		prerror = FALSE;
		return;
	}
	if (!interactive) {
		if (w != NW)
			tok = realbuf;
		else if (lastchar == EOF)
			tok = "eof";
		else if (lastchar == '\n')
			tok = "end of line";
		else
			tok = nprint((lastchar < 32 || lastchar > 126) ? "(decimal %d)" : "'%c'", lastchar);
		/* a newline has already bumped lineno, so report the line it ended */
		fprint(2, "line %d: %s near %s\n", lineno - (lastchar == '\n'), s, tok);
	} else
		fprint(2, "%s\n", s);
}

/*
   scanerror: report the lexical error s through yyerror(), then set errset (the
   next yylex() call returns '\n') and prerror (the next yyerror() call prints
   nothing).
*/
extern void scanerror(char *s) {
	flushu(); /* flush up to newline */
	yyerror(s);
	errset = prerror = TRUE;
}

/*
   inityy: reset the lexer state (newline, w, hq) and make sure realbuf is
   allocated.
*/
extern void inityy() {
	newline = FALSE;
	w = NW;
	hq = NULL;
	/* return memory to the system if the buffer got too large */
	if (bufsize > BUFMAX && realbuf != NULL) {
		efree(realbuf);
		bufsize = BUFSIZE;
		realbuf = ealloc(bufsize);
	} else if (realbuf == NULL)
		realbuf = ealloc(bufsize);
}

/*
   Scan in a pair of integers for redirections like >[2=1]. CLOSED represents a closed file
   descriptor (i.e., >[2=]) and UNSET represents an undesignated file descriptor (e.g.,
   >[2] is represented as (2,UNSET).

   This function makes use of unsigned compares to make range tests in one compare operation.
*/

static void getpair(int c) {
	int fd;		/* descriptor number being accumulated */
	int d;		/* last character read, less '0' */

	fd_left = fd_right = UNSET;

	/* no bracket: the character belongs to whatever comes next */
	if (c != '[') {
		ugchar(c);
		return;
	}

	/*
	   Characters are kept biased by '0' throughout. Cast to unsigned, anything
	   that is not a digit (EOF included) comes out greater than 9.
	*/
	fd = gchar() - '0';
	if ((unsigned int) fd > 9) {
		scanerror("expected digit after '['");
		return;
	}
	/* NOTE(review): fd * 10 + d is unchecked; a long enough digit string overflows int */
	for (;;) {
		d = gchar() - '0';
		if ((unsigned int) d > 9)
			break;
		fd = fd * 10 + d;
	}
	fd_left = fd;

	/* d is now the character that ended the left-hand number */
	if (d == ']' - '0')
		return;
	if (d != '=' - '0') {
		scanerror("expected '=' or ']' after digit");
		return;
	}

	fd = gchar() - '0';
	if ((unsigned int) fd > 9) {
		/* "[n=]" closes the descriptor; nothing else may follow the '=' here */
		if (fd == ']' - '0')
			fd_right = CLOSED;
		else
			scanerror("expected digit or ']' after '='");
		return;
	}
	for (;;) {
		d = gchar() - '0';
		if ((unsigned int) d > 9)
			break;
		fd = fd * 10 + d;
	}
	if (d != ']' - '0') {
		scanerror("expected ']' after digit");
		return;
	}
	fd_right = fd;
}
--x--