To: 9fans@cse.psu.edu
Reply-To: erik quanstrom <quanstro@speakeasy.net>
From: erik quanstrom <quanstro@quanstro.net>
Mime-Version: 1.0
Content-Type: multipart/mixed; 
	boundary=x
Subject: minimal changes to make byron's rc utf-8 compatible.

--x
Content-Type: text/plain; 
	charset=utf-8

9fans,

i realize that this is not the ideal forum but it seems that 
rc-fans@hawkwind.utcs.toronto.ca is defunct and tim goodwin
hasn't (yet) answered my email...

this is all you need to make byron's rc utf-8 compatible.

you should be able to do this

	; α = 1; echo $α

and this
	; ~ α [αβ] &&
		~ α . &&
		~ β [α-γ] && echo works

with this patch.

i have not tested this with 4-byte utf-8 sequences, due to the fact
that p9p doesn't support 32bit unicode. but, barring a typo they should work.

erik

--x
Content-Type: text/plain;
	charset=utf8;
	filename="match.c"

/* match.c: pattern matching routines */

#include "rc.h"

/* Forward declaration: tests one utf-8 character of the subject string
   (second argument) against the character class whose body starts at the
   first argument. Defined at the bottom of this file. */
static int rangematch(const char*, const char*);

/* Negative status codes returned by rangematch(); any other return value is
   the number of pattern bytes consumed, including the closing ']'.
   RANGE_FAIL:  the class is well formed but the character is not in it.
   RANGE_ERROR: the class is unterminated, or a utf-8 sequence in the class
                or the subject is cut short by a NUL byte. */
enum { RANGE_FAIL = -1, RANGE_ERROR = -2 };

/* match() matches a single pattern against a single string. */

/* utf-8 support copyright © 2005 erik quanstrom with the same
    licensing terms as the rest of rc.

    since rc doesn't really do utf-8, we are going to pretend,
    relying on the properties of utf-8 we know that we can

    1.  get away with byte-wise comparisons as long as we are not
        insisting that the next byte is the next character.
        ranges and the ? match operator need to be utf-8-aware.

    2.  we can compare 2 utf-8 characters without converting to
        unicode (PITA) by comparing length (longer is greater) and
        then bytewise. all we require is utf8len.

*/

/*
   utf8len() returns the number of bytes the utf-8 sequence starting at ss
   claims to occupy, judged from its first byte only:

	0xxxxxxx  -> 1
	110xxxxx  -> 2
	1110xxxx  -> 3
	11110xxx  -> 4

   Any other first byte is invalid as the start of a sequence and is
   treated as a single byte so that callers always make progress. This
   includes a stray continuation byte (10xxxxxx): it can never begin a
   character, so it must not swallow the byte that follows it.

   The trailing bytes are not inspected; callers that need to know the
   sequence is not cut short by a NUL must check that themselves.
*/
static int utf8len(const char* ss){
	const unsigned char c = *(const unsigned char*)ss;

	if (c < 0x80)
		return 1;
	if ((c & 0xe0) == 0xc0)
		return 2;
	if ((c & 0xf0) == 0xe0)
		return 3;
	if ((c & 0xf8) == 0xf0)
		return 4;
	return 1; /* bad: stray continuation byte or invalid lead byte */
}

/*
   utf8cmp() orders two utf-8 characters without decoding them.
   s1/l1 and s2/l2 are the characters and their byte lengths.

   The result is positive when the second character sorts after the
   first, negative when it sorts before, and 0 when they are identical:
   a longer encoding always wins, and equal-length encodings are compared
   byte by byte as unsigned values. The magnitude is the raw difference
   (of the lengths, or of the first differing bytes).
*/
static int utf8cmp(const char* s1, int l1, const char* s2, int l2){
	const unsigned char *a = (const unsigned char*)s1;
	const unsigned char *b = (const unsigned char*)s2;
	int diff = l2 - l1;

	if (diff != 0)
		return diff;

	while (l1-- > 0) {
		diff = (int)*b++ - (int)*a++;
		if (diff != 0)
			return diff;
	}
	return 0;
}

/*
   nextc() steps s past one utf-8 character, using l as scratch for the
   byte count reported by utf8len(). Note that it expands to TWO statements
   and contains a hidden `return FALSE' (taken when a NUL byte is reached
   before the whole character has been consumed). It is therefore only safe
   as a stand-alone statement inside a braced body of match(), where `s' and
   `l' are in scope.
*/
#define nextc()  l = utf8len(s); do { if (!*s++) return FALSE; } while (--l)

/*
   match() reports whether the string s matches the pattern p.

   p  the pattern bytes.
   m  a per-byte mask parallel to p: a nonzero m[i] means p[i] is an active
      metacharacter ('?', '*' or '['), zero means p[i] is a literal byte.
      If m is NULL the pattern has no metacharacters and the result is
      simply streq(p, s).
   s  the subject string.

   Returns TRUE on a match, FALSE otherwise. Literal bytes are compared
   byte-wise; only '?', '*' and '[...]' step through s a whole utf-8
   character at a time.
*/
extern bool match(char *p, char *m, char *s) {
	int i, j;
	int l;
	if (m == NULL)
		return streq(p, s);
	i = 0;
	while (1) {
		if (p[i] == '\0')
			/* pattern exhausted: succeed only if the subject is too */
			return *s == '\0';
		else if (m[i]) {
			switch (p[i++]) {
			case '?':
				/* any one character; nextc() returns FALSE at end of s */
				nextc();
				break;
			case '*':
				while (p[i] == '*' && m[i] == 1)	/* collapse multiple stars */
					i++;
				if (p[i] == '\0') 	/* star at end of pattern? */
					return TRUE;
				/* try the rest of the pattern at every character boundary of s */
				while (*s != '\0') {
					if (match(p + i, m + i, s)) {
						return TRUE;
					}
					nextc();
				}
				return FALSE;
			case '[':
				if (*s == '\0')
					return FALSE;
				switch (j = rangematch(p + i, s)) {
				default:
					/* matched: j is the length of the class body plus ']' */
					i += j;
					break;
				case RANGE_FAIL:
					return FALSE;
				case RANGE_ERROR:
					/* malformed class: treat this '[' as a literal '['.
					   i is not advanced past the class here. */
					if (*s != '[')
						return FALSE;
				}
				/* consume the subject character that was just matched */
				s += utf8len(s);
				break;
			default:
				panic("bad metacharacter in match");
				/* NOTREACHED */
				return FALSE; /* hush up gcc -Wall */
			}
		} else if (p[i++] != *s++)
			/* literal byte mismatch; p[i] is known to be non-NUL here,
			   so running off the end of s also lands in this branch */
			return FALSE;
	}
}

/*
   From the ed(1) man pages (on ranges):

	The `-' is treated as an ordinary character if it occurs first
	(or first after an initial ^) or last in the string.

	The right square bracket does not terminate the enclosed string
	if it is the first character (after an initial `^', if any), in
	the bracketed string.

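   rangematch() matches a single (possibly multi-byte utf-8) character
   against a class, and returns an integer offset to the end of the range
   on success, RANGE_FAIL (-1) if the character is not in the class, or
   RANGE_ERROR (-2) if the class is unterminated or a utf-8 sequence is
   cut short by a NUL byte. Note that rc negates a class with a leading
   `~' rather than ed's `^'.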
*/

/* rc never validates its input as utf-8, so be paranoid: bail out of the
   enclosing function with RANGE_ERROR if any of the l bytes at s is NUL.
   Uses the caller's local `k' as the loop counter. */
#define check(s,l)  for(k=0; k<l; k++) { if (!s[k]) return RANGE_ERROR; }

/*
   rangematch() tests the utf-8 character at c against the character class
   whose body starts at p (just after the opening '[').

   Returns the number of pattern bytes consumed, counting the closing ']',
   when the character is accepted; RANGE_FAIL when it is rejected; and
   RANGE_ERROR when the class has no closing ']' or a utf-8 sequence in
   either argument is truncated by a NUL.
*/
static int rangematch(const char* p, const char* c){
	const char *start = p;
	bool neg;
	bool matched = FALSE;
	int plen, clen;
	int lo;
	int n;
	int k;

	/* a leading '~' inverts the sense of the class */
	neg = (*p == '~');
	if (neg)
		p++;

	/* a ']' in first position is an ordinary member, not the terminator */
	if (*p == ']') {
		matched = (*c == ']');
		p++;
	}

	clen = utf8len(c);
	check(c, clen);

	while (*p != ']') {
		if (*p == '\0')
			return RANGE_ERROR;

		plen = utf8len(p);
		check(p, plen);

		if (p[plen] == '-' && p[plen+1] != ']') {
			/* lo-hi range; a '-' directly before ']' is an ordinary member */
			lo = utf8cmp(p, plen, c, clen);
			p += plen + 1;
			plen = utf8len(p);
			check(p, plen);
			if (lo >= 0 && utf8cmp(p, plen, c, clen) <= 0)
				matched = 1;
		} else if (plen == clen) {
			/* single member: equal only if every byte agrees */
			n = 0;
			while (n != plen && p[n] == c[n])
				n++;
			matched |= n==plen;
		}

		p += plen;
	}

	if (matched ^ neg)
		return p - start + 1; /* skip the right-bracket */
	return RANGE_FAIL;
}

--x
Content-Type: text/plain;
	charset=utf-8;
	filename="lex.c"

/* lex.c: rc's lexical analyzer */

#include "rc.h"
#include "parse.h"

/*
	Special characters (i.e., "non-word") in rc:
		\t \n # ; & | ^ $ = ~ ` ' { } @ ! ( ) < > \

	The lexical analyzer is fairly straightforward. The only really
	unclean part concerns backslash continuation and "double
	backslashes". A backslash followed by a newline is treated as a
	space, otherwise backslash is not a special character (i.e.,
	it can be part of a word).  This introduces a host of unwanted
	special cases. In our case, \ cannot be a word character, since
	we wish to read in all word characters in a tight loop.

	Note: to save the trouble of declaring these arrays with TRUEs
	and FALSEs, I am assuming that FALSE = 0, TRUE = 1. (and so is
	it declared in rc.h)
*/

#define BUFSIZE ((size_t) 1000)	/*	malloc hates power of 2 buffers? */
#define BUFMAX (8 * BUFSIZE)	/* 	How big the buffer can get before we re-allocate the
					space at BUFSIZE again. Premature optimization? Maybe.
				*/

typedef enum wordstates {
	NW, RW, KW /* "nonword", "realword", "keyword" */
} wordstates;

static void getpair(int);

int lineno;

/*
   nw[]: "non-word" table, indexed by byte value (cast to unsigned char).
   A 1 marks a byte that cannot be part of an ordinary word; a 0 marks a
   word byte. Each row below covers 32 consecutive byte values.

   All bytes 0x80-0xff are 0, i.e. word characters, so multi-byte utf-8
   sequences pass through the word-scanning loop untouched.
   does not check for valid utf-8; alternative is changing gchar() to return a Rune
*/
const char nw[] = {
	/* 0x00-0x1f: NUL, \t, \n */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20-0x3f: space ! # $ & ' ( ) ; < = > */
	1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	/* 0x40-0x5f: @ \ ^ */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
	/* 0x60-0x7f: ` { | } ~ */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	/* 0x80-0xff: all word bytes */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/*
   dnw[]: "dollar non-word" table, used instead of nw[] while scanning the
   name that follows a '$'. Indexed by byte value (cast to unsigned char);
   1 ends the name, 0 may appear in it. Each row covers 32 byte values.

   In the ASCII half only '*', the digits, the letters and '_' are 0.
   All bytes 0x80-0xff are 0 as well, so utf-8 variable names are accepted.
*/
const char dnw[] = {
	/* 0x00-0x1f: every control byte ends the name */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20-0x3f: only '*' and '0'-'9' are name bytes */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
	/* 0x40-0x5f: 'A'-'Z' and '_' are name bytes */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	/* 0x60-0x7f: 'a'-'z' are name bytes */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
	/* 0x80-0xff: all name bytes */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* Current capacity of realbuf; doubled by yylex() whenever a token fills it. */
static size_t bufsize = BUFSIZE;
/* Token buffer, allocated by inityy(); yyerror() prints it as the offending token. */
static char *realbuf = NULL;
/* Set when yylex() returns a newline token; makes the next call print the prompt. */
static bool newline = FALSE;
/* Set by scanerror(); makes the next yylex() call return '\n' and clear the flag. */
static bool errset = FALSE;
/* Set by scanerror(); makes the next yyerror() call return silently and clear the flag. */
static bool prerror = FALSE;
/* Kind of the token produced last (see wordstates). */
static wordstates w = NW;
/* Results of the most recent getpair() call. */
static int fd_left, fd_right;

/* If the previous token was a word or keyword, push the current character
   back and make yylex() return a free caret ('^') instead. Contains a hidden
   `return', and relies on the caller's local `c'. */
#define checkfreecaret {if (w != NW) { w = NW; ugchar(c); return '^'; }}

/* Sentinel values for fd_left/fd_right: UNSET means no descriptor was given,
   CLOSED means the ">[n=]" form (nothing after the '='). */
enum filedescriptors {
	UNSET = -9, CLOSED = -1
};

/*
   quotep() reports whether the string s contains any byte that the lexer
   would not accept inside a bare word, i.e. whether s needs quoting.
   When dollar is true the stricter variable-name table (dnw) is consulted
   instead of the ordinary word table (nw).
*/
extern bool quotep(char *s, bool dollar) {
	const unsigned char *cursor = (const unsigned char *) s;
	const char *table = dollar ? dnw : nw;

	for (; *cursor != '\0'; cursor++) {
		if (table[*cursor])
			return TRUE;
	}
	return FALSE;
}

/*
   yylex() returns the next token to the parser.

   Single-character tokens are returned as themselves. For WORD, PIPE,
   REDIR/SREDIR and DUP the matching member of yylval is filled in first.
   END is returned at end of input and HUH after a scanerror().

   State carried between calls:
     dollar   (local static) the previous token was '$', "$#" or "$^", so
              this call scans with the variable-name table dnw;
     w        kind of the previous token, used for free carets and SUB;
     newline  a newline token was returned, so the prompt must be printed;
     errset   scanerror() was called; return '\n' once.
*/
extern int yylex() {
	static bool dollar = FALSE;
	bool saw_meta = FALSE;
	int c;
	size_t i;			/* The purpose of all these local assignments is to	*/
	const char *meta;		/* allow optimizing compilers like gcc to load these	*/
	char *buf = realbuf;		/* values into registers. On a sparc this is a		*/
	YYSTYPE *y = &yylval;		/* win, in code size *and* execution time		*/
	if (errset) {
		errset = FALSE;
		return '\n';
	}
	/* rc variable-names may contain only alnum, '*' and '_', so use dnw if we are scanning one. */
	meta = (dollar ? dnw : nw);
	dollar = FALSE;
	if (newline) {
		--lineno; /* slight space optimization; print_prompt2() always increments lineno */
		print_prompt2();
		newline = FALSE;
	}
	/* skip blanks; white space also cancels any pending free caret */
top:	while ((c = gchar()) == ' ' || c == '\t')
		w = NW;
	if (c == EOF)
		return END;
	if (!meta[(unsigned char) c]) {	/* it's a word or keyword. */
		checkfreecaret;
		w = RW;
		i = 0;
		/* gather word bytes, noting glob metacharacters and growing buf as needed */
	read:	do {
			buf[i++] = c;
			if (c == '?' || c == '[' || c == '*')
				saw_meta = TRUE;
			if (i >= bufsize)
				buf = realbuf = erealloc(buf, bufsize *= 2);
		} while ((c = gchar()) != EOF && !meta[(unsigned char) c]);
		/* the word stopped at a backslash: continuation line, or part of the word? */
		while (c == '\\') {
			if ((c = gchar()) == '\n') {
				print_prompt2();
				c = ' '; /* Pretend a space was read */
				break;
			} else {
				/* also entered from the case '\\' below, with i == 0 */
	bs:			if (meta != dnw) { /* all words but varnames may have a bslash */
					buf[i++] = '\\';
					if (i >= bufsize)
						buf = realbuf = erealloc(buf, bufsize *= 2);
					if (!meta[(unsigned char) c])
						goto read;
				} else {
					/* variable name: put the character back and stop at the backslash */
					ugchar(c);
					c = '\\';
					break;
				}
			}
		}
		ugchar(c);
		buf[i] = '\0';
		/* keyword check; w stays KW only if one of these returns */
		w = KW;
		if (i == 2) {
			if (*buf == 'i' && buf[1] == 'f') return IF;
			if (*buf == 'f' && buf[1] == 'n') return FN;
			if (*buf == 'i' && buf[1] == 'n') return IN;
		}
		if (streq(buf, "for")) return FOR;
		if (streq(buf, "else")) return ELSE;
		if (streq(buf, "switch")) return SWITCH;
		if (streq(buf, "while")) return WHILE;
		if (streq(buf, "case")) return CASE;
		w = RW;
		y->word.w = ncpy(buf);
		if (saw_meta) {
			char *r, *s;

			/* build the mask parallel to the word: 1 at every '?', '[' or '*' byte */
			y->word.m = nalloc(strlen(buf) + 1);
			for (r = buf, s = y->word.m; *r != '\0'; r++, s++)
				*s = (*r == '?' || *r == '[' || *r == '*');
		} else {
			y->word.m = NULL;
		}
		y->word.q = FALSE;
		return WORD;
	}
	/* these tokens begin a word-like construct, so a free caret may be due */
	if (c == '`' || c == '!' || c == '@' || c == '~' || c == '$' || c == '\'') {
		checkfreecaret;
		if (c == '!' || c == '@' || c == '~')
			w = KW;
	}
	switch (c) {
	case '!':
		return BANG;
	case '@':
		return SUBSHELL;
	case '~':
		return TWIDDLE;
	case '`':
		c = gchar();
		if (c == '`')
			return BACKBACK;
		ugchar(c);
		return '`';
	case '$':
		/* the following word is a variable name: scan it with dnw on the next call */
		dollar = TRUE;
		c = gchar();
		if (c == '#')
			return COUNT;
		if (c == '^')
			return FLAT;
		ugchar(c);
		return '$';
	case '\'':
		w = RW;
		i = 0;
		/* double ' to quote it, like this: 'how''s it going?' */
		while ((c = gchar()) != '\'' || (c = gchar()) == '\'') {
			buf[i++] = c;
			if (c == '\n')
				print_prompt2();
			if (c == EOF) {
				w = NW;
				scanerror("eof in quoted string");
				return HUH;
			}
			if (i >= bufsize)
				buf = realbuf = erealloc(buf, bufsize *= 2);
		}
		/* c is the character after the closing quote */
		ugchar(c);
		buf[i] = '\0';
		y->word.w = ncpy(buf);
		y->word.m = NULL;
		y->word.q = TRUE;
		return WORD;
	case '\\':
		if ((c = gchar()) == '\n') {
			print_prompt2();
			goto top; /* Pretend it was just another space. */
		}
		/* a backslash that starts a word: allow a free caret first, then
		   re-read the following character and join the word scanner at bs */
		ugchar(c);
		c = '\\';
		checkfreecaret;
		c = gchar();
		i = 0;
		goto bs;
	case '(':
		if (w == RW) /* SUB's happen only after real words, not keywords, so if () and while () work */
			c = SUB;
		w = NW;
		return c;
	case '#':
		while ((c = gchar()) != '\n') /* skip comment until newline */
			if (c == EOF)
				return END;
		/* FALLTHROUGH */
	case '\n':
		lineno++;
		newline = TRUE;
		/* FALLTHROUGH */
	case ';':
	case '^':
	case ')':
	case '=':
	case '{': case '}':
		w = NW;
		return c;
	case '&':
		w = NW;
		c = gchar();
		if (c == '&')
			return ANDAND;
		ugchar(c);
		return '&';
	case '|':
		w = NW;
		c = gchar();
		if (c == '|')
			return OROR;
		/* optional |[left=right] descriptor pair */
		getpair(c);
		if (errset)
			return HUH;
		if ((y->pipe.left = fd_left) == UNSET)
			y->pipe.left = 1;				/* default to fd 1 */
		if ((y->pipe.right = fd_right) == UNSET)
			y->pipe.right = 0;				/* default to fd 0 */
		if (y->pipe.right == CLOSED) {
			scanerror("expected digit after '='");		/* can't close a pipe */
			return HUH;
		}
		return PIPE;
	case '>':
		c = gchar();
		if (c == '>') {
			c = gchar();
			y->redir.type = rAppend;
		} else
			y->redir.type = rCreate;
		y->redir.fd = 1;
		goto common;
	case '<':
		c = gchar();
		if (c == '<') {
			c = gchar();
			if (c == '<') {
				c = gchar();
				y->redir.type = rHerestring;
			} else {
				y->redir.type = rHeredoc;
			}
		} else
			y->redir.type = rFrom;
		y->redir.fd = 0;
		/* shared tail for '<' and '>': c is the first character after the operator */
	common:
		w = NW;
		getpair(c);
		if (errset)
			return HUH;
		if (fd_right == UNSET) { /* redirection, not dup */
			if (fd_left != UNSET) {
				y->redir.fd = fd_left;
				return SREDIR;
			}
			return (y->redir.type == rFrom || y->redir.type == rCreate) ? REDIR : SREDIR;
		} else { /* dup; recast yylval */
			y->dup.type = y->redir.type;
			y->dup.left = fd_left;
			y->dup.right = fd_right;
			return DUP;
		}
	default:
		w = NW;
		return c; /* don't know what it is, let yacc barf on it */
	}
}

extern void yyerror(const char *s) {
	char *tok;
	if (prerror) { /* don't print "syntax error" if there's a more informative scanerror */
		prerror = FALSE;
		return;
	}
	if (!interactive) {
		if (w != NW)
			tok = realbuf;
		else if (lastchar == EOF)
			tok = "eof";
		else if (lastchar == '\n')
			tok = "end of line";
		else
			tok = nprint((lastchar < 32 || lastchar > 126) ? "(decimal %d)" : "'%c'", lastchar);
		fprint(2, "line %d: %s near %s\n", lineno - (lastchar == '\n'), s, tok);
	} else
		fprint(2, "%s\n", s);
}

/*
   scanerror() reports a lexical error: it flushes pending input, prints s
   through yyerror(), and then arms both error flags so that the parser's
   own follow-up yyerror() is suppressed (prerror) and the next yylex()
   call returns a newline (errset).
*/
extern void scanerror(char *s) {
	flushu();	/* flush up to newline */
	yyerror(s);
	prerror = TRUE;
	errset = TRUE;
}

/*
   inityy() resets the lexer state and makes sure the token buffer exists.
   A buffer that has grown beyond BUFMAX is released and replaced by a
   fresh one of BUFSIZE bytes, returning the memory to the system.
*/
extern void inityy() {
	newline = FALSE;
	w = NW;
	hq = NULL;

	/* drop an oversized buffer so that it is re-created at the default size */
	if (realbuf != NULL && bufsize > BUFMAX) {
		efree(realbuf);
		realbuf = NULL;
		bufsize = BUFSIZE;
	}
	if (realbuf == NULL)
		realbuf = ealloc(bufsize);
}

/*
   Scan the optional descriptor pair of a redirection such as >[2=1].
   c is the character that follows the operator. The results are left in
   fd_left and fd_right:

	no '['	(UNSET, UNSET), and c is pushed back
	[2]	(2, UNSET)
	[2=1]	(2, 1)
	[2=]	(2, CLOSED)

   On a malformed pair scanerror() is called and the function returns at
   once; the caller detects this through errset.

   Digits are recognized by subtracting '0' and doing a single unsigned
   compare against 9, which rejects both smaller and larger characters.
*/

static void getpair(int c) {
	int value;
	int digit;

	fd_left = UNSET;
	fd_right = UNSET;

	if (c != '[') {
		ugchar(c);
		return;
	}

	/* left descriptor: at least one digit is required */
	value = gchar() - '0';
	if ((unsigned int) value > 9) {
		scanerror("expected digit after '['");
		return;
	}
	for (;;) {
		digit = gchar() - '0';
		if ((unsigned int) digit > 9)
			break;
		value = value * 10 + digit;
	}
	fd_left = value;

	/* digit now holds the terminating character, still offset by '0' */
	if (digit == ']' - '0')
		return;
	if (digit != '=' - '0') {
		scanerror("expected '=' or ']' after digit");
		return;
	}

	/* right descriptor: either digits, or nothing at all to mean "closed" */
	value = gchar() - '0';
	if ((unsigned int) value > 9) {
		if (value != ']' - '0') {
			scanerror("expected digit or ']' after '='");
			return;
		}
		fd_right = CLOSED;
		return;
	}
	for (;;) {
		digit = gchar() - '0';
		if ((unsigned int) digit > 9)
			break;
		value = value * 10 + digit;
	}
	if (digit != ']' - '0') {
		scanerror("expected ']' after digit");
		return;
	}
	fd_right = value;
}
--x--