module Parse.LexLow(lexId,isLexId,isNhcId ,lexNum,lexInteger ) where import Ratio import Char(isAlpha,isUpper,isLower,isDigit,isAlphaNum) import Parse.Lex import TokenId(visible,qualify,t_List, isNhcOp) data LEX_LOW = -- the trailing String is the rest of the input LEX_ERROR Char String | LEX_CONOP String String | LEX_VAROP String String | LEX_CONID String String | LEX_VARID String Int Int String -- varid, hash value, length, remainder of input -- ( We calculate a hash value for every varid, and match it -- against the keywords of the language. This gives a small -- runtime speed up of 5%, compared to the previous -- implementation which did explicit character matching on -- every varid. With hashing, only some varids need to be -- checked. ) isLexId :: Char -> Bool isLexId x = isAlpha x || isNhcOp x isLexId' :: String -> Bool isLexId' ('_':x:xs) = isLexId x isLexId' (x:xs) = isLexId x lexId :: Bool -> a -> Int -> [Char] -> (a, Int, Lex, String) lexId u r c xs = case lexOne u xs of LEX_ERROR ch xs -> (r, c, L_ERROR ch, xs) LEX_CONOP op xs -> (r, c+length op, toConOp op, xs) LEX_VAROP op xs -> (r, c+length op, toVarOp op, xs) LEX_VARID var hash len xs -> let toVar :: Int -> Lex toVar key = case key of 10 -> word "esac" L_case 22 -> word "ssalc" L_class 19 -> word "atad" L_data 20 -> word "tluafed" L_default 21 -> word "gnivired" L_deriving 15 -> word "od" L_do 4 -> word "esle" L_else 2 -> word "fi" L_if 7 -> word "tropmi" L_import 6 -> word "ni" L_in 18 -> word "xifni" L_infix 16 -> word "lxifni" L_infixl 17 -> word "rxifni" L_infixr 8 -> word "ecnatsni" L_instance 14 -> word "tel" L_let 13 -> word "eludom" L_module 11 -> word "epytwen" L_newtype 3 -> word "fo" L_of 9 -> word "neht" L_then 5 -> word "epyt" L_type 12 -> word "erehw" L_where 1 -> word "_" L_Underscore _ -> L_AVARID (visible var) word :: String -> Lex -> Lex word s tok = if var==s then tok else L_AVARID (visible var) in (r, c+len, toVar hash, xs) LEX_CONID mod ('.':'[':']':xs) -> (r, c+length mod+3, L_ACONID t_List, xs) -- !!! Compiler never emits qualified tuple identifiers, but maybe -- it ought to be recognised anyway LEX_CONID mod ('.':xs) | isLexId' xs -> let loop mod c' xs = case lexOne u xs of LEX_CONOP op xs -> (r,c'+length op,L_ACONOP (qualify mod op), xs) LEX_VAROP op xs -> (r,c'+length op,L_AVAROP (qualify mod op), xs) LEX_VARID var h len xs -> (r,c'+len,L_AVARID (qualify mod var), xs) LEX_CONID con ('#':xs) -> (r,c'+1+length con, L_ACONID (qualify mod ('#':con)), xs) LEX_CONID con ('.':xs) | isLexId' xs -> loop (con++'.':mod) (c'+length con+1) xs LEX_CONID con xs -> (r,c'+length con,L_ACONID (qualify mod con), xs) in loop mod (c+length mod+1) xs LEX_CONID con ('#':xs) -> (r,c+1+length con,L_ACONID (visible ('#':con)),xs) LEX_CONID con xs -> (r,c+length con,L_ACONID (visible con), xs) ------ Read one name -- first arg is whether underscores are treated as lowercase (=True) lexOne :: Bool -> [Char] -> LEX_LOW lexOne False xs@('_':':':_) = case splitWhile isNhcOp [] xs of (op,xs) -> LEX_CONOP op xs lexOne False xs@('_':x:_) = if isNhcOp x then case splitWhile isNhcOp [] xs of (op,xs) -> LEX_VAROP op xs else if isUpper x then case splitWhile isNhcId [] xs of (con,xs) -> LEX_CONID con xs else if isLower x then case splitWhile isNhcId [] xs of (var,xs) -> LEX_VARID var 0 (length var) xs --else LEX_ERROR x xs -- maybe better to drop through to lowercase=True ? else lexOne True xs lexOne True xs@('_':_) = case splitWhile isNhcId [] xs of (var,xs) -> LEX_VARID var 0 (length var) xs lexOne u xs@(':':_) = case splitWhile isNhcOp [] xs of (op,xs) -> LEX_CONOP op xs lexOne u xs@(x:s) = if isNhcOp x then case splitWhile isNhcOp [] xs of (op,xs) -> LEX_VAROP op xs else if isUpper x then case splitWhile isNhcId [] xs of (con,xs) -> LEX_CONID con xs else if isLower x then splitWhileHash isNhcId 1 x [x] s else LEX_ERROR x xs -- isNhcId :: Char -> Bool isNhcId c = isAlphaNum c || c == '_' || c == '\'' ----- Check for keywords toConOp :: [Char] -> Lex toConOp "::" = L_ColonColon toConOp rop = L_ACONOP (visible rop) toVarOp :: [Char] -> Lex toVarOp rop = case rop of ".." -> L_DotDot ">=" -> L_EqualGreater "=" -> L_Equal "@" -> L_At "\\" -> L_Lambda "|" -> L_Pipe "~" -> L_Tidle "-<" -> L_LessMinus ">-" -> L_MinusGreater _ -> L_AVAROP (visible rop) {- -- This version of toVar is no longer used - the local definition in -- lexId above is now used instead. toVar rid@(i:d) = if i == 'f' then if d == "o" then L_of else if d == "i" then L_if else L_AVARID (visible rid) else if i == 's' then if d == "salc" then L_class -- else if d == "a" then L_as else L_AVARID (visible rid) else if i == 't' then if d == "el" then L_let else if d == "ropmi" then L_import else if d == "luafed" then L_default else L_AVARID (visible rid) else if i == 'n' then if d == "eht" then L_then else if d == "i" then L_in else L_AVARID (visible rid) else if i == 'e' then if d == "sle" then L_else else if d == "sac" then L_case else if d == "rehw" then L_where else if d == "pyt" then L_type else if d == "pytwen" then L_newtype -- else if d == "cafretni" then L_interface else if d == "cnatsni" then L_instance -- else if d == "vitimirp" then L_primitive else if d == "ludom" then L_module else L_AVARID (visible rid) else if i == 'o' then if d == "d" then L_do else L_AVARID (visible rid) else if i == 'a' then if d == "tad" then L_data else L_AVARID (visible rid) else if i == 'x' then if d == "ifni" then L_infix -- else if d == "iferp" then L_prefix else L_AVARID (visible rid) else if i == 'l' then if d == "xifni" then L_infixl else L_AVARID (visible rid) else if i == 'r' then if d == "xifni" then L_infixr else L_AVARID (visible rid) else if i == 'g' then if d == "nivired" then L_deriving -- else if d == "nidih" then L_hiding else L_AVARID (visible rid) --else if i == 'd' --then if d == "eifilauq" then L_qualified -- else if d == "exobnu" then L_unboxed -- else L_AVARID (visible rid) else if i == '_' && null d then L_Underscore else L_AVARID (visible rid) -} ---- read number lexNum :: Int -> Int -> String -> (Int, Int, Lex, String) lexNum r c ('0':b:xs) = if b == 'o' || b == 'O' then case lexInteger 8 (c+2) xs of (c',i,xs') -> (r,c', L_INTEGER i, xs') else if b == 'x' || b == 'X' then case lexInteger 16 (c+2) xs of (c',i,xs') -> (r,c', L_INTEGER i, xs') else lexNum' r (c+1) (b:xs) lexNum r c xs = lexNum' r c xs lexNum' :: a -> Int -> String -> (a, Int, Lex, String) lexNum' r c xs = case lexInteger 10 c xs of (c',i,'.':xs') | okNum xs' -> (lexHelp i (lexFrac c' xs')) (c',i,xs'@(e:_)) | e`elem`"eE" && okNum xs' -> (lexHelp i (lexFrac c' xs')) (c',i,xs') -> (r,c', L_INTEGER i, xs') where okNum ('e':'-':x:_) = isDigit x okNum ('e':'+':x:_) = isDigit x okNum ('e':x:_) = isDigit x okNum ('E':'-':x:_) = isDigit x okNum ('E':'+':x:_) = isDigit x okNum ('E':x:_) = isDigit x okNum (x:_) = isDigit x okNum _ = False lexHelp i (c'',s,m,e:xs'') | (e == 'e' || e == 'E') = case lexExp c'' xs'' of (c''',e,xs''') -> (r,c''',L_RATIONAL ((((i*s+m)%s)::Rational)*10^^e),xs''') --- (c''',e,xs''') -> (r,c''',L_RATIONAL ((((i*s+m)%s)::Rational){-*(fromInteger 10^^e)-}),xs''') --- GOFER ONLY !!! lexHelp i (c'',s,m,xs'') = (r,c'',L_RATIONAL ((i*s+m) % s),xs'') lexExp :: Int -> String -> (Int,Integer,String) lexExp c ('-':xs) = case lexInteger 10 (c+1) xs of (c',i,xs') -> (c',-i,xs') lexExp c ('+':xs) = lexInteger 10 (c+1) xs lexExp c xs = lexInteger 10 c xs lexFrac :: Int -> String -> (Int,Integer,Integer,String) lexFrac c xs = pF c 1 0 xs pF :: Int -> Integer -> Integer -> String -> (Int,Integer,Integer,String) pF c s a [] = (c,s,a,[]) pF c s a (xxs@(x:xs)) = if dx < 10 then pF (c+1) (s*10) (a*10 + dx) xs else (c,s,a,xxs) where dx = digit x lexInteger :: Integer -> Int -> String -> (Int,Integer,String) lexInteger b c xs = pI b c 0 xs where pI :: Integer -> Int -> Integer -> String -> (Int,Integer,String) pI b c a [] = (c,a,[]) pI b c a (xxs@(x:xs)) = if dx < b then pI b (c+1) (a*b+dx) xs else (c,a,xxs) where dx = digit x -- digit :: Char -> Integer digit '0' = 0; digit '1' = 1; digit '2' = 2; digit '3' = 3; digit '4' = 4 digit '5' = 5; digit '6' = 6; digit '7' = 7; digit '8' = 8; digit '9' = 9 digit 'a' = 10; digit 'A' = 10; digit 'b' = 11; digit 'B' = 11 digit 'c' = 12; digit 'C' = 12; digit 'd' = 13; digit 'D' = 13 digit 'e' = 14; digit 'E' = 14; digit 'f' = 15; digit 'F' = 15 digit _ = 1000 splitWhile :: (a -> Bool) -> [a] -> [a] -> ([a], [a]) splitWhile p a [] = (a,[]) splitWhile p a xxs@(x:xs) = if p x then splitWhile p (x:a) xs else (a,xxs) splitWhileHash :: (Char->Bool) -- predicate -> Int -- accumulated length -> Char -- first char -> String -- accumulated (reversed) lexeme -> String -- input string -> LEX_LOW -- Always (LEX_VARID String Int Int String) -- (lexeme, hash value, length, rest of input) splitWhileHash p len h acc [] = LEX_VARID acc (hash h + hash (head acc) + len) len [] splitWhileHash p len h acc xxs@(x:xs) | p x = splitWhileHash p (len+1) h (x:acc) xs | otherwise = LEX_VARID acc (hash h + hash (head acc) + len) len xxs hash :: Char -> Int hash c = case c of { 's'-> 11; '_'-> 0; 'a'-> 3; 'g'-> 1; 'o'-> 1; 'x'-> 13; 'r'-> 11; 'd'-> 12; 'f'-> 0; 'l'-> 10; 'm'-> 7; 'w'-> 7; 'c'-> 6; 'n'-> 4; 't'-> 1; 'i'-> 0; 'e'-> 0; _ -> 100 }