module Parse.LexLow(lexId,isLexId,isNhcId
             ,lexNum,lexInteger
             ) where

import Ratio
import Char(isAlpha,isUpper,isLower,isDigit,isAlphaNum)

import Parse.Lex
import TokenId(visible,qualify,t_List, isNhcOp)


data LEX_LOW =          -- the trailing String is the rest of the input
   LEX_ERROR Char String
 | LEX_CONOP String String
 | LEX_VAROP String String
 | LEX_CONID String String
 | LEX_VARID String Int Int String
        -- varid, hash value, length, remainder of input
        -- ( We calculate a hash value for every varid, and match it
        --   against the keywords of the language.  This gives a small
        --   runtime speed up of 5%, compared to the previous
        --   implementation which did explicit character matching on
        --   every varid.  With hashing, only some varids need to be
        --   checked. )

isLexId :: Char -> Bool
isLexId x =  isAlpha x || isNhcOp x

isLexId' :: String -> Bool
isLexId' ('_':x:xs) = isLexId x
isLexId' (x:xs) = isLexId x

lexId :: Bool -> a -> Int -> [Char] -> (a, Int, Lex, String)
lexId u r c xs =
  case lexOne u xs of
    LEX_ERROR  ch xs -> (r, c, L_ERROR ch, xs)
    LEX_CONOP  op xs -> (r, c+length op, toConOp op, xs)
    LEX_VAROP  op xs -> (r, c+length op, toVarOp op, xs)
    LEX_VARID var hash len xs ->
        let toVar :: Int -> Lex
            toVar key = case key of
                                10 -> word "esac" L_case
                                22 -> word "ssalc" L_class
                                19 -> word "atad" L_data
                                20 -> word "tluafed" L_default
                                21 -> word "gnivired" L_deriving
                                15 -> word "od" L_do
                                4  -> word "esle" L_else
                                2 -> word "fi" L_if
                                7 -> word "tropmi" L_import
                                6 -> word "ni" L_in
                                18 -> word "xifni" L_infix
                                16 -> word "lxifni" L_infixl
                                17 -> word "rxifni" L_infixr
                                8 -> word "ecnatsni" L_instance
                                14 -> word "tel" L_let
                                13 -> word "eludom" L_module
                                11 -> word "epytwen" L_newtype
                                3 -> word "fo" L_of
                                9 -> word "neht" L_then
                                5 -> word "epyt" L_type
                                12 -> word "erehw" L_where
                                1 -> word "_" L_Underscore
                                _ -> L_AVARID (visible var)
            word :: String -> Lex -> Lex
            word s tok = if var==s then tok else L_AVARID (visible var)
        in
        (r, c+len, toVar hash, xs)
    LEX_CONID mod ('.':'[':']':xs) -> (r, c+length mod+3, L_ACONID t_List, xs)
         -- !!! Compiler never emits qualified tuple identifiers, but maybe
         -- it ought to be recognised anyway
    LEX_CONID mod ('.':xs) | isLexId' xs ->
      let loop mod c' xs = case lexOne u xs of
            LEX_CONOP  op xs -> (r,c'+length op,L_ACONOP (qualify mod op), xs)
            LEX_VAROP  op xs -> (r,c'+length op,L_AVAROP (qualify mod op), xs)
            LEX_VARID var h len xs -> (r,c'+len,L_AVARID (qualify mod var), xs)
            LEX_CONID con ('#':xs) -> (r,c'+1+length con,
                                           L_ACONID (qualify mod ('#':con)), xs)
            LEX_CONID con ('.':xs) | isLexId' xs ->
                                       loop (con++'.':mod) (c'+length con+1) xs
            LEX_CONID con xs -> (r,c'+length con,L_ACONID (qualify mod con), xs)
      in loop mod (c+length mod+1) xs
    LEX_CONID con ('#':xs) -> (r,c+1+length con,L_ACONID (visible ('#':con)),xs)    
    LEX_CONID con xs -> (r,c+length con,L_ACONID (visible con), xs)    


------ Read one name

-- first arg is whether underscores are treated as lowercase (=True)
lexOne :: Bool -> [Char] -> LEX_LOW
lexOne False xs@('_':':':_) =
  case splitWhile isNhcOp [] xs of
        (op,xs) -> LEX_CONOP op xs
lexOne False xs@('_':x:_) =
  if isNhcOp x
  then case splitWhile isNhcOp [] xs of
        (op,xs) -> LEX_VAROP op xs
  else if isUpper x
  then  case splitWhile isNhcId [] xs of
        (con,xs) -> LEX_CONID con xs
  else if isLower x
  then  case splitWhile isNhcId [] xs of
        (var,xs) -> LEX_VARID var 0 (length var) xs
--else LEX_ERROR x xs	-- maybe better to drop through to lowercase=True ?
  else lexOne True xs

lexOne True xs@('_':_) =
  case splitWhile isNhcId [] xs of
  (var,xs) -> LEX_VARID var 0 (length var) xs

lexOne u xs@(':':_) =
  case splitWhile isNhcOp [] xs of
        (op,xs) -> LEX_CONOP op xs
lexOne u xs@(x:s) =
  if isNhcOp x
  then case splitWhile isNhcOp [] xs of
        (op,xs) -> LEX_VAROP op xs
  else if isUpper x
  then  case splitWhile isNhcId [] xs of
        (con,xs) -> LEX_CONID con xs
  else if isLower x
  then  splitWhileHash isNhcId 1 x [x] s
  else LEX_ERROR x xs
          
--

isNhcId :: Char -> Bool
isNhcId c = isAlphaNum c || c == '_' || c == '\'' 


----- Check for keywords

toConOp :: [Char] -> Lex
toConOp "::" = L_ColonColon
toConOp rop  = L_ACONOP (visible rop)

toVarOp :: [Char] -> Lex
toVarOp rop =
  case rop of
    ".." -> L_DotDot
    ">=" -> L_EqualGreater
    "="  -> L_Equal
    "@"  -> L_At
    "\\" -> L_Lambda
    "|"  -> L_Pipe
    "~"  -> L_Tidle
    "-<" -> L_LessMinus
    ">-" -> L_MinusGreater
    _    -> L_AVAROP (visible rop)

{-
-- This version of toVar is no longer used - the local definition in
-- lexId above is now used instead.
toVar rid@(i:d) =
       if i == 'f' 
  then       if d == "o" then L_of
        else if d == "i" then L_if
                         else L_AVARID (visible rid)
  else if i == 's' 
  then       if d == "salc" then L_class
--      else if d == "a"    then L_as
                            else L_AVARID (visible rid)
  else if i == 't' 
  then       if d == "el"      then L_let
        else if d == "ropmi"   then L_import
        else if d == "luafed" then L_default
                               else L_AVARID (visible rid)
  else if i == 'n' 
  then       if d == "eht" then L_then
        else if d == "i" then L_in
                         else L_AVARID (visible rid)
  else if i == 'e' 
  then       if d == "sle"      then L_else
        else if d == "sac"      then L_case
        else if d == "rehw"     then L_where
        else if d == "pyt"      then L_type
        else if d == "pytwen"   then L_newtype
--      else if d == "cafretni" then L_interface
        else if d == "cnatsni"  then L_instance
--      else if d == "vitimirp" then L_primitive
        else if d == "ludom"   then L_module
                                else L_AVARID (visible rid)
  else if i == 'o' 
  then       if d == "d" then L_do
                         else L_AVARID (visible rid)
  else if i == 'a' 
  then       if d == "tad" then L_data
                           else L_AVARID (visible rid)
  else if i == 'x' 
  then       if d == "ifni"  then L_infix
--        else if d == "iferp" then L_prefix
                             else L_AVARID (visible rid)
  else if i == 'l' 
  then       if d == "xifni" then L_infixl
                             else L_AVARID (visible rid)
  else if i == 'r' 
  then       if d == "xifni" then L_infixr
                             else L_AVARID (visible rid)
  else if i == 'g' 
  then       if d == "nivired" then L_deriving
--      else if d == "nidih"   then L_hiding
                               else L_AVARID (visible rid)
--else if i == 'd' 
--then       if d == "eifilauq" then L_qualified
--        else if d == "exobnu"   then L_unboxed
--                              else L_AVARID (visible rid)
  else if i == '_' && null d
  then L_Underscore

  else L_AVARID (visible rid)
-}

---- read number

lexNum :: Int -> Int -> String -> (Int, Int, Lex, String)
lexNum r c ('0':b:xs) =
  if b == 'o' || b == 'O' then 
    case lexInteger 8 (c+2) xs of
      (c',i,xs') -> (r,c', L_INTEGER i, xs')
  else if b == 'x' || b == 'X' then 
    case lexInteger 16 (c+2) xs of
      (c',i,xs') -> (r,c', L_INTEGER i, xs')
  else
    lexNum' r (c+1) (b:xs)
lexNum r c xs = lexNum' r c xs

lexNum' :: a -> Int -> String -> (a, Int, Lex, String)
lexNum' r c xs =
       case lexInteger 10 c xs of
           (c',i,'.':xs') | okNum xs' ->  
                (lexHelp i (lexFrac c' xs'))
           (c',i,xs'@(e:_)) | e`elem`"eE" && okNum xs' ->  
                (lexHelp i (lexFrac c' xs'))
           (c',i,xs') ->
                (r,c', L_INTEGER i, xs')
        where
                okNum ('e':'-':x:_) = isDigit x
                okNum ('e':'+':x:_) = isDigit x
                okNum ('e':x:_) = isDigit x
                okNum ('E':'-':x:_) = isDigit x
                okNum ('E':'+':x:_) = isDigit x
                okNum ('E':x:_) = isDigit x
                okNum (x:_) = isDigit x
                okNum _ = False

                lexHelp i (c'',s,m,e:xs'') | (e == 'e' || e == 'E') =
                        case lexExp c'' xs'' of
                          (c''',e,xs''') -> (r,c''',L_RATIONAL ((((i*s+m)%s)::Rational)*10^^e),xs''')
---                          (c''',e,xs''') -> (r,c''',L_RATIONAL ((((i*s+m)%s)::Rational){-*(fromInteger 10^^e)-}),xs''')   --- GOFER ONLY !!!
                lexHelp i (c'',s,m,xs'') =
                        (r,c'',L_RATIONAL ((i*s+m) % s),xs'')


                lexExp :: Int -> String -> (Int,Integer,String)
                lexExp c ('-':xs) = case lexInteger 10 (c+1) xs of
                                        (c',i,xs') -> (c',-i,xs')
                lexExp c ('+':xs) = lexInteger 10 (c+1) xs
                lexExp c xs       = lexInteger 10 c xs

                lexFrac :: Int -> String -> (Int,Integer,Integer,String)
                lexFrac c xs = pF c 1 0 xs

                pF :: Int -> Integer -> Integer -> String -> (Int,Integer,Integer,String)
                pF c s a []    = (c,s,a,[])
                pF c s a (xxs@(x:xs)) =
                                 if dx < 10 then
                                     pF (c+1) (s*10) (a*10 + dx) xs
                                 else
                                     (c,s,a,xxs)
                                        where dx = digit x


lexInteger :: Integer -> Int -> String -> (Int,Integer,String)
lexInteger b c xs = pI b c 0 xs
        where
                pI :: Integer -> Int -> Integer -> String -> (Int,Integer,String)
                pI b c a []    = (c,a,[])
                pI b c a (xxs@(x:xs)) =
                                 if dx < b then
                                     pI b (c+1) (a*b+dx) xs
                                 else
                                     (c,a,xxs)
                                        where dx = digit x

--

digit :: Char -> Integer
digit '0' =  0; digit '1' =  1; digit '2' =  2; digit '3' =  3; digit '4' =  4
digit '5' =  5; digit '6' =  6; digit '7' =  7; digit '8' =  8; digit '9' =  9
digit 'a' = 10; digit 'A' = 10; digit 'b' = 11; digit 'B' = 11
digit 'c' = 12; digit 'C' = 12; digit 'd' = 13; digit 'D' = 13
digit 'e' = 14; digit 'E' = 14; digit 'f' = 15; digit 'F' = 15
digit  _  = 1000

splitWhile :: (a -> Bool) -> [a] -> [a] -> ([a], [a])
splitWhile p a [] = (a,[])
splitWhile p a xxs@(x:xs) =
        if p x
        then splitWhile p (x:a) xs
        else (a,xxs)

splitWhileHash :: (Char->Bool)          -- predicate
                 -> Int                 -- accumulated length
                 -> Char                -- first char
                 -> String              -- accumulated (reversed) lexeme
                 -> String              -- input string
                 -> LEX_LOW     -- Always (LEX_VARID String Int Int String)
                                -- (lexeme, hash value, length, rest of input)
splitWhileHash p len h acc []
        = LEX_VARID acc (hash h + hash (head acc) + len) len []
splitWhileHash p len h acc xxs@(x:xs)
        | p x        = splitWhileHash p (len+1) h (x:acc) xs
        | otherwise  = LEX_VARID acc (hash h + hash (head acc) + len) len xxs

hash :: Char -> Int
hash c = case c of { 's'-> 11; '_'-> 0;  'a'-> 3;  'g'-> 1; 'o'-> 1;
                     'x'-> 13; 'r'-> 11; 'd'-> 12; 'f'-> 0; 'l'-> 10;
                     'm'-> 7;  'w'-> 7;  'c'-> 6;  'n'-> 4; 't'-> 1;
                     'i'-> 0;  'e'-> 0;  _  -> 100 }