{
(*------------------------------------------------------------------------
 * The Lexer.  Some of the complication arises from the fact it is 
 * reused by the Visual Studio mode to do partial lexing reporting 
 * whitespace etc.
 * (c) Microsoft Corporation. All rights reserved 
 *-----------------------------------------------------------------------*)

(*F# open Microsoft.Research.AbstractIL F#*)
(*F# open Microsoft.Research.AbstractIL.Internal F#*)
(*F# open Microsoft.FSharp.Compiler F#*)

open Range
open Ast
open Pars
open Parsing
open Lexing
open Lexhelp
open Lib

let ulexeme lexbuf = (*IF-OCAML*) lexeme (*ENDIF-OCAML*) (*F# lexeme_utf8 F#*) lexbuf

let adjust_lexbuf_start_pos (lexbuf:lexbuf) p =
     (*IF-OCAML*)  lexbuf.Lexing.lex_start_p <- p  (*ENDIF-OCAML*)
     (*F#          lexbuf.StartPos <- p F#*) 

let string_trim_both s n m = String.sub s n (String.length s - (n+m))
let trim_both   lexbuf n m = string_trim_both (ulexeme lexbuf) n m
let trim_right  lexbuf n = trim_both lexbuf 0 n
let trim_left   lexbuf n = trim_both lexbuf n 0
let trim_to_i32 lexbuf n = Int32.of_string (trim_right lexbuf n)
let char lexbuf n = lexeme_char lexbuf n
let code lexbuf n = Char.code (char lexbuf n)

let check_op lexbuf = 
    if String.contains (ulexeme lexbuf)  ':' then 
        deprecated "':' is no longer permitted as a character in operator names" (get_lex_range lexbuf) 

let fail lexbuf msg =
     let m = get_lex_range  lexbuf in 
     error(Error(msg,m))
        
let quote_op_is_raw s = String.length s >= 1 && s.[0] = '@'

let rev_string s = 
  let n = String.length s in 
  let res = Buffer.create n in 
  for i = n - 1 downto 0 do Buffer.add_char res s.[i] done;
  Buffer.contents res
  
(* UTF-8 encodings in strings *)
(* NOTE: This assumes people will only work with ASCII or UTF-8 encodings. *)
(* REVIEW: these should not be permitted in bytearray constants. *)
(* U-00000080 - U-000007FF:  110xxxxx 10xxxxxx   *)
let mask m x = x land (lnot m)
let get_utf8_two lexbuf = 
      ((code lexbuf 0) |> mask 0xc0) * 0x40 
    + ((code lexbuf 1) |> mask 0x80)     
let get_utf8_three lexbuf = 
      ((code lexbuf 0) |> mask 0xe0) * 0x1000 
    + ((code lexbuf 1) |> mask 0x80) * 0x40 
    + ((code lexbuf 2) |> mask 0x80)      
let get_utf8_four lexbuf = 
      ((code lexbuf 0) |> mask 0xe0) * 0x40000 
    + ((code lexbuf 1) |> mask 0x80) * 0x1000 
    + ((code lexbuf 2) |> mask 0x80) * 0x40 
    + ((code lexbuf 3) |> mask 0x80)      
let get_utf8_five lexbuf =
      ((code lexbuf 0) |> mask 0xe0) * 0x100000 
    + ((code lexbuf 1) |> mask 0x80) * 0x40000 
    + ((code lexbuf 2) |> mask 0x80) * 0x1000 
    + ((code lexbuf 3) |> mask 0x80) * 0x40 
    + ((code lexbuf 4) |> mask 0x80)     
let get_utf8_six lexbuf = 
      ((code lexbuf 0) |> mask 0xe0) * 0x400000 
    + ((code lexbuf 1) |> mask 0x80) * 0x100000 
    + ((code lexbuf 2) |> mask 0x80) * 0x40000 
    + ((code lexbuf 3) |> mask 0x80) * 0x1000 
    + ((code lexbuf 4) |> mask 0x80) * 0x40 
    + ((code lexbuf 5) |> mask 0x80)      

let start_string lexbuf =
    let buf = Bytes.Bytebuf.create 100 in 
    let m = get_lex_range  lexbuf in
    let startp = Lexing.lexeme_start_p lexbuf in  
    let fin = (fun m2 b s -> 
                     let endp = Lexing.lexeme_end_p lexbuf in  
                     (* Adjust the start-of-token mark back to the true start of the token *)
                     adjust_lexbuf_start_pos lexbuf startp;
                     if b then BYTEARRAY (Lexhelp.stringbuf_as_bytes buf) 
                     else STRING s)  in
    buf,fin,m
                     
} 

let letter = ['A'-'Z'] | ['a'-'z']
let digit = ['0'-'9']
let hex = ['0'-'9'] | ['A'-'F'] | ['a'-'f']
let truewhite = [' ']
let offwhite = ['\t']
let anywhite = truewhite | offwhite
let op_char = '!'|'$'|'%'|'&'|'*'|'+'|'-'|'.'|'/'|'<'|'='|'>'|'?'|'@'|'^'|'|'|'~'|':'
let ignored_op_char = '.' | '$'
let xinteger = 
  (  '0' ('x'| 'X')  hex + 
   | '0' ('o'| 'O')  (['0'-'7']) + 
   | '0' ('b'| 'B')  (['0'-'1']) + )
let integer = digit+
let int8 = integer 'y'
let uint8 = (xinteger | integer) 'u' 'y' 
let int16 = integer 's'
let uint16 = (xinteger | integer) 'u' 's'
let int = integer 
let int32 = integer 'l'
let uint32 = (xinteger | integer) 'u' 
let uint32l = (xinteger | integer) 'u' 'l'
let nativeint = (xinteger | integer) 'n'
let unativeint = (xinteger | integer) 'u' 'n'
let int64 = (xinteger | integer) 'L' 
let uint64 = (xinteger | integer) ('u' | 'U') 'L' 
let xint8 = xinteger 'y'
let xint16 = xinteger 's'
let xint = xinteger 
let xint32 = xinteger 'l'
let floatp = '-' ? digit+ '.' digit*  
let floate = '-' ? digit+ ('.' digit* )? ('e'| 'E') ['+' '-']? digit+
let float = floatp | floate 
let bigint =  '-' ? integer 'I' 
let bignum =  '-' ? integer 'N' 
let ieee64 = float
(* let ieee64d = (float | '-' ? integer) ('d' | 'D')  *)
let ieee32 = (float | '-' ? integer) ('f' | 'F') 
let decimal = (float | '-' ? integer) ('m' | 'M') 
let xieee32 = xinteger 'l' 'f'
let xieee64 = xinteger 'L' 'F'
let escape_char = ('\\' ( '\\' | "\"" | '\'' | 'n' | 't' | 'b' | 'r'))
let char = '\'' ( [^'\\''\n''\r''\t''\b'] | escape_char) '\''
let trigraph = '\\' digit digit digit
let hexgraph_short = '\\' 'x' hex hex 
let unicodegraph_short = '\\' 'u' hex hex hex hex
let unicodegraph_long =  '\\' 'U' hex hex hex hex hex hex hex hex
let newline = ('\n' | '\r' '\n')
let chevronLeft = '\194' '\171'
let chevronRight = '\194' '\187'
let plusMinus = '\194' '\177'
let sectionSymbol = '\194' '\167'  (* UTF-8 Section symbol *)


(* UTF-8 encodings of unicode characters. We trim a few symbols out of the UTF8-two set *)
let utf8_two   = ((['\192'-'\193'] | ['\195'-'\223']) ['\128'-'\191'])                     (* U-00000080 - U-000007FF:  110xxxxx 10xxxxxx   *)  
                 | ('\194' (['\128'-'\160']))
let utf8_twoB   = ['\192'-'\223'] ['\128'-'\191']                       (* U-00000080 - U-000007FF:  110xxxxx 10xxxxxx   *)
let utf8_three = ['\224'-'\239'] ['\128'-'\191'] ['\128'-'\191']       (*  U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx  *)
let utf8_four  = ['\240'-'\247'] ['\128'-'\191'] ['\128'-'\191'] ['\128'-'\191']  
let utf8_five  = ['\248'-'\251'] ['\128'-'\191'] ['\128'-'\191'] ['\128'-'\191'] ['\128'-'\191']
let utf8_six   = ['\252'-'\253'] ['\128'-'\191'] ['\128'-'\191'] ['\128'-'\191'] ['\128'-'\191'] ['\128'-'\191']


let ident_start_char = 
    letter | ['_'] 
  | utf8_two 
  | utf8_three 
  | utf8_four (*etc. *)
  | utf8_five (*  etc. *)
  | utf8_six (*  etc. *)
  | unicodegraph_short 
  | unicodegraph_long 
let ident_char = ( ident_start_char| digit | ['\''] )
let ident = ident_start_char ident_char*

rule token args skip = parse
 | chevronLeft  
     { deprecated "The use of this unicode symbol as an operator is now deprecated. Use the <@ symbol instead" (get_lex_range lexbuf) ;
       LQUOTE ("<< >>",false) }
 | chevronRight 
     { deprecated "The use of this unicode symbol as an operator is now deprecated. Use the @> symbol instead" (get_lex_range lexbuf) ;
       RQUOTE ("<< >>",false) }
 | chevronLeft '|' 
     { deprecated "The use of this unicode symbol as an operator is now deprecated. Use the <@| symbol instead" (get_lex_range lexbuf) ;
       LQUOTE ("<<| |>>",false) }
 | '|' chevronRight 
     { deprecated "The use of this unicode symbol as an operator is now deprecated. Use the |@> symbol instead" (get_lex_range lexbuf) ;
       RQUOTE ("<<| |>>",false) }
 | sectionSymbol sectionSymbol 
     { deprecated "The use of this unicode symbol as an operator is now deprecated. Use the %% prefix operator instead" (get_lex_range lexbuf) ;
       SPLICE_SYMBOL "~@@"}
 | sectionSymbol 
     { deprecated "The use of this unicode symbol as an operator is now deprecated. Use the % prefix operator instead" (get_lex_range lexbuf) ;
       SPLICE_SYMBOL "~@" }
 | ident { kwd_or_id args lexbuf (ulexeme lexbuf) }
 | "do!" { DO_BANG } 
 | "yield!" { YIELD_BANG(true)  } 
 | "return!" { YIELD_BANG(false) } 
 | ident '!' { BINDER (trim_right lexbuf 1) } 
 | ident ('?' | '#')  { fail lexbuf "Identifiers followed by '?' or '#' are reserved for future use" }
 | int8 
     { let n = trim_to_i32 lexbuf 1 in 
       if n > 0x80l or n < -0x80l then fail lexbuf "This number is outside the allowable range for 8-bit signed integers";
    (* Allow <max_int+1> to parse as min_int.  Allowed only because we parse '-' as an operator. *)
       if n = 0x80l then INT8(Nums.i32_to_i8(-0x80l))
       else INT8(Nums.i32_to_i8 n) }
 | xint8 
     { let n = trim_to_i32 lexbuf 1 in 
       if n > 0xFFl or n < 0l then fail lexbuf "This number is outside the allowable range for hexadecimal 8-bit signed integers";
       INT8(Nums.u8_to_i8(Nums.i32_to_u8(n))) }
 | uint8
     { let n = trim_to_i32 lexbuf 2 in 
       if n > 0xFFl or n < 0l then fail lexbuf "This number is outside the allowable range for 8-bit unsigned integers";
       UINT8(Nums.i32_to_u8 n) }
 | int16 
     { let n = trim_to_i32 lexbuf 1 in 
       if n > 0x8000l or n < -0x8000l then fail lexbuf "This number is outside the allowable range for 16-bit signed integers";
    (* Allow <max_int+1> to parse as min_int.  Allowed only because we parse '-' as an operator. *)
       if n = 0x8000l then INT16(Nums.i32_to_i16(-0x8000l))
       else INT16(Nums.i32_to_i16 n) }
 | xint16 
     { let n = trim_to_i32 lexbuf 1 in 
       if n > 0xFFFFl or n < 0l then fail lexbuf "This number is outside the allowable range for 16-bit signed integers";
        INT16(Nums.u16_to_i16(Nums.i32_to_u16(n))) }
 | uint16 
     { let n = trim_to_i32 lexbuf 2 in 
       if n > 0xFFFFl or n < 0l then fail lexbuf "This number is outside the allowable range for 16-bit unsigned integers";
       UINT16(Nums.i32_to_u16 n) }
 | int '.' '.' 
     { let s = trim_right lexbuf 2 in 
       (* Allow <max_int+1> to parse as min_int.  Allowed only because we parse '-' as an operator. *)
       if s = "2147483648" then INT32(Int32.min_int )
       else try INT32_DOT_DOT(Int32.of_string s) with _ ->  fail lexbuf "This number is outside the allowable range for 32-bit signed integers"
     } 
 | int 
     { let s = ulexeme lexbuf in 
       (* Allow <max_int+1> to parse as min_int.  Allowed only because we parse '-' as an operator. *)
       if s = "2147483648" then INT32(Int32.min_int )
       else try INT32(Int32.of_string s) with _ ->  fail lexbuf "This number is outside the allowable range for 32-bit signed integers"
     } 
 | xint 
     { let s = ulexeme lexbuf in
       try INT32(Int32.of_string s) with _ ->  fail lexbuf "This number is outside the allowable range for 32-bit signed integers"
     } 

 | int32 
     { let s = trim_right lexbuf 1 in 
       (* Allow <max_int+1> to parse as min_int.  Allowed only because we parse '-' as an operator. *)
       if s = "2147483648" then INT32(Int32.min_int )
       else try INT32(Int32.of_string s) with _ ->  fail lexbuf "This number is outside the allowable range for 32-bit signed integers"
     } 

 | xint32 
     { let s = trim_right lexbuf 1 in 
       try INT32(Int32.of_string s) with _ ->  fail lexbuf "This number is outside the allowable range for 32-bit signed integers" }

 | uint32
     { let n = Int64.of_string (trim_right lexbuf 1) in 
       if n > 0xFFFFFFFFL or n < 0L then fail lexbuf "This number is outside the allowable range for 32-bit unsigned integers";
       UINT32(Nums.u64_to_u32 (Nums.i64_to_u64 n)) } 

 | uint32l
     { let n = Int64.of_string (trim_right lexbuf 2) in 
       if n > 0xFFFFFFFFL or n < 0L then 
           fail lexbuf "This number is outside the allowable range for 32-bit unsigned integers";
       UINT32(Nums.u64_to_u32 (Nums.i64_to_u64 n)) } 

 | int64 
     { let s = trim_right lexbuf 1 in 
       (* Allow <max_int+1> to parse as min_int.  Stupid but allowed because we parse '-' as an operator. *)
       if s = "9223372036854775808" then INT64(Int64.min_int )
       else INT64(Int64.of_string (s)) }

 | uint64     { UINT64(Nums.i64_to_u64 (Int64.of_string (trim_right lexbuf 2))) }

 | nativeint  { NATIVEINT(Int64.of_string (trim_right lexbuf 1)) }
 | unativeint { UNATIVEINT(Nums.i64_to_u64 (Int64.of_string (trim_right lexbuf 2))) }
 | ieee32     { IEEE32 (Nums.string_to_ieee32(trim_right lexbuf 1)) }
 | ieee64     { IEEE64 (Nums.float_to_ieee64 (float_of_string (ulexeme lexbuf))) }
(*  | ieee64d    { IEEE64 (Nums.float_to_ieee64 (float_of_string (trim_right lexbuf 1))) } *)
 | decimal    
     { try 
          let s = trim_right lexbuf 1 in
          (* This is an approximate range check for decimal literals *)
          let f = float_of_string s in 
          if f >= 7.922816252e+28 or f <= -7.922816252e+28 then 
              fail lexbuf "This number is outside the allowable range for decimal literals";
          DECIMAL (Bytes.string_as_unicode_bytes s) 
       with 
          e -> errorRecoveryPoint(e); DECIMAL (Bytes.string_as_unicode_bytes "0")
       
     }
 | xieee32     { let n = Int64.of_string (trim_right lexbuf 2) in 
                 if n > 0xFFFFFFFFL or n < 0L then fail lexbuf "This number is outside the allowable range for 32-bit floats";
                 IEEE32 (Nums.bits_to_ieee32 (Nums.u32_to_i32 (Nums.u64_to_u32 (Nums.i64_to_u64 n)))) }
 | xieee64     { IEEE64 (Nums.bits_to_ieee64 (Int64.of_string (trim_right lexbuf 2))) }
 | bigint     { BIGINT (Bytes.string_as_unicode_bytes (trim_right lexbuf 1)) }
 | bignum     { BIGNUM (Bytes.string_as_unicode_bytes (trim_right lexbuf 1)) }

 | (int | xint | float) ident_char+
       { fail lexbuf "This is not a valid numeric literal. Sample formats include 4, 0x4, 0b0100, 4L, 4UL, 4u, 4s, 4us, 4y, 4uy, 4.0, 4.0f, 4N, 4I" }
 
 | char
     { let s = ulexeme lexbuf in 
       CHAR (Nums.u16_to_unichar (char_to_u16 (if s.[1] = '\\' then escape s.[2] else s.[1])))  }
 | char 'B' 
     { let s = ulexeme lexbuf in 
       UINT8 (Nums.int_to_u8 (Char.code (if s.[2] = '\\' then escape s.[3] else s.[2])))  }
     
 | '\'' trigraph '\''
     { let s = ulexeme lexbuf in 
       CHAR (Nums.u16_to_unichar (char_to_u16 (trigraph s.[2] s.[3] s.[4]))) }
     
 | '\'' hexgraph_short '\'' { CHAR (Nums.u16_to_unichar (hexgraph_short (trim_both lexbuf 3 1))) }
 | '\'' unicodegraph_short '\'' { CHAR (Nums.u16_to_unichar (unicodegraph_short (trim_both lexbuf 3 1))) }
 | '\'' unicodegraph_long '\''  { LEX_FAILURE "This unicode encoding is only valid in string literals" }
 | "(*IF-FSHARP"    { skipToken skip (COMMENT AT_token) (token args) lexbuf }
 | "(*F#"           { skipToken skip (COMMENT AT_token) (token args) lexbuf }
 | "ENDIF-FSHARP*)" { skipToken skip (COMMENT AT_token) (token args) lexbuf  }
 | "F#*)"           { skipToken skip (COMMENT AT_token) (token args) lexbuf }

 | "(*"
     { let m = get_lex_range  lexbuf in 
       skipToken skip (COMMENT (AT_comment(1,m))) (comment(1,m,args)) lexbuf }

 | "(*IF-CAML*)" |  "(*IF-OCAML*)" 
     { let m = get_lex_range  lexbuf in 
       skipToken skip (COMMENT (AT_camlonly(m))) (camlonly m args) lexbuf }

 | '"' 
     { let buf,fin,m = start_string lexbuf in 
       skipToken skip (STRING_TEXT (AT_string(m))) (string (buf,fin,m,args)) lexbuf }
      
 | '@' '"' 
     { let buf,fin,m = start_string lexbuf in 
       skipToken skip (STRING_TEXT (AT_vstring m)) (vstring (buf,fin,m,args)) lexbuf }

 | truewhite+  
     { skipToken skip (WHITESPACE AT_token) (token args) lexbuf }

 | offwhite+  
     { if usingLightSyntax(args.lightSyntaxStatus) then warning(Error("TABs are not allowed in #light code",get_lex_range lexbuf));
       skipToken skip (WHITESPACE AT_token) (token args) lexbuf }

 | "////"  [^'\n''\r']* 
     { (* 4+ slash are 1-line comments, online 3 slash are XMLDoc *)
       skipToken skip (LINE_COMMENT AT_token) (token args) lexbuf }

 | ("///" [^'\n''\r']*)
     { (* Match exactly 3 slash, 4+ slash caught by preceeding rule *)
       let doc = trim_left lexbuf 3 in 
       saveXMLDoc doc; (* see def in pars.mly *)
       skipToken skip (LINE_COMMENT AT_token) (token args) lexbuf }

 | ('(' '*' '*' [^'*']* '*' ')')
     { (* Allow OCamlDoc like comments over multi lines *)
       let doc = trim_both lexbuf 3 2 in 
       saveXMLDoc doc; (* see def in pars.mly *)
       skipToken skip (LINE_COMMENT AT_token) (token args) lexbuf }

 | "//"  [^'\n''\r']* 
     { skipToken skip (LINE_COMMENT AT_token) (token args) lexbuf  }

 | newline 
     { newline lexbuf; skipToken skip (WHITESPACE AT_token) (token args) lexbuf }

 | '`' '`' ([^'`' '\n' '\r' '\t'] | '`' [^'`''\n' '\r' '\t']) + '`' '`' 
     { IDENT (trim_both lexbuf 2 2) }

 | '#' anywhite* digit+ anywhite* ('@'? "\"" [^'\n''\r''"']+ '"')? anywhite* newline
     {  let pos = lexeme_end_p lexbuf in 
        let lnum = pos.pos_lnum in 
        if skip then 
          let s = ulexeme lexbuf in 
          let rec leadws n = match s.[n] with 
            ' ' | '\t' -> leadws (n+1) 
          | _ -> line n n 
          and line start n = match s.[n] with 
            '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' -> line start (n+1)
          | _ -> int_of_string (String.sub s start (n-start)), filews n
          and filews n =  match s.[n] with 
            ' ' | '\t' | '@' -> filews (n+1) 
          | '"' -> Some (file (n+1) (n+1))
          | _ -> None
          and file start n =   match s.[n] with 
            '"' -> String.sub s start (n-start)  
          | _ -> file start (n+1) in 
          let line,file = leadws 1 in 
           set_pos lexbuf {pos with
                pos_fname = (match file with Some f -> encode_file_idx (file_idx_of_file f) | None -> pos.pos_fname); 
                pos_bol= pos.pos_cnum;
                pos_lnum=line };
           token args skip lexbuf 
        else 
          skipToken skip (WHITESPACE AT_token) (token args) lexbuf }
 | "<@" op_char* { let s = trim_left lexbuf 2 in LQUOTE (Printf.sprintf "<@%s %s@>" s (rev_string s), quote_op_is_raw s) }
 | op_char* "@>" { let s = trim_right lexbuf 2 in RQUOTE (Printf.sprintf "<@%s %s@>" (rev_string s) s, quote_op_is_raw (rev_string s)) }
 | '#' { HASH }
 | "#use" { HASH_KEYWORD "use" }
 | '&' { AMP }
 | "&&" { AMP_AMP }
 | "||" { BAR_BAR }
 | '\'' { QUOTE }
 | '(' { LPAREN }
 | ')' { RPAREN }
 | '*' { STAR }
 | ',' { COMMA }
 | "->" { RARROW }
 | "->>" { RARROW2 }
 | "?" { QMARK }
 | "??" { QMARK_QMARK }
 | ".." { DOT_DOT }
 | "." { DOT }
 | ":" { COLON }
 | "::" { COLON_COLON }
 | ":>" { COLON_GREATER }
 | ">." { GREATER_DOT }
 | ":?>" { COLON_QMARK_GREATER }
 | ":?" { COLON_QMARK }
 | ":=" { COLON_EQUALS }
 | ";;" { SEMICOLON_SEMICOLON }
 | ";" { SEMICOLON }
 | "<-" { LARROW }
 | "=" { EQUALS }
 | "[" { LBRACK }
 | "[|" { LBRACK_BAR }
 | "<" { LESS }
 | ">" { GREATER }
 | "[<" { LBRACK_LESS }
 | "]" { RBRACK }
 | "|]" { BAR_RBRACK }
 | ">]" { GREATER_RBRACK }
 | "{" { LBRACE }
 | "|" { BAR }
 | "}" { RBRACE }
 | "$" { DOLLAR }
 | "%" { PERCENT_OP("%") }
 | "%%" { PERCENT_OP("%%") }
 | "-" { MINUS }
 | "~" { RESERVED }
 | "`" { RESERVED }
 | ignored_op_char* '*' '*'                    op_char* { check_op lexbuf; INFIX_STAR_STAR_OP(ulexeme lexbuf) }
 | ignored_op_char* ('*' | '/'|'%')            op_char* { check_op lexbuf; INFIX_STAR_DIV_MOD_OP(ulexeme lexbuf) }
 | ignored_op_char* ('+'|'-')                  op_char* { check_op lexbuf; PLUS_MINUS_OP(ulexeme lexbuf) }
 | ignored_op_char* ('@'|'^')                  op_char* { check_op lexbuf; INFIX_AT_HAT_OP(ulexeme lexbuf) }
 | ignored_op_char* ('=' | "!=" | '<' | '>' | '$')  op_char* { check_op lexbuf; INFIX_COMPARE_OP(ulexeme lexbuf) }
 | ignored_op_char* ('&')                      op_char* { check_op lexbuf; INFIX_AMP_OP(ulexeme lexbuf) }
 | ignored_op_char* '|'                        op_char* { check_op lexbuf; INFIX_BAR_OP(ulexeme lexbuf) }
 | ignored_op_char* ('!' | '?' | '~' )         op_char* { check_op lexbuf; PREFIX_OP(ulexeme lexbuf) }
 | plusMinus { PREFIX_OP(ulexeme lexbuf) }
 | ".[]"  | ".[]<-" | ".[,]<-" | ".[,,]<-" | ".[,,]" | ".[,]" | ".[..]" | ".[..,..]" 
 | ".()"  | ".()<-" | ".(,)<-" | ".(,,)<-" | ".(,,)" | ".(,)" | ".(..)" | ".(..,..)" { FUNKY_OPERATOR_NAME(ulexeme lexbuf) }
 
 (* NOTE: This is the BOM (Byte Order Mark) that indicates we have UTF-8 file.  We can safely ignore this.  *)
 (* since for the moment we support both ASCII and UTF-8.  *)
 (* For more on BOMs see for example http://www.unicode.org/faq/utf_bom.html#22 *)
 (* NOTE: This assumes people will only work with ASCII or UTF-8 encodings.  Very very very naughty. *)
 |  '\239' '\187' '\191'  { skipToken skip (WHITESPACE AT_token) (token args) lexbuf }

 | "#light" anywhite* 
   { setLightSyntax(args.lightSyntaxStatus); skipToken skip (WHITESPACE AT_token) (token args) lexbuf } 
   
 | "#if" anywhite+ ident
    {
      let s = ulexeme lexbuf in 
      let rec leadws n = match s.[n] with 
          | ' ' | '\t' -> leadws (n+1) 
          | _ -> ident n
      and ident n = String.sub s n (String.length s-n) in 
      let id = leadws 3 in 
      let m = get_lex_range lexbuf in 
      args.ifdefStack := IfDefIf :: !(args.ifdefStack);
      if List.mem id args.defines
      then (
        skipToken skip (WHITESPACE AT_token) (token args) lexbuf
      ) else (
        skipToken skip (WHITESPACE (AT_ifdef_skip (0,m))) (ifdef_skip (0,m,args)) lexbuf 
      ) }

 | "#else"
   { match !(args.ifdefStack) with
     | []->  LEX_FAILURE "#else has no matching #if" 
     | IfDefElse :: rest -> LEX_FAILURE "#endif required for #else" 
     | IfDefIf :: rest -> 
       let m = get_lex_range  lexbuf in 
       args.ifdefStack := IfDefElse :: rest;
       skipToken skip (WHITESPACE (AT_ifdef_skip (0,m))) (ifdef_skip (0,m,args)) lexbuf }

 | "#endif" 
   { match !(args.ifdefStack) with
     | []->  LEX_FAILURE "#endif has no matching #if" 
     | _ :: rest -> 
        args.ifdefStack := rest;  
        skipToken skip (WHITESPACE AT_token) (token args) lexbuf }
 | _ { unexpected_char lexbuf }     
 | eof { EOF AT_token }


and ifdef_skip args skip = parse                          
  | "#if" anywhite+ ident
     { let (n,m,args) = args in 
       skipToken skip (WHITESPACE (AT_ifdef_skip (n+1,m))) (ifdef_skip (n+1,m,args)) lexbuf }
  | "#else"
     { let (n,m,args) = args in 
       if n = 0 then 
         match !(args.ifdefStack) with
         | []->  LEX_FAILURE "#else has no matching #if" 
         | IfDefElse :: rest -> LEX_FAILURE "#endif required for #else" 
         | IfDefIf :: rest -> 
           let m = get_lex_range  lexbuf in 
           args.ifdefStack := IfDefElse :: rest;
           skipToken skip (WHITESPACE AT_token) (token args) lexbuf 
       else
         skipToken skip (WHITESPACE (AT_ifdef_skip (n,m))) (ifdef_skip (n,m,args)) lexbuf }
  | "#endif" 
     { let (n,m,args) = args in 
       if n = 0 then 
         match !(args.ifdefStack) with
         | []->  LEX_FAILURE "#endif has no matching #if" 
         | _ :: rest -> 
            args.ifdefStack := rest;
            skipToken skip (WHITESPACE AT_token) (token args) lexbuf 
       else
         skipToken skip (WHITESPACE (AT_ifdef_skip (n-1,m))) (ifdef_skip (n-1,m,args)) lexbuf }
  | newline 
     { let (n,m,args) = args in 
       newline lexbuf; ifdef_skip (n,m,args) skip lexbuf }
  | _    
     { let (n,m,args) = args in 
       skipToken skip (WHITESPACE (AT_ifdef_skip (n,m))) (ifdef_skip (n,m,args)) lexbuf }
  | eof  
     { let (n,m,args) = args in 
       EOF (AT_ifdef_skip(n,m)) }

(* NOTE : OCaml doesn't take tailcalls for functions > ~5 arguments.  Sheesh. *)
(* Hence we have to wrap up arguments for deeply nested *)
(* recursive call targets such as this one *)
and string sargs skip = parse
 |  '\\' newline anywhite* 
    { let (buf,fin,m,args) = sargs in 
      newline lexbuf; 
      skipToken skip (STRING_TEXT (AT_string(m)))  (string sargs) lexbuf }

 |  escape_char
    { let (buf,fin,m,args) = sargs in 
      add_byte_char buf (escape (lexeme_char lexbuf 1));
      skipToken skip (STRING_TEXT (AT_string(m)))  (string sargs) lexbuf } 

 | trigraph
    (* REVIEW: Disallow these in string sargs constants, at least if > 127, since then *)
    (* they have no established meaning *)
    { let (buf,fin,m,args) = sargs in 
      let s = ulexeme lexbuf in 
      add_byte_char buf (trigraph s.[1] s.[2] s.[3]);
      skipToken skip (STRING_TEXT (AT_string(m)))  (string sargs) lexbuf }

 | hexgraph_short
    { let (buf,fin,m,args) = sargs in 
      add_unichar buf (hexgraph_short (trim_left lexbuf 2));
      skipToken skip (STRING_TEXT (AT_string(m)))  (string sargs) lexbuf  }
      
 | unicodegraph_short
    (* REVIEW: Disallow these in bytearray constants *)
    { let (buf,fin,m,args) = sargs in 
      add_unichar buf (unicodegraph_short (trim_left lexbuf 2));
      skipToken skip (STRING_TEXT (AT_string(m)))  (string sargs) lexbuf  }
     
 | unicodegraph_long
    { let (buf,fin,m,args) = sargs in 
      let hi,lo = unicodegraph_long (trim_left lexbuf 2) in 
      (match hi with | None -> () | Some c -> add_unichar buf c);
      add_unichar buf lo;
      skipToken skip (STRING_TEXT (AT_string(m))) (string sargs) lexbuf  }
     
 |  '"' 
    { let (buf,fin,m,args) = sargs in 
      let m2 = get_lex_range lexbuf in 
      call_string_finish fin buf m2 false }

 |  '"''B' 
    { let (buf,fin,m,args) = sargs in 
      let m2 = get_lex_range lexbuf in 
      call_string_finish fin buf m2 true }

 | newline
    { let (buf,fin,m,args) = sargs in 
      newline lexbuf; 
      add_string buf "\n"; 
      skipToken skip (STRING_TEXT (AT_string(m)))  (string sargs) lexbuf }

 | anywhite +  
    { let (buf,fin,m,args) = sargs in 
      add_string buf (ulexeme lexbuf); 
      skipToken skip (STRING_TEXT (AT_string(m)))  (string sargs) lexbuf }

 | (op_char | letter | '_' | digit) +  
    { let (buf,fin,m,args) = sargs in 
      add_string buf (ulexeme lexbuf); 
      skipToken skip (STRING_TEXT (AT_string(m)))  (string sargs) lexbuf }

 | utf8_twoB 
    { let (buf,fin,m,args) = sargs in 
      add_int_char buf (get_utf8_two lexbuf);
      skipToken skip (STRING_TEXT (AT_string(m))) (string sargs) lexbuf }    

 | utf8_three 
    { let (buf,fin,m,args) = sargs in 
      add_int_char buf (get_utf8_three lexbuf);
      skipToken skip (STRING_TEXT (AT_string(m))) (string sargs) lexbuf }    

 | utf8_four
    { let (buf,fin,m,args) = sargs in 
      add_int_char buf (get_utf8_four lexbuf);
      skipToken skip (STRING_TEXT (AT_string(m))) (string sargs) lexbuf }    

 | utf8_five
    { let (buf,fin,m,args) = sargs in 
      add_int_char buf (get_utf8_five lexbuf);
      skipToken skip (STRING_TEXT (AT_string(m))) (string sargs) lexbuf }    

 | utf8_six
    { let (buf,fin,m,args) = sargs in 
      add_int_char buf (get_utf8_six lexbuf);
      skipToken skip (STRING_TEXT (AT_string(m))) (string sargs) lexbuf }    

 | eof  
    { let (buf,fin,m,args) = sargs in 
      EOF (AT_string(m)) }
 | _ 
    { let (buf,fin,m,args) = sargs in 
      add_byte_char buf (lexeme_char lexbuf 0); 
      skipToken skip (STRING_TEXT (AT_string(m))) (string sargs) lexbuf }

(* REVIEW: consider sharing this code with the 'string' lexer state *)
and vstring sargs skip = parse
 |  '"' '"'
   { let (buf,fin,m,args) = sargs in 
     add_byte_char buf '\"';
     skipToken skip (STRING_TEXT (AT_vstring(m)))  (vstring sargs) lexbuf } 

 |  '"' 
    { let (buf,fin,m,args) = sargs in 
      let m2 = get_lex_range lexbuf in 
      call_string_finish fin buf m2 false }

 |  '"''B' 
    { let (buf,fin,m,args) = sargs in 
      let m2 = get_lex_range lexbuf in 
      call_string_finish fin buf m2 true }

 | newline 
    { let (buf,fin,m,args) = sargs in 
      newline lexbuf; 
      add_string buf "\n"; 
      skipToken skip (STRING_TEXT (AT_vstring(m)))  (vstring sargs) lexbuf }

 | (op_char | anywhite | letter | digit) +  
    { let (buf,fin,m,args) = sargs in 
      add_string buf (ulexeme lexbuf); 
      skipToken skip (STRING_TEXT (AT_vstring(m)))  (vstring sargs) lexbuf }

 | utf8_twoB 
    { let (buf,fin,m,args) = sargs in 
      add_int_char buf (get_utf8_two lexbuf);
      skipToken skip (STRING_TEXT (AT_vstring(m))) (vstring sargs) lexbuf }    

 | utf8_three 
    { let (buf,fin,m,args) = sargs in 
      add_int_char buf (get_utf8_three lexbuf);
      skipToken skip (STRING_TEXT (AT_vstring(m))) (vstring sargs) lexbuf }    

 | utf8_four
    { let (buf,fin,m,args) = sargs in 
      add_int_char buf (get_utf8_four lexbuf);
      skipToken skip (STRING_TEXT (AT_vstring(m))) (vstring sargs) lexbuf }    

 | utf8_five
    { let (buf,fin,m,args) = sargs in 
      add_int_char buf (get_utf8_five lexbuf);
      skipToken skip (STRING_TEXT (AT_vstring(m))) (vstring sargs) lexbuf }    

 | utf8_six
    { let (buf,fin,m,args) = sargs in 
      add_int_char buf (get_utf8_six lexbuf);
      skipToken skip (STRING_TEXT (AT_vstring(m))) (vstring sargs) lexbuf }    

 | eof 
    { let (buf,fin,m,args) = sargs in 
      EOF (AT_vstring(m)) }
 | _ 
    { let (buf,fin,m,args) = sargs in 
      add_byte_char buf (lexeme_char lexbuf 0); 
      skipToken skip (STRING_TEXT (AT_vstring(m))) (vstring sargs) lexbuf }

(* WARNING: taking sargs as a single parameter seems to make a difference as to whether *)
(* OCaml takes tailcalls. *)
and comment cargs skip = parse
 |  char
    { let n,m,args = cargs in 
      skipToken skip (COMMENT (AT_comment(n,m))) (comment(n,m,args)) lexbuf  } 
    
 | '"'   
    { let n,m,args = cargs in 
      skipToken skip (COMMENT (AT_comment_string(n,m))) (comment_string n m args) lexbuf }

 | '@' '"'
    { let n,m,args = cargs in 
      skipToken skip (COMMENT (AT_comment_string(n,m))) (comment_vstring n m args) lexbuf }

 | '(' '*'
    { let n,m,args = cargs in 
      skipToken skip (COMMENT (AT_comment(n+1,m))) (comment (n+1,m,args)) lexbuf }
     
 | newline
    { let n,m,args = cargs in 
      newline lexbuf; 
      skipToken skip (COMMENT (AT_comment(n,m))) (comment cargs) lexbuf }
 | "*)" 
    { 
      let n,m,args = cargs in 
      if n > 1 then skipToken skip (COMMENT (AT_comment(n-1,m))) (comment (n-1,m,args)) lexbuf 
      else skipToken skip (COMMENT AT_token) (token args) lexbuf }
      
 | [^ '\'' '(' '*' '\n' '\r' '"' ')' '@' ]+  
    { 
      let n,m,args = cargs in 
      skipToken skip (COMMENT (AT_comment(n,m))) (comment cargs) lexbuf }
    
 | eof 
     { 
       let n,m,args = cargs in 
       (if verbose then Printf.printf "eof ...\n"; flush stdout);
       EOF (AT_comment(n,m)) }
     
 | _ { let n,m,args = cargs in 
       skipToken skip (COMMENT (AT_comment(n,m))) (comment(n,m,args)) lexbuf }

and comment_string n m args skip = parse
 (* Follow string lexing, skipping tokens until it finishes *)
 | '\\' '"'  
     { skipToken skip (COMMENT (AT_comment_string(n,m))) (comment_string n m args) lexbuf }

 | '"' 
     { skipToken skip (COMMENT (AT_comment(n,m))) (comment(n,m,args)) lexbuf }
     
 | newline 
     { newline lexbuf;  
       skipToken skip (COMMENT (AT_comment_string(n,m))) (comment_string n m args) lexbuf }
     
 | [^ '\\' '(' '*' '\n' '\r' '"' ')' ]+  
     { skipToken skip (COMMENT (AT_comment_string(n,m))) (comment_string n m args) lexbuf }
     
 | eof 
     { EOF (AT_comment_string (n,m)) }
     
 | _  
     { skipToken skip (COMMENT (AT_comment_string(n,m))) (comment_string n m args) lexbuf }

and comment_vstring n m args skip = parse
 (* Follow vstring lexing, in short, skip double-quotes and other chars until we hit a single quote *)
 | '"' '"'
     { skipToken skip (COMMENT (AT_comment_string(n,m))) (comment_vstring n m args) lexbuf }

 | '"' 
     { skipToken skip (COMMENT (AT_comment(n,m))) (comment(n,m,args)) lexbuf }
     
 | newline 
     { newline lexbuf;
       skipToken skip (COMMENT (AT_comment_string(n,m))) (comment_vstring n m args) lexbuf }
     
 | eof 
     { EOF (AT_comment_string (n,m)) }
     
 | _  
     { skipToken skip (COMMENT (AT_comment_string(n,m))) (comment_vstring n m args) lexbuf }
     
and camlonly m args skip = parse
 | "\""
     { let buf = Bytes.Bytebuf.create 100 in 
       let m2 = get_lex_range  lexbuf in 
       let _ = string (buf,default_string_finish,m2,args) skip lexbuf in  
       skipToken skip (COMMENT (AT_camlonly(m))) (camlonly m args) lexbuf }
 | newline { newline lexbuf;  skipToken skip (COMMENT (AT_camlonly(m))) (camlonly m args) lexbuf }
 | "(*ENDIF-CAML*)"  {  skipToken skip (COMMENT AT_token) (token args) lexbuf }
 | "(*ENDIF-OCAML*)" {  skipToken skip (COMMENT AT_token) (token args) lexbuf }
 | [^ '(' '"' '\n' '\r' ]+  { skipToken skip (COMMENT (AT_camlonly(m))) (camlonly m args) lexbuf }
 | eof { EOF (AT_camlonly (m)) }
 | _ {  skipToken skip (COMMENT (AT_camlonly(m))) (camlonly m args) lexbuf }
