(* Preprocessor for C#, to be processed by [ocamllex]. *)

{
(* STRING PROCESSING *)

(* The value of [mk_str len p] ("make string") is a string of length
   [len] containing the [len] characters in the list [p], in reverse
   order. For instance, [mk_str 3 ['c';'b';'a'] = "abc"]. *)

 let mk_str (len: int) (p: char list) : string =
  let () = assert (len = List.length p) in
  let bytes = Bytes.make len ' ' in
  let rec fill i = function
         [] -> bytes
  | char::l -> Bytes.set bytes i char; fill (i-1) l
  in fill (len-1) p |> Bytes.to_string

(* The call [explode s a] is the list made by pushing the characters
   in the string [s] on top of [a], in reverse order. For example,
   [explode "ba" ['c';'d'] = ['a'; 'b'; 'c'; 'd']]. *)

let explode s acc =
  let rec push = function
    0 -> acc
  | i -> s.[i-1] :: push (i-1)
in push (String.length s)

(* ERROR HANDLING *)

let stop msg seg = raise (Error.Lexer (msg, seg,1))
let fail msg buffer = stop msg (Error.mk_seg buffer)

exception Local_err of Error.message

let handle_err scan buffer =
  try scan buffer with Local_err msg -> fail msg buffer

(* LEXING ENGINE *)

(* Copying the current lexeme to [stdout] *)

let copy buffer = print_string (Lexing.lexeme buffer)

(* End of lines *)

let handle_nl buffer = Lexing.new_line buffer; copy buffer


(* C# PREPROCESSOR DIRECTIVES *)

(* The type [mode] defines the two scanning modes of the preprocessor:
   either we copy the current characters or we skip them. *)

type mode = Copy | Skip

(* Trace of directives

   We keep track of directives #if, #elif, #else, #region and #endregion.
*)

type cond = If of mode | Elif of mode | Else | Region
type trace = cond list

(* The function [reduce_cond] is called when a #endif directive is
   found, and the trace (see type [trace] above) needs updating. *)

let rec reduce_cond seg = function
              [] -> stop "Dangling #endif." seg
| If mode::trace -> trace, mode
|      Region::_ -> stop "Invalid scoping of #region" seg
|       _::trace -> reduce_cond seg trace

(* The function [reduce_reg] is called when a #endregion directive is
   read, and the trace needs updating. *)

let reduce_reg seg = function
             [] -> stop "Dangling #endregion." seg
| Region::trace -> trace
|             _ -> stop "Invalid scoping of #endregion" seg

(* The function [extend] is called when encountering conditional
   directives #if, #else and #elif. As its name suggests, it extends
   the current trace with the current conditional directive, whilst
   performing some validity checks. *)

let extend seg cond trace =
  match cond, trace with
    If _, Elif _::_ ->
      stop "Directive #if cannot follow #elif." seg
  | Else,   Else::_ ->
      stop "Directive #else cannot follow #else." seg
  | Else,        [] ->
      stop "Dangling #else." seg
  | Elif _, Else::_ ->
      stop "Directive #elif cannot follow #else." seg
  | Elif _,      [] ->
      stop "Dangling #elif." seg
  |               _ -> cond::trace

(* The function [last_mode] seeks the last mode as recorded in the
   trace (see type [trace] above). *)

let rec last_mode = function
                        [] -> assert false
| (If mode | Elif mode)::_ -> mode
|                 _::trace -> last_mode trace

(* Line offsets

   The value [Inline] of type [offset] means that the current location
   cannot be reached from the start of the line with only white
   space. The same holds for the special value [Prefix 0]. Values of
   the form [Prefix n] mean that the current location can be reached
   from the start of the line with [n] white spaces (padding). These
   distinctions are needed because preprocessor directives cannot
   occur inside lines.
*)

type offset = Prefix of int | Inline

let expand = function
  Prefix 0 | Inline -> ()
| Prefix n -> print_string (String.make n ' ')

(* Directives *)

let directives = [
  "if"; "else"; "elif"; "endif"; "define"; "undef";
  "error"; "warning"; "line"; "region"; "endregion";
  "include"]

(* Environments and preprocessor expressions

   The evaluation of conditional directives may involve symbols whose
   value may be defined using #define directives, or undefined by
   means of #undef. Therefore, we need to evaluate conditional
   expressions in an environment made of a set of defined symbols.

   Note that we rely on an external lexer and parser for the
   conditional expressions. See modules [Escan] and [Eparser].
*)

module Env = Set.Make(String)

let rec eval env =
  let open Etree
in function
   Or (e1,e2) -> eval env e1 || eval env e2
| And (e1,e2) -> eval env e1 && eval env e2
|  Eq (e1,e2) -> eval env e1 = eval env e2
| Neq (e1,e2) -> eval env e1 != eval env e2
|       Not e -> not (eval env e)
|        True -> true
|       False -> false
|    Ident id -> Env.mem id env

let expr env buffer =
  let tree = Eparser.pp_expression Escan.token buffer
in if eval env tree then Copy else Skip

(* END OF HEADER *)
}

(* REGULAR EXPRESSIONS *)

(* White space *)

let nl = '\n' | '\r' | "\r\n"
let blank = ' ' | '\t'

(* Integers *)

let int_suf = 'U' | 'u' | 'L' | 'l' | "UL" | "Ul" | "uL"
            | "ul" | "LU" | "Lu" | "lU" | "lu"
let digit = ['0'-'9']
let dec = digit+ int_suf?
let hexdigit = digit | ['A'-'F' 'a'-'f']
let hex_pre = "0x" | "0X"
let hexa = hex_pre hexdigit+ int_suf?
let integer = dec | hexa

(* Unicode escape sequences *)

let four_hex = hexdigit hexdigit hexdigit hexdigit
let uni_esc = "\\u" four_hex | "\\U"  four_hex four_hex

(* Identifiers *)

let lowercase = ['a'-'z']
let uppercase = ['A'-'Z']
let letter = lowercase | uppercase | uni_esc
let start = '_' | letter
let alphanum = letter | digit | '_'
let ident = start alphanum*

(* Real *)

let decimal = digit+
let exponent = ['e' 'E'] ['+' '-']? decimal
let real_suf = ['F' 'f' 'D' 'd' 'M' 'm']
let real = (decimal? '.')? decimal exponent? real_suf?

(* Characters *)

let single = [^ '\n' '\r']
let esc = "\\'" | "\\\"" | "\\\\" | "\\0" | "\\a" | "\\b" | "\\f"
        | "\\n" | "\\r" | "\\t" | "\\v"
let hex_esc = "\\x" hexdigit hexdigit? hexdigit? hexdigit?
let character = single | esc | hex_esc | uni_esc
let char = "'" character "'"

(* Directives *)

let directive = '#' (blank* as space) (ident as id)

(* Rules *)

(* The rule [scan] scans the input buffer for directives, strings,
   comments, blanks, new lines and end of file characters. As a
   result, either the matched input is copied to [stdout] or not,
   depending on the compilation directives. If not copied, new line
   characters are output.

   Scanning is triggered by the function call [scan env mode offset
   trace lexbuf], where [env] is the set of defined symbols
   (introduced by `#define'), [mode] specifies whether we are copying
   or skipping the input, [offset] informs about the location in the
   line (either there is a prefix of blanks, or at least a non-blank
   character has been read), and [trace] is the stack of conditional
   directives read so far.

   The first call is [scan Env.empty Copy (Prefix 0) []], meaning that
   we start with an empty environment, that copying the input is
   enabled by default, and that we are at the start of a line and no
   previous conditional directives have been read yet.

   When an "#if" is matched, the trace is extended by the call [extend
   lexbuf (If mode) trace], during the evaluation of which the
   syntactic validity of having encountered an "#if" is checked (for
   example, it would be invalid had an "#elif" been last read). Note
   that the current mode is stored in the trace with the current
   directive -- that mode may be later restored (see below for some
   examples). Moreover, the directive would be deemed invalid if its
   current position in the line (that is, its offset) were not
   preceeded by blanks or nothing, otherwise the rule [expr] is called
   to scan the boolean expression associated with the "#if": if it
   evaluates to [true], the result is [Copy], meaning that we may copy
   what follows, otherwise skip it -- the actual decision depending on
   the current mode. That new mode is used if we were in copy mode,
   and the offset is reset to the start of a new line (as we read a
   new line in [expr]); otherwise we were in skipping mode and the
   value of the conditional expression must be ignored (but not its
   syntax), and we continue skipping the input.

   When an "#else" is matched, the trace is extended with [Else],
   then, if the directive is not at a wrong offset, the rest of the
   line is scanned with [pp_newline]. If we were in copy mode, the new
   mode toggles to skipping mode; otherwise, the trace is searched for
   the last encountered "#if" of "#elif" and the associated mode is
   restored.

   The case "#elif" is the result of the fusion (in the technical
   sense) of the code for dealing with an "#else" followed by an
   "#if".

   When an "#endif" is matched, the trace is reduced, that is, all
   conditional directives are popped until an [If mode'] is found and
   [mode'] is restored as the current mode.

   Consider the following four cases, where the modes (Copy/Skip) are
   located between the lines:

                    Copy ----+                          Copy ----+
   #if true                  |       #if true                    |
                    Copy     |                          Copy     |
   #else                     |       #else                       |
                +-- Skip --+ |                      +-- Skip --+ |
     #if true   |          | |         #if false    |          | |
                |   Skip   | |                      |   Skip   | |
     #else      |          | |         #else        |          | |
                +-> Skip   | |                      +-> Skip   | |
     #endif                | |         #endif                  | |
                    Skip <-+ |                          Skip <-+ |
   #endif                    |       #endif                      |
                    Copy <---+                          Copy <---+


                +-- Copy ----+                          Copy --+-+
   #if false    |            |       #if false                 | |
                |   Skip     |                          Skip   | |
   #else        |            |       #else                     | |
                +-> Copy --+ |                    +-+-- Copy <-+ |
     #if true              | |         #if false  | |            |
                    Copy   | |                    | |   Skip     |
     #else                 | |         #else      | |            |
                    Skip   | |                    | +-> Copy     |
     #endif                | |         #endif     |              |
                    Copy <-+ |                    +---> Copy     |
   #endif                    |       #endif                      |
                    Copy <---+                          Copy <---+

   The following four cases feature #elif. Note that we put between
   brackets the mode saved for the #elif, which is sometimes restored
   later.

                    Copy --+                            Copy --+
   #if true                |         #if true                  |
                    Copy   |                            Copy   |
   #elif true   +--[Skip]  |         #elif false    +--[Skip]  |
                |   Skip   |                        |   Skip   |
   #else        |          |         #else          |          |
                +-> Skip   |                        +-> Skip   |
   #endif                  |         #endif                    |
                    Copy <-+                            Copy <-+


                +-- Copy --+-+                      +-- Copy ----+
   #if false    |          | |       #if false      |            |
                |   Skip   | |                      |   Skip     |
   #elif true   +->[Copy]  | |       #elif false    +->[Copy]--+ |
                    Copy <-+ |                          Skip   | |
   #else                     |       #else                     | |
                    Skip     |                          Copy <-+ |
   #endif                    |       #endif                      |
                    Copy <---+                          Copy <---+

   Note how "#elif" indeed behaves like an "#else" followed by an
   "#if", and the mode stored with the data constructor [Elif]
   corresponds to the mode before the virtual "#if".

   Important note: Comments and strings are recognised as such only in
   copy mode, which is a different behaviour from the preprocessor of
   GNU GCC, which always does.
*)

rule scan env mode offset trace = parse
  nl    { handle_nl lexbuf;
          scan env mode (Prefix 0) trace lexbuf }
| blank { match offset with
            Prefix n -> scan env mode (Prefix (n+1)) trace lexbuf
          |   Inline -> copy lexbuf;
                        scan env mode Inline trace lexbuf }
| directive {
    if   not (List.mem id directives)
    then fail "Invalid preprocessing directive." lexbuf
    else if   offset = Inline
         then fail "Directive invalid inside line." lexbuf
         else let seg = Error.mk_seg lexbuf in
    match id with
      "include" ->
        let curr_line = Lexing.(lexbuf.lex_curr_p.pos_lnum)
        and curr_file = Lexing.(lexbuf.lex_curr_p.pos_fname)
                        |> Filename.basename
        and incl_file = scan_inclusion lexbuf in
        let incl_buffer =
          open_in incl_file |> Lexing.from_channel in
        Printf.printf "# 1 \"%s\" 1\n" incl_file;
        cat incl_buffer;
        Printf.printf "# %i \"%s\" 2\n" (curr_line+1) curr_file;
        scan env mode offset trace lexbuf
    | "if" ->
        let mode' = expr env lexbuf in
        let new_mode = if mode = Copy then mode' else Skip in
        let trace' = extend seg (If mode) trace
        in scan env new_mode (Prefix 0) trace' lexbuf
    | "else" ->
        let () = pp_newline lexbuf in
        let new_mode =
          if mode = Copy then Skip else last_mode trace in
        let trace' = extend seg Else trace
        in scan env new_mode (Prefix 0) trace' lexbuf
    | "elif" ->
        let mode' = expr env lexbuf in
        let trace', new_mode =
          match mode with
            Copy -> extend seg (Elif Skip) trace, Skip
          | Skip -> let old_mode = last_mode trace
                    in extend seg (Elif old_mode) trace,
                       if old_mode = Copy then mode' else Skip
        in scan env new_mode (Prefix 0) trace' lexbuf
    | "endif" ->
        let () = pp_newline lexbuf in
        let trace', new_mode = reduce_cond seg trace
        in scan env new_mode (Prefix 0) trace' lexbuf
    | "define" ->
        let id, seg = ident env lexbuf
        in if id="true" || id="false"
           then let msg = "Symbol \"" ^ id ^ "\" cannot be defined."
                in stop msg seg
           else if Env.mem id env
                then let msg = "Symbol \"" ^ id
                               ^ "\" was already defined."
                     in stop msg seg
                else scan (Env.add id env) mode (Prefix 0) trace lexbuf
    | "undef" ->
        let id, _ = ident env lexbuf
        in scan (Env.remove id env) mode (Prefix 0) trace lexbuf
    | "error" ->
        stop (message [] lexbuf) seg
    | "warning" ->
        let start_p, end_p = seg in
        let msg = message [] lexbuf in
        let open Lexing
        in prerr_endline
             ("Warning at line " ^ string_of_int start_p.pos_lnum
             ^ ", char "
             ^ string_of_int (start_p.pos_cnum - start_p.pos_bol)
             ^ "--" ^ string_of_int (end_p.pos_cnum - end_p.pos_bol)
             ^ ":\n" ^ msg);
           scan env mode (Prefix 0) trace lexbuf
    | "region" ->
        let msg = message [] lexbuf
        in expand offset;
           print_endline ("#" ^ space ^ "region" ^ msg);
           scan env mode (Prefix 0) (Region::trace) lexbuf
    | "endregion" ->
        let msg = message [] lexbuf
        in expand offset;
           print_endline ("#" ^ space ^ "endregion" ^ msg);
           scan env mode (Prefix 0) (reduce_reg seg trace) lexbuf
    | "line" ->
        expand offset;
        print_string ("#" ^ space ^ "line");
        line_ind lexbuf;
        scan env mode (Prefix 0) trace lexbuf
    | _ -> assert false
  }
| eof   { match trace with
            [] -> expand offset; flush stdout; (env, trace)
          |  _ -> fail "Missing #endif." lexbuf }
| '"'   { if mode = Copy then begin
             expand offset; copy lexbuf;
             handle_err in_norm_str lexbuf
          end;
          scan env mode Inline trace lexbuf }
| "@\"" { if mode = Copy then begin
             expand offset; copy lexbuf;
             handle_err in_verb_str lexbuf
          end;
          scan env mode Inline trace lexbuf }
| "//"  { if mode = Copy then begin
             expand offset; copy lexbuf;
             in_line_com mode lexbuf
          end;
          scan env mode Inline trace lexbuf }
| "/*"  { if mode = Copy then begin
             expand offset; copy lexbuf;
             handle_err in_block_com lexbuf
          end;
          scan env mode Inline trace lexbuf }
| _     { if mode = Copy then (expand offset; copy lexbuf);
          scan env mode Inline trace lexbuf }

(* Support for #define and #undef *)

and ident env = parse
  blank* { let r = __ident env lexbuf
           in pp_newline lexbuf; r }

and __ident env = parse
  ident as id { id, Error.mk_seg lexbuf }

(* Line indicator (#line) *)

and line_ind = parse
  blank* as space { print_string space; line_indicator lexbuf }

and line_indicator = parse
  decimal as ind {
    print_string ind;
    end_indicator lexbuf
  }
| ident as id {
    match id with
      "default" | "hidden" ->
        print_endline (id ^ message [] lexbuf)
    | _ -> fail "Invalid line indicator." lexbuf
  }
| nl | eof { fail "Line indicator expected." lexbuf }

and end_indicator = parse
  blank* nl   { copy lexbuf; handle_nl lexbuf }
| blank* eof  { copy lexbuf }
| blank* "//" { copy lexbuf; print_endline (message [] lexbuf) }
| blank+ '"'  { copy lexbuf;
                handle_err in_norm_str lexbuf;
                opt_line_com lexbuf }
| _           { fail "Line comment or blank expected." lexbuf }

and opt_line_com = parse
  nl     { handle_nl lexbuf }
| eof    { copy lexbuf }
| blank+ { copy lexbuf; opt_line_com lexbuf }
| "//"   { print_endline ("//" ^ message [] lexbuf) }

(* New lines and verbatim sequence of characters *)

and pp_newline = parse
  nl     { handle_nl lexbuf }
| blank+ { pp_newline lexbuf }
| "//"   { in_line_com Skip lexbuf }
| _      { fail "Only a single-line comment allowed." lexbuf }

and message acc = parse
  nl     { Lexing.new_line lexbuf;
           mk_str (List.length acc) acc }
| eof    { mk_str (List.length acc) acc }
| _ as c { message (c::acc) lexbuf }

(* Comments *)

and in_line_com mode = parse
  nl  { handle_nl lexbuf }
| eof { flush stdout }
| _   { if mode = Copy then copy lexbuf; in_line_com mode lexbuf }

and in_block_com = parse
  nl   { handle_nl lexbuf; in_block_com lexbuf }
| "*/" { copy lexbuf }
| eof  { raise (Local_err "Unterminated comment.") }
| _    { copy lexbuf; in_block_com lexbuf }

(* Include a file *)

and cat = parse
  eof { () }
| _   { copy lexbuf; cat lexbuf }

(* Included filename *)

and scan_inclusion = parse
  blank+ { scan_inclusion lexbuf                         }
| '"'    { handle_err (in_inclusion [] 0) lexbuf    }

and in_inclusion acc len = parse
  '"'    { mk_str len acc                           }
| nl     { fail "Newline invalid in string." lexbuf }
| eof    { raise (Local_err "Unterminated string.") }
| _ as c { in_inclusion (c::acc) (len+1) lexbuf     }

(* Strings *)

and in_norm_str = parse
  "\\\"" { copy lexbuf; in_norm_str lexbuf }
| '"'    { copy lexbuf }
| nl     { fail "Newline invalid in string." lexbuf }
| eof    { raise (Local_err "Unterminated string.") }
| _      { copy lexbuf; in_norm_str lexbuf }

and in_verb_str = parse
  "\"\"" { copy lexbuf; in_verb_str lexbuf }
| '"'    { copy lexbuf }
| nl     { handle_nl lexbuf; in_verb_str lexbuf }
| eof    { raise (Local_err "Unterminated string.") }
| _      { copy lexbuf; in_verb_str lexbuf }

{
(* The function [lex] is a wrapper of [scan], which also checks that
   the trace is empty at the end.  Note that we discard the
   environment at the end. *)

let lex buffer =
  let _env, trace = scan Env.empty Copy (Prefix 0) [] buffer
in assert (trace = [])

(* Exported definitions *)

type filename = string

let trace (name: filename) : unit =
  match open_in name with
    cin ->
    let open Lexing in
    let buffer = from_channel cin in
    let pos_fname = Filename.basename name in
    let () = buffer.lex_curr_p <- {buffer.lex_curr_p with pos_fname} in
    let open Error
      in (try lex buffer with
            Lexer diag    -> print "Lexical" diag
          | Parser diag   -> print "Syntactical" diag
          | Eparser.Error -> print "" ("Parse", mk_seg buffer, 1));
         close_in cin; flush stdout
  | exception Sys_error msg -> prerr_endline msg

}