586 lines
21 KiB
OCaml
586 lines
21 KiB
OCaml
(* Preprocessor for C#, to be processed by [ocamllex]. *)
|
|
|
|
{
|
|
(* STRING PROCESSING *)
|
|
|
|
(* The value of [mk_str len p] ("make string") is a string of length
|
|
[len] containing the [len] characters in the list [p], in reverse
|
|
order. For instance, [mk_str 3 ['c';'b';'a'] = "abc"]. *)
|
|
|
|
let mk_str (len: int) (p: char list) : string =
|
|
let () = assert (len = List.length p) in
|
|
let bytes = Bytes.make len ' ' in
|
|
let rec fill i = function
|
|
[] -> bytes
|
|
| char::l -> Bytes.set bytes i char; fill (i-1) l
|
|
in fill (len-1) p |> Bytes.to_string
|
|
|
|
(* The call [explode s a] is the list made by pushing the characters
|
|
in the string [s] on top of [a], in reverse order. For example,
|
|
[explode "ba" ['c';'d'] = ['a'; 'b'; 'c'; 'd']]. *)
|
|
|
|
let explode s acc =
|
|
let rec push = function
|
|
0 -> acc
|
|
| i -> s.[i-1] :: push (i-1)
|
|
in push (String.length s)
|
|
|
|
(* ERROR HANDLING *)
|
|
|
|
let stop msg seg = raise (Error.Lexer (msg, seg,1))
|
|
let fail msg buffer = stop msg (Error.mk_seg buffer)
|
|
|
|
exception Local_err of Error.message
|
|
|
|
let handle_err scan buffer =
|
|
try scan buffer with Local_err msg -> fail msg buffer
|
|
|
|
(* LEXING ENGINE *)
|
|
|
|
(* Copying the current lexeme to [stdout] *)
|
|
|
|
let copy buffer = print_string (Lexing.lexeme buffer)
|
|
|
|
(* End of lines *)
|
|
|
|
let handle_nl buffer = Lexing.new_line buffer; copy buffer
|
|
|
|
|
|
(* C# PREPROCESSOR DIRECTIVES *)
|
|
|
|
(* The type [mode] defines the two scanning modes of the preprocessor:
|
|
either we copy the current characters or we skip them. *)
|
|
|
|
type mode = Copy | Skip
|
|
|
|
(* Trace of directives
|
|
|
|
We keep track of directives #if, #elif, #else, #region and #endregion.
|
|
*)
|
|
|
|
type cond = If of mode | Elif of mode | Else | Region
|
|
type trace = cond list
|
|
|
|
(* The function [reduce_cond] is called when a #endif directive is
|
|
found, and the trace (see type [trace] above) needs updating. *)
|
|
|
|
let rec reduce_cond seg = function
|
|
[] -> stop "Dangling #endif." seg
|
|
| If mode::trace -> trace, mode
|
|
| Region::_ -> stop "Invalid scoping of #region" seg
|
|
| _::trace -> reduce_cond seg trace
|
|
|
|
(* The function [reduce_reg] is called when a #endregion directive is
|
|
read, and the trace needs updating. *)
|
|
|
|
let reduce_reg seg = function
|
|
[] -> stop "Dangling #endregion." seg
|
|
| Region::trace -> trace
|
|
| _ -> stop "Invalid scoping of #endregion" seg
|
|
|
|
(* The function [extend] is called when encountering conditional
|
|
directives #if, #else and #elif. As its name suggests, it extends
|
|
the current trace with the current conditional directive, whilst
|
|
performing some validity checks. *)
|
|
|
|
let extend seg cond trace =
|
|
match cond, trace with
|
|
If _, Elif _::_ ->
|
|
stop "Directive #if cannot follow #elif." seg
|
|
| Else, Else::_ ->
|
|
stop "Directive #else cannot follow #else." seg
|
|
| Else, [] ->
|
|
stop "Dangling #else." seg
|
|
| Elif _, Else::_ ->
|
|
stop "Directive #elif cannot follow #else." seg
|
|
| Elif _, [] ->
|
|
stop "Dangling #elif." seg
|
|
| _ -> cond::trace
|
|
|
|
(* The function [last_mode] seeks the last mode as recorded in the
|
|
trace (see type [trace] above). *)
|
|
|
|
let rec last_mode = function
|
|
[] -> assert false
|
|
| (If mode | Elif mode)::_ -> mode
|
|
| _::trace -> last_mode trace
|
|
|
|
(* Line offsets
|
|
|
|
The value [Inline] of type [offset] means that the current location
|
|
cannot be reached from the start of the line with only white
|
|
space. The same holds for the special value [Prefix 0]. Values of
|
|
the form [Prefix n] mean that the current location can be reached
|
|
from the start of the line with [n] white spaces (padding). These
|
|
distinctions are needed because preprocessor directives cannot
|
|
occur inside lines.
|
|
*)
|
|
|
|
type offset = Prefix of int | Inline
|
|
|
|
let expand = function
|
|
Prefix 0 | Inline -> ()
|
|
| Prefix n -> print_string (String.make n ' ')
|
|
|
|
(* Directives *)
|
|
|
|
let directives = [
|
|
"if"; "else"; "elif"; "endif"; "define"; "undef";
|
|
"error"; "warning"; "line"; "region"; "endregion";
|
|
"include"]
|
|
|
|
(* Environments and preprocessor expressions
|
|
|
|
The evaluation of conditional directives may involve symbols whose
|
|
value may be defined using #define directives, or undefined by
|
|
means of #undef. Therefore, we need to evaluate conditional
|
|
expressions in an environment made of a set of defined symbols.
|
|
|
|
Note that we rely on an external lexer and parser for the
|
|
conditional expressions. See modules [Escan] and [Eparser].
|
|
*)
|
|
|
|
module Env = Set.Make(String)
|
|
|
|
let rec eval env =
|
|
let open Etree
|
|
in function
|
|
Or (e1,e2) -> eval env e1 || eval env e2
|
|
| And (e1,e2) -> eval env e1 && eval env e2
|
|
| Eq (e1,e2) -> eval env e1 = eval env e2
|
|
| Neq (e1,e2) -> eval env e1 != eval env e2
|
|
| Not e -> not (eval env e)
|
|
| True -> true
|
|
| False -> false
|
|
| Ident id -> Env.mem id env
|
|
|
|
let expr env buffer =
|
|
let tree = Eparser.pp_expression Escan.token buffer
|
|
in if eval env tree then Copy else Skip
|
|
|
|
(* END OF HEADER *)
|
|
}
|
|
|
|
(* REGULAR EXPRESSIONS *)
|
|
|
|
(* White space *)
|
|
|
|
let nl = '\n' | '\r' | "\r\n"
|
|
let blank = ' ' | '\t'
|
|
|
|
(* Integers *)
|
|
|
|
let int_suf = 'U' | 'u' | 'L' | 'l' | "UL" | "Ul" | "uL"
|
|
| "ul" | "LU" | "Lu" | "lU" | "lu"
|
|
let digit = ['0'-'9']
|
|
let dec = digit+ int_suf?
|
|
let hexdigit = digit | ['A'-'F' 'a'-'f']
|
|
let hex_pre = "0x" | "0X"
|
|
let hexa = hex_pre hexdigit+ int_suf?
|
|
let integer = dec | hexa
|
|
|
|
(* Unicode escape sequences *)
|
|
|
|
let four_hex = hexdigit hexdigit hexdigit hexdigit
|
|
let uni_esc = "\\u" four_hex | "\\U" four_hex four_hex
|
|
|
|
(* Identifiers *)
|
|
|
|
let lowercase = ['a'-'z']
|
|
let uppercase = ['A'-'Z']
|
|
let letter = lowercase | uppercase | uni_esc
|
|
let start = '_' | letter
|
|
let alphanum = letter | digit | '_'
|
|
let ident = start alphanum*
|
|
|
|
(* Real *)
|
|
|
|
let decimal = digit+
|
|
let exponent = ['e' 'E'] ['+' '-']? decimal
|
|
let real_suf = ['F' 'f' 'D' 'd' 'M' 'm']
|
|
let real = (decimal? '.')? decimal exponent? real_suf?
|
|
|
|
(* Characters *)
|
|
|
|
let single = [^ '\n' '\r']
|
|
let esc = "\\'" | "\\\"" | "\\\\" | "\\0" | "\\a" | "\\b" | "\\f"
|
|
| "\\n" | "\\r" | "\\t" | "\\v"
|
|
let hex_esc = "\\x" hexdigit hexdigit? hexdigit? hexdigit?
|
|
let character = single | esc | hex_esc | uni_esc
|
|
let char = "'" character "'"
|
|
|
|
(* Directives *)
|
|
|
|
let directive = '#' (blank* as space) (ident as id)
|
|
|
|
(* Rules *)
|
|
|
|
(* The rule [scan] scans the input buffer for directives, strings,
|
|
comments, blanks, new lines and end of file characters. As a
|
|
result, either the matched input is copied to [stdout] or not,
|
|
depending on the compilation directives. If not copied, new line
|
|
characters are output.
|
|
|
|
Scanning is triggered by the function call [scan env mode offset
|
|
trace lexbuf], where [env] is the set of defined symbols
|
|
(introduced by `#define'), [mode] specifies whether we are copying
|
|
or skipping the input, [offset] informs about the location in the
|
|
line (either there is a prefix of blanks, or at least a non-blank
|
|
character has been read), and [trace] is the stack of conditional
|
|
directives read so far.
|
|
|
|
The first call is [scan Env.empty Copy (Prefix 0) []], meaning that
|
|
we start with an empty environment, that copying the input is
|
|
enabled by default, and that we are at the start of a line and no
|
|
previous conditional directives have been read yet.
|
|
|
|
When an "#if" is matched, the trace is extended by the call [extend
|
|
lexbuf (If mode) trace], during the evaluation of which the
|
|
syntactic validity of having encountered an "#if" is checked (for
|
|
example, it would be invalid had an "#elif" been last read). Note
|
|
that the current mode is stored in the trace with the current
|
|
directive -- that mode may be later restored (see below for some
|
|
examples). Moreover, the directive would be deemed invalid if its
|
|
current position in the line (that is, its offset) were not
|
|
preceeded by blanks or nothing, otherwise the rule [expr] is called
|
|
to scan the boolean expression associated with the "#if": if it
|
|
evaluates to [true], the result is [Copy], meaning that we may copy
|
|
what follows, otherwise skip it -- the actual decision depending on
|
|
the current mode. That new mode is used if we were in copy mode,
|
|
and the offset is reset to the start of a new line (as we read a
|
|
new line in [expr]); otherwise we were in skipping mode and the
|
|
value of the conditional expression must be ignored (but not its
|
|
syntax), and we continue skipping the input.
|
|
|
|
When an "#else" is matched, the trace is extended with [Else],
|
|
then, if the directive is not at a wrong offset, the rest of the
|
|
line is scanned with [pp_newline]. If we were in copy mode, the new
|
|
mode toggles to skipping mode; otherwise, the trace is searched for
|
|
the last encountered "#if" of "#elif" and the associated mode is
|
|
restored.
|
|
|
|
The case "#elif" is the result of the fusion (in the technical
|
|
sense) of the code for dealing with an "#else" followed by an
|
|
"#if".
|
|
|
|
When an "#endif" is matched, the trace is reduced, that is, all
|
|
conditional directives are popped until an [If mode'] is found and
|
|
[mode'] is restored as the current mode.
|
|
|
|
Consider the following four cases, where the modes (Copy/Skip) are
|
|
located between the lines:
|
|
|
|
Copy ----+ Copy ----+
|
|
#if true | #if true |
|
|
Copy | Copy |
|
|
#else | #else |
|
|
+-- Skip --+ | +-- Skip --+ |
|
|
#if true | | | #if false | | |
|
|
| Skip | | | Skip | |
|
|
#else | | | #else | | |
|
|
+-> Skip | | +-> Skip | |
|
|
#endif | | #endif | |
|
|
Skip <-+ | Skip <-+ |
|
|
#endif | #endif |
|
|
Copy <---+ Copy <---+
|
|
|
|
|
|
+-- Copy ----+ Copy --+-+
|
|
#if false | | #if false | |
|
|
| Skip | Skip | |
|
|
#else | | #else | |
|
|
+-> Copy --+ | +-+-- Copy <-+ |
|
|
#if true | | #if false | | |
|
|
Copy | | | | Skip |
|
|
#else | | #else | | |
|
|
Skip | | | +-> Copy |
|
|
#endif | | #endif | |
|
|
Copy <-+ | +---> Copy |
|
|
#endif | #endif |
|
|
Copy <---+ Copy <---+
|
|
|
|
The following four cases feature #elif. Note that we put between
|
|
brackets the mode saved for the #elif, which is sometimes restored
|
|
later.
|
|
|
|
Copy --+ Copy --+
|
|
#if true | #if true |
|
|
Copy | Copy |
|
|
#elif true +--[Skip] | #elif false +--[Skip] |
|
|
| Skip | | Skip |
|
|
#else | | #else | |
|
|
+-> Skip | +-> Skip |
|
|
#endif | #endif |
|
|
Copy <-+ Copy <-+
|
|
|
|
|
|
+-- Copy --+-+ +-- Copy ----+
|
|
#if false | | | #if false | |
|
|
| Skip | | | Skip |
|
|
#elif true +->[Copy] | | #elif false +->[Copy]--+ |
|
|
Copy <-+ | Skip | |
|
|
#else | #else | |
|
|
Skip | Copy <-+ |
|
|
#endif | #endif |
|
|
Copy <---+ Copy <---+
|
|
|
|
Note how "#elif" indeed behaves like an "#else" followed by an
|
|
"#if", and the mode stored with the data constructor [Elif]
|
|
corresponds to the mode before the virtual "#if".
|
|
|
|
Important note: Comments and strings are recognised as such only in
|
|
copy mode, which is a different behaviour from the preprocessor of
|
|
GNU GCC, which always does.
|
|
*)
|
|
|
|
rule scan env mode offset trace = parse
|
|
nl { handle_nl lexbuf;
|
|
scan env mode (Prefix 0) trace lexbuf }
|
|
| blank { match offset with
|
|
Prefix n -> scan env mode (Prefix (n+1)) trace lexbuf
|
|
| Inline -> copy lexbuf;
|
|
scan env mode Inline trace lexbuf }
|
|
| directive {
|
|
if not (List.mem id directives)
|
|
then fail "Invalid preprocessing directive." lexbuf
|
|
else if offset = Inline
|
|
then fail "Directive invalid inside line." lexbuf
|
|
else let seg = Error.mk_seg lexbuf in
|
|
match id with
|
|
"include" ->
|
|
let curr_line = Lexing.(lexbuf.lex_curr_p.pos_lnum)
|
|
and curr_file = Lexing.(lexbuf.lex_curr_p.pos_fname)
|
|
|> Filename.basename
|
|
and incl_file = scan_inclusion lexbuf in
|
|
let incl_buffer =
|
|
open_in incl_file |> Lexing.from_channel in
|
|
Printf.printf "# 1 \"%s\" 1\n" incl_file;
|
|
cat incl_buffer;
|
|
Printf.printf "# %i \"%s\" 2\n" (curr_line+1) curr_file;
|
|
scan env mode offset trace lexbuf
|
|
| "if" ->
|
|
let mode' = expr env lexbuf in
|
|
let new_mode = if mode = Copy then mode' else Skip in
|
|
let trace' = extend seg (If mode) trace
|
|
in scan env new_mode (Prefix 0) trace' lexbuf
|
|
| "else" ->
|
|
let () = pp_newline lexbuf in
|
|
let new_mode =
|
|
if mode = Copy then Skip else last_mode trace in
|
|
let trace' = extend seg Else trace
|
|
in scan env new_mode (Prefix 0) trace' lexbuf
|
|
| "elif" ->
|
|
let mode' = expr env lexbuf in
|
|
let trace', new_mode =
|
|
match mode with
|
|
Copy -> extend seg (Elif Skip) trace, Skip
|
|
| Skip -> let old_mode = last_mode trace
|
|
in extend seg (Elif old_mode) trace,
|
|
if old_mode = Copy then mode' else Skip
|
|
in scan env new_mode (Prefix 0) trace' lexbuf
|
|
| "endif" ->
|
|
let () = pp_newline lexbuf in
|
|
let trace', new_mode = reduce_cond seg trace
|
|
in scan env new_mode (Prefix 0) trace' lexbuf
|
|
| "define" ->
|
|
let id, seg = ident env lexbuf
|
|
in if id="true" || id="false"
|
|
then let msg = "Symbol \"" ^ id ^ "\" cannot be defined."
|
|
in stop msg seg
|
|
else if Env.mem id env
|
|
then let msg = "Symbol \"" ^ id
|
|
^ "\" was already defined."
|
|
in stop msg seg
|
|
else scan (Env.add id env) mode (Prefix 0) trace lexbuf
|
|
| "undef" ->
|
|
let id, _ = ident env lexbuf
|
|
in scan (Env.remove id env) mode (Prefix 0) trace lexbuf
|
|
| "error" ->
|
|
stop (message [] lexbuf) seg
|
|
| "warning" ->
|
|
let start_p, end_p = seg in
|
|
let msg = message [] lexbuf in
|
|
let open Lexing
|
|
in prerr_endline
|
|
("Warning at line " ^ string_of_int start_p.pos_lnum
|
|
^ ", char "
|
|
^ string_of_int (start_p.pos_cnum - start_p.pos_bol)
|
|
^ "--" ^ string_of_int (end_p.pos_cnum - end_p.pos_bol)
|
|
^ ":\n" ^ msg);
|
|
scan env mode (Prefix 0) trace lexbuf
|
|
| "region" ->
|
|
let msg = message [] lexbuf
|
|
in expand offset;
|
|
print_endline ("#" ^ space ^ "region" ^ msg);
|
|
scan env mode (Prefix 0) (Region::trace) lexbuf
|
|
| "endregion" ->
|
|
let msg = message [] lexbuf
|
|
in expand offset;
|
|
print_endline ("#" ^ space ^ "endregion" ^ msg);
|
|
scan env mode (Prefix 0) (reduce_reg seg trace) lexbuf
|
|
| "line" ->
|
|
expand offset;
|
|
print_string ("#" ^ space ^ "line");
|
|
line_ind lexbuf;
|
|
scan env mode (Prefix 0) trace lexbuf
|
|
| _ -> assert false
|
|
}
|
|
| eof { match trace with
|
|
[] -> expand offset; flush stdout; (env, trace)
|
|
| _ -> fail "Missing #endif." lexbuf }
|
|
| '"' { if mode = Copy then begin
|
|
expand offset; copy lexbuf;
|
|
handle_err in_norm_str lexbuf
|
|
end;
|
|
scan env mode Inline trace lexbuf }
|
|
| "@\"" { if mode = Copy then begin
|
|
expand offset; copy lexbuf;
|
|
handle_err in_verb_str lexbuf
|
|
end;
|
|
scan env mode Inline trace lexbuf }
|
|
| "//" { if mode = Copy then begin
|
|
expand offset; copy lexbuf;
|
|
in_line_com mode lexbuf
|
|
end;
|
|
scan env mode Inline trace lexbuf }
|
|
| "/*" { if mode = Copy then begin
|
|
expand offset; copy lexbuf;
|
|
handle_err in_block_com lexbuf
|
|
end;
|
|
scan env mode Inline trace lexbuf }
|
|
| _ { if mode = Copy then (expand offset; copy lexbuf);
|
|
scan env mode Inline trace lexbuf }
|
|
|
|
(* Support for #define and #undef *)
|
|
|
|
and ident env = parse
|
|
blank* { let r = __ident env lexbuf
|
|
in pp_newline lexbuf; r }
|
|
|
|
and __ident env = parse
|
|
ident as id { id, Error.mk_seg lexbuf }
|
|
|
|
(* Line indicator (#line) *)
|
|
|
|
and line_ind = parse
|
|
blank* as space { print_string space; line_indicator lexbuf }
|
|
|
|
and line_indicator = parse
|
|
decimal as ind {
|
|
print_string ind;
|
|
end_indicator lexbuf
|
|
}
|
|
| ident as id {
|
|
match id with
|
|
"default" | "hidden" ->
|
|
print_endline (id ^ message [] lexbuf)
|
|
| _ -> fail "Invalid line indicator." lexbuf
|
|
}
|
|
| nl | eof { fail "Line indicator expected." lexbuf }
|
|
|
|
and end_indicator = parse
|
|
blank* nl { copy lexbuf; handle_nl lexbuf }
|
|
| blank* eof { copy lexbuf }
|
|
| blank* "//" { copy lexbuf; print_endline (message [] lexbuf) }
|
|
| blank+ '"' { copy lexbuf;
|
|
handle_err in_norm_str lexbuf;
|
|
opt_line_com lexbuf }
|
|
| _ { fail "Line comment or blank expected." lexbuf }
|
|
|
|
and opt_line_com = parse
|
|
nl { handle_nl lexbuf }
|
|
| eof { copy lexbuf }
|
|
| blank+ { copy lexbuf; opt_line_com lexbuf }
|
|
| "//" { print_endline ("//" ^ message [] lexbuf) }
|
|
|
|
(* New lines and verbatim sequence of characters *)
|
|
|
|
and pp_newline = parse
|
|
nl { handle_nl lexbuf }
|
|
| blank+ { pp_newline lexbuf }
|
|
| "//" { in_line_com Skip lexbuf }
|
|
| _ { fail "Only a single-line comment allowed." lexbuf }
|
|
|
|
and message acc = parse
|
|
nl { Lexing.new_line lexbuf;
|
|
mk_str (List.length acc) acc }
|
|
| eof { mk_str (List.length acc) acc }
|
|
| _ as c { message (c::acc) lexbuf }
|
|
|
|
(* Comments *)
|
|
|
|
and in_line_com mode = parse
|
|
nl { handle_nl lexbuf }
|
|
| eof { flush stdout }
|
|
| _ { if mode = Copy then copy lexbuf; in_line_com mode lexbuf }
|
|
|
|
and in_block_com = parse
|
|
nl { handle_nl lexbuf; in_block_com lexbuf }
|
|
| "*/" { copy lexbuf }
|
|
| eof { raise (Local_err "Unterminated comment.") }
|
|
| _ { copy lexbuf; in_block_com lexbuf }
|
|
|
|
(* Include a file *)
|
|
|
|
and cat = parse
|
|
eof { () }
|
|
| _ { copy lexbuf; cat lexbuf }
|
|
|
|
(* Included filename *)
|
|
|
|
and scan_inclusion = parse
|
|
blank+ { scan_inclusion lexbuf }
|
|
| '"' { handle_err (in_inclusion [] 0) lexbuf }
|
|
|
|
and in_inclusion acc len = parse
|
|
'"' { mk_str len acc }
|
|
| nl { fail "Newline invalid in string." lexbuf }
|
|
| eof { raise (Local_err "Unterminated string.") }
|
|
| _ as c { in_inclusion (c::acc) (len+1) lexbuf }
|
|
|
|
(* Strings *)
|
|
|
|
and in_norm_str = parse
|
|
"\\\"" { copy lexbuf; in_norm_str lexbuf }
|
|
| '"' { copy lexbuf }
|
|
| nl { fail "Newline invalid in string." lexbuf }
|
|
| eof { raise (Local_err "Unterminated string.") }
|
|
| _ { copy lexbuf; in_norm_str lexbuf }
|
|
|
|
and in_verb_str = parse
|
|
"\"\"" { copy lexbuf; in_verb_str lexbuf }
|
|
| '"' { copy lexbuf }
|
|
| nl { handle_nl lexbuf; in_verb_str lexbuf }
|
|
| eof { raise (Local_err "Unterminated string.") }
|
|
| _ { copy lexbuf; in_verb_str lexbuf }
|
|
|
|
{
|
|
(* The function [lex] is a wrapper of [scan], which also checks that
|
|
the trace is empty at the end. Note that we discard the
|
|
environment at the end. *)
|
|
|
|
let lex buffer =
|
|
let _env, trace = scan Env.empty Copy (Prefix 0) [] buffer
|
|
in assert (trace = [])
|
|
|
|
(* Exported definitions *)
|
|
|
|
type filename = string
|
|
|
|
let trace (name: filename) : unit =
|
|
match open_in name with
|
|
cin ->
|
|
let open Lexing in
|
|
let buffer = from_channel cin in
|
|
let pos_fname = Filename.basename name in
|
|
let () = buffer.lex_curr_p <- {buffer.lex_curr_p with pos_fname} in
|
|
let open Error
|
|
in (try lex buffer with
|
|
Lexer diag -> print "Lexical" diag
|
|
| Parser diag -> print "Syntactical" diag
|
|
| Eparser.Error -> print "" ("Parse", mk_seg buffer, 1));
|
|
close_in cin; flush stdout
|
|
| exception Sys_error msg -> prerr_endline msg
|
|
|
|
}
|