ligo/vendors/Preproc/Preproc.mll

586 lines
21 KiB
OCaml
Raw Normal View History

2020-01-27 19:05:47 +04:00
(* Preprocessor for C#, to be processed by [ocamllex]. *)
{
(* STRING PROCESSING *)
(* The value of [mk_str len p] ("make string") is a string of length
[len] containing the [len] characters in the list [p], in reverse
order. For instance, [mk_str 3 ['c';'b';'a'] = "abc"]. *)
let mk_str (len: int) (p: char list) : string =
let () = assert (len = List.length p) in
let bytes = Bytes.make len ' ' in
let rec fill i = function
[] -> bytes
| char::l -> Bytes.set bytes i char; fill (i-1) l
in fill (len-1) p |> Bytes.to_string
(* The call [explode s a] is the list made by pushing the characters
in the string [s] on top of [a], in reverse order. For example,
[explode "ba" ['c';'d'] = ['a'; 'b'; 'c'; 'd']]. *)
let explode s acc =
let rec push = function
0 -> acc
| i -> s.[i-1] :: push (i-1)
in push (String.length s)
(* ERROR HANDLING *)
let stop msg seg = raise (Error.Lexer (msg, seg,1))
let fail msg buffer = stop msg (Error.mk_seg buffer)
exception Local_err of Error.message
let handle_err scan buffer =
try scan buffer with Local_err msg -> fail msg buffer
(* LEXING ENGINE *)
(* Copying the current lexeme to [stdout] *)
let copy buffer = print_string (Lexing.lexeme buffer)
(* End of lines *)
let handle_nl buffer = Lexing.new_line buffer; copy buffer
(* C# PREPROCESSOR DIRECTIVES *)
(* The type [mode] defines the two scanning modes of the preprocessor:
either we copy the current characters or we skip them. *)
type mode = Copy | Skip
(* Trace of directives
We keep track of directives #if, #elif, #else, #region and #endregion.
*)
type cond = If of mode | Elif of mode | Else | Region
type trace = cond list
(* The function [reduce_cond] is called when a #endif directive is
found, and the trace (see type [trace] above) needs updating. *)
let rec reduce_cond seg = function
[] -> stop "Dangling #endif." seg
| If mode::trace -> trace, mode
| Region::_ -> stop "Invalid scoping of #region" seg
| _::trace -> reduce_cond seg trace
(* The function [reduce_reg] is called when a #endregion directive is
read, and the trace needs updating. *)
let reduce_reg seg = function
[] -> stop "Dangling #endregion." seg
| Region::trace -> trace
| _ -> stop "Invalid scoping of #endregion" seg
(* The function [extend] is called when encountering conditional
directives #if, #else and #elif. As its name suggests, it extends
the current trace with the current conditional directive, whilst
performing some validity checks. *)
let extend seg cond trace =
match cond, trace with
If _, Elif _::_ ->
stop "Directive #if cannot follow #elif." seg
| Else, Else::_ ->
stop "Directive #else cannot follow #else." seg
| Else, [] ->
stop "Dangling #else." seg
| Elif _, Else::_ ->
stop "Directive #elif cannot follow #else." seg
| Elif _, [] ->
stop "Dangling #elif." seg
| _ -> cond::trace
(* The function [last_mode] seeks the last mode as recorded in the
trace (see type [trace] above). *)
let rec last_mode = function
[] -> assert false
| (If mode | Elif mode)::_ -> mode
| _::trace -> last_mode trace
(* Line offsets
The value [Inline] of type [offset] means that the current location
cannot be reached from the start of the line with only white
space. The same holds for the special value [Prefix 0]. Values of
the form [Prefix n] mean that the current location can be reached
from the start of the line with [n] white spaces (padding). These
distinctions are needed because preprocessor directives cannot
occur inside lines.
*)
type offset = Prefix of int | Inline
let expand = function
Prefix 0 | Inline -> ()
| Prefix n -> print_string (String.make n ' ')
(* Directives *)
let directives = [
"if"; "else"; "elif"; "endif"; "define"; "undef";
"error"; "warning"; "line"; "region"; "endregion";
"include"]
(* Environments and preprocessor expressions
The evaluation of conditional directives may involve symbols whose
value may be defined using #define directives, or undefined by
means of #undef. Therefore, we need to evaluate conditional
expressions in an environment made of a set of defined symbols.
Note that we rely on an external lexer and parser for the
conditional expressions. See modules [Escan] and [Eparser].
*)
module Env = Set.Make(String)
let rec eval env =
let open Etree
in function
Or (e1,e2) -> eval env e1 || eval env e2
| And (e1,e2) -> eval env e1 && eval env e2
| Eq (e1,e2) -> eval env e1 = eval env e2
| Neq (e1,e2) -> eval env e1 != eval env e2
| Not e -> not (eval env e)
| True -> true
| False -> false
| Ident id -> Env.mem id env
let expr env buffer =
let tree = Eparser.pp_expression Escan.token buffer
in if eval env tree then Copy else Skip
(* END OF HEADER *)
}
(* REGULAR EXPRESSIONS *)
(* White space *)
let nl = '\n' | '\r' | "\r\n"
let blank = ' ' | '\t'
(* Integers *)
let int_suf = 'U' | 'u' | 'L' | 'l' | "UL" | "Ul" | "uL"
| "ul" | "LU" | "Lu" | "lU" | "lu"
let digit = ['0'-'9']
let dec = digit+ int_suf?
let hexdigit = digit | ['A'-'F' 'a'-'f']
let hex_pre = "0x" | "0X"
let hexa = hex_pre hexdigit+ int_suf?
let integer = dec | hexa
(* Unicode escape sequences *)
let four_hex = hexdigit hexdigit hexdigit hexdigit
let uni_esc = "\\u" four_hex | "\\U" four_hex four_hex
(* Identifiers *)
let lowercase = ['a'-'z']
let uppercase = ['A'-'Z']
let letter = lowercase | uppercase | uni_esc
let start = '_' | letter
let alphanum = letter | digit | '_'
let ident = start alphanum*
(* Real *)
let decimal = digit+
let exponent = ['e' 'E'] ['+' '-']? decimal
let real_suf = ['F' 'f' 'D' 'd' 'M' 'm']
let real = (decimal? '.')? decimal exponent? real_suf?
(* Characters *)
let single = [^ '\n' '\r']
let esc = "\\'" | "\\\"" | "\\\\" | "\\0" | "\\a" | "\\b" | "\\f"
| "\\n" | "\\r" | "\\t" | "\\v"
let hex_esc = "\\x" hexdigit hexdigit? hexdigit? hexdigit?
let character = single | esc | hex_esc | uni_esc
let char = "'" character "'"
(* Directives *)
let directive = '#' (blank* as space) (ident as id)
(* Rules *)
(* The rule [scan] scans the input buffer for directives, strings,
comments, blanks, new lines and end of file characters. As a
result, either the matched input is copied to [stdout] or not,
depending on the compilation directives. If not copied, new line
characters are output.
Scanning is triggered by the function call [scan env mode offset
trace lexbuf], where [env] is the set of defined symbols
(introduced by `#define'), [mode] specifies whether we are copying
or skipping the input, [offset] informs about the location in the
line (either there is a prefix of blanks, or at least a non-blank
character has been read), and [trace] is the stack of conditional
directives read so far.
The first call is [scan Env.empty Copy (Prefix 0) []], meaning that
we start with an empty environment, that copying the input is
enabled by default, and that we are at the start of a line and no
previous conditional directives have been read yet.
When an "#if" is matched, the trace is extended by the call [extend
lexbuf (If mode) trace], during the evaluation of which the
syntactic validity of having encountered an "#if" is checked (for
example, it would be invalid had an "#elif" been last read). Note
that the current mode is stored in the trace with the current
directive -- that mode may be later restored (see below for some
examples). Moreover, the directive would be deemed invalid if its
current position in the line (that is, its offset) were not
preceeded by blanks or nothing, otherwise the rule [expr] is called
to scan the boolean expression associated with the "#if": if it
evaluates to [true], the result is [Copy], meaning that we may copy
what follows, otherwise skip it -- the actual decision depending on
the current mode. That new mode is used if we were in copy mode,
and the offset is reset to the start of a new line (as we read a
new line in [expr]); otherwise we were in skipping mode and the
value of the conditional expression must be ignored (but not its
syntax), and we continue skipping the input.
When an "#else" is matched, the trace is extended with [Else],
then, if the directive is not at a wrong offset, the rest of the
line is scanned with [pp_newline]. If we were in copy mode, the new
mode toggles to skipping mode; otherwise, the trace is searched for
the last encountered "#if" of "#elif" and the associated mode is
restored.
The case "#elif" is the result of the fusion (in the technical
sense) of the code for dealing with an "#else" followed by an
"#if".
When an "#endif" is matched, the trace is reduced, that is, all
conditional directives are popped until an [If mode'] is found and
[mode'] is restored as the current mode.
Consider the following four cases, where the modes (Copy/Skip) are
located between the lines:
Copy ----+ Copy ----+
#if true | #if true |
Copy | Copy |
#else | #else |
+-- Skip --+ | +-- Skip --+ |
#if true | | | #if false | | |
| Skip | | | Skip | |
#else | | | #else | | |
+-> Skip | | +-> Skip | |
#endif | | #endif | |
Skip <-+ | Skip <-+ |
#endif | #endif |
Copy <---+ Copy <---+
+-- Copy ----+ Copy --+-+
#if false | | #if false | |
| Skip | Skip | |
#else | | #else | |
+-> Copy --+ | +-+-- Copy <-+ |
#if true | | #if false | | |
Copy | | | | Skip |
#else | | #else | | |
Skip | | | +-> Copy |
#endif | | #endif | |
Copy <-+ | +---> Copy |
#endif | #endif |
Copy <---+ Copy <---+
The following four cases feature #elif. Note that we put between
brackets the mode saved for the #elif, which is sometimes restored
later.
Copy --+ Copy --+
#if true | #if true |
Copy | Copy |
#elif true +--[Skip] | #elif false +--[Skip] |
| Skip | | Skip |
#else | | #else | |
+-> Skip | +-> Skip |
#endif | #endif |
Copy <-+ Copy <-+
+-- Copy --+-+ +-- Copy ----+
#if false | | | #if false | |
| Skip | | | Skip |
#elif true +->[Copy] | | #elif false +->[Copy]--+ |
Copy <-+ | Skip | |
#else | #else | |
Skip | Copy <-+ |
#endif | #endif |
Copy <---+ Copy <---+
Note how "#elif" indeed behaves like an "#else" followed by an
"#if", and the mode stored with the data constructor [Elif]
corresponds to the mode before the virtual "#if".
Important note: Comments and strings are recognised as such only in
copy mode, which is a different behaviour from the preprocessor of
GNU GCC, which always does.
*)
rule scan env mode offset trace = parse
nl { handle_nl lexbuf;
scan env mode (Prefix 0) trace lexbuf }
| blank { match offset with
Prefix n -> scan env mode (Prefix (n+1)) trace lexbuf
| Inline -> copy lexbuf;
scan env mode Inline trace lexbuf }
| directive {
if not (List.mem id directives)
then fail "Invalid preprocessing directive." lexbuf
else if offset = Inline
then fail "Directive invalid inside line." lexbuf
else let seg = Error.mk_seg lexbuf in
match id with
"include" ->
let curr_line = Lexing.(lexbuf.lex_curr_p.pos_lnum)
and curr_file = Lexing.(lexbuf.lex_curr_p.pos_fname)
|> Filename.basename
and incl_file = scan_inclusion lexbuf in
let incl_buffer =
open_in incl_file |> Lexing.from_channel in
Printf.printf "# 1 \"%s\" 1\n" incl_file;
cat incl_buffer;
Printf.printf "# %i \"%s\" 2\n" (curr_line+1) curr_file;
scan env mode offset trace lexbuf
| "if" ->
let mode' = expr env lexbuf in
let new_mode = if mode = Copy then mode' else Skip in
let trace' = extend seg (If mode) trace
in scan env new_mode (Prefix 0) trace' lexbuf
| "else" ->
let () = pp_newline lexbuf in
let new_mode =
if mode = Copy then Skip else last_mode trace in
let trace' = extend seg Else trace
in scan env new_mode (Prefix 0) trace' lexbuf
| "elif" ->
let mode' = expr env lexbuf in
let trace', new_mode =
match mode with
Copy -> extend seg (Elif Skip) trace, Skip
| Skip -> let old_mode = last_mode trace
in extend seg (Elif old_mode) trace,
if old_mode = Copy then mode' else Skip
in scan env new_mode (Prefix 0) trace' lexbuf
| "endif" ->
let () = pp_newline lexbuf in
let trace', new_mode = reduce_cond seg trace
in scan env new_mode (Prefix 0) trace' lexbuf
| "define" ->
let id, seg = ident env lexbuf
in if id="true" || id="false"
then let msg = "Symbol \"" ^ id ^ "\" cannot be defined."
in stop msg seg
else if Env.mem id env
then let msg = "Symbol \"" ^ id
^ "\" was already defined."
in stop msg seg
else scan (Env.add id env) mode (Prefix 0) trace lexbuf
| "undef" ->
let id, _ = ident env lexbuf
in scan (Env.remove id env) mode (Prefix 0) trace lexbuf
| "error" ->
stop (message [] lexbuf) seg
| "warning" ->
let start_p, end_p = seg in
let msg = message [] lexbuf in
let open Lexing
in prerr_endline
("Warning at line " ^ string_of_int start_p.pos_lnum
^ ", char "
^ string_of_int (start_p.pos_cnum - start_p.pos_bol)
^ "--" ^ string_of_int (end_p.pos_cnum - end_p.pos_bol)
^ ":\n" ^ msg);
scan env mode (Prefix 0) trace lexbuf
| "region" ->
let msg = message [] lexbuf
in expand offset;
print_endline ("#" ^ space ^ "region" ^ msg);
scan env mode (Prefix 0) (Region::trace) lexbuf
| "endregion" ->
let msg = message [] lexbuf
in expand offset;
print_endline ("#" ^ space ^ "endregion" ^ msg);
scan env mode (Prefix 0) (reduce_reg seg trace) lexbuf
| "line" ->
expand offset;
print_string ("#" ^ space ^ "line");
line_ind lexbuf;
scan env mode (Prefix 0) trace lexbuf
| _ -> assert false
}
| eof { match trace with
[] -> expand offset; flush stdout; (env, trace)
| _ -> fail "Missing #endif." lexbuf }
| '"' { if mode = Copy then begin
expand offset; copy lexbuf;
handle_err in_norm_str lexbuf
end;
scan env mode Inline trace lexbuf }
| "@\"" { if mode = Copy then begin
expand offset; copy lexbuf;
handle_err in_verb_str lexbuf
end;
scan env mode Inline trace lexbuf }
| "//" { if mode = Copy then begin
expand offset; copy lexbuf;
in_line_com mode lexbuf
end;
scan env mode Inline trace lexbuf }
| "/*" { if mode = Copy then begin
expand offset; copy lexbuf;
handle_err in_block_com lexbuf
end;
scan env mode Inline trace lexbuf }
| _ { if mode = Copy then (expand offset; copy lexbuf);
scan env mode Inline trace lexbuf }
(* Support for #define and #undef *)
and ident env = parse
blank* { let r = __ident env lexbuf
in pp_newline lexbuf; r }
and __ident env = parse
ident as id { id, Error.mk_seg lexbuf }
(* Line indicator (#line) *)
and line_ind = parse
blank* as space { print_string space; line_indicator lexbuf }
and line_indicator = parse
decimal as ind {
print_string ind;
end_indicator lexbuf
}
| ident as id {
match id with
"default" | "hidden" ->
print_endline (id ^ message [] lexbuf)
| _ -> fail "Invalid line indicator." lexbuf
}
| nl | eof { fail "Line indicator expected." lexbuf }
and end_indicator = parse
blank* nl { copy lexbuf; handle_nl lexbuf }
| blank* eof { copy lexbuf }
| blank* "//" { copy lexbuf; print_endline (message [] lexbuf) }
| blank+ '"' { copy lexbuf;
handle_err in_norm_str lexbuf;
opt_line_com lexbuf }
| _ { fail "Line comment or blank expected." lexbuf }
and opt_line_com = parse
nl { handle_nl lexbuf }
| eof { copy lexbuf }
| blank+ { copy lexbuf; opt_line_com lexbuf }
| "//" { print_endline ("//" ^ message [] lexbuf) }
(* New lines and verbatim sequence of characters *)
and pp_newline = parse
nl { handle_nl lexbuf }
| blank+ { pp_newline lexbuf }
| "//" { in_line_com Skip lexbuf }
| _ { fail "Only a single-line comment allowed." lexbuf }
and message acc = parse
nl { Lexing.new_line lexbuf;
mk_str (List.length acc) acc }
| eof { mk_str (List.length acc) acc }
| _ as c { message (c::acc) lexbuf }
(* Comments *)
and in_line_com mode = parse
nl { handle_nl lexbuf }
| eof { flush stdout }
| _ { if mode = Copy then copy lexbuf; in_line_com mode lexbuf }
and in_block_com = parse
nl { handle_nl lexbuf; in_block_com lexbuf }
| "*/" { copy lexbuf }
| eof { raise (Local_err "Unterminated comment.") }
| _ { copy lexbuf; in_block_com lexbuf }
(* Include a file *)
and cat = parse
eof { () }
| _ { copy lexbuf; cat lexbuf }
(* Included filename *)
and scan_inclusion = parse
blank+ { scan_inclusion lexbuf }
| '"' { handle_err (in_inclusion [] 0) lexbuf }
and in_inclusion acc len = parse
'"' { mk_str len acc }
| nl { fail "Newline invalid in string." lexbuf }
| eof { raise (Local_err "Unterminated string.") }
| _ as c { in_inclusion (c::acc) (len+1) lexbuf }
(* Strings *)
and in_norm_str = parse
"\\\"" { copy lexbuf; in_norm_str lexbuf }
| '"' { copy lexbuf }
| nl { fail "Newline invalid in string." lexbuf }
| eof { raise (Local_err "Unterminated string.") }
| _ { copy lexbuf; in_norm_str lexbuf }
and in_verb_str = parse
"\"\"" { copy lexbuf; in_verb_str lexbuf }
| '"' { copy lexbuf }
| nl { handle_nl lexbuf; in_verb_str lexbuf }
| eof { raise (Local_err "Unterminated string.") }
| _ { copy lexbuf; in_verb_str lexbuf }
{
(* The function [lex] is a wrapper of [scan], which also checks that
the trace is empty at the end. Note that we discard the
environment at the end. *)
let lex buffer =
let _env, trace = scan Env.empty Copy (Prefix 0) [] buffer
in assert (trace = [])
(* Exported definitions *)
type filename = string
let trace (name: filename) : unit =
match open_in name with
cin ->
let open Lexing in
let buffer = from_channel cin in
let pos_fname = Filename.basename name in
let () = buffer.lex_curr_p <- {buffer.lex_curr_p with pos_fname} in
let open Error
in (try lex buffer with
Lexer diag -> print "Lexical" diag
| Parser diag -> print "Syntactical" diag
| Eparser.Error -> print "" ("Parse", mk_seg buffer, 1));
close_in cin; flush stdout
| exception Sys_error msg -> prerr_endline msg
}