From 269fd475adb1dbdb076304b3c48f9f9fbf6e1bfb Mon Sep 17 00:00:00 2001 From: Christian Rinderknecht Date: Tue, 12 Mar 2019 18:33:29 +0100 Subject: [PATCH] Enabled the preprocessing by CPP of #include directives in Ligo. Now it is possible to write #include directives anywhere in a Ligo source file and have error messages refer to the correct file and line. This is done by extending the lexer to recognise #line directives (generated by CPP). I added the stage "cpp" for the --verbose command-line option: if given, the preprocessed source file is created in the current directory with the name <input>.pp.li, where <input>.li is the original input file. Also the CPP command actually run is printed. I added setters to modules [Pos] and [Region]. --- EvalOpt.ml | 2 +- Lexer.mll | 100 ++++++++++++++++++++++++++++++++++++++++++-------- LexerMain.ml | 31 +++++++++++++++- ParserMain.ml | 30 ++++++++++++++- Pos.ml | 18 ++++++++- Pos.mli | 6 ++- Region.ml | 6 +++ Region.mli | 4 +- 8 files changed, 173 insertions(+), 24 deletions(-) diff --git a/EvalOpt.ml b/EvalOpt.ml index 13c9f51ad..8e78cbe43 100644 --- a/EvalOpt.ml +++ b/EvalOpt.ml @@ -20,7 +20,7 @@ let help () = print_endline " -q, --quiet No output, except errors (default)"; print_endline " --columns Columns for source locations"; print_endline " --bytes Bytes for source locations"; - print_endline " --verbose= cmdline, ast"; + print_endline " --verbose= cmdline, cpp, ast"; print_endline " --version Commit hash on stdout"; print_endline " -h, --help This help"; exit 0 diff --git a/Lexer.mll b/Lexer.mll index e7956337a..709c77803 100644 --- a/Lexer.mll +++ b/Lexer.mll @@ -46,13 +46,30 @@ let reset_file ~file buffer = let open Lexing in buffer.lex_curr_p <- {buffer.lex_curr_p with pos_fname = file} -let reset_line line_num buffer = +let reset_line ~line buffer = + assert (line >= 0); let open Lexing in - buffer.lex_curr_p <- {buffer.lex_curr_p with pos_lnum = line_num} + buffer.lex_curr_p <- {buffer.lex_curr_p with pos_lnum = line} -let reset ~file ?(line=1) 
buffer = - (* Default value per the [Lexing] standard module convention *) - reset_file ~file buffer; reset_line line buffer +let reset_offset ~offset buffer = + assert (offset >= 0); + Printf.printf "[reset] offset=%i\n" offset; + let open Lexing in + let bol = buffer.lex_curr_p.pos_bol in + buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum = bol (*+ offset*)} + +let reset ?file ?line ?offset buffer = + let () = + match file with + Some file -> reset_file ~file buffer + | None -> () in + let () = + match line with + Some line -> reset_line ~line buffer + | None -> () in + match offset with + Some offset -> reset_offset ~offset buffer + | None -> () (* Rolling back one lexeme _within the current semantic action_ *) @@ -192,14 +209,14 @@ module Make (Token: TOKEN) : (S with module Token = Token) = (* STATE *) - (* Beyond tokens, the result of lexing is a state (a so-called - _state monad_). The type [state] represents the logical state - of the lexing engine, that is, a value which is threaded during - scanning and which denotes useful, high-level information - beyond what the type [Lexing.lexbuf] in the standard library - already provides for all generic lexers. + (* Beyond tokens, the result of lexing is a state. The type + [state] represents the logical state of the lexing engine, that + is, a value which is threaded during scanning and which denotes + useful, high-level information beyond what the type + [Lexing.lexbuf] in the standard library already provides for + all generic lexers. - Tokens are the smallest units used by the parser to build the + Tokens are the smallest units used by the parser to build the abstract syntax tree. The state includes a queue of recognised tokens, with the markup at the left of its lexeme until either the start of the file or the end of the previously recognised @@ -225,7 +242,7 @@ module Make (Token: TOKEN) : (S with module Token = Token) = [supply] that takes a byte, a start index and a length and feed it to [decoder]. 
See the documentation of the third-party library Uutf. - *) + *) type state = { units : (Markup.t list * token) FQueue.t; @@ -427,6 +444,7 @@ module Make (Token: TOKEN) : (S with module Token = Token) = let utf8_bom = "\xEF\xBB\xBF" (* Byte Order Mark for UTF-8 *) let nl = ['\n' '\r'] | "\r\n" +let blank = ' ' | '\t' let digit = ['0'-'9'] let natural = digit | digit (digit | '_')* digit let integer = '-'? natural @@ -446,6 +464,7 @@ let symbol = ';' | ',' | '#' | '|' | "->" | ":=" | '=' | ':' | "||" | "&&" | '<' | "<=" | '>' | ">=" | "=/=" | '+' | '-' | '*' | '.' | '_' | '^' +let string = [^'"' '\\' '\n']* (* For strings of #include *) (* RULES *) @@ -479,14 +498,51 @@ and scan state = parse | "(*" { let opening, _, state = sync state lexbuf in let thread = {opening; len=2; acc=['*';'(']} in - let state = scan_block thread state lexbuf |> push_block + let state = scan_block thread state lexbuf |> push_block in scan state lexbuf } -| "//" { let opening, _, state = sync state lexbuf in +| "//" { let opening, _, state = sync state lexbuf in let thread = {opening; len=2; acc=['/';'/']} in - let state = scan_line thread state lexbuf |> push_line + let state = scan_line thread state lexbuf |> push_line in scan state lexbuf } + (* Management of #include CPP directives + + An input Ligo program may contain GNU CPP (C preprocessor) + directives, and the entry modules (named *Main.ml) run CPP on them + in traditional mode: + + https://gcc.gnu.org/onlinedocs/cpp/Traditional-Mode.html + + The main interest in using CPP is that it can stand for a poor + man's (flat) module system for Ligo thanks to #include + directives, and the traditional mode leaves the markup mostly + undisturbed. + + Some of the #line resulting from processing #include directives + deal with system file headers and thus have to be ignored for our + purpose. 
Moreover, these #line directives may also carry some + additional flags: + + https://gcc.gnu.org/onlinedocs/cpp/Preprocessor-Output.html + + of which 1 and 2 indicate, respectively, the start of a new file + and the return from a file (after its inclusion has been + processed). + *) + +| '#' blank* ("line" blank+)? (integer as line) blank+ + '"' (string as file) '"' { + let _, _, state = sync state lexbuf in + let flags, state = scan_flags state [] lexbuf in + let () = ignore flags in + let line = int_of_string line + and file = Filename.basename file in + let pos = state.pos#set ~file ~line ~offset:0 in + let state = {state with pos} in + scan state lexbuf + } + (* Some special errors Some special errors are recognised in the semantic actions of the @@ -517,6 +573,18 @@ and scan state = parse | _ as c { let region, _, _ = sync state lexbuf in fail region (Unexpected_character c) } +(* Scanning CPP #include flags *) + +and scan_flags state acc = parse + blank+ { let _, _, state = sync state lexbuf + in scan_flags state acc lexbuf } +| integer as code { let _, _, state = sync state lexbuf in + let acc = int_of_string code :: acc + in scan_flags state acc lexbuf } +| nl { List.rev acc, push_newline state lexbuf } +| eof { let _, _, state = sync state lexbuf + in List.rev acc, state (* TODO *) } + (* Finishing a string *) and scan_string thread state = parse diff --git a/LexerMain.ml b/LexerMain.ml index 138a567e7..a2e4dca0d 100644 --- a/LexerMain.ml +++ b/LexerMain.ml @@ -9,9 +9,38 @@ let () = Printexc.record_backtrace true let external_ text = Utils.highlight (Printf.sprintf "External error: %s" text); exit 1;; +(* Preprocessing the input source and opening the input channels *) + +let prefix = + match EvalOpt.input with + None | Some "-" -> "temp" + | Some file -> Filename.(file |> basename |> remove_extension) + +let suffix = ".pp.li" + +let pp_input = + if Utils.String.Set.mem "cpp" EvalOpt.verbose + then prefix ^ suffix + else let pp_input, pp_out = 
Filename.open_temp_file prefix suffix + in close_out pp_out; pp_input + +let cpp_cmd = + match EvalOpt.input with + None | Some "-" -> + Printf.sprintf "cpp -traditional-cpp - -o %s" pp_input + | Some file -> + Printf.sprintf "cpp -traditional-cpp %s -o %s" file pp_input + +let () = + if Utils.String.Set.mem "cpp" EvalOpt.verbose + then Printf.eprintf "%s\n%!" cpp_cmd; + if Sys.command cpp_cmd <> 0 then + external_ (Printf.sprintf "the command \"%s\" failed." cpp_cmd) + (* Running the lexer on the input file *) module Lexer = Lexer.Make (LexToken) + let () = Lexer.trace ~offsets:EvalOpt.offsets - EvalOpt.mode EvalOpt.input EvalOpt.cmd + EvalOpt.mode (Some pp_input) EvalOpt.cmd diff --git a/ParserMain.ml b/ParserMain.ml index 7644a6769..9b2f79064 100644 --- a/ParserMain.ml +++ b/ParserMain.ml @@ -34,12 +34,40 @@ let lib_path = in List.fold_right mk_I libs "" *) +(* Preprocessing the input source and opening the input channels *) + +let prefix = + match EvalOpt.input with + None | Some "-" -> "temp" + | Some file -> Filename.(file |> basename |> remove_extension) + +let suffix = ".pp.li" + +let pp_input = + if Utils.String.Set.mem "cpp" EvalOpt.verbose + then prefix ^ suffix + else let pp_input, pp_out = Filename.open_temp_file prefix suffix + in close_out pp_out; pp_input + +let cpp_cmd = + match EvalOpt.input with + None | Some "-" -> + Printf.sprintf "cpp -traditional-cpp - -o %s" pp_input + | Some file -> + Printf.sprintf "cpp -traditional-cpp %s -o %s" file pp_input + +let () = + if Utils.String.Set.mem "cpp" EvalOpt.verbose + then Printf.eprintf "%s\n%!" cpp_cmd; + if Sys.command cpp_cmd <> 0 then + external_ (Printf.sprintf "the command \"%s\" failed." 
cpp_cmd) + (* Instanciating the lexer *) module Lexer = Lexer.Make (LexToken) let Lexer.{read; buffer; get_pos; get_last; close} = - Lexer.open_token_stream EvalOpt.input + Lexer.open_token_stream (Some pp_input) and cout = stdout diff --git a/Pos.ml b/Pos.ml index 7ae0ce55c..b4475aa6e 100644 --- a/Pos.ml +++ b/Pos.ml @@ -7,6 +7,8 @@ type t = < set_file : string -> t; set_line : int -> t; + set_offset : int -> t; + set : file:string -> line:int -> offset:int -> t; new_line : string -> t; add_nl : t; @@ -44,8 +46,20 @@ let make ~byte ~point_num ~point_bol = val point_bol = point_bol method point_bol = point_bol - method set_file file = {< byte = Lexing.{byte with pos_fname = file} >} - method set_line line = {< byte = Lexing.{byte with pos_lnum = line} >} + method set_file file = + {< byte = Lexing.{byte with pos_fname = file} >} + + method set_line line = + {< byte = Lexing.{byte with pos_lnum = line} >} + + method set_offset offset = + {< byte = Lexing.{byte with pos_cnum = byte.pos_bol + offset} >} + + method set ~file ~line ~offset = + let pos = self#set_file file in + let pos = pos#set_line line in + let pos = pos#set_offset offset + in pos (* The string must not contain '\n'. See [new_line]. *) diff --git a/Pos.mli b/Pos.mli index a8b09e23b..998ea9b62 100644 --- a/Pos.mli +++ b/Pos.mli @@ -34,8 +34,10 @@ type t = < (* Setters *) - set_file : string -> t; - set_line : int -> t; + set_file : string -> t; + set_line : int -> t; + set_offset : int -> t; + set : file:string -> line:int -> offset:int -> t; (* The call [pos#new_line s], where the string [s] is either "\n" or "\c\r", updates the position [pos] with a new line. 
*) diff --git a/Region.ml b/Region.ml index 9bd84893d..68712727f 100644 --- a/Region.ml +++ b/Region.ml @@ -10,6 +10,7 @@ type t = < shift_bytes : int -> t; shift_one_uchar : int -> t; + set_file : string -> t; (* Getters *) @@ -55,6 +56,11 @@ let make ~(start: Pos.t) ~(stop: Pos.t) = and stop = stop#shift_one_uchar len in {< start = start; stop = stop >} + method set_file name = + let start = start#set_file name + and stop = stop#set_file name + in {< start = start; stop = stop >} + (* Getters *) method file = start#file diff --git a/Region.mli b/Region.mli index 973db42d6..fb3b8e240 100644 --- a/Region.mli +++ b/Region.mli @@ -24,10 +24,12 @@ type t = < translation of region [region] of [n] bytes forward in the file. The call [region#shift_one_uchar n] is similar, except that it assumes that [n] is the number of bytes making up one unicode - point. *) + point. The call [region#set_file f] sets the file name to be + [f]. *) shift_bytes : int -> t; shift_one_uchar : int -> t; + set_file : string -> t; (* Getters *)