ligo/src/passes/1-parser/shared/LexerUnit.ml

131 lines
5.2 KiB
OCaml
Raw Normal View History

(* Functor to build a LIGO lexer *)
2020-01-04 23:32:50 +01:00
module Region = Simple_utils.Region
module Preproc = Preprocessor.Preproc
module SSet = Set.Make (String)
2020-01-20 10:57:07 +01:00
module type IO =
2020-01-04 23:32:50 +01:00
sig
val options : EvalOpt.options (* CLI options *)
end
2020-04-28 19:26:31 +02:00
module Make (IO: IO) (Lexer: Lexer.S) =
2020-01-04 23:32:50 +01:00
struct
(* Error printing and exception tracing *)
let () = Printexc.record_backtrace true
(* Preprocessing and lexing the input source *)
2020-01-04 23:32:50 +01:00
let scan () : (Lexer.token list, string Region.reg) Stdlib.result =
(* Preprocessing the input source *)
let preproc cin =
let buffer = Lexing.from_channel cin in
let open Lexing in
let () =
match IO.options#input with
None -> ()
| Some pos_fname ->
buffer.lex_curr_p <- {buffer.lex_curr_p with pos_fname} in
let opt = (IO.options :> Preprocessor.EvalOpt.options) in
match Preproc.lex opt buffer with
Stdlib.Error (pp_buffer, err) ->
if SSet.mem "preproc" IO.options#verbose then
Printf.printf "%s\n%!" (Buffer.contents pp_buffer);
let formatted =
Preproc.format ~offsets:IO.options#offsets ~file:true err
in Stdlib.Error formatted
| Stdlib.Ok pp_buffer ->
(* Running the lexer on the preprocessed input *)
2020-04-28 19:26:31 +02:00
let source = LexerLib.String (Buffer.contents pp_buffer) in
match LexerLib.open_token_stream
?line:IO.options#line
?block:IO.options#block
~init:Lexer.init
~scan:Lexer.scan
~token_to_region:Lexer.Token.to_region
~style:Lexer.Token.check_right_context
source with
Ok LexerLib.{read; buffer; close; _} ->
The preprocessor library depends now on the kinds of comments instead of a closed set of languages. I also removed the offsets: I simply use the current region to determine whether the preprocessing directie starts at the beginning of a line. I also removed scanning line indicators, to make the lexer simpler. LexToken.mll: Moved the function [check_right_context] that checks stylistic constraints from Lexer.mll to LexToken.mll. While this triplicates code (as CameLIGO, PascaLIGO and ReasonLIGO share the same constraints), the benefit is that Lexer.mll becomes more generic and the signature for the TOKEN module is simpler (no more exporting predicates, except for EOF). In accordance with the change of the preprocessor, the lexers and parsers for LIGO now depend on the kind of comments, not a fixed set of syntaxes. This gives more versatility when adding a new language: only the kinds of its comments are needed, although Lexer.mll and Preproc.mll may have to be modified if they do not already know the comment delimiters, for example line comments starting with #. **************************************************************** BUG: The exceptions coming from LexToken.mll when a stylistic constraint is broken in [LexToken.check_right_context] are not caught yet. **************************************************************** Lexer.mll: I moved out as much as I could from the header into a new module LexerLib. The aim is to make it easy to reuse as much as possible of the lexer machinerie, when it cannot be used as is.
2020-04-24 21:06:18 +02:00
let close_all () = flush_all (); close () in
let rec read_tokens tokens =
match read ~log:(fun _ _ -> ()) buffer with
token ->
if Lexer.Token.is_eof token
then Stdlib.Ok (List.rev tokens)
else read_tokens (token::tokens)
2020-04-28 19:26:31 +02:00
| exception Lexer.Token.Error error ->
The preprocessor library depends now on the kinds of comments instead of a closed set of languages. I also removed the offsets: I simply use the current region to determine whether the preprocessing directie starts at the beginning of a line. I also removed scanning line indicators, to make the lexer simpler. LexToken.mll: Moved the function [check_right_context] that checks stylistic constraints from Lexer.mll to LexToken.mll. While this triplicates code (as CameLIGO, PascaLIGO and ReasonLIGO share the same constraints), the benefit is that Lexer.mll becomes more generic and the signature for the TOKEN module is simpler (no more exporting predicates, except for EOF). In accordance with the change of the preprocessor, the lexers and parsers for LIGO now depend on the kind of comments, not a fixed set of syntaxes. This gives more versatility when adding a new language: only the kinds of its comments are needed, although Lexer.mll and Preproc.mll may have to be modified if they do not already know the comment delimiters, for example line comments starting with #. **************************************************************** BUG: The exceptions coming from LexToken.mll when a stylistic constraint is broken in [LexToken.check_right_context] are not caught yet. **************************************************************** Lexer.mll: I moved out as much as I could from the header into a new module LexerLib. The aim is to make it easy to reuse as much as possible of the lexer machinerie, when it cannot be used as is.
2020-04-24 21:06:18 +02:00
let file =
match IO.options#input with
None | Some "-" -> false
| Some _ -> true in
let msg =
Lexer.Token.format_error
~offsets:IO.options#offsets
IO.options#mode ~file error
in Stdlib.Error msg
| exception Lexer.Error error ->
let file =
match IO.options#input with
None | Some "-" -> false
| Some _ -> true in
let msg =
Lexer.format_error ~offsets:IO.options#offsets
IO.options#mode ~file error
in Stdlib.Error msg in
let result = read_tokens []
in close_all (); result
2020-04-28 19:26:31 +02:00
| Stdlib.Error (LexerLib.File_opening msg) ->
The preprocessor library depends now on the kinds of comments instead of a closed set of languages. I also removed the offsets: I simply use the current region to determine whether the preprocessing directie starts at the beginning of a line. I also removed scanning line indicators, to make the lexer simpler. LexToken.mll: Moved the function [check_right_context] that checks stylistic constraints from Lexer.mll to LexToken.mll. While this triplicates code (as CameLIGO, PascaLIGO and ReasonLIGO share the same constraints), the benefit is that Lexer.mll becomes more generic and the signature for the TOKEN module is simpler (no more exporting predicates, except for EOF). In accordance with the change of the preprocessor, the lexers and parsers for LIGO now depend on the kind of comments, not a fixed set of syntaxes. This gives more versatility when adding a new language: only the kinds of its comments are needed, although Lexer.mll and Preproc.mll may have to be modified if they do not already know the comment delimiters, for example line comments starting with #. **************************************************************** BUG: The exceptions coming from LexToken.mll when a stylistic constraint is broken in [LexToken.check_right_context] are not caught yet. **************************************************************** Lexer.mll: I moved out as much as I could from the header into a new module LexerLib. The aim is to make it easy to reuse as much as possible of the lexer machinerie, when it cannot be used as is.
2020-04-24 21:06:18 +02:00
flush_all (); Stdlib.Error (Region.wrap_ghost msg) in
match IO.options#input with
None -> preproc stdin
| Some file_path ->
try open_in file_path |> preproc with
Sys_error msg -> Stdlib.Error (Region.wrap_ghost msg)
2020-01-20 10:57:07 +01:00
(* Tracing the lexing *)
2020-01-20 10:57:07 +01:00
2020-01-04 23:32:50 +01:00
module Log = LexerLog.Make (Lexer)
let trace () : (unit, string Region.reg) Stdlib.result =
2020-01-20 10:57:07 +01:00
(* Preprocessing the input *)
let preproc cin =
let buffer = Lexing.from_channel cin in
let open Lexing in
let () =
match IO.options#input with
None | Some "-" -> ()
| Some pos_fname ->
buffer.lex_curr_p <- {buffer.lex_curr_p with pos_fname} in
let opt = (IO.options :> Preprocessor.EvalOpt.options) in
match Preproc.lex opt buffer with
Stdlib.Error (pp_buffer, err) ->
if SSet.mem "preproc" IO.options#verbose then
Printf.printf "%s\n%!" (Buffer.contents pp_buffer);
let formatted =
Preproc.format ~offsets:IO.options#offsets ~file:true err
in Stdlib.Error formatted
| Stdlib.Ok pp_buffer ->
let preproc_str = Buffer.contents pp_buffer in
if SSet.mem "preproc" IO.options#verbose then
begin
Printf.printf "%s\n%!" (Buffer.contents pp_buffer);
Stdlib.Ok ()
end
else Log.trace ~offsets:IO.options#offsets
IO.options#mode
The preprocessor library depends now on the kinds of comments instead of a closed set of languages. I also removed the offsets: I simply use the current region to determine whether the preprocessing directie starts at the beginning of a line. I also removed scanning line indicators, to make the lexer simpler. LexToken.mll: Moved the function [check_right_context] that checks stylistic constraints from Lexer.mll to LexToken.mll. While this triplicates code (as CameLIGO, PascaLIGO and ReasonLIGO share the same constraints), the benefit is that Lexer.mll becomes more generic and the signature for the TOKEN module is simpler (no more exporting predicates, except for EOF). In accordance with the change of the preprocessor, the lexers and parsers for LIGO now depend on the kind of comments, not a fixed set of syntaxes. This gives more versatility when adding a new language: only the kinds of its comments are needed, although Lexer.mll and Preproc.mll may have to be modified if they do not already know the comment delimiters, for example line comments starting with #. **************************************************************** BUG: The exceptions coming from LexToken.mll when a stylistic constraint is broken in [LexToken.check_right_context] are not caught yet. **************************************************************** Lexer.mll: I moved out as much as I could from the header into a new module LexerLib. The aim is to make it easy to reuse as much as possible of the lexer machinerie, when it cannot be used as is.
2020-04-24 21:06:18 +02:00
?block:IO.options#block
?line:IO.options#line
2020-04-28 19:26:31 +02:00
~token_to_region:Lexer.Token.to_region
~style:Lexer.Token.check_right_context
(LexerLib.String preproc_str)
IO.options#cmd
in match IO.options#input with
None -> preproc stdin
| Some file_path ->
try open_in file_path |> preproc with
Sys_error msg -> Stdlib.Error (Region.wrap_ghost msg)
2020-01-04 23:32:50 +01:00
end