(* Lexer specification for LIGO, to be processed by [ocamllex]. *)

{
[@@@warning "-42"]

module Region = Simple_utils.Region
module Pos = Simple_utils.Pos

(* START HEADER *)

type lexeme = string

(* STRING PROCESSING *)

(* The value of [mk_str len p] ("make string") is a string of length
   [len] containing the [len] characters in the list [p], in reverse
   order. For instance, [mk_str 3 ['c';'b';'a'] = "abc"]. *)

let mk_str (len: int) (p: char list) : string =
  let bytes = Bytes.make len ' ' in
  let rec fill i = function
      [] -> bytes
  | char::l -> Bytes.set bytes i char; fill (i-1) l
  in fill (len-1) p |> Bytes.to_string

(* The call [explode s a] is the list made by pushing the characters
   in the string [s] on top of [a], in reverse order. For example,
   [explode "ba" ['c';'d'] = ['a'; 'b'; 'c'; 'd']]. *)

let explode s acc =
  let rec push = function
      0 -> acc
  | i -> s.[i-1] :: push (i-1)
  in push (String.length s)
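
(* Illustration (not part of the lexer proper): [explode] and
   [mk_str] are inverses in the following sense, which is how they
   are used together when scanning structured constructs below:

     let s = "abc" in
     let acc = explode s [] in            (* = ['c'; 'b'; 'a'] *)
     assert (mk_str (String.length s) acc = s)
*)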

(* LEXER ENGINE *)

(* Resetting file name and line number in the lexing buffer

   The call [reset ~file ~line buffer] modifies in-place the lexing
   buffer [buffer] so the lexing engine records that the file
   associated with [buffer] is named [file], and the current line is
   [line]. This function is useful when lexing a file that has been
   previously preprocessed by the C preprocessor, in which case the
   argument [file] is the name of the file that was preprocessed,
   _not_ the preprocessed file (of which the user is not normally
   aware). By default, the [line] argument is [1].
*)

let reset_file ~file buffer =
  let open Lexing in
  buffer.lex_curr_p <- {buffer.lex_curr_p with pos_fname = file}

let reset_line ~line buffer =
  assert (line >= 0);
  let open Lexing in
  buffer.lex_curr_p <- {buffer.lex_curr_p with pos_lnum = line}

let reset_offset ~offset buffer =
  assert (offset >= 0);
  let open Lexing in
  let bol = buffer.lex_curr_p.pos_bol in
  buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum = bol + offset}

let reset ?file ?line ?offset buffer =
  let () =
    match file with
      Some file -> reset_file ~file buffer
    | None -> () in
  let () =
    match line with
      Some line -> reset_line ~line buffer
    | None -> () in
  match offset with
    Some offset -> reset_offset ~offset buffer
  | None -> ()
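
(* Illustration (a sketch, not part of the lexer): a client lexing
   the output of the preprocessor would reset the buffer so errors
   point at the original source (both file names here are made up):

     let chan   = open_in "preprocessed.ligo" in
     let buffer = Lexing.from_channel chan in
     let ()     = reset ~file:"contract.ligo" ~line:1 buffer in
     ...
*)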

(* Rolling back one lexeme _within the current semantic action_ *)

let rollback buffer =
  let open Lexing in
  let len = String.length (lexeme buffer) in
  let pos_cnum = buffer.lex_curr_p.pos_cnum - len in
  buffer.lex_curr_pos <- buffer.lex_curr_pos - len;
  buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum}

(* ALIASES *)

let sprintf = Printf.sprintf

(* TOKENS *)

(* The signature [TOKEN] exports an abstract type [token], so a lexer
   can be a functor over tokens. Consequently, generic functions to
   construct tokens are provided. Note the predicate [is_eof], which
   characterises the virtual token for end-of-file, because it
   requires special handling. *)

module type TOKEN =
  sig
    type token

    (* Errors *)

    type int_err   = Non_canonical_zero
    type ident_err = Reserved_name
    type nat_err   = Invalid_natural
                   | Non_canonical_zero_nat
    type sym_err   = Invalid_symbol
    type attr_err  = Invalid_attribute

    (* Injections *)

    val mk_int    : lexeme -> Region.t -> (token,   int_err) result
    val mk_nat    : lexeme -> Region.t -> (token,   nat_err) result
    val mk_mutez  : lexeme -> Region.t -> (token,   int_err) result
    val mk_ident  : lexeme -> Region.t -> (token, ident_err) result
    val mk_sym    : lexeme -> Region.t -> (token,   sym_err) result
    val mk_string : lexeme -> Region.t -> token
    val mk_bytes  : lexeme -> Region.t -> token
    val mk_constr : lexeme -> Region.t -> token
    val mk_attr   : string -> lexeme -> Region.t -> (token, attr_err) result
    val eof       : Region.t -> token

    (* Predicates *)

    val is_string : token -> bool
    val is_bytes  : token -> bool
    val is_int    : token -> bool
    val is_ident  : token -> bool
    val is_kwd    : token -> bool
    val is_constr : token -> bool
    val is_sym    : token -> bool
    val is_eof    : token -> bool

    (* Projections *)

    val to_lexeme : token -> lexeme
    val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string
    val to_region : token -> Region.t
  end
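
(* Illustration (a hypothetical fragment, not shipped with this
   lexer): an instantiation of [TOKEN] for a concrete syntax would
   look roughly like this, where [Int] and [Ident] are made-up
   constructors of the concrete token type:

     module MyToken : TOKEN =
       struct
         type token = Int of (string * Z.t) Region.reg
                    | Ident of string Region.reg
                    (* ... more constructors ... *)

         type int_err = Non_canonical_zero

         let mk_int lexeme region =
           let z =
             Z.of_string Str.(global_replace (regexp "_") "" lexeme)
           in if Z.equal z Z.zero && lexeme <> "0"
              then Error Non_canonical_zero
              else Ok (Int Region.{region; value = (lexeme, z)})
         (* ... remaining values of the signature ... *)
       end
*)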

(* The module type for lexers is [S]. *)

module type S =
  sig
    module Token : TOKEN
    type token = Token.token

    type file_path = string
    type logger = Markup.t list -> token -> unit

    type window =
      Nil
    | One of token
    | Two of token * token

    val slide : token -> window -> window

    type instance = {
      read     : log:logger -> Lexing.lexbuf -> token;
      buffer   : Lexing.lexbuf;
      get_win  : unit -> window;
      get_pos  : unit -> Pos.t;
      get_last : unit -> Region.t;
      get_file : unit -> file_path;
      close    : unit -> unit
    }

    type input =
      File of file_path (* "-" means stdin *)
    | Stdin
    | String of string
    | Channel of in_channel
    | Buffer of Lexing.lexbuf

    type open_err = File_opening of string

    val open_token_stream : input -> (instance, open_err) Stdlib.result

    (* Error reporting *)

    type error

    val error_to_string : error -> string

    exception Error of error Region.reg

    val format_error :
      ?offsets:bool -> [`Byte | `Point] ->
      error Region.reg -> file:bool -> string Region.reg
  end

(* The functorised interface

   Note that the module parameter [Token] is re-exported as a
   submodule in [S].
*)

module Make (Token: TOKEN) : (S with module Token = Token) =
  struct
    module Token = Token
    type token = Token.token

    type file_path = string

    (* THREAD FOR STRUCTURED CONSTRUCTS (STRINGS, COMMENTS) *)

    (* When scanning structured constructs, like strings and comments,
       we need to keep the region of the opening symbol (like double
       quote, "//" or "(*") in order to report any error more
       precisely. Since ocamllex is byte-oriented, we need to store
       the parsed bytes as characters in an accumulator [acc] and
       also its length [len], so, when we are done, it is easy to
       build the string making up the structured construct with
       [mk_str] (see above).

       The resulting data structure is called a _thread_.
       (Note for Emacs: "*)".)
    *)

    type thread = {
      opening : Region.t;
      len     : int;
      acc     : char list
    }

    let push_char char {opening; len; acc} = {opening; len=len+1; acc=char::acc}

    let push_string str {opening; len; acc} =
      {opening;
       len = len + String.length str;
       acc = explode str acc}
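
    (* Illustration (not part of the lexer): given some region
       [opening], scanning the opening of a block comment "(*" builds
       the following thread, whose accumulator grows in reverse:

         let t = {opening; len=2; acc=['*';'(']} in
         let t = push_char 'x' t in    (* acc = ['x';'*';'(']     *)
         let t = push_string "y" t in  (* acc = ['y';'x';'*';'('] *)
         assert (mk_str t.len t.acc = "(*xy")
    *)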

    (* STATE *)

    (** The type [window] models a two-slot buffer of tokens for
        reporting after a parse error.

        In [Two(t1,t2)], the token [t2] is the next to be sent to the
        parser.

        The call [slide token buffer] pushes the token [token] in the
        buffer [buffer]. If the buffer is full, that is, it is
        [Two(t1,t2)], then the token [t2] is discarded to make room
        for [token].
     *)
    type window =
      Nil
    | One of token
    | Two of token * token

    let slide token = function
        Nil -> One token
    | One t | Two (t,_) -> Two (token,t)
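
    (* Illustration (not part of the lexer): feeding tokens [a], [b]
       and [c] in that order makes the newest token the first slot
       and drops the oldest when the window is full:

         slide a Nil               (* = One a     *)
         slide b (slide a Nil)     (* = Two (b,a) *)
         slide c (Two (b,a))       (* = Two (c,b) *)
    *)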

    (* Beyond tokens, the result of lexing is a state. The type
       [state] represents the logical state of the lexing engine, that
       is, a value which is threaded during scanning and which denotes
       useful, high-level information beyond what the type
       [Lexing.lexbuf] in the standard library already provides for
       all generic lexers.

       Tokens are the smallest units used by the parser to build the
       abstract syntax tree. The state includes a queue of recognised
       tokens, with the markup at the left of its lexeme until either
       the start of the file or the end of the previously recognised
       token.

       The markup from the last recognised token or, if the first
       token has not been recognised yet, from the beginning of the
       file is stored in the field [markup] of the state because it is
       a side-effect, with respect to the output token list, and we
       use a record with a single field [units] because that record
       may be easily extended during the future maintenance of this
       lexer.

       The state also includes a field [pos] which holds the current
       position in the LIGO source file. The position is not always
       updated after a single character has been matched: that depends
       on the regular expression that matched the lexing buffer.

       The field [window] is a two-token window, that is, a buffer
       that contains the last recognised token, and the penultimate
       (if any).

       The fields [decoder] and [supply] offer the support needed
       for the lexing of UTF-8 encoded characters in comments (the
       only place where they are allowed in LIGO). The former is the
       decoder proper and the latter is the effectful function
       [supply] that takes a byte, a start index and a length and
       feeds them to [decoder]. See the documentation of the
       third-party library Uutf.
    *)

    type state = {
      units   : (Markup.t list * token) FQueue.t;
      markup  : Markup.t list;
      window  : window;
      last    : Region.t;
      pos     : Pos.t;
      decoder : Uutf.decoder;
      supply  : Bytes.t -> int -> int -> unit
    }

    (* The call [enqueue (token, state)] updates functionally the
       state [state] by associating the token [token] with the stored
       markup and enqueuing the pair into the units queue. The field
       [markup] is then reset to the empty list. *)

    let enqueue (token, state) = {
      state with
        units  = FQueue.enq (state.markup, token) state.units;
        markup = []
    }

    (* The call [sync state buffer] updates the current position in
       accordance with the contents of the lexing buffer, more
       precisely, depending on the length of the string which has just
       been recognised by the scanner: that length is used as a
       positive offset to the current column. *)

    let sync state buffer =
      let lex   = Lexing.lexeme buffer in
      let len   = String.length lex in
      let start = state.pos in
      let stop  = start#shift_bytes len in
      let state = {state with pos = stop}
      in Region.make ~start ~stop, lex, state
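
    (* Illustration (not part of the lexer): if the scanner has just
       matched the lexeme "let" at column 1, then [sync] returns a
       region spanning columns 1 to 4 of the current line, the lexeme
       itself, and a state whose position has been shifted by three
       bytes:

         let region, lexeme, state' = sync state lexbuf in
         (* lexeme = "let" and region#stop = state'.pos *)
    *)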

    (* MARKUP *)

    (* Committing markup to the current logical state *)

    let push_newline state buffer =
      let value  = Lexing.lexeme buffer
      and ()     = Lexing.new_line buffer
      and start  = state.pos in
      let stop   = start#new_line value in
      let state  = {state with pos = stop}
      and region = Region.make ~start ~stop in
      let unit   = Markup.Newline Region.{region; value} in
      let markup = unit :: state.markup
      in {state with markup}

    let push_line (thread, state) =
      let start  = thread.opening#start in
      let region = Region.make ~start ~stop:state.pos
      and value  = mk_str thread.len thread.acc in
      let unit   = Markup.LineCom Region.{region; value} in
      let markup = unit :: state.markup
      in {state with markup}

    let push_block (thread, state) =
      let start  = thread.opening#start in
      let region = Region.make ~start ~stop:state.pos
      and value  = mk_str thread.len thread.acc in
      let unit   = Markup.BlockCom Region.{region; value} in
      let markup = unit :: state.markup
      in {state with markup}

    let push_space state buffer =
      let region, lex, state = sync state buffer in
      let value  = String.length lex in
      let unit   = Markup.Space Region.{region; value} in
      let markup = unit :: state.markup
      in {state with markup}

    let push_tabs state buffer =
      let region, lex, state = sync state buffer in
      let value  = String.length lex in
      let unit   = Markup.Tabs Region.{region; value} in
      let markup = unit :: state.markup
      in {state with markup}

    let push_bom state buffer =
      let region, value, state = sync state buffer in
      let unit   = Markup.BOM Region.{region; value} in
      let markup = unit :: state.markup
      in {state with markup}

    (* ERRORS *)

    type error =
      Invalid_utf8_sequence
    | Unexpected_character of char
    | Undefined_escape_sequence
    | Missing_break
    | Unterminated_string
    | Unterminated_integer
    | Odd_lengthed_bytes
    | Unterminated_comment
    | Orphan_minus
    | Non_canonical_zero
    | Negative_byte_sequence
    | Broken_string
    | Invalid_character_in_string
    | Reserved_name of string
    | Invalid_symbol
    | Invalid_natural
    | Invalid_attribute

    let error_to_string = function
      Invalid_utf8_sequence ->
        "Invalid UTF-8 sequence.\n"
    | Unexpected_character c ->
        sprintf "Unexpected character '%s'.\n" (Char.escaped c)
    | Undefined_escape_sequence ->
        "Undefined escape sequence.\n\
         Hint: Remove or replace the sequence.\n"
    | Missing_break ->
        "Missing break.\n\
         Hint: Insert some space.\n"
    | Unterminated_string ->
        "Unterminated string.\n\
         Hint: Close with double quotes.\n"
    | Unterminated_integer ->
        "Unterminated integer.\n\
         Hint: Remove the sign or proceed with a natural number.\n"
    | Odd_lengthed_bytes ->
        "The length of the byte sequence is an odd number.\n\
         Hint: Add or remove a digit.\n"
    | Unterminated_comment ->
        "Unterminated comment.\n\
         Hint: Close with \"*)\".\n"
    | Orphan_minus ->
        "Orphan minus sign.\n\
         Hint: Remove the trailing space.\n"
    | Non_canonical_zero ->
        "Non-canonical zero.\n\
         Hint: Use 0.\n"
    | Negative_byte_sequence ->
        "Negative byte sequence.\n\
         Hint: Remove the leading minus sign.\n"
    | Broken_string ->
        "The string starting here is interrupted by a line break.\n\
         Hint: Remove the break, close the string before or insert a \
         backslash.\n"
    | Invalid_character_in_string ->
        "Invalid character in string.\n\
         Hint: Remove or replace the character.\n"
    | Reserved_name s ->
        sprintf "Reserved name: \"%s\".\n\
                 Hint: Change the name.\n" s
    | Invalid_symbol ->
        "Invalid symbol.\n\
         Hint: Check the LIGO syntax you use.\n"
    | Invalid_natural ->
        "Invalid natural."
    | Invalid_attribute ->
        "Invalid attribute."

    exception Error of error Region.reg

    let format_error ?(offsets=true) mode Region.{region; value} ~file =
      let msg = error_to_string value
      and reg = region#to_string ~file ~offsets mode in
      let value = sprintf "Lexical error %s:\n%s" reg msg
      in Region.{value; region}

    let fail region value = raise (Error Region.{region; value})
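
    (* Illustration (not part of the lexer): given a region spanning,
       say, line 5, characters 8-9 of a file "foo.ligo" (a made-up
       name), the call

         format_error ~offsets:true `Point
                      Region.{region; value=Invalid_symbol} ~file:true

       yields a value whose payload reads along these lines (the
       exact region syntax is defined by [Region.to_string]):

         Lexical error in file "foo.ligo", line 5, characters 8-9:
         Invalid symbol.
         Hint: Check the LIGO syntax you use.
    *)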

    (* TOKENS *)

    (* Making tokens *)

    let mk_string (thread, state) =
      let start  = thread.opening#start in
      let stop   = state.pos in
      let region = Region.make ~start ~stop in
      let lexeme = mk_str thread.len thread.acc in
      let token  = Token.mk_string lexeme region
      in token, state

    let mk_bytes bytes state buffer =
      let region, _, state = sync state buffer in
      let token = Token.mk_bytes bytes region
      in token, state

    let mk_int state buffer =
      let region, lexeme, state = sync state buffer in
      match Token.mk_int lexeme region with
        Ok token -> token, state
      | Error Token.Non_canonical_zero ->
          fail region Non_canonical_zero

    let mk_nat state buffer =
      let region, lexeme, state = sync state buffer in
      match Token.mk_nat lexeme region with
        Ok token -> token, state
      | Error Token.Non_canonical_zero_nat ->
          fail region Non_canonical_zero
      | Error Token.Invalid_natural ->
          fail region Invalid_natural

    let mk_mutez state buffer =
      let region, lexeme, state = sync state buffer in
      match Token.mk_mutez lexeme region with
        Ok token -> token, state
      | Error Token.Non_canonical_zero ->
          fail region Non_canonical_zero

    let mk_tez state buffer =
      let region, lexeme, state = sync state buffer in
      let lexeme = Str.string_before lexeme (String.index lexeme 't') in
      let lexeme = Z.mul (Z.of_int 1_000_000) (Z.of_string lexeme) in
      match Token.mk_mutez (Z.to_string lexeme ^ "mutez") region with
        Ok token -> token, state
      | Error Token.Non_canonical_zero ->
          fail region Non_canonical_zero

    let format_tez s =
      match String.index s '.' with
        index ->
          let len = String.length s in
          let integral = Str.first_chars s index
          and fractional = Str.last_chars s (len-index-1) in
          let num = Z.of_string (integral ^ fractional)
          and den = Z.of_string ("1" ^ String.make (len-index-1) '0')
          and million = Q.of_string "1000000" in
          let mutez = Q.make num den |> Q.mul million in
          let should_be_1 = Q.den mutez in
          if Z.equal Z.one should_be_1 then Some (Q.num mutez) else None
      | exception Not_found -> assert false
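
    (* Illustration (not part of the lexer): [format_tez] converts a
       decimal tez amount into an exact number of mutez, or [None] if
       the amount is not a whole number of mutez:

         format_tez "1.5"       (* = Some 1_500_000 (as a [Z.t]) *)
         format_tez "0.000001"  (* = Some 1                      *)
         format_tez "0.0000001" (* = None: less than one mutez   *)
    *)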

    let mk_tez_decimal state buffer =
      let region, lexeme, state = sync state buffer in
      let lexeme = Str.(global_replace (regexp "_") "" lexeme) in
      let lexeme = Str.string_before lexeme (String.index lexeme 't') in
      match format_tez lexeme with
        None -> assert false
      | Some tz ->
          match Token.mk_mutez (Z.to_string tz ^ "mutez") region with
            Ok token -> token, state
          | Error Token.Non_canonical_zero ->
              fail region Non_canonical_zero

    let mk_ident state buffer =
      let region, lexeme, state = sync state buffer in
      match Token.mk_ident lexeme region with
        Ok token -> token, state
      | Error Token.Reserved_name -> fail region (Reserved_name lexeme)

    let mk_attr header attr state buffer =
      let region, _, state = sync state buffer in
      match Token.mk_attr header attr region with
        Ok token -> token, state
      | Error Token.Invalid_attribute ->
          fail region Invalid_attribute

    let mk_constr state buffer =
      let region, lexeme, state = sync state buffer
      in Token.mk_constr lexeme region, state

    let mk_sym state buffer =
      let region, lexeme, state = sync state buffer in
      match Token.mk_sym lexeme region with
        Ok token -> token, state
      | Error Token.Invalid_symbol -> fail region Invalid_symbol

    let mk_eof state buffer =
      let region, _, state = sync state buffer
      in Token.eof region, state

(* END HEADER *)
}

(* START LEXER DEFINITION *)

(* Named regular expressions *)

let utf8_bom   = "\xEF\xBB\xBF" (* Byte Order Mark for UTF-8 *)
let nl         = ['\n' '\r'] | "\r\n"
let blank      = ' ' | '\t'
let digit      = ['0'-'9']
let natural    = digit | digit (digit | '_')* digit
let decimal    = natural '.' natural
let small      = ['a'-'z']
let capital    = ['A'-'Z']
let letter     = small | capital
let ident      = small (letter | '_' | digit)*
let constr     = capital (letter | '_' | digit)*
let attr       = ident | constr
let hexa_digit = digit | ['A'-'F' 'a'-'f']
let byte       = hexa_digit hexa_digit
let byte_seq   = byte | byte (byte | '_')* byte
let bytes      = "0x" (byte_seq? as seq)
let esc        = "\\n" | "\\\"" | "\\\\" | "\\b"
               | "\\r" | "\\t" | "\\x" byte

let pascaligo_sym  = "=/=" | '#' | ":="
let cameligo_sym   = "<>" | "::" | "||" | "&&"
let reasonligo_sym = '!' | "=>" | "!=" | "==" | "++" | "..." | "||" | "&&"

let symbol =
  ';' | ',' | '(' | ')' | '[' | ']' | '{' | '}'
| '=' | ':' | '|' | "->" | '.' | '_' | '^'
| '+' | '-' | '*' | '/'
| '<' | "<=" | '>' | ">="
| pascaligo_sym | cameligo_sym | reasonligo_sym

let string = [^'"' '\\' '\n']* (* For strings of #include *)
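
(* Illustration (not part of the lexer): some lexemes matched by the
   named regular expressions above:

     natural   1  42  1_000_000    -- underscores only between digits
     decimal   0.5  1_000.000_001
     ident     balance  store_v1
     constr    Some  C_None
     bytes     0x  0xFF  0xDE_AD
*)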

(* RULES *)

(* Except for the first rule [init], all rules bear a name starting
   with "scan".

   All have a parameter [state] that they thread through their
   recursive calls. The rules for the structured constructs (strings
   and comments) have an extra parameter of type [thread] (see above).
*)

rule init state = parse
  utf8_bom { scan (push_bom state lexbuf) lexbuf }
| _        { rollback lexbuf; scan state lexbuf  }

and scan state = parse
  nl                     { scan (push_newline state lexbuf) lexbuf }
| ' '+                   { scan (push_space state lexbuf) lexbuf   }
| '\t'+                  { scan (push_tabs state lexbuf) lexbuf    }
| ident                  { mk_ident state lexbuf |> enqueue        }
| constr                 { mk_constr state lexbuf |> enqueue       }
| bytes                  { mk_bytes seq state lexbuf |> enqueue    }
| natural 'n'            { mk_nat state lexbuf |> enqueue          }
| natural "mutez"        { mk_mutez state lexbuf |> enqueue        }
| natural "tz"
| natural "tez"          { mk_tez state lexbuf |> enqueue          }
| decimal "tz"
| decimal "tez"          { mk_tez_decimal state lexbuf |> enqueue  }
| natural                { mk_int state lexbuf |> enqueue          }
| symbol                 { mk_sym state lexbuf |> enqueue          }
| eof                    { mk_eof state lexbuf |> enqueue          }
| "[@"  (attr as a) "]"  { mk_attr "[@" a state lexbuf |> enqueue  }
| "[@@" (attr as a) "]"  { mk_attr "[@@" a state lexbuf |> enqueue }

| '"'  { let opening, _, state = sync state lexbuf in
         let thread = {opening; len=1; acc=['"']} in
         scan_string thread state lexbuf |> mk_string |> enqueue }

| "(*" { let opening, _, state = sync state lexbuf in
         let thread = {opening; len=2; acc=['*';'(']} in
         let state = scan_block thread state lexbuf |> push_block
         in scan state lexbuf }

| "//" { let opening, _, state = sync state lexbuf in
         let thread = {opening; len=2; acc=['/';'/']} in
         let state = scan_line thread state lexbuf |> push_line
         in scan state lexbuf }

(* Management of #include preprocessing directives

   An input LIGO program may contain preprocessing directives, and
   the entry modules (named *Main.ml) run the preprocessor on them,
   as if using the GNU C preprocessor in traditional mode:

   https://gcc.gnu.org/onlinedocs/cpp/Traditional-Mode.html

   The main interest in using a preprocessor is that it can stand
   for a poor man's (flat) module system for LIGO thanks to #include
   directives, and the equivalent of the traditional mode leaves the
   markup undisturbed.

   Contrary to the C preprocessor, our preprocessor does not
   generate #line directives that deal with system file headers, so
   only the #line directives resulting from processing #include
   directives remain to be interpreted for our purpose. Moreover,
   these #line directives may also carry some additional flags:

   https://gcc.gnu.org/onlinedocs/cpp/Preprocessor-Output.html

   of which 1 and 2 indicate, respectively, the start of a new file
   and the return from a file (after its inclusion has been
   processed).
*)

| '#' blank* ("line" blank+)? (natural as line) blank+
  '"' (string as file) '"' {
    let _, _, state = sync state lexbuf in
    let flags, state = scan_flags state [] lexbuf in
    let () = ignore flags in
    let line = int_of_string line
    and file = Filename.basename file in
    let pos = state.pos#set ~file ~line ~offset:0 in
    let state = {state with pos} in
    scan state lexbuf }
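
(* Illustration (not part of the lexer): the clause above matches
   directives like the following, as emitted by the preprocessor
   (the file name is made up); the trailing flag "1" denotes the
   start of an included file:

     # 1 "guess/included.ligo" 1
*)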

(* Some special errors

   Some special errors are recognised in the semantic actions of the
   following regular expressions. The first error is a minus sign
   separated from the integer it applies to by some markup (space or
   tabs). The second is a minus sign immediately followed by
   anything else than a natural number (matched above) or markup and
   a number (previous error). The third is the strange occurrence of
   an attempt at defining a negative byte sequence. Finally, the
   catch-all rule reports unexpected characters in the buffer (and
   is not so special, after all).
*)

| '-' { let region, _, state = sync state lexbuf in
        let state = scan state lexbuf in
        let open Markup in
        match FQueue.peek state.units with
          None -> assert false
        | Some (_, ((Space _ | Tabs _)::_, token))
            when Token.is_int token ->
              fail region Orphan_minus
        | _ -> fail region Unterminated_integer }

| "-0x" byte_seq?
      { let region, _, _ = sync state lexbuf
        in fail region Negative_byte_sequence }

| _ as c { let region, _, _ = sync state lexbuf
           in fail region (Unexpected_character c) }

(* Scanning CPP #include flags *)

and scan_flags state acc = parse
  blank+          { let _, _, state = sync state lexbuf
                    in scan_flags state acc lexbuf          }
| natural as code { let _, _, state = sync state lexbuf in
                    let acc = int_of_string code :: acc
                    in scan_flags state acc lexbuf          }
| nl              { List.rev acc, push_newline state lexbuf }
| eof             { let _, _, state = sync state lexbuf
                    in List.rev acc, state (* TODO *)       }

(* Finishing a string *)

and scan_string thread state = parse
  nl       { fail thread.opening Broken_string }
| eof      { fail thread.opening Unterminated_string }
| ['\t' '\r' '\b']
           { let region, _, _ = sync state lexbuf
             in fail region Invalid_character_in_string }
| '"'      { let _, _, state = sync state lexbuf
             in push_char '"' thread, state }
| esc      { let _, lexeme, state = sync state lexbuf
             in scan_string (push_string lexeme thread) state lexbuf }
| '\\' _   { let region, _, _ = sync state lexbuf
             in fail region Undefined_escape_sequence }
| _ as c   { let _, _, state = sync state lexbuf in
             scan_string (push_char c thread) state lexbuf }

(* Finishing a block comment

   (Note for Emacs: ("(*")
   The lexing of block comments must take care of embedded block
   comments that may occur within, as well as strings, so no substring
   "*)" may inadvertently close the block. This is the purpose
   of the first case of the scanner [scan_block].
*)

and scan_block thread state = parse
  '"' | "(*" { let opening = thread.opening in
               let opening', lexeme, state = sync state lexbuf in
               let thread = push_string lexeme thread in
               let thread = {thread with opening=opening'} in
               let next = if lexeme = "\"" then scan_string
                          else scan_block in
               let thread, state = next thread state lexbuf in
               let thread = {thread with opening}
               in scan_block thread state lexbuf }
| "*)"       { let _, lexeme, state = sync state lexbuf
               in push_string lexeme thread, state }
| nl as nl   { let () = Lexing.new_line lexbuf
               and state = {state with pos = state.pos#new_line nl}
               and thread = push_string nl thread
               in scan_block thread state lexbuf }
| eof        { fail thread.opening Unterminated_comment }
| _          { let () = rollback lexbuf in
               let len = thread.len in
               let thread,
                   status = scan_utf8 thread state lexbuf in
               let delta = thread.len - len in
               let pos = state.pos#shift_one_uchar delta in
               match status with
                 None -> scan_block thread {state with pos} lexbuf
               | Some error ->
                   let region = Region.make ~start:state.pos ~stop:pos
                   in fail region error }

(* Finishing a line comment *)

and scan_line thread state = parse
  nl as nl { let () = Lexing.new_line lexbuf
             and thread = push_string nl thread
             and state = {state with pos = state.pos#new_line nl}
             in thread, state }
| eof      { thread, state }
| _        { let () = rollback lexbuf in
             let len = thread.len in
             let thread,
                 status = scan_utf8 thread state lexbuf in
             let delta = thread.len - len in
             let pos = state.pos#shift_one_uchar delta in
             match status with
               None -> scan_line thread {state with pos} lexbuf
             | Some error ->
                 let region = Region.make ~start:state.pos ~stop:pos
                 in fail region error }

and scan_utf8 thread state = parse
  eof    { fail thread.opening Unterminated_comment }
| _ as c { let thread = push_char c thread in
           let lexeme = Lexing.lexeme lexbuf in
           let () = state.supply (Bytes.of_string lexeme) 0 1 in
           match Uutf.decode state.decoder with
             `Uchar _     -> thread, None
           | `Malformed _ -> thread, Some Invalid_utf8_sequence
           | `Await       -> scan_utf8 thread state lexbuf
           | `End         -> assert false }

(* END LEXER DEFINITION *)

{
(* START TRAILER *)

(* Scanning the lexing buffer for tokens (and markup, as a
   side-effect).

   Because we want the lexer to have access to the right lexical
   context of a recognised lexeme (to enforce stylistic constraints or
   report special error patterns), we need to keep a hidden reference
   to a queue of recognised lexical units (that is, tokens and markup)
   that acts as a mutable state between the calls to [read]. When
   [read] is called, that queue is examined first and, if it contains
   at least one token, that token is returned; otherwise, the lexing
   buffer is scanned for at least one more new token. That is the
   general principle: we put a high-level buffer (our queue) on top of
   the low-level lexing buffer.

   One tricky and important detail is that we must make any parser
   generated by Menhir (and calling [read]) believe that the last
   region of the input source that was matched indeed corresponds to
   the returned token, despite the fact that many tokens and markup
   may have been matched since it was actually read from the input. In
   other words, the parser requests a token that is taken from the
   high-level buffer, but the parser requests the source regions from
   the _low-level_ lexing buffer, and they may disagree if more than
   one token has actually been recognised.

   Consequently, in order to maintain a consistent view for the
   parser, we have to patch some fields of the lexing buffer, namely
   [lex_start_p] and [lex_curr_p], as these fields are read by parsers
   generated by Menhir when querying source positions (regions). This
   is the purpose of the function [patch_buffer]. After reading one
   or more tokens and markup by the scanning rule [scan], we have to
   save in the hidden reference [buf_reg] the region of the source
   that was matched by [scan]. This atomic sequence of patching,
   scanning and saving is implemented by the _function_ [scan]
   (beware: it shadows the scanning rule [scan]). The function
   [patch_buffer] is, of course, also called just before returning the
   token, so the parser has a view of the lexing buffer consistent
   with the token.

   Note that an additional reference [first_call] is needed to
   distinguish the first call to the function [scan], as the first
   scanning rule is actually [init] (which can handle the BOM), not
   [scan].
*)
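
(* Illustration (a sketch, not part of the lexer): a typical client
   obtains an [instance] and pulls tokens with [read] until [eof],
   for example (the logger here simply discards the markup):

     let log _markup _token = () in
     match open_token_stream (String "let x = 1n") with
       Ok {read; buffer; close; _} ->
         let rec loop () =
           let token = read ~log buffer in
           if Token.is_eof token then () else loop ()
         in loop (); close ()
     | Error (File_opening msg) -> prerr_endline msg
*)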

type logger = Markup.t list -> token -> unit

type instance = {
  read     : log:logger -> Lexing.lexbuf -> token;
  buffer   : Lexing.lexbuf;
  get_win  : unit -> window;
  get_pos  : unit -> Pos.t;
  get_last : unit -> Region.t;
  get_file : unit -> file_path;
  close    : unit -> unit
}

type input =
  File of file_path (* "-" means stdin *)
| Stdin
| String of string
| Channel of in_channel
| Buffer of Lexing.lexbuf

type open_err = File_opening of string

let open_token_stream input =
  let file_path = match input with
    File file_path ->
      if file_path = "-" then "" else file_path
  | _ -> "" in
  let pos = Pos.min ~file:file_path in
  let buf_reg = ref (pos#byte, pos#byte)
  and first_call = ref true
  and decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in
  let supply = Uutf.Manual.src decoder in
  let state = ref {units = FQueue.empty;
                   last = Region.ghost;
                   window = Nil;
                   pos;
                   markup = [];
                   decoder;
                   supply} in

  let get_pos () = !state.pos
  and get_last () = !state.last
  and get_win () = !state.window
  and get_file () = file_path in

  let patch_buffer (start, stop) buffer =
    let open Lexing in
    let file_path = buffer.lex_curr_p.pos_fname in
    buffer.lex_start_p <- {start with pos_fname = file_path};
    buffer.lex_curr_p <- {stop with pos_fname = file_path}

  and save_region buffer =
    buf_reg := Lexing.(buffer.lex_start_p, buffer.lex_curr_p) in

  let scan buffer =
    patch_buffer !buf_reg buffer;
    (if !first_call
     then (state := init !state buffer; first_call := false)
     else state := scan !state buffer);
    save_region buffer in

  let next_token buffer =
    scan buffer;
    match FQueue.peek !state.units with
      None -> assert false
    | Some (units, ext_token) ->
        state := {!state with units}; Some ext_token in

  let check_right_context token buffer =
    let open Token in
    if is_int token || is_bytes token then
      match next_token buffer with
        Some ([], next) ->
          let pos = (Token.to_region token)#stop in
          let region = Region.make ~start:pos ~stop:pos in
          if is_int next then
            fail region Odd_lengthed_bytes
          else
            if is_ident next || is_string next || is_bytes next then
              fail region Missing_break
      | _ -> ()
    else
      if is_ident token || is_string token then
        match next_token buffer with
          Some ([], next) ->
            if is_ident next || is_string next
               || is_bytes next || is_int next
            then
              let pos = (Token.to_region token)#stop in
              let region = Region.make ~start:pos ~stop:pos
              in fail region Missing_break
        | _ -> () in
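
  (* Illustration (not part of the lexer): [check_right_context]
     rejects adjacent lexemes lacking intervening markup. For
     example, "1x" lexes as the integer "1" directly followed by the
     identifier "x" and fails with [Missing_break], while "0x123"
     lexes as the byte sequence "0x12" directly followed by the
     integer "3" and fails with [Odd_lengthed_bytes]. *)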

  let rec read ~log buffer =
    match FQueue.deq !state.units with
      None ->
        scan buffer;
        read ~log buffer
    | Some (units, (left_mark, token)) ->
        log left_mark token;
        state := {!state with units;
                  last = Token.to_region token;
                  window = slide token !state.window};
        check_right_context token buffer;
        patch_buffer (Token.to_region token)#byte_pos buffer;
        token in

  let buf_close_res =
    match input with
      File "" | File "-" | Stdin ->
        Ok (Lexing.from_channel stdin, fun () -> close_in stdin)
    | File path ->
        (try
           let chan = open_in path in
           let close () = close_in chan in
           Ok (Lexing.from_channel chan, close)
         with Sys_error msg -> Stdlib.Error (File_opening msg))
    | String s ->
        Ok (Lexing.from_string s, fun () -> ())
    | Channel chan ->
        let close () = close_in chan in
        Ok (Lexing.from_channel chan, close)
    | Buffer b -> Ok (b, fun () -> ()) in
  match buf_close_res with
    Ok (buffer, close) ->
      let () =
        match input with
          File path when path <> "" -> reset ~file:path buffer
        | _ -> () in
      let instance = {
        read; buffer; get_win; get_pos; get_last; get_file; close}
      in Ok instance
  | Error _ as e -> e

end (* of functor [Make] in HEADER *)
(* END TRAILER *)
}