(* Lexer specification for LIGO, to be processed by [ocamllex].

   The underlying design principles are:

     (1) enforce stylistic constraints at a lexical level, in order
         to reject early any potentially misleading or poorly written
         LIGO contracts;

     (2) provide precise error messages with hints as to how to fix
         the issue, which is achieved by consulting the lexical
         right-context of lexemes;

     (3) be as independent as possible from the LIGO version, so that
         upgrades have as little impact as possible on this
         specification: this is achieved by using the most general
         regular expressions to match the lexing buffer and broadly
         distinguish the syntactic categories, and then delegating a
         finer, second analysis to an external module making the
         tokens (hence a functor below);

     (4) support unit testing (lexing of the whole input with debug
         traces).

   A limitation to the independence with respect to the LIGO version
   lies in the errors that the external module building the tokens
   (which may be version-dependent) may have to report. Indeed, these
   errors have to be contextualised by the lexer in terms of input
   source regions, so that useful error messages can be printed;
   therefore, they are part of the signature [TOKEN] that
   parameterises the functor generated here. For instance, if, in a
   future release of LIGO, new tokens are added, and the recognition
   of their lexemes entails new errors, the signature [TOKEN] will
   have to be augmented and this lexer specification changed. In
   practice, however, it is more likely that instructions or types
   will be added, rather than new kinds of tokens.
*)

module Region = Simple_utils.Region
module Pos = Simple_utils.Pos

(* TOKENS *)

(* The signature [TOKEN] exports an abstract type [token], so a
   lexer can be a functor over tokens. This makes it possible to
   externalise version-dependent constraints in any module whose
   signature matches [TOKEN]. Generic functions to construct tokens
   are required.

   Note the predicate [is_eof], which characterises the virtual
   token for end-of-file, because it requires special handling. Some
   of those functions may yield errors, which are defined as values
   of the type [int_err] etc. These errors can be better understood
   by reading the ocamllex specification for the lexer
   ([Lexer.mll]).
*)

type lexeme = string

module type TOKEN =
  sig
    type token

    (* Errors *)

    type int_err   = Non_canonical_zero
    type ident_err = Reserved_name
    type nat_err   = Invalid_natural
                   | Non_canonical_zero_nat
    type sym_err   = Invalid_symbol
    type attr_err  = Invalid_attribute

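    (* For instance (a hedged sketch, not the actual
       version-dependent implementation): the lexeme "00" is expected
       to make [mk_int] below return [Error Non_canonical_zero],
       "00n" to make [mk_nat] return [Error Non_canonical_zero_nat],
       and a keyword used as an identifier to make [mk_ident] return
       [Error Reserved_name]. A minimal [mk_int], assuming a
       hypothetical constructor [Int] of [token], could read:

         let mk_int lexeme region =
           if String.length lexeme > 1 && lexeme.[0] = '0'
           then Error Non_canonical_zero
           else Ok (Int Region.{region; value = lexeme}) *)
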
    (* Injections *)

    val mk_int      : lexeme -> Region.t -> (token,   int_err) result
    val mk_nat      : lexeme -> Region.t -> (token,   nat_err) result
    val mk_mutez    : lexeme -> Region.t -> (token,   int_err) result
    val mk_ident    : lexeme -> Region.t -> (token, ident_err) result
    val mk_sym      : lexeme -> Region.t -> (token,   sym_err) result
    val mk_string   : lexeme -> Region.t -> token
    val mk_verbatim : lexeme -> Region.t -> token
    val mk_bytes    : lexeme -> Region.t -> token
    val mk_constr   : lexeme -> Region.t -> token
    val mk_attr     : string -> lexeme -> Region.t -> (token, attr_err) result
    val mk_lang     : lexeme Region.reg -> Region.t -> token
    val eof         : Region.t -> token

    (* Predicates *)

    val is_eof : token -> bool

    (* Projections *)

    val to_lexeme : token -> lexeme
    val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string
    val to_region : token -> Region.t

    (* Style *)

    type error

    val error_to_string : error -> string

    exception Error of error Region.reg

    val format_error :
      ?offsets:bool ->
      [`Byte | `Point] ->
      error Region.reg ->
      file:bool ->
      string Region.reg

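    (* A usage sketch (it only applies the values declared above):
       given [err : error Region.reg], a printable diagnostic is
       obtained with

         let msg = format_error ~offsets:true `Point err ~file:true
         in print_string msg.Region.value

       where [value] is the field of [Region.reg] carrying the
       formatted message. *)
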
    val check_right_context :
      token ->
      (Lexing.lexbuf -> (Markup.t list * token) option) ->
      Lexing.lexbuf ->
      unit
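
    (* [check_right_context token next lexbuf] supports design
       principle (1) above: it inspects the lexeme following [token]
       by means of [next], a scanner returning the next markup and
       token, if any, and reports style violations. A hedged usage
       sketch, where [next_token] stands for a hypothetical scanning
       function of the right type:

         let check_and_return token lexbuf =
           check_right_context token next_token lexbuf; token

       (The precise contract is given by the ocamllex specification
       [Lexer.mll].) *)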
  end

(* The signature of the lexer *)

module type S =
  sig
    module Token : TOKEN
    type token = Token.token

    (* The scanner *)

    val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state

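    (* Driving the scanner (a hedged sketch): [scan] threads a state
       through the lexing buffer, one token per call. Assuming a
       hypothetical accessor [last_token : token LexerLib.state ->
       token], not part of this interface, scanning a whole input
       could read:

         let rec scan_all state lexbuf =
           let state = scan state lexbuf in
           if Token.is_eof (last_token state) then state
           else scan_all state lexbuf *)
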
    (* Errors (specific to the generic lexer, not to the tokens) *)

    type error

    val error_to_string : error -> string

    exception Error of error Region.reg

    val format_error :
      ?offsets:bool -> [`Byte | `Point] ->
      error Region.reg -> file:bool -> string Region.reg
  end

(* The functorised interface

   Note that the module parameter [Token] is re-exported as a
   submodule in [S].
*)

module Make (Token : TOKEN) : S with module Token = Token
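
(* Example of instantiation (a hedged sketch: [LexToken] stands for
   any module matching [TOKEN], for instance one written for a given
   version of a LIGO concrete syntax; the name is illustrative):

     module MyLexer = Make (LexToken)

   By the constraint [S with module Token = Token], the submodule
   [MyLexer.Token] is [LexToken] itself, so the tokens built by the
   lexer are exactly those a parser over [LexToken] expects. *)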