(* Lexer specification for LIGO, to be processed by [ocamllex].
The underlying design principles are:
   (1) enforce stylistic constraints at a lexical level, in order to
       reject early any potentially misleading or poorly written
       LIGO contracts;
   (2) provide precise error messages, with hints as to how to fix
       the issue, which is achieved by consulting the lexical
       right-context of lexemes;
(3) be as independent as possible from the LIGO version, so
upgrades have as little impact as possible on this
specification: this is achieved by using the most general
regular expressions to match the lexing buffer and broadly
distinguish the syntactic categories, and then delegating a
finer, second analysis to an external module making the
tokens (hence a functor below);
(4) support unit testing (lexing of the whole input with debug
traces).
A limitation to the independence with respect to the LIGO version
lies in the errors that the external module building the tokens
   (which may be version-dependent) may have to report. Indeed, these
   errors have to be contextualised by the lexer in terms of input
   source regions, so that useful error messages can be printed;
   therefore, they are part of the signature [TOKEN] that
   parameterises the functor generated here. For instance, if, in a
   future release of
LIGO, new tokens are added, and the recognition of their lexemes
entails new errors, the signature [TOKEN] will have to be augmented
and this lexer specification changed. However, in practice, it is
more likely that instructions or types will be added, instead of
new kinds of tokens.
*)
module Region = Simple_utils.Region
module Pos = Simple_utils.Pos
(* TOKENS *)
(* The signature [TOKEN] exports an abstract type [token], so a lexer
   can be a functor over tokens. This makes it possible to externalise
version-dependent constraints in any module whose signature matches
[TOKEN]. Generic functions to construct tokens are required.
   Note the predicate [is_eof], which characterises the virtual token
for end-of-file, because it requires special handling. Some of
those functions may yield errors, which are defined as values of
the type [int_err] etc. These errors can be better understood by
reading the ocamllex specification for the lexer ([Lexer.mll]).
*)
type lexeme = string
module type TOKEN =
  sig
    type token

    (* Errors *)

    type int_err   = Non_canonical_zero
    type ident_err = Reserved_name
    type nat_err   = Invalid_natural
                   | Non_canonical_zero_nat
    type sym_err   = Invalid_symbol
    type attr_err  = Invalid_attribute
    (* Injections *)

    val mk_int    : lexeme -> Region.t -> (token, int_err) result
    val mk_nat    : lexeme -> Region.t -> (token, nat_err) result
    val mk_mutez  : lexeme -> Region.t -> (token, int_err) result
    val mk_ident  : lexeme -> Region.t -> (token, ident_err) result
    val mk_sym    : lexeme -> Region.t -> (token, sym_err) result
    val mk_string : lexeme -> Region.t -> token
    val mk_bytes  : lexeme -> Region.t -> token
    val mk_constr : lexeme -> Region.t -> token
    val mk_attr   : string -> lexeme -> Region.t -> (token, attr_err) result
    val eof       : Region.t -> token

    (* Predicates *)

    val is_eof : token -> bool

    (* Projections *)

    val to_lexeme : token -> lexeme
    val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string
    val to_region : token -> Region.t
    (* Style *)

    type error

    val error_to_string : error -> string

    exception Error of error Region.reg

    val format_error :
      ?offsets:bool ->
      [`Byte | `Point] ->
      error Region.reg ->
      file:bool ->
      string Region.reg

    val check_right_context :
      token ->
      (Lexing.lexbuf -> (Markup.t list * token) option) ->
      Lexing.lexbuf ->
      unit
  end
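To make the role of [TOKEN] concrete, here is a minimal, self-contained sketch of a toy module in the same style, with [Region.t] stubbed as [unit]. Everything here ([ToyToken], the stub [Region], the reserved-word list) is illustrative only; the real implementation lives in the version-dependent LexToken.mll and uses [Simple_utils.Region].

```ocaml
(* Stub for Simple_utils.Region, for illustration only. *)
module Region = struct
  type t = unit
  type 'a reg = {region : t; value : 'a}
end

type lexeme = string

module ToyToken = struct
  type token =
    | Int of int
    | Ident of string
    | EOF

  type int_err   = Non_canonical_zero
  type ident_err = Reserved_name

  (* Reject integer literals with a redundant leading zero,
     e.g. "007", in the spirit of design principle (1). *)
  let mk_int lexeme (_ : Region.t) =
    if String.length lexeme > 1 && lexeme.[0] = '0'
    then Error Non_canonical_zero
    else Ok (Int (int_of_string lexeme))

  let reserved = ["let"; "in"; "fun"]

  let mk_ident lexeme (_ : Region.t) =
    if List.mem lexeme reserved
    then Error Reserved_name
    else Ok (Ident lexeme)

  let eof (_ : Region.t) = EOF

  let is_eof = function EOF -> true | _ -> false
end

let () =
  assert (ToyToken.mk_int "007" () = Error ToyToken.Non_canonical_zero);
  assert (ToyToken.mk_int "42"  () = Ok (ToyToken.Int 42));
  assert (ToyToken.mk_ident "let" () = Error ToyToken.Reserved_name);
  assert (ToyToken.is_eof (ToyToken.eof ()))
```

Note how the smart constructors return [result] values rather than raising: the lexer can then attach a source region to the error before reporting it.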
(* The signature of the lexer *)

module type S =
  sig
    module Token : TOKEN
    type token = Token.token

    (* The scanner [init] is meant to be called first, to read the
       BOM. Then [scan] is called. *)

    val init : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
    val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state

    (* Errors (specific to the generic lexer, not to the tokens) *)

    type error

    val error_to_string : error -> string

    exception Error of error Region.reg

    val format_error :
      ?offsets:bool -> [`Byte | `Point] ->
      error Region.reg -> file:bool -> string Region.reg
  end
(* The functorised interface
Note that the module parameter [Token] is re-exported as a
submodule in [S].
*)
module Make (Token : TOKEN) : S with module Token = Token
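The functor pattern above, including the re-export of the [Token] parameter as a submodule, can be sketched in miniature as follows. The names here ([TOY_TOKEN], [ToyLexer], [count_until_eof]) are hypothetical and stand in for [TOKEN], [Make] and the real scanning entry points, which depend on [LexerLib] and [Lexing].

```ocaml
(* A reduced token signature: only what this toy functor needs. *)
module type TOY_TOKEN =
  sig
    type token
    val is_eof : token -> bool
  end

module ToyLexer (Token : TOY_TOKEN) =
  struct
    module Token = Token  (* re-exported, as [Make] re-exports [Token] in [S] *)

    (* Drain a token source until [is_eof] holds, counting the
       tokens seen before end-of-file. *)
    let count_until_eof (next : unit -> Token.token) =
      let rec go n =
        if Token.is_eof (next ()) then n else go (n + 1)
      in go 0
  end

(* A concrete token module, then the functor application. *)
module T = struct
  type token = Tok of string | EOF
  let is_eof = function EOF -> true | _ -> false
end

module L = ToyLexer (T)

let () =
  let stream = ref [T.Tok "a"; T.Tok "b"; T.EOF] in
  let next () =
    match !stream with
    | t :: rest -> stream := rest; t
    | []        -> T.EOF in
  assert (L.count_until_eof next = 2)
```

Because [ToyLexer] only depends on the abstract [TOY_TOKEN] interface, swapping in a different token module (say, one for another concrete syntax) requires no change to the lexer body, which is exactly the version-independence argued for in design principle (3).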