ligo/src/passes/1-parser/pascaligo/LexToken.mli

(** This signature defines the lexical tokens for LIGO.

   _Tokens_ are the abstract units which are used by the parser to
   build the abstract syntax tree (AST). In other words, the stream of
   tokens is the minimal model of the input program, implicitly
   carrying all its structure in a linear encoding, and nothing
   else: in particular, comments and whitespace are absent.

     A _lexeme_ is a specific character string (concrete
   representation) denoting a token (abstract representation). Tokens
   can be thought of as sets, and lexemes as elements of those sets --
   there is often an infinite number of lexemes, but a small number of
   tokens. (Think of identifiers as lexemes and one token.)

     The tokens are qualified here as being "lexical" because the
   parser generator Menhir expects to define them, in which context
   they are called "parsing tokens", and they are made to match each
   other. (This is an idiosyncratic terminology.)

     The type of the lexical tokens is the variant [t], also
   aliased to [token].
*)

module Region = Simple_utils.Region
module Pos    = Simple_utils.Pos

(** Concrete character string denoting a token, represented as a
    plain [string]. *)
type lexeme = string

(* TOKENS *)

(** Record pairing a [header] string with a lexeme wrapped in
    [Region.reg] (presumably the lexeme together with its region in
    the source).

    NOTE(review): no constructor of [t] visible in this interface
    carries an [attribute]. The two fields presumably mirror the first
    two arguments of [mk_attr] -- confirm against the implementation. *)
type attribute = {
  header : string;
  string : lexeme Region.reg
}

(** Lexical tokens, grouped below into literals, symbols, keywords,
    data constructors and virtual tokens.

    The constructors listed under the literals heading carry their
    lexeme wrapped in [Region.reg]; [Bytes], [Int], [Nat] and [Mutez]
    pair that lexeme with a [Hex.t] or [Z.t] value (presumably its
    decoded form -- confirm against the implementation). Symbols,
    keywords and data constructors carry only a [Region.t]; the
    comment next to each one gives its lexeme. [EOF] also carries only
    a [Region.t], and no lexeme is listed for it. *)
type t =
  (* Literals *)

  String of lexeme Region.reg
| Bytes  of (lexeme * Hex.t) Region.reg
| Int    of (lexeme * Z.t) Region.reg
| Nat    of (lexeme * Z.t) Region.reg
| Mutez  of (lexeme * Z.t) Region.reg
| Ident  of lexeme Region.reg
| Constr of lexeme Region.reg

  (* Symbols *)

| SEMI     of Region.t  (* ";"   *)
| COMMA    of Region.t  (* ","   *)
| LPAR     of Region.t  (* "("   *)
| RPAR     of Region.t  (* ")"   *)
| LBRACE   of Region.t  (* "{"   *)
| RBRACE   of Region.t  (* "}"   *)
| LBRACKET of Region.t  (* "["   *)
| RBRACKET of Region.t  (* "]"   *)
| CONS     of Region.t  (* "#"   *)
| VBAR     of Region.t  (* "|"   *)
| ARROW    of Region.t  (* "->"  *)
| ASS      of Region.t  (* ":="  *)
| EQ       of Region.t  (* "="   *)
| COLON    of Region.t  (* ":"   *)
| LT       of Region.t  (* "<"   *)
| LE       of Region.t  (* "<="  *)
| GT       of Region.t  (* ">"   *)
| GE       of Region.t  (* ">="  *)
| NE       of Region.t  (* "=/=" *)
| PLUS     of Region.t  (* "+"   *)
| MINUS    of Region.t  (* "-"   *)
| SLASH    of Region.t  (* "/"   *)
| TIMES    of Region.t  (* "*"   *)
| DOT      of Region.t  (* "."   *)
| WILD     of Region.t  (* "_"   *)
| CAT      of Region.t  (* "^"   *)

  (* Keywords *)

| And        of Region.t  (* "and"        *)
| Attributes of Region.t  (* "attributes" *)
| Begin      of Region.t  (* "begin"      *)
| BigMap     of Region.t  (* "big_map"    *)
| Block      of Region.t  (* "block"      *)
| Case       of Region.t  (* "case"       *)
| Const      of Region.t  (* "const"      *)
| Contains   of Region.t  (* "contains"   *)
| Else       of Region.t  (* "else"       *)
| End        of Region.t  (* "end"        *)
| False      of Region.t  (* "False"      *)
| For        of Region.t  (* "for"        *)
| From       of Region.t  (* "from"       *)
| Function   of Region.t  (* "function"   *)
| If         of Region.t  (* "if"         *)
| In         of Region.t  (* "in"         *)
| Is         of Region.t  (* "is"         *)
| List       of Region.t  (* "list"       *)
| Map        of Region.t  (* "map"        *)
| Mod        of Region.t  (* "mod"        *)
| Nil        of Region.t  (* "nil"        *)
| Not        of Region.t  (* "not"        *)
| Of         of Region.t  (* "of"         *)
| Or         of Region.t  (* "or"         *)
| Patch      of Region.t  (* "patch"      *)
| Record     of Region.t  (* "record"     *)
| Remove     of Region.t  (* "remove"     *)
| Set        of Region.t  (* "set"        *)
| Skip       of Region.t  (* "skip"       *)
| Then       of Region.t  (* "then"       *)
| To         of Region.t  (* "to"         *)
| True       of Region.t  (* "True"       *)
| Type       of Region.t  (* "type"       *)
| Unit       of Region.t  (* "Unit"       *)
| Var        of Region.t  (* "var"        *)
| While      of Region.t  (* "while"      *)
| With       of Region.t  (* "with"       *)

  (* Data constructors *)

| C_None  of Region.t  (* "None"  *)
| C_Some  of Region.t  (* "Some"  *)

  (* Virtual tokens *)

| EOF of Region.t


(** Alias of [t], the type of lexical tokens. *)
type token = t

(* Projections

   The difference between extracting the lexeme and a string from a
   token is that the latter is the textual representation of the OCaml
   value denoting the token (its abstract syntax), rather than its
   lexeme (concrete syntax).
*)

(** [to_lexeme tok] returns the lexeme (concrete syntax) of [tok]. *)
val to_lexeme : token -> lexeme

(** [to_string tok ?offsets mode] returns the textual representation
    of the OCaml value denoting [tok] (its abstract syntax).

    NOTE(review): [offsets] and the [`Byte] or [`Point] mode
    presumably control how the region of the token is rendered --
    confirm against the implementation. *)
val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string

(** [to_region tok] returns the [Region.t] associated with [tok]. *)
val to_region : token -> Region.t

(* Injections *)

(* Each type below is the error component of the [result] returned by
   the injection(s) named in its documentation. The meanings given for
   individual constructors are inferred from their names only -- TODO
   confirm against the implementation. *)

(** Error of [mk_int] and [mk_mutez]. [Non_canonical_zero]: presumably
    the lexeme denotes zero in a non-canonical form. *)
type   int_err = Non_canonical_zero

(** Error of [mk_ident]. [Reserved_name]: presumably the lexeme is a
    name reserved by the language. *)
type ident_err = Reserved_name

(** Error of [mk_nat]. [Invalid_natural]: presumably the lexeme is not
    a valid natural number; [Non_canonical_zero_nat]: presumably the
    counterpart of [Non_canonical_zero] for naturals. *)
type   nat_err = Invalid_natural
               | Non_canonical_zero_nat

(** Error of [mk_sym]. [Invalid_symbol]: presumably the lexeme is not
    a symbol of this syntax (for example, one that belongs to another
    LIGO syntax). *)
type   sym_err = Invalid_symbol

(** Error of [mk_attr]. *)
type attr_err  = Invalid_attribute

(** Error of [mk_kwd]. [Invalid_keyword]: presumably the lexeme is not
    a keyword of this syntax. *)
type   kwd_err = Invalid_keyword

(* Every injection takes the lexeme and its region. Those returning a
   [result] report failure with the dedicated error type declared for
   them; which constructor of [t] is built on success is inferred from
   the function name -- TODO confirm against the implementation. *)

(** [mk_int lexeme region] builds an integer token (presumably [Int])
    or fails with an [int_err]. *)
val mk_int    : lexeme -> Region.t -> (token,   int_err) result

(** [mk_nat lexeme region] builds a natural-number token (presumably
    [Nat]) or fails with a [nat_err]. *)
val mk_nat    : lexeme -> Region.t -> (token,   nat_err) result

(** [mk_mutez lexeme region] builds a mutez token (presumably [Mutez])
    or fails with an [int_err], the same error type as [mk_int]. *)
val mk_mutez  : lexeme -> Region.t -> (token,   int_err) result

(** [mk_ident lexeme region] builds an identifier token (presumably
    [Ident]) or fails with an [ident_err]. *)
val mk_ident  : lexeme -> Region.t -> (token, ident_err) result

(** [mk_sym lexeme region] builds a symbol token or fails with a
    [sym_err]. *)
val mk_sym    : lexeme -> Region.t -> (token,   sym_err) result

(** [mk_kwd lexeme region] builds a keyword token or fails with a
    [kwd_err]. *)
val mk_kwd    : lexeme -> Region.t -> (token,   kwd_err) result

(** [mk_string lexeme region] builds a string token (presumably
    [String]); its type has no error case. *)
val mk_string : lexeme -> Region.t -> token

(** [mk_bytes lexeme region] builds a bytes token (presumably
    [Bytes]); its type has no error case. *)
val mk_bytes  : lexeme -> Region.t -> token

(** [mk_constr lexeme region] builds a token for a data-constructor
    name (presumably [Constr]); its type has no error case. *)
val mk_constr : lexeme -> Region.t -> token

(** [mk_attr header lexeme region] builds an attribute token or fails
    with an [attr_err].

    NOTE(review): the first argument is only typed as [string]; naming
    it [header] is an assumption based on the [attribute] record. No
    constructor of [t] visible in this interface carries an attribute,
    so check what the success case can actually be. *)
val mk_attr   : string -> lexeme -> Region.t -> (token, attr_err) result

(** [eof region] builds a token from a region alone (presumably
    [EOF]). *)
val eof       : Region.t -> token

(* Predicates *)

(* The classification performed by each predicate is inferred from its
   name -- TODO confirm against the implementation. Note that this
   interface declares no [is_nat] or [is_mutez] predicate. *)

(** Tests whether the token is a string literal (presumably
    [String]). *)
val is_string : token -> bool

(** Tests whether the token is a bytes literal (presumably [Bytes]). *)
val is_bytes  : token -> bool

(** Tests whether the token is an integer literal (presumably [Int]). *)
val is_int    : token -> bool

(** Tests whether the token is an identifier (presumably [Ident]). *)
val is_ident  : token -> bool

(** Tests whether the token is a keyword. *)
val is_kwd    : token -> bool

(** Tests whether the token is a data-constructor name (presumably
    [Constr]; whether [C_None] and [C_Some] also qualify cannot be
    told from this interface). *)
val is_constr : token -> bool

(** Tests whether the token is a symbol. *)
val is_sym    : token -> bool

(** Tests whether the token is the end-of-file token (presumably
    [EOF]). *)
val is_eof    : token -> bool
Convert top comment in LexToken.mli to ocamldoc comment 2019-10-04 00:59:53 +04:00			`(** This signature defines the lexical tokens for LIGO`
initial commit 2019-05-13 00:56:22 +04:00
			`_Tokens_ are the abstract units which are used by the parser to`
			`build the abstract syntax tree (AST), in other words, the stream of`
			`tokens is the minimal model of the input program, carrying`
			`implicitly all its structure in a linear encoding, and nothing`
			`else, in particular, comments and whitespace are absent.`

			`A _lexeme_ is a specific character string (concrete`
			`representation) denoting a token (abstract representation). Tokens`
			`can be thought of as sets, and lexemes as elements of those sets --`
			`there is often an infinite number of lexemes, but a small number of`
			`tokens. (Think of identifiers as lexemes and one token.)`

			`The tokens are qualified here as being "lexical" because the`
			`parser generator Menhir expects to define them, in which context`
			`they are called "parsing tokens", and they are made to match each`
			`other. (This is an idiosyncratic terminology.)`

			`The type of the lexical tokens is the variant [t], also`
			`aliased to [token].`
			`*)`

			`module Region = Simple_utils.Region`
			`module Pos = Simple_utils.Pos`

			`type lexeme = string`

			`(* TOKENS *)`

[WIP] Refactoring of front-end. 2020-01-20 13:57:07 +04:00			`type attribute = {`
			`header : string;`
			`string : lexeme Region.reg`
			`}`

initial commit 2019-05-13 00:56:22 +04:00			`type t =`
			`(* Literals *)`

			`String of lexeme Region.reg`
			`\| Bytes of (lexeme * Hex.t) Region.reg`
			`\| Int of (lexeme * Z.t) Region.reg`
			`\| Nat of (lexeme * Z.t) Region.reg`
Refactoring of the parsers * [CameLIGO/ReasonLIGO] The AST node [EAnnot] (expressions annotated by a type) now records the region in the source code for the colon. * [CameLIGO/ReasonLIGO/PascaLIGO] I added the syntax %token <...> TOKEN "lexeme" * [ReasonLIGO] I changed the AST nodes [Mtz] and [Str] to [Mutez] and [String], respectively (in accordance with the PascaLIGO front-end). I changed token [DOTDOTDOT] to [ELLIPSIS]. * [ReasonLIGO] I added what was missing to make a loca build with my Makefile. 2019-12-15 20:46:08 +04:00			`\| Mutez of (lexeme * Z.t) Region.reg`
initial commit 2019-05-13 00:56:22 +04:00			`\| Ident of lexeme Region.reg`
			`\| Constr of lexeme Region.reg`

			`(* Symbols *)`

			`\| SEMI of Region.t (* ";" *)`
			`\| COMMA of Region.t (* "," *)`
			`\| LPAR of Region.t (* "(" *)`
			`\| RPAR of Region.t (* ")" *)`
			`\| LBRACE of Region.t (* "{" *)`
			`\| RBRACE of Region.t (* "}" *)`
			`\| LBRACKET of Region.t (* "[" *)`
			`\| RBRACKET of Region.t (* "]" *)`
			`\| CONS of Region.t (* "#" *)`
			`\| VBAR of Region.t (* "\|" *)`
			`\| ARROW of Region.t (* "->" *)`
			`\| ASS of Region.t (* ":=" *)`
Refactorings for PascaLIGO. - I aligned the names of the tokens in common with Ligodity. - I removed the "down" and "step" clauses in loops. - Note: the stratification of the rule "pattern" in the previous commit has the pleasant effect to remove a call to "corner_case" in function "simpl_case" of the file "2-simplify/pascaligo.ml". - Added more cases to the pretty-printer of the AST. 2019-10-13 21:51:01 +04:00			`\| EQ of Region.t (* "=" *)`
initial commit 2019-05-13 00:56:22 +04:00			`\| COLON of Region.t (* ":" *)`
			`\| LT of Region.t (* "<" *)`
Refactorings for PascaLIGO. - I aligned the names of the tokens in common with Ligodity. - I removed the "down" and "step" clauses in loops. - Note: the stratification of the rule "pattern" in the previous commit has the pleasant effect to remove a call to "corner_case" in function "simpl_case" of the file "2-simplify/pascaligo.ml". - Added more cases to the pretty-printer of the AST. 2019-10-13 21:51:01 +04:00			`\| LE of Region.t (* "<=" *)`
initial commit 2019-05-13 00:56:22 +04:00			`\| GT of Region.t (* ">" *)`
Refactorings for PascaLIGO. - I aligned the names of the tokens in common with Ligodity. - I removed the "down" and "step" clauses in loops. - Note: the stratification of the rule "pattern" in the previous commit has the pleasant effect to remove a call to "corner_case" in function "simpl_case" of the file "2-simplify/pascaligo.ml". - Added more cases to the pretty-printer of the AST. 2019-10-13 21:51:01 +04:00			`\| GE of Region.t (* ">=" *)`
			`\| NE of Region.t (* "=/=" *)`
initial commit 2019-05-13 00:56:22 +04:00			`\| PLUS of Region.t (* "+" *)`
			`\| MINUS of Region.t (* "-" *)`
			`\| SLASH of Region.t (* "/" *)`
			`\| TIMES of Region.t (* "*" *)`
			`\| DOT of Region.t (* "." *)`
			`\| WILD of Region.t (* "_" *)`
			`\| CAT of Region.t (* "^" *)`

			`(* Keywords *)`

Add inline attribute 2020-01-16 23:36:04 +04:00			`\| And of Region.t (* "and" *)`
			`\| Attributes of Region.t (* "attributes" *)`
			`\| Begin of Region.t (* "begin" *)`
			`\| BigMap of Region.t (* "big_map" *)`
			`\| Block of Region.t (* "block" *)`
			`\| Case of Region.t (* "case" *)`
			`\| Const of Region.t (* "const" *)`
			`\| Contains of Region.t (* "contains" *)`
			`\| Else of Region.t (* "else" *)`
			`\| End of Region.t (* "end" *)`
			`\| False of Region.t (* "False" *)`
			`\| For of Region.t (* "for" *)`
			`\| From of Region.t (* "from" *)`
			`\| Function of Region.t (* "function" *)`
			`\| If of Region.t (* "if" *)`
			`\| In of Region.t (* "in" *)`
			`\| Is of Region.t (* "is" *)`
			`\| List of Region.t (* "list" *)`
			`\| Map of Region.t (* "map" *)`
			`\| Mod of Region.t (* "mod" *)`
			`\| Nil of Region.t (* "nil" *)`
			`\| Not of Region.t (* "not" *)`
			`\| Of of Region.t (* "of" *)`
			`\| Or of Region.t (* "or" *)`
			`\| Patch of Region.t (* "patch" *)`
			`\| Record of Region.t (* "record" *)`
			`\| Remove of Region.t (* "remove" *)`
			`\| Set of Region.t (* "set" *)`
			`\| Skip of Region.t (* "skip" *)`
			`\| Then of Region.t (* "then" *)`
			`\| To of Region.t (* "to" *)`
			`\| True of Region.t (* "True" *)`
			`\| Type of Region.t (* "type" *)`
			`\| Unit of Region.t (* "Unit" *)`
			`\| Var of Region.t (* "var" *)`
			`\| While of Region.t (* "while" *)`
			`\| With of Region.t (* "with" *)`
initial commit 2019-05-13 00:56:22 +04:00
			`(* Data constructors *)`

			`\| C_None of Region.t (* "None" *)`
			`\| C_Some of Region.t (* "Some" *)`

			`(* Virtual tokens *)`

			`\| EOF of Region.t`


			`type token = t`

			`(* Projections`

			`The difference between extracting the lexeme and a string from a`
			`token is that the latter is the textual representation of the OCaml`
			`value denoting the token (its abstract syntax), rather than its`
			`lexeme (concrete syntax).`
			`*)`

			`val to_lexeme : token -> lexeme`
			``val to_string : token -> ?offsets:bool -> [`Byte \| `Point] -> string``
			`val to_region : token -> Region.t`

			`(* Injections *)`

Bug fixing in the lexers and the parser. Started AST pretty-printer. LexToken, AST: Tiny refactoring. Bug: Added the making of the AST node PBytes. Parser: The rule "pattern" was not properly stratified (the constructor "PCons" was always produced, even when no consing was done (now a fall-through to "core_pattern"). Bug: When sharing the lexers between Ligodity and Pascaligo, a regression was introduced with the lexing of symbols. Indeed, symbols specific to Ligodity (like "<>") and Pascaligo (like "=/=") were scanned, but the function "LexToken.mk_sym" for each only accepted their own, yielding to an assertion to be invalidated. Fix: I created an error "sym_err" now to gracefully handle that situation and provide a hint to the programmer (to wit, to check the LIGO syntax in use). WIP: Started to write pretty-printing functions for the nodes of the AST. CLI: The option "--verbose=ast" now calls that function instead of printing the tokens from the AST. When the pretty-printer is finished, the option for printing the tokens will likely be "--verbose=ast-tokens". 2019-10-13 01:42:26 +04:00			`type int_err = Non_canonical_zero`
initial commit 2019-05-13 00:56:22 +04:00			`type ident_err = Reserved_name`
Bug fixing in the lexers and the parser. Started AST pretty-printer. LexToken, AST: Tiny refactoring. Bug: Added the making of the AST node PBytes. Parser: The rule "pattern" was not properly stratified (the constructor "PCons" was always produced, even when no consing was done (now a fall-through to "core_pattern"). Bug: When sharing the lexers between Ligodity and Pascaligo, a regression was introduced with the lexing of symbols. Indeed, symbols specific to Ligodity (like "<>") and Pascaligo (like "=/=") were scanned, but the function "LexToken.mk_sym" for each only accepted their own, yielding to an assertion to be invalidated. Fix: I created an error "sym_err" now to gracefully handle that situation and provide a hint to the programmer (to wit, to check the LIGO syntax in use). WIP: Started to write pretty-printing functions for the nodes of the AST. CLI: The option "--verbose=ast" now calls that function instead of printing the tokens from the AST. When the pretty-printer is finished, the option for printing the tokens will likely be "--verbose=ast-tokens". 2019-10-13 01:42:26 +04:00			`type nat_err = Invalid_natural`
			`\| Non_canonical_zero_nat`
			`type sym_err = Invalid_symbol`
Add inline attribute 2020-01-16 23:36:04 +04:00			`type attr_err = Invalid_attribute`
Added support for language-specific parse errors for PascaLIGO: * Duplicate variants in the same type declaration * Duplicate parameter in the same function declaration * Shadowing of predefined value in a declaration I fixed the architecture for that support: ParserMain.ml is now where those specific errors are handled, and they are produced by the semantic actions of the parsers. 2020-01-08 19:39:52 +04:00			`type kwd_err = Invalid_keyword`
initial commit 2019-05-13 00:56:22 +04:00
			`val mk_int : lexeme -> Region.t -> (token, int_err) result`
Bug fixing in the lexers and the parser. Started AST pretty-printer. LexToken, AST: Tiny refactoring. Bug: Added the making of the AST node PBytes. Parser: The rule "pattern" was not properly stratified (the constructor "PCons" was always produced, even when no consing was done (now a fall-through to "core_pattern"). Bug: When sharing the lexers between Ligodity and Pascaligo, a regression was introduced with the lexing of symbols. Indeed, symbols specific to Ligodity (like "<>") and Pascaligo (like "=/=") were scanned, but the function "LexToken.mk_sym" for each only accepted their own, yielding to an assertion to be invalidated. Fix: I created an error "sym_err" now to gracefully handle that situation and provide a hint to the programmer (to wit, to check the LIGO syntax in use). WIP: Started to write pretty-printing functions for the nodes of the AST. CLI: The option "--verbose=ast" now calls that function instead of printing the tokens from the AST. When the pretty-printer is finished, the option for printing the tokens will likely be "--verbose=ast-tokens". 2019-10-13 01:42:26 +04:00			`val mk_nat : lexeme -> Region.t -> (token, nat_err) result`
Replace "mtz" with "mutez" 2019-10-27 20:50:24 +04:00			`val mk_mutez : lexeme -> Region.t -> (token, int_err) result`
initial commit 2019-05-13 00:56:22 +04:00			`val mk_ident : lexeme -> Region.t -> (token, ident_err) result`
Bug fixing in the lexers and the parser. Started AST pretty-printer. LexToken, AST: Tiny refactoring. Bug: Added the making of the AST node PBytes. Parser: The rule "pattern" was not properly stratified (the constructor "PCons" was always produced, even when no consing was done (now a fall-through to "core_pattern"). Bug: When sharing the lexers between Ligodity and Pascaligo, a regression was introduced with the lexing of symbols. Indeed, symbols specific to Ligodity (like "<>") and Pascaligo (like "=/=") were scanned, but the function "LexToken.mk_sym" for each only accepted their own, yielding to an assertion to be invalidated. Fix: I created an error "sym_err" now to gracefully handle that situation and provide a hint to the programmer (to wit, to check the LIGO syntax in use). WIP: Started to write pretty-printing functions for the nodes of the AST. CLI: The option "--verbose=ast" now calls that function instead of printing the tokens from the AST. When the pretty-printer is finished, the option for printing the tokens will likely be "--verbose=ast-tokens". 2019-10-13 01:42:26 +04:00			`val mk_sym : lexeme -> Region.t -> (token, sym_err) result`
Added support for language-specific parse errors for PascaLIGO: * Duplicate variants in the same type declaration * Duplicate parameter in the same function declaration * Shadowing of predefined value in a declaration I fixed the architecture for that support: ParserMain.ml is now where those specific errors are handled, and they are produced by the semantic actions of the parsers. 2020-01-08 19:39:52 +04:00			`val mk_kwd : lexeme -> Region.t -> (token, kwd_err) result`
Bug fixing in the lexers and the parser. Started AST pretty-printer. LexToken, AST: Tiny refactoring. Bug: Added the making of the AST node PBytes. Parser: The rule "pattern" was not properly stratified (the constructor "PCons" was always produced, even when no consing was done (now a fall-through to "core_pattern"). Bug: When sharing the lexers between Ligodity and Pascaligo, a regression was introduced with the lexing of symbols. Indeed, symbols specific to Ligodity (like "<>") and Pascaligo (like "=/=") were scanned, but the function "LexToken.mk_sym" for each only accepted their own, yielding to an assertion to be invalidated. Fix: I created an error "sym_err" now to gracefully handle that situation and provide a hint to the programmer (to wit, to check the LIGO syntax in use). WIP: Started to write pretty-printing functions for the nodes of the AST. CLI: The option "--verbose=ast" now calls that function instead of printing the tokens from the AST. When the pretty-printer is finished, the option for printing the tokens will likely be "--verbose=ast-tokens". 2019-10-13 01:42:26 +04:00			`val mk_string : lexeme -> Region.t -> token`
			`val mk_bytes : lexeme -> Region.t -> token`
initial commit 2019-05-13 00:56:22 +04:00			`val mk_constr : lexeme -> Region.t -> token`
[WIP] Refactoring of front-end. 2020-01-20 13:57:07 +04:00			`val mk_attr : string -> lexeme -> Region.t -> (token, attr_err) result`
initial commit 2019-05-13 00:56:22 +04:00			`val eof : Region.t -> token`

			`(* Predicates *)`

			`val is_string : token -> bool`
			`val is_bytes : token -> bool`
			`val is_int : token -> bool`
			`val is_ident : token -> bool`
			`val is_kwd : token -> bool`
			`val is_constr : token -> bool`
			`val is_sym : token -> bool`
			`val is_eof : token -> bool`