ligo/LexToken.mli

(* This signature defines the lexical tokens for LIGO

   _Tokens_ are the abstract units which are used by the parser to
   build the abstract syntax tree (AST), in other words, the stream of
   tokens is the minimal model of the input program, carrying
   implicitly all its structure in a linear encoding, and nothing
   else, in particular, comments and whitespace are absent.

     A _lexeme_ is a specific character string (concrete
   representation) denoting a token (abstract representation). Tokens
   can be thought of as sets, and lexemes as elements of those sets --
   there is often an infinite number of lexemes, but a small number of
   tokens. (Think of identifiers as lexemes and one token.)

     The tokens are qualified here as being "lexical" because the
   parser generator Menhir expects to define its own tokens, which
   are called "parsing tokens" in that context, and the two sets are
   made to match each other. (This is an idiosyncratic terminology.)

     The type of the lexical tokens is the variant [t], also
   aliased to [token].
*)

(* A lexeme is the concrete character string that denotes a token; it
   is represented here as a plain OCaml string. *)
type lexeme = string

(* TOKENS *)

(* The lexical tokens. The constructors are grouped as marked below:

     - literals carry their lexeme wrapped in [Region.reg]; for
       [Bytes] and [Int] the lexeme is paired with a [Hex.t],
       respectively a [Z.t], value;
     - symbols, keywords and predefined data constructors carry only
       a [Region.t]; their fixed lexeme is recalled in the comment
       next to each constructor;
     - the virtual token [EOF] carries only a [Region.t].

   NOTE(review): [Region.reg] presumably attaches a source region to
   its payload, and the [Hex.t] and [Z.t] components are presumably
   the decoded values of the lexemes -- confirm against the Region
   module and the implementation. *)
type t =
  (* Literals *)

  String of lexeme Region.reg
| Bytes  of (lexeme * Hex.t) Region.reg
| Int    of (lexeme * Z.t) Region.reg
| Ident  of lexeme Region.reg
| Constr of lexeme Region.reg

  (* Symbols *)

| SEMI     of Region.t  (* ";"   *)
| COMMA    of Region.t  (* ","   *)
| LPAR     of Region.t  (* "("   *)
| RPAR     of Region.t  (* ")"   *)
| LBRACE   of Region.t  (* "{"   *)
| RBRACE   of Region.t  (* "}"   *)
| LBRACKET of Region.t  (* "["   *)
| RBRACKET of Region.t  (* "]"   *)
| CONS     of Region.t  (* "#"   *)
| VBAR     of Region.t  (* "|"   *)
| ARROW    of Region.t  (* "->"  *)
| ASS      of Region.t  (* ":="  *)
| EQUAL    of Region.t  (* "="   *)
| COLON    of Region.t  (* ":"   *)
| LT       of Region.t  (* "<"   *)
| LEQ      of Region.t  (* "<="  *)
| GT       of Region.t  (* ">"   *)
| GEQ      of Region.t  (* ">="  *)
| NEQ      of Region.t  (* "=/=" *)
| PLUS     of Region.t  (* "+"   *)
| MINUS    of Region.t  (* "-"   *)
| SLASH    of Region.t  (* "/"   *)
| TIMES    of Region.t  (* "*"   *)
| DOT      of Region.t  (* "."   *)
| WILD     of Region.t  (* "_"   *)
| CAT      of Region.t  (* "^"   *)

  (* Keywords *)

| And        of Region.t  (* "and"        *)
| Begin      of Region.t  (* "begin"      *)
| Case       of Region.t  (* "case"       *)
| Const      of Region.t  (* "const"      *)
| Down       of Region.t  (* "down"       *)
| Else       of Region.t  (* "else"       *)
| End        of Region.t  (* "end"        *)
| Entrypoint of Region.t  (* "entrypoint" *)
| Fail       of Region.t  (* "fail"       *)
| For        of Region.t  (* "for"        *)
| From       of Region.t  (* "from"       *)
| Function   of Region.t  (* "function"   *)
| If         of Region.t  (* "if"         *)
| In         of Region.t  (* "in"         *)
| Is         of Region.t  (* "is"         *)
| Map        of Region.t  (* "map"        *)
| Mod        of Region.t  (* "mod"        *)
| Not        of Region.t  (* "not"        *)
| Of         of Region.t  (* "of"         *)
| Or         of Region.t  (* "or"         *)
| Patch      of Region.t  (* "patch"      *)
| Procedure  of Region.t  (* "procedure"  *)
| Record     of Region.t  (* "record"     *)
| Remove     of Region.t  (* "remove"     *)
| Skip       of Region.t  (* "skip"       *)
| Step       of Region.t  (* "step"       *)
| Storage    of Region.t  (* "storage"    *)
| Then       of Region.t  (* "then"       *)
| To         of Region.t  (* "to"         *)
| Type       of Region.t  (* "type"       *)
| Var        of Region.t  (* "var"        *)
| While      of Region.t  (* "while"      *)
| With       of Region.t  (* "with"       *)

  (* Data constructors *)

| C_False of Region.t  (* "False" *)
| C_None  of Region.t  (* "None"  *)
| C_Some  of Region.t  (* "Some"  *)
| C_True  of Region.t  (* "True"  *)
| C_Unit  of Region.t  (* "Unit"  *)

  (* Virtual tokens *)

| EOF of Region.t


(* [token] is an alias of [t]; the signatures of the values in this
   interface are stated in terms of [token]. *)
type token = t

(* Projections

   The difference between extracting the lexeme and a string from a
   token is that the latter is the textual representation of the OCaml
   value denoting the token (its abstract syntax), rather than its
   lexeme (concrete syntax).
*)

(* [to_lexeme token] is the lexeme (concrete syntax) of [token]. *)
val to_lexeme : token -> lexeme

(* [to_string token ?offsets mode] is the textual representation of
   the OCaml value denoting [token] (abstract syntax).
   NOTE(review): [?offsets] and [mode] (either [`Byte] or [`Point])
   presumably control how the region of the token is rendered --
   confirm against the Region module. *)
val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string

(* [to_region token] is a [Region.t] extracted from [token].
   NOTE(review): presumably the region carried by the constructor of
   [token] -- confirm in the implementation. *)
val to_region : token -> Region.t

(* Injections *)

(* The errors that [mk_int] may return.
   NOTE(review): [Non_canonical_zero] presumably flags a zero literal
   that is not written in its canonical form -- confirm in the
   implementation. *)
type int_err =
  Non_canonical_zero

(* The errors that [mk_ident] may return.
   NOTE(review): [Reserved_name] presumably flags a lexeme that
   clashes with a reserved word -- confirm in the implementation. *)
type ident_err = Reserved_name

(* Each [mk_*] function takes a lexeme and a region and returns a
   token. [mk_int] and [mk_ident] return a [result], whose error
   component is of type [int_err], respectively [ident_err]; the
   other functions return the token directly. [eof] takes only a
   region.

   NOTE(review): presumably each function builds the constructor of
   [t] matching its name ([eof] building [EOF]). How [mk_bytes],
   [mk_constr] and [mk_sym] treat a lexeme that does not belong to
   their class is not visible in this interface -- confirm in the
   implementation. *)
val mk_string : lexeme -> Region.t -> token
val mk_bytes  : lexeme -> Region.t -> token
val mk_int    : lexeme -> Region.t -> (token,   int_err) result
val mk_ident  : lexeme -> Region.t -> (token, ident_err) result
val mk_constr : lexeme -> Region.t -> token
val mk_sym    : lexeme -> Region.t -> token
val eof       : Region.t -> token

(* Predicates *)

(* Each predicate takes a token and returns a boolean.
   NOTE(review): presumably [is_xxx token] holds exactly when [token]
   belongs to the class named [xxx], following the groups of
   constructors of [t] ([is_kwd] for keywords, [is_sym] for symbols,
   [is_eof] for [EOF]); which class the predefined data constructors
   ([C_False], [C_None], etc.) fall into is not visible in this
   interface -- confirm in the implementation. *)
val is_string : token -> bool
val is_bytes  : token -> bool
val is_int    : token -> bool
val is_ident  : token -> bool
val is_kwd    : token -> bool
val is_constr : token -> bool
val is_sym    : token -> bool
val is_eof    : token -> bool
Removed keyword "null", replaced by two keywords "do" and "nothing". Until now only products of type names were allowed: I extended them to allow type expressions. Removed the destructive update of a map binding "a[b] := c". Record projection has been extended to allow for qualified names: "a.b.c" and "a.b.c[d]". Changed the LIGO extension from ".li" to ".ligo". Fixed the name of the language to be "LIGO" (instead of "Ligo"). 2019-03-18 20:47:11 +04:00			`(* This signature defines the lexical tokens for LIGO`
initial commit 2019-02-26 01:29:29 +04:00
			`_Tokens_ are the abstract units which are used by the parser to`
			`build the abstract syntax tree (AST), in other words, the stream of`
			`tokens is the minimal model of the input program, carrying`
			`implicitly all its structure in a linear encoding, and nothing`
			`else, in particular, comments and whitespace are absent.`

			`A _lexeme_ is a specific character string (concrete`
			`representation) denoting a token (abstract representation). Tokens`
			`can be thought of as sets, and lexemes as elements of those sets --`
			`there is often an infinite number of lexemes, but a small number of`
			`tokens. (Think of identifiers as lexemes and one token.)`

			`The tokens are qualified here as being "lexical" because the`
			`parser generator Menhir expects to define them, in which context`
			`they are called "parsing tokens", and they are made to match each`
			`other. (This is an idiosyncratic terminology.)`

			`The type of the lexical tokens is the variant [t], also`
			`aliased to [token].`
			`*)`

			`type lexeme = string`

			`(* TOKENS *)`

			`type t =`
			`(* Literals *)`

			`String of lexeme Region.reg`
I removed dummy module [MBytes] and use [Hex] directly. 2019-03-20 19:31:33 +04:00			`\| Bytes of (lexeme * Hex.t) Region.reg`
initial commit 2019-02-26 01:29:29 +04:00			`\| Int of (lexeme * Z.t) Region.reg`
			`\| Ident of lexeme Region.reg`
			`\| Constr of lexeme Region.reg`

			`(* Symbols *)`

			`\| SEMI of Region.t (* ";" *)`
			`\| COMMA of Region.t (* "," *)`
			`\| LPAR of Region.t (* "(" *)`
			`\| RPAR of Region.t (* ")" *)`
			`\| LBRACE of Region.t (* "{" *)`
			`\| RBRACE of Region.t (* "}" *)`
			`\| LBRACKET of Region.t (* "[" *)`
			`\| RBRACKET of Region.t (* "]" *)`
Storage and operations are now explicitly named. Refactoring of AST to enable the detection of incomplete pattern matchings by the OCaml compiler. Some record fields renamed for better readability. 2019-03-10 22:41:27 +04:00			`\| CONS of Region.t (* "#" *)`
initial commit 2019-02-26 01:29:29 +04:00			`\| VBAR of Region.t (* "\|" *)`
			`\| ARROW of Region.t (* "->" *)`
I extended the grammar with optional semicolons and vertical bars. 2019-03-07 20:06:02 +04:00			`\| ASS of Region.t (* ":=" *)`
initial commit 2019-02-26 01:29:29 +04:00			`\| EQUAL of Region.t (* "=" *)`
			`\| COLON of Region.t (* ":" *)`
			`\| LT of Region.t (* "<" *)`
			`\| LEQ of Region.t (* "<=" *)`
			`\| GT of Region.t (* ">" *)`
			`\| GEQ of Region.t (* ">=" *)`
			`\| NEQ of Region.t (* "=/=" *)`
			`\| PLUS of Region.t (* "+" *)`
			`\| MINUS of Region.t (* "-" *)`
			`\| SLASH of Region.t (* "/" *)`
			`\| TIMES of Region.t (* "*" *)`
			`\| DOT of Region.t (* "." *)`
			`\| WILD of Region.t (* "_" *)`
			`\| CAT of Region.t (* "^" *)`

			`(* Keywords *)`

Changed "\|\|" -> "or" and "&&" -> "and" to be more Pascal-like. 2019-03-20 15:28:25 +04:00			`\| And of Region.t (* "and" *)`
Added instruction 'fail'. I changed the grammar and AST for local functions and removed global mutable variables. 2019-02-28 18:46:34 +04:00			`\| Begin of Region.t (* "begin" *)`
Changed "match ... with ..." to "case ... of ..." 2019-03-19 14:46:30 +04:00			`\| Case of Region.t (* "case" *)`
Added instruction 'fail'. I changed the grammar and AST for local functions and removed global mutable variables. 2019-02-28 18:46:34 +04:00			`\| Const of Region.t (* "const" *)`
			`\| Down of Region.t (* "down" *)`
Changed "\|\|" -> "or" and "&&" -> "and" to be more Pascal-like. 2019-03-20 15:28:25 +04:00			`\| Else of Region.t (* "else" *)`
			`\| End of Region.t (* "end" *)`
			`\| Entrypoint of Region.t (* "entrypoint" *)`
Added instruction 'fail'. I changed the grammar and AST for local functions and removed global mutable variables. 2019-02-28 18:46:34 +04:00			`\| Fail of Region.t (* "fail" *)`
Changed "\|\|" -> "or" and "&&" -> "and" to be more Pascal-like. 2019-03-20 15:28:25 +04:00			`\| For of Region.t (* "for" *)`
Added construct `remove k from map m`. 2019-03-22 00:55:59 +04:00			`\| From of Region.t (* "from" *)`
Changed "\|\|" -> "or" and "&&" -> "and" to be more Pascal-like. 2019-03-20 15:28:25 +04:00			`\| Function of Region.t (* "function" *)`
Added instruction 'fail'. I changed the grammar and AST for local functions and removed global mutable variables. 2019-02-28 18:46:34 +04:00			`\| If of Region.t (* "if" *)`
			`\| In of Region.t (* "in" *)`
			`\| Is of Region.t (* "is" *)`
Maps can be defined by extension in declarations. "map" has become a keyword to introduce definition of maps by extension in declarations ("map" ... "end"). This entails that a grammar rule had to be created to handle the type expressions "map (..., ...)". Concordantly, I added map patches, modelled after record patches. I created a node in the AST for map expressions (currently only map look-ups). I refactored the parser with parametric rules. 2019-03-20 12:11:19 +04:00			`\| Map of Region.t (* "map" *)`
Changed "\|\|" -> "or" and "&&" -> "and" to be more Pascal-like. 2019-03-20 15:28:25 +04:00			`\| Mod of Region.t (* "mod" *)`
			`\| Not of Region.t (* "not" *)`
			`\| Of of Region.t (* "of" *)`
			`\| Or of Region.t (* "or" *)`
Replaced expression "copy ... with ..." by instruction "patch ... with". 2019-03-19 17:32:43 +04:00			`\| Patch of Region.t (* "patch" *)`
Added instruction 'fail'. I changed the grammar and AST for local functions and removed global mutable variables. 2019-02-28 18:46:34 +04:00			`\| Procedure of Region.t (* "procedure" *)`
			`\| Record of Region.t (* "record" *)`
Added construct `remove k from map m`. 2019-03-22 00:55:59 +04:00			`\| Remove of Region.t (* "remove" *)`
Changed "do nothing" to "skip". 2019-03-18 21:09:15 +04:00			`\| Skip of Region.t (* "skip" *)`
Added instruction 'fail'. I changed the grammar and AST for local functions and removed global mutable variables. 2019-02-28 18:46:34 +04:00			`\| Step of Region.t (* "step" *)`
Added keyword "storage" as parameter kind (with "var" and "const") for entrypoints. 2019-03-14 21:17:19 +04:00			`\| Storage of Region.t (* "storage" *)`
Changed "\|\|" -> "or" and "&&" -> "and" to be more Pascal-like. 2019-03-20 15:28:25 +04:00			`\| Then of Region.t (* "then" *)`
Added instruction 'fail'. I changed the grammar and AST for local functions and removed global mutable variables. 2019-02-28 18:46:34 +04:00			`\| To of Region.t (* "to" *)`
Changed "\|\|" -> "or" and "&&" -> "and" to be more Pascal-like. 2019-03-20 15:28:25 +04:00			`\| Type of Region.t (* "type" *)`
			`\| Var of Region.t (* "var" *)`
Added instruction 'fail'. I changed the grammar and AST for local functions and removed global mutable variables. 2019-02-28 18:46:34 +04:00			`\| While of Region.t (* "while" *)`
			`\| With of Region.t (* "with" *)`
initial commit 2019-02-26 01:29:29 +04:00
			`(* Data constructors *)`

			`\| C_False of Region.t (* "False" *)`
			`\| C_None of Region.t (* "None" *)`
			`\| C_Some of Region.t (* "Some" *)`
			`\| C_True of Region.t (* "True" *)`
			`\| C_Unit of Region.t (* "Unit" *)`

			`(* Virtual tokens *)`

			`\| EOF of Region.t`


			`type token = t`

			`(* Projections`

			`The difference between extracting the lexeme and a string from a`
			`token is that the latter is the textual representation of the OCaml`
			`value denoting the token (its abstract syntax), rather than its`
			`lexeme (concrete syntax).`
			`*)`

			`val to_lexeme : token -> lexeme`
			val to_string : token -> ?offsets:bool -> [`Byte \| `Point] -> string
			`val to_region : token -> Region.t`

			`(* Injections *)`

			`type int_err =`
			`Non_canonical_zero`

			`type ident_err = Reserved_name`

			`val mk_string : lexeme -> Region.t -> token`
			`val mk_bytes : lexeme -> Region.t -> token`
			`val mk_int : lexeme -> Region.t -> (token, int_err) result`
			`val mk_ident : lexeme -> Region.t -> (token, ident_err) result`
			`val mk_constr : lexeme -> Region.t -> token`
			`val mk_sym : lexeme -> Region.t -> token`
			`val eof : Region.t -> token`

			`(* Predicates *)`

			`val is_string : token -> bool`
			`val is_bytes : token -> bool`
			`val is_int : token -> bool`
			`val is_ident : token -> bool`
			`val is_kwd : token -> bool`
			`val is_constr : token -> bool`
			`val is_sym : token -> bool`
			`val is_eof : token -> bool`