(* This signature defines the lexical tokens for LIGO.

   _Tokens_ are the abstract units used by the parser to build the
   abstract syntax tree (AST). In other words, the stream of tokens
   is a minimal model of the input program, implicitly carrying all
   of its structure in a linear encoding, and nothing else: in
   particular, comments and whitespace are absent.

   A _lexeme_ is a specific character string (concrete
   representation) denoting a token (abstract representation).
   Tokens can be thought of as sets, and lexemes as elements of
   those sets: there is often an infinite number of lexemes, but a
   small number of tokens. (Think of identifiers: many lexemes, one
   token.)

   The tokens are qualified here as "lexical" because the parser
   generator Menhir expects to define them, in which context they
   are called "parsing tokens", and the two definitions are made to
   match each other. (This is an idiosyncratic terminology.)

   The type of the lexical tokens is the variant [t], also aliased
   to [token].
*)
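
(* For instance (an illustrative sketch, not part of this
   signature): the lexemes "x" and "foo" are two elements of the set
   denoted by the single token [Ident], whereas the lexeme "+" is
   the only one denoting the token [PLUS]. Assuming [Region.ghost]
   and the record fields [region] and [value] from
   [Simple_utils.Region], such tokens could be built as

     let x    = Ident Region.{region=ghost; value="x"}
     let plus = PLUS Region.ghost
*)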
module Region = Simple_utils.Region
module Pos    = Simple_utils.Pos

type lexeme = string

(* TOKENS *)
type t =
  (* Symbols *)

  CAT      of Region.t  (* "++" *)

  (* Arithmetics *)

| MINUS    of Region.t  (* "-" *)
| PLUS     of Region.t  (* "+" *)
| SLASH    of Region.t  (* "/" *)
| TIMES    of Region.t  (* "*" *)

  (* Compounds *)

| LPAR     of Region.t  (* "(" *)
| RPAR     of Region.t  (* ")" *)
| LBRACKET of Region.t  (* "[" *)
| RBRACKET of Region.t  (* "]" *)
| LBRACE   of Region.t  (* "{" *)
| RBRACE   of Region.t  (* "}" *)

  (* Separators *)

| COMMA    of Region.t  (* "," *)
| SEMI     of Region.t  (* ";" *)
| VBAR     of Region.t  (* "|" *)
| COLON    of Region.t  (* ":" *)
| DOT      of Region.t  (* "." *)
| ELLIPSIS of Region.t  (* "..." *)
| ARROW    of Region.t  (* "=>" *)

  (* Wildcard *)

| WILD     of Region.t  (* "_" *)

  (* Comparisons *)

| EQ       of Region.t  (* "=" *)
| EQEQ     of Region.t  (* "==" *)
| NE       of Region.t  (* "!=" *)
| LT       of Region.t  (* "<" *)
| GT       of Region.t  (* ">" *)
| LE       of Region.t  (* "<=" *)
| GE       of Region.t  (* ">=" *)

  (* Logic *)

| BOOL_OR  of Region.t  (* "||" *)
| BOOL_AND of Region.t  (* "&&" *)
| NOT      of Region.t  (* "!" *)

  (* Identifiers, labels, numbers and strings *)

| Ident    of string Region.reg
| Constr   of string Region.reg
| Int      of (string * Z.t) Region.reg
| Nat      of (string * Z.t) Region.reg
| Mutez    of (string * Z.t) Region.reg
| String   of string Region.reg
| Verbatim of string Region.reg
| Bytes    of (string * Hex.t) Region.reg
| Attr     of string Region.reg

  (* Keywords *)

| Else   of Region.t
| False  of Region.t
| If     of Region.t
| Let    of Region.t
| Mod    of Region.t
| Or     of Region.t
| Rec    of Region.t
| Switch of Region.t
| True   of Region.t
| Type   of Region.t

  (* Data constructors *)

| C_None of Region.t  (* "None" *)
| C_Some of Region.t  (* "Some" *)

  (* Virtual tokens *)

| EOF of Region.t  (* End of file *)
(* Projections

   The difference between extracting the lexeme and a string from a
   token is that the latter is the textual representation of the
   OCaml value denoting the token (its abstract syntax), rather than
   its lexeme (concrete syntax).
*)

type token = t

val to_lexeme : token -> lexeme
val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string
val to_region : token -> Region.t
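
(* For example (a sketch, assuming an implementation [Token] of this
   signature and a region [r] at hand):

     Token.to_lexeme (PLUS r)         (* "+", the concrete syntax *)
     Token.to_string (PLUS r) `Point  (* something like "PLUS",   *)
                                      (* the abstract syntax      *)
*)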
(* Comments *)

val block_comment_start : lexeme -> bool
val block_comment_end   : lexeme -> bool
val line_comment_start  : lexeme -> bool
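
(* These predicates let a generic lexer recognise comment openings
   and closings without hard-wiring the delimiters. A minimal sketch
   of an implementation, assuming ReasonLIGO-style delimiters:

     let block_comment_start lexeme = String.equal lexeme "/*"
     let block_comment_end   lexeme = String.equal lexeme "*/"
     let line_comment_start  lexeme = String.equal lexeme "//"
*)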
(* Injections *)

type int_err   = Non_canonical_zero

type ident_err = Reserved_name

type nat_err   = Invalid_natural
               | Non_canonical_zero_nat

type sym_err   = Invalid_symbol
type attr_err  = Invalid_attribute
type kwd_err   = Invalid_keyword

val mk_int      : lexeme -> Region.t -> (token, int_err) result
val mk_nat      : lexeme -> Region.t -> (token, nat_err) result
val mk_mutez    : lexeme -> Region.t -> (token, int_err) result
val mk_ident    : lexeme -> Region.t -> (token, ident_err) result
val mk_attr     : string -> lexeme -> Region.t -> (token, attr_err) result
val mk_sym      : lexeme -> Region.t -> (token, sym_err) result
val mk_kwd      : lexeme -> Region.t -> (token, kwd_err) result
val mk_string   : lexeme -> Region.t -> token
val mk_verbatim : lexeme -> Region.t -> token
val mk_bytes    : lexeme -> Region.t -> token
val mk_constr   : lexeme -> Region.t -> token
val eof         : Region.t -> token
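
(* Usage sketch (hypothetical caller): these smart constructors
   validate their lexeme and return [Error] instead of raising.
   Assuming a region [region] at hand:

     match mk_int "00" region with
       Ok token                 -> ...  (* a well-formed [Int] *)
     | Error Non_canonical_zero -> ...  (* zero written non-canonically *)
*)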
(* Predicates *)

val is_eof : token -> bool

(* Style *)

type error

val error_to_string : error -> string

exception Error of error Region.reg

val format_error :
  ?offsets:bool -> [`Byte | `Point] ->
  error Region.reg -> file:bool -> string Region.reg

val check_right_context :
  token ->
  (Lexing.lexbuf -> (Markup.t list * token) option) ->
  Lexing.lexbuf ->
  unit
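
(* Usage sketch (hypothetical lexer driver): [check_right_context]
   peeks at what follows the given token to enforce styling rules,
   presumably raising [Error] when the check fails. Assuming a
   scanner [scan] of the expected type:

     let scan lexbuf = ...  (* Some (markup, next token), or None *)

     let validate token lexbuf =
       try check_right_context token scan lexbuf with
         Error {Region.value; _} ->
           prerr_endline (error_to_string value)
*)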