(* Lexer specification for LIGO, to be processed by [ocamllex] *)
{
(* START HEADER *)

(* Shorthands *)

module Region = Simple_utils.Region
module Pos    = Simple_utils.Pos
module SMap   = Map.Make (String)
module SSet   = Set.Make (String)

(* A lexeme is the concrete syntax (source text) of a token. *)
type lexeme = string

let sprintf = Printf.sprintf
(* TOKENS *)

(* An attribute pairs a header with its located payload string. *)
type attribute = {
  header : string;
  string : lexeme Region.reg
}
(* The tokens of PascaLIGO. Every token carries the region of the
   source text it was scanned from; literal tokens also carry their
   lexeme and, when meaningful, an interpreted value. *)
type t =
  (* Literals *)

  String of lexeme Region.reg
| Bytes  of (lexeme * Hex.t) Region.reg
| Int    of (lexeme * Z.t) Region.reg
| Nat    of (lexeme * Z.t) Region.reg
| Mutez  of (lexeme * Z.t) Region.reg
| Ident  of lexeme Region.reg
| Constr of lexeme Region.reg

  (* Symbols *)

| SEMI     of Region.t  (* ";"   *)
| COMMA    of Region.t  (* ","   *)
| LPAR     of Region.t  (* "("   *)
| RPAR     of Region.t  (* ")"   *)
| LBRACE   of Region.t  (* "{"   *)
| RBRACE   of Region.t  (* "}"   *)
| LBRACKET of Region.t  (* "["   *)
| RBRACKET of Region.t  (* "]"   *)
| CONS     of Region.t  (* "#"   *)
| VBAR     of Region.t  (* "|"   *)
| ARROW    of Region.t  (* "->"  *)
| ASS      of Region.t  (* ":="  *)
| EQ       of Region.t  (* "="   *)
| COLON    of Region.t  (* ":"   *)
| LT       of Region.t  (* "<"   *)
| LE       of Region.t  (* "<="  *)
| GT       of Region.t  (* ">"   *)
| GE       of Region.t  (* ">="  *)
| NE       of Region.t  (* "=/=" *)
| PLUS     of Region.t  (* "+"   *)
| MINUS    of Region.t  (* "-"   *)
| SLASH    of Region.t  (* "/"   *)
| TIMES    of Region.t  (* "*"   *)
| DOT      of Region.t  (* "."   *)
| WILD     of Region.t  (* "_"   *)
| CAT      of Region.t  (* "^"   *)

  (* Keywords *)

| And        of Region.t  (* "and"        *)
| Attributes of Region.t  (* "attributes" *)
| Begin      of Region.t  (* "begin"      *)
| BigMap     of Region.t  (* "big_map"    *)
| Block      of Region.t  (* "block"      *)
| Case       of Region.t  (* "case"       *)
| Const      of Region.t  (* "const"      *)
| Contains   of Region.t  (* "contains"   *)
| Else       of Region.t  (* "else"       *)
| End        of Region.t  (* "end"        *)
| False      of Region.t  (* "False"      *)
| For        of Region.t  (* "for"        *)
| From       of Region.t  (* "from"       *)
| Function   of Region.t  (* "function"   *)
| Recursive  of Region.t  (* "recursive"  *)
| If         of Region.t  (* "if"         *)
| In         of Region.t  (* "in"         *)
| Is         of Region.t  (* "is"         *)
| List       of Region.t  (* "list"       *)
| Map        of Region.t  (* "map"        *)
| Mod        of Region.t  (* "mod"        *)
| Nil        of Region.t  (* "nil"        *)
| Not        of Region.t  (* "not"        *)
| Of         of Region.t  (* "of"         *)
| Or         of Region.t  (* "or"         *)
| Patch      of Region.t  (* "patch"      *)
| Record     of Region.t  (* "record"     *)
| Remove     of Region.t  (* "remove"     *)
| Set        of Region.t  (* "set"        *)
| Skip       of Region.t  (* "skip"       *)
| Step       of Region.t  (* "step"       *)
| Then       of Region.t  (* "then"       *)
| To         of Region.t  (* "to"         *)
| True       of Region.t  (* "True"       *)
| Type       of Region.t  (* "type"       *)
| Unit       of Region.t  (* "Unit"       *)
| Var        of Region.t  (* "var"        *)
| While      of Region.t  (* "while"      *)
| With       of Region.t  (* "with"       *)

  (* Data constructors *)

| C_None of Region.t  (* "None" *)
| C_Some of Region.t  (* "Some" *)

  (* Virtual tokens *)

| EOF of Region.t

(* Projections *)
type token = t

(* [proj_token t] is the pair of [t]'s region and a human-readable
   rendering of its constructor (with payload, for literals). *)
let proj_token = function
  (* Literals *)

  String Region.{region; value} ->
    region, sprintf "String %s" value
| Bytes Region.{region; value = s,b} ->
    region,
    sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b)
| Int Region.{region; value = s,n} ->
    region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
| Nat Region.{region; value = s,n} ->
    region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
| Mutez Region.{region; value = s,n} ->
    region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
| Ident Region.{region; value} ->
    region, sprintf "Ident \"%s\"" value
| Constr Region.{region; value} ->
    region, sprintf "Constr \"%s\"" value

  (* Symbols *)

| SEMI     region -> region, "SEMI"
| COMMA    region -> region, "COMMA"
| LPAR     region -> region, "LPAR"
| RPAR     region -> region, "RPAR"
| LBRACE   region -> region, "LBRACE"
| RBRACE   region -> region, "RBRACE"
| LBRACKET region -> region, "LBRACKET"
| RBRACKET region -> region, "RBRACKET"
| CONS     region -> region, "CONS"
| VBAR     region -> region, "VBAR"
| ARROW    region -> region, "ARROW"
| ASS      region -> region, "ASS"
| EQ       region -> region, "EQ"
| COLON    region -> region, "COLON"
| LT       region -> region, "LT"
| LE       region -> region, "LE"
| GT       region -> region, "GT"
| GE       region -> region, "GE"
| NE       region -> region, "NE"
| PLUS     region -> region, "PLUS"
| MINUS    region -> region, "MINUS"
| SLASH    region -> region, "SLASH"
| TIMES    region -> region, "TIMES"
| DOT      region -> region, "DOT"
| WILD     region -> region, "WILD"
| CAT      region -> region, "CAT"

  (* Keywords *)

| And        region -> region, "And"
| Attributes region -> region, "Attributes"
| Begin      region -> region, "Begin"
| BigMap     region -> region, "BigMap"
| Block      region -> region, "Block"
| Case       region -> region, "Case"
| Const      region -> region, "Const"
| Contains   region -> region, "Contains"
| Else       region -> region, "Else"
| End        region -> region, "End"
| False      region -> region, "False"
| For        region -> region, "For"
| From       region -> region, "From"
| Function   region -> region, "Function"
| Recursive  region -> region, "Recursive"
| If         region -> region, "If"
| In         region -> region, "In"
| Is         region -> region, "Is"
| List       region -> region, "List"
| Map        region -> region, "Map"
| Mod        region -> region, "Mod"
| Nil        region -> region, "Nil"
| Not        region -> region, "Not"
| Of         region -> region, "Of"
| Or         region -> region, "Or"
| Patch      region -> region, "Patch"
| Record     region -> region, "Record"
| Remove     region -> region, "Remove"
| Set        region -> region, "Set"
| Skip       region -> region, "Skip"
| Step       region -> region, "Step"
| Then       region -> region, "Then"
| To         region -> region, "To"
| True       region -> region, "True"
| Type       region -> region, "Type"
| Unit       region -> region, "Unit"
| Var        region -> region, "Var"
| While      region -> region, "While"
| With       region -> region, "With"

  (* Data constructors *)

| C_None region -> region, "C_None"
| C_Some region -> region, "C_Some"

  (* Virtual tokens *)

| EOF region -> region, "EOF"
(* [to_lexeme t] is the concrete syntax of token [t] as found in the
   source (escaped for string literals; empty for the virtual EOF). *)
let to_lexeme = function
  (* Literals *)

  String s  -> String.escaped s.Region.value
| Bytes b   -> fst b.Region.value
| Int i
| Nat i
| Mutez i   -> fst i.Region.value
| Ident id
| Constr id -> id.Region.value

  (* Symbols *)

| SEMI     _ -> ";"
| COMMA    _ -> ","
| LPAR     _ -> "("
| RPAR     _ -> ")"
| LBRACE   _ -> "{"
| RBRACE   _ -> "}"
| LBRACKET _ -> "["
| RBRACKET _ -> "]"
| CONS     _ -> "#"
| VBAR     _ -> "|"
| ARROW    _ -> "->"
| ASS      _ -> ":="
| EQ       _ -> "="
| COLON    _ -> ":"
| LT       _ -> "<"
| LE       _ -> "<="
| GT       _ -> ">"
| GE       _ -> ">="
| NE       _ -> "=/="
| PLUS     _ -> "+"
| MINUS    _ -> "-"
| SLASH    _ -> "/"
| TIMES    _ -> "*"
| DOT      _ -> "."
| WILD     _ -> "_"
| CAT      _ -> "^"

  (* Keywords *)

| And _        -> "and"
| Attributes _ -> "attributes"
| Begin _      -> "begin"
| BigMap _     -> "big_map"
| Block _      -> "block"
| Case _       -> "case"
| Const _      -> "const"
| Contains _   -> "contains"
| Else _       -> "else"
| End _        -> "end"
| False _      -> "False"
| For _        -> "for"
| From _       -> "from"
| Function _   -> "function"
| Recursive _  -> "recursive"
| If _         -> "if"
| In _         -> "in"
| Is _         -> "is"
| List _       -> "list"
| Map _        -> "map"
| Mod _        -> "mod"
| Nil _        -> "nil"
| Not _        -> "not"
| Of _         -> "of"
| Or _         -> "or"
| Patch _      -> "patch"
| Record _     -> "record"
| Remove _     -> "remove"
| Set _        -> "set"
| Skip _       -> "skip"
| Step _       -> "step"
| Then _       -> "then"
| To _         -> "to"
| True _       -> "True"
| Type _       -> "type"
| Unit _       -> "Unit"
| Var _        -> "var"
| While _      -> "while"
| With _       -> "with"

  (* Data constructors *)

| C_None _ -> "None"
| C_Some _ -> "Some"

  (* Virtual tokens *)

| EOF _ -> ""
(* CONVERSIONS *)
(* [to_string token ?offsets mode] renders [token] as
   "<region>: <constructor>" for tracing and diagnostics. *)
let to_string token ?(offsets=true) mode =
  let region, token_str = proj_token token in
  sprintf "%s: %s" (region#compact ~offsets mode) token_str

(* [to_region token] is the source region spanned by [token]. *)
let to_region token = fst (proj_token token)
(* LEXIS *)

(* Smart constructors for keyword tokens; each is mapped below to
   its lexeme to build the keyword lexicon. Note that [False],
   [True], [Unit] and [C_None] also appear in [constructors]. *)
let keywords = [
  (fun reg -> And        reg);
  (fun reg -> Attributes reg);
  (fun reg -> Begin      reg);
  (fun reg -> BigMap     reg);
  (fun reg -> Block      reg);
  (fun reg -> Case       reg);
  (fun reg -> Const      reg);
  (fun reg -> Contains   reg);
  (fun reg -> Else       reg);
  (fun reg -> End        reg);
  (fun reg -> For        reg);
  (fun reg -> From       reg);
  (fun reg -> Function   reg);
  (fun reg -> False      reg);
  (fun reg -> If         reg);
  (fun reg -> In         reg);
  (fun reg -> Is         reg);
  (fun reg -> List       reg);
  (fun reg -> Map        reg);
  (fun reg -> Mod        reg);
  (fun reg -> Nil        reg);
  (fun reg -> Not        reg);
  (fun reg -> C_None     reg);
  (fun reg -> Of         reg);
  (fun reg -> Or         reg);
  (fun reg -> Patch      reg);
  (fun reg -> Record     reg);
  (fun reg -> Recursive  reg);
  (fun reg -> Remove     reg);
  (fun reg -> Set        reg);
  (fun reg -> Skip       reg);
  (fun reg -> Step       reg);
  (fun reg -> Then       reg);
  (fun reg -> To         reg);
  (fun reg -> True       reg);
  (fun reg -> Type       reg);
  (fun reg -> Unit       reg);
  (fun reg -> Var        reg);
  (fun reg -> While      reg);
  (fun reg -> With       reg)
]
(* No identifiers are reserved beyond the keywords above. *)
let reserved = SSet.empty

(* Smart constructors for data-constructor tokens. *)
let constructors = [
  (fun reg -> False  reg);
  (fun reg -> True   reg);
  (fun reg -> Unit   reg);
  (fun reg -> C_None reg);
  (fun reg -> C_Some reg)
]
(* [add map (key, value)] binds [key] to [value] in [map]. *)
let add map (key, value) = SMap.add key value map

(* [mk_map mk_key list] maps each element of [list] to itself,
   keyed by [mk_key] applied to that element. *)
let mk_map mk_key list =
  let apply map value = add map (mk_key value, value)
  in List.fold_left apply SMap.empty list

type lexis = {
  kwd  : (Region.t -> token) SMap.t;
  cstr : (Region.t -> token) SMap.t;
  res  : SSet.t
}

(* The lexicon maps each keyword and constructor lexeme to its token
   smart constructor. A lexeme is recovered by applying the smart
   constructor to a ghost region and printing the resulting token. *)
let lexicon : lexis =
  let build list = mk_map (fun f -> to_lexeme (f Region.ghost)) list
  in {kwd  = build keywords;
      cstr = build constructors;
      res  = reserved}

(* Keywords *)

type kwd_err = Invalid_keyword

(* [mk_kwd ident region] is the keyword token for [ident], or
   [Error Invalid_keyword] if [ident] is not a keyword. *)
let mk_kwd ident region =
  match SMap.find_opt ident lexicon.kwd with
    Some mk_kwd -> Ok (mk_kwd region)
  |        None -> Error Invalid_keyword
(* Identifiers *)

(* Error returned when an identifier is a reserved name. *)
type ident_err = Reserved_name

(* END HEADER *)
}
(* START LEXER DEFINITION *)

(* Named regular expressions *)

let small = ['a'-'z']
let capital = ['A'-'Z']
let letter = small | capital
let digit = ['0'-'9']
(* Identifiers start with a lowercase letter. *)
let ident = small (letter | '_' | digit)*
(* Data constructors start with an uppercase letter. *)
let constr = capital (letter | '_' | digit)*
(* Rules *)

(* [scan_ident region lexicon] scans a whole lexeme as an
   identifier: reserved names are rejected, keywords yield their
   keyword token, anything else yields [Ident]. *)
rule scan_ident region lexicon = parse
(ident as value) eof {
if SSet.mem value lexicon.res
then Error Reserved_name
else Ok (match SMap.find_opt value lexicon.kwd with
Some mk_kwd -> mk_kwd region
| None -> Ident Region.{region; value}) }

(* [scan_constr region lexicon] scans a whole lexeme as a data
   constructor, falling back to the generic [Constr] token. *)
and scan_constr region lexicon = parse
(constr as value) eof {
match SMap.find_opt value lexicon.cstr with
Some mk_cstr -> mk_cstr region
| None -> Constr Region.{region; value} }
(* END LEXER DEFINITION *)
{
(* START TRAILER *)

(* Smart constructors (injections) *)

(* [mk_string lexeme region] injects [lexeme] as a [String] token. *)
let mk_string lexeme region = String Region.{region; value=lexeme}
(* [mk_bytes lexeme region] makes a [Bytes] token, normalising the
   lexeme by stripping the underscores used as digit separators. *)
let mk_bytes lexeme region =
  let norm = Str.(global_replace (regexp "_") "" lexeme) in
  let value = lexeme, `Hex norm
  in Bytes Region.{region; value}
type int_err = Non_canonical_zero

(* [mk_int lexeme region] makes an [Int] token, rejecting
   non-canonical spellings of zero (anything but "0"). *)
let mk_int lexeme region =
  let z =
    Str.(global_replace (regexp "_") "" lexeme) |> Z.of_string
  in if Z.equal z Z.zero && lexeme <> "0"
     then Error Non_canonical_zero
     else Ok (Int Region.{region; value = lexeme,z})
type nat_err =
  Invalid_natural
| Non_canonical_zero_nat

(* [mk_nat lexeme region] makes a [Nat] token from a lexeme ending
   in 'n' (e.g. "42n"), rejecting lexemes without the 'n' suffix and
   non-canonical spellings of zero (anything but "0n"). *)
let mk_nat lexeme region =
  match String.index_opt lexeme 'n' with
    None -> Error Invalid_natural
  | Some _ ->
      let z = Str.(global_replace (regexp "_") "" lexeme) |>
              Str.(global_replace (regexp "n") "") |>
              Z.of_string in
      if Z.equal z Z.zero && lexeme <> "0n"
      then Error Non_canonical_zero_nat
      else Ok (Nat Region.{region; value = lexeme,z})
2019-10-27 11:50:24 -05:00
let mk_mutez lexeme region =
2020-04-28 19:26:31 +02:00
let z = Str.(global_replace (regexp "_") "" lexeme) |>
Str.(global_replace (regexp "mutez") "") |>
Z.of_string in
if Z.equal z Z.zero && lexeme <> "0mutez"
2019-05-12 20:56:22 +00:00
then Error Non_canonical_zero
2019-10-27 11:50:24 -05:00
else Ok (Mutez Region.{region; value = lexeme, z})
2019-05-12 20:56:22 +00:00
(* [eof region] is the virtual end-of-file token. *)
let eof region = EOF region

type sym_err = Invalid_symbol
(* [mk_sym lexeme region] is the symbol token for [lexeme], or
   [Error Invalid_symbol] if [lexeme] is not a valid symbol. *)
let mk_sym lexeme region =
  match lexeme with
    (* Lexemes in common with all concrete syntaxes *)
    ";"   -> Ok (SEMI     region)
  | ","   -> Ok (COMMA    region)
  | "("   -> Ok (LPAR     region)
  | ")"   -> Ok (RPAR     region)
  | "["   -> Ok (LBRACKET region)
  | "]"   -> Ok (RBRACKET region)
  | "{"   -> Ok (LBRACE   region)
  | "}"   -> Ok (RBRACE   region)
  | "="   -> Ok (EQ       region)
  | ":"   -> Ok (COLON    region)
  | "|"   -> Ok (VBAR     region)
  | "->"  -> Ok (ARROW    region)
  | "."   -> Ok (DOT      region)
  | "_"   -> Ok (WILD     region)
  | "^"   -> Ok (CAT      region)
  | "+"   -> Ok (PLUS     region)
  | "-"   -> Ok (MINUS    region)
  | "*"   -> Ok (TIMES    region)
  | "/"   -> Ok (SLASH    region)
  | "<"   -> Ok (LT       region)
  | "<="  -> Ok (LE       region)
  | ">"   -> Ok (GT       region)
  | ">="  -> Ok (GE       region)

    (* Lexemes specific to PascaLIGO *)
  | "=/=" -> Ok (NE   region)
  | "#"   -> Ok (CONS region)
  | ":="  -> Ok (ASS  region)

    (* Invalid lexemes *)
  | _ -> Error Invalid_symbol
(* Identifiers *)

(* [mk_ident lexeme region] scans [lexeme] as an identifier, which
   may resolve to a keyword token or a plain [Ident]. *)
let mk_ident lexeme region =
  Lexing.from_string lexeme |> scan_ident region lexicon

(* Constructors *)

(* [mk_constr lexeme region] scans [lexeme] as a data constructor. *)
let mk_constr lexeme region =
  Lexing.from_string lexeme |> scan_constr region lexicon
(* Attributes *)

type attr_err = Invalid_attribute

(* Attributes are not supported here: always an error. *)
let mk_attr _ _ _ = Error Invalid_attribute
(* Predicates *)

let is_string = function String _ -> true | _ -> false
let is_bytes  = function Bytes _  -> true | _ -> false
let is_int    = function Int _    -> true | _ -> false
let is_ident  = function Ident _  -> true | _ -> false
let is_eof    = function EOF _    -> true | _ -> false
let is_minus  = function MINUS _  -> true | _ -> false
(* Errors *)

(* Stylistic errors detected when checking a token's right context. *)
type error =
  Odd_lengthed_bytes
| Missing_break
| Negative_byte_sequence
(* [error_to_string e] is a user-facing message for [e], with a hint. *)
let error_to_string = function
  Odd_lengthed_bytes ->
    "The length of the byte sequence is an odd number.\n\
     Hint: Add or remove a digit."
| Missing_break ->
    "Missing break.\n\
     Hint: Insert some space."
| Negative_byte_sequence ->
    "Negative byte sequence.\n\
     Hint: Remove the leading minus sign."
exception Error of error Region.reg

(* [format_error ?offsets mode err ~file] renders a located lexical
   error as a message tagged with its source region. *)
let format_error ?(offsets=true) mode Region.{region; value} ~file =
  let reg = region#to_string ~file ~offsets mode in
  let msg = error_to_string value in
  let value = sprintf "Lexical error %s:\n%s\n" reg msg in
  Region.{value; region}

(* [fail region value] raises [Error] located at [region]. *)
let fail region value = raise (Error Region.{region; value})
(* [check_right_context token next_token buffer] enforces stylistic
   constraints on the token that follows [token] in [buffer]:
   a minus sign directly before bytes is a negative byte sequence;
   literals and identifiers that are adjacent (no markup between
   them) need a break; bytes followed by an int suggests an
   odd-lengthed byte sequence. Raises [Error] via [fail] when a
   constraint is violated; does nothing otherwise. *)
let check_right_context token next_token buffer : unit =
  (* Default error region: the empty region at the end of [token]. *)
  let pos    = (to_region token)#stop in
  let region = Region.make ~start:pos ~stop:pos in
  match next_token buffer with
    None -> ()
  | Some (markup, next) ->
      if is_minus token && is_bytes next
      then let region =
             Region.cover (to_region token) (to_region next)
           in fail region Negative_byte_sequence
      else
        match markup with
          [] ->
            if is_int token
            then if is_string next || is_ident next
                 then fail region Missing_break
                 else ()
            else
              if is_string token
              then if is_int next || is_bytes next || is_ident next
                   then fail region Missing_break
                   else ()
              else
                if is_bytes token
                then if is_string next || is_ident next
                     then fail region Missing_break
                     else if is_int next
                          then fail region Odd_lengthed_bytes
                          else ()
                else ()
        | _::_ -> ()
(* END TRAILER *)
}