Finished refactoring of lexer.

This commit is contained in:
Christian Rinderknecht 2020-04-28 19:26:31 +02:00
parent 9618a48848
commit 005b7fd69b
27 changed files with 1174 additions and 1079 deletions

View File

@@ -54,7 +54,7 @@ ligo: : Lexical error in file "broken_string.religo", line 1, characters 8-9:
 run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.ligo" ; "main" ] ;
 [%expect {|
-ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters 18-23:
+ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters 18-31:
 Negative byte sequence.
 Hint: Remove the leading minus sign.
 {}
@@ -71,7 +71,7 @@ ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters
 run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.mligo" ; "main" ] ;
 [%expect {|
-ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters 8-13:
+ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters 8-21:
 Negative byte sequence.
 Hint: Remove the leading minus sign.
 {}
@@ -88,7 +88,7 @@ ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters
 run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.religo" ; "main" ] ;
 [%expect {|
-ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, characters 8-13:
+ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, characters 8-21:
 Negative byte sequence.
 Hint: Remove the leading minus sign.
 {}
@@ -103,6 +103,7 @@ ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, character
 * Check the changelog by running 'ligo changelog'
 |} ];
+(*
 run_ligo_bad [ "compile-contract" ; "../../test/lexer/reserved_name.ligo" ; "main" ] ;
 [%expect {|
 ligo: : Lexical error in file "reserved_name.ligo", line 1, characters 4-13:
@@ -119,6 +120,7 @@ ligo: : Lexical error in file "reserved_name.ligo", line 1, characters 4-13:
 * Open a gitlab issue: https://gitlab.com/ligolang/ligo/issues/new
 * Check the changelog by running 'ligo changelog'
 |} ];
+*)
 run_ligo_bad [ "compile-contract" ; "../../test/lexer/reserved_name.religo" ; "main" ] ;
 [%expect {|
@@ -204,10 +206,9 @@ ligo: : Lexical error in file "unexpected_character.religo", line 1, characters
 run_ligo_bad [ "compile-contract" ; "../../test/lexer/unterminated_comment.mligo" ; "main" ] ;
 [%expect {|
-ligo: : Lexical error in file "unterminated_comment.mligo", line 1, characters 0-2:
+ligo: : Preprocessing error in file "../../test/lexer/unterminated_comment.mligo", line 1, characters 0-2:
 Unterminated comment.
 Hint: Close with "*)".
 {}
 If you're not sure how to fix this error, you can
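Note on the widened character spans above (for instance 18-23 becoming 18-31): the lexer now reports a negative byte sequence over the whole "-0x…" lexeme instead of stopping right after the minus sign. A minimal sketch of that computation, assuming only [to_region] and [Region.cover], both used in the LexToken changes further down:

    (* Sketch: the reported region spans from the start of the minus
       sign to the end of the bytes literal. *)
    let negative_byte_sequence_region minus_token bytes_token =
      Region.cover (to_region minus_token) (to_region bytes_token)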

View File

@@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile
 ../shared/Lexer.mli
 ../shared/Lexer.mll
+../shared/LexerLib.ml
 ../shared/EvalOpt.ml
 ../shared/EvalOpt.mli
 ../shared/FQueue.ml

View File

@@ -102,138 +102,167 @@ type t =
 | EOF of Region.t (* End of file *)

+(* Projections *)
+
 type token = t

 let proj_token = function
-  ARROW region -> region, "ARROW"
-| CONS region -> region, "CONS"
-| CAT region -> region, "CAT"
-| MINUS region -> region, "MINUS"
-| PLUS region -> region, "PLUS"
-| SLASH region -> region, "SLASH"
-| TIMES region -> region, "TIMES"
-| LPAR region -> region, "LPAR"
-| RPAR region -> region, "RPAR"
-| LBRACKET region -> region, "LBRACKET"
-| RBRACKET region -> region, "RBRACKET"
-| LBRACE region -> region, "LBRACE"
-| RBRACE region -> region, "RBRACE"
-| COMMA region -> region, "COMMA"
-| SEMI region -> region, "SEMI"
-| VBAR region -> region, "VBAR"
-| COLON region -> region, "COLON"
-| DOT region -> region, "DOT"
-| WILD region -> region, "WILD"
-| EQ region -> region, "EQ"
-| NE region -> region, "NE"
-| LT region -> region, "LT"
-| GT region -> region, "GT"
-| LE region -> region, "LE"
-| GE region -> region, "GE"
-| BOOL_OR region -> region, "BOOL_OR"
-| BOOL_AND region -> region, "BOOL_AND"
-| Ident Region.{region; value} ->
-    region, sprintf "Ident %s" value
-| Constr Region.{region; value} ->
-    region, sprintf "Constr %s" value
+  (* Literals *)
+
+  String Region.{region; value} ->
+    region, sprintf "Str %s" value
+| Bytes Region.{region; value = s,b} ->
+    region,
+    sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b)
 | Int Region.{region; value = s,n} ->
     region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
 | Nat Region.{region; value = s,n} ->
     region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
 | Mutez Region.{region; value = s,n} ->
     region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
-| String Region.{region; value} ->
-    region, sprintf "Str %s" value
-| Bytes Region.{region; value = s,b} ->
-    region,
-    sprintf "Bytes (\"%s\", \"0x%s\")"
-            s (Hex.show b)
+| Ident Region.{region; value} ->
+    region, sprintf "Ident %s" value
+| Constr Region.{region; value} ->
+    region, sprintf "Constr %s" value
 | Attr Region.{region; value} ->
     region, sprintf "Attr \"%s\"" value
+
+  (* Symbols *)
+
+| ARROW region -> region, "ARROW"
+| CONS region -> region, "CONS"
+| CAT region -> region, "CAT"
+| MINUS region -> region, "MINUS"
+| PLUS region -> region, "PLUS"
+| SLASH region -> region, "SLASH"
+| TIMES region -> region, "TIMES"
+| LPAR region -> region, "LPAR"
+| RPAR region -> region, "RPAR"
+| LBRACKET region -> region, "LBRACKET"
+| RBRACKET region -> region, "RBRACKET"
+| LBRACE region -> region, "LBRACE"
+| RBRACE region -> region, "RBRACE"
+| COMMA region -> region, "COMMA"
+| SEMI region -> region, "SEMI"
+| VBAR region -> region, "VBAR"
+| COLON region -> region, "COLON"
+| DOT region -> region, "DOT"
+| WILD region -> region, "WILD"
+| EQ region -> region, "EQ"
+| NE region -> region, "NE"
+| LT region -> region, "LT"
+| GT region -> region, "GT"
+| LE region -> region, "LE"
+| GE region -> region, "GE"
+| BOOL_OR region -> region, "BOOL_OR"
+| BOOL_AND region -> region, "BOOL_AND"
+
+  (* Keywords *)
+
 | Begin region -> region, "Begin"
 | Else region -> region, "Else"
 | End region -> region, "End"
 | False region -> region, "False"
 | Fun region -> region, "Fun"
 | Rec region -> region, "Rec"
 | If region -> region, "If"
 | In region -> region, "In"
 | Let region -> region, "Let"
 | Match region -> region, "Match"
 | Mod region -> region, "Mod"
 | Not region -> region, "Not"
 | Of region -> region, "Of"
 | Or region -> region, "Or"
 | Then region -> region, "Then"
 | True region -> region, "True"
 | Type region -> region, "Type"
 | With region -> region, "With"
-| C_None region -> region, "C_None"
-| C_Some region -> region, "C_Some"
+
+  (* Data *)
+
+| C_None region -> region, "C_None"
+| C_Some region -> region, "C_Some"
+
+  (* Virtual tokens *)
+
 | EOF region -> region, "EOF"

 let to_lexeme = function
-  ARROW _ -> "->"
-| CONS _ -> "::"
-| CAT _ -> "^"
-| MINUS _ -> "-"
-| PLUS _ -> "+"
-| SLASH _ -> "/"
-| TIMES _ -> "*"
-| LPAR _ -> "("
-| RPAR _ -> ")"
-| LBRACKET _ -> "["
-| RBRACKET _ -> "]"
-| LBRACE _ -> "{"
-| RBRACE _ -> "}"
-| COMMA _ -> ","
-| SEMI _ -> ";"
-| VBAR _ -> "|"
-| COLON _ -> ":"
-| DOT _ -> "."
-| WILD _ -> "_"
-| EQ _ -> "="
-| NE _ -> "<>"
-| LT _ -> "<"
-| GT _ -> ">"
-| LE _ -> "<="
-| GE _ -> ">="
-| BOOL_OR _ -> "||"
-| BOOL_AND _ -> "&&"
-| Ident id -> id.Region.value
-| Constr id -> id.Region.value
+  (* Literals *)
+
+  String s -> String.escaped s.Region.value
+| Bytes b -> fst b.Region.value
 | Int i
 | Nat i
 | Mutez i -> fst i.Region.value
-| String s -> String.escaped s.Region.value
-| Bytes b -> fst b.Region.value
+| Ident id -> id.Region.value
+| Constr id -> id.Region.value
 | Attr a -> a.Region.value
+
+(* Symbols *)
+
+| ARROW _ -> "->"
+| CONS _ -> "::"
+| CAT _ -> "^"
+| MINUS _ -> "-"
+| PLUS _ -> "+"
+| SLASH _ -> "/"
+| TIMES _ -> "*"
+| LPAR _ -> "("
+| RPAR _ -> ")"
+| LBRACKET _ -> "["
+| RBRACKET _ -> "]"
+| LBRACE _ -> "{"
+| RBRACE _ -> "}"
+| COMMA _ -> ","
+| SEMI _ -> ";"
+| VBAR _ -> "|"
+| COLON _ -> ":"
+| DOT _ -> "."
+| WILD _ -> "_"
+| EQ _ -> "="
+| NE _ -> "<>"
+| LT _ -> "<"
+| GT _ -> ">"
+| LE _ -> "<="
+| GE _ -> ">="
+| BOOL_OR _ -> "||"
+| BOOL_AND _ -> "&&"
+
+(* Keywords *)
+
 | Begin _ -> "begin"
 | Else _ -> "else"
 | End _ -> "end"
 | False _ -> "false"
 | Fun _ -> "fun"
 | Rec _ -> "rec"
 | If _ -> "if"
 | In _ -> "in"
 | Let _ -> "let"
 | Match _ -> "match"
 | Mod _ -> "mod"
 | Not _ -> "not"
 | Of _ -> "of"
 | Or _ -> "or"
 | True _ -> "true"
 | Type _ -> "type"
 | Then _ -> "then"
 | With _ -> "with"
+
+(* Data constructors *)
+
 | C_None _ -> "None"
 | C_Some _ -> "Some"
+
+(* Virtual tokens *)
+
 | EOF _ -> ""
+
+(* CONVERSIONS *)

 let to_string token ?(offsets=true) mode =
   let region, val_str = proj_token token in
   let reg_str = region#compact ~offsets mode
@@ -241,10 +270,6 @@ let to_string token ?(offsets=true) mode =

 let to_region token = proj_token token |> fst

-(* Injections *)
-
-type int_err = Non_canonical_zero
-
 (* LEXIS *)

 let keywords = [
@@ -385,6 +410,8 @@ let mk_bytes lexeme region =
   let value = lexeme, `Hex norm
   in Bytes Region.{region; value}

+type int_err = Non_canonical_zero
+
 let mk_int lexeme region =
   let z =
     Str.(global_replace (regexp "_") "" lexeme) |> Z.of_string
@@ -398,23 +425,21 @@ type nat_err =
 let mk_nat lexeme region =
   match (String.index_opt lexeme 'n') with
-  | None -> Error Invalid_natural
-  | Some _ -> (
-      let z =
-        Str.(global_replace (regexp "_") "" lexeme) |>
-        Str.(global_replace (regexp "n") "") |>
-        Z.of_string in
-      if Z.equal z Z.zero && lexeme <> "0n"
-      then Error Non_canonical_zero_nat
-      else Ok (Nat Region.{region; value = lexeme,z})
-    )
+    None -> Error Invalid_natural
+  | Some _ -> let z =
+                Str.(global_replace (regexp "_") "" lexeme) |>
+                Str.(global_replace (regexp "n") "") |>
+                Z.of_string in
+              if Z.equal z Z.zero && lexeme <> "0n"
+              then Error Non_canonical_zero_nat
+              else Ok (Nat Region.{region; value = lexeme,z})

 let mk_mutez lexeme region =
   let z =
     Str.(global_replace (regexp "_") "" lexeme) |>
     Str.(global_replace (regexp "mutez") "") |>
     Z.of_string in
   if Z.equal z Z.zero && lexeme <> "0mutez"
   then Error Non_canonical_zero
   else Ok (Mutez Region.{region; value = lexeme, z})
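As a reading aid, here is how [mk_nat] above behaves on a few lexemes (a sketch; [region] stands for any source region):

    (* Hypothetical calls illustrating the code above: *)
    let _ = mk_nat "12n"    region  (* Ok (Nat {value = ("12n", 12); _})  *)
    let _ = mk_nat "1_000n" region  (* Ok: underscores are stripped first *)
    let _ = mk_nat "0n"     region  (* Ok: the canonical zero             *)
    let _ = mk_nat "00n"    region  (* Error Non_canonical_zero_nat       *)
    let _ = mk_nat "42"     region  (* Error Invalid_natural: no 'n'      *)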
@@ -422,8 +447,6 @@ let eof region = EOF region

 type sym_err = Invalid_symbol
-type attr_err = Invalid_attribute

 let mk_sym lexeme region =
   match lexeme with
     (* Lexemes in common with all concrete syntaxes *)
@@ -473,24 +496,27 @@ let mk_constr lexeme region =

 (* Attributes *)

+type attr_err = Invalid_attribute
+
 let mk_attr header lexeme region =
-  if header = "[@" then
-    Error Invalid_attribute
+  if header = "[@" then Error Invalid_attribute
   else Ok (Attr Region.{value=lexeme; region})

 (* Predicates *)

 let is_string = function String _ -> true | _ -> false
 let is_bytes = function Bytes _ -> true | _ -> false
 let is_int = function Int _ -> true | _ -> false
 let is_ident = function Ident _ -> true | _ -> false
 let is_eof = function EOF _ -> true | _ -> false
+let is_minus = function MINUS _ -> true | _ -> false

 (* Errors *)

 type error =
   Odd_lengthed_bytes
 | Missing_break
+| Negative_byte_sequence
@@ -499,6 +525,9 @@ let error_to_string = function
 | Missing_break ->
     "Missing break.\n\
      Hint: Insert some space."
+| Negative_byte_sequence ->
+    "Negative byte sequence.\n\
+     Hint: Remove the leading minus sign."

 exception Error of error Region.reg
@@ -511,29 +540,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file =
 let fail region value = raise (Error Region.{region; value})

 let check_right_context token next_token buffer : unit =
-  if not (is_eof token) then
-    if is_int token || is_bytes token then
-      match next_token buffer with
-        Some ([], next) ->
-          let pos    = (to_region token)#stop in
-          let region = Region.make ~start:pos ~stop:pos in
-          if is_int next then
-            fail region Odd_lengthed_bytes
-          else
-            if is_ident next || is_string next || is_bytes next then
-              fail region Missing_break
-      | Some (_::_, _) | None -> ()
-    else
-      if is_ident token || is_string token then
-        match next_token buffer with
-          Some ([], next) ->
-            if is_ident next || is_string next
-               || is_bytes next || is_int next
-            then
-              let pos    = (to_region token)#stop in
-              let region = Region.make ~start:pos ~stop:pos
-              in fail region Missing_break
-        | Some (_::_, _) | None -> ()
+  let pos    = (to_region token)#stop in
+  let region = Region.make ~start:pos ~stop:pos in
+  match next_token buffer with
+    None -> ()
+  | Some (markup, next) ->
+      if is_minus token && is_bytes next
+      then let region =
+             Region.cover (to_region token) (to_region next)
+           in fail region Negative_byte_sequence
+      else
+        match markup with
+          [] ->
+            if is_int token
+            then if is_string next || is_ident next
+                 then fail region Missing_break
+                 else ()
+            else
+            if is_string token
+            then if is_int next || is_bytes next || is_ident next
+                 then fail region Missing_break
+                 else ()
+            else
+            if is_bytes token
+            then if is_string next || is_ident next
+                 then fail region Missing_break
+                 else if is_int next
+                      then fail region Odd_lengthed_bytes
+                      else ()
+            else ()
+        | _::_ -> ()

 (* END TRAILER *)
 }
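The rewritten [check_right_context] replaces the nested matches of the old version with a single one-unit lookahead: it peeks at the next lexical unit (the markup preceding a token, plus the token itself), flags a minus sign immediately followed by a bytes literal as a negative byte sequence, and applies the remaining style checks only when no markup separates the two tokens. A self-contained sketch of the resulting decision table, over a simplified token type (hypothetical; the real code works on LexToken.t values and the lexer's markup queue):

    type tok = Int | Bytes | String | Ident | Minus | Other

    type verdict =
      Pass | Missing_break | Odd_lengthed_bytes | Negative_byte_sequence

    (* [markup_between] is true when spaces, tabs or comments separate
       the two tokens in the source. *)
    let check (token : tok) (markup_between : bool) (next : tok) : verdict =
      if token = Minus && next = Bytes then Negative_byte_sequence
      else if markup_between then Pass
      else match token, next with
             Int,    (String | Ident)      -> Missing_break
           | String, (Int | Bytes | Ident) -> Missing_break
           | Bytes,  (String | Ident)      -> Missing_break
           | Bytes,  Int                   -> Odd_lengthed_bytes
           | _                             -> Pass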

View File

@@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile
 ../shared/Lexer.mli
 ../shared/Lexer.mll
+../shared/LexerLib.mli
 ../shared/LexerLib.ml
 ../shared/EvalOpt.ml
 ../shared/EvalOpt.mli

View File

@@ -1,4 +1,4 @@
-(** This signature defines the lexical tokens for LIGO
+(* This signature defines the lexical tokens for LIGO

    _Tokens_ are the abstract units which are used by the parser to
    build the abstract syntax tree (AST), in other words, the stream of

View File

@@ -5,24 +5,14 @@

 (* Shorthands *)

-type lexeme = string
-let sprintf = Printf.sprintf
-
 module Region = Simple_utils.Region
 module Pos = Simple_utils.Pos
 module SMap = Map.Make (String)
 module SSet = Set.Make (String)

-(* Hack to roll back one lexeme in the current semantic action *)
-(*
-let rollback buffer =
-  let open Lexing in
-  let len = String.length (lexeme buffer) in
-  let pos_cnum = buffer.lex_curr_p.pos_cnum - len in
-  buffer.lex_curr_pos <- buffer.lex_curr_pos - len;
-  buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum}
-*)
+type lexeme = string
+
+let sprintf = Printf.sprintf

 (* TOKENS *)
@@ -123,6 +113,8 @@ type t =

 | EOF of Region.t

+(* Projections *)
+
 type token = t

 let proj_token = function
@@ -130,32 +122,20 @@ let proj_token = function
   String Region.{region; value} ->
     region, sprintf "String %s" value
 | Bytes Region.{region; value = s,b} ->
     region,
-    sprintf "Bytes (\"%s\", \"0x%s\")"
-            s (Hex.show b)
+    sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b)
 | Int Region.{region; value = s,n} ->
     region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
 | Nat Region.{region; value = s,n} ->
     region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
 | Mutez Region.{region; value = s,n} ->
     region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
 | Ident Region.{region; value} ->
     region, sprintf "Ident \"%s\"" value
 | Constr Region.{region; value} ->
     region, sprintf "Constr \"%s\"" value
-(*
-| Attr {header; string={region; value}} ->
-    region, sprintf "Attr (\"%s\",\"%s\")" header value
-*)

 (* Symbols *)

 | SEMI region -> region, "SEMI"
@@ -244,7 +224,7 @@ let to_lexeme = function
 | Bytes b -> fst b.Region.value
 | Int i
 | Nat i
 | Mutez i -> fst i.Region.value
 | Ident id
 | Constr id -> id.Region.value
@@ -382,9 +362,7 @@ let keywords = [
   (fun reg -> With reg)
 ]

-let reserved =
-  let open SSet in
-  empty |> add "arguments"
+let reserved = SSet.empty

 let constructors = [
   (fun reg -> False reg);
@@ -484,22 +462,20 @@ type nat_err =
 let mk_nat lexeme region =
   match String.index_opt lexeme 'n' with
     None -> Error Invalid_natural
-  | Some _ ->
-      let z =
-        Str.(global_replace (regexp "_") "" lexeme) |>
-        Str.(global_replace (regexp "n") "") |>
-        Z.of_string in
-      if Z.equal z Z.zero && lexeme <> "0n"
-      then Error Non_canonical_zero_nat
-      else Ok (Nat Region.{region; value = lexeme,z})
+  | Some _ -> let z =
+                Str.(global_replace (regexp "_") "" lexeme) |>
+                Str.(global_replace (regexp "n") "") |>
+                Z.of_string in
+              if Z.equal z Z.zero && lexeme <> "0n"
+              then Error Non_canonical_zero_nat
+              else Ok (Nat Region.{region; value = lexeme,z})

 let mk_mutez lexeme region =
-  let z =
-    Str.(global_replace (regexp "_") "" lexeme) |>
-    Str.(global_replace (regexp "mutez") "") |>
-    Z.of_string in
+  let z = Str.(global_replace (regexp "_") "" lexeme) |>
+          Str.(global_replace (regexp "mutez") "") |>
+          Z.of_string in
   if Z.equal z Z.zero && lexeme <> "0mutez"
   then Error Non_canonical_zero
   else Ok (Mutez Region.{region; value = lexeme, z})
@@ -557,22 +533,23 @@ let mk_constr lexeme region =

 type attr_err = Invalid_attribute

-let mk_attr _header _string _region =
-  Error Invalid_attribute
+let mk_attr _ _ _ = Error Invalid_attribute

 (* Predicates *)

 let is_string = function String _ -> true | _ -> false
 let is_bytes = function Bytes _ -> true | _ -> false
 let is_int = function Int _ -> true | _ -> false
 let is_ident = function Ident _ -> true | _ -> false
 let is_eof = function EOF _ -> true | _ -> false
+let is_minus = function MINUS _ -> true | _ -> false

 (* Errors *)

 type error =
   Odd_lengthed_bytes
 | Missing_break
+| Negative_byte_sequence

 let error_to_string = function
   Odd_lengthed_bytes ->
@@ -581,6 +558,9 @@ let error_to_string = function
 | Missing_break ->
     "Missing break.\n\
      Hint: Insert some space."
+| Negative_byte_sequence ->
+    "Negative byte sequence.\n\
+     Hint: Remove the leading minus sign."

 exception Error of error Region.reg
@@ -593,29 +573,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file =
 let fail region value = raise (Error Region.{region; value})

 let check_right_context token next_token buffer : unit =
-  if not (is_eof token) then
-    if is_int token || is_bytes token then
-      match next_token buffer with
-        Some ([], next) ->
-          let pos    = (to_region token)#stop in
-          let region = Region.make ~start:pos ~stop:pos in
-          if is_int next then
-            fail region Odd_lengthed_bytes
-          else
-            if is_ident next || is_string next || is_bytes next then
-              fail region Missing_break
-      | Some (_::_, _) | None -> ()
-    else
-      if is_ident token || is_string token then
-        match next_token buffer with
-          Some ([], next) ->
-            if is_ident next || is_string next
-               || is_bytes next || is_int next
-            then
-              let pos    = (to_region token)#stop in
-              let region = Region.make ~start:pos ~stop:pos
-              in fail region Missing_break
-        | Some (_::_, _) | None -> ()
+  let pos    = (to_region token)#stop in
+  let region = Region.make ~start:pos ~stop:pos in
+  match next_token buffer with
+    None -> ()
+  | Some (markup, next) ->
+      if is_minus token && is_bytes next
+      then let region =
+             Region.cover (to_region token) (to_region next)
+           in fail region Negative_byte_sequence
+      else
+        match markup with
+          [] ->
+            if is_int token
+            then if is_string next || is_ident next
+                 then fail region Missing_break
+                 else ()
+            else
+            if is_string token
+            then if is_int next || is_bytes next || is_ident next
+                 then fail region Missing_break
+                 else ()
+            else
+            if is_bytes token
+            then if is_string next || is_ident next
+                 then fail region Missing_break
+                 else if is_int next
+                      then fail region Odd_lengthed_bytes
+                      else ()
+            else ()
+        | _::_ -> ()

 (* END TRAILER *)
 }

View File

@@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile
 ../shared/Lexer.mli
 ../shared/Lexer.mll
+../shared/LexerLib.ml
 ../shared/EvalOpt.ml
 ../shared/EvalOpt.mli
 ../shared/FQueue.ml

View File

@@ -31,48 +31,49 @@ type lexeme = string
 type t =
   (* Symbols *)

   CAT of Region.t (* "++" *)

   (* Arithmetics *)

 | MINUS of Region.t (* "-" *)
 | PLUS of Region.t (* "+" *)
 | SLASH of Region.t (* "/" *)
 | TIMES of Region.t (* "*" *)

   (* Compounds *)

 | LPAR of Region.t (* "(" *)
 | RPAR of Region.t (* ")" *)
 | LBRACKET of Region.t (* "[" *)
 | RBRACKET of Region.t (* "]" *)
 | LBRACE of Region.t (* "{" *)
 | RBRACE of Region.t (* "}" *)

   (* Separators *)

 | COMMA of Region.t (* "," *)
 | SEMI of Region.t (* ";" *)
 | VBAR of Region.t (* "|" *)
 | COLON of Region.t (* ":" *)
 | DOT of Region.t (* "." *)
 | ELLIPSIS of Region.t (* "..." *)
+| ARROW of Region.t (* "=>" *)

   (* Wildcard *)

 | WILD of Region.t (* "_" *)

   (* Comparisons *)

 | EQ of Region.t (* "=" *)
 | EQEQ of Region.t (* "==" *)
 | NE of Region.t (* "!=" *)
 | LT of Region.t (* "<" *)
 | GT of Region.t (* ">" *)
-| LE of Region.t (* "=<" *)
+| LE of Region.t (* "<=" *)
 | GE of Region.t (* ">=" *)
-| ARROW of Region.t (* "=>" *)

+  (* Logic *)
+
 | BOOL_OR of Region.t (* "||" *)
 | BOOL_AND of Region.t (* "&&" *)
@@ -91,18 +92,18 @@ type t =
   (* Keywords *)

 | Else of Region.t
 | False of Region.t
 | If of Region.t
 | Let of Region.t
+| Mod of Region.t
+| Or of Region.t
 | Rec of Region.t
 | Switch of Region.t
-| Mod of Region.t
-| Or of Region.t
 | True of Region.t
 | Type of Region.t

   (* Data constructors *)

 | C_None of Region.t (* "None" *)
 | C_Some of Region.t (* "Some" *)
@@ -111,8 +112,6 @@ type t =

 | EOF of Region.t (* End of file *)

-type token = t
-
 (* Projections

    The difference between extracting the lexeme and a string from a
@@ -121,6 +120,8 @@ type token = t
    lexeme (concrete syntax).
 *)

+type token = t
+
 val to_lexeme : token -> lexeme
 val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string
 val to_region : token -> Region.t

View File

@@ -1,66 +1,69 @@
 {
 (* START OF HEADER *)

-type lexeme = string
-let sprintf = Printf.sprintf
+(* Shorthands *)

 module Region = Simple_utils.Region
 module Pos = Simple_utils.Pos
 module SMap = Utils.String.Map
 module SSet = Utils.String.Set

+type lexeme = string
+
+let sprintf = Printf.sprintf
+
 (* TOKENS *)

 type t =
   (* Symbols *)

   CAT of Region.t (* "++" *)

   (* Arithmetics *)

 | MINUS of Region.t (* "-" *)
 | PLUS of Region.t (* "+" *)
 | SLASH of Region.t (* "/" *)
 | TIMES of Region.t (* "*" *)

   (* Compounds *)

 | LPAR of Region.t (* "(" *)
 | RPAR of Region.t (* ")" *)
 | LBRACKET of Region.t (* "[" *)
 | RBRACKET of Region.t (* "]" *)
 | LBRACE of Region.t (* "{" *)
 | RBRACE of Region.t (* "}" *)

   (* Separators *)

 | COMMA of Region.t (* "," *)
 | SEMI of Region.t (* ";" *)
 | VBAR of Region.t (* "|" *)
 | COLON of Region.t (* ":" *)
 | DOT of Region.t (* "." *)
 | ELLIPSIS of Region.t (* "..." *)
+| ARROW of Region.t (* "=>" *)

   (* Wildcard *)

 | WILD of Region.t (* "_" *)

   (* Comparisons *)

 | EQ of Region.t (* "=" *)
 | EQEQ of Region.t (* "==" *)
 | NE of Region.t (* "!=" *)
 | LT of Region.t (* "<" *)
 | GT of Region.t (* ">" *)
 | LE of Region.t (* "<=" *)
 | GE of Region.t (* ">=" *)
-| ARROW of Region.t (* "=>" *)

-| BOOL_OR of Region.t (* "||" *)
-| BOOL_AND of Region.t (* "&&" *)
-| NOT of Region.t (* ! *)
+  (* Logic *)
+
+| BOOL_OR of Region.t (* "||" *)
+| BOOL_AND of Region.t (* "&&" *)
+| NOT of Region.t (* ! *)

 (* Identifiers, labels, numbers and strings *)
@@ -75,17 +78,17 @@ type t =
   (* Keywords *)

-(*| And*)
 | Else of Region.t
 | False of Region.t
 | If of Region.t
 | Let of Region.t
+| Mod of Region.t
+| Or of Region.t
 | Rec of Region.t
 | Switch of Region.t
-| Mod of Region.t
-| Or of Region.t
 | True of Region.t
 | Type of Region.t

   (* Data constructors *)

 | C_None of Region.t (* "None" *)
@@ -96,121 +99,143 @@ type t =

 | EOF of Region.t (* End of file *)

+(* Projections *)
+
 type token = t

 let proj_token = function
-  CAT region -> region, "CAT"
-| MINUS region -> region, "MINUS"
-| PLUS region -> region, "PLUS"
-| SLASH region -> region, "SLASH"
-| TIMES region -> region, "TIMES"
-| LPAR region -> region, "LPAR"
-| RPAR region -> region, "RPAR"
-| LBRACKET region -> region, "LBRACKET"
-| RBRACKET region -> region, "RBRACKET"
-| LBRACE region -> region, "LBRACE"
-| RBRACE region -> region, "RBRACE"
-| COMMA region -> region, "COMMA"
-| SEMI region -> region, "SEMI"
-| VBAR region -> region, "VBAR"
-| COLON region -> region, "COLON"
-| DOT region -> region, "DOT"
-| ELLIPSIS region -> region, "ELLIPSIS"
-| WILD region -> region, "WILD"
-| EQ region -> region, "EQ"
-| EQEQ region -> region, "EQEQ"
-| NE region -> region, "NE"
-| LT region -> region, "LT"
-| GT region -> region, "GT"
-| LE region -> region, "LE"
-| GE region -> region, "GE"
-| ARROW region -> region, "ARROW"
-| BOOL_OR region -> region, "BOOL_OR"
-| BOOL_AND region -> region, "BOOL_AND"
-| Ident Region.{region; value} ->
-    region, sprintf "Ident %s" value
-| Constr Region.{region; value} ->
-    region, sprintf "Constr %s" value
+  (* Literals *)
+
+  String Region.{region; value} ->
+    region, sprintf "String %s" value
+| Bytes Region.{region; value = s,b} ->
+    region,
+    sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b)
 | Int Region.{region; value = s,n} ->
     region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
 | Nat Region.{region; value = s,n} ->
     region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
 | Mutez Region.{region; value = s,n} ->
     region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
-| String Region.{region; value} ->
-    region, sprintf "String %s" value
-| Bytes Region.{region; value = s,b} ->
-    region,
-    sprintf "Bytes (\"%s\", \"0x%s\")"
-            s (Hex.show b)
-| Else region -> region, "Else"
-| False region -> region, "False"
-| If region -> region, "If"
-| Let region -> region, "Let"
-| Rec region -> region, "Rec"
-| Switch region -> region, "Switch"
-| Mod region -> region, "Mod"
-| NOT region -> region, "!"
-| Or region -> region, "Or"
-| True region -> region, "True"
-| Type region -> region, "Type"
-| C_None region -> region, "C_None"
-| C_Some region -> region, "C_Some"
-| Attr Region.{region; value} -> region, sprintf "Attr %s" value
-| EOF region -> region, "EOF"
+| Ident Region.{region; value} ->
+    region, sprintf "Ident %s" value
+| Constr Region.{region; value} ->
+    region, sprintf "Constr %s" value
+
+  (* Symbols *)
+
+| CAT region -> region, "CAT"
+| MINUS region -> region, "MINUS"
+| PLUS region -> region, "PLUS"
+| SLASH region -> region, "SLASH"
+| TIMES region -> region, "TIMES"
+| LPAR region -> region, "LPAR"
+| RPAR region -> region, "RPAR"
+| LBRACKET region -> region, "LBRACKET"
+| RBRACKET region -> region, "RBRACKET"
+| LBRACE region -> region, "LBRACE"
+| RBRACE region -> region, "RBRACE"
+| COMMA region -> region, "COMMA"
+| SEMI region -> region, "SEMI"
+| VBAR region -> region, "VBAR"
+| COLON region -> region, "COLON"
+| DOT region -> region, "DOT"
+| ELLIPSIS region -> region, "ELLIPSIS"
+| WILD region -> region, "WILD"
+| EQ region -> region, "EQ"
+| EQEQ region -> region, "EQEQ"
+| NE region -> region, "NE"
+| LT region -> region, "LT"
+| GT region -> region, "GT"
+| LE region -> region, "LE"
+| GE region -> region, "GE"
+| ARROW region -> region, "ARROW"
+| NOT region -> region, "NOT"
+| BOOL_OR region -> region, "BOOL_OR"
+| BOOL_AND region -> region, "BOOL_AND"
+| Else region -> region, "Else"
+| False region -> region, "False"
+| If region -> region, "If"
+| Let region -> region, "Let"
+| Rec region -> region, "Rec"
+| Switch region -> region, "Switch"
+| Mod region -> region, "Mod"
+| Or region -> region, "Or"
+| True region -> region, "True"
+| Type region -> region, "Type"
+| C_None region -> region, "C_None"
+| C_Some region -> region, "C_Some"
+| Attr Region.{region; value} -> region, sprintf "Attr %s" value
+| EOF region -> region, "EOF"

 let to_lexeme = function
-  CAT _ -> "++"
-| MINUS _ -> "-"
-| PLUS _ -> "+"
-| SLASH _ -> "/"
-| TIMES _ -> "*"
-| LPAR _ -> "("
-| RPAR _ -> ")"
-| LBRACKET _ -> "["
-| RBRACKET _ -> "]"
-| LBRACE _ -> "{"
-| RBRACE _ -> "}"
-| COMMA _ -> ","
-| SEMI _ -> ";"
-| VBAR _ -> "|"
-| COLON _ -> ":"
-| DOT _ -> "."
-| ELLIPSIS _ -> "..."
-| WILD _ -> "_"
-| EQ _ -> "="
-| EQEQ _ -> "=="
-| NE _ -> "!="
-| LT _ -> "<"
-| GT _ -> ">"
-| LE _ -> "<="
-| GE _ -> ">="
-| ARROW _ -> "=>"
-| BOOL_OR _ -> "||"
-| BOOL_AND _ -> "&&"
-| Ident id -> id.Region.value
-| Constr id -> id.Region.value
+  (* Literals *)
+
+  String s -> s.Region.value
+| Bytes b -> fst b.Region.value
 | Int i
 | Nat i
 | Mutez i -> fst i.Region.value
-| String s -> s.Region.value
-| Bytes b -> fst b.Region.value
-| Else _ -> "else"
-| False _ -> "false"
-| If _ -> "if"
-| Let _ -> "let"
-| Rec _ -> "rec"
-| Mod _ -> "mod"
-| NOT _ -> "!"
-| Or _ -> "or"
-| Switch _ -> "switch"
-| True _ -> "true"
-| Type _ -> "type"
+| Ident id -> id.Region.value
+| Constr id -> id.Region.value
+| Attr a -> a.Region.value
+
+(* Symbols *)
+
+| CAT _ -> "++"
+| MINUS _ -> "-"
+| PLUS _ -> "+"
+| SLASH _ -> "/"
+| TIMES _ -> "*"
+| LPAR _ -> "("
+| RPAR _ -> ")"
+| LBRACKET _ -> "["
+| RBRACKET _ -> "]"
+| LBRACE _ -> "{"
+| RBRACE _ -> "}"
+| COMMA _ -> ","
+| SEMI _ -> ";"
+| VBAR _ -> "|"
+| COLON _ -> ":"
+| DOT _ -> "."
+| ELLIPSIS _ -> "..."
+| WILD _ -> "_"
+| EQ _ -> "="
+| EQEQ _ -> "=="
+| NE _ -> "!="
+| LT _ -> "<"
+| GT _ -> ">"
+| LE _ -> "<="
+| GE _ -> ">="
+| ARROW _ -> "=>"
+| BOOL_OR _ -> "||"
+| BOOL_AND _ -> "&&"
+| NOT _ -> "!"
+
+(* Keywords *)
+
+| Else _ -> "else"
+| False _ -> "false"
+| If _ -> "if"
+| Let _ -> "let"
+| Mod _ -> "mod"
+| Or _ -> "or"
+| Rec _ -> "rec"
+| Switch _ -> "switch"
+| True _ -> "true"
+| Type _ -> "type"
+
+(* Data constructors *)
+
 | C_None _ -> "None"
 | C_Some _ -> "Some"
-| Attr a -> a.Region.value
+
+(* Virtual tokens *)
+
 | EOF _ -> ""
+
+(* CONVERSIONS *)

 let to_string token ?(offsets=true) mode =
   let region, val_str = proj_token token in
   let reg_str = region#compact ~offsets mode
@@ -261,12 +286,9 @@ let reserved =
   |> add "functor"
   |> add "inherit"
   |> add "initializer"
-  (* |> add "land" - see https://ligo.atlassian.net/browse/LIGO-263 *)
   |> add "lazy"
-  (* |> add "lor" - see https://ligo.atlassian.net/browse/LIGO-263 *)
   |> add "lsl"
   |> add "lsr"
-  (* |> add "lxor" - see https://ligo.atlassian.net/browse/LIGO-263 *)
   |> add "match"
   |> add "method"
   |> add "module"
@@ -291,7 +313,7 @@ let reserved =

 let constructors = [
   (fun reg -> C_None reg);
-  (fun reg -> C_Some reg);
+  (fun reg -> C_Some reg)
 ]

 let add map (key, value) = SMap.add key value map
@@ -376,20 +398,18 @@ let mk_int lexeme region =
   else Ok (Int Region.{region; value = lexeme, z})

 let mk_nat lexeme region =
-  let z =
-    Str.(global_replace (regexp "_") "" lexeme) |>
-    Str.(global_replace (regexp "n") "") |>
-    Z.of_string in
+  let z = Str.(global_replace (regexp "_") "" lexeme) |>
+          Str.(global_replace (regexp "n") "") |>
+          Z.of_string in
   if Z.equal z Z.zero && lexeme <> "0n"
   then Error Non_canonical_zero_nat
   else Ok (Nat Region.{region; value = lexeme, z})

 let mk_mutez lexeme region =
-  let z =
-    Str.(global_replace (regexp "_") "" lexeme) |>
-    Str.(global_replace (regexp "mutez") "") |>
-    Z.of_string in
+  let z = Str.(global_replace (regexp "_") "" lexeme) |>
+          Str.(global_replace (regexp "mutez") "") |>
+          Z.of_string in
   if Z.equal z Z.zero && lexeme <> "0mutez"
   then Error Non_canonical_zero
   else Ok (Mutez Region.{region; value = lexeme, z})
@@ -426,11 +446,11 @@ let mk_sym lexeme region =

   (* Symbols specific to ReasonLIGO *)

 | "..." -> Ok (ELLIPSIS region)
 | "=>" -> Ok (ARROW region)
 | "==" -> Ok (EQEQ region)
 | "!" -> Ok (NOT region)
 | "++" -> Ok (CAT region)

   (* Invalid symbols *)
@@ -461,16 +481,18 @@ let mk_attr header lexeme region =

 (* Predicates *)

 let is_string = function String _ -> true | _ -> false
 let is_bytes = function Bytes _ -> true | _ -> false
 let is_int = function Int _ -> true | _ -> false
 let is_ident = function Ident _ -> true | _ -> false
 let is_eof = function EOF _ -> true | _ -> false
+let is_minus = function MINUS _ -> true | _ -> false

 (* Errors *)

 type error =
   Odd_lengthed_bytes
 | Missing_break
+| Negative_byte_sequence
@@ -479,6 +501,9 @@ let error_to_string = function
 | Missing_break ->
     "Missing break.\n\
      Hint: Insert some space."
+| Negative_byte_sequence ->
+    "Negative byte sequence.\n\
+     Hint: Remove the leading minus sign."

 exception Error of error Region.reg
@@ -491,29 +516,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file =
 let fail region value = raise (Error Region.{region; value})

 let check_right_context token next_token buffer : unit =
-  if not (is_eof token) then
-    if is_int token || is_bytes token then
-      match next_token buffer with
-        Some ([], next) ->
-          let pos    = (to_region token)#stop in
-          let region = Region.make ~start:pos ~stop:pos in
-          if is_int next then
-            fail region Odd_lengthed_bytes
-          else
-            if is_ident next || is_string next || is_bytes next then
-              fail region Missing_break
-      | Some (_::_, _) | None -> ()
-    else
-      if is_ident token || is_string token then
-        match next_token buffer with
-          Some ([], next) ->
-            if is_ident next || is_string next
-               || is_bytes next || is_int next
-            then
-              let pos    = (to_region token)#stop in
-              let region = Region.make ~start:pos ~stop:pos
-              in fail region Missing_break
-        | Some (_::_, _) | None -> ()
+  let pos    = (to_region token)#stop in
+  let region = Region.make ~start:pos ~stop:pos in
+  match next_token buffer with
+    None -> ()
+  | Some (markup, next) ->
+      if is_minus token && is_bytes next
+      then let region =
+             Region.cover (to_region token) (to_region next)
+           in fail region Negative_byte_sequence
+      else
+        match markup with
+          [] ->
+            if is_int token
+            then if is_string next || is_ident next
+                 then fail region Missing_break
+                 else ()
+            else
+            if is_string token
+            then if is_int next || is_bytes next || is_ident next
+                 then fail region Missing_break
+                 else ()
+            else
+            if is_bytes token
+            then if is_string next || is_ident next
+                 then fail region Missing_break
+                 else if is_int next
+                      then fail region Odd_lengthed_bytes
+                      else ()
+            else ()
+        | _::_ -> ()

 (* END TRAILER *)
 }
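For orientation, the projections reorganised above produce the debug strings consumed by [to_string]; a small sketch of one call (using [Region.ghost], the placeholder region of Simple_utils, as an assumption about the test context):

    let _ =
      let forty_two = Z.of_int 42 in
      proj_token (Int Region.{region = Region.ghost; value = ("42", forty_two)})
      (* = (Region.ghost, "Int (\"42\", 42)") *)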

View File

@@ -1,2 +0,0 @@
-$HOME/git/OCaml-build/Makefile
-$HOME/git/OCaml-build/Makefile.cfg

View File

@@ -38,8 +38,6 @@
 module Region = Simple_utils.Region
 module Pos = Simple_utils.Pos

-type lexeme = string
-
 (* TOKENS *)

 (* The signature [TOKEN] exports an abstract type [token], so a lexer
@@ -54,6 +52,8 @@ type lexeme = string
    reading the ocamllex specification for the lexer ([Lexer.mll]).
 *)

+type lexeme = string
+
 module type TOKEN =
   sig
     type token
@@ -112,10 +112,36 @@ module type TOKEN =
       unit
   end

+(* The signature of the lexer *)
+
+module type S =
+  sig
+    module Token : TOKEN
+    type token = Token.token
+
+    (* The scanner [init] is meant to be called first to read the
+       BOM. Then [scan] is called. *)
+
+    val init : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
+    val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
+
+    (* Errors (specific to the generic lexer, not to the tokens) *)
+
+    type error
+
+    val error_to_string : error -> string
+
+    exception Error of error Region.reg
+
+    val format_error :
+      ?offsets:bool -> [`Byte | `Point] ->
+      error Region.reg -> file:bool -> string Region.reg
+  end
+
 (* The functorised interface

    Note that the module parameter [Token] is re-exported as a
    submodule in [S].
 *)

-module Make (Token: TOKEN) : LexerLib.S with module Token = Token
+module Make (Token : TOKEN) : S with module Token = Token
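After this change, the lexer proper is a functor over [TOKEN] exposing only [init] and [scan] over a [LexerLib.state]; buffering, input handling and the sliding window move to LexerLib. A hedged sketch of a client, where everything except [Make], [init], [scan] and the [LexerLib.state] type is an assumption:

    (* Hypothetical client: scan one lexical unit after reading the BOM. *)
    module MyLexer = Lexer.Make (LexToken)

    let scan_one (state : LexToken.t LexerLib.state) lexbuf =
      let state = MyLexer.init state lexbuf in  (* reads a possible BOM *)
      MyLexer.scan state lexbuf                 (* scans the next unit  *)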

View File

@@ -8,12 +8,6 @@ module Pos = Simple_utils.Pos

 (* START HEADER *)

-type lexeme = string
-
-(* ALIASES *)
-
-let sprintf = Printf.sprintf
-
 (* TOKENS *)

 (* The signature [TOKEN] exports an abstract type [token], so a lexer
@@ -22,6 +16,8 @@ let sprintf = Printf.sprintf
    caracterises the virtual token for end-of-file, because it requires
    special handling. *)

+type lexeme = string
+
 module type TOKEN =
   sig
     type token
@@ -84,31 +80,39 @@ module type TOKEN =
    submodule in [S].
 *)

-module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) =
+module type S =
+  sig
+    module Token : TOKEN
+    type token = Token.token
+
+    val init : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
+    val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
+
+    type error
+
+    val error_to_string : error -> string
+
+    exception Error of error Region.reg
+
+    val format_error :
+      ?offsets:bool -> [`Byte | `Point] ->
+      error Region.reg -> file:bool -> string Region.reg
+  end
+
+module Make (Token : TOKEN) : (S with module Token = Token) =
   struct
     module Token = Token
     type token = Token.token

+    type file_path = string
+    type line_comment = LexerLib.line_comment
+    type block_comment = LexerLib.block_comment
+    let mk_block = LexerLib.mk_block
+
     (* ERRORS *)

     type error =
       Invalid_utf8_sequence
     | Unexpected_character of char
     | Undefined_escape_sequence
-    (* | Missing_break*)
     | Unterminated_string
-    (* | Unterminated_integer*)
-    (* | Odd_lengthed_bytes*)
     | Unterminated_comment of string
-    (* | Orphan_minus*)
     | Non_canonical_zero
-    (* | Negative_byte_sequence *)
     | Broken_string
     | Invalid_character_in_string
     | Reserved_name of string
@@ -116,6 +120,8 @@ module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) =
     | Invalid_natural
     | Invalid_attribute

+    let sprintf = Printf.sprintf
+
     let error_to_string = function
       Invalid_utf8_sequence ->
        "Invalid UTF-8 sequence."
@@ -124,30 +130,15 @@ module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) =
     | Undefined_escape_sequence ->
        "Undefined escape sequence.\n\
         Hint: Remove or replace the sequence."
-    (* | Missing_break ->
-       "Missing break.\n\
-        Hint: Insert some space."
-    *)
     | Unterminated_string ->
        "Unterminated string.\n\
         Hint: Close with double quotes."
-    (* | Unterminated_integer ->
-       "Unterminated integer.\n\
-        Hint: Remove the sign or proceed with a natural number." *)
-    (* | Odd_lengthed_bytes ->
-       "The length of the byte sequence is an odd number.\n\
-        Hint: Add or remove a digit."
-    *)
     | Unterminated_comment ending ->
        sprintf "Unterminated comment.\n\
                 Hint: Close with \"%s\"." ending
-    (* | Orphan_minus ->
-       "Orphan minus sign.\n\
-        Hint: Remove the trailing space." *)
     | Non_canonical_zero ->
        "Non-canonical zero.\n\
         Hint: Use 0."
-    (* | Negative_byte_sequence ->
-       "Negative byte sequence.\n\
-        Hint: Remove the leading minus sign." *)
     | Broken_string ->
        "The string starting here is interrupted by a line break.\n\
         Hint: Remove the break, close the string before or insert a \
@@ -319,13 +310,13 @@ let symbol = common_sym | pascaligo_sym | cameligo_sym | reasonligo_sym

 (* Comments *)

 let pascaligo_block_comment_opening = "(*"
 let pascaligo_block_comment_closing = "*)"
 let pascaligo_line_comment = "//"

 let cameligo_block_comment_opening = "(*"
 let cameligo_block_comment_closing = "*)"
 let cameligo_line_comment = "//"

 let reasonligo_block_comment_opening = "/*"
 let reasonligo_block_comment_closing = "*/"
@@ -369,6 +360,7 @@ and scan state = parse
   nl { scan (state#push_newline lexbuf) lexbuf }
 | ' '+ { scan (state#push_space lexbuf) lexbuf }
 | '\t'+ { scan (state#push_tabs lexbuf) lexbuf }

 | ident { mk_ident state lexbuf }
 | constr { mk_constr state lexbuf }
 | bytes { mk_bytes seq state lexbuf }
@@ -420,27 +412,14 @@ and scan state = parse
                  let state = state#set_pos pos in
                  scan state lexbuf }

-(* Some special errors *)
-
-(*
-| '-' { let region, _, state = state#sync lexbuf in
-        let state = scan state lexbuf in
-        let open Markup in
-        match FQueue.peek state#units with
-          None -> assert false
-        | Some (_, ((Space _ | Tabs _)::_, token))
-            when Token.is_int token -> fail region Orphan_minus
-        | _ -> fail region Unterminated_integer }
-
-| "-0x" byte_seq?
-      { let region, _, _ = state#sync lexbuf
-        in fail region Negative_byte_sequence }
-*)
+(* String *)

 | '"' { let opening, lexeme, state = state#sync lexbuf in
         let thread = LexerLib.mk_thread opening lexeme in
         scan_string thread state lexbuf |> mk_string }

-(* Comments *)
-
 | block_comment_openings {
     let lexeme = Lexing.lexeme lexbuf in
     match state#block with
@@ -496,8 +475,6 @@ and scan_flags state acc = parse
 | eof { let _, _, state = state#sync lexbuf
         in List.rev acc, state }

-(* TODO: Move below to [LexerCommon.mll] *)
-
 (* Finishing a string *)

 and scan_string thread state = parse
@ -624,164 +601,6 @@ and scan_utf8_inline thread state = parse
{ {
(* START TRAILER *) (* START TRAILER *)
(* Scanning the lexing buffer for tokens (and markup, as a
side-effect).
Because we want the lexer to have access to the right lexical
context of a recognised lexeme (to enforce stylistic constraints or
report special error patterns), we need to keep a hidden reference
to a queue of recognised lexical units (that is, tokens and markup)
that acts as a mutable state between the calls to [read]. When
[read] is called, that queue is examined first and, if it contains
at least one token, that token is returned; otherwise, the lexing
buffer is scanned for at least one more new token. That is the
general principle: we put a high-level buffer (our queue) on top of
the low-level lexing buffer.
One tricky and important detail is that we must make any parser
generated by Menhir (and calling [read]) believe that the last
region of the input source that was matched indeed corresponds to
the returned token, despite that many tokens and markup may have
been matched since it was actually read from the input. In other
words, the parser requests a token that is taken from the
high-level buffer, but the parser requests the source regions from
the _low-level_ lexing buffer, and they may disagree if more than
one token has actually been recognised.
Consequently, in order to maintain a consistent view for the
parser, we have to patch some fields of the lexing buffer, namely
[lex_start_p] and [lex_curr_p], as these fields are read by parsers
generated by Menhir when querying source positions (regions). This
is the purpose of the function [patch_buffer]. After reading one
or more tokens and markup by the scanning rule [scan], we have to
save in the hidden reference [buf_reg] the region of the source
that was matched by [scan]. This atomic sequence of patching,
scanning and saving is implemented by the _function_ [scan]
(beware: it shadows the scanning rule [scan]). The function
[patch_buffer] is, of course, also called just before returning the
token, so the parser has a view of the lexing buffer consistent
with the token.
Note that an additional reference [first_call] is needed to
distinguish the first call to the function [scan], as the first
scanning rule is actually [init] (which can handle the BOM), not
[scan].
*)
type logger = Markup.t list -> token -> unit
type input =
File of file_path
| String of string
| Channel of in_channel
| Buffer of Lexing.lexbuf
type instance = {
input : input;
read : log:logger -> Lexing.lexbuf -> token;
buffer : Lexing.lexbuf;
get_win : unit -> token LexerLib.window;
get_pos : unit -> Pos.t;
get_last : unit -> Region.t;
get_file : unit -> file_path;
close : unit -> unit
}
type open_err = File_opening of string
let lexbuf_from_input = function
File path ->
(try
let chan = open_in path in
let close () = close_in chan in
let lexbuf = Lexing.from_channel chan in
let () =
let open Lexing in
lexbuf.lex_curr_p <- {lexbuf.lex_curr_p with pos_fname=path}
in Ok (lexbuf, close)
with Sys_error msg -> Stdlib.Error (File_opening msg))
| String s ->
Ok (Lexing.from_string s, fun () -> ())
| Channel chan ->
let close () = close_in chan in
Ok (Lexing.from_channel chan, close)
| Buffer b -> Ok (b, fun () -> ())
let open_token_stream ?line ?block input =
let file_path = match input with
File path -> path
| _ -> "" in
let pos = Pos.min ~file:file_path in
let buf_reg = ref (pos#byte, pos#byte)
and first_call = ref true
and decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in
let supply = Uutf.Manual.src decoder in
let state = ref (LexerLib.mk_state
~units:FQueue.empty
~last:Region.ghost
~window:LexerLib.Nil
~pos
~markup:[]
~decoder
~supply
?block
?line
()) in
let get_pos () = !state#pos
and get_last () = !state#last
and get_win () = !state#window
and get_file () = file_path in
let patch_buffer (start, stop) buffer =
let open Lexing in
let file_path = buffer.lex_curr_p.pos_fname in
buffer.lex_start_p <- {start with pos_fname = file_path};
buffer.lex_curr_p <- {stop with pos_fname = file_path}
and save_region buffer =
buf_reg := Lexing.(buffer.lex_start_p, buffer.lex_curr_p) in
let scan buffer =
patch_buffer !buf_reg buffer;
(if !first_call
then (state := init !state buffer; first_call := false)
else state := scan !state buffer);
save_region buffer in
let next_token buffer =
scan buffer;
match FQueue.peek !state#units with
None -> None
| Some (units, ext_token) ->
state := !state#set_units units; Some ext_token in
let rec read ~log buffer =
match FQueue.deq !state#units with
None ->
scan buffer;
read ~log buffer
| Some (units, (left_mark, token)) ->
log left_mark token;
state := ((!state#set_units units)
#set_last (Token.to_region token))
#slide_token token;
Token.check_right_context token next_token buffer;
patch_buffer (Token.to_region token)#byte_pos buffer;
token in
match lexbuf_from_input input with
Ok (buffer, close) ->
let () =
match input with
File path when path <> "" -> LexerLib.reset ~file:path buffer
| _ -> () in
let instance = {
input; read; buffer; get_win; get_pos; get_last; get_file; close}
in Ok instance
| Error _ as e -> e
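An editor's sketch (not part of the commit) of how a client might drain an instance returned by [open_token_stream]; the module name [Lexer] (an instance of this functor) is an assumption, and lexical errors ([Lexer.Error]) are left unhandled:

  let scan_all path =
    match Lexer.open_token_stream (Lexer.File path) with
      Ok instance ->
        let log _markup _token = () in   (* a silent logger *)
        let rec loop acc =
          let token = instance.Lexer.read ~log instance.Lexer.buffer in
          if Lexer.Token.is_eof token
          then List.rev (token :: acc)
          else loop (token :: acc) in
        let tokens = loop [] in
        instance.Lexer.close (); Stdlib.Ok tokens
    | Stdlib.Error (Lexer.File_opening msg) -> Stdlib.Error msg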
end (* of functor [Make] in HEADER *) end (* of functor [Make] in HEADER *)
(* END TRAILER *) (* END TRAILER *)
} }

View File

@ -1,159 +1,6 @@
(* Sliding window *)
(* The type [window] models a two-slot buffer of tokens for reporting
after a parse error. Technically, it is a parametric type, but its
use is meant for tokens, wherever they are defined.
In [Two(t1,t2)], the token [t2] is the next to be sent to the
parser.
The call [slide token buffer] pushes the token [token] into the
buffer [buffer]. If the buffer is full, that is, it is
[Two(t1,t2)], then the token [t2] is discarded to make room for
[token].
*)
type 'a window =
Nil
| One of 'a
| Two of 'a * 'a
let slide token = function
Nil -> One token
| One t | Two (t,_) -> Two (token,t)
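A quick illustration of the sliding behaviour (editor's sketch, using integers as stand-ins for tokens):

  let () =
    assert (slide 1 Nil          = One 1);
    assert (slide 2 (One 1)      = Two (2,1));
    assert (slide 3 (Two (2,1))  = Two (3,2))  (* the oldest token is dropped *)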
module Region = Simple_utils.Region module Region = Simple_utils.Region
module Pos = Simple_utils.Pos module Pos = Simple_utils.Pos
type lexeme = string
(* The signature [TOKEN] exports an abstract type [token], so a lexer
can be a functor over tokens. This makes it possible to externalise
version-dependent constraints in any module whose signature matches
[TOKEN]. Generic functions to construct tokens are required.
Note the predicate [is_eof], which characterises the virtual token
for end-of-file, because it requires special handling.
*)
module type TOKEN =
sig
type token
(* Predicates *)
val is_eof : token -> bool
(* Projections *)
val to_lexeme : token -> lexeme
val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string
val to_region : token -> Region.t
(* Style *)
type error
val error_to_string : error -> string
exception Error of error Region.reg
val format_error :
?offsets:bool ->
[`Byte | `Point] ->
error Region.reg ->
file:bool ->
string Region.reg
val check_right_context :
token ->
(Lexing.lexbuf -> (Markup.t list * token) option) ->
Lexing.lexbuf ->
unit
end
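For concreteness, here is an editor's sketch of a toy module satisfying [TOKEN]; the token constructors and the trivial style check are invented for illustration only and are not part of the commit:

  module ToyToken : TOKEN =
    struct
      type token = EOF of Region.t | Ident of string Region.reg
      let is_eof = function EOF _ -> true | _ -> false
      let to_lexeme = function EOF _ -> "" | Ident i -> i.Region.value
      let to_string token ?offsets:_ _mode = to_lexeme token
      let to_region = function EOF region -> region | Ident i -> i.Region.region
      type error = string
      let error_to_string msg = msg
      exception Error of error Region.reg
      let format_error ?offsets:_ _mode Region.{value; region} ~file:_ =
        Region.{value; region}
      let check_right_context _token _next _lexbuf = ()  (* no style check *)
    end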
(* The module type for lexers is [S]. It mainly exports the function
[open_token_stream], which returns
* a function [read] that extracts tokens from a lexing buffer,
together with a lexing buffer [buffer] to read from,
* a function [close] that closes that buffer,
* a function [get_pos] that returns the current position,
* a function [get_last] that returns the region of the last
recognised token, and
* a function [get_file] that returns the name of the file being
scanned (empty string if [stdin]).
Note that a module [Token] is exported too, because the signature
of the exported functions depends on it.
The type [window] is a two-token window, that is, a buffer that
contains the last recognised token, and the penultimate (if any).
The call [read ~log] evaluates to a lexer (also known as a
tokeniser or scanner) of type [Lexing.lexbuf -> token], suitable
for a parser generated by Menhir. The argument labelled
[log] is a logger, that is, it may print a token and its left
markup to a given channel, at the caller's discretion.
*)
module type S =
sig
module Token : TOKEN
type token = Token.token
type file_path = string
type logger = Markup.t list -> token -> unit
type input =
File of file_path
| String of string
| Channel of in_channel
| Buffer of Lexing.lexbuf
type instance = {
input : input;
read : log:logger -> Lexing.lexbuf -> token;
buffer : Lexing.lexbuf;
get_win : unit -> token window;
get_pos : unit -> Pos.t;
get_last : unit -> Region.t;
get_file : unit -> file_path;
close : unit -> unit
}
type open_err = File_opening of string
val lexbuf_from_input :
input -> (Lexing.lexbuf * (unit -> unit), open_err) Stdlib.result
type line_comment = string (* Opening of a line comment *)
type block_comment = <opening : string; closing : string>
val mk_block : opening:string -> closing:string -> block_comment
val open_token_stream :
?line:line_comment ->
?block:block_comment ->
input ->
(instance, open_err) Stdlib.result
(* Error reporting *)
type error
val error_to_string : error -> string
exception Error of error Region.reg
val format_error :
?offsets:bool ->
[`Byte | `Point] ->
error Region.reg ->
file:bool ->
string Region.reg
end
(* LEXER ENGINE *) (* LEXER ENGINE *)
(* Resetting file name and line number in the lexing buffer (* Resetting file name and line number in the lexing buffer
@ -205,27 +52,20 @@ let rollback buffer =
buffer.lex_curr_pos <- buffer.lex_curr_pos - len; buffer.lex_curr_pos <- buffer.lex_curr_pos - len;
buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum} buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum}
(* Utility types *)
type file_path = string
type lexeme = string
(* THREAD FOR STRUCTURED CONSTRUCTS (STRINGS, COMMENTS) *) (* THREAD FOR STRUCTURED CONSTRUCTS (STRINGS, COMMENTS) *)
(* When scanning structured constructs, like strings and comments, we
need to keep the region of the opening symbol (like double quote,
"//" or "(*") in order to report any error more precisely. Since
ocamllex is byte-oriented, we need to store the parsed bytes as
characters in an accumulator [acc] and also its length [len] so
that, when we are done, it is easy to build the string making up the
structured construct with [mk_str] (see above).
The resulting data structure is called a _thread_. (Note for
Emacs: "*)".)
*)
type thread = < type thread = <
opening : Region.t; opening : Region.t;
length : int; length : int;
acc : char list; acc : char list;
push_char : char -> thread; to_string : string;
push_char : char -> thread;
push_string : string -> thread; push_string : string -> thread;
to_string : string;
set_opening : Region.t -> thread set_opening : Region.t -> thread
> >
@ -233,12 +73,12 @@ let mk_thread region lexeme : thread =
(* The call [explode s a] is the list made by pushing the characters (* The call [explode s a] is the list made by pushing the characters
in the string [s] on top of [a], in reverse order. For example, in the string [s] on top of [a], in reverse order. For example,
[explode "ba" ['c';'d'] = ['a'; 'b'; 'c'; 'd']]. *) [explode "ba" ['c';'d'] = ['a'; 'b'; 'c'; 'd']]. *)
let explode s acc = let explode s acc =
let rec push = function let rec push = function
0 -> acc 0 -> acc
| i -> s.[i-1] :: push (i-1) | i -> s.[i-1] :: push (i-1)
in push (String.length s) in push (String.length s) in
in
object object
val opening = region val opening = region
method opening = opening method opening = opening
@ -268,184 +108,288 @@ let mk_thread region lexeme : thread =
method to_string = method to_string =
let bytes = Bytes.make length ' ' in let bytes = Bytes.make length ' ' in
let rec fill i = function let rec fill i = function
[] -> bytes [] -> bytes
| char::l -> Bytes.set bytes i char; fill (i-1) l | char::l -> Bytes.set bytes i char; fill (i-1) l
in fill (length-1) acc |> Bytes.to_string in fill (length-1) acc |> Bytes.to_string
end end
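An editor's sketch of the accumulator discipline (assuming that [mk_thread], given the empty lexeme, starts from an empty accumulator):

  let () =
    let thread = mk_thread Region.ghost "" in
    let thread = (thread#push_char 'h')#push_string "i!" in
    assert (thread#to_string = "hi!")  (* pushed in reverse, printed in order *)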
type file_path = string (* STATE *)
(* STATE *) (* Scanning the lexing buffer for tokens (and markup, as a
side-effect).
(* Beyond tokens, the result of lexing is a state. The type Because we want the lexer to have access to the right lexical
[state] represents the logical state of the lexing engine, that context of a recognised lexeme (to enforce stylistic constraints or
is, a value which is threaded during scanning and which denotes report special error patterns), we need to keep a hidden reference
useful, high-level information beyond what the type to a queue of recognised lexical units (that is, tokens and markup)
[Lexing.lexbuf] in the standard library already provides for that acts as a mutable state between the calls to [read]. When
all generic lexers. [read] is called, that queue is examined first and, if it contains
at least one token, that token is returned; otherwise, the lexing
buffer is scanned for at least one more new token. That is the
general principle: we put a high-level buffer (our queue) on top of
the low-level lexing buffer.
Tokens are the smallest units used by the parser to build the One tricky and important detail is that we must make any parser
abstract syntax tree. The state includes a queue of recognised generated by Menhir (and calling [read]) believe that the last
tokens, with the markup at the left of its lexeme until either region of the input source that was matched indeed corresponds to
the start of the file or the end of the previously recognised the returned token, despite that many tokens and markup may have
token. been matched since it was actually read from the input. In other
words, the parser requests a token that is taken from the
high-level buffer, but the parser requests the source regions from
the _low-level_ lexing buffer, and they may disagree if more than
one token has actually been recognised.
The markup from the last recognised token or, if the first Consequently, in order to maintain a consistent view for the
token has not been recognised yet, from the beginning of the parser, we have to patch some fields of the lexing buffer, namely
file is stored in the field [markup] of the state because it is [lex_start_p] and [lex_curr_p], as these fields are read by parsers
a side-effect, with respect to the output token list, and we generated by Menhir when querying source positions (regions). This
use a record with a single field [units] because that record is the purpose of the function [patch_buffer]. After reading one or
may be easily extended during the future maintenance of this more tokens and markup by the scanning rule [scan], we have to save
lexer. in the hidden reference [buf_reg] the region of the source that was
matched by [scan]. This atomic sequence of patching, scanning and
saving is implemented by the _function_ [scan] (beware: it shadows
the scanning rule [scan]). The function [patch_buffer] is, of
course, also called just before returning the token, so the parser
has a view of the lexing buffer consistent with the token.
The state also includes a field [pos] which holds the current Note that an additional reference [first_call] is needed to
position in the LIGO source file. The position is not always distinguish the first call to the function [scan], as the first
updated after a single character has been matched: that depends scanning rule is actually [init] (which can handle the BOM), not
on the regular expression that matched the lexing buffer. [scan].
*)
The field [window] is a two-token window, that is, a buffer type 'token window =
that contains the last recognised token, and the penultimate Nil
(if any). | One of 'token
| Two of 'token * 'token
The fields [decoder] and [supply] offer the support needed type 'token state = <
for the lexing of UTF-8 encoded characters in comments (the units : (Markup.t list * 'token) FQueue.t;
only place where they are allowed in LIGO). The former is the markup : Markup.t list;
decoder proper and the latter is the effectful function window : 'token window;
[supply] that takes a byte, a start index and a length and feed last : Region.t;
it to [decoder]. See the documentation of the third-party pos : Pos.t;
library Uutf. decoder : Uutf.decoder;
*) supply : Bytes.t -> int -> int -> unit;
block : EvalOpt.block_comment option;
line : EvalOpt.line_comment option;
type line_comment = string (* Opening of a line comment *) enqueue : 'token -> 'token state;
type block_comment = <opening : string; closing : string> set_units : (Markup.t list * 'token) FQueue.t -> 'token state;
set_last : Region.t -> 'token state;
set_pos : Pos.t -> 'token state;
slide_token : 'token -> 'token state;
let mk_block ~opening ~closing : block_comment = sync : Lexing.lexbuf -> Region.t * lexeme * 'token state;
object
method opening = opening
method closing = closing
end
type 'a state = < push_newline : Lexing.lexbuf -> 'token state;
units : (Markup.t list * 'a) FQueue.t; push_line : thread -> 'token state;
markup : Markup.t list; push_block : thread -> 'token state;
window : 'a window; push_space : Lexing.lexbuf -> 'token state;
last : Region.t; push_tabs : Lexing.lexbuf -> 'token state;
pos : Pos.t; push_bom : Lexing.lexbuf -> 'token state;
decoder : Uutf.decoder; push_markup : Markup.t -> 'token state;
supply : Bytes.t -> int -> int -> unit; >
block : block_comment option;
line : line_comment option;
enqueue : 'a -> 'a state; let mk_state ~units ~markup ~window ~last ~pos ~decoder ~supply
set_units : (Markup.t list * 'a) FQueue.t -> 'a state; ?block ?line () : _ state =
set_last : Region.t -> 'a state; object (self)
set_pos : Pos.t -> 'a state; val units = units
slide_token : 'a -> 'a state; method units = units
val markup = markup
method markup = markup
val window = window
method window = window
val last = last
method last = last
val pos = pos
method pos = pos
method decoder = decoder
method supply = supply
method block = block
method line = line
sync : Lexing.lexbuf -> Region.t * lexeme * 'a state; method enqueue token =
{< units = FQueue.enq (markup, token) units;
markup = [] >}
push_newline : Lexing.lexbuf -> 'a state; method set_units units = {< units = units >}
push_line : thread -> 'a state; method set_last region = {< last = region >}
push_block : thread -> 'a state; method set_pos pos = {< pos = pos >}
push_space : Lexing.lexbuf -> 'a state;
push_tabs : Lexing.lexbuf -> 'a state;
push_bom : Lexing.lexbuf -> 'a state;
push_markup : Markup.t -> 'a state;
>
let mk_state ~units ~markup ~window ~last ~pos ~decoder ~supply method slide_token token =
?block ?line () : _ state = match self#window with
object (self) Nil -> {< window = One token >}
val units = units | One t | Two (t,_) -> {< window = Two (token,t) >}
method units = units
val markup = markup
method markup = markup
val window = window
method window = window
val last = last
method last = last
val pos = pos
method pos = pos
method decoder = decoder
method supply = supply
method block = block
method line = line
(* The call [enqueue (token, state)] updates functionally the method sync buffer =
state [state] by associating the token [token] with the let lex = Lexing.lexeme buffer in
stored markup and enqueuing the pair into the units let len = String.length lex in
queue. The field [markup] is then reset to the empty let start = pos in
list. *) let stop = start#shift_bytes len in
let state = {< pos = stop >}
in Region.make ~start ~stop, lex, state
method enqueue token = (* MARKUP *)
{< units = FQueue.enq (markup, token) units;
markup = [] >}
method set_units units = {< units = units >} (* Committing markup to the current logical state *)
method set_last region = {< last = region >}
method set_pos pos = {< pos = pos >}
method slide_token token = method push_markup unit = {< markup = unit :: markup >}
{< window = slide token window >}
(* The call [sync state buffer] updates the current position method push_newline buffer =
in accordance with the contents of the lexing buffer, more let () = Lexing.new_line buffer in
precisely, depending on the length of the string which has let value = Lexing.lexeme buffer in
just been recognised by the scanner: that length is used as let start = self#pos in
a positive offset to the current column. *) let stop = start#new_line value in
let region = Region.make ~start ~stop in
let unit = Markup.Newline Region.{region; value}
in {< pos = stop; markup = unit::markup >}
method sync buffer = method push_line thread =
let lex = Lexing.lexeme buffer in let start = thread#opening#start in
let len = String.length lex in let region = Region.make ~start ~stop:self#pos
let start = pos in and value = thread#to_string in
let stop = start#shift_bytes len in let unit = Markup.LineCom Region.{region; value}
let state = {< pos = stop >} in {< markup = unit::markup >}
in Region.make ~start ~stop, lex, state
(* MARKUP *) method push_block thread =
let start = thread#opening#start in
let region = Region.make ~start ~stop:self#pos
and value = thread#to_string in
let unit = Markup.BlockCom Region.{region; value}
in {< markup = unit::markup >}
(* Committing markup to the current logical state *) method push_space buffer =
let region, lex, state = self#sync buffer in
let value = String.length lex in
let unit = Markup.Space Region.{region; value}
in state#push_markup unit
method push_markup unit = {< markup = unit :: markup >} method push_tabs buffer =
let region, lex, state = self#sync buffer in
let value = String.length lex in
let unit = Markup.Tabs Region.{region; value}
in state#push_markup unit
method push_newline buffer = method push_bom buffer =
let () = Lexing.new_line buffer in let region, value, state = self#sync buffer in
let value = Lexing.lexeme buffer in let unit = Markup.BOM Region.{region; value}
let start = self#pos in in state#push_markup unit
let stop = start#new_line value in end
let region = Region.make ~start ~stop in
let unit = Markup.Newline Region.{region; value}
in {< pos = stop; markup = unit::markup >}
method push_line thread = (* LEXER INSTANCE *)
let start = thread#opening#start in
let region = Region.make ~start ~stop:self#pos
and value = thread#to_string in
let unit = Markup.LineCom Region.{region; value}
in {< markup = unit::markup >}
method push_block thread = type input =
let start = thread#opening#start in File of file_path
let region = Region.make ~start ~stop:self#pos | String of string
and value = thread#to_string in | Channel of in_channel
let unit = Markup.BlockCom Region.{region; value} | Buffer of Lexing.lexbuf
in {< markup = unit::markup >}
method push_space buffer = type 'token logger = Markup.t list -> 'token -> unit
let region, lex, state = self#sync buffer in
let value = String.length lex in
let unit = Markup.Space Region.{region; value}
in state#push_markup unit
method push_tabs buffer = type 'token instance = {
let region, lex, state = self#sync buffer in input : input;
let value = String.length lex in read : log:('token logger) -> Lexing.lexbuf -> 'token;
let unit = Markup.Tabs Region.{region; value} buffer : Lexing.lexbuf;
in state#push_markup unit get_win : unit -> 'token window;
get_pos : unit -> Pos.t;
get_last : unit -> Region.t;
get_file : unit -> file_path;
close : unit -> unit
}
method push_bom buffer = type open_err = File_opening of string
let region, value, state = self#sync buffer in
let unit = Markup.BOM Region.{region; value}
in state#push_markup unit
end let lexbuf_from_input = function
String s ->
Ok (Lexing.from_string s, fun () -> ())
| Channel chan ->
let close () = close_in chan in
Ok (Lexing.from_channel chan, close)
| Buffer b ->
Ok (b, fun () -> ())
| File path ->
try
let chan = open_in path in
let close () = close_in chan in
let lexbuf = Lexing.from_channel chan in
let () =
let open Lexing in
lexbuf.lex_curr_p <- {lexbuf.lex_curr_p with pos_fname=path}
in Ok (lexbuf, close)
with Sys_error msg -> Stdlib.Error (File_opening msg)
let open_token_stream ?line ?block ~init ~scan
~token_to_region ~style input =
let file_path = match input with
File path -> path
| _ -> "" in
let pos = Pos.min ~file:file_path in
let buf_reg = ref (pos#byte, pos#byte)
and first_call = ref true
and decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in
let supply = Uutf.Manual.src decoder in
let state = ref (mk_state
~units:FQueue.empty
~last:Region.ghost
~window:Nil
~pos
~markup:[]
~decoder
~supply
?block
?line
()) in
let get_pos () = !state#pos
and get_last () = !state#last
and get_win () = !state#window
and get_file () = file_path in
let patch_buffer (start, stop) buffer =
let open Lexing in
let file_path = buffer.lex_curr_p.pos_fname in
buffer.lex_start_p <- {start with pos_fname = file_path};
buffer.lex_curr_p <- {stop with pos_fname = file_path}
and save_region buffer =
buf_reg := Lexing.(buffer.lex_start_p, buffer.lex_curr_p) in
let scan' init scan buffer =
patch_buffer !buf_reg buffer;
(if !first_call
then (state := init !state buffer; first_call := false)
else state := scan !state buffer);
save_region buffer in
let next_token init scan buffer =
scan' init scan buffer;
match FQueue.peek !state#units with
None -> None
| Some (units, ext_token) ->
state := !state#set_units units; Some ext_token in
let rec read init scan ~token_to_region ~style ~log buffer =
match FQueue.deq !state#units with
None ->
scan' init scan buffer;
read init scan ~token_to_region ~style ~log buffer
| Some (units, (left_mark, token)) ->
log left_mark token;
state := ((!state#set_units units)
#set_last (token_to_region token))
#slide_token token;
style token (next_token init scan) buffer;
patch_buffer (token_to_region token)#byte_pos buffer;
token in
match lexbuf_from_input input with
Ok (buffer, close) ->
let () =
match input with
File path when path <> "" -> reset ~file:path buffer
| _ -> () in
let instance = {
read = read init scan ~token_to_region ~style;
input; buffer; get_win; get_pos; get_last; get_file; close}
in Ok instance
| Error _ as e -> e

View File

@ -0,0 +1,205 @@
(* A library for writing UTF8-aware lexers *)
module Region = Simple_utils.Region
module Pos = Simple_utils.Pos
(* The function [rollback] resets the lexing buffer to the state it
was in when it matched the last regular expression. This function is
safe to use only in the semantic action of the rule which last
matched. *)
val rollback : Lexing.lexbuf -> unit
(* Utility types *)
type file_path = string
type lexeme = string
(* THREAD FOR STRUCTURED CONSTRUCTS (STRINGS, COMMENTS) *)
(* When scanning structured constructs, like strings and comments, we
need to keep the region of the opening symbol (like double quote,
"//" or "(*") in order to report any error more precisely. Since
ocamllex is byte-oriented, we need to store the parsed bytes as
characters in an accumulator [acc] and also its length [len] so
that, when we are done, it is easy to build the string making up the
structured construct with [mk_str] (see above).
The resulting data structure is called a _thread_. (Note for
Emacs: "*)".)
*)
type thread = <
opening : Region.t;
length : int;
acc : char list;
to_string : string;
push_char : char -> thread;
push_string : string -> thread;
set_opening : Region.t -> thread
>
val mk_thread : Region.t -> lexeme -> thread
(* STATE *)
(* Beyond producing tokens, the result of lexing is a _state_. The
type [state] represents the abstract logical state of the lexing
engine, that is, a value which is threaded during scanning and
which denotes useful, high-level information beyond what the type
[Lexing.lexbuf] in the standard library already provides for all
generic lexers. We qualify it as "logical state" because the lexing
buffer itself has a "physical state" defined by the type
[Lexing.lexbuf].
Tokens are the smallest units used by the parser to build the
abstract syntax tree. The state includes a queue of recognised
tokens, with the markup at the left of its lexeme until either the
start of the file or the end of the previously recognised token.
The markup from the last recognised token or, if the first token
has not been recognised yet, from the beginning of the file is
stored in the field [markup] of the state because it is a
side-effect, with respect to the output token list, and we use a
record with a single field [units] because that record may be
easily extended during the future maintenance of this lexer.
The state also includes a field [pos] which holds the current
position in the LIGO source file. The position is not always
updated after a single character has been matched: that depends on
the regular expression that matched the lexing buffer.
The field [window] is a two-token window, that is, a buffer that
contains the last recognised token, and the penultimate (if any).
Technically, it is a parametric type, but its use is meant for
tokens, wherever they are defined. In [Two (t1,t2)], and in case
of a syntax error, [t1] is the first invalid token and [t2] is the
last valid one.
The fields [decoder] and [supply] offer the support needed for
the lexing of UTF-8 encoded characters in comments (the only place
where they are allowed in LIGO). The former is the decoder proper
and the latter is the effectful function [supply] that takes a
byte sequence, a start index and a length, and feeds it to [decoder]. See the
documentation of the third-party library Uutf.
Some methods are now documented.
The call [state#enqueue token] updates functionally the state
[state] by associating the token [token] with the stored markup and
enqueuing the pair into the units queue. The field [markup] is then
reset to the empty list.
The call [state#slide_token token] pushes the token [token] into
the two-token window. If the window is full, that is, it is [Two
(t1,t2)], then the token [t2] is discarded to make room for
[token].
The call [state#sync buffer] updates the current position in
accordance with the contents of the lexing buffer, more precisely,
depending on the length of the string which has just been
recognised by the scanner: that length is used as a positive offset
to the current column.
*)
type 'token window =
Nil
| One of 'token
| Two of 'token * 'token
type 'token state = <
units : (Markup.t list * 'token) FQueue.t;
markup : Markup.t list;
window : 'token window;
last : Region.t;
pos : Pos.t;
decoder : Uutf.decoder;
supply : Bytes.t -> int -> int -> unit;
block : EvalOpt.block_comment option;
line : EvalOpt.line_comment option;
enqueue : 'token -> 'token state;
set_units : (Markup.t list * 'token) FQueue.t -> 'token state;
set_last : Region.t -> 'token state;
set_pos : Pos.t -> 'token state;
slide_token : 'token -> 'token state;
sync : Lexing.lexbuf -> Region.t * lexeme * 'token state;
push_newline : Lexing.lexbuf -> 'token state;
push_line : thread -> 'token state;
push_block : thread -> 'token state;
push_space : Lexing.lexbuf -> 'token state;
push_tabs : Lexing.lexbuf -> 'token state;
push_bom : Lexing.lexbuf -> 'token state;
push_markup : Markup.t -> 'token state;
>
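Because every method returns an updated copy of the state, updates compose by chaining method calls; an editor's sketch mirroring the pattern used by [read] in the implementation:

  let remember state token region =
    ((state#set_last region)#slide_token token)#enqueue token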
(* LEXER INSTANCE *)
(* The function [open_token_stream] returns a lexer instance made of
* the input [input] of type [input];
* a function [read] that extracts tokens from a lexing buffer,
together with a lexing buffer [buffer] to read from,
* a function [close] that closes that buffer,
* a function [get_pos] that returns the current position,
* a function [get_last] that returns the region of the last
recognised token, and
* a function [get_file] that returns the name of the file being
scanned (empty string if [stdin]).
Note that the exported functions are parametric in the type
['token], because their signatures depend on the tokens, but no
concrete token module is fixed at this level.
The type [window] is a two-token window, that is, a buffer that
contains the last recognised token, and the penultimate (if any).
The call [open_token_stream ?line ?block ~init ~scan
~token_to_region ~style input] evaluates to a lexer instance whose
field [read], of type [log:('token logger) -> Lexing.lexbuf ->
'token], is suitable for a parser generated by Menhir. The argument labelled
[log] is a logger, that is, it may print a token and its left
markup to a given channel, at the caller's discretion. The argument
labelled [~init] is the scanner to be called first, usually for
reading the BOM, then [scan] is used for the following calls. The
function labelled [~style] is used to check stylistic constraints
on the tokens and the markup between them.
*)
type input =
File of file_path
| String of string
| Channel of in_channel
| Buffer of Lexing.lexbuf
type 'token logger = Markup.t list -> 'token -> unit
type 'token instance = {
input : input;
read : log:('token logger) -> Lexing.lexbuf -> 'token;
buffer : Lexing.lexbuf;
get_win : unit -> 'token window;
get_pos : unit -> Pos.t;
get_last : unit -> Region.t;
get_file : unit -> file_path;
close : unit -> unit
}
type open_err = File_opening of string
val lexbuf_from_input :
input -> (Lexing.lexbuf * (unit -> unit), open_err) Stdlib.result
val open_token_stream :
?line:EvalOpt.line_comment ->
?block:EvalOpt.block_comment ->
init:('token state -> Lexing.lexbuf -> 'token state) ->
scan:('token state -> Lexing.lexbuf -> 'token state) ->
token_to_region:('token -> Region.t) ->
style:('token ->
(Lexing.lexbuf -> (Markup.t list * 'token) option) ->
Lexing.lexbuf ->
unit) ->
input ->
('token instance, open_err) Stdlib.result
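An editor's sketch of a call site for [open_token_stream], assuming a concrete lexer module [Lexer] exposing [init], [scan] and a module [Token], as elsewhere in this commit:

  let make_instance input =
    LexerLib.open_token_stream
      ~init:Lexer.init
      ~scan:Lexer.scan
      ~token_to_region:Lexer.Token.to_region
      ~style:Lexer.Token.check_right_context
      input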

View File

@ -4,7 +4,8 @@ module Region = Simple_utils.Region
module type S = module type S =
sig sig
module Lexer : LexerLib.S module Lexer : Lexer.S
type token = Lexer.token
val output_token : val output_token :
?offsets:bool -> ?offsets:bool ->
@ -12,7 +13,7 @@ module type S =
EvalOpt.command -> EvalOpt.command ->
out_channel -> out_channel ->
Markup.t list -> Markup.t list ->
Lexer.token -> token ->
unit unit
type file_path = string type file_path = string
@ -22,19 +23,26 @@ module type S =
[`Byte | `Point] -> [`Byte | `Point] ->
?block:EvalOpt.block_comment -> ?block:EvalOpt.block_comment ->
?line:EvalOpt.line_comment -> ?line:EvalOpt.line_comment ->
Lexer.input -> token_to_region:(token -> Region.t) ->
style:(token ->
(Lexing.lexbuf -> (Markup.t list * token) option) ->
Lexing.lexbuf ->
unit) ->
LexerLib.input ->
EvalOpt.command -> EvalOpt.command ->
(unit, string Region.reg) Stdlib.result (unit, string Region.reg) Stdlib.result
end end
module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) = module Make (Lexer: Lexer.S) : (S with module Lexer = Lexer) =
struct struct
module Lexer = Lexer module Lexer = Lexer
module Token = Lexer.Token module Token = Lexer.Token
type token = Lexer.token
(** Pretty-printing in a string the lexemes making up the markup (* Pretty-printing in a string the lexemes making up the markup
between two tokens, concatenated with the last lexeme between two tokens, concatenated with the last lexeme
itself. *) itself. *)
let output_token ?(offsets=true) mode command let output_token ?(offsets=true) mode command
channel left_mark token : unit = channel left_mark token : unit =
let output str = Printf.fprintf channel "%s%!" str in let output str = Printf.fprintf channel "%s%!" str in
@ -57,10 +65,17 @@ module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) =
type file_path = string type file_path = string
let trace ?(offsets=true) mode ?block ?line input command : let trace ?(offsets=true) mode ?block ?line
~token_to_region ~style input command :
(unit, string Region.reg) Stdlib.result = (unit, string Region.reg) Stdlib.result =
match Lexer.open_token_stream ?line ?block input with match LexerLib.open_token_stream
Ok Lexer.{read; buffer; close; _} -> ~init:Lexer.init
~scan:Lexer.scan
~token_to_region
~style
?line ?block input
with
Ok LexerLib.{read; buffer; close; _} ->
let log = output_token ~offsets mode command stdout let log = output_token ~offsets mode command stdout
and close_all () = flush_all (); close () in and close_all () = flush_all (); close () in
let rec iter () = let rec iter () =
@ -69,12 +84,17 @@ module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) =
if Token.is_eof token if Token.is_eof token
then Stdlib.Ok () then Stdlib.Ok ()
else iter () else iter ()
| exception Lexer.Token.Error error ->
let msg =
Lexer.Token.format_error
~offsets mode ~file:true error
in Stdlib.Error msg
| exception Lexer.Error error -> | exception Lexer.Error error ->
let msg = let msg =
Lexer.format_error ~offsets mode ~file:true error Lexer.format_error ~offsets mode ~file:true error
in Stdlib.Error msg in in Stdlib.Error msg in
let result = iter () let result = iter ()
in close_all (); result in close_all (); result
| Stdlib.Error (Lexer.File_opening msg) -> | Stdlib.Error (LexerLib.File_opening msg) ->
flush_all (); Stdlib.Error (Region.wrap_ghost msg) flush_all (); Stdlib.Error (Region.wrap_ghost msg)
end end

View File

@ -2,7 +2,8 @@ module Region = Simple_utils.Region
module type S = module type S =
sig sig
module Lexer : LexerLib.S module Lexer : Lexer.S
type token = Lexer.token
val output_token : val output_token :
?offsets:bool -> ?offsets:bool ->
@ -10,7 +11,7 @@ module type S =
EvalOpt.command -> EvalOpt.command ->
out_channel -> out_channel ->
Markup.t list -> Markup.t list ->
Lexer.token -> token ->
unit unit
type file_path = string type file_path = string
@ -20,9 +21,14 @@ module type S =
[`Byte | `Point] -> [`Byte | `Point] ->
?block:EvalOpt.block_comment -> ?block:EvalOpt.block_comment ->
?line:EvalOpt.line_comment -> ?line:EvalOpt.line_comment ->
Lexer.input -> token_to_region:(token -> Region.t) ->
style:(token ->
(Lexing.lexbuf -> (Markup.t list * token) option) ->
Lexing.lexbuf ->
unit) ->
LexerLib.input ->
EvalOpt.command -> EvalOpt.command ->
(unit, string Region.reg) Stdlib.result (unit, string Region.reg) Stdlib.result
end end
module Make (Lexer: LexerLib.S) : S with module Lexer = Lexer module Make (Lexer: Lexer.S) : S with module Lexer = Lexer

View File

@ -9,7 +9,7 @@ module type IO =
val options : EvalOpt.options (* CLI options *) val options : EvalOpt.options (* CLI options *)
end end
module Make (IO: IO) (Lexer: LexerLib.S) = module Make (IO: IO) (Lexer: Lexer.S) =
struct struct
(* Error printing and exception tracing *) (* Error printing and exception tracing *)
@ -39,11 +39,16 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
| Stdlib.Ok pp_buffer -> | Stdlib.Ok pp_buffer ->
(* Running the lexer on the preprocessed input *) (* Running the lexer on the preprocessed input *)
let source = Lexer.String (Buffer.contents pp_buffer) in let source = LexerLib.String (Buffer.contents pp_buffer) in
match Lexer.open_token_stream ?line:IO.options#line match LexerLib.open_token_stream
?block:IO.options#block ?line:IO.options#line
source with ?block:IO.options#block
Ok Lexer.{read; buffer; close; _} -> ~init:Lexer.init
~scan:Lexer.scan
~token_to_region:Lexer.Token.to_region
~style:Lexer.Token.check_right_context
source with
Ok LexerLib.{read; buffer; close; _} ->
let close_all () = flush_all (); close () in let close_all () = flush_all (); close () in
let rec read_tokens tokens = let rec read_tokens tokens =
match read ~log:(fun _ _ -> ()) buffer with match read ~log:(fun _ _ -> ()) buffer with
@ -51,9 +56,7 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
if Lexer.Token.is_eof token if Lexer.Token.is_eof token
then Stdlib.Ok (List.rev tokens) then Stdlib.Ok (List.rev tokens)
else read_tokens (token::tokens) else read_tokens (token::tokens)
(* | exception _ -> | exception Lexer.Token.Error error ->
Printf.eprintf "Here\n%!"; exit 1
*) | exception Lexer.Token.Error error ->
let file = let file =
match IO.options#input with match IO.options#input with
None | Some "-" -> false None | Some "-" -> false
@ -74,7 +77,7 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
in Stdlib.Error msg in in Stdlib.Error msg in
let result = read_tokens [] let result = read_tokens []
in close_all (); result in close_all (); result
| Stdlib.Error (Lexer.File_opening msg) -> | Stdlib.Error (LexerLib.File_opening msg) ->
flush_all (); Stdlib.Error (Region.wrap_ghost msg) in flush_all (); Stdlib.Error (Region.wrap_ghost msg) in
match IO.options#input with match IO.options#input with
None -> preproc stdin None -> preproc stdin
@ -115,7 +118,9 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
IO.options#mode IO.options#mode
?block:IO.options#block ?block:IO.options#block
?line:IO.options#line ?line:IO.options#line
(Lexer.String preproc_str) ~token_to_region:Lexer.Token.to_region
~style:Lexer.Token.check_right_context
(LexerLib.String preproc_str)
IO.options#cmd IO.options#cmd
in match IO.options#input with in match IO.options#input with
None -> preproc stdin None -> preproc stdin

View File

@ -7,7 +7,7 @@ module type IO =
val options : EvalOpt.options (* CLI options *) val options : EvalOpt.options (* CLI options *)
end end
module Make (IO: IO) (Lexer: LexerLib.S) : module Make (IO: IO) (Lexer: Lexer.S) :
sig sig
val scan : unit -> (Lexer.token list, string Region.reg) Stdlib.result val scan : unit -> (Lexer.token list, string Region.reg) Stdlib.result
val trace : unit -> (unit, string Region.reg) Stdlib.result val trace : unit -> (unit, string Region.reg) Stdlib.result

View File

@ -56,8 +56,8 @@ module type PARSER =
(* Main functor *) (* Main functor *)
module Make (IO: IO) module Make (IO: IO)
(Lexer: LexerLib.S) (Lexer: Lexer.S)
(Parser: PARSER with type token = Lexer.Token.token) (Parser: PARSER with type token = Lexer.token)
(ParErr: sig val message : int -> string end) = (ParErr: sig val message : int -> string end) =
struct struct
module I = Parser.MenhirInterpreter module I = Parser.MenhirInterpreter
@ -143,14 +143,14 @@ module Make (IO: IO)
~offsets:IO.options#offsets ~offsets:IO.options#offsets
IO.options#mode IO.options#cmd stdout IO.options#mode IO.options#cmd stdout
let incr_contract Lexer.{read; buffer; get_win; close; _} = let incr_contract LexerLib.{read; buffer; get_win; close; _} =
let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer
and failure = failure get_win in and failure = failure get_win in
let parser = Incr.contract buffer.Lexing.lex_curr_p in let parser = Incr.contract buffer.Lexing.lex_curr_p in
let ast = I.loop_handle success failure supplier parser let ast = I.loop_handle success failure supplier parser
in flush_all (); close (); ast in flush_all (); close (); ast
let incr_expr Lexer.{read; buffer; get_win; close; _} = let incr_expr LexerLib.{read; buffer; get_win; close; _} =
let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer
and failure = failure get_win in and failure = failure get_win in
let parser = Incr.interactive_expr buffer.Lexing.lex_curr_p in let parser = Incr.interactive_expr buffer.Lexing.lex_curr_p in

View File

@ -55,8 +55,8 @@ module type PARSER =
end end
module Make (IO: IO) module Make (IO: IO)
(Lexer: LexerLib.S) (Lexer: Lexer.S)
(Parser: PARSER with type token = Lexer.Token.token) (Parser: PARSER with type token = Lexer.token)
(ParErr: sig val message : int -> string end) : (ParErr: sig val message : int -> string end) :
sig sig
(* WARNING: The following parsers may all raise [Lexer.Error] *) (* WARNING: The following parsers may all raise [Lexer.Error] *)
@ -78,8 +78,8 @@ module Make (IO: IO)
exception Point of error exception Point of error
val incr_contract : Lexer.instance -> Parser.ast val incr_contract : Lexer.token LexerLib.instance -> Parser.ast
val incr_expr : Lexer.instance -> Parser.expr val incr_expr : Lexer.token LexerLib.instance -> Parser.expr
val format_error : val format_error :
?offsets:bool -> [`Point | `Byte] -> error -> string Region.reg ?offsets:bool -> [`Point | `Byte] -> error -> string Region.reg

View File

@ -37,7 +37,7 @@ module type Printer =
val print_expr : state -> expr -> unit val print_expr : state -> expr -> unit
end end
module Make (Lexer: LexerLib.S) module Make (Lexer: Lexer.S)
(AST: sig type t type expr end) (AST: sig type t type expr end)
(Parser: ParserAPI.PARSER (Parser: ParserAPI.PARSER
with type ast = AST.t with type ast = AST.t
@ -89,12 +89,12 @@ module Make (Lexer: LexerLib.S)
ParserLog.mk_state ~offsets:SubIO.options#offsets ParserLog.mk_state ~offsets:SubIO.options#offsets
~mode:SubIO.options#mode ~mode:SubIO.options#mode
~buffer:output in ~buffer:output in
let close () = lexer_inst.Lexer.close () in let close () = lexer_inst.LexerLib.close () in
let expr = let expr =
try try
if SubIO.options#mono then if SubIO.options#mono then
let tokeniser = lexer_inst.Lexer.read ~log let tokeniser = lexer_inst.LexerLib.read ~log
and lexbuf = lexer_inst.Lexer.buffer and lexbuf = lexer_inst.LexerLib.buffer
in Front.mono_expr tokeniser lexbuf in Front.mono_expr tokeniser lexbuf
else else
Front.incr_expr lexer_inst Front.incr_expr lexer_inst
@ -124,12 +124,12 @@ module Make (Lexer: LexerLib.S)
ParserLog.mk_state ~offsets:SubIO.options#offsets ParserLog.mk_state ~offsets:SubIO.options#offsets
~mode:SubIO.options#mode ~mode:SubIO.options#mode
~buffer:output in ~buffer:output in
let close () = lexer_inst.Lexer.close () in let close () = lexer_inst.LexerLib.close () in
let ast = let ast =
try try
if SubIO.options#mono then if SubIO.options#mono then
let tokeniser = lexer_inst.Lexer.read ~log let tokeniser = lexer_inst.LexerLib.read ~log
and lexbuf = lexer_inst.Lexer.buffer and lexbuf = lexer_inst.LexerLib.buffer
in Front.mono_contract tokeniser lexbuf in Front.mono_contract tokeniser lexbuf
else else
Front.incr_contract lexer_inst Front.incr_contract lexer_inst
@ -163,10 +163,18 @@ module Make (Lexer: LexerLib.S)
| exception Lexer.Error err -> | exception Lexer.Error err ->
let file = let file =
lexer_inst.Lexer.buffer.Lexing.lex_curr_p.Lexing.pos_fname in lexer_inst.LexerLib.buffer.Lexing.lex_curr_p.Lexing.pos_fname in
let error = let error = Lexer.format_error
Lexer.format_error ~offsets:SubIO.options#offsets ~offsets:SubIO.options#offsets
SubIO.options#mode err ~file:(file <> "") SubIO.options#mode err ~file:(file <> "")
in Stdlib.Error error
| exception Lexer.Token.Error err ->
let file =
lexer_inst.LexerLib.buffer.Lexing.lex_curr_p.Lexing.pos_fname in
let error = Lexer.Token.format_error
~offsets:SubIO.options#offsets
SubIO.options#mode err ~file:(file <> "")
in Stdlib.Error error in Stdlib.Error error
(* Incremental API of Menhir *) (* Incremental API of Menhir *)
@ -181,7 +189,7 @@ module Make (Lexer: LexerLib.S)
| exception Parser.Error -> | exception Parser.Error ->
let invalid, valid_opt = let invalid, valid_opt =
match lexer_inst.Lexer.get_win () with match lexer_inst.LexerLib.get_win () with
LexerLib.Nil -> LexerLib.Nil ->
assert false (* Safe: There is always at least EOF. *) assert false (* Safe: There is always at least EOF. *)
| LexerLib.One invalid -> invalid, None | LexerLib.One invalid -> invalid, None
@ -205,8 +213,8 @@ module Make (Lexer: LexerLib.S)
(* Parsing a contract *) (* Parsing a contract *)
let gen_parser options input parser = let gen_parser options input parser =
match Lexer.lexbuf_from_input input with match LexerLib.lexbuf_from_input input with
Stdlib.Error (Lexer.File_opening msg) -> Stdlib.Error (LexerLib.File_opening msg) ->
Stdlib.Error (Region.wrap_ghost msg) Stdlib.Error (Region.wrap_ghost msg)
| Ok (lexbuf, close) -> | Ok (lexbuf, close) ->
(* Preprocessing the input source *) (* Preprocessing the input source *)
@ -224,48 +232,53 @@ module Make (Lexer: LexerLib.S)
(* Lexing and parsing the preprocessed input source *) (* Lexing and parsing the preprocessed input source *)
let () = close () in let () = close () in
let input' = Lexer.String (Buffer.contents buffer) in let input' = LexerLib.String (Buffer.contents buffer) in
match Lexer.open_token_stream ?line:options#line match LexerLib.open_token_stream
?block:options#block ~init:Lexer.init
input' ~scan:Lexer.scan
~token_to_region:Lexer.Token.to_region
~style:Lexer.Token.check_right_context
?line:options#line
?block:options#block
input'
with with
Ok instance -> Ok instance ->
let open Lexing in let open Lexing in
instance.Lexer.buffer.lex_curr_p <- instance.LexerLib.buffer.lex_curr_p <-
{instance.Lexer.buffer.lex_curr_p with pos_fname=file}; {instance.LexerLib.buffer.lex_curr_p with pos_fname=file};
apply instance parser apply instance parser
| Stdlib.Error (Lexer.File_opening msg) -> | Stdlib.Error (LexerLib.File_opening msg) ->
Stdlib.Error (Region.wrap_ghost msg) Stdlib.Error (Region.wrap_ghost msg)
(* Parsing a contract in a file *) (* Parsing a contract in a file *)
let contract_in_file (source : string) = let contract_in_file (source : string) =
let options = SubIO.make ~input:(Some source) ~expr:false let options = SubIO.make ~input:(Some source) ~expr:false
in gen_parser options (Lexer.File source) parse_contract in gen_parser options (LexerLib.File source) parse_contract
(* Parsing a contract in a string *) (* Parsing a contract in a string *)
let contract_in_string (source : string) = let contract_in_string (source : string) =
let options = SubIO.make ~input:None ~expr:false in let options = SubIO.make ~input:None ~expr:false in
gen_parser options (Lexer.String source) parse_contract gen_parser options (LexerLib.String source) parse_contract
(* Parsing a contract in stdin *) (* Parsing a contract in stdin *)
let contract_in_stdin () = let contract_in_stdin () =
let options = SubIO.make ~input:None ~expr:false in let options = SubIO.make ~input:None ~expr:false in
gen_parser options (Lexer.Channel stdin) parse_contract gen_parser options (LexerLib.Channel stdin) parse_contract
(* Parsing an expression in a string *) (* Parsing an expression in a string *)
let expr_in_string (source : string) = let expr_in_string (source : string) =
let options = SubIO.make ~input:None ~expr:true in let options = SubIO.make ~input:None ~expr:true in
gen_parser options (Lexer.String source) parse_expr gen_parser options (LexerLib.String source) parse_expr
(* Parsing an expression in stdin *) (* Parsing an expression in stdin *)
let expr_in_stdin () = let expr_in_stdin () =
let options = SubIO.make ~input:None ~expr:true in let options = SubIO.make ~input:None ~expr:true in
gen_parser options (Lexer.Channel stdin) parse_expr gen_parser options (LexerLib.Channel stdin) parse_expr
(* Preprocess only *) (* Preprocess only *)

View File

@ -43,7 +43,7 @@ module type Printer =
(* Main functor to make the parser *) (* Main functor to make the parser *)
module Make (Lexer : LexerLib.S) module Make (Lexer : Lexer.S)
(AST : sig type t type expr end) (AST : sig type t type expr end)
(Parser : ParserAPI.PARSER (Parser : ParserAPI.PARSER
with type ast = AST.t with type ast = AST.t

View File

@ -1 +1 @@
const a: string = -0x222; const a: string = - (**) 0x2222

View File

@ -1 +1 @@
let a = -0x222; let a = - (**) 0x2222

View File

@ -1 +1 @@
let a = -0x222; let a = - /**/ 0x2222;

View File

@ -1 +0,0 @@
let arguments = 1;