Finished refactoring of lexer.

Christian Rinderknecht 2020-04-28 19:26:31 +02:00
parent 9618a48848
commit 005b7fd69b
27 changed files with 1174 additions and 1079 deletions

View File

@@ -2,7 +2,7 @@ open Cli_expect
let%expect_test _ =
run_ligo_bad [ "compile-contract" ; "../../test/lexer/broken_string.ligo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "broken_string.ligo", line 1, characters 18-19:
The string starting here is interrupted by a line break.
Hint: Remove the break, close the string before or insert a backslash.
@@ -19,7 +19,7 @@ ligo: : Lexical error in file "broken_string.ligo", line 1, characters 18-19:
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/broken_string.mligo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "broken_string.mligo", line 1, characters 8-9:
The string starting here is interrupted by a line break.
Hint: Remove the break, close the string before or insert a backslash.
@@ -36,7 +36,7 @@ ligo: : Lexical error in file "broken_string.mligo", line 1, characters 8-9:
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/broken_string.religo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "broken_string.religo", line 1, characters 8-9:
The string starting here is interrupted by a line break.
Hint: Remove the break, close the string before or insert a backslash.
@@ -53,8 +53,8 @@ ligo: : Lexical error in file "broken_string.religo", line 1, characters 8-9:
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.ligo" ; "main" ] ;
[%expect {|
ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters 18-23:
[%expect {|
ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters 18-31:
Negative byte sequence.
Hint: Remove the leading minus sign.
{}
@@ -70,8 +70,8 @@ ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.mligo" ; "main" ] ;
[%expect {|
ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters 8-13:
[%expect {|
ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters 8-21:
Negative byte sequence.
Hint: Remove the leading minus sign.
{}
@@ -87,8 +87,8 @@ ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.religo" ; "main" ] ;
[%expect {|
ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, characters 8-13:
[%expect {|
ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, characters 8-21:
Negative byte sequence.
Hint: Remove the leading minus sign.
{}
@@ -103,8 +103,9 @@ ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, character
* Check the changelog by running 'ligo changelog'
|} ];
(*
run_ligo_bad [ "compile-contract" ; "../../test/lexer/reserved_name.ligo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "reserved_name.ligo", line 1, characters 4-13:
Reserved name: "arguments".
Hint: Change the name.
@@ -119,9 +120,10 @@ ligo: : Lexical error in file "reserved_name.ligo", line 1, characters 4-13:
* Open a gitlab issue: https://gitlab.com/ligolang/ligo/issues/new
* Check the changelog by running 'ligo changelog'
|} ];
*)
run_ligo_bad [ "compile-contract" ; "../../test/lexer/reserved_name.religo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "reserved_name.religo", line 1, characters 4-7:
Reserved name: "end".
Hint: Change the name.
@@ -138,7 +140,7 @@ ligo: : Lexical error in file "reserved_name.religo", line 1, characters 4-7:
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/reserved_name.mligo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "reserved_name.mligo", line 1, characters 4-10:
Reserved name: "object".
Hint: Change the name.
@@ -155,7 +157,7 @@ ligo: : Lexical error in file "reserved_name.mligo", line 1, characters 4-10:
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/unexpected_character.ligo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "unexpected_character.ligo", line 1, characters 18-19:
Unexpected character '\239'.
{}
@@ -171,7 +173,7 @@ ligo: : Lexical error in file "unexpected_character.ligo", line 1, characters 18
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/unexpected_character.mligo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "unexpected_character.mligo", line 1, characters 8-9:
Unexpected character '\239'.
{}
@@ -187,7 +189,7 @@ ligo: : Lexical error in file "unexpected_character.mligo", line 1, characters 8
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/unexpected_character.religo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "unexpected_character.religo", line 1, characters 8-9:
Unexpected character '\239'.
{}
@@ -203,11 +205,10 @@ ligo: : Lexical error in file "unexpected_character.religo", line 1, characters
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/unterminated_comment.mligo" ; "main" ] ;
[%expect {|
ligo: : Lexical error in file "unterminated_comment.mligo", line 1, characters 0-2:
[%expect {|
ligo: : Preprocessing error in file "../../test/lexer/unterminated_comment.mligo", line 1, characters 0-2:
Unterminated comment.
Hint: Close with "*)".
{}
Hint: Close with "*)". {}
If you're not sure how to fix this error, you can
@@ -220,7 +221,7 @@ ligo: : Lexical error in file "unterminated_comment.mligo", line 1, characters 0
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/invalid_symbol.ligo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "invalid_symbol.ligo", line 1, characters 17-20:
Invalid symbol.
Hint: Check the LIGO syntax you use.
@@ -237,7 +238,7 @@ ligo: : Lexical error in file "invalid_symbol.ligo", line 1, characters 17-20:
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/invalid_symbol.mligo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "invalid_symbol.mligo", line 1, characters 10-13:
Invalid symbol.
Hint: Check the LIGO syntax you use.
@@ -254,7 +255,7 @@ ligo: : Lexical error in file "invalid_symbol.mligo", line 1, characters 10-13:
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/invalid_symbol.religo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "invalid_symbol.religo", line 1, characters 10-11:
Invalid symbol.
Hint: Check the LIGO syntax you use.
@@ -271,7 +272,7 @@ ligo: : Lexical error in file "invalid_symbol.religo", line 1, characters 10-11:
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/missing_break.ligo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "missing_break.ligo", line 1, characters 18-18:
Missing break.
Hint: Insert some space.
@@ -288,7 +289,7 @@ ligo: : Lexical error in file "missing_break.ligo", line 1, characters 18-18:
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/missing_break.mligo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "missing_break.mligo", line 1, characters 11-11:
Missing break.
Hint: Insert some space.
@@ -305,7 +306,7 @@ ligo: : Lexical error in file "missing_break.mligo", line 1, characters 11-11:
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/missing_break.religo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "missing_break.religo", line 1, characters 11-11:
Missing break.
Hint: Insert some space.
@@ -322,7 +323,7 @@ ligo: : Lexical error in file "missing_break.religo", line 1, characters 11-11:
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/invalid_character_in_string.ligo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "invalid_character_in_string.ligo", line 1, characters 19-20:
Invalid character in string.
Hint: Remove or replace the character.
@@ -339,7 +340,7 @@ ligo: : Lexical error in file "invalid_character_in_string.ligo", line 1, charac
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/invalid_character_in_string.mligo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "invalid_character_in_string.mligo", line 1, characters 9-10:
Invalid character in string.
Hint: Remove or replace the character.
@@ -356,7 +357,7 @@ ligo: : Lexical error in file "invalid_character_in_string.mligo", line 1, chara
|} ];
run_ligo_bad [ "compile-contract" ; "../../test/lexer/invalid_character_in_string.religo" ; "main" ] ;
[%expect {|
[%expect {|
ligo: : Lexical error in file "invalid_character_in_string.religo", line 1, characters 9-10:
Invalid character in string.
Hint: Remove or replace the character.
@@ -370,4 +371,4 @@ ligo: : Lexical error in file "invalid_character_in_string.religo", line 1, char
* Ask a question on our Discord: https://discord.gg/9rhYaEt
* Open a gitlab issue: https://gitlab.com/ligolang/ligo/issues/new
* Check the changelog by running 'ligo changelog'
|} ]
|} ]

View File

@@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile
../shared/Lexer.mli
../shared/Lexer.mll
../shared/LexerLib.ml
../shared/EvalOpt.ml
../shared/EvalOpt.mli
../shared/FQueue.ml

View File

@@ -102,138 +102,167 @@ type t =
| EOF of Region.t (* End of file *)
(* Projections *)
type token = t
let proj_token = function
ARROW region -> region, "ARROW"
| CONS region -> region, "CONS"
| CAT region -> region, "CAT"
| MINUS region -> region, "MINUS"
| PLUS region -> region, "PLUS"
| SLASH region -> region, "SLASH"
| TIMES region -> region, "TIMES"
| LPAR region -> region, "LPAR"
| RPAR region -> region, "RPAR"
| LBRACKET region -> region, "LBRACKET"
| RBRACKET region -> region, "RBRACKET"
| LBRACE region -> region, "LBRACE"
| RBRACE region -> region, "RBRACE"
| COMMA region -> region, "COMMA"
| SEMI region -> region, "SEMI"
| VBAR region -> region, "VBAR"
| COLON region -> region, "COLON"
| DOT region -> region, "DOT"
| WILD region -> region, "WILD"
| EQ region -> region, "EQ"
| NE region -> region, "NE"
| LT region -> region, "LT"
| GT region -> region, "GT"
| LE region -> region, "LE"
| GE region -> region, "GE"
| BOOL_OR region -> region, "BOOL_OR"
| BOOL_AND region -> region, "BOOL_AND"
| Ident Region.{region; value} ->
region, sprintf "Ident %s" value
| Constr Region.{region; value} ->
region, sprintf "Constr %s" value
(* Literals *)
String Region.{region; value} ->
region, sprintf "Str %s" value
| Bytes Region.{region; value = s,b} ->
region,
sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b)
| Int Region.{region; value = s,n} ->
region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
| Nat Region.{region; value = s,n} ->
region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
| Mutez Region.{region; value = s,n} ->
region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
| String Region.{region; value} ->
region, sprintf "Str %s" value
| Bytes Region.{region; value = s,b} ->
region,
sprintf "Bytes (\"%s\", \"0x%s\")"
s (Hex.show b)
| Ident Region.{region; value} ->
region, sprintf "Ident %s" value
| Constr Region.{region; value} ->
region, sprintf "Constr %s" value
| Attr Region.{region; value} ->
region, sprintf "Attr \"%s\"" value
(* Symbols *)
| ARROW region -> region, "ARROW"
| CONS region -> region, "CONS"
| CAT region -> region, "CAT"
| MINUS region -> region, "MINUS"
| PLUS region -> region, "PLUS"
| SLASH region -> region, "SLASH"
| TIMES region -> region, "TIMES"
| LPAR region -> region, "LPAR"
| RPAR region -> region, "RPAR"
| LBRACKET region -> region, "LBRACKET"
| RBRACKET region -> region, "RBRACKET"
| LBRACE region -> region, "LBRACE"
| RBRACE region -> region, "RBRACE"
| COMMA region -> region, "COMMA"
| SEMI region -> region, "SEMI"
| VBAR region -> region, "VBAR"
| COLON region -> region, "COLON"
| DOT region -> region, "DOT"
| WILD region -> region, "WILD"
| EQ region -> region, "EQ"
| NE region -> region, "NE"
| LT region -> region, "LT"
| GT region -> region, "GT"
| LE region -> region, "LE"
| GE region -> region, "GE"
| BOOL_OR region -> region, "BOOL_OR"
| BOOL_AND region -> region, "BOOL_AND"
(* Keywords *)
| Begin region -> region, "Begin"
| Else region -> region, "Else"
| End region -> region, "End"
| Else region -> region, "Else"
| End region -> region, "End"
| False region -> region, "False"
| Fun region -> region, "Fun"
| Rec region -> region, "Rec"
| If region -> region, "If"
| In region -> region, "In"
| Let region -> region, "Let"
| Fun region -> region, "Fun"
| Rec region -> region, "Rec"
| If region -> region, "If"
| In region -> region, "In"
| Let region -> region, "Let"
| Match region -> region, "Match"
| Mod region -> region, "Mod"
| Not region -> region, "Not"
| Of region -> region, "Of"
| Or region -> region, "Or"
| Then region -> region, "Then"
| True region -> region, "True"
| Type region -> region, "Type"
| With region -> region, "With"
| C_None region -> region, "C_None"
| C_Some region -> region, "C_Some"
| Mod region -> region, "Mod"
| Not region -> region, "Not"
| Of region -> region, "Of"
| Or region -> region, "Or"
| Then region -> region, "Then"
| True region -> region, "True"
| Type region -> region, "Type"
| With region -> region, "With"
(* Data *)
| C_None region -> region, "C_None"
| C_Some region -> region, "C_Some"
(* Virtual tokens *)
| EOF region -> region, "EOF"
let to_lexeme = function
ARROW _ -> "->"
| CONS _ -> "::"
| CAT _ -> "^"
| MINUS _ -> "-"
| PLUS _ -> "+"
| SLASH _ -> "/"
| TIMES _ -> "*"
| LPAR _ -> "("
| RPAR _ -> ")"
| LBRACKET _ -> "["
| RBRACKET _ -> "]"
| LBRACE _ -> "{"
| RBRACE _ -> "}"
| COMMA _ -> ","
| SEMI _ -> ";"
| VBAR _ -> "|"
| COLON _ -> ":"
| DOT _ -> "."
| WILD _ -> "_"
| EQ _ -> "="
| NE _ -> "<>"
| LT _ -> "<"
| GT _ -> ">"
| LE _ -> "<="
| GE _ -> ">="
| BOOL_OR _ -> "||"
| BOOL_AND _ -> "&&"
| Ident id -> id.Region.value
| Constr id -> id.Region.value
let to_lexeme = function
(* Literals *)
String s -> String.escaped s.Region.value
| Bytes b -> fst b.Region.value
| Int i
| Nat i
| Mutez i -> fst i.Region.value
| String s -> String.escaped s.Region.value
| Bytes b -> fst b.Region.value
| Attr a -> a.Region.value
| Mutez i -> fst i.Region.value
| Ident id -> id.Region.value
| Constr id -> id.Region.value
| Attr a -> a.Region.value
(* Symbols *)
| ARROW _ -> "->"
| CONS _ -> "::"
| CAT _ -> "^"
| MINUS _ -> "-"
| PLUS _ -> "+"
| SLASH _ -> "/"
| TIMES _ -> "*"
| LPAR _ -> "("
| RPAR _ -> ")"
| LBRACKET _ -> "["
| RBRACKET _ -> "]"
| LBRACE _ -> "{"
| RBRACE _ -> "}"
| COMMA _ -> ","
| SEMI _ -> ";"
| VBAR _ -> "|"
| COLON _ -> ":"
| DOT _ -> "."
| WILD _ -> "_"
| EQ _ -> "="
| NE _ -> "<>"
| LT _ -> "<"
| GT _ -> ">"
| LE _ -> "<="
| GE _ -> ">="
| BOOL_OR _ -> "||"
| BOOL_AND _ -> "&&"
(* Keywords *)
| Begin _ -> "begin"
| Else _ -> "else"
| End _ -> "end"
| Else _ -> "else"
| End _ -> "end"
| False _ -> "false"
| Fun _ -> "fun"
| Rec _ -> "rec"
| If _ -> "if"
| In _ -> "in"
| Let _ -> "let"
| Fun _ -> "fun"
| Rec _ -> "rec"
| If _ -> "if"
| In _ -> "in"
| Let _ -> "let"
| Match _ -> "match"
| Mod _ -> "mod"
| Not _ -> "not"
| Of _ -> "of"
| Or _ -> "or"
| True _ -> "true"
| Type _ -> "type"
| Then _ -> "then"
| With _ -> "with"
| Mod _ -> "mod"
| Not _ -> "not"
| Of _ -> "of"
| Or _ -> "or"
| True _ -> "true"
| Type _ -> "type"
| Then _ -> "then"
| With _ -> "with"
(* Data constructors *)
| C_None _ -> "None"
| C_Some _ -> "Some"
(* Virtual tokens *)
| EOF _ -> ""
(* CONVERSIONS *)
let to_string token ?(offsets=true) mode =
let region, val_str = proj_token token in
let reg_str = region#compact ~offsets mode
@@ -241,10 +270,6 @@ let to_string token ?(offsets=true) mode =
let to_region token = proj_token token |> fst
(* Injections *)
type int_err = Non_canonical_zero
(* LEXIS *)
let keywords = [
@@ -385,6 +410,8 @@ let mk_bytes lexeme region =
let value = lexeme, `Hex norm
in Bytes Region.{region; value}
type int_err = Non_canonical_zero
let mk_int lexeme region =
let z =
Str.(global_replace (regexp "_") "" lexeme) |> Z.of_string
@@ -398,23 +425,21 @@ type nat_err =
let mk_nat lexeme region =
match (String.index_opt lexeme 'n') with
| None -> Error Invalid_natural
| Some _ -> (
let z =
Str.(global_replace (regexp "_") "" lexeme) |>
Str.(global_replace (regexp "n") "") |>
Z.of_string in
if Z.equal z Z.zero && lexeme <> "0n"
then Error Non_canonical_zero_nat
else Ok (Nat Region.{region; value = lexeme,z})
)
None -> Error Invalid_natural
| Some _ -> let z =
Str.(global_replace (regexp "_") "" lexeme) |>
Str.(global_replace (regexp "n") "") |>
Z.of_string in
if Z.equal z Z.zero && lexeme <> "0n"
then Error Non_canonical_zero_nat
else Ok (Nat Region.{region; value = lexeme,z})
let mk_mutez lexeme region =
let z =
Str.(global_replace (regexp "_") "" lexeme) |>
Str.(global_replace (regexp "mutez") "") |>
Z.of_string in
if Z.equal z Z.zero && lexeme <> "0mutez"
if Z.equal z Z.zero && lexeme <> "0mutez"
then Error Non_canonical_zero
else Ok (Mutez Region.{region; value = lexeme, z})
@@ -422,8 +447,6 @@ let eof region = EOF region
type sym_err = Invalid_symbol
type attr_err = Invalid_attribute
let mk_sym lexeme region =
match lexeme with
(* Lexemes in common with all concrete syntaxes *)
@@ -473,24 +496,27 @@ let mk_constr lexeme region =
(* Attributes *)
type attr_err = Invalid_attribute
let mk_attr header lexeme region =
if header = "[@" then
Error Invalid_attribute
if header = "[@" then Error Invalid_attribute
else Ok (Attr Region.{value=lexeme; region})
(* Predicates *)
let is_string = function String _ -> true | _ -> false
let is_bytes = function Bytes _ -> true | _ -> false
let is_int = function Int _ -> true | _ -> false
let is_ident = function Ident _ -> true | _ -> false
let is_eof = function EOF _ -> true | _ -> false
let is_bytes = function Bytes _ -> true | _ -> false
let is_int = function Int _ -> true | _ -> false
let is_ident = function Ident _ -> true | _ -> false
let is_eof = function EOF _ -> true | _ -> false
let is_minus = function MINUS _ -> true | _ -> false
(* Errors *)
type error =
Odd_lengthed_bytes
| Missing_break
| Negative_byte_sequence
let error_to_string = function
Odd_lengthed_bytes ->
@@ -499,6 +525,9 @@ let error_to_string = function
| Missing_break ->
"Missing break.\n\
Hint: Insert some space."
| Negative_byte_sequence ->
"Negative byte sequence.\n\
Hint: Remove the leading minus sign."
exception Error of error Region.reg
@@ -511,29 +540,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file =
let fail region value = raise (Error Region.{region; value})
let check_right_context token next_token buffer : unit =
if not (is_eof token) then
if is_int token || is_bytes token then
match next_token buffer with
Some ([], next) ->
let pos = (to_region token)#stop in
let region = Region.make ~start:pos ~stop:pos in
if is_int next then
fail region Odd_lengthed_bytes
else
if is_ident next || is_string next || is_bytes next then
fail region Missing_break
| Some (_::_, _) | None -> ()
else
if is_ident token || is_string token then
match next_token buffer with
Some ([], next) ->
if is_ident next || is_string next
|| is_bytes next || is_int next
then
let pos = (to_region token)#stop in
let region = Region.make ~start:pos ~stop:pos
in fail region Missing_break
| Some (_::_, _) | None -> ()
let pos = (to_region token)#stop in
let region = Region.make ~start:pos ~stop:pos in
match next_token buffer with
None -> ()
| Some (markup, next) ->
if is_minus token && is_bytes next
then let region =
Region.cover (to_region token) (to_region next)
in fail region Negative_byte_sequence
else
match markup with
[] ->
if is_int token
then if is_string next || is_ident next
then fail region Missing_break
else ()
else
if is_string token
then if is_int next || is_bytes next || is_ident next
then fail region Missing_break
else ()
else
if is_bytes token
then if is_string next || is_ident next
then fail region Missing_break
else if is_int next
then fail region Odd_lengthed_bytes
else ()
else ()
| _::_ -> ()
(* END TRAILER *)
}

View File

@@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile
../shared/Lexer.mli
../shared/Lexer.mll
../shared/LexerLib.mli
../shared/LexerLib.ml
../shared/EvalOpt.ml
../shared/EvalOpt.mli

View File

@@ -1,4 +1,4 @@
(** This signature defines the lexical tokens for LIGO
(* This signature defines the lexical tokens for LIGO
_Tokens_ are the abstract units which are used by the parser to
build the abstract syntax tree (AST), in other words, the stream of

View File

@@ -5,24 +5,14 @@
(* Shorthands *)
type lexeme = string
let sprintf = Printf.sprintf
module Region = Simple_utils.Region
module Pos = Simple_utils.Pos
module SMap = Map.Make (String)
module SSet = Set.Make (String)
(* Hack to roll back one lexeme in the current semantic action *)
(*
let rollback buffer =
let open Lexing in
let len = String.length (lexeme buffer) in
let pos_cnum = buffer.lex_curr_p.pos_cnum - len in
buffer.lex_curr_pos <- buffer.lex_curr_pos - len;
buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum}
*)
type lexeme = string
let sprintf = Printf.sprintf
(* TOKENS *)
@@ -123,6 +113,8 @@ type t =
| EOF of Region.t
(* Projections *)
type token = t
let proj_token = function
@@ -130,32 +122,20 @@ let proj_token = function
String Region.{region; value} ->
region, sprintf "String %s" value
| Bytes Region.{region; value = s,b} ->
region,
sprintf "Bytes (\"%s\", \"0x%s\")"
s (Hex.show b)
sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b)
| Int Region.{region; value = s,n} ->
region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
| Nat Region.{region; value = s,n} ->
region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
| Mutez Region.{region; value = s,n} ->
region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
| Ident Region.{region; value} ->
region, sprintf "Ident \"%s\"" value
| Constr Region.{region; value} ->
region, sprintf "Constr \"%s\"" value
(*
| Attr {header; string={region; value}} ->
region, sprintf "Attr (\"%s\",\"%s\")" header value
*)
(* Symbols *)
| SEMI region -> region, "SEMI"
@@ -244,7 +224,7 @@ let to_lexeme = function
| Bytes b -> fst b.Region.value
| Int i
| Nat i
| Mutez i -> fst i.Region.value
| Mutez i -> fst i.Region.value
| Ident id
| Constr id -> id.Region.value
@@ -382,9 +362,7 @@ let keywords = [
(fun reg -> With reg)
]
let reserved =
let open SSet in
empty |> add "arguments"
let reserved = SSet.empty
let constructors = [
(fun reg -> False reg);
@@ -484,22 +462,20 @@ type nat_err =
let mk_nat lexeme region =
match String.index_opt lexeme 'n' with
None -> Error Invalid_natural
| Some _ ->
let z =
Str.(global_replace (regexp "_") "" lexeme) |>
Str.(global_replace (regexp "n") "") |>
Z.of_string in
if Z.equal z Z.zero && lexeme <> "0n"
then Error Non_canonical_zero_nat
else Ok (Nat Region.{region; value = lexeme,z})
None -> Error Invalid_natural
| Some _ -> let z =
Str.(global_replace (regexp "_") "" lexeme) |>
Str.(global_replace (regexp "n") "") |>
Z.of_string in
if Z.equal z Z.zero && lexeme <> "0n"
then Error Non_canonical_zero_nat
else Ok (Nat Region.{region; value = lexeme,z})
let mk_mutez lexeme region =
let z =
Str.(global_replace (regexp "_") "" lexeme) |>
Str.(global_replace (regexp "mutez") "") |>
Z.of_string in
if Z.equal z Z.zero && lexeme <> "0mutez"
let z = Str.(global_replace (regexp "_") "" lexeme) |>
Str.(global_replace (regexp "mutez") "") |>
Z.of_string in
if Z.equal z Z.zero && lexeme <> "0mutez"
then Error Non_canonical_zero
else Ok (Mutez Region.{region; value = lexeme, z})
@@ -557,22 +533,23 @@ let mk_constr lexeme region =
type attr_err = Invalid_attribute
let mk_attr _header _string _region =
Error Invalid_attribute
let mk_attr _ _ _ = Error Invalid_attribute
(* Predicates *)
let is_string = function String _ -> true | _ -> false
let is_bytes = function Bytes _ -> true | _ -> false
let is_int = function Int _ -> true | _ -> false
let is_ident = function Ident _ -> true | _ -> false
let is_eof = function EOF _ -> true | _ -> false
let is_bytes = function Bytes _ -> true | _ -> false
let is_int = function Int _ -> true | _ -> false
let is_ident = function Ident _ -> true | _ -> false
let is_eof = function EOF _ -> true | _ -> false
let is_minus = function MINUS _ -> true | _ -> false
(* Errors *)
type error =
Odd_lengthed_bytes
| Missing_break
| Negative_byte_sequence
let error_to_string = function
Odd_lengthed_bytes ->
@@ -581,6 +558,9 @@ let error_to_string = function
| Missing_break ->
"Missing break.\n\
Hint: Insert some space."
| Negative_byte_sequence ->
"Negative byte sequence.\n\
Hint: Remove the leading minus sign."
exception Error of error Region.reg
@@ -593,29 +573,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file =
let fail region value = raise (Error Region.{region; value})
let check_right_context token next_token buffer : unit =
if not (is_eof token) then
if is_int token || is_bytes token then
match next_token buffer with
Some ([], next) ->
let pos = (to_region token)#stop in
let region = Region.make ~start:pos ~stop:pos in
if is_int next then
fail region Odd_lengthed_bytes
else
if is_ident next || is_string next || is_bytes next then
fail region Missing_break
| Some (_::_, _) | None -> ()
else
if is_ident token || is_string token then
match next_token buffer with
Some ([], next) ->
if is_ident next || is_string next
|| is_bytes next || is_int next
then
let pos = (to_region token)#stop in
let region = Region.make ~start:pos ~stop:pos
in fail region Missing_break
| Some (_::_, _) | None -> ()
let pos = (to_region token)#stop in
let region = Region.make ~start:pos ~stop:pos in
match next_token buffer with
None -> ()
| Some (markup, next) ->
if is_minus token && is_bytes next
then let region =
Region.cover (to_region token) (to_region next)
in fail region Negative_byte_sequence
else
match markup with
[] ->
if is_int token
then if is_string next || is_ident next
then fail region Missing_break
else ()
else
if is_string token
then if is_int next || is_bytes next || is_ident next
then fail region Missing_break
else ()
else
if is_bytes token
then if is_string next || is_ident next
then fail region Missing_break
else if is_int next
then fail region Odd_lengthed_bytes
else ()
else ()
| _::_ -> ()
(* END TRAILER *)
}

View File

@@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile
../shared/Lexer.mli
../shared/Lexer.mll
../shared/LexerLib.ml
../shared/EvalOpt.ml
../shared/EvalOpt.mli
../shared/FQueue.ml

View File

@@ -31,48 +31,49 @@ type lexeme = string
type t =
(* Symbols *)
CAT of Region.t (* "++" *)
CAT of Region.t (* "++" *)
(* Arithmetics *)
| MINUS of Region.t (* "-" *)
| PLUS of Region.t (* "+" *)
| SLASH of Region.t (* "/" *)
| TIMES of Region.t (* "*" *)
| MINUS of Region.t (* "-" *)
| PLUS of Region.t (* "+" *)
| SLASH of Region.t (* "/" *)
| TIMES of Region.t (* "*" *)
(* Compounds *)
| LPAR of Region.t (* "(" *)
| RPAR of Region.t (* ")" *)
| LPAR of Region.t (* "(" *)
| RPAR of Region.t (* ")" *)
| LBRACKET of Region.t (* "[" *)
| RBRACKET of Region.t (* "]" *)
| LBRACE of Region.t (* "{" *)
| RBRACE of Region.t (* "}" *)
| LBRACE of Region.t (* "{" *)
| RBRACE of Region.t (* "}" *)
(* Separators *)
| COMMA of Region.t (* "," *)
| SEMI of Region.t (* ";" *)
| VBAR of Region.t (* "|" *)
| COLON of Region.t (* ":" *)
| DOT of Region.t (* "." *)
| COMMA of Region.t (* "," *)
| SEMI of Region.t (* ";" *)
| VBAR of Region.t (* "|" *)
| COLON of Region.t (* ":" *)
| DOT of Region.t (* "." *)
| ELLIPSIS of Region.t (* "..." *)
| ARROW of Region.t (* "=>" *)
(* Wildcard *)
| WILD of Region.t (* "_" *)
| WILD of Region.t (* "_" *)
(* Comparisons *)
| EQ of Region.t (* "=" *)
| EQEQ of Region.t (* "==" *)
| NE of Region.t (* "!=" *)
| LT of Region.t (* "<" *)
| GT of Region.t (* ">" *)
| LE of Region.t (* "=<" *)
| GE of Region.t (* ">=" *)
| EQ of Region.t (* "=" *)
| EQEQ of Region.t (* "==" *)
| NE of Region.t (* "!=" *)
| LT of Region.t (* "<" *)
| GT of Region.t (* ">" *)
| LE of Region.t (* "<=" *)
| GE of Region.t (* ">=" *)
| ARROW of Region.t (* "=>" *)
(* Logic *)
| BOOL_OR of Region.t (* "||" *)
| BOOL_AND of Region.t (* "&&" *)
@@ -91,18 +92,18 @@ type t =
(* Keywords *)
| Else of Region.t
| False of Region.t
| If of Region.t
| Let of Region.t
| Rec of Region.t
| Else of Region.t
| False of Region.t
| If of Region.t
| Let of Region.t
| Mod of Region.t
| Or of Region.t
| Rec of Region.t
| Switch of Region.t
| Mod of Region.t
| Or of Region.t
| True of Region.t
| Type of Region.t
| True of Region.t
| Type of Region.t
(* Data constructors *)
(* Data constructors *)
| C_None of Region.t (* "None" *)
| C_Some of Region.t (* "Some" *)
@@ -111,8 +112,6 @@ type t =
| EOF of Region.t (* End of file *)
type token = t
(* Projections
The difference between extracting the lexeme and a string from a
@@ -121,6 +120,8 @@ type token = t
lexeme (concrete syntax).
*)
type token = t
val to_lexeme : token -> lexeme
val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string
val to_region : token -> Region.t
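(* A minimal sketch of the contrast drawn above, assuming a token
   [arrow] built elsewhere with [mk_sym "=>" region]:

     to_lexeme arrow                        (* "=>" (concrete syntax) *)
     to_string arrow ~offsets:true `Point   (* e.g. "ARROW" plus its region *)
*)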

View File

@@ -1,66 +1,69 @@
{
(* START OF HEADER *)
type lexeme = string
let sprintf = Printf.sprintf
(* Shorthands *)
module Region = Simple_utils.Region
module Pos = Simple_utils.Pos
module SMap = Utils.String.Map
module SSet = Utils.String.Set
type lexeme = string
let sprintf = Printf.sprintf
(* TOKENS *)
type t =
(* Symbols *)
CAT of Region.t (* "++" *)
CAT of Region.t (* "++" *)
(* Arithmetics *)
| MINUS of Region.t (* "-" *)
| PLUS of Region.t (* "+" *)
| SLASH of Region.t (* "/" *)
| TIMES of Region.t (* "*" *)
| MINUS of Region.t (* "-" *)
| PLUS of Region.t (* "+" *)
| SLASH of Region.t (* "/" *)
| TIMES of Region.t (* "*" *)
(* Compounds *)
| LPAR of Region.t (* "(" *)
| RPAR of Region.t (* ")" *)
| LBRACKET of Region.t (* "[" *)
| RBRACKET of Region.t (* "]" *)
| LBRACE of Region.t (* "{" *)
| RBRACE of Region.t (* "}" *)
| LPAR of Region.t (* "(" *)
| RPAR of Region.t (* ")" *)
| LBRACKET of Region.t (* "[" *)
| RBRACKET of Region.t (* "]" *)
| LBRACE of Region.t (* "{" *)
| RBRACE of Region.t (* "}" *)
(* Separators *)
| COMMA of Region.t (* "," *)
| SEMI of Region.t (* ";" *)
| VBAR of Region.t (* "|" *)
| COLON of Region.t (* ":" *)
| DOT of Region.t (* "." *)
| COMMA of Region.t (* "," *)
| SEMI of Region.t (* ";" *)
| VBAR of Region.t (* "|" *)
| COLON of Region.t (* ":" *)
| DOT of Region.t (* "." *)
| ELLIPSIS of Region.t (* "..." *)
| ARROW of Region.t (* "=>" *)
(* Wildcard *)
| WILD of Region.t (* "_" *)
| WILD of Region.t (* "_" *)
(* Comparisons *)
| EQ of Region.t (* "=" *)
| EQEQ of Region.t (* "==" *)
| NE of Region.t (* "!=" *)
| LT of Region.t (* "<" *)
| GT of Region.t (* ">" *)
| LE of Region.t (* "<=" *)
| GE of Region.t (* ">=" *)
| ARROW of Region.t (* "=>" *)
| EQ of Region.t (* "=" *)
| EQEQ of Region.t (* "==" *)
| NE of Region.t (* "!=" *)
| LT of Region.t (* "<" *)
| GT of Region.t (* ">" *)
| LE of Region.t (* "<=" *)
| GE of Region.t (* ">=" *)
| BOOL_OR of Region.t (* "||" *)
| BOOL_AND of Region.t (* "&&" *)
(* Logic *)
| NOT of Region.t (* ! *)
| BOOL_OR of Region.t (* "||" *)
| BOOL_AND of Region.t (* "&&" *)
| NOT of Region.t (* ! *)
(* Identifiers, labels, numbers and strings *)
@@ -75,17 +78,17 @@ type t =
(* Keywords *)
(*| And*)
| Else of Region.t
| False of Region.t
| If of Region.t
| Let of Region.t
| Rec of Region.t
| Else of Region.t
| False of Region.t
| If of Region.t
| Let of Region.t
| Mod of Region.t
| Or of Region.t
| Rec of Region.t
| Switch of Region.t
| Mod of Region.t
| Or of Region.t
| True of Region.t
| Type of Region.t
| True of Region.t
| Type of Region.t
(* Data constructors *)
| C_None of Region.t (* "None" *)
@@ -96,121 +99,143 @@ type t =
| EOF of Region.t (* End of file *)
(* Projections *)
type token = t
let proj_token = function
CAT region -> region, "CAT"
| MINUS region -> region, "MINUS"
| PLUS region -> region, "PLUS"
| SLASH region -> region, "SLASH"
| TIMES region -> region, "TIMES"
| LPAR region -> region, "LPAR"
| RPAR region -> region, "RPAR"
| LBRACKET region -> region, "LBRACKET"
| RBRACKET region -> region, "RBRACKET"
| LBRACE region -> region, "LBRACE"
| RBRACE region -> region, "RBRACE"
| COMMA region -> region, "COMMA"
| SEMI region -> region, "SEMI"
| VBAR region -> region, "VBAR"
| COLON region -> region, "COLON"
| DOT region -> region, "DOT"
| ELLIPSIS region -> region, "ELLIPSIS"
| WILD region -> region, "WILD"
| EQ region -> region, "EQ"
| EQEQ region -> region, "EQEQ"
| NE region -> region, "NE"
| LT region -> region, "LT"
| GT region -> region, "GT"
| LE region -> region, "LE"
| GE region -> region, "GE"
| ARROW region -> region, "ARROW"
| BOOL_OR region -> region, "BOOL_OR"
| BOOL_AND region -> region, "BOOL_AND"
| Ident Region.{region; value} ->
region, sprintf "Ident %s" value
| Constr Region.{region; value} ->
region, sprintf "Constr %s" value
(* Literals *)
String Region.{region; value} ->
region, sprintf "String %s" value
| Bytes Region.{region; value = s,b} ->
region,
sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b)
| Int Region.{region; value = s,n} ->
region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
| Nat Region.{region; value = s,n} ->
region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
| Mutez Region.{region; value = s,n} ->
region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
| String Region.{region; value} ->
region, sprintf "String %s" value
| Bytes Region.{region; value = s,b} ->
region,
sprintf "Bytes (\"%s\", \"0x%s\")"
s (Hex.show b)
| Else region -> region, "Else"
| False region -> region, "False"
| If region -> region, "If"
| Let region -> region, "Let"
| Rec region -> region, "Rec"
| Switch region -> region, "Switch"
| Mod region -> region, "Mod"
| NOT region -> region, "!"
| Or region -> region, "Or"
| True region -> region, "True"
| Type region -> region, "Type"
| C_None region -> region, "C_None"
| C_Some region -> region, "C_Some"
| Attr Region.{region; value} -> region, sprintf "Attr %s" value
| EOF region -> region, "EOF"
| Ident Region.{region; value} ->
region, sprintf "Ident %s" value
| Constr Region.{region; value} ->
region, sprintf "Constr %s" value
(* Symbols *)
| CAT region -> region, "CAT"
| MINUS region -> region, "MINUS"
| PLUS region -> region, "PLUS"
| SLASH region -> region, "SLASH"
| TIMES region -> region, "TIMES"
| LPAR region -> region, "LPAR"
| RPAR region -> region, "RPAR"
| LBRACKET region -> region, "LBRACKET"
| RBRACKET region -> region, "RBRACKET"
| LBRACE region -> region, "LBRACE"
| RBRACE region -> region, "RBRACE"
| COMMA region -> region, "COMMA"
| SEMI region -> region, "SEMI"
| VBAR region -> region, "VBAR"
| COLON region -> region, "COLON"
| DOT region -> region, "DOT"
| ELLIPSIS region -> region, "ELLIPSIS"
| WILD region -> region, "WILD"
| EQ region -> region, "EQ"
| EQEQ region -> region, "EQEQ"
| NE region -> region, "NE"
| LT region -> region, "LT"
| GT region -> region, "GT"
| LE region -> region, "LE"
| GE region -> region, "GE"
| ARROW region -> region, "ARROW"
| NOT region -> region, "NOT"
| BOOL_OR region -> region, "BOOL_OR"
| BOOL_AND region -> region, "BOOL_AND"
| Else region -> region, "Else"
| False region -> region, "False"
| If region -> region, "If"
| Let region -> region, "Let"
| Rec region -> region, "Rec"
| Switch region -> region, "Switch"
| Mod region -> region, "Mod"
| Or region -> region, "Or"
| True region -> region, "True"
| Type region -> region, "Type"
| C_None region -> region, "C_None"
| C_Some region -> region, "C_Some"
| Attr Region.{region; value} -> region, sprintf "Attr %s" value
| EOF region -> region, "EOF"
let to_lexeme = function
CAT _ -> "++"
| MINUS _ -> "-"
| PLUS _ -> "+"
| SLASH _ -> "/"
| TIMES _ -> "*"
| LPAR _ -> "("
| RPAR _ -> ")"
| LBRACKET _ -> "["
| RBRACKET _ -> "]"
| LBRACE _ -> "{"
| RBRACE _ -> "}"
| COMMA _ -> ","
| SEMI _ -> ";"
| VBAR _ -> "|"
| COLON _ -> ":"
| DOT _ -> "."
| ELLIPSIS _ -> "..."
| WILD _ -> "_"
| EQ _ -> "="
| EQEQ _ -> "=="
| NE _ -> "!="
| LT _ -> "<"
| GT _ -> ">"
| LE _ -> "<="
| GE _ -> ">="
| ARROW _ -> "=>"
| BOOL_OR _ -> "||"
| BOOL_AND _ -> "&&"
| Ident id -> id.Region.value
| Constr id -> id.Region.value
(* Literals *)
String s -> s.Region.value
| Bytes b -> fst b.Region.value
| Int i
| Nat i
| Mutez i -> fst i.Region.value
| String s -> s.Region.value
| Bytes b -> fst b.Region.value
| Else _ -> "else"
| False _ -> "false"
| If _ -> "if"
| Let _ -> "let"
| Rec _ -> "rec"
| Mod _ -> "mod"
| NOT _ -> "!"
| Or _ -> "or"
| Switch _ -> "switch"
| True _ -> "true"
| Type _ -> "type"
| Mutez i -> fst i.Region.value
| Ident id -> id.Region.value
| Constr id -> id.Region.value
| Attr a -> a.Region.value
(* Symbols *)
| CAT _ -> "++"
| MINUS _ -> "-"
| PLUS _ -> "+"
| SLASH _ -> "/"
| TIMES _ -> "*"
| LPAR _ -> "("
| RPAR _ -> ")"
| LBRACKET _ -> "["
| RBRACKET _ -> "]"
| LBRACE _ -> "{"
| RBRACE _ -> "}"
| COMMA _ -> ","
| SEMI _ -> ";"
| VBAR _ -> "|"
| COLON _ -> ":"
| DOT _ -> "."
| ELLIPSIS _ -> "..."
| WILD _ -> "_"
| EQ _ -> "="
| EQEQ _ -> "=="
| NE _ -> "!="
| LT _ -> "<"
| GT _ -> ">"
| LE _ -> "<="
| GE _ -> ">="
| ARROW _ -> "=>"
| BOOL_OR _ -> "||"
| BOOL_AND _ -> "&&"
| NOT _ -> "!"
(* Keywords *)
| Else _ -> "else"
| False _ -> "false"
| If _ -> "if"
| Let _ -> "let"
| Mod _ -> "mod"
| Or _ -> "or"
| Rec _ -> "rec"
| Switch _ -> "switch"
| True _ -> "true"
| Type _ -> "type"
(* Data constructors *)
| C_None _ -> "None"
| C_Some _ -> "Some"
| Attr a -> a.Region.value
(* Virtual tokens *)
| EOF _ -> ""
(* CONVERSIONS *)
let to_string token ?(offsets=true) mode =
let region, val_str = proj_token token in
let reg_str = region#compact ~offsets mode
@@ -261,12 +286,9 @@ let reserved =
|> add "functor"
|> add "inherit"
|> add "initializer"
(* |> add "land" - see https://ligo.atlassian.net/browse/LIGO-263 *)
|> add "lazy"
(* |> add "lor" - see https://ligo.atlassian.net/browse/LIGO-263 *)
|> add "lsl"
|> add "lsr"
(* |> add "lxor" - see https://ligo.atlassian.net/browse/LIGO-263 *)
|> add "match"
|> add "method"
|> add "module"
@@ -291,7 +313,7 @@ let reserved =
let constructors = [
(fun reg -> C_None reg);
(fun reg -> C_Some reg);
(fun reg -> C_Some reg)
]
let add map (key, value) = SMap.add key value map
@@ -376,20 +398,18 @@ let mk_int lexeme region =
else Ok (Int Region.{region; value = lexeme, z})
let mk_nat lexeme region =
let z =
Str.(global_replace (regexp "_") "" lexeme) |>
Str.(global_replace (regexp "n") "") |>
Z.of_string in
if Z.equal z Z.zero && lexeme <> "0n"
let z = Str.(global_replace (regexp "_") "" lexeme) |>
Str.(global_replace (regexp "n") "") |>
Z.of_string in
if Z.equal z Z.zero && lexeme <> "0n"
then Error Non_canonical_zero_nat
else Ok (Nat Region.{region; value = lexeme, z})
let mk_mutez lexeme region =
let z =
Str.(global_replace (regexp "_") "" lexeme) |>
Str.(global_replace (regexp "mutez") "") |>
Z.of_string in
if Z.equal z Z.zero && lexeme <> "0mutez"
let z = Str.(global_replace (regexp "_") "" lexeme) |>
Str.(global_replace (regexp "mutez") "") |>
Z.of_string in
if Z.equal z Z.zero && lexeme <> "0mutez"
then Error Non_canonical_zero
else Ok (Mutez Region.{region; value = lexeme, z})
@@ -426,11 +446,11 @@ let mk_sym lexeme region =
(* Symbols specific to ReasonLIGO *)
| "..." -> Ok (ELLIPSIS region)
| "=>" -> Ok (ARROW region)
| "==" -> Ok (EQEQ region)
| "!" -> Ok (NOT region)
| "++" -> Ok (CAT region)
| "..." -> Ok (ELLIPSIS region)
| "=>" -> Ok (ARROW region)
| "==" -> Ok (EQEQ region)
| "!" -> Ok (NOT region)
| "++" -> Ok (CAT region)
(* Invalid symbols *)
@@ -461,16 +481,18 @@ let mk_attr header lexeme region =
(* Predicates *)
let is_string = function String _ -> true | _ -> false
let is_bytes = function Bytes _ -> true | _ -> false
let is_int = function Int _ -> true | _ -> false
let is_ident = function Ident _ -> true | _ -> false
let is_eof = function EOF _ -> true | _ -> false
let is_bytes = function Bytes _ -> true | _ -> false
let is_int = function Int _ -> true | _ -> false
let is_ident = function Ident _ -> true | _ -> false
let is_eof = function EOF _ -> true | _ -> false
let is_minus = function MINUS _ -> true | _ -> false
(* Errors *)
type error =
Odd_lengthed_bytes
| Missing_break
| Negative_byte_sequence
let error_to_string = function
Odd_lengthed_bytes ->
@@ -479,6 +501,9 @@ let error_to_string = function
| Missing_break ->
"Missing break.\n\
Hint: Insert some space."
| Negative_byte_sequence ->
"Negative byte sequence.\n\
Hint: Remove the leading minus sign."
exception Error of error Region.reg
@@ -491,29 +516,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file =
let fail region value = raise (Error Region.{region; value})
let check_right_context token next_token buffer : unit =
if not (is_eof token) then
if is_int token || is_bytes token then
match next_token buffer with
Some ([], next) ->
let pos = (to_region token)#stop in
let region = Region.make ~start:pos ~stop:pos in
if is_int next then
fail region Odd_lengthed_bytes
else
if is_ident next || is_string next || is_bytes next then
fail region Missing_break
| Some (_::_, _) | None -> ()
else
if is_ident token || is_string token then
match next_token buffer with
Some ([], next) ->
if is_ident next || is_string next
|| is_bytes next || is_int next
then
let pos = (to_region token)#stop in
let region = Region.make ~start:pos ~stop:pos
in fail region Missing_break
| Some (_::_, _) | None -> ()
let pos = (to_region token)#stop in
let region = Region.make ~start:pos ~stop:pos in
match next_token buffer with
None -> ()
| Some (markup, next) ->
if is_minus token && is_bytes next
then let region =
Region.cover (to_region token) (to_region next)
in fail region Negative_byte_sequence
else
match markup with
[] ->
if is_int token
then if is_string next || is_ident next
then fail region Missing_break
else ()
else
if is_string token
then if is_int next || is_bytes next || is_ident next
then fail region Missing_break
else ()
else
if is_bytes token
then if is_string next || is_ident next
then fail region Missing_break
else if is_int next
then fail region Odd_lengthed_bytes
else ()
else ()
| _::_ -> ()
(* END TRAILER *)
}

View File

@@ -1,2 +0,0 @@
$HOME/git/OCaml-build/Makefile
$HOME/git/OCaml-build/Makefile.cfg

View File

@@ -38,8 +38,6 @@
module Region = Simple_utils.Region
module Pos = Simple_utils.Pos
type lexeme = string
(* TOKENS *)
(* The signature [TOKEN] exports an abstract type [token], so a lexer
@@ -54,6 +52,8 @@ type lexeme = string
reading the ocamllex specification for the lexer ([Lexer.mll]).
*)
type lexeme = string
module type TOKEN =
sig
type token
@@ -112,10 +112,36 @@ module type TOKEN =
unit
end
(* The signature of the lexer *)
module type S =
sig
module Token : TOKEN
type token = Token.token
(* The scanner [init] is meant to be called first to read the
BOM. Then [scan] is called. *)
val init : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
(* Errors (specific to the generic lexer, not to the tokens) *)
type error
val error_to_string : error -> string
exception Error of error Region.reg
val format_error :
?offsets:bool -> [`Byte | `Point] ->
error Region.reg -> file:bool -> string Region.reg
end
(* The functorised interface
Note that the module parameter [Token] is re-exported as a
submodule in [S].
*)
module Make (Token: TOKEN) : LexerLib.S with module Token = Token
module Make (Token : TOKEN) : S with module Token = Token
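(* A minimal sketch of the scanning protocol above, assuming an
   instantiation [module Lexer = Make (LexToken)] and an initial
   state built with [LexerLib.mk_state]: [init] is called exactly
   once, so that a leading BOM is consumed, after which every
   refill of the token queue goes through [scan].

     let state = Lexer.init state lexbuf in   (* first call only  *)
     let state = Lexer.scan state lexbuf in   (* every later call *)
     ...
*)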

View File

@@ -8,12 +8,6 @@ module Pos = Simple_utils.Pos
(* START HEADER *)
type lexeme = string
(* ALIASES *)
let sprintf = Printf.sprintf
(* TOKENS *)
(* The signature [TOKEN] exports an abstract type [token], so a lexer
@@ -22,6 +16,8 @@ let sprintf = Printf.sprintf
characterises the virtual token for end-of-file, because it requires
special handling. *)
type lexeme = string
module type TOKEN =
sig
type token
@@ -84,31 +80,39 @@ module type TOKEN =
submodule in [S].
*)
module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) =
module type S =
sig
module Token : TOKEN
type token = Token.token
val init : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
type error
val error_to_string : error -> string
exception Error of error Region.reg
val format_error :
?offsets:bool -> [`Byte | `Point] ->
error Region.reg -> file:bool -> string Region.reg
end
module Make (Token : TOKEN) : (S with module Token = Token) =
struct
module Token = Token
type token = Token.token
type file_path = string
type line_comment = LexerLib.line_comment
type block_comment = LexerLib.block_comment
let mk_block = LexerLib.mk_block
(* ERRORS *)
type error =
Invalid_utf8_sequence
| Unexpected_character of char
| Undefined_escape_sequence
(* | Missing_break*)
| Unterminated_string
(* | Unterminated_integer*)
(* | Odd_lengthed_bytes*)
| Unterminated_comment of string
(* | Orphan_minus*)
| Non_canonical_zero
(* | Negative_byte_sequence *)
| Broken_string
| Invalid_character_in_string
| Reserved_name of string
@@ -116,6 +120,8 @@ module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) =
| Invalid_natural
| Invalid_attribute
let sprintf = Printf.sprintf
let error_to_string = function
Invalid_utf8_sequence ->
"Invalid UTF-8 sequence."
@@ -124,30 +130,15 @@ module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) =
| Undefined_escape_sequence ->
"Undefined escape sequence.\n\
Hint: Remove or replace the sequence."
(* | Missing_break ->
"Missing break.\n\
Hint: Insert some space."
*) | Unterminated_string ->
| Unterminated_string ->
"Unterminated string.\n\
Hint: Close with double quotes."
(* | Unterminated_integer ->
"Unterminated integer.\n\
Hint: Remove the sign or proceed with a natural number." *)
(* | Odd_lengthed_bytes ->
"The length of the byte sequence is an odd number.\n\
Hint: Add or remove a digit."
*) | Unterminated_comment ending ->
| Unterminated_comment ending ->
sprintf "Unterminated comment.\n\
Hint: Close with \"%s\"." ending
(* | Orphan_minus ->
"Orphan minus sign.\n\
Hint: Remove the trailing space." *)
| Non_canonical_zero ->
"Non-canonical zero.\n\
Hint: Use 0."
(* | Negative_byte_sequence ->
"Negative byte sequence.\n\
Hint: Remove the leading minus sign." *)
| Broken_string ->
"The string starting here is interrupted by a line break.\n\
Hint: Remove the break, close the string before or insert a \
@@ -319,13 +310,13 @@ let symbol = common_sym | pascaligo_sym | cameligo_sym | reasonligo_sym
(* Comments *)
let pascaligo_block_comment_opening = "(*"
let pascaligo_block_comment_closing = "*)"
let pascaligo_line_comment = "//"
let pascaligo_block_comment_opening = "(*"
let pascaligo_block_comment_closing = "*)"
let pascaligo_line_comment = "//"
let cameligo_block_comment_opening = "(*"
let cameligo_block_comment_closing = "*)"
let cameligo_line_comment = "//"
let cameligo_block_comment_opening = "(*"
let cameligo_block_comment_closing = "*)"
let cameligo_line_comment = "//"
let reasonligo_block_comment_opening = "/*"
let reasonligo_block_comment_closing = "*/"
@@ -369,6 +360,7 @@ and scan state = parse
nl { scan (state#push_newline lexbuf) lexbuf }
| ' '+ { scan (state#push_space lexbuf) lexbuf }
| '\t'+ { scan (state#push_tabs lexbuf) lexbuf }
| ident { mk_ident state lexbuf }
| constr { mk_constr state lexbuf }
| bytes { mk_bytes seq state lexbuf }
@@ -420,27 +412,14 @@ and scan state = parse
let state = state#set_pos pos in
scan state lexbuf }
(* Some special errors *)
(*
| '-' { let region, _, state = state#sync lexbuf in
let state = scan state lexbuf in
let open Markup in
match FQueue.peek state#units with
None -> assert false
| Some (_, ((Space _ | Tabs _)::_, token))
when Token.is_int token -> fail region Orphan_minus
| _ -> fail region Unterminated_integer }
| "-0x" byte_seq?
{ let region, _, _ = state#sync lexbuf
in fail region Negative_byte_sequence }
*)
(* String *)
| '"' { let opening, lexeme, state = state#sync lexbuf in
let thread = LexerLib.mk_thread opening lexeme in
scan_string thread state lexbuf |> mk_string }
(* Comments *)
| block_comment_openings {
let lexeme = Lexing.lexeme lexbuf in
match state#block with
@@ -496,8 +475,6 @@ and scan_flags state acc = parse
| eof { let _, _, state = state#sync lexbuf
in List.rev acc, state }
(* TODO: Move below to [LexerCommon.mll] *)
(* Finishing a string *)
and scan_string thread state = parse
@@ -624,164 +601,6 @@
{
(* START TRAILER *)
(* Scanning the lexing buffer for tokens (and markup, as a
side-effect).
Because we want the lexer to have access to the right lexical
context of a recognised lexeme (to enforce stylistic constraints or
report special error patterns), we need to keep a hidden reference
to a queue of recognised lexical units (that is, tokens and markup)
that acts as a mutable state between the calls to [read]. When
[read] is called, that queue is examined first and, if it contains
at least one token, that token is returned; otherwise, the lexing
buffer is scanned for at least one more new token. That is the
general principle: we put a high-level buffer (our queue) on top of
the low-level lexing buffer.
One tricky and important detail is that we must make any parser
generated by Menhir (and calling [read]) believe that the last
region of the input source that was matched indeed corresponds to
the returned token, even though many tokens and markup may have
been matched since it was actually read from the input. In other
words, the parser requests a token that is taken from the
high-level buffer, but the parser requests the source regions from
the _low-level_ lexing buffer, and they may disagree if more than
one token has actually been recognised.
Consequently, in order to maintain a consistent view for the
parser, we have to patch some fields of the lexing buffer, namely
[lex_start_p] and [lex_curr_p], as these fields are read by parsers
generated by Menhir when querying source positions (regions). This
is the purpose of the function [patch_buffer]. After reading one
or more tokens and markup by the scanning rule [scan], we have to
save in the hidden reference [buf_reg] the region of the source
that was matched by [scan]. This atomic sequence of patching,
scanning and saving is implemented by the _function_ [scan]
(beware: it shadows the scanning rule [scan]). The function
[patch_buffer] is, of course, also called just before returning the
token, so the parser has a view of the lexing buffer consistent
with the token.
Note that an additional reference [first_call] is needed to
distinguish the first call to the function [scan], as the first
scanning rule is actually [init] (which can handle the BOM), not
[scan].
*)
type logger = Markup.t list -> token -> unit
type input =
File of file_path
| String of string
| Channel of in_channel
| Buffer of Lexing.lexbuf
type instance = {
input : input;
read : log:logger -> Lexing.lexbuf -> token;
buffer : Lexing.lexbuf;
get_win : unit -> token LexerLib.window;
get_pos : unit -> Pos.t;
get_last : unit -> Region.t;
get_file : unit -> file_path;
close : unit -> unit
}
type open_err = File_opening of string
let lexbuf_from_input = function
File path ->
(try
let chan = open_in path in
let close () = close_in chan in
let lexbuf = Lexing.from_channel chan in
let () =
let open Lexing in
lexbuf.lex_curr_p <- {lexbuf.lex_curr_p with pos_fname=path}
in Ok (lexbuf, close)
with Sys_error msg -> Stdlib.Error (File_opening msg))
| String s ->
Ok (Lexing.from_string s, fun () -> ())
| Channel chan ->
let close () = close_in chan in
Ok (Lexing.from_channel chan, close)
| Buffer b -> Ok (b, fun () -> ())
let open_token_stream ?line ?block input =
let file_path = match input with
File path -> path
| _ -> "" in
let pos = Pos.min ~file:file_path in
let buf_reg = ref (pos#byte, pos#byte)
and first_call = ref true
and decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in
let supply = Uutf.Manual.src decoder in
let state = ref (LexerLib.mk_state
~units:FQueue.empty
~last:Region.ghost
~window:LexerLib.Nil
~pos
~markup:[]
~decoder
~supply
?block
?line
()) in
let get_pos () = !state#pos
and get_last () = !state#last
and get_win () = !state#window
and get_file () = file_path in
let patch_buffer (start, stop) buffer =
let open Lexing in
let file_path = buffer.lex_curr_p.pos_fname in
buffer.lex_start_p <- {start with pos_fname = file_path};
buffer.lex_curr_p <- {stop with pos_fname = file_path}
and save_region buffer =
buf_reg := Lexing.(buffer.lex_start_p, buffer.lex_curr_p) in
let scan buffer =
patch_buffer !buf_reg buffer;
(if !first_call
then (state := init !state buffer; first_call := false)
else state := scan !state buffer);
save_region buffer in
let next_token buffer =
scan buffer;
match FQueue.peek !state#units with
None -> None
| Some (units, ext_token) ->
state := !state#set_units units; Some ext_token in
let rec read ~log buffer =
match FQueue.deq !state#units with
None ->
scan buffer;
read ~log buffer
| Some (units, (left_mark, token)) ->
log left_mark token;
state := ((!state#set_units units)
#set_last (Token.to_region token))
#slide_token token;
Token.check_right_context token next_token buffer;
patch_buffer (Token.to_region token)#byte_pos buffer;
token in
match lexbuf_from_input input with
Ok (buffer, close) ->
let () =
match input with
File path when path <> "" -> LexerLib.reset ~file:path buffer
| _ -> () in
let instance = {
input; read; buffer; get_win; get_pos; get_last; get_file; close}
in Ok instance
| Error _ as e -> e
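(* A minimal driver sketch for [open_token_stream] above, assuming
   [module Lexer = Make (LexToken)]: open a token stream over an
   in-memory string, pull tokens with a no-op logger until EOF is
   reached, then close the instance.

     let () =
       match Lexer.open_token_stream (Lexer.String "let x = 1") with
         Ok {read; buffer; close; _} ->
           let log _markup _token = () in
           let rec loop () =
             if not (Lexer.Token.is_eof (read ~log buffer))
             then loop ()
           in loop (); close ()
       | Error (Lexer.File_opening msg) -> prerr_endline msg
*)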
end (* of functor [Make] in HEADER *)
(* END TRAILER *)
}

View File

@@ -1,159 +1,6 @@
(* Sliding window *)
(* The type [window] models a two-slot buffer of tokens for reporting
after a parse error. Technically, it is a parametric type, but its
use is meant for tokens, wherever they are defined.
In [Two(t1,t2)], the token [t2] is the next to be sent to the
parser.
The call [slide token buffer] pushes the token [token] in the
buffer [buffer]. If the buffer is full, that is, it is
[Two(t1,t2)], then the token [t2] is discarded to make room for
[token].
*)
type 'a window =
Nil
| One of 'a
| Two of 'a * 'a
let slide token = function
Nil -> One token
| One t | Two (t,_) -> Two (token,t)
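(* A sketch of the sliding behaviour described above: the window
   retains at most the last two items, newest first.

     slide "c" (slide "b" (slide "a" Nil))
     (* = Two ("c", "b"); "a" has been discarded *)
*)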
module Region = Simple_utils.Region
module Pos = Simple_utils.Pos
type lexeme = string
(* The signature [TOKEN] exports an abstract type [token], so a lexer
can be a functor over tokens. This makes it possible to externalise
version-dependent constraints in any module whose signature matches
[TOKEN]. Generic functions to construct tokens are required.
Note the predicate [is_eof], which characterises the virtual token
for end-of-file, because it requires special handling.
*)
module type TOKEN =
sig
type token
(* Predicates *)
val is_eof : token -> bool
(* Projections *)
val to_lexeme : token -> lexeme
val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string
val to_region : token -> Region.t
(* Style *)
type error
val error_to_string : error -> string
exception Error of error Region.reg
val format_error :
?offsets:bool ->
[`Byte | `Point] ->
error Region.reg ->
file:bool ->
string Region.reg
val check_right_context :
token ->
(Lexing.lexbuf -> (Markup.t list * token) option) ->
Lexing.lexbuf ->
unit
end
(* The module type for lexers is [S]. It mainly exports the function
[open_token_stream], which returns
* a function [read] that extracts tokens from a lexing buffer,
together with a lexing buffer [buffer] to read from,
* a function [close] that closes that buffer,
* a function [get_pos] that returns the current position, and
* a function [get_last] that returns the region of the last
recognised token.
* a function [get_file] that returns the name of the file being
scanned (empty string if [stdin]).
Note that a module [Token] is exported too, because the signature
of the exported functions depend on it.
The type [window] is a two-token window, that is, a buffer that
contains the last recognised token, and the penultimate (if any).
The call [read ~log] evaluates in a lexer (also known as a
tokeniser or scanner) whose type is [Lexing.lexbuf -> token], and
suitable for a parser generated by Menhir. The argument labelled
[log] is a logger, that is, it may print a token and its left
markup to a given channel, at the caller's discretion.
*)
module type S =
sig
module Token : TOKEN
type token = Token.token
type file_path = string
type logger = Markup.t list -> token -> unit
type input =
File of file_path
| String of string
| Channel of in_channel
| Buffer of Lexing.lexbuf
type instance = {
input : input;
read : log:logger -> Lexing.lexbuf -> token;
buffer : Lexing.lexbuf;
get_win : unit -> token window;
get_pos : unit -> Pos.t;
get_last : unit -> Region.t;
get_file : unit -> file_path;
close : unit -> unit
}
type open_err = File_opening of string
val lexbuf_from_input :
input -> (Lexing.lexbuf * (unit -> unit), open_err) Stdlib.result
type line_comment = string (* Opening of a line comment *)
type block_comment = <opening : string; closing : string>
val mk_block : opening:string -> closing:string -> block_comment
val open_token_stream :
?line:line_comment ->
?block:block_comment ->
input ->
(instance, open_err) Stdlib.result
(* Error reporting *)
type error
val error_to_string : error -> string
exception Error of error Region.reg
val format_error :
?offsets:bool ->
[`Byte | `Point] ->
error Region.reg ->
file:bool ->
string Region.reg
end
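(* Illustration (an editor's sketch, not from this commit): a block
   comment in the PascaLIGO style could be described by

     let block = mk_block ~opening:"(*" ~closing:"*)"

   and then passed to [open_token_stream ~block ...]. *)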
(* LEXER ENGINE *)
(* Resetting file name and line number in the lexing buffer
@ -205,27 +52,20 @@ let rollback buffer =
buffer.lex_curr_pos <- buffer.lex_curr_pos - len;
buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum}
(* Utility types *)
type file_path = string
type lexeme = string
(* THREAD FOR STRUCTURED CONSTRUCTS (STRINGS, COMMENTS) *)
(* When scanning structured constructs, like strings and comments, we
need to keep the region of the opening symbol (like double quote,
"//" or "(*") in order to report any error more precisely. Since
ocamllex is byte-oriented, we need to store the parsed bytes as
characters in an accumulator [acc] and also its length [len], so
that, when we are done, it is easy to build the string making up the
structured construct with [mk_str] (see above).
The resulting data structure is called a _thread_. (Note for
Emacs: "*)".)
*)
type thread = <
opening : Region.t;
length : int;
acc : char list;
push_char : char -> thread;
opening : Region.t;
length : int;
acc : char list;
to_string : string;
push_char : char -> thread;
push_string : string -> thread;
to_string : string;
set_opening : Region.t -> thread
>
@ -233,12 +73,12 @@ let mk_thread region lexeme : thread =
(* The call [explode s a] is the list made by pushing the characters
in the string [s] on top of [a], in reverse order. For example,
[explode "ba" ['c';'d'] = ['a'; 'b'; 'c'; 'd']]. *)
let explode s acc =
let rec push = function
0 -> acc
| i -> s.[i-1] :: push (i-1)
in push (String.length s)
in
in push (String.length s) in
object
val opening = region
method opening = opening
@ -268,184 +108,288 @@ let mk_thread region lexeme : thread =
method to_string =
let bytes = Bytes.make length ' ' in
let rec fill i = function
[] -> bytes
| char::l -> Bytes.set bytes i char; fill (i-1) l
[] -> bytes
| char::l -> Bytes.set bytes i char; fill (i-1) l
in fill (length-1) acc |> Bytes.to_string
end
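(* Illustration (an editor's sketch, not from this commit), assuming
   the second argument of [mk_thread] seeds the accumulator:

     let t = mk_thread Region.ghost "" in
     let t = t#push_char 'a' in
     let t = t#push_string "bc"
     in t#to_string

   is expected to evaluate to "abc", with [t#length] being [3]. *)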
type file_path = string
(* STATE *)
(* Scanning the lexing buffer for tokens (and markup, as a
side-effect).
(* Beyond tokens, the result of lexing is a state. The type
[state] represents the logical state of the lexing engine, that
is, a value which is threaded during scanning and which denotes
useful, high-level information beyond what the type
[Lexing.lexbuf] in the standard library already provides for
all generic lexers.
Because we want the lexer to have access to the right lexical
context of a recognised lexeme (to enforce stylistic constraints or
report special error patterns), we need to keep a hidden reference
to a queue of recognised lexical units (that is, tokens and markup)
that acts as a mutable state between the calls to [read]. When
[read] is called, that queue is examined first and, if it contains
at least one token, that token is returned; otherwise, the lexing
buffer is scanned for at least one more new token. That is the
general principle: we put a high-level buffer (our queue) on top of
the low-level lexing buffer.
Tokens are the smallest units used by the parser to build the
abstract syntax tree. The state includes a queue of recognised
tokens, with the markup at the left of its lexeme until either
the start of the file or the end of the previously recognised
token.
One tricky and important detail is that we must make any parser
generated by Menhir (and calling [read]) believe that the last
region of the input source that was matched indeed corresponds to
the returned token, even though many tokens and markup may have
been matched since it was actually read from the input. In other
words, the parser requests a token that is taken from the
high-level buffer, but the parser requests the source regions from
the _low-level_ lexing buffer, and they may disagree if more than
one token has actually been recognised.
The markup from the last recognised token or, if the first
token has not been recognised yet, from the beginning of the
file is stored in the field [markup] of the state because it is
a side-effect, with respect to the output token list, and we
use a record with a single field [units] because that record
may be easily extended during the future maintenance of this
lexer.
Consequently, in order to maintain a consistent view for the
parser, we have to patch some fields of the lexing buffer, namely
[lex_start_p] and [lex_curr_p], as these fields are read by parsers
generated by Menhir when querying source positions (regions). This
is the purpose of the function [patch_buffer]. After reading one or
more tokens and markup by the scanning rule [scan], we have to save
in the hidden reference [buf_reg] the region of the source that was
matched by [scan]. This atomic sequence of patching, scanning and
saving is implemented by the _function_ [scan] (beware: it shadows
the scanning rule [scan]). The function [patch_buffer] is, of
course, also called just before returning the token, so the parser
has a view of the lexing buffer consistent with the token.
The state also includes a field [pos] which holds the current
position in the LIGO source file. The position is not always
updated after a single character has been matched: that depends
on the regular expression that matched the lexing buffer.
Note that an additional reference [first_call] is needed to
distinguish the first call to the function [scan], as the first
scanning rule is actually [init] (which can handle the BOM), not
[scan].
*)
The field [window] is a two-token window, that is, a buffer
that contains the last recognised token, and the penultimate
(if any).
type 'token window =
Nil
| One of 'token
| Two of 'token * 'token
The fields [decoder] and [supply] offer the support needed
for the lexing of UTF-8 encoded characters in comments (the
only place where they are allowed in LIGO). The former is the
decoder proper and the latter is the effectful function
[supply] that takes a byte, a start index and a length and feeds
it to [decoder]. See the documentation of the third-party
library Uutf.
*)
type 'token state = <
units : (Markup.t list * 'token) FQueue.t;
markup : Markup.t list;
window : 'token window;
last : Region.t;
pos : Pos.t;
decoder : Uutf.decoder;
supply : Bytes.t -> int -> int -> unit;
block : EvalOpt.block_comment option;
line : EvalOpt.line_comment option;
type line_comment = string (* Opening of a line comment *)
type block_comment = <opening : string; closing : string>
enqueue : 'token -> 'token state;
set_units : (Markup.t list * 'token) FQueue.t -> 'token state;
set_last : Region.t -> 'token state;
set_pos : Pos.t -> 'token state;
slide_token : 'token -> 'token state;
let mk_block ~opening ~closing : block_comment =
object
method opening = opening
method closing = closing
end
sync : Lexing.lexbuf -> Region.t * lexeme * 'token state;
type 'a state = <
units : (Markup.t list * 'a) FQueue.t;
markup : Markup.t list;
window : 'a window;
last : Region.t;
pos : Pos.t;
decoder : Uutf.decoder;
supply : Bytes.t -> int -> int -> unit;
block : block_comment option;
line : line_comment option;
push_newline : Lexing.lexbuf -> 'token state;
push_line : thread -> 'token state;
push_block : thread -> 'token state;
push_space : Lexing.lexbuf -> 'token state;
push_tabs : Lexing.lexbuf -> 'token state;
push_bom : Lexing.lexbuf -> 'token state;
push_markup : Markup.t -> 'token state;
>
enqueue : 'a -> 'a state;
set_units : (Markup.t list * 'a) FQueue.t -> 'a state;
set_last : Region.t -> 'a state;
set_pos : Pos.t -> 'a state;
slide_token : 'a -> 'a state;
let mk_state ~units ~markup ~window ~last ~pos ~decoder ~supply
?block ?line () : _ state =
object (self)
val units = units
method units = units
val markup = markup
method markup = markup
val window = window
method window = window
val last = last
method last = last
val pos = pos
method pos = pos
method decoder = decoder
method supply = supply
method block = block
method line = line
sync : Lexing.lexbuf -> Region.t * lexeme * 'a state;
method enqueue token =
{< units = FQueue.enq (markup, token) units;
markup = [] >}
push_newline : Lexing.lexbuf -> 'a state;
push_line : thread -> 'a state;
push_block : thread -> 'a state;
push_space : Lexing.lexbuf -> 'a state;
push_tabs : Lexing.lexbuf -> 'a state;
push_bom : Lexing.lexbuf -> 'a state;
push_markup : Markup.t -> 'a state;
>
method set_units units = {< units = units >}
method set_last region = {< last = region >}
method set_pos pos = {< pos = pos >}
let mk_state ~units ~markup ~window ~last ~pos ~decoder ~supply
?block ?line () : _ state =
object (self)
val units = units
method units = units
val markup = markup
method markup = markup
val window = window
method window = window
val last = last
method last = last
val pos = pos
method pos = pos
method decoder = decoder
method supply = supply
method block = block
method line = line
method slide_token token =
match self#window with
Nil -> {< window = One token >}
| One t | Two (t,_) -> {< window = Two (token,t) >}
(* The call [enqueue (token, state)] updates functionally the
state [state] by associating the token [token] with the
stored markup and enqueuing the pair into the units
queue. The field [markup] is then reset to the empty
list. *)
method sync buffer =
let lex = Lexing.lexeme buffer in
let len = String.length lex in
let start = pos in
let stop = start#shift_bytes len in
let state = {< pos = stop >}
in Region.make ~start ~stop, lex, state
method enqueue token =
{< units = FQueue.enq (markup, token) units;
markup = [] >}
(* MARKUP *)
method set_units units = {< units = units >}
method set_last region = {< last = region >}
method set_pos pos = {< pos = pos >}
(* Committing markup to the current logical state *)
method slide_token token =
{< window = slide token window >}
method push_markup unit = {< markup = unit :: markup >}
(* The call [sync state buffer] updates the current position
in accordance with the contents of the lexing buffer, more
precisely, depending on the length of the string which has
just been recognised by the scanner: that length is used as
a positive offset to the current column. *)
method push_newline buffer =
let () = Lexing.new_line buffer in
let value = Lexing.lexeme buffer in
let start = self#pos in
let stop = start#new_line value in
let region = Region.make ~start ~stop in
let unit = Markup.Newline Region.{region; value}
in {< pos = stop; markup = unit::markup >}
method sync buffer =
let lex = Lexing.lexeme buffer in
let len = String.length lex in
let start = pos in
let stop = start#shift_bytes len in
let state = {< pos = stop >}
in Region.make ~start ~stop, lex, state
method push_line thread =
let start = thread#opening#start in
let region = Region.make ~start ~stop:self#pos
and value = thread#to_string in
let unit = Markup.LineCom Region.{region; value}
in {< markup = unit::markup >}
(* MARKUP *)
method push_block thread =
let start = thread#opening#start in
let region = Region.make ~start ~stop:self#pos
and value = thread#to_string in
let unit = Markup.BlockCom Region.{region; value}
in {< markup = unit::markup >}
(* Committing markup to the current logical state *)
method push_space buffer =
let region, lex, state = self#sync buffer in
let value = String.length lex in
let unit = Markup.Space Region.{region; value}
in state#push_markup unit
method push_markup unit = {< markup = unit :: markup >}
method push_tabs buffer =
let region, lex, state = self#sync buffer in
let value = String.length lex in
let unit = Markup.Tabs Region.{region; value}
in state#push_markup unit
method push_newline buffer =
let () = Lexing.new_line buffer in
let value = Lexing.lexeme buffer in
let start = self#pos in
let stop = start#new_line value in
let region = Region.make ~start ~stop in
let unit = Markup.Newline Region.{region; value}
in {< pos = stop; markup = unit::markup >}
method push_bom buffer =
let region, value, state = self#sync buffer in
let unit = Markup.BOM Region.{region; value}
in state#push_markup unit
end
method push_line thread =
let start = thread#opening#start in
let region = Region.make ~start ~stop:self#pos
and value = thread#to_string in
let unit = Markup.LineCom Region.{region; value}
in {< markup = unit::markup >}
(* LEXER INSTANCE *)
method push_block thread =
let start = thread#opening#start in
let region = Region.make ~start ~stop:self#pos
and value = thread#to_string in
let unit = Markup.BlockCom Region.{region; value}
in {< markup = unit::markup >}
type input =
File of file_path
| String of string
| Channel of in_channel
| Buffer of Lexing.lexbuf
method push_space buffer =
let region, lex, state = self#sync buffer in
let value = String.length lex in
let unit = Markup.Space Region.{region; value}
in state#push_markup unit
type 'token logger = Markup.t list -> 'token -> unit
method push_tabs buffer =
let region, lex, state = self#sync buffer in
let value = String.length lex in
let unit = Markup.Tabs Region.{region; value}
in state#push_markup unit
type 'token instance = {
input : input;
read : log:('token logger) -> Lexing.lexbuf -> 'token;
buffer : Lexing.lexbuf;
get_win : unit -> 'token window;
get_pos : unit -> Pos.t;
get_last : unit -> Region.t;
get_file : unit -> file_path;
close : unit -> unit
}
method push_bom buffer =
let region, value, state = self#sync buffer in
let unit = Markup.BOM Region.{region; value}
in state#push_markup unit
type open_err = File_opening of string
end
let lexbuf_from_input = function
String s ->
Ok (Lexing.from_string s, fun () -> ())
| Channel chan ->
let close () = close_in chan in
Ok (Lexing.from_channel chan, close)
| Buffer b ->
Ok (b, fun () -> ())
| File path ->
try
let chan = open_in path in
let close () = close_in chan in
let lexbuf = Lexing.from_channel chan in
let () =
let open Lexing in
lexbuf.lex_curr_p <- {lexbuf.lex_curr_p with pos_fname=path}
in Ok (lexbuf, close)
with Sys_error msg -> Stdlib.Error (File_opening msg)
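(* Illustration (an editor's sketch, not from this commit; the file
   name is hypothetical):

     match lexbuf_from_input (File "contract.ligo") with
       Ok (lexbuf, close) -> (* ... scan [lexbuf] ... *) close ()
     | Error (File_opening msg) -> prerr_endline msg *)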
let open_token_stream ?line ?block ~init ~scan
~token_to_region ~style input =
let file_path = match input with
File path -> path
| _ -> "" in
let pos = Pos.min ~file:file_path in
let buf_reg = ref (pos#byte, pos#byte)
and first_call = ref true
and decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in
let supply = Uutf.Manual.src decoder in
let state = ref (mk_state
~units:FQueue.empty
~last:Region.ghost
~window:Nil
~pos
~markup:[]
~decoder
~supply
?block
?line
()) in
let get_pos () = !state#pos
and get_last () = !state#last
and get_win () = !state#window
and get_file () = file_path in
let patch_buffer (start, stop) buffer =
let open Lexing in
let file_path = buffer.lex_curr_p.pos_fname in
buffer.lex_start_p <- {start with pos_fname = file_path};
buffer.lex_curr_p <- {stop with pos_fname = file_path}
and save_region buffer =
buf_reg := Lexing.(buffer.lex_start_p, buffer.lex_curr_p) in
let scan' init scan buffer =
patch_buffer !buf_reg buffer;
(if !first_call
then (state := init !state buffer; first_call := false)
else state := scan !state buffer);
save_region buffer in
let next_token init scan buffer =
scan' init scan buffer;
match FQueue.peek !state#units with
None -> None
| Some (units, ext_token) ->
state := !state#set_units units; Some ext_token in
let rec read init scan ~token_to_region ~style ~log buffer =
match FQueue.deq !state#units with
None ->
scan' init scan buffer;
read init scan ~token_to_region ~style ~log buffer
| Some (units, (left_mark, token)) ->
log left_mark token;
state := ((!state#set_units units)
#set_last (token_to_region token))
#slide_token token;
style token (next_token init scan) buffer;
patch_buffer (token_to_region token)#byte_pos buffer;
token in
match lexbuf_from_input input with
Ok (buffer, close) ->
let () =
match input with
File path when path <> "" -> reset ~file:path buffer
| _ -> () in
let instance = {
read = read init scan ~token_to_region ~style;
input; buffer; get_win; get_pos; get_last; get_file; close}
in Ok instance
| Error _ as e -> e
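(* Illustration (an editor's sketch, not from this commit): a client
   like those updated below would read the first token as follows,
   where [Lexer.init], [Lexer.scan] and the [Token] projections come
   from a concrete lexer, and the file name is hypothetical:

     match open_token_stream
             ~init:Lexer.init
             ~scan:Lexer.scan
             ~token_to_region:Lexer.Token.to_region
             ~style:Lexer.Token.check_right_context
             (File "contract.ligo")
     with
       Ok {read; buffer; close; _} ->
         let first = read ~log:(fun _ _ -> ()) buffer
         in close (); ignore first
     | Error (File_opening msg) -> prerr_endline msg *)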

View File

@ -0,0 +1,205 @@
(* A library for writing UTF8-aware lexers *)
module Region = Simple_utils.Region
module Pos = Simple_utils.Pos
(* The function [rollback] resets the lexing buffer to the state it
was when it matched the last regular expression. This function is
safe to use only in the semantic action of the rule which last
matched. *)
val rollback : Lexing.lexbuf -> unit
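(* Illustration (an editor's sketch, not from this commit): within a
   semantic action, [rollback] lets another rule rescan the same
   input, e.g.

     rule scan = parse
       "-" { rollback lexbuf; negative lexbuf }

   where [negative] is a hypothetical scanning rule. *)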
(* Utility types *)
type file_path = string
type lexeme = string
(* THREAD FOR STRUCTURED CONSTRUCTS (STRINGS, COMMENTS) *)
(* When scanning structured constructs, like strings and comments, we
need to keep the region of the opening symbol (like double quote,
"//" or "(*") in order to report any error more precisely. Since
ocamllex is byte-oriented, we need to store the parsed bytes as
characters in an accumulator [acc] and also its length [len], so
that, when we are done, it is easy to build the string making up the
structured construct with [mk_str] (see above).
The resulting data structure is called a _thread_. (Note for
Emacs: "*)".)
*)
type thread = <
opening : Region.t;
length : int;
acc : char list;
to_string : string;
push_char : char -> thread;
push_string : string -> thread;
set_opening : Region.t -> thread
>
val mk_thread : Region.t -> lexeme -> thread
(* STATE *)
(* Beyond producing tokens, the result of lexing is a _state_. The
type [state] represents the abstract logical state of the lexing
engine, that is, a value which is threaded during scanning and
which denotes useful, high-level information beyond what the type
[Lexing.lexbuf] in the standard library already provides for all
generic lexers. We qualify it as "logical state" because the lexing
buffer itself has a "physical state" defined by the type
[Lexing.lexbuf].
Tokens are the smallest units used by the parser to build the
abstract syntax tree. The state includes a queue of recognised
tokens, with the markup at the left of its lexeme until either the
start of the file or the end of the previously recognised token.
The markup from the last recognised token or, if the first token
has not been recognised yet, from the beginning of the file is
stored in the field [markup] of the state because it is a
side-effect, with respect to the output token list, and we use a
record with a single field [units] because that record may be
easily extended during the future maintenance of this lexer.
The state also includes a field [pos] which holds the current
position in the LIGO source file. The position is not always
updated after a single character has been matched: that depends on
the regular expression that matched the lexing buffer.
The field [window] is a two-token window, that is, a buffer that
contains the last recognised token, and the penultimate (if any).
Technically, it is a parametric type, but its use is meant for
tokens, wherever they are defined. In [Two (t1,t2)], and in case
of a syntax error, [t1] is the first invalid token and [t2] is the
last valid one.
The fields [decoder] and [supply] offer the support needed for
the lexing of UTF-8 encoded characters in comments (the only place
where they are allowed in LIGO). The former is the decoder proper
and the latter is the effectful function [supply] that takes a
byte, a start index and a length and feeds it to [decoder]. See the
documentation of the third-party library Uutf.
Some of the methods are documented next.
The call [state#enqueue token] updates functionally the state
[state] by associating the token [token] with the stored markup and
enqueuing the pair into the units queue. The field [markup] is then
reset to the empty list.
The call [state#slide_token token] pushes the token [token] into
the two-token window. If the window is full, that is, it is [Two
(t1,t2)], then the token [t2] is discarded to make room for
[token].
The call [state#sync buffer] updates the current position in
accordance with the contents of the lexing buffer, more precisely,
depending on the length of the string which has just been
recognised by the scanner: that length is used as a positive offset
to the current column.
*)
type 'token window =
Nil
| One of 'token
| Two of 'token * 'token
type 'token state = <
units : (Markup.t list * 'token) FQueue.t;
markup : Markup.t list;
window : 'token window;
last : Region.t;
pos : Pos.t;
decoder : Uutf.decoder;
supply : Bytes.t -> int -> int -> unit;
block : EvalOpt.block_comment option;
line : EvalOpt.line_comment option;
enqueue : 'token -> 'token state;
set_units : (Markup.t list * 'token) FQueue.t -> 'token state;
set_last : Region.t -> 'token state;
set_pos : Pos.t -> 'token state;
slide_token : 'token -> 'token state;
sync : Lexing.lexbuf -> Region.t * lexeme * 'token state;
push_newline : Lexing.lexbuf -> 'token state;
push_line : thread -> 'token state;
push_block : thread -> 'token state;
push_space : Lexing.lexbuf -> 'token state;
push_tabs : Lexing.lexbuf -> 'token state;
push_bom : Lexing.lexbuf -> 'token state;
push_markup : Markup.t -> 'token state;
>
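(* Illustration (an editor's sketch, not from this commit): a typical
   semantic action combines [sync] and [enqueue]; here [mk_token] is
   a hypothetical smart constructor:

     let scan_token mk_token state buffer =
       let region, lexeme, state = state#sync buffer
       in state#enqueue (mk_token lexeme region) *)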
(* LEXER INSTANCE *)
(* The function [open_token_stream] returns a lexer instance made of
* the input [input] of type [input];
* a function [read] that extracts tokens from a lexing buffer,
together with a lexing buffer [buffer] to read from;
* a function [close] that closes that buffer;
* a function [get_pos] that returns the current position;
* a function [get_last] that returns the region of the last
recognised token; and
* a function [get_file] that returns the name of the file being
scanned (empty string if [stdin]).
Note that a module [Token] is exported too, because the signatures
of the exported functions depend on it.
The type [window] is a two-token window, that is, a buffer that
contains the last recognised token, and the penultimate (if any).
The call [open_token_stream ?line ?block ~init ~scan
~token_to_region ~style input] evaluates to an instance whose field
[read] is a lexer (also known as a tokeniser or scanner) of type
[log:('token logger) -> Lexing.lexbuf -> 'token], suitable for a
parser generated by Menhir. The argument labelled [log] is a
logger, that is, it may print a token and its left markup to a
given channel, at the caller's discretion. The argument labelled
[~init] is the scanner to be called first, usually to read the
BOM; then [scan] is used for the following calls. The function
labelled [~style] is used to check stylistic constraints on the
tokens and the markup between them.
*)
type input =
File of file_path
| String of string
| Channel of in_channel
| Buffer of Lexing.lexbuf
type 'token logger = Markup.t list -> 'token -> unit
type 'token instance = {
input : input;
read : log:('token logger) -> Lexing.lexbuf -> 'token;
buffer : Lexing.lexbuf;
get_win : unit -> 'token window;
get_pos : unit -> Pos.t;
get_last : unit -> Region.t;
get_file : unit -> file_path;
close : unit -> unit
}
type open_err = File_opening of string
val lexbuf_from_input :
input -> (Lexing.lexbuf * (unit -> unit), open_err) Stdlib.result
val open_token_stream :
?line:EvalOpt.line_comment ->
?block:EvalOpt.block_comment ->
init:('token state -> Lexing.lexbuf -> 'token state) ->
scan:('token state -> Lexing.lexbuf -> 'token state) ->
token_to_region:('token -> Region.t) ->
style:('token ->
(Lexing.lexbuf -> (Markup.t list * 'token) option) ->
Lexing.lexbuf ->
unit) ->
input ->
('token instance, open_err) Stdlib.result

View File

@ -4,7 +4,8 @@ module Region = Simple_utils.Region
module type S =
sig
module Lexer : LexerLib.S
module Lexer : Lexer.S
type token = Lexer.token
val output_token :
?offsets:bool ->
@ -12,7 +13,7 @@ module type S =
EvalOpt.command ->
out_channel ->
Markup.t list ->
Lexer.token ->
token ->
unit
type file_path = string
@ -22,19 +23,26 @@ module type S =
[`Byte | `Point] ->
?block:EvalOpt.block_comment ->
?line:EvalOpt.line_comment ->
Lexer.input ->
token_to_region:(token -> Region.t) ->
style:(token ->
(Lexing.lexbuf -> (Markup.t list * token) option) ->
Lexing.lexbuf ->
unit) ->
LexerLib.input ->
EvalOpt.command ->
(unit, string Region.reg) Stdlib.result
end
module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) =
module Make (Lexer: Lexer.S) : (S with module Lexer = Lexer) =
struct
module Lexer = Lexer
module Token = Lexer.Token
type token = Lexer.token
(** Pretty-printing in a string the lexemes making up the markup
(* Pretty-printing in a string the lexemes making up the markup
between two tokens, concatenated with the last lexeme
itself. *)
let output_token ?(offsets=true) mode command
channel left_mark token : unit =
let output str = Printf.fprintf channel "%s%!" str in
@ -57,10 +65,17 @@ module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) =
type file_path = string
let trace ?(offsets=true) mode ?block ?line input command :
let trace ?(offsets=true) mode ?block ?line
~token_to_region ~style input command :
(unit, string Region.reg) Stdlib.result =
match Lexer.open_token_stream ?line ?block input with
Ok Lexer.{read; buffer; close; _} ->
match LexerLib.open_token_stream
~init:Lexer.init
~scan:Lexer.scan
~token_to_region
~style
?line ?block input
with
Ok LexerLib.{read; buffer; close; _} ->
let log = output_token ~offsets mode command stdout
and close_all () = flush_all (); close () in
let rec iter () =
@ -69,12 +84,17 @@ module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) =
if Token.is_eof token
then Stdlib.Ok ()
else iter ()
| exception Lexer.Token.Error error ->
let msg =
Lexer.Token.format_error
~offsets mode ~file:true error
in Stdlib.Error msg
| exception Lexer.Error error ->
let msg =
Lexer.format_error ~offsets mode ~file:true error
in Stdlib.Error msg in
let result = iter ()
in close_all (); result
| Stdlib.Error (Lexer.File_opening msg) ->
| Stdlib.Error (LexerLib.File_opening msg) ->
flush_all (); Stdlib.Error (Region.wrap_ghost msg)
end

View File

@ -2,7 +2,8 @@ module Region = Simple_utils.Region
module type S =
sig
module Lexer : LexerLib.S
module Lexer : Lexer.S
type token = Lexer.token
val output_token :
?offsets:bool ->
@ -10,7 +11,7 @@ module type S =
EvalOpt.command ->
out_channel ->
Markup.t list ->
Lexer.token ->
token ->
unit
type file_path = string
@ -20,9 +21,14 @@ module type S =
[`Byte | `Point] ->
?block:EvalOpt.block_comment ->
?line:EvalOpt.line_comment ->
Lexer.input ->
token_to_region:(token -> Region.t) ->
style:(token ->
(Lexing.lexbuf -> (Markup.t list * token) option) ->
Lexing.lexbuf ->
unit) ->
LexerLib.input ->
EvalOpt.command ->
(unit, string Region.reg) Stdlib.result
end
module Make (Lexer: LexerLib.S) : S with module Lexer = Lexer
module Make (Lexer: Lexer.S) : S with module Lexer = Lexer

View File

@ -9,7 +9,7 @@ module type IO =
val options : EvalOpt.options (* CLI options *)
end
module Make (IO: IO) (Lexer: LexerLib.S) =
module Make (IO: IO) (Lexer: Lexer.S) =
struct
(* Error printing and exception tracing *)
@ -39,11 +39,16 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
| Stdlib.Ok pp_buffer ->
(* Running the lexer on the preprocessed input *)
let source = Lexer.String (Buffer.contents pp_buffer) in
match Lexer.open_token_stream ?line:IO.options#line
?block:IO.options#block
source with
Ok Lexer.{read; buffer; close; _} ->
let source = LexerLib.String (Buffer.contents pp_buffer) in
match LexerLib.open_token_stream
?line:IO.options#line
?block:IO.options#block
~init:Lexer.init
~scan:Lexer.scan
~token_to_region:Lexer.Token.to_region
~style:Lexer.Token.check_right_context
source with
Ok LexerLib.{read; buffer; close; _} ->
let close_all () = flush_all (); close () in
let rec read_tokens tokens =
match read ~log:(fun _ _ -> ()) buffer with
@ -51,9 +56,7 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
if Lexer.Token.is_eof token
then Stdlib.Ok (List.rev tokens)
else read_tokens (token::tokens)
(* | exception _ ->
Printf.eprintf "Here\n%!"; exit 1
*) | exception Lexer.Token.Error error ->
| exception Lexer.Token.Error error ->
let file =
match IO.options#input with
None | Some "-" -> false
@ -74,7 +77,7 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
in Stdlib.Error msg in
let result = read_tokens []
in close_all (); result
| Stdlib.Error (Lexer.File_opening msg) ->
| Stdlib.Error (LexerLib.File_opening msg) ->
flush_all (); Stdlib.Error (Region.wrap_ghost msg) in
match IO.options#input with
None -> preproc stdin
@ -115,7 +118,9 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
IO.options#mode
?block:IO.options#block
?line:IO.options#line
(Lexer.String preproc_str)
~token_to_region:Lexer.Token.to_region
~style:Lexer.Token.check_right_context
(LexerLib.String preproc_str)
IO.options#cmd
in match IO.options#input with
None -> preproc stdin

View File

@ -7,7 +7,7 @@ module type IO =
val options : EvalOpt.options (* CLI options *)
end
module Make (IO: IO) (Lexer: LexerLib.S) :
module Make (IO: IO) (Lexer: Lexer.S) :
sig
val scan : unit -> (Lexer.token list, string Region.reg) Stdlib.result
val trace : unit -> (unit, string Region.reg) Stdlib.result

View File

@ -56,8 +56,8 @@ module type PARSER =
(* Main functor *)
module Make (IO: IO)
(Lexer: LexerLib.S)
(Parser: PARSER with type token = Lexer.Token.token)
(Lexer: Lexer.S)
(Parser: PARSER with type token = Lexer.token)
(ParErr: sig val message : int -> string end) =
struct
module I = Parser.MenhirInterpreter
@ -143,14 +143,14 @@ module Make (IO: IO)
~offsets:IO.options#offsets
IO.options#mode IO.options#cmd stdout
let incr_contract Lexer.{read; buffer; get_win; close; _} =
let incr_contract LexerLib.{read; buffer; get_win; close; _} =
let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer
and failure = failure get_win in
let parser = Incr.contract buffer.Lexing.lex_curr_p in
let ast = I.loop_handle success failure supplier parser
in flush_all (); close (); ast
let incr_expr Lexer.{read; buffer; get_win; close; _} =
let incr_expr LexerLib.{read; buffer; get_win; close; _} =
let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer
and failure = failure get_win in
let parser = Incr.interactive_expr buffer.Lexing.lex_curr_p in

View File

@ -55,8 +55,8 @@ module type PARSER =
end
module Make (IO: IO)
(Lexer: LexerLib.S)
(Parser: PARSER with type token = Lexer.Token.token)
(Lexer: Lexer.S)
(Parser: PARSER with type token = Lexer.token)
(ParErr: sig val message : int -> string end) :
sig
(* WARNING: The following parsers may all raise [Lexer.Error] *)
@ -78,8 +78,8 @@ module Make (IO: IO)
exception Point of error
val incr_contract : Lexer.instance -> Parser.ast
val incr_expr : Lexer.instance -> Parser.expr
val incr_contract : Lexer.token LexerLib.instance -> Parser.ast
val incr_expr : Lexer.token LexerLib.instance -> Parser.expr
val format_error :
?offsets:bool -> [`Point | `Byte] -> error -> string Region.reg

View File

@ -37,7 +37,7 @@ module type Printer =
val print_expr : state -> expr -> unit
end
module Make (Lexer: LexerLib.S)
module Make (Lexer: Lexer.S)
(AST: sig type t type expr end)
(Parser: ParserAPI.PARSER
with type ast = AST.t
@ -89,12 +89,12 @@ module Make (Lexer: LexerLib.S)
ParserLog.mk_state ~offsets:SubIO.options#offsets
~mode:SubIO.options#mode
~buffer:output in
let close () = lexer_inst.Lexer.close () in
let close () = lexer_inst.LexerLib.close () in
let expr =
try
if SubIO.options#mono then
let tokeniser = lexer_inst.Lexer.read ~log
and lexbuf = lexer_inst.Lexer.buffer
let tokeniser = lexer_inst.LexerLib.read ~log
and lexbuf = lexer_inst.LexerLib.buffer
in Front.mono_expr tokeniser lexbuf
else
Front.incr_expr lexer_inst
@ -124,12 +124,12 @@ module Make (Lexer: LexerLib.S)
ParserLog.mk_state ~offsets:SubIO.options#offsets
~mode:SubIO.options#mode
~buffer:output in
let close () = lexer_inst.Lexer.close () in
let close () = lexer_inst.LexerLib.close () in
let ast =
try
if SubIO.options#mono then
let tokeniser = lexer_inst.Lexer.read ~log
and lexbuf = lexer_inst.Lexer.buffer
let tokeniser = lexer_inst.LexerLib.read ~log
and lexbuf = lexer_inst.LexerLib.buffer
in Front.mono_contract tokeniser lexbuf
else
Front.incr_contract lexer_inst
@ -163,10 +163,18 @@ module Make (Lexer: LexerLib.S)
| exception Lexer.Error err ->
let file =
lexer_inst.Lexer.buffer.Lexing.lex_curr_p.Lexing.pos_fname in
let error =
Lexer.format_error ~offsets:SubIO.options#offsets
SubIO.options#mode err ~file:(file <> "")
lexer_inst.LexerLib.buffer.Lexing.lex_curr_p.Lexing.pos_fname in
let error = Lexer.format_error
~offsets:SubIO.options#offsets
SubIO.options#mode err ~file:(file <> "")
in Stdlib.Error error
| exception Lexer.Token.Error err ->
let file =
lexer_inst.LexerLib.buffer.Lexing.lex_curr_p.Lexing.pos_fname in
let error = Lexer.Token.format_error
~offsets:SubIO.options#offsets
SubIO.options#mode err ~file:(file <> "")
in Stdlib.Error error
(* Incremental API of Menhir *)
@ -181,7 +189,7 @@ module Make (Lexer: LexerLib.S)
| exception Parser.Error ->
let invalid, valid_opt =
match lexer_inst.Lexer.get_win () with
match lexer_inst.LexerLib.get_win () with
LexerLib.Nil ->
assert false (* Safe: There is always at least EOF. *)
| LexerLib.One invalid -> invalid, None
@ -205,8 +213,8 @@ module Make (Lexer: LexerLib.S)
(* Parsing a contract *)
let gen_parser options input parser =
match Lexer.lexbuf_from_input input with
Stdlib.Error (Lexer.File_opening msg) ->
match LexerLib.lexbuf_from_input input with
Stdlib.Error (LexerLib.File_opening msg) ->
Stdlib.Error (Region.wrap_ghost msg)
| Ok (lexbuf, close) ->
(* Preprocessing the input source *)
@ -224,48 +232,53 @@ module Make (Lexer: LexerLib.S)
(* Lexing and parsing the preprocessed input source *)
let () = close () in
let input' = Lexer.String (Buffer.contents buffer) in
match Lexer.open_token_stream ?line:options#line
?block:options#block
input'
let input' = LexerLib.String (Buffer.contents buffer) in
match LexerLib.open_token_stream
~init:Lexer.init
~scan:Lexer.scan
~token_to_region:Lexer.Token.to_region
~style:Lexer.Token.check_right_context
?line:options#line
?block:options#block
input'
with
Ok instance ->
let open Lexing in
instance.Lexer.buffer.lex_curr_p <-
{instance.Lexer.buffer.lex_curr_p with pos_fname=file};
instance.LexerLib.buffer.lex_curr_p <-
{instance.LexerLib.buffer.lex_curr_p with pos_fname=file};
apply instance parser
| Stdlib.Error (Lexer.File_opening msg) ->
| Stdlib.Error (LexerLib.File_opening msg) ->
Stdlib.Error (Region.wrap_ghost msg)
(* Parsing a contract in a file *)
let contract_in_file (source : string) =
let options = SubIO.make ~input:(Some source) ~expr:false
in gen_parser options (Lexer.File source) parse_contract
in gen_parser options (LexerLib.File source) parse_contract
(* Parsing a contract in a string *)
let contract_in_string (source : string) =
let options = SubIO.make ~input:None ~expr:false in
gen_parser options (Lexer.String source) parse_contract
gen_parser options (LexerLib.String source) parse_contract
(* Parsing a contract in stdin *)
let contract_in_stdin () =
let options = SubIO.make ~input:None ~expr:false in
gen_parser options (Lexer.Channel stdin) parse_contract
gen_parser options (LexerLib.Channel stdin) parse_contract
(* Parsing an expression in a string *)
let expr_in_string (source : string) =
let options = SubIO.make ~input:None ~expr:true in
gen_parser options (Lexer.String source) parse_expr
gen_parser options (LexerLib.String source) parse_expr
(* Parsing an expression in stdin *)
let expr_in_stdin () =
let options = SubIO.make ~input:None ~expr:true in
gen_parser options (Lexer.Channel stdin) parse_expr
gen_parser options (LexerLib.Channel stdin) parse_expr
(* Preprocess only *)

View File

@ -43,7 +43,7 @@ module type Printer =
(* Main functor to make the parser *)
module Make (Lexer : LexerLib.S)
module Make (Lexer : Lexer.S)
(AST : sig type t type expr end)
(Parser : ParserAPI.PARSER
with type ast = AST.t

View File

@ -1 +1 @@
const a: string = -0x222;
const a: string = - (**) 0x2222

View File

@ -1 +1 @@
let a = -0x222;
let a = - (**) 0x2222

View File

@ -1 +1 @@
let a = -0x222;
let a = - /**/ 0x2222;

View File

@ -1 +0,0 @@
let arguments = 1;

View File

@ -1 +1 @@
let end = 1;
let end = 1;