Finished refactoring of lexer.

This commit is contained in:
Christian Rinderknecht 2020-04-28 19:26:31 +02:00
parent 9618a48848
commit 005b7fd69b
27 changed files with 1174 additions and 1079 deletions

View File

@@ -54,7 +54,7 @@ ligo: : Lexical error in file "broken_string.religo", line 1, characters 8-9:
 run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.ligo" ; "main" ] ;
 [%expect {|
-ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters 18-23:
+ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters 18-31:
 Negative byte sequence.
 Hint: Remove the leading minus sign.
 {}
@@ -71,7 +71,7 @@ ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters
 run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.mligo" ; "main" ] ;
 [%expect {|
-ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters 8-13:
+ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters 8-21:
 Negative byte sequence.
 Hint: Remove the leading minus sign.
 {}
@@ -88,7 +88,7 @@ ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters
 run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.religo" ; "main" ] ;
 [%expect {|
-ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, characters 8-13:
+ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, characters 8-21:
 Negative byte sequence.
 Hint: Remove the leading minus sign.
 {}
@@ -103,6 +103,7 @@ ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, character
 * Check the changelog by running 'ligo changelog'
 |} ];
+(*
 run_ligo_bad [ "compile-contract" ; "../../test/lexer/reserved_name.ligo" ; "main" ] ;
 [%expect {|
 ligo: : Lexical error in file "reserved_name.ligo", line 1, characters 4-13:
@@ -119,6 +120,7 @@ ligo: : Lexical error in file "reserved_name.ligo", line 1, characters 4-13:
 * Open a gitlab issue: https://gitlab.com/ligolang/ligo/issues/new
 * Check the changelog by running 'ligo changelog'
 |} ];
+*)
 run_ligo_bad [ "compile-contract" ; "../../test/lexer/reserved_name.religo" ; "main" ] ;
 [%expect {|
@@ -204,10 +206,9 @@ ligo: : Lexical error in file "unexpected_character.religo", line 1, characters
 run_ligo_bad [ "compile-contract" ; "../../test/lexer/unterminated_comment.mligo" ; "main" ] ;
 [%expect {|
-ligo: : Lexical error in file "unterminated_comment.mligo", line 1, characters 0-2:
+ligo: : Preprocessing error in file "../../test/lexer/unterminated_comment.mligo", line 1, characters 0-2:
 Unterminated comment.
-Hint: Close with "*)".
-{}
+Hint: Close with "*)". {}
 If you're not sure how to fix this error, you can
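
The widened ranges in the expected output above (for instance 18-23 becoming 18-31) come from the lexer now reporting a negative byte sequence over the whole lexeme, minus sign included. A minimal sketch of that region arithmetic (not part of this commit), with plain integer offsets standing in for ligo's Simple_utils.Region; the real code uses Region.cover, as the LexToken diffs below show:

    type region = {start : int; stop : int}

    (* Smallest region containing both arguments. *)
    let cover r1 r2 =
      {start = min r1.start r2.start; stop = max r1.stop r2.stop}

    let () =
      let minus = {start = 18; stop = 19}      (* the "-" sign   *)
      and bytes = {start = 19; stop = 31} in   (* the 0x… lexeme *)
      let err = cover minus bytes in
      Printf.printf "characters %d-%d\n" err.start err.stop
      (* prints: characters 18-31 *)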

View File

@@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile
 ../shared/Lexer.mli
 ../shared/Lexer.mll
+../shared/LexerLib.ml
 ../shared/EvalOpt.ml
 ../shared/EvalOpt.mli
 ../shared/FQueue.ml

View File

@@ -102,10 +102,35 @@ type t =
 | EOF of Region.t (* End of file *)
 
+(* Projections *)
+
 type token = t
 
 let proj_token = function
-  ARROW region -> region, "ARROW"
+  (* Literals *)
+
+  String Region.{region; value} ->
+    region, sprintf "Str %s" value
+| Bytes Region.{region; value = s,b} ->
+    region,
+    sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b)
+| Int Region.{region; value = s,n} ->
+    region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
+| Nat Region.{region; value = s,n} ->
+    region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
+| Mutez Region.{region; value = s,n} ->
+    region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
+| Ident Region.{region; value} ->
+    region, sprintf "Ident %s" value
+| Constr Region.{region; value} ->
+    region, sprintf "Constr %s" value
+| Attr Region.{region; value} ->
+    region, sprintf "Attr \"%s\"" value
+
+(* Symbols *)
+
+| ARROW region -> region, "ARROW"
 | CONS region -> region, "CONS"
 | CAT region -> region, "CAT"
 | MINUS region -> region, "MINUS"
@@ -132,24 +157,9 @@ let proj_token = function
 | GE region -> region, "GE"
 | BOOL_OR region -> region, "BOOL_OR"
 | BOOL_AND region -> region, "BOOL_AND"
-| Ident Region.{region; value} ->
-    region, sprintf "Ident %s" value
-| Constr Region.{region; value} ->
-    region, sprintf "Constr %s" value
-| Int Region.{region; value = s,n} ->
-    region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
-| Nat Region.{region; value = s,n} ->
-    region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
-| Mutez Region.{region; value = s,n} ->
-    region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
-| String Region.{region; value} ->
-    region, sprintf "Str %s" value
-| Bytes Region.{region; value = s,b} ->
-    region,
-    sprintf "Bytes (\"%s\", \"0x%s\")"
-    s (Hex.show b)
-| Attr Region.{region; value} ->
-    region, sprintf "Attr \"%s\"" value
+
+(* Keywords *)
+
 | Begin region -> region, "Begin"
 | Else region -> region, "Else"
 | End region -> region, "End"
@@ -168,12 +178,32 @@ let proj_token = function
 | True region -> region, "True"
 | Type region -> region, "Type"
 | With region -> region, "With"
+
+(* Data *)
+
 | C_None region -> region, "C_None"
 | C_Some region -> region, "C_Some"
+
+(* Virtual tokens *)
+
 | EOF region -> region, "EOF"
 
 let to_lexeme = function
-  ARROW _ -> "->"
+  (* Literals *)
+
+  String s -> String.escaped s.Region.value
+| Bytes b -> fst b.Region.value
+| Int i
+| Nat i
+| Mutez i -> fst i.Region.value
+| Ident id -> id.Region.value
+| Constr id -> id.Region.value
+| Attr a -> a.Region.value
+
+(* Symbols *)
+
+| ARROW _ -> "->"
 | CONS _ -> "::"
 | CAT _ -> "^"
 | MINUS _ -> "-"
@@ -201,14 +231,7 @@ let to_lexeme = function
 | BOOL_OR _ -> "||"
 | BOOL_AND _ -> "&&"
-| Ident id -> id.Region.value
-| Constr id -> id.Region.value
-| Int i
-| Nat i
-| Mutez i -> fst i.Region.value
-| String s -> String.escaped s.Region.value
-| Bytes b -> fst b.Region.value
-| Attr a -> a.Region.value
+
+(* Keywords *)
+
 | Begin _ -> "begin"
 | Else _ -> "else"
@@ -229,11 +252,17 @@ let to_lexeme = function
 | Then _ -> "then"
 | With _ -> "with"
+
+(* Data constructors *)
+
 | C_None _ -> "None"
 | C_Some _ -> "Some"
+
+(* Virtual tokens *)
+
 | EOF _ -> ""
 
+(* CONVERSIONS *)
+
 let to_string token ?(offsets=true) mode =
   let region, val_str = proj_token token in
   let reg_str = region#compact ~offsets mode
@@ -241,10 +270,6 @@ let to_string token ?(offsets=true) mode =
 let to_region token = proj_token token |> fst
 
-(* Injections *)
-
-type int_err = Non_canonical_zero
-
 (* LEXIS *)
 
 let keywords = [
@@ -385,6 +410,8 @@ let mk_bytes lexeme region =
   let value = lexeme, `Hex norm
   in Bytes Region.{region; value}
 
+type int_err = Non_canonical_zero
+
 let mk_int lexeme region =
   let z =
     Str.(global_replace (regexp "_") "" lexeme) |> Z.of_string
@@ -398,16 +425,14 @@ type nat_err =
 let mk_nat lexeme region =
   match (String.index_opt lexeme 'n') with
-  | None -> Error Invalid_natural
-  | Some _ -> (
-    let z =
+    None -> Error Invalid_natural
+  | Some _ -> let z =
       Str.(global_replace (regexp "_") "" lexeme) |>
       Str.(global_replace (regexp "n") "") |>
       Z.of_string in
       if Z.equal z Z.zero && lexeme <> "0n"
       then Error Non_canonical_zero_nat
       else Ok (Nat Region.{region; value = lexeme,z})
-  )
 
 let mk_mutez lexeme region =
   let z =
@@ -422,8 +447,6 @@ let eof region = EOF region
 
 type sym_err = Invalid_symbol
-type attr_err = Invalid_attribute
 
 let mk_sym lexeme region =
   match lexeme with
     (* Lexemes in common with all concrete syntaxes *)
@@ -473,9 +496,10 @@ let mk_constr lexeme region =
 
 (* Attributes *)
 
+type attr_err = Invalid_attribute
+
 let mk_attr header lexeme region =
-  if header = "[@" then
-    Error Invalid_attribute
+  if header = "[@" then Error Invalid_attribute
   else Ok (Attr Region.{value=lexeme; region})
 
 (* Predicates *)
@@ -485,12 +509,14 @@ let is_bytes = function Bytes _ -> true | _ -> false
 let is_int = function Int _ -> true | _ -> false
 let is_ident = function Ident _ -> true | _ -> false
 let is_eof = function EOF _ -> true | _ -> false
+let is_minus = function MINUS _ -> true | _ -> false
 
 (* Errors *)
 
 type error =
   Odd_lengthed_bytes
 | Missing_break
+| Negative_byte_sequence
 
 let error_to_string = function
   Odd_lengthed_bytes ->
@@ -499,6 +525,9 @@ let error_to_string = function
 | Missing_break ->
     "Missing break.\n\
      Hint: Insert some space."
+| Negative_byte_sequence ->
+    "Negative byte sequence.\n\
+     Hint: Remove the leading minus sign."
 
 exception Error of error Region.reg
@@ -511,29 +540,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file =
 let fail region value = raise (Error Region.{region; value})
 
 let check_right_context token next_token buffer : unit =
-  if not (is_eof token) then
-  if is_int token || is_bytes token then
-  match next_token buffer with
-    Some ([], next) ->
   let pos = (to_region token)#stop in
   let region = Region.make ~start:pos ~stop:pos in
-      if is_int next then
-        fail region Odd_lengthed_bytes
-      else
-      if is_ident next || is_string next || is_bytes next then
-        fail region Missing_break
-  | Some (_::_, _) | None -> ()
-  else
-  if is_ident token || is_string token then
   match next_token buffer with
-    Some ([], next) ->
-      if is_ident next || is_string next
-      || is_bytes next || is_int next
-      then
-        let pos = (to_region token)#stop in
-        let region = Region.make ~start:pos ~stop:pos
-        in fail region Missing_break
-  | Some (_::_, _) | None -> ()
+    None -> ()
+  | Some (markup, next) ->
+      if is_minus token && is_bytes next
+      then let region =
+             Region.cover (to_region token) (to_region next)
+           in fail region Negative_byte_sequence
+      else
+        match markup with
+          [] ->
+            if is_int token
+            then if is_string next || is_ident next
+                 then fail region Missing_break
+                 else ()
+            else
+            if is_string token
+            then if is_int next || is_bytes next || is_ident next
+                 then fail region Missing_break
+                 else ()
+            else
+            if is_bytes token
+            then if is_string next || is_ident next
+                 then fail region Missing_break
+                 else if is_int next
+                 then fail region Odd_lengthed_bytes
+                 else ()
+            else ()
+        | _::_ -> ()
 
 (* END TRAILER *)
 }
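
The rewritten [check_right_context] above is essentially a decision table over the current token and the one that follows it. A condensed, self-contained model of that table (my own sketch, not the commit's code: plain variants stand in for the real token type, and [glued] for an empty markup list between the two tokens):

    type tok = Minus | Int | Bytes | Str | Ident | Other

    type verdict =
      No_error | Missing_break | Odd_lengthed_bytes | Negative_byte_sequence

    let judge token next glued =
      (* A minus sign before bytes is flagged even across markup. *)
      if token = Minus && next = Bytes then Negative_byte_sequence
      else if not glued then No_error
      else match token, next with
        Int,   (Str | Ident)         -> Missing_break
      | Str,   (Int | Bytes | Ident) -> Missing_break
      | Bytes, (Str | Ident)         -> Missing_break
      | Bytes, Int                   -> Odd_lengthed_bytes
      | _                            -> No_error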

View File

@@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile
 ../shared/Lexer.mli
 ../shared/Lexer.mll
+../shared/LexerLib.mli
 ../shared/LexerLib.ml
 ../shared/EvalOpt.ml
 ../shared/EvalOpt.mli

View File

@@ -1,4 +1,4 @@
-(** This signature defines the lexical tokens for LIGO
+(* This signature defines the lexical tokens for LIGO
 
    _Tokens_ are the abstract units which are used by the parser to
    build the abstract syntax tree (AST), in other words, the stream of

View File

@@ -5,24 +5,14 @@
 (* Shorthands *)
 
-type lexeme = string
-
-let sprintf = Printf.sprintf
-
 module Region = Simple_utils.Region
 module Pos = Simple_utils.Pos
 module SMap = Map.Make (String)
 module SSet = Set.Make (String)
 
-(* Hack to roll back one lexeme in the current semantic action *)
-(*
-let rollback buffer =
-  let open Lexing in
-  let len = String.length (lexeme buffer) in
-  let pos_cnum = buffer.lex_curr_p.pos_cnum - len in
-  buffer.lex_curr_pos <- buffer.lex_curr_pos - len;
-  buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum}
-*)
+type lexeme = string
+
+let sprintf = Printf.sprintf
 
 (* TOKENS *)
@@ -123,6 +113,8 @@ type t =
 | EOF of Region.t
 
+(* Projections *)
+
 type token = t
 
 let proj_token = function
@@ -130,32 +122,20 @@ let proj_token = function
   String Region.{region; value} ->
     region, sprintf "String %s" value
 | Bytes Region.{region; value = s,b} ->
     region,
-    sprintf "Bytes (\"%s\", \"0x%s\")"
-    s (Hex.show b)
+    sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b)
 | Int Region.{region; value = s,n} ->
     region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
 | Nat Region.{region; value = s,n} ->
     region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
 | Mutez Region.{region; value = s,n} ->
     region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
 | Ident Region.{region; value} ->
     region, sprintf "Ident \"%s\"" value
 | Constr Region.{region; value} ->
     region, sprintf "Constr \"%s\"" value
-(*
-| Attr {header; string={region; value}} ->
-    region, sprintf "Attr (\"%s\",\"%s\")" header value
-*)
 
 (* Symbols *)
 
 | SEMI region -> region, "SEMI"
@@ -382,9 +362,7 @@ let keywords = [
   (fun reg -> With reg)
 ]
 
-let reserved =
-  let open SSet in
-  empty |> add "arguments"
+let reserved = SSet.empty
 
 let constructors = [
   (fun reg -> False reg);
@@ -485,8 +463,7 @@ type nat_err =
 let mk_nat lexeme region =
   match String.index_opt lexeme 'n' with
     None -> Error Invalid_natural
-| Some _ ->
-    let z =
+| Some _ -> let z =
       Str.(global_replace (regexp "_") "" lexeme) |>
       Str.(global_replace (regexp "n") "") |>
       Z.of_string in
@@ -495,8 +472,7 @@ let mk_nat lexeme region =
   else Ok (Nat Region.{region; value = lexeme,z})
 
 let mk_mutez lexeme region =
-  let z =
-    Str.(global_replace (regexp "_") "" lexeme) |>
+  let z = Str.(global_replace (regexp "_") "" lexeme) |>
     Str.(global_replace (regexp "mutez") "") |>
     Z.of_string in
   if Z.equal z Z.zero && lexeme <> "0mutez"
@@ -557,8 +533,7 @@ let mk_constr lexeme region =
 
 type attr_err = Invalid_attribute
 
-let mk_attr _header _string _region =
-  Error Invalid_attribute
+let mk_attr _ _ _ = Error Invalid_attribute
 
 (* Predicates *)
@@ -567,12 +542,14 @@ let is_bytes = function Bytes _ -> true | _ -> false
 let is_int = function Int _ -> true | _ -> false
 let is_ident = function Ident _ -> true | _ -> false
 let is_eof = function EOF _ -> true | _ -> false
+let is_minus = function MINUS _ -> true | _ -> false
 
 (* Errors *)
 
 type error =
   Odd_lengthed_bytes
 | Missing_break
+| Negative_byte_sequence
 
 let error_to_string = function
   Odd_lengthed_bytes ->
@@ -581,6 +558,9 @@ let error_to_string = function
 | Missing_break ->
     "Missing break.\n\
      Hint: Insert some space."
+| Negative_byte_sequence ->
+    "Negative byte sequence.\n\
+     Hint: Remove the leading minus sign."
 
 exception Error of error Region.reg
@@ -593,29 +573,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file =
 let fail region value = raise (Error Region.{region; value})
 
 let check_right_context token next_token buffer : unit =
-  if not (is_eof token) then
-  if is_int token || is_bytes token then
-  match next_token buffer with
-    Some ([], next) ->
   let pos = (to_region token)#stop in
   let region = Region.make ~start:pos ~stop:pos in
-      if is_int next then
-        fail region Odd_lengthed_bytes
-      else
-      if is_ident next || is_string next || is_bytes next then
-        fail region Missing_break
-  | Some (_::_, _) | None -> ()
-  else
-  if is_ident token || is_string token then
   match next_token buffer with
-    Some ([], next) ->
-      if is_ident next || is_string next
-      || is_bytes next || is_int next
-      then
-        let pos = (to_region token)#stop in
-        let region = Region.make ~start:pos ~stop:pos
-        in fail region Missing_break
-  | Some (_::_, _) | None -> ()
+    None -> ()
+  | Some (markup, next) ->
+      if is_minus token && is_bytes next
+      then let region =
+             Region.cover (to_region token) (to_region next)
+           in fail region Negative_byte_sequence
+      else
+        match markup with
+          [] ->
+            if is_int token
+            then if is_string next || is_ident next
+                 then fail region Missing_break
+                 else ()
+            else
+            if is_string token
+            then if is_int next || is_bytes next || is_ident next
+                 then fail region Missing_break
+                 else ()
+            else
+            if is_bytes token
+            then if is_string next || is_ident next
+                 then fail region Missing_break
+                 else if is_int next
+                 then fail region Odd_lengthed_bytes
+                 else ()
+            else ()
+        | _::_ -> ()
 
 (* END TRAILER *)
 }
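
For reference, the [mk_nat] normalisation shown above can be exercised in isolation. A sketch assuming the str and zarith libraries, as in the ligo build (error strings replace the real nat_err variants):

    let parse_nat (lexeme : string) : (Z.t, string) result =
      match String.index_opt lexeme 'n' with
        None -> Error "Invalid_natural"
      | Some _ ->
          let z =
            Str.(global_replace (regexp "_") "" lexeme) |>
            Str.(global_replace (regexp "n") "") |>
            Z.of_string in
          if Z.equal z Z.zero && lexeme <> "0n"
          then Error "Non_canonical_zero_nat"  (* e.g. "00n" *)
          else Ok z

    (* parse_nat "1_000n" = Ok (Z.of_string "1000") *)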

View File

@@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile
 ../shared/Lexer.mli
 ../shared/Lexer.mll
+../shared/LexerLib.ml
 ../shared/EvalOpt.ml
 ../shared/EvalOpt.mli
 ../shared/FQueue.ml

View File

@@ -57,6 +57,7 @@ type t =
 | COLON of Region.t    (* ":"   *)
 | DOT of Region.t      (* "."   *)
 | ELLIPSIS of Region.t (* "..." *)
+| ARROW of Region.t    (* "=>"  *)
 
 (* Wildcard *)
@@ -69,10 +70,10 @@ type t =
 | NE of Region.t       (* "!=" *)
 | LT of Region.t       (* "<"  *)
 | GT of Region.t       (* ">"  *)
-| LE of Region.t       (* "=<" *)
+| LE of Region.t       (* "<=" *)
 | GE of Region.t       (* ">=" *)
-| ARROW of Region.t    (* "=>" *)
+
+(* Logic *)
 
 | BOOL_OR of Region.t  (* "||" *)
 | BOOL_AND of Region.t (* "&&" *)
@@ -95,10 +96,10 @@ type t =
 | False of Region.t
 | If of Region.t
 | Let of Region.t
-| Rec of Region.t
-| Switch of Region.t
 | Mod of Region.t
 | Or of Region.t
+| Rec of Region.t
+| Switch of Region.t
 | True of Region.t
 | Type of Region.t
@@ -111,8 +112,6 @@ type t =
 | EOF of Region.t (* End of file *)
 
-type token = t
-
 (* Projections
 
    The difference between extracting the lexeme and a string from a
@@ -121,6 +120,8 @@ type token = t
    lexeme (concrete syntax).
 *)
 
+type token = t
+
 val to_lexeme : token -> lexeme
 val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string
 val to_region : token -> Region.t
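
The distinction drawn in the "Projections" comment above (lexeme versus string) can be seen on a toy token type; this is my own self-contained sketch, not the interface's code:

    type region = unit
    type token = ARROW of region | Ident of string

    let to_lexeme = function ARROW _ -> "=>"    | Ident s -> s
    let name      = function ARROW _ -> "ARROW" | Ident _ -> "Ident"

    let () =
      assert (to_lexeme (ARROW ()) = "=>");    (* concrete syntax *)
      assert (name      (ARROW ()) = "ARROW")  (* abstract name, for tracing *)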

View File

@@ -1,15 +1,17 @@
 {
 (* START OF HEADER *)
 
-type lexeme = string
-let sprintf = Printf.sprintf
+(* Shorthands *)
 
 module Region = Simple_utils.Region
 module Pos = Simple_utils.Pos
 module SMap = Utils.String.Map
 module SSet = Utils.String.Set
 
+type lexeme = string
+
+let sprintf = Printf.sprintf
+
 (* TOKENS *)
 
 type t =
@@ -41,6 +43,7 @@ type t =
 | COLON of Region.t    (* ":"   *)
 | DOT of Region.t      (* "."   *)
 | ELLIPSIS of Region.t (* "..." *)
+| ARROW of Region.t    (* "=>"  *)
 
 (* Wildcard *)
@@ -55,11 +58,11 @@ type t =
 | GT of Region.t       (* ">"  *)
 | LE of Region.t       (* "<=" *)
 | GE of Region.t       (* ">=" *)
-| ARROW of Region.t    (* "=>" *)
+
+(* Logic *)
 
 | BOOL_OR of Region.t  (* "||" *)
 | BOOL_AND of Region.t (* "&&" *)
 | NOT of Region.t      (* !    *)
 
 (* Identifiers, labels, numbers and strings *)
@@ -75,17 +78,17 @@ type t =
 
 (* Keywords *)
 
-(*| And*)
 | Else of Region.t
 | False of Region.t
 | If of Region.t
 | Let of Region.t
-| Rec of Region.t
-| Switch of Region.t
 | Mod of Region.t
 | Or of Region.t
+| Rec of Region.t
+| Switch of Region.t
 | True of Region.t
 | Type of Region.t
 
 (* Data constructors *)
 
 | C_None of Region.t (* "None" *)
@@ -96,10 +99,32 @@ type t =
 | EOF of Region.t (* End of file *)
 
+(* Projections *)
+
 type token = t
 
 let proj_token = function
-  CAT region -> region, "CAT"
+  (* Literals *)
+
+  String Region.{region; value} ->
+    region, sprintf "String %s" value
+| Bytes Region.{region; value = s,b} ->
+    region,
+    sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b)
+| Int Region.{region; value = s,n} ->
+    region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
+| Nat Region.{region; value = s,n} ->
+    region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
+| Mutez Region.{region; value = s,n} ->
+    region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
+| Ident Region.{region; value} ->
+    region, sprintf "Ident %s" value
+| Constr Region.{region; value} ->
+    region, sprintf "Constr %s" value
+
+(* Symbols *)
+
+| CAT region -> region, "CAT"
 | MINUS region -> region, "MINUS"
 | PLUS region -> region, "PLUS"
 | SLASH region -> region, "SLASH"
@@ -125,24 +150,9 @@ let proj_token = function
 | LE region -> region, "LE"
 | GE region -> region, "GE"
 | ARROW region -> region, "ARROW"
+| NOT region -> region, "NOT"
 | BOOL_OR region -> region, "BOOL_OR"
 | BOOL_AND region -> region, "BOOL_AND"
-| Ident Region.{region; value} ->
-    region, sprintf "Ident %s" value
-| Constr Region.{region; value} ->
-    region, sprintf "Constr %s" value
-| Int Region.{region; value = s,n} ->
-    region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
-| Nat Region.{region; value = s,n} ->
-    region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
-| Mutez Region.{region; value = s,n} ->
-    region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
-| String Region.{region; value} ->
-    region, sprintf "String %s" value
-| Bytes Region.{region; value = s,b} ->
-    region,
-    sprintf "Bytes (\"%s\", \"0x%s\")"
-    s (Hex.show b)
 | Else region -> region, "Else"
 | False region -> region, "False"
 | If region -> region, "If"
@@ -150,7 +160,6 @@ let proj_token = function
 | Rec region -> region, "Rec"
 | Switch region -> region, "Switch"
 | Mod region -> region, "Mod"
-| NOT region -> region, "!"
 | Or region -> region, "Or"
 | True region -> region, "True"
 | Type region -> region, "Type"
@@ -160,7 +169,20 @@ let proj_token = function
 | EOF region -> region, "EOF"
 
 let to_lexeme = function
-  CAT _ -> "++"
+  (* Literals *)
+
+  String s -> s.Region.value
+| Bytes b -> fst b.Region.value
+| Int i
+| Nat i
+| Mutez i -> fst i.Region.value
+| Ident id -> id.Region.value
+| Constr id -> id.Region.value
+| Attr a -> a.Region.value
+
+(* Symbols *)
+
+| CAT _ -> "++"
 | MINUS _ -> "-"
 | PLUS _ -> "+"
 | SLASH _ -> "/"
@@ -188,29 +210,32 @@ let to_lexeme = function
 | ARROW _ -> "=>"
 | BOOL_OR _ -> "||"
 | BOOL_AND _ -> "&&"
-| Ident id -> id.Region.value
-| Constr id -> id.Region.value
-| Int i
-| Nat i
-| Mutez i -> fst i.Region.value
-| String s -> s.Region.value
-| Bytes b -> fst b.Region.value
+| NOT _ -> "!"
+
+(* Keywords *)
+
 | Else _ -> "else"
 | False _ -> "false"
 | If _ -> "if"
 | Let _ -> "let"
-| Rec _ -> "rec"
 | Mod _ -> "mod"
-| NOT _ -> "!"
 | Or _ -> "or"
+| Rec _ -> "rec"
 | Switch _ -> "switch"
 | True _ -> "true"
 | Type _ -> "type"
+
+(* Data constructors *)
+
 | C_None _ -> "None"
 | C_Some _ -> "Some"
-| Attr a -> a.Region.value
+
+(* Virtual tokens *)
+
 | EOF _ -> ""
 
+(* CONVERSIONS *)
+
 let to_string token ?(offsets=true) mode =
   let region, val_str = proj_token token in
   let reg_str = region#compact ~offsets mode
@@ -261,12 +286,9 @@ let reserved =
   |> add "functor"
   |> add "inherit"
   |> add "initializer"
-  (* |> add "land" - see https://ligo.atlassian.net/browse/LIGO-263 *)
   |> add "lazy"
-  (* |> add "lor" - see https://ligo.atlassian.net/browse/LIGO-263 *)
   |> add "lsl"
   |> add "lsr"
-  (* |> add "lxor" - see https://ligo.atlassian.net/browse/LIGO-263 *)
   |> add "match"
   |> add "method"
   |> add "module"
@@ -291,7 +313,7 @@ let reserved =
 
 let constructors = [
   (fun reg -> C_None reg);
-  (fun reg -> C_Some reg);
+  (fun reg -> C_Some reg)
 ]
 
 let add map (key, value) = SMap.add key value map
@@ -376,8 +398,7 @@ let mk_int lexeme region =
   else Ok (Int Region.{region; value = lexeme, z})
 
 let mk_nat lexeme region =
-  let z =
-    Str.(global_replace (regexp "_") "" lexeme) |>
+  let z = Str.(global_replace (regexp "_") "" lexeme) |>
     Str.(global_replace (regexp "n") "") |>
     Z.of_string in
   if Z.equal z Z.zero && lexeme <> "0n"
@@ -385,8 +406,7 @@ let mk_nat lexeme region =
   else Ok (Nat Region.{region; value = lexeme, z})
 
 let mk_mutez lexeme region =
-  let z =
-    Str.(global_replace (regexp "_") "" lexeme) |>
+  let z = Str.(global_replace (regexp "_") "" lexeme) |>
     Str.(global_replace (regexp "mutez") "") |>
     Z.of_string in
   if Z.equal z Z.zero && lexeme <> "0mutez"
@@ -465,12 +485,14 @@ let is_bytes = function Bytes _ -> true | _ -> false
 let is_int = function Int _ -> true | _ -> false
 let is_ident = function Ident _ -> true | _ -> false
 let is_eof = function EOF _ -> true | _ -> false
+let is_minus = function MINUS _ -> true | _ -> false
 
 (* Errors *)
 
 type error =
   Odd_lengthed_bytes
 | Missing_break
+| Negative_byte_sequence
 
 let error_to_string = function
   Odd_lengthed_bytes ->
@@ -479,6 +501,9 @@ let error_to_string = function
 | Missing_break ->
     "Missing break.\n\
      Hint: Insert some space."
+| Negative_byte_sequence ->
+    "Negative byte sequence.\n\
+     Hint: Remove the leading minus sign."
 
 exception Error of error Region.reg
@@ -491,29 +516,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file =
 let fail region value = raise (Error Region.{region; value})
 
 let check_right_context token next_token buffer : unit =
-  if not (is_eof token) then
-  if is_int token || is_bytes token then
-  match next_token buffer with
-    Some ([], next) ->
   let pos = (to_region token)#stop in
   let region = Region.make ~start:pos ~stop:pos in
-      if is_int next then
-        fail region Odd_lengthed_bytes
-      else
-      if is_ident next || is_string next || is_bytes next then
-        fail region Missing_break
-  | Some (_::_, _) | None -> ()
-  else
-  if is_ident token || is_string token then
   match next_token buffer with
-    Some ([], next) ->
-      if is_ident next || is_string next
-      || is_bytes next || is_int next
-      then
-        let pos = (to_region token)#stop in
-        let region = Region.make ~start:pos ~stop:pos
-        in fail region Missing_break
-  | Some (_::_, _) | None -> ()
+    None -> ()
+  | Some (markup, next) ->
+      if is_minus token && is_bytes next
+      then let region =
+             Region.cover (to_region token) (to_region next)
+           in fail region Negative_byte_sequence
+      else
+        match markup with
+          [] ->
+            if is_int token
+            then if is_string next || is_ident next
+                 then fail region Missing_break
+                 else ()
+            else
+            if is_string token
+            then if is_int next || is_bytes next || is_ident next
+                 then fail region Missing_break
+                 else ()
+            else
+            if is_bytes token
+            then if is_string next || is_ident next
+                 then fail region Missing_break
+                 else if is_int next
+                 then fail region Odd_lengthed_bytes
+                 else ()
+            else ()
+        | _::_ -> ()
 
 (* END TRAILER *)
 }
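
The reserved-name machinery above boils down to set membership over strings. A self-contained sketch ([check_ident] is hypothetical; the names come from the list in the diff):

    module SSet = Set.Make (String)

    let reserved = SSet.of_list ["match"; "method"; "module"]

    let check_ident id =
      if SSet.mem id reserved then Error ("reserved name: " ^ id)
      else Ok id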

View File

@@ -1,2 +0,0 @@
-$HOME/git/OCaml-build/Makefile
-$HOME/git/OCaml-build/Makefile.cfg

View File

@@ -38,8 +38,6 @@
 module Region = Simple_utils.Region
 module Pos = Simple_utils.Pos
 
-type lexeme = string
-
 (* TOKENS *)
 
 (* The signature [TOKEN] exports an abstract type [token], so a lexer
@@ -54,6 +52,8 @@ type lexeme = string
    reading the ocamllex specification for the lexer ([Lexer.mll]).
 *)
 
+type lexeme = string
+
 module type TOKEN =
   sig
     type token
@@ -112,10 +112,36 @@ module type TOKEN =
          unit
   end
 
+(* The signature of the lexer *)
+
+module type S =
+  sig
+    module Token : TOKEN
+    type token = Token.token
+
+    (* The scanner [init] is meant to be called first to read the
+       BOM. Then [scan] is called. *)
+
+    val init : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
+    val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
+
+    (* Errors (specific to the generic lexer, not to the tokens) *)
+
+    type error
+
+    val error_to_string : error -> string
+
+    exception Error of error Region.reg
+
+    val format_error :
+      ?offsets:bool -> [`Byte | `Point] ->
+      error Region.reg -> file:bool -> string Region.reg
+  end
+
 (* The functorised interface
 
    Note that the module parameter [Token] is re-exported as a
   submodule in [S].
 *)
 
-module Make (Token: TOKEN) : LexerLib.S with module Token = Token
+module Make (Token : TOKEN) : S with module Token = Token
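
A sketch of how a client of this interface would drive the lexer (my own illustration, assuming a LexToken module matching [TOKEN], as the per-syntax LexToken.mll files in this commit provide, and a state built elsewhere via LexerLib):

    module Lex = Lexer.Make (LexToken)

    (* [init] must run first so a leading BOM is consumed; afterwards
       [scan] is called repeatedly on the threaded state. The name
       [scan_once] is hypothetical. *)
    let scan_once state lexbuf =
      let state = Lex.init state lexbuf in
      Lex.scan state lexbuf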

View File

@@ -8,12 +8,6 @@ module Pos = Simple_utils.Pos
 
 (* START HEADER *)
 
-type lexeme = string
-
-(* ALIASES *)
-
-let sprintf = Printf.sprintf
-
 (* TOKENS *)
 
 (* The signature [TOKEN] exports an abstract type [token], so a lexer
@@ -22,6 +16,8 @@ let sprintf = Printf.sprintf
    caracterises the virtual token for end-of-file, because it requires
    special handling. *)
 
+type lexeme = string
+
 module type TOKEN =
   sig
     type token
@@ -84,31 +80,39 @@ module type TOKEN =
    submodule in [S].
 *)
 
-module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) =
+module type S =
+  sig
+    module Token : TOKEN
+    type token = Token.token
+
+    val init : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
+    val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
+
+    type error
+
+    val error_to_string : error -> string
+
+    exception Error of error Region.reg
+
+    val format_error :
+      ?offsets:bool -> [`Byte | `Point] ->
+      error Region.reg -> file:bool -> string Region.reg
+  end
+
+module Make (Token : TOKEN) : (S with module Token = Token) =
   struct
     module Token = Token
     type token = Token.token
 
-    type file_path = string
-    type line_comment = LexerLib.line_comment
-    type block_comment = LexerLib.block_comment
-    let mk_block = LexerLib.mk_block
-
     (* ERRORS *)
 
     type error =
      Invalid_utf8_sequence
    | Unexpected_character of char
    | Undefined_escape_sequence
-   (* | Missing_break*)
    | Unterminated_string
-   (* | Unterminated_integer*)
-   (* | Odd_lengthed_bytes*)
    | Unterminated_comment of string
-   (* | Orphan_minus*)
    | Non_canonical_zero
-   (* | Negative_byte_sequence *)
    | Broken_string
    | Invalid_character_in_string
    | Reserved_name of string
@@ -116,6 +120,8 @@ module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) =
    | Invalid_natural
    | Invalid_attribute
 
+    let sprintf = Printf.sprintf
+
    let error_to_string = function
      Invalid_utf8_sequence ->
        "Invalid UTF-8 sequence."
@@ -124,30 +130,15 @@ module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) =
    | Undefined_escape_sequence ->
        "Undefined escape sequence.\n\
         Hint: Remove or replace the sequence."
-   (* | Missing_break ->
-       "Missing break.\n\
-        Hint: Insert some space."
-   *) | Unterminated_string ->
+   | Unterminated_string ->
        "Unterminated string.\n\
        Hint: Close with double quotes."
-   (* | Unterminated_integer ->
-       "Unterminated integer.\n\
-        Hint: Remove the sign or proceed with a natural number." *)
-   (* | Odd_lengthed_bytes ->
-       "The length of the byte sequence is an odd number.\n\
-        Hint: Add or remove a digit."
-   *) | Unterminated_comment ending ->
+   | Unterminated_comment ending ->
        sprintf "Unterminated comment.\n\
                Hint: Close with \"%s\"." ending
-   (* | Orphan_minus ->
-       "Orphan minus sign.\n\
-        Hint: Remove the trailing space." *)
    | Non_canonical_zero ->
        "Non-canonical zero.\n\
        Hint: Use 0."
-   (* | Negative_byte_sequence ->
-       "Negative byte sequence.\n\
-        Hint: Remove the leading minus sign." *)
    | Broken_string ->
        "The string starting here is interrupted by a line break.\n\
        Hint: Remove the break, close the string before or insert a \
@@ -369,6 +360,7 @@ and scan state = parse
   nl    { scan (state#push_newline lexbuf) lexbuf }
 | ' '+  { scan (state#push_space   lexbuf) lexbuf }
 | '\t'+ { scan (state#push_tabs    lexbuf) lexbuf }
+
 | ident  { mk_ident  state lexbuf }
 | constr { mk_constr state lexbuf }
 | bytes  { mk_bytes seq state lexbuf }
@@ -420,27 +412,14 @@ and scan state = parse
                 let state = state#set_pos pos in
                 scan state lexbuf }
 
-(* Some special errors *)
-
-(*
-| '-' { let region, _, state = state#sync lexbuf in
-        let state = scan state lexbuf in
-        let open Markup in
-        match FQueue.peek state#units with
-          None -> assert false
-        | Some (_, ((Space _ | Tabs _)::_, token))
-            when Token.is_int token -> fail region Orphan_minus
-        | _ -> fail region Unterminated_integer }
-
-| "-0x" byte_seq?
-      { let region, _, _ = state#sync lexbuf
-        in fail region Negative_byte_sequence }
-*)
+(* String *)
 
 | '"'  { let opening, lexeme, state = state#sync lexbuf in
         let thread = LexerLib.mk_thread opening lexeme in
         scan_string thread state lexbuf |> mk_string }
 
+(* Comments *)
+
 | block_comment_openings {
    let lexeme = Lexing.lexeme lexbuf in
    match state#block with
@@ -496,8 +475,6 @@ and scan_flags state acc = parse
 | eof   { let _, _, state = state#sync lexbuf
          in List.rev acc, state }
 
-(* TODO: Move below to [LexerCommon.mll] *)
-
 (* Finishing a string *)
 
 and scan_string thread state = parse
@@ -624,164 +601,6 @@ and scan_utf8_inline thread state = parse
 {
 (* START TRAILER *)
 
-(* Scanning the lexing buffer for tokens (and markup, as a
-   side-effect).
-
-   Because we want the lexer to have access to the right lexical
-   context of a recognised lexeme (to enforce stylistic constraints or
-   report special error patterns), we need to keep a hidden reference
-   to a queue of recognised lexical units (that is, tokens and markup)
-   that acts as a mutable state between the calls to [read]. When
-   [read] is called, that queue is examined first and, if it contains
-   at least one token, that token is returned; otherwise, the lexing
-   buffer is scanned for at least one more new token. That is the
-   general principle: we put a high-level buffer (our queue) on top of
-   the low-level lexing buffer.
-
-   One tricky and important detail is that we must make any parser
-   generated by Menhir (and calling [read]) believe that the last
-   region of the input source that was matched indeed corresponds to
-   the returned token, despite that many tokens and markup may have
-   been matched since it was actually read from the input. In other
-   words, the parser requests a token that is taken from the
-   high-level buffer, but the parser requests the source regions from
-   the _low-level_ lexing buffer, and they may disagree if more than
-   one token has actually been recognised.
-
-   Consequently, in order to maintain a consistent view for the
-   parser, we have to patch some fields of the lexing buffer, namely
-   [lex_start_p] and [lex_curr_p], as these fields are read by parsers
-   generated by Menhir when querying source positions (regions). This
-   is the purpose of the function [patch_buffer]. After reading one
-   or more tokens and markup by the scanning rule [scan], we have to
-   save in the hidden reference [buf_reg] the region of the source
-   that was matched by [scan]. This atomic sequence of patching,
-   scanning and saving is implemented by the _function_ [scan]
-   (beware: it shadows the scanning rule [scan]). The function
-   [patch_buffer] is, of course, also called just before returning the
-   token, so the parser has a view of the lexing buffer consistent
-   with the token.
-
-   Note that an additional reference [first_call] is needed to
-   distinguish the first call to the function [scan], as the first
-   scanning rule is actually [init] (which can handle the BOM), not
-   [scan].
-*)
-
-type logger = Markup.t list -> token -> unit
-
-type input =
-  File of file_path
-| String of string
-| Channel of in_channel
-| Buffer of Lexing.lexbuf
-
-type instance = {
-  input    : input;
-  read     : log:logger -> Lexing.lexbuf -> token;
-  buffer   : Lexing.lexbuf;
-  get_win  : unit -> token LexerLib.window;
-  get_pos  : unit -> Pos.t;
-  get_last : unit -> Region.t;
-  get_file : unit -> file_path;
-  close    : unit -> unit
-}
-
-type open_err = File_opening of string
-
-let lexbuf_from_input = function
-  File path ->
-   (try
-      let chan = open_in path in
-      let close () = close_in chan in
-      let lexbuf = Lexing.from_channel chan in
-      let () =
-        let open Lexing in
-        lexbuf.lex_curr_p <- {lexbuf.lex_curr_p with pos_fname=path}
-      in Ok (lexbuf, close)
-    with Sys_error msg -> Stdlib.Error (File_opening msg))
-| String s ->
-    Ok (Lexing.from_string s, fun () -> ())
-| Channel chan ->
-    let close () = close_in chan in
-    Ok (Lexing.from_channel chan, close)
-| Buffer b -> Ok (b, fun () -> ())
-
-let open_token_stream ?line ?block input =
-  let file_path = match input with
-    File path -> path
-  | _ -> "" in
-  let pos = Pos.min ~file:file_path in
-  let buf_reg = ref (pos#byte, pos#byte)
-  and first_call = ref true
-  and decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in
-  let supply = Uutf.Manual.src decoder in
-  let state = ref (LexerLib.mk_state
-                     ~units:FQueue.empty
-                     ~last:Region.ghost
-                     ~window:LexerLib.Nil
-                     ~pos
-                     ~markup:[]
-                     ~decoder
-                     ~supply
-                     ?block
-                     ?line
-                     ()) in
-  let get_pos () = !state#pos
-  and get_last () = !state#last
-  and get_win () = !state#window
-  and get_file () = file_path in
-
-  let patch_buffer (start, stop) buffer =
-    let open Lexing in
-    let file_path = buffer.lex_curr_p.pos_fname in
-    buffer.lex_start_p <- {start with pos_fname = file_path};
-    buffer.lex_curr_p <- {stop with pos_fname = file_path}
-
-  and save_region buffer =
-    buf_reg := Lexing.(buffer.lex_start_p, buffer.lex_curr_p) in
-
-  let scan buffer =
-    patch_buffer !buf_reg buffer;
-    (if !first_call
-     then (state := init !state buffer; first_call := false)
-     else state := scan !state buffer);
-    save_region buffer in
-
-  let next_token buffer =
-    scan buffer;
-    match FQueue.peek !state#units with
-      None -> None
-    | Some (units, ext_token) ->
-        state := !state#set_units units; Some ext_token in
-
-  let rec read ~log buffer =
-    match FQueue.deq !state#units with
-      None ->
-        scan buffer;
-        read ~log buffer
-    | Some (units, (left_mark, token)) ->
-        log left_mark token;
-        state := ((!state#set_units units)
-                    #set_last (Token.to_region token))
-                    #slide_token token;
-        Token.check_right_context token next_token buffer;
-        patch_buffer (Token.to_region token)#byte_pos buffer;
-        token in
-
-  match lexbuf_from_input input with
-    Ok (buffer, close) ->
-      let () =
-        match input with
-          File path when path <> "" -> LexerLib.reset ~file:path buffer
-        | _ -> () in
-      let instance = {
-        input; read; buffer; get_win; get_pos; get_last; get_file; close}
-      in Ok instance
-  | Error _ as e -> e
-
 end (* of functor [Make] in HEADER *)
 
 (* END TRAILER *)
 }
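
The trailer deleted above (and relocated to LexerLib, per the next file) implements the "high-level buffer on top of the low-level lexing buffer" principle its comment describes. A toy model of just that principle, independent of the real modules (my sketch; [refill] must eventually produce a token):

    type token = string
    type unit_ = Markup of string | Token of token

    let queue : unit_ Queue.t = Queue.create ()

    (* Stand-in for the scanning rule: refills the queue. *)
    let scan refill = List.iter (fun u -> Queue.add u queue) (refill ())

    let rec read refill : token =
      match Queue.take_opt queue with
        None              -> scan refill; read refill
      | Some (Markup _)   -> read refill   (* markup is a side-effect *)
      | Some (Token tok)  -> tok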

View File

@@ -1,159 +1,6 @@
-(* Sliding window *)
-
-(* The type [window] models a two-slot buffer of tokens for reporting
-   after a parse error. Technically, it is a parametric type, but its
-   use is meant for tokens, wherever they are defined.
-
-   In [Two(t1,t2)], the token [t2] is the next to be sent to the
-   parser.
-
-   The call [slide token buffer] pushes the token [token] in the
-   buffer [buffer]. If the buffer is full, that is, it is
-   [Two(t1,t2)], then the token [t2] is discarded to make room for
-   [token].
-*)
-
-type 'a window =
-  Nil
-| One of 'a
-| Two of 'a * 'a
-
-let slide token = function
-  Nil -> One token
-| One t | Two (t,_) -> Two (token,t)
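
For illustration (not part of this commit), the sliding window specified above behaves as follows: [slide] keeps the two most recent tokens and discards the oldest.

    let w  = slide "b" (slide "a" Nil)   (* Two ("b", "a") *)
    let w' = slide "c" w                 (* Two ("c", "b"); "a" is dropped *)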
module Region = Simple_utils.Region module Region = Simple_utils.Region
module Pos = Simple_utils.Pos module Pos = Simple_utils.Pos
type lexeme = string
(* The signature [TOKEN] exports an abstract type [token], so a lexer
can be a functor over tokens. This enables to externalise
version-dependent constraints in any module whose signature matches
[TOKEN]. Generic functions to construct tokens are required.
Note the predicate [is_eof], which caracterises the virtual token
for end-of-file, because it requires special handling.
*)
module type TOKEN =
sig
type token
(* Predicates *)
val is_eof : token -> bool
(* Projections *)
val to_lexeme : token -> lexeme
val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string
val to_region : token -> Region.t
(* Style *)
type error
val error_to_string : error -> string
exception Error of error Region.reg
val format_error :
?offsets:bool ->
[`Byte | `Point] ->
error Region.reg ->
file:bool ->
string Region.reg
val check_right_context :
token ->
(Lexing.lexbuf -> (Markup.t list * token) option) ->
Lexing.lexbuf ->
unit
end
(* The module type for lexers is [S]. It mainly exports the function
[open_token_stream], which returns
* a function [read] that extracts tokens from a lexing buffer,
together with a lexing buffer [buffer] to read from,
* a function [close] that closes that buffer,
* a function [get_pos] that returns the current position, and
* a function [get_last] that returns the region of the last
recognised token.
* a function [get_file] that returns the name of the file being
scanned (empty string if [stdin]).
Note that a module [Token] is exported too, because the signature
of the exported functions depend on it.
The type [window] is a two-token window, that is, a buffer that
contains the last recognised token, and the penultimate (if any).
The call [read ~log] evaluates in a lexer (also known as a
tokeniser or scanner) whose type is [Lexing.lexbuf -> token], and
suitable for a parser generated by Menhir. The argument labelled
[log] is a logger, that is, it may print a token and its left
markup to a given channel, at the caller's discretion.
*)
module type S =
sig
module Token : TOKEN
type token = Token.token
type file_path = string
type logger = Markup.t list -> token -> unit
type input =
File of file_path
| String of string
| Channel of in_channel
| Buffer of Lexing.lexbuf
type instance = {
input : input;
read : log:logger -> Lexing.lexbuf -> token;
buffer : Lexing.lexbuf;
get_win : unit -> token window;
get_pos : unit -> Pos.t;
get_last : unit -> Region.t;
get_file : unit -> file_path;
close : unit -> unit
}
type open_err = File_opening of string
val lexbuf_from_input :
input -> (Lexing.lexbuf * (unit -> unit), open_err) Stdlib.result
type line_comment = string (* Opening of a line comment *)
type block_comment = <opening : string; closing : string>
val mk_block : opening:string -> closing:string -> block_comment
val open_token_stream :
?line:line_comment ->
?block:block_comment ->
input ->
(instance, open_err) Stdlib.result
(* Error reporting *)
type error
val error_to_string : error -> string
exception Error of error Region.reg
val format_error :
?offsets:bool ->
[`Byte | `Point] ->
error Region.reg ->
file:bool ->
string Region.reg
end
(* LEXER ENGINE *) (* LEXER ENGINE *)
(* Resetting file name and line number in the lexing buffer (* Resetting file name and line number in the lexing buffer
@ -205,27 +52,20 @@ let rollback buffer =
buffer.lex_curr_pos <- buffer.lex_curr_pos - len; buffer.lex_curr_pos <- buffer.lex_curr_pos - len;
buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum} buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum}
(* Utility types *)
type file_path = string
type lexeme = string
(* THREAD FOR STRUCTURED CONSTRUCTS (STRINGS, COMMENTS) *) (* THREAD FOR STRUCTURED CONSTRUCTS (STRINGS, COMMENTS) *)
(* When scanning structured constructs, like strings and comments, we
need to keep the region of the opening symbol (like double quote,
"//" or "(*") in order to report any error more precisely. Since
ocamllex is byte-oriented, we need to store the parsed bytes as
characters in an accumulator [acc] and also its length [len], so,
we are done, it is easy to build the string making up the
structured construct with [mk_str] (see above).
The resulting data structure is called a _thread_. (Note for
Emacs: "*)".)
*)
type thread = < type thread = <
opening : Region.t; opening : Region.t;
length : int; length : int;
acc : char list; acc : char list;
to_string : string;
push_char : char -> thread; push_char : char -> thread;
push_string : string -> thread; push_string : string -> thread;
to_string : string;
set_opening : Region.t -> thread set_opening : Region.t -> thread
> >
@ -233,12 +73,12 @@ let mk_thread region lexeme : thread =
(* The call [explode s a] is the list made by pushing the characters (* The call [explode s a] is the list made by pushing the characters
in the string [s] on top of [a], in reverse order. For example, in the string [s] on top of [a], in reverse order. For example,
[explode "ba" ['c';'d'] = ['a'; 'b'; 'c'; 'd']]. *) [explode "ba" ['c';'d'] = ['a'; 'b'; 'c'; 'd']]. *)
let explode s acc = let explode s acc =
let rec push = function let rec push = function
0 -> acc 0 -> acc
| i -> s.[i-1] :: push (i-1) | i -> s.[i-1] :: push (i-1)
in push (String.length s) in push (String.length s) in
in
object object
val opening = region val opening = region
method opening = opening method opening = opening
@ -273,84 +113,82 @@ let mk_thread region lexeme : thread =
in fill (length-1) acc |> Bytes.to_string in fill (length-1) acc |> Bytes.to_string
end end
type file_path = string
(* STATE *) (* STATE *)
(* Beyond tokens, the result of lexing is a state. The type (* Scanning the lexing buffer for tokens (and markup, as a
[state] represents the logical state of the lexing engine, that side-effect).
is, a value which is threaded during scanning and which denotes
useful, high-level information beyond what the type
[Lexing.lexbuf] in the standard library already provides for
all generic lexers.
Tokens are the smallest units used by the parser to build the Because we want the lexer to have access to the right lexical
abstract syntax tree. The state includes a queue of recognised context of a recognised lexeme (to enforce stylistic constraints or
tokens, with the markup at the left of its lexeme until either report special error patterns), we need to keep a hidden reference
the start of the file or the end of the previously recognised to a queue of recognised lexical units (that is, tokens and markup)
token. that acts as a mutable state between the calls to [read]. When
[read] is called, that queue is examined first and, if it contains
at least one token, that token is returned; otherwise, the lexing
buffer is scanned for at least one more new token. That is the
general principle: we put a high-level buffer (our queue) on top of
the low-level lexing buffer.
The markup from the last recognised token or, if the first One tricky and important detail is that we must make any parser
token has not been recognised yet, from the beginning of the generated by Menhir (and calling [read]) believe that the last
file is stored in the field [markup] of the state because it is region of the input source that was matched indeed corresponds to
a side-effect, with respect to the output token list, and we the returned token, despite that many tokens and markup may have
use a record with a single field [units] because that record been matched since it was actually read from the input. In other
may be easily extended during the future maintenance of this words, the parser requests a token that is taken from the
lexer. high-level buffer, but the parser requests the source regions from
the _low-level_ lexing buffer, and they may disagree if more than
one token has actually been recognised.
The state also includes a field [pos] which holds the current Consequently, in order to maintain a consistent view for the
position in the LIGO source file. The position is not always parser, we have to patch some fields of the lexing buffer, namely
updated after a single character has been matched: that depends [lex_start_p] and [lex_curr_p], as these fields are read by parsers
on the regular expression that matched the lexing buffer. generated by Menhir when querying source positions (regions). This
is the purpose of the function [patch_buffer]. After reading one or
more tokens and markup by the scanning rule [scan], we have to save
in the hidden reference [buf_reg] the region of the source that was
matched by [scan]. This atomic sequence of patching, scanning and
saving is implemented by the _function_ [scan] (beware: it shadows
the scanning rule [scan]). The function [patch_buffer] is, of
course, also called just before returning the token, so the parser
has a view of the lexing buffer consistent with the token.
The field [window] is a two-token window, that is, a buffer Note that an additional reference [first_call] is needed to
that contains the last recognised token, and the penultimate distinguish the first call to the function [scan], as the first
(if any). scanning rule is actually [init] (which can handle the BOM), not
[scan].
The fields [decoder] and [supply] offer the support needed
for the lexing of UTF-8 encoded characters in comments (the
only place where they are allowed in LIGO). The former is the
decoder proper and the latter is the effectful function
[supply] that takes a byte, a start index and a length and feed
it to [decoder]. See the documentation of the third-party
library Uutf.
*) *)
type line_comment = string (* Opening of a line comment *)
type block_comment = <opening : string; closing : string>

let mk_block ~opening ~closing : block_comment =
  object
    method opening = opening
    method closing = closing
  end

type 'a state = <
  units : (Markup.t list * 'a) FQueue.t;
  markup : Markup.t list;
  window : 'a window;
  last : Region.t;
  pos : Pos.t;
  decoder : Uutf.decoder;
  supply : Bytes.t -> int -> int -> unit;
  block : block_comment option;
  line : line_comment option;
  enqueue : 'a -> 'a state;
  set_units : (Markup.t list * 'a) FQueue.t -> 'a state;
  set_last : Region.t -> 'a state;
  set_pos : Pos.t -> 'a state;
  slide_token : 'a -> 'a state;
  sync : Lexing.lexbuf -> Region.t * lexeme * 'a state;
  push_newline : Lexing.lexbuf -> 'a state;
  push_line : thread -> 'a state;
  push_block : thread -> 'a state;
  push_space : Lexing.lexbuf -> 'a state;
  push_tabs : Lexing.lexbuf -> 'a state;
  push_bom : Lexing.lexbuf -> 'a state;
  push_markup : Markup.t -> 'a state;
>

type 'token window =
  Nil
| One of 'token
| Two of 'token * 'token

type 'token state = <
  units : (Markup.t list * 'token) FQueue.t;
  markup : Markup.t list;
  window : 'token window;
  last : Region.t;
  pos : Pos.t;
  decoder : Uutf.decoder;
  supply : Bytes.t -> int -> int -> unit;
  block : EvalOpt.block_comment option;
  line : EvalOpt.line_comment option;
  enqueue : 'token -> 'token state;
  set_units : (Markup.t list * 'token) FQueue.t -> 'token state;
  set_last : Region.t -> 'token state;
  set_pos : Pos.t -> 'token state;
  slide_token : 'token -> 'token state;
  sync : Lexing.lexbuf -> Region.t * lexeme * 'token state;
  push_newline : Lexing.lexbuf -> 'token state;
  push_line : thread -> 'token state;
  push_block : thread -> 'token state;
  push_space : Lexing.lexbuf -> 'token state;
  push_tabs : Lexing.lexbuf -> 'token state;
  push_bom : Lexing.lexbuf -> 'token state;
  push_markup : Markup.t -> 'token state;
>
let mk_state ~units ~markup ~window ~last ~pos ~decoder ~supply let mk_state ~units ~markup ~window ~last ~pos ~decoder ~supply
@ -371,12 +209,6 @@ type file_path = string
method block = block method block = block
method line = line method line = line
(* The call [enqueue (token, state)] updates functionally the
state [state] by associating the token [token] with the
stored markup and enqueuing the pair into the units
queue. The field [markup] is then reset to the empty
list. *)
method enqueue token = method enqueue token =
{< units = FQueue.enq (markup, token) units; {< units = FQueue.enq (markup, token) units;
markup = [] >} markup = [] >}
@ -386,13 +218,9 @@ type file_path = string
method set_pos pos = {< pos = pos >} method set_pos pos = {< pos = pos >}
method slide_token token =
  {< window = slide token window >}

(* The call [sync state buffer] updates the current position
   in accordance with the contents of the lexing buffer, more
   precisely, depending on the length of the string which has
   just been recognised by the scanner: that length is used as
   a positive offset to the current column. *)

method slide_token token =
  match self#window with
    Nil -> {< window = One token >}
  | One t | Two (t,_) -> {< window = Two (token,t) >}
method sync buffer = method sync buffer =
let lex = Lexing.lexeme buffer in let lex = Lexing.lexeme buffer in
@ -447,5 +275,121 @@ type file_path = string
let region, value, state = self#sync buffer in let region, value, state = self#sync buffer in
let unit = Markup.BOM Region.{region; value} let unit = Markup.BOM Region.{region; value}
in state#push_markup unit in state#push_markup unit
end end
(* LEXER INSTANCE *)
type input =
File of file_path
| String of string
| Channel of in_channel
| Buffer of Lexing.lexbuf
type 'token logger = Markup.t list -> 'token -> unit
type 'token instance = {
input : input;
read : log:('token logger) -> Lexing.lexbuf -> 'token;
buffer : Lexing.lexbuf;
get_win : unit -> 'token window;
get_pos : unit -> Pos.t;
get_last : unit -> Region.t;
get_file : unit -> file_path;
close : unit -> unit
}
type open_err = File_opening of string
let lexbuf_from_input = function
String s ->
Ok (Lexing.from_string s, fun () -> ())
| Channel chan ->
let close () = close_in chan in
Ok (Lexing.from_channel chan, close)
| Buffer b ->
Ok (b, fun () -> ())
| File path ->
try
let chan = open_in path in
let close () = close_in chan in
let lexbuf = Lexing.from_channel chan in
let () =
let open Lexing in
lexbuf.lex_curr_p <- {lexbuf.lex_curr_p with pos_fname=path}
in Ok (lexbuf, close)
with Sys_error msg -> Stdlib.Error (File_opening msg)
let open_token_stream ?line ?block ~init ~scan
~token_to_region ~style input =
let file_path = match input with
File path -> path
| _ -> "" in
let pos = Pos.min ~file:file_path in
let buf_reg = ref (pos#byte, pos#byte)
and first_call = ref true
and decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in
let supply = Uutf.Manual.src decoder in
let state = ref (mk_state
~units:FQueue.empty
~last:Region.ghost
~window:Nil
~pos
~markup:[]
~decoder
~supply
?block
?line
()) in
let get_pos () = !state#pos
and get_last () = !state#last
and get_win () = !state#window
and get_file () = file_path in
let patch_buffer (start, stop) buffer =
let open Lexing in
let file_path = buffer.lex_curr_p.pos_fname in
buffer.lex_start_p <- {start with pos_fname = file_path};
buffer.lex_curr_p <- {stop with pos_fname = file_path}
and save_region buffer =
buf_reg := Lexing.(buffer.lex_start_p, buffer.lex_curr_p) in
let scan' init scan buffer =
patch_buffer !buf_reg buffer;
(if !first_call
then (state := init !state buffer; first_call := false)
else state := scan !state buffer);
save_region buffer in
let next_token init scan buffer =
scan' init scan buffer;
match FQueue.peek !state#units with
None -> None
| Some (units, ext_token) ->
state := !state#set_units units; Some ext_token in
let rec read init scan ~token_to_region ~style ~log buffer =
match FQueue.deq !state#units with
None ->
scan' init scan buffer;
read init scan ~token_to_region ~style ~log buffer
| Some (units, (left_mark, token)) ->
log left_mark token;
state := ((!state#set_units units)
#set_last (token_to_region token))
#slide_token token;
style token (next_token init scan) buffer;
patch_buffer (token_to_region token)#byte_pos buffer;
token in
match lexbuf_from_input input with
Ok (buffer, close) ->
let () =
match input with
File path when path <> "" -> reset ~file:path buffer
| _ -> () in
let instance = {
read = read init scan ~token_to_region ~style;
input; buffer; get_win; get_pos; get_last; get_file; close}
in Ok instance
| Error _ as e -> e
View File
@ -0,0 +1,205 @@
(* A library for writing UTF8-aware lexers *)
module Region = Simple_utils.Region
module Pos = Simple_utils.Pos
(* The function [rollback] resets the lexing buffer to the state it
was when it matched the last regular expression. This function is
safe to use only in the semantic action of the rule which last
matched. *)
val rollback : Lexing.lexbuf -> unit
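For intuition, the behaviour documented above roughly amounts to the
following sketch (an approximation, not necessarily the actual
definition): step back over the last matched lexeme so that it can
be scanned again.

let rollback_sketch (buffer : Lexing.lexbuf) =
  let open Lexing in
  let len = String.length (lexeme buffer) in
  (* Step back over the last match, in both the byte offset and the
     position record consulted by parsers. *)
  buffer.lex_curr_pos <- buffer.lex_curr_pos - len;
  buffer.lex_curr_p   <-
    {buffer.lex_curr_p with pos_cnum = buffer.lex_curr_p.pos_cnum - len}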
(* Utility types *)
type file_path = string
type lexeme = string
(* THREAD FOR STRUCTURED CONSTRUCTS (STRINGS, COMMENTS) *)
(* When scanning structured constructs, like strings and comments, we
need to keep the region of the opening symbol (like double quote,
"//" or "(*") in order to report any error more precisely. Since
ocamllex is byte-oriented, we need to store the parsed bytes as
characters in an accumulator [acc] and also its length [len], so,
when we are done, it is easy to build the string making up the
structured construct with [mk_str] (see above).
The resulting data structure is called a _thread_. (Note for
Emacs: "*)".)
*)
type thread = <
opening : Region.t;
length : int;
acc : char list;
to_string : string;
push_char : char -> thread;
push_string : string -> thread;
set_opening : Region.t -> thread
>
val mk_thread : Region.t -> lexeme -> thread
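A hypothetical usage sketch: a scanner for string literals would
accumulate the body character by character and rebuild it at the
end. [Region.ghost] and the pushed contents below are placeholders
for illustration only.

let _ =
  let thread = mk_thread Region.ghost "\"" in
  let thread = thread#push_char 'h' in
  let thread = thread#push_string "ello" in
  (* [to_string] rebuilds the accumulated lexeme in reading order. *)
  print_endline thread#to_string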
(* STATE *)
(* Beyond producing tokens, the result of lexing is a _state_. The
type [state] represents the abstract logical state of the lexing
engine, that is, a value which is threaded during scanning and
which denotes useful, high-level information beyond what the type
[Lexing.lexbuf] in the standard library already provides for all
generic lexers. We qualify it as "logical state" because the lexing
buffer itself has a "physical state" defined by the type
[Lexing.lexbuf].
Tokens are the smallest units used by the parser to build the
abstract syntax tree. The state includes a queue of recognised
tokens, with the markup at the left of its lexeme until either the
start of the file or the end of the previously recognised token.
The markup from the last recognised token or, if the first token
has not been recognised yet, from the beginning of the file is
stored in the field [markup] of the state because it is a
side-effect, with respect to the output token list, and we use a
record with a single field [units] because that record may be
easily extended during the future maintenance of this lexer.
The state also includes a field [pos] which holds the current
position in the LIGO source file. The position is not always
updated after a single character has been matched: that depends on
the regular expression that matched the lexing buffer.
The field [window] is a two-token window, that is, a buffer that
contains the last recognised token, and the penultimate (if any).
Technically, it is a parametric type, but its use is meant for
tokens, wherever they are defined. In [Two (t1,t2)], and in case
of a syntax error, [t1] is the first invalid token and [t2] is the
last valid one.
The fields [decoder] and [supply] offer the support needed for
the lexing of UTF-8 encoded characters in comments (the only place
where they are allowed in LIGO). The former is the decoder proper
and the latter is the effectful function [supply] that takes a
byte, a start index and a length and feeds it to [decoder]. See the
documentation of the third-party library Uutf.
Some methods are now documented.
The call [state#enqueue token] updates functionally the state
[state] by associating the token [token] with the stored markup and
enqueuing the pair into the units queue. The field [markup] is then
reset to the empty list.
The call [state#slide_token token] pushes the token [token] in
the buffer [buffer]. If the buffer is full, that is, it is [Two
(t1,t2)], then the token [t2] is discarded to make room for
[token].
The call [state#sync buffer] updates the current position in
accordance with the contents of the lexing buffer, more precisely,
depending on the length of the string which has just been
recognised by the scanner: that length is used as a positive offset
to the current column.
*)
type 'token window =
Nil
| One of 'token
| Two of 'token * 'token
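The sliding discipline of the window can be illustrated in isolation
(integers standing in for tokens; this mirrors [slide_token] in the
implementation):

let slide token = function
  Nil -> One token
| One t | Two (t,_) -> Two (token, t)

let () = assert (slide 3 (Two (2,1)) = Two (3,2))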
type 'token state = <
units : (Markup.t list * 'token) FQueue.t;
markup : Markup.t list;
window : 'token window;
last : Region.t;
pos : Pos.t;
decoder : Uutf.decoder;
supply : Bytes.t -> int -> int -> unit;
block : EvalOpt.block_comment option;
line : EvalOpt.line_comment option;
enqueue : 'token -> 'token state;
set_units : (Markup.t list * 'token) FQueue.t -> 'token state;
set_last : Region.t -> 'token state;
set_pos : Pos.t -> 'token state;
slide_token : 'token -> 'token state;
sync : Lexing.lexbuf -> Region.t * lexeme * 'token state;
push_newline : Lexing.lexbuf -> 'token state;
push_line : thread -> 'token state;
push_block : thread -> 'token state;
push_space : Lexing.lexbuf -> 'token state;
push_tabs : Lexing.lexbuf -> 'token state;
push_bom : Lexing.lexbuf -> 'token state;
push_markup : Markup.t -> 'token state;
>
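For instance, the effect of [state#enqueue token] described above
amounts to the following sketch over the [FQueue] API as it is used
in the implementation:

(* Pair the pending markup with the new token, enqueue the pair and
   reset the markup to the empty list. *)
let enqueue_sketch (markup, units) token =
  (FQueue.enq (markup, token) units, ([] : Markup.t list))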
(* LEXER INSTANCE *)
(* The function [open_token_stream] returns a lexer instance made of
* the input [input] of type [input];
* a function [read] that extracts tokens from a lexing buffer,
  together with a lexing buffer [buffer] to read from;
* a function [close] that closes that buffer;
* a function [get_pos] that returns the current position;
* a function [get_last] that returns the region of the last
  recognised token;
* a function [get_win] that returns the current two-token window;
* a function [get_file] that returns the name of the file being
  scanned (empty string if [stdin]).
Note that a module [Token] is exported too, because the signatures
of the exported functions depend on it.
The type [window] is a two-token window, that is, a buffer that
contains the last recognised token, and the penultimate (if any).
The call [open_token_stream ?line ?block ~init ~scan
~token_to_region ~style input] evaluates to an instance whose field
[read] is a lexer (also known as a tokeniser or scanner) of type
[log:('token logger) -> Lexing.lexbuf -> 'token], and
suitable for a parser generated by Menhir. The argument labelled
[log] is a logger, that is, it may print a token and its left
markup to a given channel, at the caller's discretion. The argument
labelled [~init] is the scanner to be called first, usually for
reading the BOM, then [scan] is used for the following calls. The
function labelled [~style] is used to check stylistic constraints
on the tokens and the markup between them.
*)
type input =
File of file_path
| String of string
| Channel of in_channel
| Buffer of Lexing.lexbuf
type 'token logger = Markup.t list -> 'token -> unit
type 'token instance = {
input : input;
read : log:('token logger) -> Lexing.lexbuf -> 'token;
buffer : Lexing.lexbuf;
get_win : unit -> 'token window;
get_pos : unit -> Pos.t;
get_last : unit -> Region.t;
get_file : unit -> file_path;
close : unit -> unit
}
type open_err = File_opening of string
val lexbuf_from_input :
input -> (Lexing.lexbuf * (unit -> unit), open_err) Stdlib.result
val open_token_stream :
?line:EvalOpt.line_comment ->
?block:EvalOpt.block_comment ->
init:('token state -> Lexing.lexbuf -> 'token state) ->
scan:('token state -> Lexing.lexbuf -> 'token state) ->
token_to_region:('token -> Region.t) ->
style:('token ->
(Lexing.lexbuf -> (Markup.t list * 'token) option) ->
Lexing.lexbuf ->
unit) ->
input ->
('token instance, open_err) Stdlib.result
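As a usage sketch, a client owning a lexer module, here the
hypothetical [MyLexer], assumed to provide [init], [scan] and a
[Token] submodule with [to_region], [check_right_context] and
[is_eof], could drain all the tokens of a file as follows (the error
handling mirrors the call sites elsewhere in this commit):

let tokens_of_file path =
  match LexerLib.open_token_stream
          ~init:MyLexer.init
          ~scan:MyLexer.scan
          ~token_to_region:MyLexer.Token.to_region
          ~style:MyLexer.Token.check_right_context
          (LexerLib.File path)
  with
    Ok LexerLib.{read; buffer; close; _} ->
      let rec drain acc =
        let token = read ~log:(fun _ _ -> ()) buffer in
        if MyLexer.Token.is_eof token
        then (close (); Ok (List.rev (token :: acc)))
        else drain (token :: acc)
      in drain []
  | Error (LexerLib.File_opening msg) -> Error msg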
View File
@ -4,7 +4,8 @@ module Region = Simple_utils.Region
module type S = module type S =
sig sig
module Lexer : LexerLib.S module Lexer : Lexer.S
type token = Lexer.token
val output_token : val output_token :
?offsets:bool -> ?offsets:bool ->
@ -12,7 +13,7 @@ module type S =
EvalOpt.command -> EvalOpt.command ->
out_channel -> out_channel ->
Markup.t list -> Markup.t list ->
Lexer.token -> token ->
unit unit
type file_path = string type file_path = string
@ -22,19 +23,26 @@ module type S =
[`Byte | `Point] -> [`Byte | `Point] ->
?block:EvalOpt.block_comment -> ?block:EvalOpt.block_comment ->
?line:EvalOpt.line_comment -> ?line:EvalOpt.line_comment ->
Lexer.input -> token_to_region:(token -> Region.t) ->
style:(token ->
(Lexing.lexbuf -> (Markup.t list * token) option) ->
Lexing.lexbuf ->
unit) ->
LexerLib.input ->
EvalOpt.command -> EvalOpt.command ->
(unit, string Region.reg) Stdlib.result (unit, string Region.reg) Stdlib.result
end end
module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) = module Make (Lexer: Lexer.S) : (S with module Lexer = Lexer) =
struct struct
module Lexer = Lexer module Lexer = Lexer
module Token = Lexer.Token module Token = Lexer.Token
type token = Lexer.token
(** Pretty-printing in a string the lexemes making up the markup (* Pretty-printing in a string the lexemes making up the markup
between two tokens, concatenated with the last lexeme between two tokens, concatenated with the last lexeme
itself. *) itself. *)
let output_token ?(offsets=true) mode command let output_token ?(offsets=true) mode command
channel left_mark token : unit = channel left_mark token : unit =
let output str = Printf.fprintf channel "%s%!" str in let output str = Printf.fprintf channel "%s%!" str in
@ -57,10 +65,17 @@ module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) =
type file_path = string type file_path = string
let trace ?(offsets=true) mode ?block ?line input command : let trace ?(offsets=true) mode ?block ?line
~token_to_region ~style input command :
(unit, string Region.reg) Stdlib.result = (unit, string Region.reg) Stdlib.result =
match Lexer.open_token_stream ?line ?block input with match LexerLib.open_token_stream
Ok Lexer.{read; buffer; close; _} -> ~init:Lexer.init
~scan:Lexer.scan
~token_to_region
~style
?line ?block input
with
Ok LexerLib.{read; buffer; close; _} ->
let log = output_token ~offsets mode command stdout let log = output_token ~offsets mode command stdout
and close_all () = flush_all (); close () in and close_all () = flush_all (); close () in
let rec iter () = let rec iter () =
@ -69,12 +84,17 @@ module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) =
if Token.is_eof token if Token.is_eof token
then Stdlib.Ok () then Stdlib.Ok ()
else iter () else iter ()
| exception Lexer.Token.Error error ->
let msg =
Lexer.Token.format_error
~offsets mode ~file:true error
in Stdlib.Error msg
| exception Lexer.Error error -> | exception Lexer.Error error ->
let msg = let msg =
Lexer.format_error ~offsets mode ~file:true error Lexer.format_error ~offsets mode ~file:true error
in Stdlib.Error msg in in Stdlib.Error msg in
let result = iter () let result = iter ()
in close_all (); result in close_all (); result
| Stdlib.Error (Lexer.File_opening msg) -> | Stdlib.Error (LexerLib.File_opening msg) ->
flush_all (); Stdlib.Error (Region.wrap_ghost msg) flush_all (); Stdlib.Error (Region.wrap_ghost msg)
end end
View File
@ -2,7 +2,8 @@ module Region = Simple_utils.Region
module type S = module type S =
sig sig
module Lexer : LexerLib.S module Lexer : Lexer.S
type token = Lexer.token
val output_token : val output_token :
?offsets:bool -> ?offsets:bool ->
@ -10,7 +11,7 @@ module type S =
EvalOpt.command -> EvalOpt.command ->
out_channel -> out_channel ->
Markup.t list -> Markup.t list ->
Lexer.token -> token ->
unit unit
type file_path = string type file_path = string
@ -20,9 +21,14 @@ module type S =
[`Byte | `Point] -> [`Byte | `Point] ->
?block:EvalOpt.block_comment -> ?block:EvalOpt.block_comment ->
?line:EvalOpt.line_comment -> ?line:EvalOpt.line_comment ->
Lexer.input -> token_to_region:(token -> Region.t) ->
style:(token ->
(Lexing.lexbuf -> (Markup.t list * token) option) ->
Lexing.lexbuf ->
unit) ->
LexerLib.input ->
EvalOpt.command -> EvalOpt.command ->
(unit, string Region.reg) Stdlib.result (unit, string Region.reg) Stdlib.result
end end
module Make (Lexer: LexerLib.S) : S with module Lexer = Lexer module Make (Lexer: Lexer.S) : S with module Lexer = Lexer
View File
@ -9,7 +9,7 @@ module type IO =
val options : EvalOpt.options (* CLI options *) val options : EvalOpt.options (* CLI options *)
end end
module Make (IO: IO) (Lexer: LexerLib.S) = module Make (IO: IO) (Lexer: Lexer.S) =
struct struct
(* Error printing and exception tracing *) (* Error printing and exception tracing *)
@ -39,11 +39,16 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
| Stdlib.Ok pp_buffer -> | Stdlib.Ok pp_buffer ->
(* Running the lexer on the preprocessed input *) (* Running the lexer on the preprocessed input *)
let source = Lexer.String (Buffer.contents pp_buffer) in let source = LexerLib.String (Buffer.contents pp_buffer) in
match Lexer.open_token_stream ?line:IO.options#line match LexerLib.open_token_stream
?line:IO.options#line
?block:IO.options#block ?block:IO.options#block
~init:Lexer.init
~scan:Lexer.scan
~token_to_region:Lexer.Token.to_region
~style:Lexer.Token.check_right_context
source with source with
Ok Lexer.{read; buffer; close; _} -> Ok LexerLib.{read; buffer; close; _} ->
let close_all () = flush_all (); close () in let close_all () = flush_all (); close () in
let rec read_tokens tokens = let rec read_tokens tokens =
match read ~log:(fun _ _ -> ()) buffer with match read ~log:(fun _ _ -> ()) buffer with
@ -51,9 +56,7 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
if Lexer.Token.is_eof token if Lexer.Token.is_eof token
then Stdlib.Ok (List.rev tokens) then Stdlib.Ok (List.rev tokens)
else read_tokens (token::tokens) else read_tokens (token::tokens)
(* | exception _ -> | exception Lexer.Token.Error error ->
Printf.eprintf "Here\n%!"; exit 1
*)
let file = let file =
match IO.options#input with match IO.options#input with
None | Some "-" -> false None | Some "-" -> false
@ -74,7 +77,7 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
in Stdlib.Error msg in in Stdlib.Error msg in
let result = read_tokens [] let result = read_tokens []
in close_all (); result in close_all (); result
| Stdlib.Error (Lexer.File_opening msg) -> | Stdlib.Error (LexerLib.File_opening msg) ->
flush_all (); Stdlib.Error (Region.wrap_ghost msg) in flush_all (); Stdlib.Error (Region.wrap_ghost msg) in
match IO.options#input with match IO.options#input with
None -> preproc stdin None -> preproc stdin
@ -115,7 +118,9 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
IO.options#mode IO.options#mode
?block:IO.options#block ?block:IO.options#block
?line:IO.options#line ?line:IO.options#line
(Lexer.String preproc_str) ~token_to_region:Lexer.Token.to_region
~style:Lexer.Token.check_right_context
(LexerLib.String preproc_str)
IO.options#cmd IO.options#cmd
in match IO.options#input with in match IO.options#input with
None -> preproc stdin None -> preproc stdin
View File
@ -7,7 +7,7 @@ module type IO =
val options : EvalOpt.options (* CLI options *) val options : EvalOpt.options (* CLI options *)
end end
module Make (IO: IO) (Lexer: LexerLib.S) : module Make (IO: IO) (Lexer: Lexer.S) :
sig sig
val scan : unit -> (Lexer.token list, string Region.reg) Stdlib.result val scan : unit -> (Lexer.token list, string Region.reg) Stdlib.result
val trace : unit -> (unit, string Region.reg) Stdlib.result val trace : unit -> (unit, string Region.reg) Stdlib.result
View File
@ -56,8 +56,8 @@ module type PARSER =
(* Main functor *) (* Main functor *)
module Make (IO: IO) module Make (IO: IO)
(Lexer: LexerLib.S) (Lexer: Lexer.S)
(Parser: PARSER with type token = Lexer.Token.token) (Parser: PARSER with type token = Lexer.token)
(ParErr: sig val message : int -> string end) = (ParErr: sig val message : int -> string end) =
struct struct
module I = Parser.MenhirInterpreter module I = Parser.MenhirInterpreter
@ -143,14 +143,14 @@ module Make (IO: IO)
~offsets:IO.options#offsets ~offsets:IO.options#offsets
IO.options#mode IO.options#cmd stdout IO.options#mode IO.options#cmd stdout
let incr_contract Lexer.{read; buffer; get_win; close; _} = let incr_contract LexerLib.{read; buffer; get_win; close; _} =
let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer
and failure = failure get_win in and failure = failure get_win in
let parser = Incr.contract buffer.Lexing.lex_curr_p in let parser = Incr.contract buffer.Lexing.lex_curr_p in
let ast = I.loop_handle success failure supplier parser let ast = I.loop_handle success failure supplier parser
in flush_all (); close (); ast in flush_all (); close (); ast
let incr_expr Lexer.{read; buffer; get_win; close; _} = let incr_expr LexerLib.{read; buffer; get_win; close; _} =
let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer
and failure = failure get_win in and failure = failure get_win in
let parser = Incr.interactive_expr buffer.Lexing.lex_curr_p in let parser = Incr.interactive_expr buffer.Lexing.lex_curr_p in
View File
@ -55,8 +55,8 @@ module type PARSER =
end end
module Make (IO: IO) module Make (IO: IO)
(Lexer: LexerLib.S) (Lexer: Lexer.S)
(Parser: PARSER with type token = Lexer.Token.token) (Parser: PARSER with type token = Lexer.token)
(ParErr: sig val message : int -> string end) : (ParErr: sig val message : int -> string end) :
sig sig
(* WARNING: The following parsers may all raise [Lexer.Error] *) (* WARNING: The following parsers may all raise [Lexer.Error] *)
@ -78,8 +78,8 @@ module Make (IO: IO)
exception Point of error exception Point of error
val incr_contract : Lexer.instance -> Parser.ast val incr_contract : Lexer.token LexerLib.instance -> Parser.ast
val incr_expr : Lexer.instance -> Parser.expr val incr_expr : Lexer.token LexerLib.instance -> Parser.expr
val format_error : val format_error :
?offsets:bool -> [`Point | `Byte] -> error -> string Region.reg ?offsets:bool -> [`Point | `Byte] -> error -> string Region.reg
View File
@ -37,7 +37,7 @@ module type Printer =
val print_expr : state -> expr -> unit val print_expr : state -> expr -> unit
end end
module Make (Lexer: LexerLib.S) module Make (Lexer: Lexer.S)
(AST: sig type t type expr end) (AST: sig type t type expr end)
(Parser: ParserAPI.PARSER (Parser: ParserAPI.PARSER
with type ast = AST.t with type ast = AST.t
@ -89,12 +89,12 @@ module Make (Lexer: LexerLib.S)
ParserLog.mk_state ~offsets:SubIO.options#offsets ParserLog.mk_state ~offsets:SubIO.options#offsets
~mode:SubIO.options#mode ~mode:SubIO.options#mode
~buffer:output in ~buffer:output in
let close () = lexer_inst.Lexer.close () in let close () = lexer_inst.LexerLib.close () in
let expr = let expr =
try try
if SubIO.options#mono then if SubIO.options#mono then
let tokeniser = lexer_inst.Lexer.read ~log let tokeniser = lexer_inst.LexerLib.read ~log
and lexbuf = lexer_inst.Lexer.buffer and lexbuf = lexer_inst.LexerLib.buffer
in Front.mono_expr tokeniser lexbuf in Front.mono_expr tokeniser lexbuf
else else
Front.incr_expr lexer_inst Front.incr_expr lexer_inst
@ -124,12 +124,12 @@ module Make (Lexer: LexerLib.S)
ParserLog.mk_state ~offsets:SubIO.options#offsets ParserLog.mk_state ~offsets:SubIO.options#offsets
~mode:SubIO.options#mode ~mode:SubIO.options#mode
~buffer:output in ~buffer:output in
let close () = lexer_inst.Lexer.close () in let close () = lexer_inst.LexerLib.close () in
let ast = let ast =
try try
if SubIO.options#mono then if SubIO.options#mono then
let tokeniser = lexer_inst.Lexer.read ~log let tokeniser = lexer_inst.LexerLib.read ~log
and lexbuf = lexer_inst.Lexer.buffer and lexbuf = lexer_inst.LexerLib.buffer
in Front.mono_contract tokeniser lexbuf in Front.mono_contract tokeniser lexbuf
else else
Front.incr_contract lexer_inst Front.incr_contract lexer_inst
@ -163,9 +163,17 @@ module Make (Lexer: LexerLib.S)
| exception Lexer.Error err -> | exception Lexer.Error err ->
let file = let file =
lexer_inst.Lexer.buffer.Lexing.lex_curr_p.Lexing.pos_fname in lexer_inst.LexerLib.buffer.Lexing.lex_curr_p.Lexing.pos_fname in
let error = let error = Lexer.format_error
Lexer.format_error ~offsets:SubIO.options#offsets ~offsets:SubIO.options#offsets
SubIO.options#mode err ~file:(file <> "")
in Stdlib.Error error
| exception Lexer.Token.Error err ->
let file =
lexer_inst.LexerLib.buffer.Lexing.lex_curr_p.Lexing.pos_fname in
let error = Lexer.Token.format_error
~offsets:SubIO.options#offsets
SubIO.options#mode err ~file:(file <> "") SubIO.options#mode err ~file:(file <> "")
in Stdlib.Error error in Stdlib.Error error
@ -181,7 +189,7 @@ module Make (Lexer: LexerLib.S)
| exception Parser.Error -> | exception Parser.Error ->
let invalid, valid_opt = let invalid, valid_opt =
match lexer_inst.Lexer.get_win () with match lexer_inst.LexerLib.get_win () with
LexerLib.Nil -> LexerLib.Nil ->
assert false (* Safe: There is always at least EOF. *) assert false (* Safe: There is always at least EOF. *)
| LexerLib.One invalid -> invalid, None | LexerLib.One invalid -> invalid, None
@ -205,8 +213,8 @@ module Make (Lexer: LexerLib.S)
(* Parsing a contract *) (* Parsing a contract *)
let gen_parser options input parser = let gen_parser options input parser =
match Lexer.lexbuf_from_input input with match LexerLib.lexbuf_from_input input with
Stdlib.Error (Lexer.File_opening msg) -> Stdlib.Error (LexerLib.File_opening msg) ->
Stdlib.Error (Region.wrap_ghost msg) Stdlib.Error (Region.wrap_ghost msg)
| Ok (lexbuf, close) -> | Ok (lexbuf, close) ->
(* Preprocessing the input source *) (* Preprocessing the input source *)
@ -224,48 +232,53 @@ module Make (Lexer: LexerLib.S)
(* Lexing and parsing the preprocessed input source *) (* Lexing and parsing the preprocessed input source *)
let () = close () in let () = close () in
let input' = Lexer.String (Buffer.contents buffer) in let input' = LexerLib.String (Buffer.contents buffer) in
match Lexer.open_token_stream ?line:options#line match LexerLib.open_token_stream
~init:Lexer.init
~scan:Lexer.scan
~token_to_region:Lexer.Token.to_region
~style:Lexer.Token.check_right_context
?line:options#line
?block:options#block ?block:options#block
input' input'
with with
Ok instance -> Ok instance ->
let open Lexing in let open Lexing in
instance.Lexer.buffer.lex_curr_p <- instance.LexerLib.buffer.lex_curr_p <-
{instance.Lexer.buffer.lex_curr_p with pos_fname=file}; {instance.LexerLib.buffer.lex_curr_p with pos_fname=file};
apply instance parser apply instance parser
| Stdlib.Error (Lexer.File_opening msg) -> | Stdlib.Error (LexerLib.File_opening msg) ->
Stdlib.Error (Region.wrap_ghost msg) Stdlib.Error (Region.wrap_ghost msg)
(* Parsing a contract in a file *) (* Parsing a contract in a file *)
let contract_in_file (source : string) = let contract_in_file (source : string) =
let options = SubIO.make ~input:(Some source) ~expr:false let options = SubIO.make ~input:(Some source) ~expr:false
in gen_parser options (Lexer.File source) parse_contract in gen_parser options (LexerLib.File source) parse_contract
(* Parsing a contract in a string *) (* Parsing a contract in a string *)
let contract_in_string (source : string) = let contract_in_string (source : string) =
let options = SubIO.make ~input:None ~expr:false in let options = SubIO.make ~input:None ~expr:false in
gen_parser options (Lexer.String source) parse_contract gen_parser options (LexerLib.String source) parse_contract
(* Parsing a contract in stdin *) (* Parsing a contract in stdin *)
let contract_in_stdin () = let contract_in_stdin () =
let options = SubIO.make ~input:None ~expr:false in let options = SubIO.make ~input:None ~expr:false in
gen_parser options (Lexer.Channel stdin) parse_contract gen_parser options (LexerLib.Channel stdin) parse_contract
(* Parsing an expression in a string *) (* Parsing an expression in a string *)
let expr_in_string (source : string) = let expr_in_string (source : string) =
let options = SubIO.make ~input:None ~expr:true in let options = SubIO.make ~input:None ~expr:true in
gen_parser options (Lexer.String source) parse_expr gen_parser options (LexerLib.String source) parse_expr
(* Parsing an expression in stdin *) (* Parsing an expression in stdin *)
let expr_in_stdin () = let expr_in_stdin () =
let options = SubIO.make ~input:None ~expr:true in let options = SubIO.make ~input:None ~expr:true in
gen_parser options (Lexer.Channel stdin) parse_expr gen_parser options (LexerLib.Channel stdin) parse_expr
(* Preprocess only *) (* Preprocess only *)
View File
@ -43,7 +43,7 @@ module type Printer =
(* Main functor to make the parser *) (* Main functor to make the parser *)
module Make (Lexer : LexerLib.S) module Make (Lexer : Lexer.S)
(AST : sig type t type expr end) (AST : sig type t type expr end)
(Parser : ParserAPI.PARSER (Parser : ParserAPI.PARSER
with type ast = AST.t with type ast = AST.t
View File
@ -1 +1 @@
const a: string = -0x222; const a: string = - (**) 0x2222
View File
@ -1 +1 @@
let a = -0x222; let a = - (**) 0x2222
View File
@ -1 +1 @@
let a = -0x222; let a = - /**/ 0x2222;
View File
@ -1 +0,0 @@
let arguments = 1;