Finished refactoring of lexer.
This commit is contained in:
parent
9618a48848
commit
005b7fd69b
@ -54,7 +54,7 @@ ligo: : Lexical error in file "broken_string.religo", line 1, characters 8-9:
|
||||
|
||||
run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.ligo" ; "main" ] ;
|
||||
[%expect {|
|
||||
ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters 18-23:
|
||||
ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters 18-31:
|
||||
Negative byte sequence.
|
||||
Hint: Remove the leading minus sign.
|
||||
{}
|
||||
@ -71,7 +71,7 @@ ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters
|
||||
|
||||
run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.mligo" ; "main" ] ;
|
||||
[%expect {|
|
||||
ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters 8-13:
|
||||
ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters 8-21:
|
||||
Negative byte sequence.
|
||||
Hint: Remove the leading minus sign.
|
||||
{}
|
||||
@ -88,7 +88,7 @@ ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters
|
||||
|
||||
run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.religo" ; "main" ] ;
|
||||
[%expect {|
|
||||
ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, characters 8-13:
|
||||
ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, characters 8-21:
|
||||
Negative byte sequence.
|
||||
Hint: Remove the leading minus sign.
|
||||
{}
|
||||
@ -103,6 +103,7 @@ ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, character
|
||||
* Check the changelog by running 'ligo changelog'
|
||||
|} ];
|
||||
|
||||
(*
|
||||
run_ligo_bad [ "compile-contract" ; "../../test/lexer/reserved_name.ligo" ; "main" ] ;
|
||||
[%expect {|
|
||||
ligo: : Lexical error in file "reserved_name.ligo", line 1, characters 4-13:
|
||||
@ -119,6 +120,7 @@ ligo: : Lexical error in file "reserved_name.ligo", line 1, characters 4-13:
|
||||
* Open a gitlab issue: https://gitlab.com/ligolang/ligo/issues/new
|
||||
* Check the changelog by running 'ligo changelog'
|
||||
|} ];
|
||||
*)
|
||||
|
||||
run_ligo_bad [ "compile-contract" ; "../../test/lexer/reserved_name.religo" ; "main" ] ;
|
||||
[%expect {|
|
||||
@ -204,10 +206,9 @@ ligo: : Lexical error in file "unexpected_character.religo", line 1, characters
|
||||
|
||||
run_ligo_bad [ "compile-contract" ; "../../test/lexer/unterminated_comment.mligo" ; "main" ] ;
|
||||
[%expect {|
|
||||
ligo: : Lexical error in file "unterminated_comment.mligo", line 1, characters 0-2:
|
||||
ligo: : Preprocessing error in file "../../test/lexer/unterminated_comment.mligo", line 1, characters 0-2:
|
||||
Unterminated comment.
|
||||
Hint: Close with "*)".
|
||||
{}
|
||||
Hint: Close with "*)". {}
|
||||
|
||||
|
||||
If you're not sure how to fix this error, you can
|
||||
|
@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile
|
||||
|
||||
../shared/Lexer.mli
|
||||
../shared/Lexer.mll
|
||||
../shared/LexerLib.ml
|
||||
../shared/EvalOpt.ml
|
||||
../shared/EvalOpt.mli
|
||||
../shared/FQueue.ml
|
||||
|
@ -102,10 +102,35 @@ type t =
|
||||
|
||||
| EOF of Region.t (* End of file *)
|
||||
|
||||
|
||||
(* Projections *)
|
||||
|
||||
type token = t
|
||||
|
||||
let proj_token = function
|
||||
ARROW region -> region, "ARROW"
|
||||
(* Literals *)
|
||||
|
||||
String Region.{region; value} ->
|
||||
region, sprintf "Str %s" value
|
||||
| Bytes Region.{region; value = s,b} ->
|
||||
region,
|
||||
sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b)
|
||||
| Int Region.{region; value = s,n} ->
|
||||
region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
|
||||
| Nat Region.{region; value = s,n} ->
|
||||
region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
|
||||
| Mutez Region.{region; value = s,n} ->
|
||||
region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
|
||||
| Ident Region.{region; value} ->
|
||||
region, sprintf "Ident %s" value
|
||||
| Constr Region.{region; value} ->
|
||||
region, sprintf "Constr %s" value
|
||||
| Attr Region.{region; value} ->
|
||||
region, sprintf "Attr \"%s\"" value
|
||||
|
||||
(* Symbols *)
|
||||
|
||||
| ARROW region -> region, "ARROW"
|
||||
| CONS region -> region, "CONS"
|
||||
| CAT region -> region, "CAT"
|
||||
| MINUS region -> region, "MINUS"
|
||||
@ -132,24 +157,9 @@ let proj_token = function
|
||||
| GE region -> region, "GE"
|
||||
| BOOL_OR region -> region, "BOOL_OR"
|
||||
| BOOL_AND region -> region, "BOOL_AND"
|
||||
| Ident Region.{region; value} ->
|
||||
region, sprintf "Ident %s" value
|
||||
| Constr Region.{region; value} ->
|
||||
region, sprintf "Constr %s" value
|
||||
| Int Region.{region; value = s,n} ->
|
||||
region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
|
||||
| Nat Region.{region; value = s,n} ->
|
||||
region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
|
||||
| Mutez Region.{region; value = s,n} ->
|
||||
region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
|
||||
| String Region.{region; value} ->
|
||||
region, sprintf "Str %s" value
|
||||
| Bytes Region.{region; value = s,b} ->
|
||||
region,
|
||||
sprintf "Bytes (\"%s\", \"0x%s\")"
|
||||
s (Hex.show b)
|
||||
| Attr Region.{region; value} ->
|
||||
region, sprintf "Attr \"%s\"" value
|
||||
|
||||
(* Keywords *)
|
||||
|
||||
| Begin region -> region, "Begin"
|
||||
| Else region -> region, "Else"
|
||||
| End region -> region, "End"
|
||||
@ -168,12 +178,32 @@ let proj_token = function
|
||||
| True region -> region, "True"
|
||||
| Type region -> region, "Type"
|
||||
| With region -> region, "With"
|
||||
|
||||
(* Data *)
|
||||
|
||||
| C_None region -> region, "C_None"
|
||||
| C_Some region -> region, "C_Some"
|
||||
|
||||
(* Virtual tokens *)
|
||||
|
||||
| EOF region -> region, "EOF"
|
||||
|
||||
|
||||
let to_lexeme = function
|
||||
ARROW _ -> "->"
|
||||
(* Literals *)
|
||||
|
||||
String s -> String.escaped s.Region.value
|
||||
| Bytes b -> fst b.Region.value
|
||||
| Int i
|
||||
| Nat i
|
||||
| Mutez i -> fst i.Region.value
|
||||
| Ident id -> id.Region.value
|
||||
| Constr id -> id.Region.value
|
||||
| Attr a -> a.Region.value
|
||||
|
||||
(* Symbols *)
|
||||
|
||||
| ARROW _ -> "->"
|
||||
| CONS _ -> "::"
|
||||
| CAT _ -> "^"
|
||||
| MINUS _ -> "-"
|
||||
@ -201,14 +231,7 @@ let to_lexeme = function
|
||||
| BOOL_OR _ -> "||"
|
||||
| BOOL_AND _ -> "&&"
|
||||
|
||||
| Ident id -> id.Region.value
|
||||
| Constr id -> id.Region.value
|
||||
| Int i
|
||||
| Nat i
|
||||
| Mutez i -> fst i.Region.value
|
||||
| String s -> String.escaped s.Region.value
|
||||
| Bytes b -> fst b.Region.value
|
||||
| Attr a -> a.Region.value
|
||||
(* Keywords *)
|
||||
|
||||
| Begin _ -> "begin"
|
||||
| Else _ -> "else"
|
||||
@ -229,11 +252,17 @@ let to_lexeme = function
|
||||
| Then _ -> "then"
|
||||
| With _ -> "with"
|
||||
|
||||
(* Data constructors *)
|
||||
|
||||
| C_None _ -> "None"
|
||||
| C_Some _ -> "Some"
|
||||
|
||||
(* Virtual tokens *)
|
||||
|
||||
| EOF _ -> ""
|
||||
|
||||
(* CONVERSIONS *)
|
||||
|
||||
let to_string token ?(offsets=true) mode =
|
||||
let region, val_str = proj_token token in
|
||||
let reg_str = region#compact ~offsets mode
|
||||
@ -241,10 +270,6 @@ let to_string token ?(offsets=true) mode =
|
||||
|
||||
let to_region token = proj_token token |> fst
|
||||
|
||||
(* Injections *)
|
||||
|
||||
type int_err = Non_canonical_zero
|
||||
|
||||
(* LEXIS *)
|
||||
|
||||
let keywords = [
|
||||
@ -385,6 +410,8 @@ let mk_bytes lexeme region =
|
||||
let value = lexeme, `Hex norm
|
||||
in Bytes Region.{region; value}
|
||||
|
||||
type int_err = Non_canonical_zero
|
||||
|
||||
let mk_int lexeme region =
|
||||
let z =
|
||||
Str.(global_replace (regexp "_") "" lexeme) |> Z.of_string
|
||||
@ -398,16 +425,14 @@ type nat_err =
|
||||
|
||||
let mk_nat lexeme region =
|
||||
match (String.index_opt lexeme 'n') with
|
||||
| None -> Error Invalid_natural
|
||||
| Some _ -> (
|
||||
let z =
|
||||
None -> Error Invalid_natural
|
||||
| Some _ -> let z =
|
||||
Str.(global_replace (regexp "_") "" lexeme) |>
|
||||
Str.(global_replace (regexp "n") "") |>
|
||||
Z.of_string in
|
||||
if Z.equal z Z.zero && lexeme <> "0n"
|
||||
then Error Non_canonical_zero_nat
|
||||
else Ok (Nat Region.{region; value = lexeme,z})
|
||||
)
|
||||
|
||||
let mk_mutez lexeme region =
|
||||
let z =
|
||||
@ -422,8 +447,6 @@ let eof region = EOF region
|
||||
|
||||
type sym_err = Invalid_symbol
|
||||
|
||||
type attr_err = Invalid_attribute
|
||||
|
||||
let mk_sym lexeme region =
|
||||
match lexeme with
|
||||
(* Lexemes in common with all concrete syntaxes *)
|
||||
@ -473,9 +496,10 @@ let mk_constr lexeme region =
|
||||
|
||||
(* Attributes *)
|
||||
|
||||
type attr_err = Invalid_attribute
|
||||
|
||||
let mk_attr header lexeme region =
|
||||
if header = "[@" then
|
||||
Error Invalid_attribute
|
||||
if header = "[@" then Error Invalid_attribute
|
||||
else Ok (Attr Region.{value=lexeme; region})
|
||||
|
||||
(* Predicates *)
|
||||
@ -485,12 +509,14 @@ let is_bytes = function Bytes _ -> true | _ -> false
|
||||
let is_int = function Int _ -> true | _ -> false
|
||||
let is_ident = function Ident _ -> true | _ -> false
|
||||
let is_eof = function EOF _ -> true | _ -> false
|
||||
let is_minus = function MINUS _ -> true | _ -> false
|
||||
|
||||
(* Errors *)
|
||||
|
||||
type error =
|
||||
Odd_lengthed_bytes
|
||||
| Missing_break
|
||||
| Negative_byte_sequence
|
||||
|
||||
let error_to_string = function
|
||||
Odd_lengthed_bytes ->
|
||||
@ -499,6 +525,9 @@ let error_to_string = function
|
||||
| Missing_break ->
|
||||
"Missing break.\n\
|
||||
Hint: Insert some space."
|
||||
| Negative_byte_sequence ->
|
||||
"Negative byte sequence.\n\
|
||||
Hint: Remove the leading minus sign."
|
||||
|
||||
exception Error of error Region.reg
|
||||
|
||||
@ -511,29 +540,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file =
|
||||
let fail region value = raise (Error Region.{region; value})
|
||||
|
||||
let check_right_context token next_token buffer : unit =
|
||||
if not (is_eof token) then
|
||||
if is_int token || is_bytes token then
|
||||
match next_token buffer with
|
||||
Some ([], next) ->
|
||||
let pos = (to_region token)#stop in
|
||||
let region = Region.make ~start:pos ~stop:pos in
|
||||
if is_int next then
|
||||
fail region Odd_lengthed_bytes
|
||||
else
|
||||
if is_ident next || is_string next || is_bytes next then
|
||||
fail region Missing_break
|
||||
| Some (_::_, _) | None -> ()
|
||||
else
|
||||
if is_ident token || is_string token then
|
||||
match next_token buffer with
|
||||
Some ([], next) ->
|
||||
if is_ident next || is_string next
|
||||
|| is_bytes next || is_int next
|
||||
then
|
||||
let pos = (to_region token)#stop in
|
||||
let region = Region.make ~start:pos ~stop:pos
|
||||
in fail region Missing_break
|
||||
| Some (_::_, _) | None -> ()
|
||||
None -> ()
|
||||
| Some (markup, next) ->
|
||||
if is_minus token && is_bytes next
|
||||
then let region =
|
||||
Region.cover (to_region token) (to_region next)
|
||||
in fail region Negative_byte_sequence
|
||||
else
|
||||
match markup with
|
||||
[] ->
|
||||
if is_int token
|
||||
then if is_string next || is_ident next
|
||||
then fail region Missing_break
|
||||
else ()
|
||||
else
|
||||
if is_string token
|
||||
then if is_int next || is_bytes next || is_ident next
|
||||
then fail region Missing_break
|
||||
else ()
|
||||
else
|
||||
if is_bytes token
|
||||
then if is_string next || is_ident next
|
||||
then fail region Missing_break
|
||||
else if is_int next
|
||||
then fail region Odd_lengthed_bytes
|
||||
else ()
|
||||
else ()
|
||||
| _::_ -> ()
|
||||
|
||||
(* END TRAILER *)
|
||||
}
|
||||
|
@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile
|
||||
|
||||
../shared/Lexer.mli
|
||||
../shared/Lexer.mll
|
||||
../shared/LexerLib.mli
|
||||
../shared/LexerLib.ml
|
||||
../shared/EvalOpt.ml
|
||||
../shared/EvalOpt.mli
|
||||
|
@ -1,4 +1,4 @@
|
||||
(** This signature defines the lexical tokens for LIGO
|
||||
(* This signature defines the lexical tokens for LIGO
|
||||
|
||||
_Tokens_ are the abstract units which are used by the parser to
|
||||
build the abstract syntax tree (AST), in other words, the stream of
|
||||
|
@ -5,24 +5,14 @@
|
||||
|
||||
(* Shorthands *)
|
||||
|
||||
type lexeme = string
|
||||
|
||||
let sprintf = Printf.sprintf
|
||||
|
||||
module Region = Simple_utils.Region
|
||||
module Pos = Simple_utils.Pos
|
||||
module SMap = Map.Make (String)
|
||||
module SSet = Set.Make (String)
|
||||
|
||||
(* Hack to roll back one lexeme in the current semantic action *)
|
||||
(*
|
||||
let rollback buffer =
|
||||
let open Lexing in
|
||||
let len = String.length (lexeme buffer) in
|
||||
let pos_cnum = buffer.lex_curr_p.pos_cnum - len in
|
||||
buffer.lex_curr_pos <- buffer.lex_curr_pos - len;
|
||||
buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum}
|
||||
*)
|
||||
type lexeme = string
|
||||
|
||||
let sprintf = Printf.sprintf
|
||||
|
||||
(* TOKENS *)
|
||||
|
||||
@ -123,6 +113,8 @@ type t =
|
||||
| EOF of Region.t
|
||||
|
||||
|
||||
(* Projections *)
|
||||
|
||||
type token = t
|
||||
|
||||
let proj_token = function
|
||||
@ -130,32 +122,20 @@ let proj_token = function
|
||||
|
||||
String Region.{region; value} ->
|
||||
region, sprintf "String %s" value
|
||||
|
||||
| Bytes Region.{region; value = s,b} ->
|
||||
region,
|
||||
sprintf "Bytes (\"%s\", \"0x%s\")"
|
||||
s (Hex.show b)
|
||||
|
||||
sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b)
|
||||
| Int Region.{region; value = s,n} ->
|
||||
region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
|
||||
|
||||
| Nat Region.{region; value = s,n} ->
|
||||
region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
|
||||
|
||||
| Mutez Region.{region; value = s,n} ->
|
||||
region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
|
||||
|
||||
| Ident Region.{region; value} ->
|
||||
region, sprintf "Ident \"%s\"" value
|
||||
|
||||
| Constr Region.{region; value} ->
|
||||
region, sprintf "Constr \"%s\"" value
|
||||
|
||||
(*
|
||||
| Attr {header; string={region; value}} ->
|
||||
region, sprintf "Attr (\"%s\",\"%s\")" header value
|
||||
*)
|
||||
|
||||
(* Symbols *)
|
||||
|
||||
| SEMI region -> region, "SEMI"
|
||||
@ -382,9 +362,7 @@ let keywords = [
|
||||
(fun reg -> With reg)
|
||||
]
|
||||
|
||||
let reserved =
|
||||
let open SSet in
|
||||
empty |> add "arguments"
|
||||
let reserved = SSet.empty
|
||||
|
||||
let constructors = [
|
||||
(fun reg -> False reg);
|
||||
@ -485,8 +463,7 @@ type nat_err =
|
||||
let mk_nat lexeme region =
|
||||
match String.index_opt lexeme 'n' with
|
||||
None -> Error Invalid_natural
|
||||
| Some _ ->
|
||||
let z =
|
||||
| Some _ -> let z =
|
||||
Str.(global_replace (regexp "_") "" lexeme) |>
|
||||
Str.(global_replace (regexp "n") "") |>
|
||||
Z.of_string in
|
||||
@ -495,8 +472,7 @@ let mk_nat lexeme region =
|
||||
else Ok (Nat Region.{region; value = lexeme,z})
|
||||
|
||||
let mk_mutez lexeme region =
|
||||
let z =
|
||||
Str.(global_replace (regexp "_") "" lexeme) |>
|
||||
let z = Str.(global_replace (regexp "_") "" lexeme) |>
|
||||
Str.(global_replace (regexp "mutez") "") |>
|
||||
Z.of_string in
|
||||
if Z.equal z Z.zero && lexeme <> "0mutez"
|
||||
@ -557,8 +533,7 @@ let mk_constr lexeme region =
|
||||
|
||||
type attr_err = Invalid_attribute
|
||||
|
||||
let mk_attr _header _string _region =
|
||||
Error Invalid_attribute
|
||||
let mk_attr _ _ _ = Error Invalid_attribute
|
||||
|
||||
(* Predicates *)
|
||||
|
||||
@ -567,12 +542,14 @@ let is_bytes = function Bytes _ -> true | _ -> false
|
||||
let is_int = function Int _ -> true | _ -> false
|
||||
let is_ident = function Ident _ -> true | _ -> false
|
||||
let is_eof = function EOF _ -> true | _ -> false
|
||||
let is_minus = function MINUS _ -> true | _ -> false
|
||||
|
||||
(* Errors *)
|
||||
|
||||
type error =
|
||||
Odd_lengthed_bytes
|
||||
| Missing_break
|
||||
| Negative_byte_sequence
|
||||
|
||||
let error_to_string = function
|
||||
Odd_lengthed_bytes ->
|
||||
@ -581,6 +558,9 @@ let error_to_string = function
|
||||
| Missing_break ->
|
||||
"Missing break.\n\
|
||||
Hint: Insert some space."
|
||||
| Negative_byte_sequence ->
|
||||
"Negative byte sequence.\n\
|
||||
Hint: Remove the leading minus sign."
|
||||
|
||||
exception Error of error Region.reg
|
||||
|
||||
@ -593,29 +573,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file =
|
||||
let fail region value = raise (Error Region.{region; value})
|
||||
|
||||
let check_right_context token next_token buffer : unit =
|
||||
if not (is_eof token) then
|
||||
if is_int token || is_bytes token then
|
||||
match next_token buffer with
|
||||
Some ([], next) ->
|
||||
let pos = (to_region token)#stop in
|
||||
let region = Region.make ~start:pos ~stop:pos in
|
||||
if is_int next then
|
||||
fail region Odd_lengthed_bytes
|
||||
else
|
||||
if is_ident next || is_string next || is_bytes next then
|
||||
fail region Missing_break
|
||||
| Some (_::_, _) | None -> ()
|
||||
else
|
||||
if is_ident token || is_string token then
|
||||
match next_token buffer with
|
||||
Some ([], next) ->
|
||||
if is_ident next || is_string next
|
||||
|| is_bytes next || is_int next
|
||||
then
|
||||
let pos = (to_region token)#stop in
|
||||
let region = Region.make ~start:pos ~stop:pos
|
||||
in fail region Missing_break
|
||||
| Some (_::_, _) | None -> ()
|
||||
None -> ()
|
||||
| Some (markup, next) ->
|
||||
if is_minus token && is_bytes next
|
||||
then let region =
|
||||
Region.cover (to_region token) (to_region next)
|
||||
in fail region Negative_byte_sequence
|
||||
else
|
||||
match markup with
|
||||
[] ->
|
||||
if is_int token
|
||||
then if is_string next || is_ident next
|
||||
then fail region Missing_break
|
||||
else ()
|
||||
else
|
||||
if is_string token
|
||||
then if is_int next || is_bytes next || is_ident next
|
||||
then fail region Missing_break
|
||||
else ()
|
||||
else
|
||||
if is_bytes token
|
||||
then if is_string next || is_ident next
|
||||
then fail region Missing_break
|
||||
else if is_int next
|
||||
then fail region Odd_lengthed_bytes
|
||||
else ()
|
||||
else ()
|
||||
| _::_ -> ()
|
||||
|
||||
(* END TRAILER *)
|
||||
}
|
||||
|
@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile
|
||||
|
||||
../shared/Lexer.mli
|
||||
../shared/Lexer.mll
|
||||
../shared/LexerLib.ml
|
||||
../shared/EvalOpt.ml
|
||||
../shared/EvalOpt.mli
|
||||
../shared/FQueue.ml
|
||||
|
@ -57,6 +57,7 @@ type t =
|
||||
| COLON of Region.t (* ":" *)
|
||||
| DOT of Region.t (* "." *)
|
||||
| ELLIPSIS of Region.t (* "..." *)
|
||||
| ARROW of Region.t (* "=>" *)
|
||||
|
||||
(* Wildcard *)
|
||||
|
||||
@ -69,10 +70,10 @@ type t =
|
||||
| NE of Region.t (* "!=" *)
|
||||
| LT of Region.t (* "<" *)
|
||||
| GT of Region.t (* ">" *)
|
||||
| LE of Region.t (* "=<" *)
|
||||
| LE of Region.t (* "<=" *)
|
||||
| GE of Region.t (* ">=" *)
|
||||
|
||||
| ARROW of Region.t (* "=>" *)
|
||||
(* Logic *)
|
||||
|
||||
| BOOL_OR of Region.t (* "||" *)
|
||||
| BOOL_AND of Region.t (* "&&" *)
|
||||
@ -95,10 +96,10 @@ type t =
|
||||
| False of Region.t
|
||||
| If of Region.t
|
||||
| Let of Region.t
|
||||
| Rec of Region.t
|
||||
| Switch of Region.t
|
||||
| Mod of Region.t
|
||||
| Or of Region.t
|
||||
| Rec of Region.t
|
||||
| Switch of Region.t
|
||||
| True of Region.t
|
||||
| Type of Region.t
|
||||
|
||||
@ -111,8 +112,6 @@ type t =
|
||||
|
||||
| EOF of Region.t (* End of file *)
|
||||
|
||||
type token = t
|
||||
|
||||
(* Projections
|
||||
|
||||
The difference between extracting the lexeme and a string from a
|
||||
@ -121,6 +120,8 @@ type token = t
|
||||
lexeme (concrete syntax).
|
||||
*)
|
||||
|
||||
type token = t
|
||||
|
||||
val to_lexeme : token -> lexeme
|
||||
val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string
|
||||
val to_region : token -> Region.t
|
||||
|
@ -1,15 +1,17 @@
|
||||
{
|
||||
(* START OF HEADER *)
|
||||
|
||||
type lexeme = string
|
||||
|
||||
let sprintf = Printf.sprintf
|
||||
(* Shorthands *)
|
||||
|
||||
module Region = Simple_utils.Region
|
||||
module Pos = Simple_utils.Pos
|
||||
module SMap = Utils.String.Map
|
||||
module SSet = Utils.String.Set
|
||||
|
||||
type lexeme = string
|
||||
|
||||
let sprintf = Printf.sprintf
|
||||
|
||||
(* TOKENS *)
|
||||
|
||||
type t =
|
||||
@ -41,6 +43,7 @@ type t =
|
||||
| COLON of Region.t (* ":" *)
|
||||
| DOT of Region.t (* "." *)
|
||||
| ELLIPSIS of Region.t (* "..." *)
|
||||
| ARROW of Region.t (* "=>" *)
|
||||
|
||||
(* Wildcard *)
|
||||
|
||||
@ -55,11 +58,11 @@ type t =
|
||||
| GT of Region.t (* ">" *)
|
||||
| LE of Region.t (* "<=" *)
|
||||
| GE of Region.t (* ">=" *)
|
||||
| ARROW of Region.t (* "=>" *)
|
||||
|
||||
(* Logic *)
|
||||
|
||||
| BOOL_OR of Region.t (* "||" *)
|
||||
| BOOL_AND of Region.t (* "&&" *)
|
||||
|
||||
| NOT of Region.t (* ! *)
|
||||
|
||||
(* Identifiers, labels, numbers and strings *)
|
||||
@ -75,17 +78,17 @@ type t =
|
||||
|
||||
(* Keywords *)
|
||||
|
||||
(*| And*)
|
||||
| Else of Region.t
|
||||
| False of Region.t
|
||||
| If of Region.t
|
||||
| Let of Region.t
|
||||
| Rec of Region.t
|
||||
| Switch of Region.t
|
||||
| Mod of Region.t
|
||||
| Or of Region.t
|
||||
| Rec of Region.t
|
||||
| Switch of Region.t
|
||||
| True of Region.t
|
||||
| Type of Region.t
|
||||
|
||||
(* Data constructors *)
|
||||
|
||||
| C_None of Region.t (* "None" *)
|
||||
@ -96,10 +99,32 @@ type t =
|
||||
| EOF of Region.t (* End of file *)
|
||||
|
||||
|
||||
(* Projections *)
|
||||
|
||||
type token = t
|
||||
|
||||
let proj_token = function
|
||||
CAT region -> region, "CAT"
|
||||
(* Literals *)
|
||||
|
||||
String Region.{region; value} ->
|
||||
region, sprintf "String %s" value
|
||||
| Bytes Region.{region; value = s,b} ->
|
||||
region,
|
||||
sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b)
|
||||
| Int Region.{region; value = s,n} ->
|
||||
region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
|
||||
| Nat Region.{region; value = s,n} ->
|
||||
region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
|
||||
| Mutez Region.{region; value = s,n} ->
|
||||
region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
|
||||
| Ident Region.{region; value} ->
|
||||
region, sprintf "Ident %s" value
|
||||
| Constr Region.{region; value} ->
|
||||
region, sprintf "Constr %s" value
|
||||
|
||||
(* Symbols *)
|
||||
|
||||
| CAT region -> region, "CAT"
|
||||
| MINUS region -> region, "MINUS"
|
||||
| PLUS region -> region, "PLUS"
|
||||
| SLASH region -> region, "SLASH"
|
||||
@ -125,24 +150,9 @@ let proj_token = function
|
||||
| LE region -> region, "LE"
|
||||
| GE region -> region, "GE"
|
||||
| ARROW region -> region, "ARROW"
|
||||
| NOT region -> region, "NOT"
|
||||
| BOOL_OR region -> region, "BOOL_OR"
|
||||
| BOOL_AND region -> region, "BOOL_AND"
|
||||
| Ident Region.{region; value} ->
|
||||
region, sprintf "Ident %s" value
|
||||
| Constr Region.{region; value} ->
|
||||
region, sprintf "Constr %s" value
|
||||
| Int Region.{region; value = s,n} ->
|
||||
region, sprintf "Int (\"%s\", %s)" s (Z.to_string n)
|
||||
| Nat Region.{region; value = s,n} ->
|
||||
region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n)
|
||||
| Mutez Region.{region; value = s,n} ->
|
||||
region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n)
|
||||
| String Region.{region; value} ->
|
||||
region, sprintf "String %s" value
|
||||
| Bytes Region.{region; value = s,b} ->
|
||||
region,
|
||||
sprintf "Bytes (\"%s\", \"0x%s\")"
|
||||
s (Hex.show b)
|
||||
| Else region -> region, "Else"
|
||||
| False region -> region, "False"
|
||||
| If region -> region, "If"
|
||||
@ -150,7 +160,6 @@ let proj_token = function
|
||||
| Rec region -> region, "Rec"
|
||||
| Switch region -> region, "Switch"
|
||||
| Mod region -> region, "Mod"
|
||||
| NOT region -> region, "!"
|
||||
| Or region -> region, "Or"
|
||||
| True region -> region, "True"
|
||||
| Type region -> region, "Type"
|
||||
@ -160,7 +169,20 @@ let proj_token = function
|
||||
| EOF region -> region, "EOF"
|
||||
|
||||
let to_lexeme = function
|
||||
CAT _ -> "++"
|
||||
(* Literals *)
|
||||
|
||||
String s -> s.Region.value
|
||||
| Bytes b -> fst b.Region.value
|
||||
| Int i
|
||||
| Nat i
|
||||
| Mutez i -> fst i.Region.value
|
||||
| Ident id -> id.Region.value
|
||||
| Constr id -> id.Region.value
|
||||
| Attr a -> a.Region.value
|
||||
|
||||
(* Symbols *)
|
||||
|
||||
| CAT _ -> "++"
|
||||
| MINUS _ -> "-"
|
||||
| PLUS _ -> "+"
|
||||
| SLASH _ -> "/"
|
||||
@ -188,29 +210,32 @@ let to_lexeme = function
|
||||
| ARROW _ -> "=>"
|
||||
| BOOL_OR _ -> "||"
|
||||
| BOOL_AND _ -> "&&"
|
||||
| Ident id -> id.Region.value
|
||||
| Constr id -> id.Region.value
|
||||
| Int i
|
||||
| Nat i
|
||||
| Mutez i -> fst i.Region.value
|
||||
| String s -> s.Region.value
|
||||
| Bytes b -> fst b.Region.value
|
||||
| NOT _ -> "!"
|
||||
|
||||
(* Keywords *)
|
||||
|
||||
| Else _ -> "else"
|
||||
| False _ -> "false"
|
||||
| If _ -> "if"
|
||||
| Let _ -> "let"
|
||||
| Rec _ -> "rec"
|
||||
| Mod _ -> "mod"
|
||||
| NOT _ -> "!"
|
||||
| Or _ -> "or"
|
||||
| Rec _ -> "rec"
|
||||
| Switch _ -> "switch"
|
||||
| True _ -> "true"
|
||||
| Type _ -> "type"
|
||||
|
||||
(* Data constructors *)
|
||||
|
||||
| C_None _ -> "None"
|
||||
| C_Some _ -> "Some"
|
||||
| Attr a -> a.Region.value
|
||||
|
||||
(* Virtual tokens *)
|
||||
|
||||
| EOF _ -> ""
|
||||
|
||||
(* CONVERSIONS *)
|
||||
|
||||
let to_string token ?(offsets=true) mode =
|
||||
let region, val_str = proj_token token in
|
||||
let reg_str = region#compact ~offsets mode
|
||||
@ -261,12 +286,9 @@ let reserved =
|
||||
|> add "functor"
|
||||
|> add "inherit"
|
||||
|> add "initializer"
|
||||
(* |> add "land" - see https://ligo.atlassian.net/browse/LIGO-263 *)
|
||||
|> add "lazy"
|
||||
(* |> add "lor" - see https://ligo.atlassian.net/browse/LIGO-263 *)
|
||||
|> add "lsl"
|
||||
|> add "lsr"
|
||||
(* |> add "lxor" - see https://ligo.atlassian.net/browse/LIGO-263 *)
|
||||
|> add "match"
|
||||
|> add "method"
|
||||
|> add "module"
|
||||
@ -291,7 +313,7 @@ let reserved =
|
||||
|
||||
let constructors = [
|
||||
(fun reg -> C_None reg);
|
||||
(fun reg -> C_Some reg);
|
||||
(fun reg -> C_Some reg)
|
||||
]
|
||||
|
||||
let add map (key, value) = SMap.add key value map
|
||||
@ -376,8 +398,7 @@ let mk_int lexeme region =
|
||||
else Ok (Int Region.{region; value = lexeme, z})
|
||||
|
||||
let mk_nat lexeme region =
|
||||
let z =
|
||||
Str.(global_replace (regexp "_") "" lexeme) |>
|
||||
let z = Str.(global_replace (regexp "_") "" lexeme) |>
|
||||
Str.(global_replace (regexp "n") "") |>
|
||||
Z.of_string in
|
||||
if Z.equal z Z.zero && lexeme <> "0n"
|
||||
@ -385,8 +406,7 @@ let mk_nat lexeme region =
|
||||
else Ok (Nat Region.{region; value = lexeme, z})
|
||||
|
||||
let mk_mutez lexeme region =
|
||||
let z =
|
||||
Str.(global_replace (regexp "_") "" lexeme) |>
|
||||
let z = Str.(global_replace (regexp "_") "" lexeme) |>
|
||||
Str.(global_replace (regexp "mutez") "") |>
|
||||
Z.of_string in
|
||||
if Z.equal z Z.zero && lexeme <> "0mutez"
|
||||
@ -465,12 +485,14 @@ let is_bytes = function Bytes _ -> true | _ -> false
|
||||
let is_int = function Int _ -> true | _ -> false
|
||||
let is_ident = function Ident _ -> true | _ -> false
|
||||
let is_eof = function EOF _ -> true | _ -> false
|
||||
let is_minus = function MINUS _ -> true | _ -> false
|
||||
|
||||
(* Errors *)
|
||||
|
||||
type error =
|
||||
Odd_lengthed_bytes
|
||||
| Missing_break
|
||||
| Negative_byte_sequence
|
||||
|
||||
let error_to_string = function
|
||||
Odd_lengthed_bytes ->
|
||||
@ -479,6 +501,9 @@ let error_to_string = function
|
||||
| Missing_break ->
|
||||
"Missing break.\n\
|
||||
Hint: Insert some space."
|
||||
| Negative_byte_sequence ->
|
||||
"Negative byte sequence.\n\
|
||||
Hint: Remove the leading minus sign."
|
||||
|
||||
exception Error of error Region.reg
|
||||
|
||||
@ -491,29 +516,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file =
|
||||
let fail region value = raise (Error Region.{region; value})
|
||||
|
||||
let check_right_context token next_token buffer : unit =
|
||||
if not (is_eof token) then
|
||||
if is_int token || is_bytes token then
|
||||
match next_token buffer with
|
||||
Some ([], next) ->
|
||||
let pos = (to_region token)#stop in
|
||||
let region = Region.make ~start:pos ~stop:pos in
|
||||
if is_int next then
|
||||
fail region Odd_lengthed_bytes
|
||||
else
|
||||
if is_ident next || is_string next || is_bytes next then
|
||||
fail region Missing_break
|
||||
| Some (_::_, _) | None -> ()
|
||||
else
|
||||
if is_ident token || is_string token then
|
||||
match next_token buffer with
|
||||
Some ([], next) ->
|
||||
if is_ident next || is_string next
|
||||
|| is_bytes next || is_int next
|
||||
then
|
||||
let pos = (to_region token)#stop in
|
||||
let region = Region.make ~start:pos ~stop:pos
|
||||
in fail region Missing_break
|
||||
| Some (_::_, _) | None -> ()
|
||||
None -> ()
|
||||
| Some (markup, next) ->
|
||||
if is_minus token && is_bytes next
|
||||
then let region =
|
||||
Region.cover (to_region token) (to_region next)
|
||||
in fail region Negative_byte_sequence
|
||||
else
|
||||
match markup with
|
||||
[] ->
|
||||
if is_int token
|
||||
then if is_string next || is_ident next
|
||||
then fail region Missing_break
|
||||
else ()
|
||||
else
|
||||
if is_string token
|
||||
then if is_int next || is_bytes next || is_ident next
|
||||
then fail region Missing_break
|
||||
else ()
|
||||
else
|
||||
if is_bytes token
|
||||
then if is_string next || is_ident next
|
||||
then fail region Missing_break
|
||||
else if is_int next
|
||||
then fail region Odd_lengthed_bytes
|
||||
else ()
|
||||
else ()
|
||||
| _::_ -> ()
|
||||
|
||||
(* END TRAILER *)
|
||||
}
|
||||
|
@ -1,2 +0,0 @@
|
||||
$HOME/git/OCaml-build/Makefile
|
||||
$HOME/git/OCaml-build/Makefile.cfg
|
@ -38,8 +38,6 @@
|
||||
module Region = Simple_utils.Region
|
||||
module Pos = Simple_utils.Pos
|
||||
|
||||
type lexeme = string
|
||||
|
||||
(* TOKENS *)
|
||||
|
||||
(* The signature [TOKEN] exports an abstract type [token], so a lexer
|
||||
@ -54,6 +52,8 @@ type lexeme = string
|
||||
reading the ocamllex specification for the lexer ([Lexer.mll]).
|
||||
*)
|
||||
|
||||
type lexeme = string
|
||||
|
||||
module type TOKEN =
|
||||
sig
|
||||
type token
|
||||
@ -112,10 +112,36 @@ module type TOKEN =
|
||||
unit
|
||||
end
|
||||
|
||||
(* The signature of the lexer *)
|
||||
|
||||
module type S =
|
||||
sig
|
||||
module Token : TOKEN
|
||||
type token = Token.token
|
||||
|
||||
(* The scanner [init] is meant to be called first to read the
|
||||
BOM. Then [scan] is called. *)
|
||||
|
||||
val init : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
|
||||
val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
|
||||
|
||||
(* Errors (specific to the generic lexer, not to the tokens) *)
|
||||
|
||||
type error
|
||||
|
||||
val error_to_string : error -> string
|
||||
|
||||
exception Error of error Region.reg
|
||||
|
||||
val format_error :
|
||||
?offsets:bool -> [`Byte | `Point] ->
|
||||
error Region.reg -> file:bool -> string Region.reg
|
||||
end
|
||||
|
||||
(* The functorised interface
|
||||
|
||||
Note that the module parameter [Token] is re-exported as a
|
||||
submodule in [S].
|
||||
*)
|
||||
|
||||
module Make (Token: TOKEN) : LexerLib.S with module Token = Token
|
||||
module Make (Token : TOKEN) : S with module Token = Token
|
||||
|
@ -8,12 +8,6 @@ module Pos = Simple_utils.Pos
|
||||
|
||||
(* START HEADER *)
|
||||
|
||||
type lexeme = string
|
||||
|
||||
(* ALIASES *)
|
||||
|
||||
let sprintf = Printf.sprintf
|
||||
|
||||
(* TOKENS *)
|
||||
|
||||
(* The signature [TOKEN] exports an abstract type [token], so a lexer
|
||||
@ -22,6 +16,8 @@ let sprintf = Printf.sprintf
|
||||
caracterises the virtual token for end-of-file, because it requires
|
||||
special handling. *)
|
||||
|
||||
type lexeme = string
|
||||
|
||||
module type TOKEN =
|
||||
sig
|
||||
type token
|
||||
@ -84,31 +80,39 @@ module type TOKEN =
|
||||
submodule in [S].
|
||||
*)
|
||||
|
||||
module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) =
|
||||
module type S =
|
||||
sig
|
||||
module Token : TOKEN
|
||||
type token = Token.token
|
||||
|
||||
val init : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
|
||||
val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
|
||||
|
||||
type error
|
||||
|
||||
val error_to_string : error -> string
|
||||
|
||||
exception Error of error Region.reg
|
||||
|
||||
val format_error :
|
||||
?offsets:bool -> [`Byte | `Point] ->
|
||||
error Region.reg -> file:bool -> string Region.reg
|
||||
end
|
||||
|
||||
module Make (Token : TOKEN) : (S with module Token = Token) =
|
||||
struct
|
||||
module Token = Token
|
||||
type token = Token.token
|
||||
|
||||
type file_path = string
|
||||
|
||||
type line_comment = LexerLib.line_comment
|
||||
type block_comment = LexerLib.block_comment
|
||||
let mk_block = LexerLib.mk_block
|
||||
|
||||
(* ERRORS *)
|
||||
|
||||
type error =
|
||||
Invalid_utf8_sequence
|
||||
| Unexpected_character of char
|
||||
| Undefined_escape_sequence
|
||||
(* | Missing_break*)
|
||||
| Unterminated_string
|
||||
(* | Unterminated_integer*)
|
||||
(* | Odd_lengthed_bytes*)
|
||||
| Unterminated_comment of string
|
||||
(* | Orphan_minus*)
|
||||
| Non_canonical_zero
|
||||
(* | Negative_byte_sequence *)
|
||||
| Broken_string
|
||||
| Invalid_character_in_string
|
||||
| Reserved_name of string
|
||||
@ -116,6 +120,8 @@ module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) =
|
||||
| Invalid_natural
|
||||
| Invalid_attribute
|
||||
|
||||
let sprintf = Printf.sprintf
|
||||
|
||||
let error_to_string = function
|
||||
Invalid_utf8_sequence ->
|
||||
"Invalid UTF-8 sequence."
|
||||
@ -124,30 +130,15 @@ module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) =
|
||||
| Undefined_escape_sequence ->
|
||||
"Undefined escape sequence.\n\
|
||||
Hint: Remove or replace the sequence."
|
||||
(* | Missing_break ->
|
||||
"Missing break.\n\
|
||||
Hint: Insert some space."
|
||||
*) | Unterminated_string ->
|
||||
| Unterminated_string ->
|
||||
"Unterminated string.\n\
|
||||
Hint: Close with double quotes."
|
||||
(* | Unterminated_integer ->
|
||||
"Unterminated integer.\n\
|
||||
Hint: Remove the sign or proceed with a natural number." *)
|
||||
(* | Odd_lengthed_bytes ->
|
||||
"The length of the byte sequence is an odd number.\n\
|
||||
Hint: Add or remove a digit."
|
||||
*) | Unterminated_comment ending ->
|
||||
| Unterminated_comment ending ->
|
||||
sprintf "Unterminated comment.\n\
|
||||
Hint: Close with \"%s\"." ending
|
||||
(* | Orphan_minus ->
|
||||
"Orphan minus sign.\n\
|
||||
Hint: Remove the trailing space." *)
|
||||
| Non_canonical_zero ->
|
||||
"Non-canonical zero.\n\
|
||||
Hint: Use 0."
|
||||
(* | Negative_byte_sequence ->
|
||||
"Negative byte sequence.\n\
|
||||
Hint: Remove the leading minus sign." *)
|
||||
| Broken_string ->
|
||||
"The string starting here is interrupted by a line break.\n\
|
||||
Hint: Remove the break, close the string before or insert a \
|
||||
@ -369,6 +360,7 @@ and scan state = parse
|
||||
nl { scan (state#push_newline lexbuf) lexbuf }
|
||||
| ' '+ { scan (state#push_space lexbuf) lexbuf }
|
||||
| '\t'+ { scan (state#push_tabs lexbuf) lexbuf }
|
||||
|
||||
| ident { mk_ident state lexbuf }
|
||||
| constr { mk_constr state lexbuf }
|
||||
| bytes { mk_bytes seq state lexbuf }
|
||||
@ -420,27 +412,14 @@ and scan state = parse
|
||||
let state = state#set_pos pos in
|
||||
scan state lexbuf }
|
||||
|
||||
(* Some special errors *)
|
||||
|
||||
(*
|
||||
| '-' { let region, _, state = state#sync lexbuf in
|
||||
let state = scan state lexbuf in
|
||||
let open Markup in
|
||||
match FQueue.peek state#units with
|
||||
None -> assert false
|
||||
| Some (_, ((Space _ | Tabs _)::_, token))
|
||||
when Token.is_int token -> fail region Orphan_minus
|
||||
| _ -> fail region Unterminated_integer }
|
||||
|
||||
| "-0x" byte_seq?
|
||||
{ let region, _, _ = state#sync lexbuf
|
||||
in fail region Negative_byte_sequence }
|
||||
*)
|
||||
(* String *)
|
||||
|
||||
| '"' { let opening, lexeme, state = state#sync lexbuf in
|
||||
let thread = LexerLib.mk_thread opening lexeme in
|
||||
scan_string thread state lexbuf |> mk_string }
|
||||
|
||||
(* Comments *)
|
||||
|
||||
| block_comment_openings {
|
||||
let lexeme = Lexing.lexeme lexbuf in
|
||||
match state#block with
|
||||
@ -496,8 +475,6 @@ and scan_flags state acc = parse
|
||||
| eof { let _, _, state = state#sync lexbuf
|
||||
in List.rev acc, state }
|
||||
|
||||
(* TODO: Move below to [LexerCommon.mll] *)
|
||||
|
||||
(* Finishing a string *)
|
||||
|
||||
and scan_string thread state = parse
|
||||
@ -624,164 +601,6 @@ and scan_utf8_inline thread state = parse
|
||||
|
||||
{
|
||||
(* START TRAILER *)
|
||||
|
||||
|
||||
(* Scanning the lexing buffer for tokens (and markup, as a
|
||||
side-effect).
|
||||
|
||||
Because we want the lexer to have access to the right lexical
|
||||
context of a recognised lexeme (to enforce stylistic constraints or
|
||||
report special error patterns), we need to keep a hidden reference
|
||||
to a queue of recognised lexical units (that is, tokens and markup)
|
||||
that acts as a mutable state between the calls to [read]. When
|
||||
[read] is called, that queue is examined first and, if it contains
|
||||
at least one token, that token is returned; otherwise, the lexing
|
||||
buffer is scanned for at least one more new token. That is the
|
||||
general principle: we put a high-level buffer (our queue) on top of
|
||||
the low-level lexing buffer.
|
||||
|
||||
One tricky and important detail is that we must make any parser
|
||||
generated by Menhir (and calling [read]) believe that the last
|
||||
region of the input source that was matched indeed corresponds to
|
||||
the returned token, despite that many tokens and markup may have
|
||||
been matched since it was actually read from the input. In other
|
||||
words, the parser requests a token that is taken from the
|
||||
high-level buffer, but the parser requests the source regions from
|
||||
the _low-level_ lexing buffer, and they may disagree if more than
|
||||
one token has actually been recognised.
|
||||
|
||||
Consequently, in order to maintain a consistent view for the
|
||||
parser, we have to patch some fields of the lexing buffer, namely
|
||||
[lex_start_p] and [lex_curr_p], as these fields are read by parsers
|
||||
generated by Menhir when querying source positions (regions). This
|
||||
is the purpose of the function [patch_buffer]. After reading one
|
||||
or more tokens and markup by the scanning rule [scan], we have to
|
||||
save in the hidden reference [buf_reg] the region of the source
|
||||
that was matched by [scan]. This atomic sequence of patching,
|
||||
scanning and saving is implemented by the _function_ [scan]
|
||||
(beware: it shadows the scanning rule [scan]). The function
|
||||
[patch_buffer] is, of course, also called just before returning the
|
||||
token, so the parser has a view of the lexing buffer consistent
|
||||
with the token.
|
||||
|
||||
Note that an additional reference [first_call] is needed to
|
||||
distinguish the first call to the function [scan], as the first
|
||||
scanning rule is actually [init] (which can handle the BOM), not
|
||||
[scan].
|
||||
*)
|
||||
|
||||
type logger = Markup.t list -> token -> unit
|
||||
|
||||
type input =
|
||||
File of file_path
|
||||
| String of string
|
||||
| Channel of in_channel
|
||||
| Buffer of Lexing.lexbuf
|
||||
|
||||
type instance = {
|
||||
input : input;
|
||||
read : log:logger -> Lexing.lexbuf -> token;
|
||||
buffer : Lexing.lexbuf;
|
||||
get_win : unit -> token LexerLib.window;
|
||||
get_pos : unit -> Pos.t;
|
||||
get_last : unit -> Region.t;
|
||||
get_file : unit -> file_path;
|
||||
close : unit -> unit
|
||||
}
|
||||
|
||||
type open_err = File_opening of string
|
||||
|
||||
let lexbuf_from_input = function
|
||||
File path ->
|
||||
(try
|
||||
let chan = open_in path in
|
||||
let close () = close_in chan in
|
||||
let lexbuf = Lexing.from_channel chan in
|
||||
let () =
|
||||
let open Lexing in
|
||||
lexbuf.lex_curr_p <- {lexbuf.lex_curr_p with pos_fname=path}
|
||||
in Ok (lexbuf, close)
|
||||
with Sys_error msg -> Stdlib.Error (File_opening msg))
|
||||
| String s ->
|
||||
Ok (Lexing.from_string s, fun () -> ())
|
||||
| Channel chan ->
|
||||
let close () = close_in chan in
|
||||
Ok (Lexing.from_channel chan, close)
|
||||
| Buffer b -> Ok (b, fun () -> ())
|
||||
|
||||
let open_token_stream ?line ?block input =
|
||||
let file_path = match input with
|
||||
File path -> path
|
||||
| _ -> "" in
|
||||
let pos = Pos.min ~file:file_path in
|
||||
let buf_reg = ref (pos#byte, pos#byte)
|
||||
and first_call = ref true
|
||||
and decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in
|
||||
let supply = Uutf.Manual.src decoder in
|
||||
let state = ref (LexerLib.mk_state
|
||||
~units:FQueue.empty
|
||||
~last:Region.ghost
|
||||
~window:LexerLib.Nil
|
||||
~pos
|
||||
~markup:[]
|
||||
~decoder
|
||||
~supply
|
||||
?block
|
||||
?line
|
||||
()) in
|
||||
|
||||
let get_pos () = !state#pos
|
||||
and get_last () = !state#last
|
||||
and get_win () = !state#window
|
||||
and get_file () = file_path in
|
||||
|
||||
let patch_buffer (start, stop) buffer =
|
||||
let open Lexing in
|
||||
let file_path = buffer.lex_curr_p.pos_fname in
|
||||
buffer.lex_start_p <- {start with pos_fname = file_path};
|
||||
buffer.lex_curr_p <- {stop with pos_fname = file_path}
|
||||
|
||||
and save_region buffer =
|
||||
buf_reg := Lexing.(buffer.lex_start_p, buffer.lex_curr_p) in
|
||||
|
||||
let scan buffer =
|
||||
patch_buffer !buf_reg buffer;
|
||||
(if !first_call
|
||||
then (state := init !state buffer; first_call := false)
|
||||
else state := scan !state buffer);
|
||||
save_region buffer in
|
||||
|
||||
let next_token buffer =
|
||||
scan buffer;
|
||||
match FQueue.peek !state#units with
|
||||
None -> None
|
||||
| Some (units, ext_token) ->
|
||||
state := !state#set_units units; Some ext_token in
|
||||
|
||||
let rec read ~log buffer =
|
||||
match FQueue.deq !state#units with
|
||||
None ->
|
||||
scan buffer;
|
||||
read ~log buffer
|
||||
| Some (units, (left_mark, token)) ->
|
||||
log left_mark token;
|
||||
state := ((!state#set_units units)
|
||||
#set_last (Token.to_region token))
|
||||
#slide_token token;
|
||||
Token.check_right_context token next_token buffer;
|
||||
patch_buffer (Token.to_region token)#byte_pos buffer;
|
||||
token in
|
||||
match lexbuf_from_input input with
|
||||
Ok (buffer, close) ->
|
||||
let () =
|
||||
match input with
|
||||
File path when path <> "" -> LexerLib.reset ~file:path buffer
|
||||
| _ -> () in
|
||||
let instance = {
|
||||
input; read; buffer; get_win; get_pos; get_last; get_file; close}
|
||||
in Ok instance
|
||||
| Error _ as e -> e
|
||||
|
||||
end (* of functor [Make] in HEADER *)
|
||||
(* END TRAILER *)
|
||||
}
|
||||
|
@ -1,159 +1,6 @@
|
||||
(* Sliding window *)
|
||||
|
||||
(* The type [window] models a two-slot buffer of tokens for reporting
|
||||
after a parse error. Technically, it is a parametric type, but its
|
||||
use is meant for tokens, wherever they are defined.
|
||||
|
||||
In [Two(t1,t2)], the token [t2] is the next to be sent to the
|
||||
parser.
|
||||
|
||||
The call [slide token buffer] pushes the token [token] in the
|
||||
buffer [buffer]. If the buffer is full, that is, it is
|
||||
[Two(t1,t2)], then the token [t2] is discarded to make room for
|
||||
[token].
|
||||
*)
|
||||
|
||||
type 'a window =
|
||||
Nil
|
||||
| One of 'a
|
||||
| Two of 'a * 'a
|
||||
|
||||
let slide token = function
|
||||
Nil -> One token
|
||||
| One t | Two (t,_) -> Two (token,t)
|
||||
|
||||
module Region = Simple_utils.Region
|
||||
module Pos = Simple_utils.Pos
|
||||
|
||||
type lexeme = string
|
||||
|
||||
(* The signature [TOKEN] exports an abstract type [token], so a lexer
|
||||
can be a functor over tokens. This enables to externalise
|
||||
version-dependent constraints in any module whose signature matches
|
||||
[TOKEN]. Generic functions to construct tokens are required.
|
||||
|
||||
Note the predicate [is_eof], which caracterises the virtual token
|
||||
for end-of-file, because it requires special handling.
|
||||
*)
|
||||
|
||||
module type TOKEN =
|
||||
sig
|
||||
type token
|
||||
|
||||
(* Predicates *)
|
||||
|
||||
val is_eof : token -> bool
|
||||
|
||||
(* Projections *)
|
||||
|
||||
val to_lexeme : token -> lexeme
|
||||
val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string
|
||||
val to_region : token -> Region.t
|
||||
|
||||
(* Style *)
|
||||
|
||||
type error
|
||||
|
||||
val error_to_string : error -> string
|
||||
|
||||
exception Error of error Region.reg
|
||||
|
||||
val format_error :
|
||||
?offsets:bool ->
|
||||
[`Byte | `Point] ->
|
||||
error Region.reg ->
|
||||
file:bool ->
|
||||
string Region.reg
|
||||
|
||||
val check_right_context :
|
||||
token ->
|
||||
(Lexing.lexbuf -> (Markup.t list * token) option) ->
|
||||
Lexing.lexbuf ->
|
||||
unit
|
||||
end
|
||||
|
||||
(* The module type for lexers is [S]. It mainly exports the function
|
||||
[open_token_stream], which returns
|
||||
|
||||
* a function [read] that extracts tokens from a lexing buffer,
|
||||
together with a lexing buffer [buffer] to read from,
|
||||
* a function [close] that closes that buffer,
|
||||
* a function [get_pos] that returns the current position, and
|
||||
* a function [get_last] that returns the region of the last
|
||||
recognised token.
|
||||
* a function [get_file] that returns the name of the file being
|
||||
scanned (empty string if [stdin]).
|
||||
|
||||
Note that a module [Token] is exported too, because the signature
|
||||
of the exported functions depend on it.
|
||||
|
||||
The type [window] is a two-token window, that is, a buffer that
|
||||
contains the last recognised token, and the penultimate (if any).
|
||||
|
||||
The call [read ~log] evaluates in a lexer (also known as a
|
||||
tokeniser or scanner) whose type is [Lexing.lexbuf -> token], and
|
||||
suitable for a parser generated by Menhir. The argument labelled
|
||||
[log] is a logger, that is, it may print a token and its left
|
||||
markup to a given channel, at the caller's discretion.
|
||||
*)
|
||||
|
||||
module type S =
|
||||
sig
|
||||
module Token : TOKEN
|
||||
type token = Token.token
|
||||
|
||||
type file_path = string
|
||||
type logger = Markup.t list -> token -> unit
|
||||
|
||||
type input =
|
||||
File of file_path
|
||||
| String of string
|
||||
| Channel of in_channel
|
||||
| Buffer of Lexing.lexbuf
|
||||
|
||||
type instance = {
|
||||
input : input;
|
||||
read : log:logger -> Lexing.lexbuf -> token;
|
||||
buffer : Lexing.lexbuf;
|
||||
get_win : unit -> token window;
|
||||
get_pos : unit -> Pos.t;
|
||||
get_last : unit -> Region.t;
|
||||
get_file : unit -> file_path;
|
||||
close : unit -> unit
|
||||
}
|
||||
|
||||
type open_err = File_opening of string
|
||||
|
||||
val lexbuf_from_input :
|
||||
input -> (Lexing.lexbuf * (unit -> unit), open_err) Stdlib.result
|
||||
|
||||
type line_comment = string (* Opening of a line comment *)
|
||||
type block_comment = <opening : string; closing : string>
|
||||
|
||||
val mk_block : opening:string -> closing:string -> block_comment
|
||||
|
||||
val open_token_stream :
|
||||
?line:line_comment ->
|
||||
?block:block_comment ->
|
||||
input ->
|
||||
(instance, open_err) Stdlib.result
|
||||
|
||||
(* Error reporting *)
|
||||
|
||||
type error
|
||||
|
||||
val error_to_string : error -> string
|
||||
|
||||
exception Error of error Region.reg
|
||||
|
||||
val format_error :
|
||||
?offsets:bool ->
|
||||
[`Byte | `Point] ->
|
||||
error Region.reg ->
|
||||
file:bool ->
|
||||
string Region.reg
|
||||
end
|
||||
|
||||
(* LEXER ENGINE *)
|
||||
|
||||
(* Resetting file name and line number in the lexing buffer
|
||||
@ -205,27 +52,20 @@ let rollback buffer =
|
||||
buffer.lex_curr_pos <- buffer.lex_curr_pos - len;
|
||||
buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum}
|
||||
|
||||
(* Utility types *)
|
||||
|
||||
type file_path = string
|
||||
type lexeme = string
|
||||
|
||||
(* THREAD FOR STRUCTURED CONSTRUCTS (STRINGS, COMMENTS) *)
|
||||
|
||||
(* When scanning structured constructs, like strings and comments, we
|
||||
need to keep the region of the opening symbol (like double quote,
|
||||
"//" or "(*") in order to report any error more precisely. Since
|
||||
ocamllex is byte-oriented, we need to store the parsed bytes as
|
||||
characters in an accumulator [acc] and also its length [len], so,
|
||||
we are done, it is easy to build the string making up the
|
||||
structured construct with [mk_str] (see above).
|
||||
|
||||
The resulting data structure is called a _thread_. (Note for
|
||||
Emacs: "*)".)
|
||||
*)
|
||||
|
||||
type thread = <
|
||||
opening : Region.t;
|
||||
length : int;
|
||||
acc : char list;
|
||||
to_string : string;
|
||||
push_char : char -> thread;
|
||||
push_string : string -> thread;
|
||||
to_string : string;
|
||||
set_opening : Region.t -> thread
|
||||
>
|
||||
|
||||
@ -233,12 +73,12 @@ let mk_thread region lexeme : thread =
|
||||
(* The call [explode s a] is the list made by pushing the characters
|
||||
in the string [s] on top of [a], in reverse order. For example,
|
||||
[explode "ba" ['c';'d'] = ['a'; 'b'; 'c'; 'd']]. *)
|
||||
|
||||
let explode s acc =
|
||||
let rec push = function
|
||||
0 -> acc
|
||||
| i -> s.[i-1] :: push (i-1)
|
||||
in push (String.length s)
|
||||
in
|
||||
in push (String.length s) in
|
||||
object
|
||||
val opening = region
|
||||
method opening = opening
|
||||
@ -273,84 +113,82 @@ let mk_thread region lexeme : thread =
|
||||
in fill (length-1) acc |> Bytes.to_string
|
||||
end
|
||||
|
||||
type file_path = string
|
||||
|
||||
(* STATE *)
|
||||
|
||||
(* Beyond tokens, the result of lexing is a state. The type
|
||||
[state] represents the logical state of the lexing engine, that
|
||||
is, a value which is threaded during scanning and which denotes
|
||||
useful, high-level information beyond what the type
|
||||
[Lexing.lexbuf] in the standard library already provides for
|
||||
all generic lexers.
|
||||
(* Scanning the lexing buffer for tokens (and markup, as a
|
||||
side-effect).
|
||||
|
||||
Tokens are the smallest units used by the parser to build the
|
||||
abstract syntax tree. The state includes a queue of recognised
|
||||
tokens, with the markup at the left of its lexeme until either
|
||||
the start of the file or the end of the previously recognised
|
||||
token.
|
||||
Because we want the lexer to have access to the right lexical
|
||||
context of a recognised lexeme (to enforce stylistic constraints or
|
||||
report special error patterns), we need to keep a hidden reference
|
||||
to a queue of recognised lexical units (that is, tokens and markup)
|
||||
that acts as a mutable state between the calls to [read]. When
|
||||
[read] is called, that queue is examined first and, if it contains
|
||||
at least one token, that token is returned; otherwise, the lexing
|
||||
buffer is scanned for at least one more new token. That is the
|
||||
general principle: we put a high-level buffer (our queue) on top of
|
||||
the low-level lexing buffer.
|
||||
|
||||
The markup from the last recognised token or, if the first
|
||||
token has not been recognised yet, from the beginning of the
|
||||
file is stored in the field [markup] of the state because it is
|
||||
a side-effect, with respect to the output token list, and we
|
||||
use a record with a single field [units] because that record
|
||||
may be easily extended during the future maintenance of this
|
||||
lexer.
|
||||
One tricky and important detail is that we must make any parser
|
||||
generated by Menhir (and calling [read]) believe that the last
|
||||
region of the input source that was matched indeed corresponds to
|
||||
the returned token, despite that many tokens and markup may have
|
||||
been matched since it was actually read from the input. In other
|
||||
words, the parser requests a token that is taken from the
|
||||
high-level buffer, but the parser requests the source regions from
|
||||
the _low-level_ lexing buffer, and they may disagree if more than
|
||||
one token has actually been recognised.
|
||||
|
||||
The state also includes a field [pos] which holds the current
|
||||
position in the LIGO source file. The position is not always
|
||||
updated after a single character has been matched: that depends
|
||||
on the regular expression that matched the lexing buffer.
|
||||
Consequently, in order to maintain a consistent view for the
|
||||
parser, we have to patch some fields of the lexing buffer, namely
|
||||
[lex_start_p] and [lex_curr_p], as these fields are read by parsers
|
||||
generated by Menhir when querying source positions (regions). This
|
||||
is the purpose of the function [patch_buffer]. After reading one or
|
||||
more tokens and markup by the scanning rule [scan], we have to save
|
||||
in the hidden reference [buf_reg] the region of the source that was
|
||||
matched by [scan]. This atomic sequence of patching, scanning and
|
||||
saving is implemented by the _function_ [scan] (beware: it shadows
|
||||
the scanning rule [scan]). The function [patch_buffer] is, of
|
||||
course, also called just before returning the token, so the parser
|
||||
has a view of the lexing buffer consistent with the token.
|
||||
|
||||
The field [window] is a two-token window, that is, a buffer
|
||||
that contains the last recognised token, and the penultimate
|
||||
(if any).
|
||||
|
||||
The fields [decoder] and [supply] offer the support needed
|
||||
for the lexing of UTF-8 encoded characters in comments (the
|
||||
only place where they are allowed in LIGO). The former is the
|
||||
decoder proper and the latter is the effectful function
|
||||
[supply] that takes a byte, a start index and a length and feed
|
||||
it to [decoder]. See the documentation of the third-party
|
||||
library Uutf.
|
||||
Note that an additional reference [first_call] is needed to
|
||||
distinguish the first call to the function [scan], as the first
|
||||
scanning rule is actually [init] (which can handle the BOM), not
|
||||
[scan].
|
||||
*)
|
||||
|
||||
type line_comment = string (* Opening of a line comment *)
|
||||
type block_comment = <opening : string; closing : string>
|
||||
type 'token window =
|
||||
Nil
|
||||
| One of 'token
|
||||
| Two of 'token * 'token
|
||||
|
||||
let mk_block ~opening ~closing : block_comment =
|
||||
object
|
||||
method opening = opening
|
||||
method closing = closing
|
||||
end
|
||||
|
||||
type 'a state = <
|
||||
units : (Markup.t list * 'a) FQueue.t;
|
||||
type 'token state = <
|
||||
units : (Markup.t list * 'token) FQueue.t;
|
||||
markup : Markup.t list;
|
||||
window : 'a window;
|
||||
window : 'token window;
|
||||
last : Region.t;
|
||||
pos : Pos.t;
|
||||
decoder : Uutf.decoder;
|
||||
supply : Bytes.t -> int -> int -> unit;
|
||||
block : block_comment option;
|
||||
line : line_comment option;
|
||||
block : EvalOpt.block_comment option;
|
||||
line : EvalOpt.line_comment option;
|
||||
|
||||
enqueue : 'a -> 'a state;
|
||||
set_units : (Markup.t list * 'a) FQueue.t -> 'a state;
|
||||
set_last : Region.t -> 'a state;
|
||||
set_pos : Pos.t -> 'a state;
|
||||
slide_token : 'a -> 'a state;
|
||||
enqueue : 'token -> 'token state;
|
||||
set_units : (Markup.t list * 'token) FQueue.t -> 'token state;
|
||||
set_last : Region.t -> 'token state;
|
||||
set_pos : Pos.t -> 'token state;
|
||||
slide_token : 'token -> 'token state;
|
||||
|
||||
sync : Lexing.lexbuf -> Region.t * lexeme * 'a state;
|
||||
sync : Lexing.lexbuf -> Region.t * lexeme * 'token state;
|
||||
|
||||
push_newline : Lexing.lexbuf -> 'a state;
|
||||
push_line : thread -> 'a state;
|
||||
push_block : thread -> 'a state;
|
||||
push_space : Lexing.lexbuf -> 'a state;
|
||||
push_tabs : Lexing.lexbuf -> 'a state;
|
||||
push_bom : Lexing.lexbuf -> 'a state;
|
||||
push_markup : Markup.t -> 'a state;
|
||||
push_newline : Lexing.lexbuf -> 'token state;
|
||||
push_line : thread -> 'token state;
|
||||
push_block : thread -> 'token state;
|
||||
push_space : Lexing.lexbuf -> 'token state;
|
||||
push_tabs : Lexing.lexbuf -> 'token state;
|
||||
push_bom : Lexing.lexbuf -> 'token state;
|
||||
push_markup : Markup.t -> 'token state;
|
||||
>
|
||||
|
||||
let mk_state ~units ~markup ~window ~last ~pos ~decoder ~supply
|
||||
@ -371,12 +209,6 @@ type file_path = string
|
||||
method block = block
|
||||
method line = line
|
||||
|
||||
(* The call [enqueue (token, state)] updates functionally the
|
||||
state [state] by associating the token [token] with the
|
||||
stored markup and enqueuing the pair into the units
|
||||
queue. The field [markup] is then reset to the empty
|
||||
list. *)
|
||||
|
||||
method enqueue token =
|
||||
{< units = FQueue.enq (markup, token) units;
|
||||
markup = [] >}
|
||||
@ -386,13 +218,9 @@ type file_path = string
|
||||
method set_pos pos = {< pos = pos >}
|
||||
|
||||
method slide_token token =
|
||||
{< window = slide token window >}
|
||||
|
||||
(* The call [sync state buffer] updates the current position
|
||||
in accordance with the contents of the lexing buffer, more
|
||||
precisely, depending on the length of the string which has
|
||||
just been recognised by the scanner: that length is used as
|
||||
a positive offset to the current column. *)
|
||||
match self#window with
|
||||
Nil -> {< window = One token >}
|
||||
| One t | Two (t,_) -> {< window = Two (token,t) >}
|
||||
|
||||
method sync buffer =
|
||||
let lex = Lexing.lexeme buffer in
|
||||
@ -447,5 +275,121 @@ type file_path = string
|
||||
let region, value, state = self#sync buffer in
|
||||
let unit = Markup.BOM Region.{region; value}
|
||||
in state#push_markup unit
|
||||
|
||||
end
|
||||
|
||||
(* LEXER INSTANCE *)
|
||||
|
||||
type input =
|
||||
File of file_path
|
||||
| String of string
|
||||
| Channel of in_channel
|
||||
| Buffer of Lexing.lexbuf
|
||||
|
||||
type 'token logger = Markup.t list -> 'token -> unit
|
||||
|
||||
type 'token instance = {
|
||||
input : input;
|
||||
read : log:('token logger) -> Lexing.lexbuf -> 'token;
|
||||
buffer : Lexing.lexbuf;
|
||||
get_win : unit -> 'token window;
|
||||
get_pos : unit -> Pos.t;
|
||||
get_last : unit -> Region.t;
|
||||
get_file : unit -> file_path;
|
||||
close : unit -> unit
|
||||
}
|
||||
|
||||
type open_err = File_opening of string
|
||||
|
||||
let lexbuf_from_input = function
|
||||
String s ->
|
||||
Ok (Lexing.from_string s, fun () -> ())
|
||||
| Channel chan ->
|
||||
let close () = close_in chan in
|
||||
Ok (Lexing.from_channel chan, close)
|
||||
| Buffer b ->
|
||||
Ok (b, fun () -> ())
|
||||
| File path ->
|
||||
try
|
||||
let chan = open_in path in
|
||||
let close () = close_in chan in
|
||||
let lexbuf = Lexing.from_channel chan in
|
||||
let () =
|
||||
let open Lexing in
|
||||
lexbuf.lex_curr_p <- {lexbuf.lex_curr_p with pos_fname=path}
|
||||
in Ok (lexbuf, close)
|
||||
with Sys_error msg -> Stdlib.Error (File_opening msg)
|
||||
|
||||
let open_token_stream ?line ?block ~init ~scan
|
||||
~token_to_region ~style input =
|
||||
let file_path = match input with
|
||||
File path -> path
|
||||
| _ -> "" in
|
||||
let pos = Pos.min ~file:file_path in
|
||||
let buf_reg = ref (pos#byte, pos#byte)
|
||||
and first_call = ref true
|
||||
and decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in
|
||||
let supply = Uutf.Manual.src decoder in
|
||||
let state = ref (mk_state
|
||||
~units:FQueue.empty
|
||||
~last:Region.ghost
|
||||
~window:Nil
|
||||
~pos
|
||||
~markup:[]
|
||||
~decoder
|
||||
~supply
|
||||
?block
|
||||
?line
|
||||
()) in
|
||||
let get_pos () = !state#pos
|
||||
and get_last () = !state#last
|
||||
and get_win () = !state#window
|
||||
and get_file () = file_path in
|
||||
|
||||
let patch_buffer (start, stop) buffer =
|
||||
let open Lexing in
|
||||
let file_path = buffer.lex_curr_p.pos_fname in
|
||||
buffer.lex_start_p <- {start with pos_fname = file_path};
|
||||
buffer.lex_curr_p <- {stop with pos_fname = file_path}
|
||||
|
||||
and save_region buffer =
|
||||
buf_reg := Lexing.(buffer.lex_start_p, buffer.lex_curr_p) in
|
||||
|
||||
let scan' init scan buffer =
|
||||
patch_buffer !buf_reg buffer;
|
||||
(if !first_call
|
||||
then (state := init !state buffer; first_call := false)
|
||||
else state := scan !state buffer);
|
||||
save_region buffer in
|
||||
|
||||
let next_token init scan buffer =
|
||||
scan' init scan buffer;
|
||||
match FQueue.peek !state#units with
|
||||
None -> None
|
||||
| Some (units, ext_token) ->
|
||||
state := !state#set_units units; Some ext_token in
|
||||
|
||||
let rec read init scan ~token_to_region ~style ~log buffer =
|
||||
match FQueue.deq !state#units with
|
||||
None ->
|
||||
scan' init scan buffer;
|
||||
read init scan ~token_to_region ~style ~log buffer
|
||||
| Some (units, (left_mark, token)) ->
|
||||
log left_mark token;
|
||||
state := ((!state#set_units units)
|
||||
#set_last (token_to_region token))
|
||||
#slide_token token;
|
||||
style token (next_token init scan) buffer;
|
||||
patch_buffer (token_to_region token)#byte_pos buffer;
|
||||
token in
|
||||
|
||||
match lexbuf_from_input input with
|
||||
Ok (buffer, close) ->
|
||||
let () =
|
||||
match input with
|
||||
File path when path <> "" -> reset ~file:path buffer
|
||||
| _ -> () in
|
||||
let instance = {
|
||||
read = read init scan ~token_to_region ~style;
|
||||
input; buffer; get_win; get_pos; get_last; get_file; close}
|
||||
in Ok instance
|
||||
| Error _ as e -> e
|
||||
|
205
src/passes/1-parser/shared/LexerLib.mli
Normal file
205
src/passes/1-parser/shared/LexerLib.mli
Normal file
@ -0,0 +1,205 @@
|
||||
(* A library for writing UTF8-aware lexers *)
|
||||
|
||||
module Region = Simple_utils.Region
|
||||
module Pos = Simple_utils.Pos
|
||||
|
||||
(* The function [rollback] resets the lexing buffer to the state it
|
||||
was when it matched the last regular expression. This function is
|
||||
safe to use only in the semantic action of the rule which last
|
||||
matched. *)
|
||||
|
||||
val rollback : Lexing.lexbuf -> unit
|
||||
|
||||
(* Utility types *)
|
||||
|
||||
type file_path = string
|
||||
type lexeme = string
|
||||
|
||||
(* THREAD FOR STRUCTURED CONSTRUCTS (STRINGS, COMMENTS) *)
|
||||
|
||||
(* When scanning structured constructs, like strings and comments, we
|
||||
need to keep the region of the opening symbol (like double quote,
|
||||
"//" or "(*") in order to report any error more precisely. Since
|
||||
ocamllex is byte-oriented, we need to store the parsed bytes as
|
||||
characters in an accumulator [acc] and also its length [len], so,
|
||||
we are done, it is easy to build the string making up the
|
||||
structured construct with [mk_str] (see above).
|
||||
|
||||
The resulting data structure is called a _thread_. (Note for
|
||||
Emacs: "*)".)
|
||||
*)
|
||||
|
||||
type thread = <
|
||||
opening : Region.t;
|
||||
length : int;
|
||||
acc : char list;
|
||||
to_string : string;
|
||||
push_char : char -> thread;
|
||||
push_string : string -> thread;
|
||||
set_opening : Region.t -> thread
|
||||
>
|
||||
|
||||
val mk_thread : Region.t -> lexeme -> thread
|
||||
|
||||
(* STATE *)
|
||||
|
||||
(* Beyond producing tokens, the result of lexing is a _state_. The
|
||||
type [state] represents the abstract logical state of the lexing
|
||||
engine, that is, a value which is threaded during scanning and
|
||||
which denotes useful, high-level information beyond what the type
|
||||
[Lexing.lexbuf] in the standard library already provides for all
|
||||
generic lexers. We qualify it as "logical state" because the lexing
|
||||
buffer itself has a "physical state" defined by the type
|
||||
[Lexing.lexbuf].
|
||||
|
||||
Tokens are the smallest units used by the parser to build the
|
||||
abstract syntax tree. The state includes a queue of recognised
|
||||
tokens, with the markup at the left of its lexeme until either the
|
||||
start of the file or the end of the previously recognised token.
|
||||
|
||||
The markup from the last recognised token or, if the first token
|
||||
has not been recognised yet, from the beginning of the file is
|
||||
stored in the field [markup] of the state because it is a
|
||||
side-effect, with respect to the output token list, and we use a
|
||||
record with a single field [units] because that record may be
|
||||
easily extended during the future maintenance of this lexer.
|
||||
|
||||
The state also includes a field [pos] which holds the current
|
||||
position in the LIGO source file. The position is not always
|
||||
updated after a single character has been matched: that depends on
|
||||
the regular expression that matched the lexing buffer.
|
||||
|
||||
The field [window] is a two-token window, that is, a buffer that
|
||||
contains the last recognised token, and the penultimate (if any).
|
||||
Technically, it is a parametric type, but its use is meant for
|
||||
tokens, wherever they are defined. In [Two (t1,t2)], and in case
|
||||
of a syntax error, [t1] is the first invalid token and [t2] is the
|
||||
last valid one.
|
||||
|
||||
The fields [decoder] and [supply] offer the support needed for
|
||||
the lexing of UTF-8 encoded characters in comments (the only place
|
||||
where they are allowed in LIGO). The former is the decoder proper
|
||||
and the latter is the effectful function [supply] that takes a
|
||||
byte, a start index and a length and feed it to [decoder]. See the
|
||||
documentation of the third-party library Uutf.
|
||||
|
||||
Some methods are now documented.
|
||||
|
||||
The call [state#enqueue token] updates functionally the state
|
||||
[state] by associating the token [token] with the stored markup and
|
||||
enqueuing the pair into the units queue. The field [markup] is then
|
||||
reset to the empty list.
|
||||
|
||||
The call [state#slide_token token] pushes the token [token] in
|
||||
the buffer [buffer]. If the buffer is full, that is, it is [Two
|
||||
(t1,t2)], then the token [t2] is discarded to make room for
|
||||
[token].
|
||||
|
||||
The call [state#sync buffer] updates the current position in
|
||||
accordance with the contents of the lexing buffer, more precisely,
|
||||
depending on the length of the string which has just been
|
||||
recognised by the scanner: that length is used as a positive offset
|
||||
to the current column.
|
||||
*)
|
||||
|
||||
type 'token window =
|
||||
Nil
|
||||
| One of 'token
|
||||
| Two of 'token * 'token
|
||||
|
||||
type 'token state = <
|
||||
units : (Markup.t list * 'token) FQueue.t;
|
||||
markup : Markup.t list;
|
||||
window : 'token window;
|
||||
last : Region.t;
|
||||
pos : Pos.t;
|
||||
decoder : Uutf.decoder;
|
||||
supply : Bytes.t -> int -> int -> unit;
|
||||
block : EvalOpt.block_comment option;
|
||||
line : EvalOpt.line_comment option;
|
||||
|
||||
enqueue : 'token -> 'token state;
|
||||
set_units : (Markup.t list * 'token) FQueue.t -> 'token state;
|
||||
set_last : Region.t -> 'token state;
|
||||
set_pos : Pos.t -> 'token state;
|
||||
slide_token : 'token -> 'token state;
|
||||
|
||||
sync : Lexing.lexbuf -> Region.t * lexeme * 'token state;
|
||||
|
||||
push_newline : Lexing.lexbuf -> 'token state;
|
||||
push_line : thread -> 'token state;
|
||||
push_block : thread -> 'token state;
|
||||
push_space : Lexing.lexbuf -> 'token state;
|
||||
push_tabs : Lexing.lexbuf -> 'token state;
|
||||
push_bom : Lexing.lexbuf -> 'token state;
|
||||
push_markup : Markup.t -> 'token state;
|
||||
>
|
||||
|
||||
(* LEXER INSTANCE *)
|
||||
|
||||
(* The function [open_token_stream] returns a lexer instance made of
|
||||
|
||||
* the input [input] of type [input];
|
||||
* a function [read] that extracts tokens from a lexing buffer,
|
||||
together with a lexing buffer [buffer] to read from,
|
||||
* a function [close] that closes that buffer,
|
||||
* a function [get_pos] that returns the current position, and
|
||||
* a function [get_last] that returns the region of the last
|
||||
recognised token.
|
||||
* a function [get_file] that returns the name of the file being
|
||||
scanned (empty string if [stdin]).
|
||||
|
||||
Note that a module [Token] is exported too, because the signature
|
||||
of the exported functions depend on it.
|
||||
|
||||
The type [window] is a two-token window, that is, a buffer that
|
||||
contains the last recognised token, and the penultimate (if any).
|
||||
|
||||
The call [read ?line ?block ~init ~scan ~token_to_region ~style
|
||||
input] evaluates in a lexer (also known as a tokeniser or scanner)
|
||||
whose type is [log:('token logger) -> Lexing.lexbuf -> 'token], and
|
||||
suitable for a parser generated by Menhir. The argument labelled
|
||||
[log] is a logger, that is, it may print a token and its left
|
||||
markup to a given channel, at the caller's discretion. The argument
|
||||
labelled [~init] is the scanner to be called first, usually for
|
||||
reading the BOM, then [scan] is used for the following calls. The
|
||||
function labelled [~style] is used to check stylistic constraints
|
||||
on the tokens and the markup between them.
|
||||
*)
|
||||
|
||||
type input =
|
||||
File of file_path
|
||||
| String of string
|
||||
| Channel of in_channel
|
||||
| Buffer of Lexing.lexbuf
|
||||
|
||||
type 'token logger = Markup.t list -> 'token -> unit
|
||||
|
||||
type 'token instance = {
|
||||
input : input;
|
||||
read : log:('token logger) -> Lexing.lexbuf -> 'token;
|
||||
buffer : Lexing.lexbuf;
|
||||
get_win : unit -> 'token window;
|
||||
get_pos : unit -> Pos.t;
|
||||
get_last : unit -> Region.t;
|
||||
get_file : unit -> file_path;
|
||||
close : unit -> unit
|
||||
}
|
||||
|
||||
type open_err = File_opening of string
|
||||
|
||||
val lexbuf_from_input :
|
||||
input -> (Lexing.lexbuf * (unit -> unit), open_err) Stdlib.result
|
||||
|
||||
val open_token_stream :
|
||||
?line:EvalOpt.line_comment ->
|
||||
?block:EvalOpt.block_comment ->
|
||||
init:('token state -> Lexing.lexbuf -> 'token state) ->
|
||||
scan:('token state -> Lexing.lexbuf -> 'token state) ->
|
||||
token_to_region:('token -> Region.t) ->
|
||||
style:('token ->
|
||||
(Lexing.lexbuf -> (Markup.t list * 'token) option) ->
|
||||
Lexing.lexbuf ->
|
||||
unit) ->
|
||||
input ->
|
||||
('token instance, open_err) Stdlib.result
|
@ -4,7 +4,8 @@ module Region = Simple_utils.Region
|
||||
|
||||
module type S =
|
||||
sig
|
||||
module Lexer : LexerLib.S
|
||||
module Lexer : Lexer.S
|
||||
type token = Lexer.token
|
||||
|
||||
val output_token :
|
||||
?offsets:bool ->
|
||||
@ -12,7 +13,7 @@ module type S =
|
||||
EvalOpt.command ->
|
||||
out_channel ->
|
||||
Markup.t list ->
|
||||
Lexer.token ->
|
||||
token ->
|
||||
unit
|
||||
|
||||
type file_path = string
|
||||
@ -22,19 +23,26 @@ module type S =
|
||||
[`Byte | `Point] ->
|
||||
?block:EvalOpt.block_comment ->
|
||||
?line:EvalOpt.line_comment ->
|
||||
Lexer.input ->
|
||||
token_to_region:(token -> Region.t) ->
|
||||
style:(token ->
|
||||
(Lexing.lexbuf -> (Markup.t list * token) option) ->
|
||||
Lexing.lexbuf ->
|
||||
unit) ->
|
||||
LexerLib.input ->
|
||||
EvalOpt.command ->
|
||||
(unit, string Region.reg) Stdlib.result
|
||||
end
|
||||
|
||||
module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) =
|
||||
module Make (Lexer: Lexer.S) : (S with module Lexer = Lexer) =
|
||||
struct
|
||||
module Lexer = Lexer
|
||||
module Token = Lexer.Token
|
||||
type token = Lexer.token
|
||||
|
||||
(** Pretty-printing in a string the lexemes making up the markup
|
||||
(* Pretty-printing in a string the lexemes making up the markup
|
||||
between two tokens, concatenated with the last lexeme
|
||||
itself. *)
|
||||
|
||||
let output_token ?(offsets=true) mode command
|
||||
channel left_mark token : unit =
|
||||
let output str = Printf.fprintf channel "%s%!" str in
|
||||
@ -57,10 +65,17 @@ module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) =
|
||||
|
||||
type file_path = string
|
||||
|
||||
let trace ?(offsets=true) mode ?block ?line input command :
|
||||
let trace ?(offsets=true) mode ?block ?line
|
||||
~token_to_region ~style input command :
|
||||
(unit, string Region.reg) Stdlib.result =
|
||||
match Lexer.open_token_stream ?line ?block input with
|
||||
Ok Lexer.{read; buffer; close; _} ->
|
||||
match LexerLib.open_token_stream
|
||||
~init:Lexer.init
|
||||
~scan:Lexer.scan
|
||||
~token_to_region
|
||||
~style
|
||||
?line ?block input
|
||||
with
|
||||
Ok LexerLib.{read; buffer; close; _} ->
|
||||
let log = output_token ~offsets mode command stdout
|
||||
and close_all () = flush_all (); close () in
|
||||
let rec iter () =
|
||||
@ -69,12 +84,17 @@ module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) =
|
||||
if Token.is_eof token
|
||||
then Stdlib.Ok ()
|
||||
else iter ()
|
||||
| exception Lexer.Token.Error error ->
|
||||
let msg =
|
||||
Lexer.Token.format_error
|
||||
~offsets mode ~file:true error
|
||||
in Stdlib.Error msg
|
||||
| exception Lexer.Error error ->
|
||||
let msg =
|
||||
Lexer.format_error ~offsets mode ~file:true error
|
||||
in Stdlib.Error msg in
|
||||
let result = iter ()
|
||||
in close_all (); result
|
||||
| Stdlib.Error (Lexer.File_opening msg) ->
|
||||
| Stdlib.Error (LexerLib.File_opening msg) ->
|
||||
flush_all (); Stdlib.Error (Region.wrap_ghost msg)
|
||||
end
|
||||
|
@ -2,7 +2,8 @@ module Region = Simple_utils.Region
|
||||
|
||||
module type S =
|
||||
sig
|
||||
module Lexer : LexerLib.S
|
||||
module Lexer : Lexer.S
|
||||
type token = Lexer.token
|
||||
|
||||
val output_token :
|
||||
?offsets:bool ->
|
||||
@ -10,7 +11,7 @@ module type S =
|
||||
EvalOpt.command ->
|
||||
out_channel ->
|
||||
Markup.t list ->
|
||||
Lexer.token ->
|
||||
token ->
|
||||
unit
|
||||
|
||||
type file_path = string
|
||||
@ -20,9 +21,14 @@ module type S =
|
||||
[`Byte | `Point] ->
|
||||
?block:EvalOpt.block_comment ->
|
||||
?line:EvalOpt.line_comment ->
|
||||
Lexer.input ->
|
||||
token_to_region:(token -> Region.t) ->
|
||||
style:(token ->
|
||||
(Lexing.lexbuf -> (Markup.t list * token) option) ->
|
||||
Lexing.lexbuf ->
|
||||
unit) ->
|
||||
LexerLib.input ->
|
||||
EvalOpt.command ->
|
||||
(unit, string Region.reg) Stdlib.result
|
||||
end
|
||||
|
||||
module Make (Lexer: LexerLib.S) : S with module Lexer = Lexer
|
||||
module Make (Lexer: Lexer.S) : S with module Lexer = Lexer
|
||||
|
@ -9,7 +9,7 @@ module type IO =
|
||||
val options : EvalOpt.options (* CLI options *)
|
||||
end
|
||||
|
||||
module Make (IO: IO) (Lexer: LexerLib.S) =
|
||||
module Make (IO: IO) (Lexer: Lexer.S) =
|
||||
struct
|
||||
(* Error printing and exception tracing *)
|
||||
|
||||
@ -39,11 +39,16 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
|
||||
| Stdlib.Ok pp_buffer ->
|
||||
(* Running the lexer on the preprocessed input *)
|
||||
|
||||
let source = Lexer.String (Buffer.contents pp_buffer) in
|
||||
match Lexer.open_token_stream ?line:IO.options#line
|
||||
let source = LexerLib.String (Buffer.contents pp_buffer) in
|
||||
match LexerLib.open_token_stream
|
||||
?line:IO.options#line
|
||||
?block:IO.options#block
|
||||
~init:Lexer.init
|
||||
~scan:Lexer.scan
|
||||
~token_to_region:Lexer.Token.to_region
|
||||
~style:Lexer.Token.check_right_context
|
||||
source with
|
||||
Ok Lexer.{read; buffer; close; _} ->
|
||||
Ok LexerLib.{read; buffer; close; _} ->
|
||||
let close_all () = flush_all (); close () in
|
||||
let rec read_tokens tokens =
|
||||
match read ~log:(fun _ _ -> ()) buffer with
|
||||
@ -51,9 +56,7 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
|
||||
if Lexer.Token.is_eof token
|
||||
then Stdlib.Ok (List.rev tokens)
|
||||
else read_tokens (token::tokens)
|
||||
(* | exception _ ->
|
||||
Printf.eprintf "Here\n%!"; exit 1
|
||||
*) | exception Lexer.Token.Error error ->
|
||||
| exception Lexer.Token.Error error ->
|
||||
let file =
|
||||
match IO.options#input with
|
||||
None | Some "-" -> false
|
||||
@ -74,7 +77,7 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
|
||||
in Stdlib.Error msg in
|
||||
let result = read_tokens []
|
||||
in close_all (); result
|
||||
| Stdlib.Error (Lexer.File_opening msg) ->
|
||||
| Stdlib.Error (LexerLib.File_opening msg) ->
|
||||
flush_all (); Stdlib.Error (Region.wrap_ghost msg) in
|
||||
match IO.options#input with
|
||||
None -> preproc stdin
|
||||
@ -115,7 +118,9 @@ module Make (IO: IO) (Lexer: LexerLib.S) =
|
||||
IO.options#mode
|
||||
?block:IO.options#block
|
||||
?line:IO.options#line
|
||||
(Lexer.String preproc_str)
|
||||
~token_to_region:Lexer.Token.to_region
|
||||
~style:Lexer.Token.check_right_context
|
||||
(LexerLib.String preproc_str)
|
||||
IO.options#cmd
|
||||
in match IO.options#input with
|
||||
None -> preproc stdin
|
||||
|
@ -7,7 +7,7 @@ module type IO =
|
||||
val options : EvalOpt.options (* CLI options *)
|
||||
end
|
||||
|
||||
module Make (IO: IO) (Lexer: LexerLib.S) :
|
||||
module Make (IO: IO) (Lexer: Lexer.S) :
|
||||
sig
|
||||
val scan : unit -> (Lexer.token list, string Region.reg) Stdlib.result
|
||||
val trace : unit -> (unit, string Region.reg) Stdlib.result
|
||||
|
@ -56,8 +56,8 @@ module type PARSER =
|
||||
(* Main functor *)
|
||||
|
||||
module Make (IO: IO)
|
||||
(Lexer: LexerLib.S)
|
||||
(Parser: PARSER with type token = Lexer.Token.token)
|
||||
(Lexer: Lexer.S)
|
||||
(Parser: PARSER with type token = Lexer.token)
|
||||
(ParErr: sig val message : int -> string end) =
|
||||
struct
|
||||
module I = Parser.MenhirInterpreter
|
||||
@ -143,14 +143,14 @@ module Make (IO: IO)
|
||||
~offsets:IO.options#offsets
|
||||
IO.options#mode IO.options#cmd stdout
|
||||
|
||||
let incr_contract Lexer.{read; buffer; get_win; close; _} =
|
||||
let incr_contract LexerLib.{read; buffer; get_win; close; _} =
|
||||
let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer
|
||||
and failure = failure get_win in
|
||||
let parser = Incr.contract buffer.Lexing.lex_curr_p in
|
||||
let ast = I.loop_handle success failure supplier parser
|
||||
in flush_all (); close (); ast
|
||||
|
||||
let incr_expr Lexer.{read; buffer; get_win; close; _} =
|
||||
let incr_expr LexerLib.{read; buffer; get_win; close; _} =
|
||||
let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer
|
||||
and failure = failure get_win in
|
||||
let parser = Incr.interactive_expr buffer.Lexing.lex_curr_p in
|
||||
|
@ -55,8 +55,8 @@ module type PARSER =
|
||||
end
|
||||
|
||||
module Make (IO: IO)
|
||||
(Lexer: LexerLib.S)
|
||||
(Parser: PARSER with type token = Lexer.Token.token)
|
||||
(Lexer: Lexer.S)
|
||||
(Parser: PARSER with type token = Lexer.token)
|
||||
(ParErr: sig val message : int -> string end) :
|
||||
sig
|
||||
(* WARNING: The following parsers may all raise [Lexer.Error] *)
|
||||
@ -78,8 +78,8 @@ module Make (IO: IO)
|
||||
|
||||
exception Point of error
|
||||
|
||||
val incr_contract : Lexer.instance -> Parser.ast
|
||||
val incr_expr : Lexer.instance -> Parser.expr
|
||||
val incr_contract : Lexer.token LexerLib.instance -> Parser.ast
|
||||
val incr_expr : Lexer.token LexerLib.instance -> Parser.expr
|
||||
|
||||
val format_error :
|
||||
?offsets:bool -> [`Point | `Byte] -> error -> string Region.reg
|
||||
|
@ -37,7 +37,7 @@ module type Printer =
|
||||
val print_expr : state -> expr -> unit
|
||||
end
|
||||
|
||||
module Make (Lexer: LexerLib.S)
|
||||
module Make (Lexer: Lexer.S)
|
||||
(AST: sig type t type expr end)
|
||||
(Parser: ParserAPI.PARSER
|
||||
with type ast = AST.t
|
||||
@ -89,12 +89,12 @@ module Make (Lexer: LexerLib.S)
|
||||
ParserLog.mk_state ~offsets:SubIO.options#offsets
|
||||
~mode:SubIO.options#mode
|
||||
~buffer:output in
|
||||
let close () = lexer_inst.Lexer.close () in
|
||||
let close () = lexer_inst.LexerLib.close () in
|
||||
let expr =
|
||||
try
|
||||
if SubIO.options#mono then
|
||||
let tokeniser = lexer_inst.Lexer.read ~log
|
||||
and lexbuf = lexer_inst.Lexer.buffer
|
||||
let tokeniser = lexer_inst.LexerLib.read ~log
|
||||
and lexbuf = lexer_inst.LexerLib.buffer
|
||||
in Front.mono_expr tokeniser lexbuf
|
||||
else
|
||||
Front.incr_expr lexer_inst
|
||||
@ -124,12 +124,12 @@ module Make (Lexer: LexerLib.S)
|
||||
ParserLog.mk_state ~offsets:SubIO.options#offsets
|
||||
~mode:SubIO.options#mode
|
||||
~buffer:output in
|
||||
let close () = lexer_inst.Lexer.close () in
|
||||
let close () = lexer_inst.LexerLib.close () in
|
||||
let ast =
|
||||
try
|
||||
if SubIO.options#mono then
|
||||
let tokeniser = lexer_inst.Lexer.read ~log
|
||||
and lexbuf = lexer_inst.Lexer.buffer
|
||||
let tokeniser = lexer_inst.LexerLib.read ~log
|
||||
and lexbuf = lexer_inst.LexerLib.buffer
|
||||
in Front.mono_contract tokeniser lexbuf
|
||||
else
|
||||
Front.incr_contract lexer_inst
|
||||
@ -163,9 +163,17 @@ module Make (Lexer: LexerLib.S)
|
||||
|
||||
| exception Lexer.Error err ->
|
||||
let file =
|
||||
lexer_inst.Lexer.buffer.Lexing.lex_curr_p.Lexing.pos_fname in
|
||||
let error =
|
||||
Lexer.format_error ~offsets:SubIO.options#offsets
|
||||
lexer_inst.LexerLib.buffer.Lexing.lex_curr_p.Lexing.pos_fname in
|
||||
let error = Lexer.format_error
|
||||
~offsets:SubIO.options#offsets
|
||||
SubIO.options#mode err ~file:(file <> "")
|
||||
in Stdlib.Error error
|
||||
|
||||
| exception Lexer.Token.Error err ->
|
||||
let file =
|
||||
lexer_inst.LexerLib.buffer.Lexing.lex_curr_p.Lexing.pos_fname in
|
||||
let error = Lexer.Token.format_error
|
||||
~offsets:SubIO.options#offsets
|
||||
SubIO.options#mode err ~file:(file <> "")
|
||||
in Stdlib.Error error
|
||||
|
||||
@ -181,7 +189,7 @@ module Make (Lexer: LexerLib.S)
|
||||
|
||||
| exception Parser.Error ->
|
||||
let invalid, valid_opt =
|
||||
match lexer_inst.Lexer.get_win () with
|
||||
match lexer_inst.LexerLib.get_win () with
|
||||
LexerLib.Nil ->
|
||||
assert false (* Safe: There is always at least EOF. *)
|
||||
| LexerLib.One invalid -> invalid, None
|
||||
@ -205,8 +213,8 @@ module Make (Lexer: LexerLib.S)
|
||||
(* Parsing a contract *)
|
||||
|
||||
let gen_parser options input parser =
|
||||
match Lexer.lexbuf_from_input input with
|
||||
Stdlib.Error (Lexer.File_opening msg) ->
|
||||
match LexerLib.lexbuf_from_input input with
|
||||
Stdlib.Error (LexerLib.File_opening msg) ->
|
||||
Stdlib.Error (Region.wrap_ghost msg)
|
||||
| Ok (lexbuf, close) ->
|
||||
(* Preprocessing the input source *)
|
||||
@ -224,48 +232,53 @@ module Make (Lexer: LexerLib.S)
|
||||
(* Lexing and parsing the preprocessed input source *)
|
||||
|
||||
let () = close () in
|
||||
let input' = Lexer.String (Buffer.contents buffer) in
|
||||
match Lexer.open_token_stream ?line:options#line
|
||||
let input' = LexerLib.String (Buffer.contents buffer) in
|
||||
match LexerLib.open_token_stream
|
||||
~init:Lexer.init
|
||||
~scan:Lexer.scan
|
||||
~token_to_region:Lexer.Token.to_region
|
||||
~style:Lexer.Token.check_right_context
|
||||
?line:options#line
|
||||
?block:options#block
|
||||
input'
|
||||
with
|
||||
Ok instance ->
|
||||
let open Lexing in
|
||||
instance.Lexer.buffer.lex_curr_p <-
|
||||
{instance.Lexer.buffer.lex_curr_p with pos_fname=file};
|
||||
instance.LexerLib.buffer.lex_curr_p <-
|
||||
{instance.LexerLib.buffer.lex_curr_p with pos_fname=file};
|
||||
apply instance parser
|
||||
| Stdlib.Error (Lexer.File_opening msg) ->
|
||||
| Stdlib.Error (LexerLib.File_opening msg) ->
|
||||
Stdlib.Error (Region.wrap_ghost msg)
|
||||
|
||||
(* Parsing a contract in a file *)
|
||||
|
||||
let contract_in_file (source : string) =
|
||||
let options = SubIO.make ~input:(Some source) ~expr:false
|
||||
in gen_parser options (Lexer.File source) parse_contract
|
||||
in gen_parser options (LexerLib.File source) parse_contract
|
||||
|
||||
(* Parsing a contract in a string *)
|
||||
|
||||
let contract_in_string (source : string) =
|
||||
let options = SubIO.make ~input:None ~expr:false in
|
||||
gen_parser options (Lexer.String source) parse_contract
|
||||
gen_parser options (LexerLib.String source) parse_contract
|
||||
|
||||
(* Parsing a contract in stdin *)
|
||||
|
||||
let contract_in_stdin () =
|
||||
let options = SubIO.make ~input:None ~expr:false in
|
||||
gen_parser options (Lexer.Channel stdin) parse_contract
|
||||
gen_parser options (LexerLib.Channel stdin) parse_contract
|
||||
|
||||
(* Parsing an expression in a string *)
|
||||
|
||||
let expr_in_string (source : string) =
|
||||
let options = SubIO.make ~input:None ~expr:true in
|
||||
gen_parser options (Lexer.String source) parse_expr
|
||||
gen_parser options (LexerLib.String source) parse_expr
|
||||
|
||||
(* Parsing an expression in stdin *)
|
||||
|
||||
let expr_in_stdin () =
|
||||
let options = SubIO.make ~input:None ~expr:true in
|
||||
gen_parser options (Lexer.Channel stdin) parse_expr
|
||||
gen_parser options (LexerLib.Channel stdin) parse_expr
|
||||
|
||||
(* Preprocess only *)
|
||||
|
||||
|
@ -43,7 +43,7 @@ module type Printer =
|
||||
|
||||
(* Main functor to make the parser *)
|
||||
|
||||
module Make (Lexer : LexerLib.S)
|
||||
module Make (Lexer : Lexer.S)
|
||||
(AST : sig type t type expr end)
|
||||
(Parser : ParserAPI.PARSER
|
||||
with type ast = AST.t
|
||||
|
@ -1 +1 @@
|
||||
const a: string = -0x222;
|
||||
const a: string = - (**) 0x2222
|
||||
|
@ -1 +1 @@
|
||||
let a = -0x222;
|
||||
let a = - (**) 0x2222
|
||||
|
@ -1 +1 @@
|
||||
let a = -0x222;
|
||||
let a = - /**/ 0x2222;
|
||||
|
@ -1 +0,0 @@
|
||||
let arguments = 1;
|
Loading…
Reference in New Issue
Block a user