From 005b7fd69b0e0a3b5eb5aeee9de49495041681f0 Mon Sep 17 00:00:00 2001 From: Christian Rinderknecht Date: Tue, 28 Apr 2020 19:26:31 +0200 Subject: [PATCH] Finished refactoring of lexer. --- src/bin/expect_tests/lexer_tests.ml | 59 +- src/passes/1-parser/cameligo/.links | 1 + src/passes/1-parser/cameligo/LexToken.mll | 334 ++++++----- src/passes/1-parser/pascaligo/.links | 1 + src/passes/1-parser/pascaligo/LexToken.mli | 2 +- src/passes/1-parser/pascaligo/LexToken.mll | 133 ++--- src/passes/1-parser/reasonligo/.links | 1 + src/passes/1-parser/reasonligo/LexToken.mli | 71 +-- src/passes/1-parser/reasonligo/LexToken.mll | 402 +++++++------ src/passes/1-parser/shared/.links | 2 - src/passes/1-parser/shared/Lexer.mli | 32 +- src/passes/1-parser/shared/Lexer.mll | 253 ++------ src/passes/1-parser/shared/LexerLib.ml | 584 +++++++++---------- src/passes/1-parser/shared/LexerLib.mli | 205 +++++++ src/passes/1-parser/shared/LexerLog.ml | 38 +- src/passes/1-parser/shared/LexerLog.mli | 14 +- src/passes/1-parser/shared/LexerUnit.ml | 27 +- src/passes/1-parser/shared/LexerUnit.mli | 2 +- src/passes/1-parser/shared/ParserAPI.ml | 8 +- src/passes/1-parser/shared/ParserAPI.mli | 8 +- src/passes/1-parser/shared/ParserUnit.ml | 65 ++- src/passes/1-parser/shared/ParserUnit.mli | 2 +- src/test/lexer/negative_byte_sequence.ligo | 2 +- src/test/lexer/negative_byte_sequence.mligo | 2 +- src/test/lexer/negative_byte_sequence.religo | 2 +- src/test/lexer/reserved_name.ligo | 1 - src/test/lexer/reserved_name.religo | 2 +- 27 files changed, 1174 insertions(+), 1079 deletions(-) delete mode 100644 src/passes/1-parser/shared/.links create mode 100644 src/passes/1-parser/shared/LexerLib.mli delete mode 100644 src/test/lexer/reserved_name.ligo diff --git a/src/bin/expect_tests/lexer_tests.ml b/src/bin/expect_tests/lexer_tests.ml index bde7e658c..20d26ab11 100644 --- a/src/bin/expect_tests/lexer_tests.ml +++ b/src/bin/expect_tests/lexer_tests.ml @@ -2,7 +2,7 @@ open Cli_expect let%expect_test _ = 
run_ligo_bad [ "compile-contract" ; "../../test/lexer/broken_string.ligo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "broken_string.ligo", line 1, characters 18-19: The string starting here is interrupted by a line break. Hint: Remove the break, close the string before or insert a backslash. @@ -19,7 +19,7 @@ ligo: : Lexical error in file "broken_string.ligo", line 1, characters 18-19: |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/broken_string.mligo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "broken_string.mligo", line 1, characters 8-9: The string starting here is interrupted by a line break. Hint: Remove the break, close the string before or insert a backslash. @@ -36,7 +36,7 @@ ligo: : Lexical error in file "broken_string.mligo", line 1, characters 8-9: |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/broken_string.religo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "broken_string.religo", line 1, characters 8-9: The string starting here is interrupted by a line break. Hint: Remove the break, close the string before or insert a backslash. @@ -53,8 +53,8 @@ ligo: : Lexical error in file "broken_string.religo", line 1, characters 8-9: |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.ligo" ; "main" ] ; - [%expect {| -ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters 18-23: + [%expect {| +ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters 18-31: Negative byte sequence. Hint: Remove the leading minus sign. 
{} @@ -70,8 +70,8 @@ ligo: : Lexical error in file "negative_byte_sequence.ligo", line 1, characters |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.mligo" ; "main" ] ; - [%expect {| -ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters 8-13: + [%expect {| +ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters 8-21: Negative byte sequence. Hint: Remove the leading minus sign. {} @@ -87,8 +87,8 @@ ligo: : Lexical error in file "negative_byte_sequence.mligo", line 1, characters |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/negative_byte_sequence.religo" ; "main" ] ; - [%expect {| -ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, characters 8-13: + [%expect {| +ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, characters 8-21: Negative byte sequence. Hint: Remove the leading minus sign. {} @@ -103,8 +103,9 @@ ligo: : Lexical error in file "negative_byte_sequence.religo", line 1, character * Check the changelog by running 'ligo changelog' |} ]; +(* run_ligo_bad [ "compile-contract" ; "../../test/lexer/reserved_name.ligo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "reserved_name.ligo", line 1, characters 4-13: Reserved name: "arguments". Hint: Change the name. @@ -119,9 +120,10 @@ ligo: : Lexical error in file "reserved_name.ligo", line 1, characters 4-13: * Open a gitlab issue: https://gitlab.com/ligolang/ligo/issues/new * Check the changelog by running 'ligo changelog' |} ]; + *) run_ligo_bad [ "compile-contract" ; "../../test/lexer/reserved_name.religo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "reserved_name.religo", line 1, characters 4-7: Reserved name: "end". Hint: Change the name. 
@@ -138,7 +140,7 @@ ligo: : Lexical error in file "reserved_name.religo", line 1, characters 4-7: |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/reserved_name.mligo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "reserved_name.mligo", line 1, characters 4-10: Reserved name: "object". Hint: Change the name. @@ -155,7 +157,7 @@ ligo: : Lexical error in file "reserved_name.mligo", line 1, characters 4-10: |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/unexpected_character.ligo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "unexpected_character.ligo", line 1, characters 18-19: Unexpected character '\239'. {} @@ -171,7 +173,7 @@ ligo: : Lexical error in file "unexpected_character.ligo", line 1, characters 18 |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/unexpected_character.mligo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "unexpected_character.mligo", line 1, characters 8-9: Unexpected character '\239'. {} @@ -187,7 +189,7 @@ ligo: : Lexical error in file "unexpected_character.mligo", line 1, characters 8 |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/unexpected_character.religo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "unexpected_character.religo", line 1, characters 8-9: Unexpected character '\239'. {} @@ -203,11 +205,10 @@ ligo: : Lexical error in file "unexpected_character.religo", line 1, characters |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/unterminated_comment.mligo" ; "main" ] ; - [%expect {| -ligo: : Lexical error in file "unterminated_comment.mligo", line 1, characters 0-2: + [%expect {| +ligo: : Preprocessing error in file "../../test/lexer/unterminated_comment.mligo", line 1, characters 0-2: Unterminated comment. - Hint: Close with "*)". - {} + Hint: Close with "*)". 
{} If you're not sure how to fix this error, you can @@ -220,7 +221,7 @@ ligo: : Lexical error in file "unterminated_comment.mligo", line 1, characters 0 |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/invalid_symbol.ligo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "invalid_symbol.ligo", line 1, characters 17-20: Invalid symbol. Hint: Check the LIGO syntax you use. @@ -237,7 +238,7 @@ ligo: : Lexical error in file "invalid_symbol.ligo", line 1, characters 17-20: |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/invalid_symbol.mligo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "invalid_symbol.mligo", line 1, characters 10-13: Invalid symbol. Hint: Check the LIGO syntax you use. @@ -254,7 +255,7 @@ ligo: : Lexical error in file "invalid_symbol.mligo", line 1, characters 10-13: |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/invalid_symbol.religo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "invalid_symbol.religo", line 1, characters 10-11: Invalid symbol. Hint: Check the LIGO syntax you use. @@ -271,7 +272,7 @@ ligo: : Lexical error in file "invalid_symbol.religo", line 1, characters 10-11: |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/missing_break.ligo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "missing_break.ligo", line 1, characters 18-18: Missing break. Hint: Insert some space. @@ -288,7 +289,7 @@ ligo: : Lexical error in file "missing_break.ligo", line 1, characters 18-18: |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/missing_break.mligo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "missing_break.mligo", line 1, characters 11-11: Missing break. Hint: Insert some space. 
@@ -305,7 +306,7 @@ ligo: : Lexical error in file "missing_break.mligo", line 1, characters 11-11: |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/missing_break.religo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "missing_break.religo", line 1, characters 11-11: Missing break. Hint: Insert some space. @@ -322,7 +323,7 @@ ligo: : Lexical error in file "missing_break.religo", line 1, characters 11-11: |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/invalid_character_in_string.ligo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "invalid_character_in_string.ligo", line 1, characters 19-20: Invalid character in string. Hint: Remove or replace the character. @@ -339,7 +340,7 @@ ligo: : Lexical error in file "invalid_character_in_string.ligo", line 1, charac |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/invalid_character_in_string.mligo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "invalid_character_in_string.mligo", line 1, characters 9-10: Invalid character in string. Hint: Remove or replace the character. @@ -356,7 +357,7 @@ ligo: : Lexical error in file "invalid_character_in_string.mligo", line 1, chara |} ]; run_ligo_bad [ "compile-contract" ; "../../test/lexer/invalid_character_in_string.religo" ; "main" ] ; - [%expect {| + [%expect {| ligo: : Lexical error in file "invalid_character_in_string.religo", line 1, characters 9-10: Invalid character in string. Hint: Remove or replace the character. 
@@ -370,4 +371,4 @@ ligo: : Lexical error in file "invalid_character_in_string.religo", line 1, char * Ask a question on our Discord: https://discord.gg/9rhYaEt * Open a gitlab issue: https://gitlab.com/ligolang/ligo/issues/new * Check the changelog by running 'ligo changelog' - |} ] \ No newline at end of file + |} ] diff --git a/src/passes/1-parser/cameligo/.links b/src/passes/1-parser/cameligo/.links index 702a10aca..fc8466c8e 100644 --- a/src/passes/1-parser/cameligo/.links +++ b/src/passes/1-parser/cameligo/.links @@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile ../shared/Lexer.mli ../shared/Lexer.mll +../shared/LexerLib.ml ../shared/EvalOpt.ml ../shared/EvalOpt.mli ../shared/FQueue.ml diff --git a/src/passes/1-parser/cameligo/LexToken.mll b/src/passes/1-parser/cameligo/LexToken.mll index bb44c4e78..a87150cf7 100644 --- a/src/passes/1-parser/cameligo/LexToken.mll +++ b/src/passes/1-parser/cameligo/LexToken.mll @@ -102,138 +102,167 @@ type t = | EOF of Region.t (* End of file *) + +(* Projections *) + type token = t let proj_token = function - ARROW region -> region, "ARROW" -| CONS region -> region, "CONS" -| CAT region -> region, "CAT" -| MINUS region -> region, "MINUS" -| PLUS region -> region, "PLUS" -| SLASH region -> region, "SLASH" -| TIMES region -> region, "TIMES" -| LPAR region -> region, "LPAR" -| RPAR region -> region, "RPAR" -| LBRACKET region -> region, "LBRACKET" -| RBRACKET region -> region, "RBRACKET" -| LBRACE region -> region, "LBRACE" -| RBRACE region -> region, "RBRACE" -| COMMA region -> region, "COMMA" -| SEMI region -> region, "SEMI" -| VBAR region -> region, "VBAR" -| COLON region -> region, "COLON" -| DOT region -> region, "DOT" -| WILD region -> region, "WILD" -| EQ region -> region, "EQ" -| NE region -> region, "NE" -| LT region -> region, "LT" -| GT region -> region, "GT" -| LE region -> region, "LE" -| GE region -> region, "GE" -| BOOL_OR region -> region, "BOOL_OR" -| BOOL_AND region -> region, "BOOL_AND" -| Ident Region.{region; 
value} -> - region, sprintf "Ident %s" value -| Constr Region.{region; value} -> - region, sprintf "Constr %s" value + (* Literals *) + + String Region.{region; value} -> + region, sprintf "Str %s" value +| Bytes Region.{region; value = s,b} -> + region, + sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b) | Int Region.{region; value = s,n} -> region, sprintf "Int (\"%s\", %s)" s (Z.to_string n) | Nat Region.{region; value = s,n} -> region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n) | Mutez Region.{region; value = s,n} -> region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n) -| String Region.{region; value} -> - region, sprintf "Str %s" value -| Bytes Region.{region; value = s,b} -> - region, - sprintf "Bytes (\"%s\", \"0x%s\")" - s (Hex.show b) +| Ident Region.{region; value} -> + region, sprintf "Ident %s" value +| Constr Region.{region; value} -> + region, sprintf "Constr %s" value | Attr Region.{region; value} -> region, sprintf "Attr \"%s\"" value + + (* Symbols *) + +| ARROW region -> region, "ARROW" +| CONS region -> region, "CONS" +| CAT region -> region, "CAT" +| MINUS region -> region, "MINUS" +| PLUS region -> region, "PLUS" +| SLASH region -> region, "SLASH" +| TIMES region -> region, "TIMES" +| LPAR region -> region, "LPAR" +| RPAR region -> region, "RPAR" +| LBRACKET region -> region, "LBRACKET" +| RBRACKET region -> region, "RBRACKET" +| LBRACE region -> region, "LBRACE" +| RBRACE region -> region, "RBRACE" +| COMMA region -> region, "COMMA" +| SEMI region -> region, "SEMI" +| VBAR region -> region, "VBAR" +| COLON region -> region, "COLON" +| DOT region -> region, "DOT" +| WILD region -> region, "WILD" +| EQ region -> region, "EQ" +| NE region -> region, "NE" +| LT region -> region, "LT" +| GT region -> region, "GT" +| LE region -> region, "LE" +| GE region -> region, "GE" +| BOOL_OR region -> region, "BOOL_OR" +| BOOL_AND region -> region, "BOOL_AND" + + (* Keywords *) + | Begin region -> region, "Begin" -| Else region -> region, "Else" -| End 
region -> region, "End" +| Else region -> region, "Else" +| End region -> region, "End" | False region -> region, "False" -| Fun region -> region, "Fun" -| Rec region -> region, "Rec" -| If region -> region, "If" -| In region -> region, "In" -| Let region -> region, "Let" +| Fun region -> region, "Fun" +| Rec region -> region, "Rec" +| If region -> region, "If" +| In region -> region, "In" +| Let region -> region, "Let" | Match region -> region, "Match" -| Mod region -> region, "Mod" -| Not region -> region, "Not" -| Of region -> region, "Of" -| Or region -> region, "Or" -| Then region -> region, "Then" -| True region -> region, "True" -| Type region -> region, "Type" -| With region -> region, "With" -| C_None region -> region, "C_None" -| C_Some region -> region, "C_Some" +| Mod region -> region, "Mod" +| Not region -> region, "Not" +| Of region -> region, "Of" +| Or region -> region, "Or" +| Then region -> region, "Then" +| True region -> region, "True" +| Type region -> region, "Type" +| With region -> region, "With" + + (* Data *) + +| C_None region -> region, "C_None" +| C_Some region -> region, "C_Some" + + (* Virtual tokens *) + | EOF region -> region, "EOF" -let to_lexeme = function - ARROW _ -> "->" -| CONS _ -> "::" -| CAT _ -> "^" -| MINUS _ -> "-" -| PLUS _ -> "+" -| SLASH _ -> "/" -| TIMES _ -> "*" -| LPAR _ -> "(" -| RPAR _ -> ")" -| LBRACKET _ -> "[" -| RBRACKET _ -> "]" -| LBRACE _ -> "{" -| RBRACE _ -> "}" -| COMMA _ -> "," -| SEMI _ -> ";" -| VBAR _ -> "|" -| COLON _ -> ":" -| DOT _ -> "." 
-| WILD _ -> "_" -| EQ _ -> "=" -| NE _ -> "<>" -| LT _ -> "<" -| GT _ -> ">" -| LE _ -> "<=" -| GE _ -> ">=" -| BOOL_OR _ -> "||" -| BOOL_AND _ -> "&&" -| Ident id -> id.Region.value -| Constr id -> id.Region.value +let to_lexeme = function + (* Literals *) + + String s -> String.escaped s.Region.value +| Bytes b -> fst b.Region.value | Int i | Nat i -| Mutez i -> fst i.Region.value -| String s -> String.escaped s.Region.value -| Bytes b -> fst b.Region.value -| Attr a -> a.Region.value +| Mutez i -> fst i.Region.value +| Ident id -> id.Region.value +| Constr id -> id.Region.value +| Attr a -> a.Region.value + + (* Symbols *) + +| ARROW _ -> "->" +| CONS _ -> "::" +| CAT _ -> "^" +| MINUS _ -> "-" +| PLUS _ -> "+" +| SLASH _ -> "/" +| TIMES _ -> "*" +| LPAR _ -> "(" +| RPAR _ -> ")" +| LBRACKET _ -> "[" +| RBRACKET _ -> "]" +| LBRACE _ -> "{" +| RBRACE _ -> "}" +| COMMA _ -> "," +| SEMI _ -> ";" +| VBAR _ -> "|" +| COLON _ -> ":" +| DOT _ -> "." +| WILD _ -> "_" +| EQ _ -> "=" +| NE _ -> "<>" +| LT _ -> "<" +| GT _ -> ">" +| LE _ -> "<=" +| GE _ -> ">=" +| BOOL_OR _ -> "||" +| BOOL_AND _ -> "&&" + + (* Keywords *) | Begin _ -> "begin" -| Else _ -> "else" -| End _ -> "end" +| Else _ -> "else" +| End _ -> "end" | False _ -> "false" -| Fun _ -> "fun" -| Rec _ -> "rec" -| If _ -> "if" -| In _ -> "in" -| Let _ -> "let" +| Fun _ -> "fun" +| Rec _ -> "rec" +| If _ -> "if" +| In _ -> "in" +| Let _ -> "let" | Match _ -> "match" -| Mod _ -> "mod" -| Not _ -> "not" -| Of _ -> "of" -| Or _ -> "or" -| True _ -> "true" -| Type _ -> "type" -| Then _ -> "then" -| With _ -> "with" +| Mod _ -> "mod" +| Not _ -> "not" +| Of _ -> "of" +| Or _ -> "or" +| True _ -> "true" +| Type _ -> "type" +| Then _ -> "then" +| With _ -> "with" + +(* Data constructors *) | C_None _ -> "None" | C_Some _ -> "Some" +(* Virtual tokens *) + | EOF _ -> "" +(* CONVERSIONS *) + let to_string token ?(offsets=true) mode = let region, val_str = proj_token token in let reg_str = region#compact ~offsets mode @@ 
-241,10 +270,6 @@ let to_string token ?(offsets=true) mode = let to_region token = proj_token token |> fst -(* Injections *) - -type int_err = Non_canonical_zero - (* LEXIS *) let keywords = [ @@ -385,6 +410,8 @@ let mk_bytes lexeme region = let value = lexeme, `Hex norm in Bytes Region.{region; value} +type int_err = Non_canonical_zero + let mk_int lexeme region = let z = Str.(global_replace (regexp "_") "" lexeme) |> Z.of_string @@ -398,23 +425,21 @@ type nat_err = let mk_nat lexeme region = match (String.index_opt lexeme 'n') with - | None -> Error Invalid_natural - | Some _ -> ( - let z = - Str.(global_replace (regexp "_") "" lexeme) |> - Str.(global_replace (regexp "n") "") |> - Z.of_string in - if Z.equal z Z.zero && lexeme <> "0n" - then Error Non_canonical_zero_nat - else Ok (Nat Region.{region; value = lexeme,z}) - ) + None -> Error Invalid_natural + | Some _ -> let z = + Str.(global_replace (regexp "_") "" lexeme) |> + Str.(global_replace (regexp "n") "") |> + Z.of_string in + if Z.equal z Z.zero && lexeme <> "0n" + then Error Non_canonical_zero_nat + else Ok (Nat Region.{region; value = lexeme,z}) let mk_mutez lexeme region = let z = Str.(global_replace (regexp "_") "" lexeme) |> Str.(global_replace (regexp "mutez") "") |> Z.of_string in - if Z.equal z Z.zero && lexeme <> "0mutez" + if Z.equal z Z.zero && lexeme <> "0mutez" then Error Non_canonical_zero else Ok (Mutez Region.{region; value = lexeme, z}) @@ -422,8 +447,6 @@ let eof region = EOF region type sym_err = Invalid_symbol -type attr_err = Invalid_attribute - let mk_sym lexeme region = match lexeme with (* Lexemes in common with all concrete syntaxes *) @@ -473,24 +496,27 @@ let mk_constr lexeme region = (* Attributes *) +type attr_err = Invalid_attribute + let mk_attr header lexeme region = - if header = "[@" then - Error Invalid_attribute + if header = "[@" then Error Invalid_attribute else Ok (Attr Region.{value=lexeme; region}) (* Predicates *) let is_string = function String _ -> true | _ -> 
false -let is_bytes = function Bytes _ -> true | _ -> false -let is_int = function Int _ -> true | _ -> false -let is_ident = function Ident _ -> true | _ -> false -let is_eof = function EOF _ -> true | _ -> false +let is_bytes = function Bytes _ -> true | _ -> false +let is_int = function Int _ -> true | _ -> false +let is_ident = function Ident _ -> true | _ -> false +let is_eof = function EOF _ -> true | _ -> false +let is_minus = function MINUS _ -> true | _ -> false (* Errors *) type error = Odd_lengthed_bytes | Missing_break +| Negative_byte_sequence let error_to_string = function Odd_lengthed_bytes -> @@ -499,6 +525,9 @@ let error_to_string = function | Missing_break -> "Missing break.\n\ Hint: Insert some space." +| Negative_byte_sequence -> + "Negative byte sequence.\n\ + Hint: Remove the leading minus sign." exception Error of error Region.reg @@ -511,29 +540,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file = let fail region value = raise (Error Region.{region; value}) let check_right_context token next_token buffer : unit = - if not (is_eof token) then - if is_int token || is_bytes token then - match next_token buffer with - Some ([], next) -> - let pos = (to_region token)#stop in - let region = Region.make ~start:pos ~stop:pos in - if is_int next then - fail region Odd_lengthed_bytes - else - if is_ident next || is_string next || is_bytes next then - fail region Missing_break - | Some (_::_, _) | None -> () - else - if is_ident token || is_string token then - match next_token buffer with - Some ([], next) -> - if is_ident next || is_string next - || is_bytes next || is_int next - then - let pos = (to_region token)#stop in - let region = Region.make ~start:pos ~stop:pos - in fail region Missing_break - | Some (_::_, _) | None -> () + let pos = (to_region token)#stop in + let region = Region.make ~start:pos ~stop:pos in + match next_token buffer with + None -> () + | Some (markup, next) -> + if is_minus token && is_bytes next + 
then let region = + Region.cover (to_region token) (to_region next) + in fail region Negative_byte_sequence + else + match markup with + [] -> + if is_int token + then if is_string next || is_ident next + then fail region Missing_break + else () + else + if is_string token + then if is_int next || is_bytes next || is_ident next + then fail region Missing_break + else () + else + if is_bytes token + then if is_string next || is_ident next + then fail region Missing_break + else if is_int next + then fail region Odd_lengthed_bytes + else () + else () + | _::_ -> () (* END TRAILER *) } diff --git a/src/passes/1-parser/pascaligo/.links b/src/passes/1-parser/pascaligo/.links index fed453f99..45c9a4602 100644 --- a/src/passes/1-parser/pascaligo/.links +++ b/src/passes/1-parser/pascaligo/.links @@ -2,6 +2,7 @@ $HOME/git/OCaml-build/Makefile ../shared/Lexer.mli ../shared/Lexer.mll +../shared/LexerLib.mli ../shared/LexerLib.ml ../shared/EvalOpt.ml ../shared/EvalOpt.mli diff --git a/src/passes/1-parser/pascaligo/LexToken.mli b/src/passes/1-parser/pascaligo/LexToken.mli index 88adbff37..be263d9a5 100644 --- a/src/passes/1-parser/pascaligo/LexToken.mli +++ b/src/passes/1-parser/pascaligo/LexToken.mli @@ -1,4 +1,4 @@ -(** This signature defines the lexical tokens for LIGO +(* This signature defines the lexical tokens for LIGO _Tokens_ are the abstract units which are used by the parser to build the abstract syntax tree (AST), in other words, the stream of diff --git a/src/passes/1-parser/pascaligo/LexToken.mll b/src/passes/1-parser/pascaligo/LexToken.mll index f300187fe..933db3cdb 100644 --- a/src/passes/1-parser/pascaligo/LexToken.mll +++ b/src/passes/1-parser/pascaligo/LexToken.mll @@ -5,24 +5,14 @@ (* Shorthands *) -type lexeme = string - -let sprintf = Printf.sprintf - module Region = Simple_utils.Region module Pos = Simple_utils.Pos module SMap = Map.Make (String) module SSet = Set.Make (String) -(* Hack to roll back one lexeme in the current semantic action *) -(* -let 
rollback buffer = - let open Lexing in - let len = String.length (lexeme buffer) in - let pos_cnum = buffer.lex_curr_p.pos_cnum - len in - buffer.lex_curr_pos <- buffer.lex_curr_pos - len; - buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum} -*) +type lexeme = string + +let sprintf = Printf.sprintf (* TOKENS *) @@ -123,6 +113,8 @@ type t = | EOF of Region.t +(* Projections *) + type token = t let proj_token = function @@ -130,32 +122,20 @@ let proj_token = function String Region.{region; value} -> region, sprintf "String %s" value - | Bytes Region.{region; value = s,b} -> region, - sprintf "Bytes (\"%s\", \"0x%s\")" - s (Hex.show b) - + sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b) | Int Region.{region; value = s,n} -> region, sprintf "Int (\"%s\", %s)" s (Z.to_string n) - | Nat Region.{region; value = s,n} -> region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n) - | Mutez Region.{region; value = s,n} -> region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n) - | Ident Region.{region; value} -> region, sprintf "Ident \"%s\"" value - | Constr Region.{region; value} -> region, sprintf "Constr \"%s\"" value -(* -| Attr {header; string={region; value}} -> - region, sprintf "Attr (\"%s\",\"%s\")" header value - *) - (* Symbols *) | SEMI region -> region, "SEMI" @@ -244,7 +224,7 @@ let to_lexeme = function | Bytes b -> fst b.Region.value | Int i | Nat i -| Mutez i -> fst i.Region.value +| Mutez i -> fst i.Region.value | Ident id | Constr id -> id.Region.value @@ -382,9 +362,7 @@ let keywords = [ (fun reg -> With reg) ] -let reserved = - let open SSet in - empty |> add "arguments" +let reserved = SSet.empty let constructors = [ (fun reg -> False reg); @@ -484,22 +462,20 @@ type nat_err = let mk_nat lexeme region = match String.index_opt lexeme 'n' with - None -> Error Invalid_natural - | Some _ -> - let z = - Str.(global_replace (regexp "_") "" lexeme) |> - Str.(global_replace (regexp "n") "") |> - Z.of_string in - if Z.equal z Z.zero && lexeme <> "0n" - then Error 
Non_canonical_zero_nat - else Ok (Nat Region.{region; value = lexeme,z}) + None -> Error Invalid_natural + | Some _ -> let z = + Str.(global_replace (regexp "_") "" lexeme) |> + Str.(global_replace (regexp "n") "") |> + Z.of_string in + if Z.equal z Z.zero && lexeme <> "0n" + then Error Non_canonical_zero_nat + else Ok (Nat Region.{region; value = lexeme,z}) let mk_mutez lexeme region = - let z = - Str.(global_replace (regexp "_") "" lexeme) |> - Str.(global_replace (regexp "mutez") "") |> - Z.of_string in - if Z.equal z Z.zero && lexeme <> "0mutez" + let z = Str.(global_replace (regexp "_") "" lexeme) |> + Str.(global_replace (regexp "mutez") "") |> + Z.of_string in + if Z.equal z Z.zero && lexeme <> "0mutez" then Error Non_canonical_zero else Ok (Mutez Region.{region; value = lexeme, z}) @@ -557,22 +533,23 @@ let mk_constr lexeme region = type attr_err = Invalid_attribute -let mk_attr _header _string _region = - Error Invalid_attribute +let mk_attr _ _ _ = Error Invalid_attribute (* Predicates *) let is_string = function String _ -> true | _ -> false -let is_bytes = function Bytes _ -> true | _ -> false -let is_int = function Int _ -> true | _ -> false -let is_ident = function Ident _ -> true | _ -> false -let is_eof = function EOF _ -> true | _ -> false +let is_bytes = function Bytes _ -> true | _ -> false +let is_int = function Int _ -> true | _ -> false +let is_ident = function Ident _ -> true | _ -> false +let is_eof = function EOF _ -> true | _ -> false +let is_minus = function MINUS _ -> true | _ -> false (* Errors *) type error = Odd_lengthed_bytes | Missing_break +| Negative_byte_sequence let error_to_string = function Odd_lengthed_bytes -> @@ -581,6 +558,9 @@ let error_to_string = function | Missing_break -> "Missing break.\n\ Hint: Insert some space." +| Negative_byte_sequence -> + "Negative byte sequence.\n\ + Hint: Remove the leading minus sign." 
exception Error of error Region.reg @@ -593,29 +573,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file = let fail region value = raise (Error Region.{region; value}) let check_right_context token next_token buffer : unit = - if not (is_eof token) then - if is_int token || is_bytes token then - match next_token buffer with - Some ([], next) -> - let pos = (to_region token)#stop in - let region = Region.make ~start:pos ~stop:pos in - if is_int next then - fail region Odd_lengthed_bytes - else - if is_ident next || is_string next || is_bytes next then - fail region Missing_break - | Some (_::_, _) | None -> () - else - if is_ident token || is_string token then - match next_token buffer with - Some ([], next) -> - if is_ident next || is_string next - || is_bytes next || is_int next - then - let pos = (to_region token)#stop in - let region = Region.make ~start:pos ~stop:pos - in fail region Missing_break - | Some (_::_, _) | None -> () + let pos = (to_region token)#stop in + let region = Region.make ~start:pos ~stop:pos in + match next_token buffer with + None -> () + | Some (markup, next) -> + if is_minus token && is_bytes next + then let region = + Region.cover (to_region token) (to_region next) + in fail region Negative_byte_sequence + else + match markup with + [] -> + if is_int token + then if is_string next || is_ident next + then fail region Missing_break + else () + else + if is_string token + then if is_int next || is_bytes next || is_ident next + then fail region Missing_break + else () + else + if is_bytes token + then if is_string next || is_ident next + then fail region Missing_break + else if is_int next + then fail region Odd_lengthed_bytes + else () + else () + | _::_ -> () (* END TRAILER *) } diff --git a/src/passes/1-parser/reasonligo/.links b/src/passes/1-parser/reasonligo/.links index 2be7fda97..214b46e6c 100644 --- a/src/passes/1-parser/reasonligo/.links +++ b/src/passes/1-parser/reasonligo/.links @@ -2,6 +2,7 @@ 
$HOME/git/OCaml-build/Makefile ../shared/Lexer.mli ../shared/Lexer.mll +../shared/LexerLib.ml ../shared/EvalOpt.ml ../shared/EvalOpt.mli ../shared/FQueue.ml diff --git a/src/passes/1-parser/reasonligo/LexToken.mli b/src/passes/1-parser/reasonligo/LexToken.mli index 240e824d2..f568b5894 100644 --- a/src/passes/1-parser/reasonligo/LexToken.mli +++ b/src/passes/1-parser/reasonligo/LexToken.mli @@ -31,48 +31,49 @@ type lexeme = string type t = (* Symbols *) - CAT of Region.t (* "++" *) + CAT of Region.t (* "++" *) (* Arithmetics *) -| MINUS of Region.t (* "-" *) -| PLUS of Region.t (* "+" *) -| SLASH of Region.t (* "/" *) -| TIMES of Region.t (* "*" *) +| MINUS of Region.t (* "-" *) +| PLUS of Region.t (* "+" *) +| SLASH of Region.t (* "/" *) +| TIMES of Region.t (* "*" *) (* Compounds *) -| LPAR of Region.t (* "(" *) -| RPAR of Region.t (* ")" *) +| LPAR of Region.t (* "(" *) +| RPAR of Region.t (* ")" *) | LBRACKET of Region.t (* "[" *) | RBRACKET of Region.t (* "]" *) -| LBRACE of Region.t (* "{" *) -| RBRACE of Region.t (* "}" *) +| LBRACE of Region.t (* "{" *) +| RBRACE of Region.t (* "}" *) (* Separators *) -| COMMA of Region.t (* "," *) -| SEMI of Region.t (* ";" *) -| VBAR of Region.t (* "|" *) -| COLON of Region.t (* ":" *) -| DOT of Region.t (* "." *) +| COMMA of Region.t (* "," *) +| SEMI of Region.t (* ";" *) +| VBAR of Region.t (* "|" *) +| COLON of Region.t (* ":" *) +| DOT of Region.t (* "." *) | ELLIPSIS of Region.t (* "..." 
*) +| ARROW of Region.t (* "=>" *) (* Wildcard *) -| WILD of Region.t (* "_" *) +| WILD of Region.t (* "_" *) (* Comparisons *) -| EQ of Region.t (* "=" *) -| EQEQ of Region.t (* "==" *) -| NE of Region.t (* "!=" *) -| LT of Region.t (* "<" *) -| GT of Region.t (* ">" *) -| LE of Region.t (* "=<" *) -| GE of Region.t (* ">=" *) +| EQ of Region.t (* "=" *) +| EQEQ of Region.t (* "==" *) +| NE of Region.t (* "!=" *) +| LT of Region.t (* "<" *) +| GT of Region.t (* ">" *) +| LE of Region.t (* "<=" *) +| GE of Region.t (* ">=" *) -| ARROW of Region.t (* "=>" *) + (* Logic *) | BOOL_OR of Region.t (* "||" *) | BOOL_AND of Region.t (* "&&" *) @@ -91,18 +92,18 @@ type t = (* Keywords *) -| Else of Region.t -| False of Region.t -| If of Region.t -| Let of Region.t -| Rec of Region.t +| Else of Region.t +| False of Region.t +| If of Region.t +| Let of Region.t +| Mod of Region.t +| Or of Region.t +| Rec of Region.t | Switch of Region.t -| Mod of Region.t -| Or of Region.t -| True of Region.t -| Type of Region.t +| True of Region.t +| Type of Region.t -(* Data constructors *) + (* Data constructors *) | C_None of Region.t (* "None" *) | C_Some of Region.t (* "Some" *) @@ -111,8 +112,6 @@ type t = | EOF of Region.t (* End of file *) -type token = t - (* Projections The difference between extracting the lexeme and a string from a @@ -121,6 +120,8 @@ type token = t lexeme (concrete syntax). 
*) +type token = t + val to_lexeme : token -> lexeme val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string val to_region : token -> Region.t diff --git a/src/passes/1-parser/reasonligo/LexToken.mll b/src/passes/1-parser/reasonligo/LexToken.mll index 92aec58e3..a4245fb6d 100644 --- a/src/passes/1-parser/reasonligo/LexToken.mll +++ b/src/passes/1-parser/reasonligo/LexToken.mll @@ -1,66 +1,69 @@ { (* START OF HEADER *) -type lexeme = string - -let sprintf = Printf.sprintf +(* Shorthands *) module Region = Simple_utils.Region module Pos = Simple_utils.Pos module SMap = Utils.String.Map module SSet = Utils.String.Set +type lexeme = string + +let sprintf = Printf.sprintf + (* TOKENS *) type t = (* Symbols *) - CAT of Region.t (* "++" *) + CAT of Region.t (* "++" *) (* Arithmetics *) -| MINUS of Region.t (* "-" *) -| PLUS of Region.t (* "+" *) -| SLASH of Region.t (* "/" *) -| TIMES of Region.t (* "*" *) +| MINUS of Region.t (* "-" *) +| PLUS of Region.t (* "+" *) +| SLASH of Region.t (* "/" *) +| TIMES of Region.t (* "*" *) (* Compounds *) -| LPAR of Region.t (* "(" *) -| RPAR of Region.t (* ")" *) -| LBRACKET of Region.t (* "[" *) -| RBRACKET of Region.t (* "]" *) -| LBRACE of Region.t (* "{" *) -| RBRACE of Region.t (* "}" *) +| LPAR of Region.t (* "(" *) +| RPAR of Region.t (* ")" *) +| LBRACKET of Region.t (* "[" *) +| RBRACKET of Region.t (* "]" *) +| LBRACE of Region.t (* "{" *) +| RBRACE of Region.t (* "}" *) (* Separators *) -| COMMA of Region.t (* "," *) -| SEMI of Region.t (* ";" *) -| VBAR of Region.t (* "|" *) -| COLON of Region.t (* ":" *) -| DOT of Region.t (* "." *) +| COMMA of Region.t (* "," *) +| SEMI of Region.t (* ";" *) +| VBAR of Region.t (* "|" *) +| COLON of Region.t (* ":" *) +| DOT of Region.t (* "." *) | ELLIPSIS of Region.t (* "..." 
*) +| ARROW of Region.t (* "=>" *) (* Wildcard *) -| WILD of Region.t (* "_" *) +| WILD of Region.t (* "_" *) (* Comparisons *) -| EQ of Region.t (* "=" *) -| EQEQ of Region.t (* "==" *) -| NE of Region.t (* "!=" *) -| LT of Region.t (* "<" *) -| GT of Region.t (* ">" *) -| LE of Region.t (* "<=" *) -| GE of Region.t (* ">=" *) -| ARROW of Region.t (* "=>" *) +| EQ of Region.t (* "=" *) +| EQEQ of Region.t (* "==" *) +| NE of Region.t (* "!=" *) +| LT of Region.t (* "<" *) +| GT of Region.t (* ">" *) +| LE of Region.t (* "<=" *) +| GE of Region.t (* ">=" *) -| BOOL_OR of Region.t (* "||" *) -| BOOL_AND of Region.t (* "&&" *) + (* Logic *) -| NOT of Region.t (* ! *) +| BOOL_OR of Region.t (* "||" *) +| BOOL_AND of Region.t (* "&&" *) +| NOT of Region.t (* ! *) (* Identifiers, labels, numbers and strings *) @@ -75,17 +78,17 @@ type t = (* Keywords *) -(*| And*) -| Else of Region.t -| False of Region.t -| If of Region.t -| Let of Region.t -| Rec of Region.t +| Else of Region.t +| False of Region.t +| If of Region.t +| Let of Region.t +| Mod of Region.t +| Or of Region.t +| Rec of Region.t | Switch of Region.t -| Mod of Region.t -| Or of Region.t -| True of Region.t -| Type of Region.t +| True of Region.t +| Type of Region.t + (* Data constructors *) | C_None of Region.t (* "None" *) @@ -96,121 +99,143 @@ type t = | EOF of Region.t (* End of file *) +(* Projections *) + type token = t let proj_token = function - CAT region -> region, "CAT" -| MINUS region -> region, "MINUS" -| PLUS region -> region, "PLUS" -| SLASH region -> region, "SLASH" -| TIMES region -> region, "TIMES" -| LPAR region -> region, "LPAR" -| RPAR region -> region, "RPAR" -| LBRACKET region -> region, "LBRACKET" -| RBRACKET region -> region, "RBRACKET" -| LBRACE region -> region, "LBRACE" -| RBRACE region -> region, "RBRACE" -| COMMA region -> region, "COMMA" -| SEMI region -> region, "SEMI" -| VBAR region -> region, "VBAR" -| COLON region -> region, "COLON" -| DOT region -> region, "DOT" -| ELLIPSIS 
region -> region, "ELLIPSIS" -| WILD region -> region, "WILD" -| EQ region -> region, "EQ" -| EQEQ region -> region, "EQEQ" -| NE region -> region, "NE" -| LT region -> region, "LT" -| GT region -> region, "GT" -| LE region -> region, "LE" -| GE region -> region, "GE" -| ARROW region -> region, "ARROW" -| BOOL_OR region -> region, "BOOL_OR" -| BOOL_AND region -> region, "BOOL_AND" -| Ident Region.{region; value} -> - region, sprintf "Ident %s" value -| Constr Region.{region; value} -> - region, sprintf "Constr %s" value + (* Literals *) + + String Region.{region; value} -> + region, sprintf "String %s" value +| Bytes Region.{region; value = s,b} -> + region, + sprintf "Bytes (\"%s\", \"0x%s\")" s (Hex.show b) | Int Region.{region; value = s,n} -> region, sprintf "Int (\"%s\", %s)" s (Z.to_string n) | Nat Region.{region; value = s,n} -> region, sprintf "Nat (\"%s\", %s)" s (Z.to_string n) | Mutez Region.{region; value = s,n} -> region, sprintf "Mutez (\"%s\", %s)" s (Z.to_string n) -| String Region.{region; value} -> - region, sprintf "String %s" value -| Bytes Region.{region; value = s,b} -> - region, - sprintf "Bytes (\"%s\", \"0x%s\")" - s (Hex.show b) -| Else region -> region, "Else" -| False region -> region, "False" -| If region -> region, "If" -| Let region -> region, "Let" -| Rec region -> region, "Rec" -| Switch region -> region, "Switch" -| Mod region -> region, "Mod" -| NOT region -> region, "!" 
-| Or region -> region, "Or" -| True region -> region, "True" -| Type region -> region, "Type" -| C_None region -> region, "C_None" -| C_Some region -> region, "C_Some" -| Attr Region.{region; value} -> region, sprintf "Attr %s" value -| EOF region -> region, "EOF" +| Ident Region.{region; value} -> + region, sprintf "Ident %s" value +| Constr Region.{region; value} -> + region, sprintf "Constr %s" value + + (* Symbols *) + +| CAT region -> region, "CAT" +| MINUS region -> region, "MINUS" +| PLUS region -> region, "PLUS" +| SLASH region -> region, "SLASH" +| TIMES region -> region, "TIMES" +| LPAR region -> region, "LPAR" +| RPAR region -> region, "RPAR" +| LBRACKET region -> region, "LBRACKET" +| RBRACKET region -> region, "RBRACKET" +| LBRACE region -> region, "LBRACE" +| RBRACE region -> region, "RBRACE" +| COMMA region -> region, "COMMA" +| SEMI region -> region, "SEMI" +| VBAR region -> region, "VBAR" +| COLON region -> region, "COLON" +| DOT region -> region, "DOT" +| ELLIPSIS region -> region, "ELLIPSIS" +| WILD region -> region, "WILD" +| EQ region -> region, "EQ" +| EQEQ region -> region, "EQEQ" +| NE region -> region, "NE" +| LT region -> region, "LT" +| GT region -> region, "GT" +| LE region -> region, "LE" +| GE region -> region, "GE" +| ARROW region -> region, "ARROW" +| NOT region -> region, "NOT" +| BOOL_OR region -> region, "BOOL_OR" +| BOOL_AND region -> region, "BOOL_AND" +| Else region -> region, "Else" +| False region -> region, "False" +| If region -> region, "If" +| Let region -> region, "Let" +| Rec region -> region, "Rec" +| Switch region -> region, "Switch" +| Mod region -> region, "Mod" +| Or region -> region, "Or" +| True region -> region, "True" +| Type region -> region, "Type" +| C_None region -> region, "C_None" +| C_Some region -> region, "C_Some" +| Attr Region.{region; value} -> region, sprintf "Attr %s" value +| EOF region -> region, "EOF" let to_lexeme = function - CAT _ -> "++" -| MINUS _ -> "-" -| PLUS _ -> "+" -| SLASH _ -> "/" 
-| TIMES _ -> "*" -| LPAR _ -> "(" -| RPAR _ -> ")" -| LBRACKET _ -> "[" -| RBRACKET _ -> "]" -| LBRACE _ -> "{" -| RBRACE _ -> "}" -| COMMA _ -> "," -| SEMI _ -> ";" -| VBAR _ -> "|" -| COLON _ -> ":" -| DOT _ -> "." -| ELLIPSIS _ -> "..." -| WILD _ -> "_" -| EQ _ -> "=" -| EQEQ _ -> "==" -| NE _ -> "!=" -| LT _ -> "<" -| GT _ -> ">" -| LE _ -> "<=" -| GE _ -> ">=" -| ARROW _ -> "=>" -| BOOL_OR _ -> "||" -| BOOL_AND _ -> "&&" -| Ident id -> id.Region.value -| Constr id -> id.Region.value + (* Literals *) + + String s -> s.Region.value +| Bytes b -> fst b.Region.value | Int i | Nat i -| Mutez i -> fst i.Region.value -| String s -> s.Region.value -| Bytes b -> fst b.Region.value -| Else _ -> "else" -| False _ -> "false" -| If _ -> "if" -| Let _ -> "let" -| Rec _ -> "rec" -| Mod _ -> "mod" -| NOT _ -> "!" -| Or _ -> "or" -| Switch _ -> "switch" -| True _ -> "true" -| Type _ -> "type" +| Mutez i -> fst i.Region.value +| Ident id -> id.Region.value +| Constr id -> id.Region.value +| Attr a -> a.Region.value + + (* Symbols *) + +| CAT _ -> "++" +| MINUS _ -> "-" +| PLUS _ -> "+" +| SLASH _ -> "/" +| TIMES _ -> "*" +| LPAR _ -> "(" +| RPAR _ -> ")" +| LBRACKET _ -> "[" +| RBRACKET _ -> "]" +| LBRACE _ -> "{" +| RBRACE _ -> "}" +| COMMA _ -> "," +| SEMI _ -> ";" +| VBAR _ -> "|" +| COLON _ -> ":" +| DOT _ -> "." +| ELLIPSIS _ -> "..." +| WILD _ -> "_" +| EQ _ -> "=" +| EQEQ _ -> "==" +| NE _ -> "!=" +| LT _ -> "<" +| GT _ -> ">" +| LE _ -> "<=" +| GE _ -> ">=" +| ARROW _ -> "=>" +| BOOL_OR _ -> "||" +| BOOL_AND _ -> "&&" +| NOT _ -> "!" 
+ + (* Keywords *) + +| Else _ -> "else" +| False _ -> "false" +| If _ -> "if" +| Let _ -> "let" +| Mod _ -> "mod" +| Or _ -> "or" +| Rec _ -> "rec" +| Switch _ -> "switch" +| True _ -> "true" +| Type _ -> "type" + +(* Data constructors *) + | C_None _ -> "None" | C_Some _ -> "Some" -| Attr a -> a.Region.value + +(* Virtual tokens *) + | EOF _ -> "" +(* CONVERSIONS *) + let to_string token ?(offsets=true) mode = let region, val_str = proj_token token in let reg_str = region#compact ~offsets mode @@ -261,12 +286,9 @@ let reserved = |> add "functor" |> add "inherit" |> add "initializer" - (* |> add "land" - see https://ligo.atlassian.net/browse/LIGO-263 *) |> add "lazy" - (* |> add "lor" - see https://ligo.atlassian.net/browse/LIGO-263 *) |> add "lsl" |> add "lsr" - (* |> add "lxor" - see https://ligo.atlassian.net/browse/LIGO-263 *) |> add "match" |> add "method" |> add "module" @@ -291,7 +313,7 @@ let reserved = let constructors = [ (fun reg -> C_None reg); - (fun reg -> C_Some reg); + (fun reg -> C_Some reg) ] let add map (key, value) = SMap.add key value map @@ -376,20 +398,18 @@ let mk_int lexeme region = else Ok (Int Region.{region; value = lexeme, z}) let mk_nat lexeme region = - let z = - Str.(global_replace (regexp "_") "" lexeme) |> - Str.(global_replace (regexp "n") "") |> - Z.of_string in - if Z.equal z Z.zero && lexeme <> "0n" + let z = Str.(global_replace (regexp "_") "" lexeme) |> + Str.(global_replace (regexp "n") "") |> + Z.of_string in + if Z.equal z Z.zero && lexeme <> "0n" then Error Non_canonical_zero_nat else Ok (Nat Region.{region; value = lexeme, z}) let mk_mutez lexeme region = - let z = - Str.(global_replace (regexp "_") "" lexeme) |> - Str.(global_replace (regexp "mutez") "") |> - Z.of_string in - if Z.equal z Z.zero && lexeme <> "0mutez" + let z = Str.(global_replace (regexp "_") "" lexeme) |> + Str.(global_replace (regexp "mutez") "") |> + Z.of_string in + if Z.equal z Z.zero && lexeme <> "0mutez" then Error Non_canonical_zero else Ok 
(Mutez Region.{region; value = lexeme, z}) @@ -426,11 +446,11 @@ let mk_sym lexeme region = (* Symbols specific to ReasonLIGO *) - | "..." -> Ok (ELLIPSIS region) - | "=>" -> Ok (ARROW region) - | "==" -> Ok (EQEQ region) - | "!" -> Ok (NOT region) - | "++" -> Ok (CAT region) + | "..." -> Ok (ELLIPSIS region) + | "=>" -> Ok (ARROW region) + | "==" -> Ok (EQEQ region) + | "!" -> Ok (NOT region) + | "++" -> Ok (CAT region) (* Invalid symbols *) @@ -461,16 +481,18 @@ let mk_attr header lexeme region = (* Predicates *) let is_string = function String _ -> true | _ -> false -let is_bytes = function Bytes _ -> true | _ -> false -let is_int = function Int _ -> true | _ -> false -let is_ident = function Ident _ -> true | _ -> false -let is_eof = function EOF _ -> true | _ -> false +let is_bytes = function Bytes _ -> true | _ -> false +let is_int = function Int _ -> true | _ -> false +let is_ident = function Ident _ -> true | _ -> false +let is_eof = function EOF _ -> true | _ -> false +let is_minus = function MINUS _ -> true | _ -> false (* Errors *) type error = Odd_lengthed_bytes | Missing_break +| Negative_byte_sequence let error_to_string = function Odd_lengthed_bytes -> @@ -479,6 +501,9 @@ let error_to_string = function | Missing_break -> "Missing break.\n\ Hint: Insert some space." +| Negative_byte_sequence -> + "Negative byte sequence.\n\ + Hint: Remove the leading minus sign." 
exception Error of error Region.reg @@ -491,29 +516,36 @@ let format_error ?(offsets=true) mode Region.{region; value} ~file = let fail region value = raise (Error Region.{region; value}) let check_right_context token next_token buffer : unit = - if not (is_eof token) then - if is_int token || is_bytes token then - match next_token buffer with - Some ([], next) -> - let pos = (to_region token)#stop in - let region = Region.make ~start:pos ~stop:pos in - if is_int next then - fail region Odd_lengthed_bytes - else - if is_ident next || is_string next || is_bytes next then - fail region Missing_break - | Some (_::_, _) | None -> () - else - if is_ident token || is_string token then - match next_token buffer with - Some ([], next) -> - if is_ident next || is_string next - || is_bytes next || is_int next - then - let pos = (to_region token)#stop in - let region = Region.make ~start:pos ~stop:pos - in fail region Missing_break - | Some (_::_, _) | None -> () + let pos = (to_region token)#stop in + let region = Region.make ~start:pos ~stop:pos in + match next_token buffer with + None -> () + | Some (markup, next) -> + if is_minus token && is_bytes next + then let region = + Region.cover (to_region token) (to_region next) + in fail region Negative_byte_sequence + else + match markup with + [] -> + if is_int token + then if is_string next || is_ident next + then fail region Missing_break + else () + else + if is_string token + then if is_int next || is_bytes next || is_ident next + then fail region Missing_break + else () + else + if is_bytes token + then if is_string next || is_ident next + then fail region Missing_break + else if is_int next + then fail region Odd_lengthed_bytes + else () + else () + | _::_ -> () (* END TRAILER *) } diff --git a/src/passes/1-parser/shared/.links b/src/passes/1-parser/shared/.links deleted file mode 100644 index b29b57639..000000000 --- a/src/passes/1-parser/shared/.links +++ /dev/null @@ -1,2 +0,0 @@ -$HOME/git/OCaml-build/Makefile 
-$HOME/git/OCaml-build/Makefile.cfg diff --git a/src/passes/1-parser/shared/Lexer.mli b/src/passes/1-parser/shared/Lexer.mli index f7af4b055..c923e1505 100644 --- a/src/passes/1-parser/shared/Lexer.mli +++ b/src/passes/1-parser/shared/Lexer.mli @@ -38,8 +38,6 @@ module Region = Simple_utils.Region module Pos = Simple_utils.Pos -type lexeme = string - (* TOKENS *) (* The signature [TOKEN] exports an abstract type [token], so a lexer @@ -54,6 +52,8 @@ type lexeme = string reading the ocamllex specification for the lexer ([Lexer.mll]). *) +type lexeme = string + module type TOKEN = sig type token @@ -112,10 +112,36 @@ module type TOKEN = unit end +(* The signature of the lexer *) + +module type S = + sig + module Token : TOKEN + type token = Token.token + + (* The scanner [init] is meant to be called first to read the + BOM. Then [scan] is called. *) + + val init : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state + val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state + + (* Errors (specific to the generic lexer, not to the tokens) *) + + type error + + val error_to_string : error -> string + + exception Error of error Region.reg + + val format_error : + ?offsets:bool -> [`Byte | `Point] -> + error Region.reg -> file:bool -> string Region.reg + end + (* The functorised interface Note that the module parameter [Token] is re-exported as a submodule in [S]. 
*) -module Make (Token: TOKEN) : LexerLib.S with module Token = Token +module Make (Token : TOKEN) : S with module Token = Token diff --git a/src/passes/1-parser/shared/Lexer.mll b/src/passes/1-parser/shared/Lexer.mll index 0f0b0e52e..feb179b8a 100644 --- a/src/passes/1-parser/shared/Lexer.mll +++ b/src/passes/1-parser/shared/Lexer.mll @@ -8,12 +8,6 @@ module Pos = Simple_utils.Pos (* START HEADER *) -type lexeme = string - -(* ALIASES *) - -let sprintf = Printf.sprintf - (* TOKENS *) (* The signature [TOKEN] exports an abstract type [token], so a lexer @@ -22,6 +16,8 @@ let sprintf = Printf.sprintf caracterises the virtual token for end-of-file, because it requires special handling. *) +type lexeme = string + module type TOKEN = sig type token @@ -84,31 +80,39 @@ module type TOKEN = submodule in [S]. *) -module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) = +module type S = + sig + module Token : TOKEN + type token = Token.token + + val init : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state + val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state + + type error + + val error_to_string : error -> string + + exception Error of error Region.reg + + val format_error : + ?offsets:bool -> [`Byte | `Point] -> + error Region.reg -> file:bool -> string Region.reg + end + +module Make (Token : TOKEN) : (S with module Token = Token) = struct module Token = Token type token = Token.token - type file_path = string - - type line_comment = LexerLib.line_comment - type block_comment = LexerLib.block_comment - let mk_block = LexerLib.mk_block - (* ERRORS *) type error = Invalid_utf8_sequence | Unexpected_character of char | Undefined_escape_sequence - (* | Missing_break*) | Unterminated_string - (* | Unterminated_integer*) - (* | Odd_lengthed_bytes*) | Unterminated_comment of string - (* | Orphan_minus*) | Non_canonical_zero - (* | Negative_byte_sequence *) | Broken_string | Invalid_character_in_string | Reserved_name of string @@ 
-116,6 +120,8 @@ module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) = | Invalid_natural | Invalid_attribute + let sprintf = Printf.sprintf + let error_to_string = function Invalid_utf8_sequence -> "Invalid UTF-8 sequence." @@ -124,30 +130,15 @@ module Make (Token: TOKEN) : (LexerLib.S with module Token = Token) = | Undefined_escape_sequence -> "Undefined escape sequence.\n\ Hint: Remove or replace the sequence." -(* | Missing_break -> - "Missing break.\n\ - Hint: Insert some space." - *) | Unterminated_string -> + | Unterminated_string -> "Unterminated string.\n\ Hint: Close with double quotes." -(* | Unterminated_integer -> - "Unterminated integer.\n\ - Hint: Remove the sign or proceed with a natural number." *) -(* | Odd_lengthed_bytes -> - "The length of the byte sequence is an odd number.\n\ - Hint: Add or remove a digit." - *) | Unterminated_comment ending -> + | Unterminated_comment ending -> sprintf "Unterminated comment.\n\ Hint: Close with \"%s\"." ending -(* | Orphan_minus -> - "Orphan minus sign.\n\ - Hint: Remove the trailing space." *) | Non_canonical_zero -> "Non-canonical zero.\n\ Hint: Use 0." -(* | Negative_byte_sequence -> - "Negative byte sequence.\n\ - Hint: Remove the leading minus sign." 
*) | Broken_string -> "The string starting here is interrupted by a line break.\n\ Hint: Remove the break, close the string before or insert a \ @@ -319,13 +310,13 @@ let symbol = common_sym | pascaligo_sym | cameligo_sym | reasonligo_sym (* Comments *) -let pascaligo_block_comment_opening = "(*" -let pascaligo_block_comment_closing = "*)" -let pascaligo_line_comment = "//" +let pascaligo_block_comment_opening = "(*" +let pascaligo_block_comment_closing = "*)" +let pascaligo_line_comment = "//" -let cameligo_block_comment_opening = "(*" -let cameligo_block_comment_closing = "*)" -let cameligo_line_comment = "//" +let cameligo_block_comment_opening = "(*" +let cameligo_block_comment_closing = "*)" +let cameligo_line_comment = "//" let reasonligo_block_comment_opening = "/*" let reasonligo_block_comment_closing = "*/" @@ -369,6 +360,7 @@ and scan state = parse nl { scan (state#push_newline lexbuf) lexbuf } | ' '+ { scan (state#push_space lexbuf) lexbuf } | '\t'+ { scan (state#push_tabs lexbuf) lexbuf } + | ident { mk_ident state lexbuf } | constr { mk_constr state lexbuf } | bytes { mk_bytes seq state lexbuf } @@ -420,27 +412,14 @@ and scan state = parse let state = state#set_pos pos in scan state lexbuf } -(* Some special errors *) - -(* -| '-' { let region, _, state = state#sync lexbuf in - let state = scan state lexbuf in - let open Markup in - match FQueue.peek state#units with - None -> assert false - | Some (_, ((Space _ | Tabs _)::_, token)) - when Token.is_int token -> fail region Orphan_minus - | _ -> fail region Unterminated_integer } - -| "-0x" byte_seq? 
- { let region, _, _ = state#sync lexbuf - in fail region Negative_byte_sequence } - *) +(* String *) | '"' { let opening, lexeme, state = state#sync lexbuf in let thread = LexerLib.mk_thread opening lexeme in scan_string thread state lexbuf |> mk_string } +(* Comments *) + | block_comment_openings { let lexeme = Lexing.lexeme lexbuf in match state#block with @@ -496,8 +475,6 @@ and scan_flags state acc = parse | eof { let _, _, state = state#sync lexbuf in List.rev acc, state } -(* TODO: Move below to [LexerCommon.mll] *) - (* Finishing a string *) and scan_string thread state = parse @@ -624,164 +601,6 @@ and scan_utf8_inline thread state = parse { (* START TRAILER *) - - -(* Scanning the lexing buffer for tokens (and markup, as a - side-effect). - - Because we want the lexer to have access to the right lexical - context of a recognised lexeme (to enforce stylistic constraints or - report special error patterns), we need to keep a hidden reference - to a queue of recognised lexical units (that is, tokens and markup) - that acts as a mutable state between the calls to [read]. When - [read] is called, that queue is examined first and, if it contains - at least one token, that token is returned; otherwise, the lexing - buffer is scanned for at least one more new token. That is the - general principle: we put a high-level buffer (our queue) on top of - the low-level lexing buffer. - - One tricky and important detail is that we must make any parser - generated by Menhir (and calling [read]) believe that the last - region of the input source that was matched indeed corresponds to - the returned token, despite that many tokens and markup may have - been matched since it was actually read from the input. In other - words, the parser requests a token that is taken from the - high-level buffer, but the parser requests the source regions from - the _low-level_ lexing buffer, and they may disagree if more than - one token has actually been recognised. 
- - Consequently, in order to maintain a consistent view for the - parser, we have to patch some fields of the lexing buffer, namely - [lex_start_p] and [lex_curr_p], as these fields are read by parsers - generated by Menhir when querying source positions (regions). This - is the purpose of the function [patch_buffer]. After reading one - or more tokens and markup by the scanning rule [scan], we have to - save in the hidden reference [buf_reg] the region of the source - that was matched by [scan]. This atomic sequence of patching, - scanning and saving is implemented by the _function_ [scan] - (beware: it shadows the scanning rule [scan]). The function - [patch_buffer] is, of course, also called just before returning the - token, so the parser has a view of the lexing buffer consistent - with the token. - - Note that an additional reference [first_call] is needed to - distinguish the first call to the function [scan], as the first - scanning rule is actually [init] (which can handle the BOM), not - [scan]. 
- *) - -type logger = Markup.t list -> token -> unit - -type input = - File of file_path -| String of string -| Channel of in_channel -| Buffer of Lexing.lexbuf - -type instance = { - input : input; - read : log:logger -> Lexing.lexbuf -> token; - buffer : Lexing.lexbuf; - get_win : unit -> token LexerLib.window; - get_pos : unit -> Pos.t; - get_last : unit -> Region.t; - get_file : unit -> file_path; - close : unit -> unit -} - -type open_err = File_opening of string - -let lexbuf_from_input = function - File path -> - (try - let chan = open_in path in - let close () = close_in chan in - let lexbuf = Lexing.from_channel chan in - let () = - let open Lexing in - lexbuf.lex_curr_p <- {lexbuf.lex_curr_p with pos_fname=path} - in Ok (lexbuf, close) - with Sys_error msg -> Stdlib.Error (File_opening msg)) -| String s -> - Ok (Lexing.from_string s, fun () -> ()) -| Channel chan -> - let close () = close_in chan in - Ok (Lexing.from_channel chan, close) -| Buffer b -> Ok (b, fun () -> ()) - -let open_token_stream ?line ?block input = - let file_path = match input with - File path -> path - | _ -> "" in - let pos = Pos.min ~file:file_path in - let buf_reg = ref (pos#byte, pos#byte) - and first_call = ref true - and decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in - let supply = Uutf.Manual.src decoder in - let state = ref (LexerLib.mk_state - ~units:FQueue.empty - ~last:Region.ghost - ~window:LexerLib.Nil - ~pos - ~markup:[] - ~decoder - ~supply - ?block - ?line - ()) in - - let get_pos () = !state#pos - and get_last () = !state#last - and get_win () = !state#window - and get_file () = file_path in - - let patch_buffer (start, stop) buffer = - let open Lexing in - let file_path = buffer.lex_curr_p.pos_fname in - buffer.lex_start_p <- {start with pos_fname = file_path}; - buffer.lex_curr_p <- {stop with pos_fname = file_path} - - and save_region buffer = - buf_reg := Lexing.(buffer.lex_start_p, buffer.lex_curr_p) in - - let scan buffer = - patch_buffer !buf_reg buffer; - 
(if !first_call - then (state := init !state buffer; first_call := false) - else state := scan !state buffer); - save_region buffer in - - let next_token buffer = - scan buffer; - match FQueue.peek !state#units with - None -> None - | Some (units, ext_token) -> - state := !state#set_units units; Some ext_token in - - let rec read ~log buffer = - match FQueue.deq !state#units with - None -> - scan buffer; - read ~log buffer - | Some (units, (left_mark, token)) -> - log left_mark token; - state := ((!state#set_units units) - #set_last (Token.to_region token)) - #slide_token token; - Token.check_right_context token next_token buffer; - patch_buffer (Token.to_region token)#byte_pos buffer; - token in - match lexbuf_from_input input with - Ok (buffer, close) -> - let () = - match input with - File path when path <> "" -> LexerLib.reset ~file:path buffer - | _ -> () in - let instance = { - input; read; buffer; get_win; get_pos; get_last; get_file; close} - in Ok instance - | Error _ as e -> e - end (* of functor [Make] in HEADER *) (* END TRAILER *) } diff --git a/src/passes/1-parser/shared/LexerLib.ml b/src/passes/1-parser/shared/LexerLib.ml index 48ff79dfe..308e228df 100644 --- a/src/passes/1-parser/shared/LexerLib.ml +++ b/src/passes/1-parser/shared/LexerLib.ml @@ -1,159 +1,6 @@ -(* Sliding window *) - -(* The type [window] models a two-slot buffer of tokens for reporting - after a parse error. Technically, it is a parametric type, but its - use is meant for tokens, wherever they are defined. - - In [Two(t1,t2)], the token [t2] is the next to be sent to the - parser. - - The call [slide token buffer] pushes the token [token] in the - buffer [buffer]. If the buffer is full, that is, it is - [Two(t1,t2)], then the token [t2] is discarded to make room for - [token]. 
- *) - -type 'a window = - Nil -| One of 'a -| Two of 'a * 'a - -let slide token = function - Nil -> One token -| One t | Two (t,_) -> Two (token,t) - module Region = Simple_utils.Region module Pos = Simple_utils.Pos -type lexeme = string - -(* The signature [TOKEN] exports an abstract type [token], so a lexer - can be a functor over tokens. This enables to externalise - version-dependent constraints in any module whose signature matches - [TOKEN]. Generic functions to construct tokens are required. - - Note the predicate [is_eof], which caracterises the virtual token - for end-of-file, because it requires special handling. -*) - -module type TOKEN = - sig - type token - - (* Predicates *) - - val is_eof : token -> bool - - (* Projections *) - - val to_lexeme : token -> lexeme - val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string - val to_region : token -> Region.t - - (* Style *) - - type error - - val error_to_string : error -> string - - exception Error of error Region.reg - - val format_error : - ?offsets:bool -> - [`Byte | `Point] -> - error Region.reg -> - file:bool -> - string Region.reg - - val check_right_context : - token -> - (Lexing.lexbuf -> (Markup.t list * token) option) -> - Lexing.lexbuf -> - unit - end - -(* The module type for lexers is [S]. It mainly exports the function - [open_token_stream], which returns - - * a function [read] that extracts tokens from a lexing buffer, - together with a lexing buffer [buffer] to read from, - * a function [close] that closes that buffer, - * a function [get_pos] that returns the current position, and - * a function [get_last] that returns the region of the last - recognised token. - * a function [get_file] that returns the name of the file being - scanned (empty string if [stdin]). - - Note that a module [Token] is exported too, because the signature - of the exported functions depend on it. 
- - The type [window] is a two-token window, that is, a buffer that - contains the last recognised token, and the penultimate (if any). - - The call [read ~log] evaluates in a lexer (also known as a - tokeniser or scanner) whose type is [Lexing.lexbuf -> token], and - suitable for a parser generated by Menhir. The argument labelled - [log] is a logger, that is, it may print a token and its left - markup to a given channel, at the caller's discretion. -*) - -module type S = - sig - module Token : TOKEN - type token = Token.token - - type file_path = string - type logger = Markup.t list -> token -> unit - - type input = - File of file_path - | String of string - | Channel of in_channel - | Buffer of Lexing.lexbuf - - type instance = { - input : input; - read : log:logger -> Lexing.lexbuf -> token; - buffer : Lexing.lexbuf; - get_win : unit -> token window; - get_pos : unit -> Pos.t; - get_last : unit -> Region.t; - get_file : unit -> file_path; - close : unit -> unit - } - - type open_err = File_opening of string - - val lexbuf_from_input : - input -> (Lexing.lexbuf * (unit -> unit), open_err) Stdlib.result - - type line_comment = string (* Opening of a line comment *) - type block_comment = - - val mk_block : opening:string -> closing:string -> block_comment - - val open_token_stream : - ?line:line_comment -> - ?block:block_comment -> - input -> - (instance, open_err) Stdlib.result - - (* Error reporting *) - - type error - - val error_to_string : error -> string - - exception Error of error Region.reg - - val format_error : - ?offsets:bool -> - [`Byte | `Point] -> - error Region.reg -> - file:bool -> - string Region.reg - end - (* LEXER ENGINE *) (* Resetting file name and line number in the lexing buffer @@ -205,27 +52,20 @@ let rollback buffer = buffer.lex_curr_pos <- buffer.lex_curr_pos - len; buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum} +(* Utility types *) + +type file_path = string +type lexeme = string + (* THREAD FOR STRUCTURED CONSTRUCTS 
(STRINGS, COMMENTS) *) -(* When scanning structured constructs, like strings and comments, we - need to keep the region of the opening symbol (like double quote, - "//" or "(*") in order to report any error more precisely. Since - ocamllex is byte-oriented, we need to store the parsed bytes as - characters in an accumulator [acc] and also its length [len], so, - we are done, it is easy to build the string making up the - structured construct with [mk_str] (see above). - - The resulting data structure is called a _thread_. (Note for - Emacs: "*)".) -*) - type thread = < - opening : Region.t; - length : int; - acc : char list; - push_char : char -> thread; + opening : Region.t; + length : int; + acc : char list; + to_string : string; + push_char : char -> thread; push_string : string -> thread; - to_string : string; set_opening : Region.t -> thread > @@ -233,12 +73,12 @@ let mk_thread region lexeme : thread = (* The call [explode s a] is the list made by pushing the characters in the string [s] on top of [a], in reverse order. For example, [explode "ba" ['c';'d'] = ['a'; 'b'; 'c'; 'd']]. *) + let explode s acc = let rec push = function 0 -> acc | i -> s.[i-1] :: push (i-1) - in push (String.length s) - in + in push (String.length s) in object val opening = region method opening = opening @@ -268,184 +108,288 @@ let mk_thread region lexeme : thread = method to_string = let bytes = Bytes.make length ' ' in let rec fill i = function - [] -> bytes - | char::l -> Bytes.set bytes i char; fill (i-1) l + [] -> bytes + | char::l -> Bytes.set bytes i char; fill (i-1) l in fill (length-1) acc |> Bytes.to_string end -type file_path = string +(* STATE *) - (* STATE *) +(* Scanning the lexing buffer for tokens (and markup, as a + side-effect). - (* Beyond tokens, the result of lexing is a state. 
The type - [state] represents the logical state of the lexing engine, that - is, a value which is threaded during scanning and which denotes - useful, high-level information beyond what the type - [Lexing.lexbuf] in the standard library already provides for - all generic lexers. + Because we want the lexer to have access to the right lexical + context of a recognised lexeme (to enforce stylistic constraints or + report special error patterns), we need to keep a hidden reference + to a queue of recognised lexical units (that is, tokens and markup) + that acts as a mutable state between the calls to [read]. When + [read] is called, that queue is examined first and, if it contains + at least one token, that token is returned; otherwise, the lexing + buffer is scanned for at least one more new token. That is the + general principle: we put a high-level buffer (our queue) on top of + the low-level lexing buffer. - Tokens are the smallest units used by the parser to build the - abstract syntax tree. The state includes a queue of recognised - tokens, with the markup at the left of its lexeme until either - the start of the file or the end of the previously recognised - token. + One tricky and important detail is that we must make any parser + generated by Menhir (and calling [read]) believe that the last + region of the input source that was matched indeed corresponds to + the returned token, despite that many tokens and markup may have + been matched since it was actually read from the input. In other + words, the parser requests a token that is taken from the + high-level buffer, but the parser requests the source regions from + the _low-level_ lexing buffer, and they may disagree if more than + one token has actually been recognised. 
- The markup from the last recognised token or, if the first - token has not been recognised yet, from the beginning of the - file is stored in the field [markup] of the state because it is - a side-effect, with respect to the output token list, and we - use a record with a single field [units] because that record - may be easily extended during the future maintenance of this - lexer. + Consequently, in order to maintain a consistent view for the + parser, we have to patch some fields of the lexing buffer, namely + [lex_start_p] and [lex_curr_p], as these fields are read by parsers + generated by Menhir when querying source positions (regions). This + is the purpose of the function [patch_buffer]. After reading one or + more tokens and markup by the scanning rule [scan], we have to save + in the hidden reference [buf_reg] the region of the source that was + matched by [scan]. This atomic sequence of patching, scanning and + saving is implemented by the _function_ [scan] (beware: it shadows + the scanning rule [scan]). The function [patch_buffer] is, of + course, also called just before returning the token, so the parser + has a view of the lexing buffer consistent with the token. - The state also includes a field [pos] which holds the current - position in the LIGO source file. The position is not always - updated after a single character has been matched: that depends - on the regular expression that matched the lexing buffer. + Note that an additional reference [first_call] is needed to + distinguish the first call to the function [scan], as the first + scanning rule is actually [init] (which can handle the BOM), not + [scan]. +*) - The field [window] is a two-token window, that is, a buffer - that contains the last recognised token, and the penultimate - (if any). 
+type 'token window = + Nil +| One of 'token +| Two of 'token * 'token - The fields [decoder] and [supply] offer the support needed - for the lexing of UTF-8 encoded characters in comments (the - only place where they are allowed in LIGO). The former is the - decoder proper and the latter is the effectful function - [supply] that takes a byte, a start index and a length and feed - it to [decoder]. See the documentation of the third-party - library Uutf. - *) +type 'token state = < + units : (Markup.t list * 'token) FQueue.t; + markup : Markup.t list; + window : 'token window; + last : Region.t; + pos : Pos.t; + decoder : Uutf.decoder; + supply : Bytes.t -> int -> int -> unit; + block : EvalOpt.block_comment option; + line : EvalOpt.line_comment option; - type line_comment = string (* Opening of a line comment *) - type block_comment = + enqueue : 'token -> 'token state; + set_units : (Markup.t list * 'token) FQueue.t -> 'token state; + set_last : Region.t -> 'token state; + set_pos : Pos.t -> 'token state; + slide_token : 'token -> 'token state; - let mk_block ~opening ~closing : block_comment = - object - method opening = opening - method closing = closing - end + sync : Lexing.lexbuf -> Region.t * lexeme * 'token state; - type 'a state = < - units : (Markup.t list * 'a) FQueue.t; - markup : Markup.t list; - window : 'a window; - last : Region.t; - pos : Pos.t; - decoder : Uutf.decoder; - supply : Bytes.t -> int -> int -> unit; - block : block_comment option; - line : line_comment option; + push_newline : Lexing.lexbuf -> 'token state; + push_line : thread -> 'token state; + push_block : thread -> 'token state; + push_space : Lexing.lexbuf -> 'token state; + push_tabs : Lexing.lexbuf -> 'token state; + push_bom : Lexing.lexbuf -> 'token state; + push_markup : Markup.t -> 'token state; +> - enqueue : 'a -> 'a state; - set_units : (Markup.t list * 'a) FQueue.t -> 'a state; - set_last : Region.t -> 'a state; - set_pos : Pos.t -> 'a state; - slide_token : 'a -> 'a 
state; +let mk_state ~units ~markup ~window ~last ~pos ~decoder ~supply + ?block ?line () : _ state = + object (self) + val units = units + method units = units + val markup = markup + method markup = markup + val window = window + method window = window + val last = last + method last = last + val pos = pos + method pos = pos + method decoder = decoder + method supply = supply + method block = block + method line = line - sync : Lexing.lexbuf -> Region.t * lexeme * 'a state; + method enqueue token = + {< units = FQueue.enq (markup, token) units; + markup = [] >} - push_newline : Lexing.lexbuf -> 'a state; - push_line : thread -> 'a state; - push_block : thread -> 'a state; - push_space : Lexing.lexbuf -> 'a state; - push_tabs : Lexing.lexbuf -> 'a state; - push_bom : Lexing.lexbuf -> 'a state; - push_markup : Markup.t -> 'a state; - > + method set_units units = {< units = units >} + method set_last region = {< last = region >} + method set_pos pos = {< pos = pos >} - let mk_state ~units ~markup ~window ~last ~pos ~decoder ~supply - ?block ?line () : _ state = - object (self) - val units = units - method units = units - val markup = markup - method markup = markup - val window = window - method window = window - val last = last - method last = last - val pos = pos - method pos = pos - method decoder = decoder - method supply = supply - method block = block - method line = line + method slide_token token = + match self#window with + Nil -> {< window = One token >} + | One t | Two (t,_) -> {< window = Two (token,t) >} - (* The call [enqueue (token, state)] updates functionally the - state [state] by associating the token [token] with the - stored markup and enqueuing the pair into the units - queue. The field [markup] is then reset to the empty - list. 
*) + method sync buffer = + let lex = Lexing.lexeme buffer in + let len = String.length lex in + let start = pos in + let stop = start#shift_bytes len in + let state = {< pos = stop >} + in Region.make ~start ~stop, lex, state - method enqueue token = - {< units = FQueue.enq (markup, token) units; - markup = [] >} + (* MARKUP *) - method set_units units = {< units = units >} - method set_last region = {< last = region >} - method set_pos pos = {< pos = pos >} + (* Committing markup to the current logical state *) - method slide_token token = - {< window = slide token window >} + method push_markup unit = {< markup = unit :: markup >} - (* The call [sync state buffer] updates the current position - in accordance with the contents of the lexing buffer, more - precisely, depending on the length of the string which has - just been recognised by the scanner: that length is used as - a positive offset to the current column. *) + method push_newline buffer = + let () = Lexing.new_line buffer in + let value = Lexing.lexeme buffer in + let start = self#pos in + let stop = start#new_line value in + let region = Region.make ~start ~stop in + let unit = Markup.Newline Region.{region; value} + in {< pos = stop; markup = unit::markup >} - method sync buffer = - let lex = Lexing.lexeme buffer in - let len = String.length lex in - let start = pos in - let stop = start#shift_bytes len in - let state = {< pos = stop >} - in Region.make ~start ~stop, lex, state + method push_line thread = + let start = thread#opening#start in + let region = Region.make ~start ~stop:self#pos + and value = thread#to_string in + let unit = Markup.LineCom Region.{region; value} + in {< markup = unit::markup >} - (* MARKUP *) + method push_block thread = + let start = thread#opening#start in + let region = Region.make ~start ~stop:self#pos + and value = thread#to_string in + let unit = Markup.BlockCom Region.{region; value} + in {< markup = unit::markup >} - (* Committing markup to the current logical 
state *) + method push_space buffer = + let region, lex, state = self#sync buffer in + let value = String.length lex in + let unit = Markup.Space Region.{region; value} + in state#push_markup unit - method push_markup unit = {< markup = unit :: markup >} + method push_tabs buffer = + let region, lex, state = self#sync buffer in + let value = String.length lex in + let unit = Markup.Tabs Region.{region; value} + in state#push_markup unit - method push_newline buffer = - let () = Lexing.new_line buffer in - let value = Lexing.lexeme buffer in - let start = self#pos in - let stop = start#new_line value in - let region = Region.make ~start ~stop in - let unit = Markup.Newline Region.{region; value} - in {< pos = stop; markup = unit::markup >} + method push_bom buffer = + let region, value, state = self#sync buffer in + let unit = Markup.BOM Region.{region; value} + in state#push_markup unit + end - method push_line thread = - let start = thread#opening#start in - let region = Region.make ~start ~stop:self#pos - and value = thread#to_string in - let unit = Markup.LineCom Region.{region; value} - in {< markup = unit::markup >} +(* LEXER INSTANCE *) - method push_block thread = - let start = thread#opening#start in - let region = Region.make ~start ~stop:self#pos - and value = thread#to_string in - let unit = Markup.BlockCom Region.{region; value} - in {< markup = unit::markup >} +type input = + File of file_path +| String of string +| Channel of in_channel +| Buffer of Lexing.lexbuf - method push_space buffer = - let region, lex, state = self#sync buffer in - let value = String.length lex in - let unit = Markup.Space Region.{region; value} - in state#push_markup unit +type 'token logger = Markup.t list -> 'token -> unit - method push_tabs buffer = - let region, lex, state = self#sync buffer in - let value = String.length lex in - let unit = Markup.Tabs Region.{region; value} - in state#push_markup unit +type 'token instance = { + input : input; + read : log:('token 
logger) -> Lexing.lexbuf -> 'token; + buffer : Lexing.lexbuf; + get_win : unit -> 'token window; + get_pos : unit -> Pos.t; + get_last : unit -> Region.t; + get_file : unit -> file_path; + close : unit -> unit +} - method push_bom buffer = - let region, value, state = self#sync buffer in - let unit = Markup.BOM Region.{region; value} - in state#push_markup unit +type open_err = File_opening of string - end +let lexbuf_from_input = function + String s -> + Ok (Lexing.from_string s, fun () -> ()) +| Channel chan -> + let close () = close_in chan in + Ok (Lexing.from_channel chan, close) +| Buffer b -> + Ok (b, fun () -> ()) +| File path -> + try + let chan = open_in path in + let close () = close_in chan in + let lexbuf = Lexing.from_channel chan in + let () = + let open Lexing in + lexbuf.lex_curr_p <- {lexbuf.lex_curr_p with pos_fname=path} + in Ok (lexbuf, close) + with Sys_error msg -> Stdlib.Error (File_opening msg) + +let open_token_stream ?line ?block ~init ~scan + ~token_to_region ~style input = + let file_path = match input with + File path -> path + | _ -> "" in + let pos = Pos.min ~file:file_path in + let buf_reg = ref (pos#byte, pos#byte) + and first_call = ref true + and decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in + let supply = Uutf.Manual.src decoder in + let state = ref (mk_state + ~units:FQueue.empty + ~last:Region.ghost + ~window:Nil + ~pos + ~markup:[] + ~decoder + ~supply + ?block + ?line + ()) in + let get_pos () = !state#pos + and get_last () = !state#last + and get_win () = !state#window + and get_file () = file_path in + + let patch_buffer (start, stop) buffer = + let open Lexing in + let file_path = buffer.lex_curr_p.pos_fname in + buffer.lex_start_p <- {start with pos_fname = file_path}; + buffer.lex_curr_p <- {stop with pos_fname = file_path} + + and save_region buffer = + buf_reg := Lexing.(buffer.lex_start_p, buffer.lex_curr_p) in + + let scan' init scan buffer = + patch_buffer !buf_reg buffer; + (if !first_call + then (state := 
init !state buffer; first_call := false) + else state := scan !state buffer); + save_region buffer in + + let next_token init scan buffer = + scan' init scan buffer; + match FQueue.peek !state#units with + None -> None + | Some (units, ext_token) -> + state := !state#set_units units; Some ext_token in + + let rec read init scan ~token_to_region ~style ~log buffer = + match FQueue.deq !state#units with + None -> + scan' init scan buffer; + read init scan ~token_to_region ~style ~log buffer + | Some (units, (left_mark, token)) -> + log left_mark token; + state := ((!state#set_units units) + #set_last (token_to_region token)) + #slide_token token; + style token (next_token init scan) buffer; + patch_buffer (token_to_region token)#byte_pos buffer; + token in + + match lexbuf_from_input input with + Ok (buffer, close) -> + let () = + match input with + File path when path <> "" -> reset ~file:path buffer + | _ -> () in + let instance = { + read = read init scan ~token_to_region ~style; + input; buffer; get_win; get_pos; get_last; get_file; close} + in Ok instance + | Error _ as e -> e diff --git a/src/passes/1-parser/shared/LexerLib.mli b/src/passes/1-parser/shared/LexerLib.mli new file mode 100644 index 000000000..c5749b717 --- /dev/null +++ b/src/passes/1-parser/shared/LexerLib.mli @@ -0,0 +1,205 @@ +(* A library for writing UTF8-aware lexers *) + +module Region = Simple_utils.Region +module Pos = Simple_utils.Pos + +(* The function [rollback] resets the lexing buffer to the state it + was when it matched the last regular expression. This function is + safe to use only in the semantic action of the rule which last + matched. 
*)
+
+val rollback : Lexing.lexbuf -> unit
+
+(* Utility types *)
+
+type file_path = string
+type lexeme = string
+
+(* THREAD FOR STRUCTURED CONSTRUCTS (STRINGS, COMMENTS) *)
+
+(* When scanning structured constructs, like strings and comments, we
+   need to keep the region of the opening symbol (like double quote,
+   "//" or "(*") in order to report any error more precisely. Since
+   ocamllex is byte-oriented, we need to store the parsed bytes as
+   characters in an accumulator [acc] and also its length [len], so,
+   when we are done, it is easy to build the string making up the
+   structured construct with [mk_str] (see above).
+
+   The resulting data structure is called a _thread_. (Note for
+   Emacs: "*)".)
+ *)
+
+type thread = <
+  opening     : Region.t;
+  length      : int;
+  acc         : char list;
+  to_string   : string;
+  push_char   : char -> thread;
+  push_string : string -> thread;
+  set_opening : Region.t -> thread
+>
+
+val mk_thread : Region.t -> lexeme -> thread
+
+(* STATE *)
+
+(* Beyond producing tokens, the result of lexing is a _state_. The
+   type [state] represents the abstract logical state of the lexing
+   engine, that is, a value which is threaded during scanning and
+   which denotes useful, high-level information beyond what the type
+   [Lexing.lexbuf] in the standard library already provides for all
+   generic lexers. We qualify it as "logical state" because the lexing
+   buffer itself has a "physical state" defined by the type
+   [Lexing.lexbuf].
+
+   Tokens are the smallest units used by the parser to build the
+   abstract syntax tree. The state includes a queue of recognised
+   tokens, with the markup at the left of its lexeme until either the
+   start of the file or the end of the previously recognised token. 
+ + The markup from the last recognised token or, if the first token + has not been recognised yet, from the beginning of the file is + stored in the field [markup] of the state because it is a + side-effect, with respect to the output token list, and we use a + record with a single field [units] because that record may be + easily extended during the future maintenance of this lexer. + + The state also includes a field [pos] which holds the current + position in the LIGO source file. The position is not always + updated after a single character has been matched: that depends on + the regular expression that matched the lexing buffer. + + The field [window] is a two-token window, that is, a buffer that + contains the last recognised token, and the penultimate (if any). + Technically, it is a parametric type, but its use is meant for + tokens, wherever they are defined. In [Two (t1,t2)], and in case + of a syntax error, [t1] is the first invalid token and [t2] is the + last valid one. + + The fields [decoder] and [supply] offer the support needed for + the lexing of UTF-8 encoded characters in comments (the only place + where they are allowed in LIGO). The former is the decoder proper + and the latter is the effectful function [supply] that takes a + byte, a start index and a length and feed it to [decoder]. See the + documentation of the third-party library Uutf. + + Some methods are now documented. + + The call [state#enqueue token] updates functionally the state + [state] by associating the token [token] with the stored markup and + enqueuing the pair into the units queue. The field [markup] is then + reset to the empty list. + + The call [state#slide_token token] pushes the token [token] in + the buffer [buffer]. If the buffer is full, that is, it is [Two + (t1,t2)], then the token [t2] is discarded to make room for + [token]. 
+ + The call [state#sync buffer] updates the current position in + accordance with the contents of the lexing buffer, more precisely, + depending on the length of the string which has just been + recognised by the scanner: that length is used as a positive offset + to the current column. + *) + +type 'token window = + Nil +| One of 'token +| Two of 'token * 'token + +type 'token state = < + units : (Markup.t list * 'token) FQueue.t; + markup : Markup.t list; + window : 'token window; + last : Region.t; + pos : Pos.t; + decoder : Uutf.decoder; + supply : Bytes.t -> int -> int -> unit; + block : EvalOpt.block_comment option; + line : EvalOpt.line_comment option; + + enqueue : 'token -> 'token state; + set_units : (Markup.t list * 'token) FQueue.t -> 'token state; + set_last : Region.t -> 'token state; + set_pos : Pos.t -> 'token state; + slide_token : 'token -> 'token state; + + sync : Lexing.lexbuf -> Region.t * lexeme * 'token state; + + push_newline : Lexing.lexbuf -> 'token state; + push_line : thread -> 'token state; + push_block : thread -> 'token state; + push_space : Lexing.lexbuf -> 'token state; + push_tabs : Lexing.lexbuf -> 'token state; + push_bom : Lexing.lexbuf -> 'token state; + push_markup : Markup.t -> 'token state; +> + +(* LEXER INSTANCE *) + +(* The function [open_token_stream] returns a lexer instance made of + + * the input [input] of type [input]; + * a function [read] that extracts tokens from a lexing buffer, + together with a lexing buffer [buffer] to read from, + * a function [close] that closes that buffer, + * a function [get_pos] that returns the current position, and + * a function [get_last] that returns the region of the last + recognised token. + * a function [get_file] that returns the name of the file being + scanned (empty string if [stdin]). + + Note that a module [Token] is exported too, because the signature + of the exported functions depend on it. 
+ + The type [window] is a two-token window, that is, a buffer that + contains the last recognised token, and the penultimate (if any). + + The call [read ?line ?block ~init ~scan ~token_to_region ~style + input] evaluates in a lexer (also known as a tokeniser or scanner) + whose type is [log:('token logger) -> Lexing.lexbuf -> 'token], and + suitable for a parser generated by Menhir. The argument labelled + [log] is a logger, that is, it may print a token and its left + markup to a given channel, at the caller's discretion. The argument + labelled [~init] is the scanner to be called first, usually for + reading the BOM, then [scan] is used for the following calls. The + function labelled [~style] is used to check stylistic constraints + on the tokens and the markup between them. +*) + +type input = + File of file_path +| String of string +| Channel of in_channel +| Buffer of Lexing.lexbuf + +type 'token logger = Markup.t list -> 'token -> unit + +type 'token instance = { + input : input; + read : log:('token logger) -> Lexing.lexbuf -> 'token; + buffer : Lexing.lexbuf; + get_win : unit -> 'token window; + get_pos : unit -> Pos.t; + get_last : unit -> Region.t; + get_file : unit -> file_path; + close : unit -> unit +} + +type open_err = File_opening of string + +val lexbuf_from_input : + input -> (Lexing.lexbuf * (unit -> unit), open_err) Stdlib.result + +val open_token_stream : + ?line:EvalOpt.line_comment -> + ?block:EvalOpt.block_comment -> + init:('token state -> Lexing.lexbuf -> 'token state) -> + scan:('token state -> Lexing.lexbuf -> 'token state) -> + token_to_region:('token -> Region.t) -> + style:('token -> + (Lexing.lexbuf -> (Markup.t list * 'token) option) -> + Lexing.lexbuf -> + unit) -> + input -> + ('token instance, open_err) Stdlib.result diff --git a/src/passes/1-parser/shared/LexerLog.ml b/src/passes/1-parser/shared/LexerLog.ml index 7f23ef0c6..6fe02527c 100644 --- a/src/passes/1-parser/shared/LexerLog.ml +++ 
b/src/passes/1-parser/shared/LexerLog.ml @@ -4,7 +4,8 @@ module Region = Simple_utils.Region module type S = sig - module Lexer : LexerLib.S + module Lexer : Lexer.S + type token = Lexer.token val output_token : ?offsets:bool -> @@ -12,7 +13,7 @@ module type S = EvalOpt.command -> out_channel -> Markup.t list -> - Lexer.token -> + token -> unit type file_path = string @@ -22,19 +23,26 @@ module type S = [`Byte | `Point] -> ?block:EvalOpt.block_comment -> ?line:EvalOpt.line_comment -> - Lexer.input -> + token_to_region:(token -> Region.t) -> + style:(token -> + (Lexing.lexbuf -> (Markup.t list * token) option) -> + Lexing.lexbuf -> + unit) -> + LexerLib.input -> EvalOpt.command -> (unit, string Region.reg) Stdlib.result end -module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) = +module Make (Lexer: Lexer.S) : (S with module Lexer = Lexer) = struct module Lexer = Lexer module Token = Lexer.Token + type token = Lexer.token - (** Pretty-printing in a string the lexemes making up the markup + (* Pretty-printing in a string the lexemes making up the markup between two tokens, concatenated with the last lexeme itself. *) + let output_token ?(offsets=true) mode command channel left_mark token : unit = let output str = Printf.fprintf channel "%s%!" 
str in @@ -57,10 +65,17 @@ module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) = type file_path = string - let trace ?(offsets=true) mode ?block ?line input command : + let trace ?(offsets=true) mode ?block ?line + ~token_to_region ~style input command : (unit, string Region.reg) Stdlib.result = - match Lexer.open_token_stream ?line ?block input with - Ok Lexer.{read; buffer; close; _} -> + match LexerLib.open_token_stream + ~init:Lexer.init + ~scan:Lexer.scan + ~token_to_region + ~style + ?line ?block input + with + Ok LexerLib.{read; buffer; close; _} -> let log = output_token ~offsets mode command stdout and close_all () = flush_all (); close () in let rec iter () = @@ -69,12 +84,17 @@ module Make (Lexer: LexerLib.S) : (S with module Lexer = Lexer) = if Token.is_eof token then Stdlib.Ok () else iter () + | exception Lexer.Token.Error error -> + let msg = + Lexer.Token.format_error + ~offsets mode ~file:true error + in Stdlib.Error msg | exception Lexer.Error error -> let msg = Lexer.format_error ~offsets mode ~file:true error in Stdlib.Error msg in let result = iter () in close_all (); result - | Stdlib.Error (Lexer.File_opening msg) -> + | Stdlib.Error (LexerLib.File_opening msg) -> flush_all (); Stdlib.Error (Region.wrap_ghost msg) end diff --git a/src/passes/1-parser/shared/LexerLog.mli b/src/passes/1-parser/shared/LexerLog.mli index f433df44d..3f2be268f 100644 --- a/src/passes/1-parser/shared/LexerLog.mli +++ b/src/passes/1-parser/shared/LexerLog.mli @@ -2,7 +2,8 @@ module Region = Simple_utils.Region module type S = sig - module Lexer : LexerLib.S + module Lexer : Lexer.S + type token = Lexer.token val output_token : ?offsets:bool -> @@ -10,7 +11,7 @@ module type S = EvalOpt.command -> out_channel -> Markup.t list -> - Lexer.token -> + token -> unit type file_path = string @@ -20,9 +21,14 @@ module type S = [`Byte | `Point] -> ?block:EvalOpt.block_comment -> ?line:EvalOpt.line_comment -> - Lexer.input -> + token_to_region:(token -> Region.t) -> 
+ style:(token -> + (Lexing.lexbuf -> (Markup.t list * token) option) -> + Lexing.lexbuf -> + unit) -> + LexerLib.input -> EvalOpt.command -> (unit, string Region.reg) Stdlib.result end -module Make (Lexer: LexerLib.S) : S with module Lexer = Lexer +module Make (Lexer: Lexer.S) : S with module Lexer = Lexer diff --git a/src/passes/1-parser/shared/LexerUnit.ml b/src/passes/1-parser/shared/LexerUnit.ml index 017c18286..836c8db98 100644 --- a/src/passes/1-parser/shared/LexerUnit.ml +++ b/src/passes/1-parser/shared/LexerUnit.ml @@ -9,7 +9,7 @@ module type IO = val options : EvalOpt.options (* CLI options *) end -module Make (IO: IO) (Lexer: LexerLib.S) = +module Make (IO: IO) (Lexer: Lexer.S) = struct (* Error printing and exception tracing *) @@ -39,11 +39,16 @@ module Make (IO: IO) (Lexer: LexerLib.S) = | Stdlib.Ok pp_buffer -> (* Running the lexer on the preprocessed input *) - let source = Lexer.String (Buffer.contents pp_buffer) in - match Lexer.open_token_stream ?line:IO.options#line - ?block:IO.options#block - source with - Ok Lexer.{read; buffer; close; _} -> + let source = LexerLib.String (Buffer.contents pp_buffer) in + match LexerLib.open_token_stream + ?line:IO.options#line + ?block:IO.options#block + ~init:Lexer.init + ~scan:Lexer.scan + ~token_to_region:Lexer.Token.to_region + ~style:Lexer.Token.check_right_context + source with + Ok LexerLib.{read; buffer; close; _} -> let close_all () = flush_all (); close () in let rec read_tokens tokens = match read ~log:(fun _ _ -> ()) buffer with @@ -51,9 +56,7 @@ module Make (IO: IO) (Lexer: LexerLib.S) = if Lexer.Token.is_eof token then Stdlib.Ok (List.rev tokens) else read_tokens (token::tokens) -(* | exception _ -> - Printf.eprintf "Here\n%!"; exit 1 - *) | exception Lexer.Token.Error error -> + | exception Lexer.Token.Error error -> let file = match IO.options#input with None | Some "-" -> false @@ -74,7 +77,7 @@ module Make (IO: IO) (Lexer: LexerLib.S) = in Stdlib.Error msg in let result = read_tokens [] in 
close_all (); result - | Stdlib.Error (Lexer.File_opening msg) -> + | Stdlib.Error (LexerLib.File_opening msg) -> flush_all (); Stdlib.Error (Region.wrap_ghost msg) in match IO.options#input with None -> preproc stdin @@ -115,7 +118,9 @@ module Make (IO: IO) (Lexer: LexerLib.S) = IO.options#mode ?block:IO.options#block ?line:IO.options#line - (Lexer.String preproc_str) + ~token_to_region:Lexer.Token.to_region + ~style:Lexer.Token.check_right_context + (LexerLib.String preproc_str) IO.options#cmd in match IO.options#input with None -> preproc stdin diff --git a/src/passes/1-parser/shared/LexerUnit.mli b/src/passes/1-parser/shared/LexerUnit.mli index 950234e41..804182515 100644 --- a/src/passes/1-parser/shared/LexerUnit.mli +++ b/src/passes/1-parser/shared/LexerUnit.mli @@ -7,7 +7,7 @@ module type IO = val options : EvalOpt.options (* CLI options *) end -module Make (IO: IO) (Lexer: LexerLib.S) : +module Make (IO: IO) (Lexer: Lexer.S) : sig val scan : unit -> (Lexer.token list, string Region.reg) Stdlib.result val trace : unit -> (unit, string Region.reg) Stdlib.result diff --git a/src/passes/1-parser/shared/ParserAPI.ml b/src/passes/1-parser/shared/ParserAPI.ml index 99b9ef064..73b384302 100644 --- a/src/passes/1-parser/shared/ParserAPI.ml +++ b/src/passes/1-parser/shared/ParserAPI.ml @@ -56,8 +56,8 @@ module type PARSER = (* Main functor *) module Make (IO: IO) - (Lexer: LexerLib.S) - (Parser: PARSER with type token = Lexer.Token.token) + (Lexer: Lexer.S) + (Parser: PARSER with type token = Lexer.token) (ParErr: sig val message : int -> string end) = struct module I = Parser.MenhirInterpreter @@ -143,14 +143,14 @@ module Make (IO: IO) ~offsets:IO.options#offsets IO.options#mode IO.options#cmd stdout - let incr_contract Lexer.{read; buffer; get_win; close; _} = + let incr_contract LexerLib.{read; buffer; get_win; close; _} = let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer and failure = failure get_win in let parser = Incr.contract 
buffer.Lexing.lex_curr_p in let ast = I.loop_handle success failure supplier parser in flush_all (); close (); ast - let incr_expr Lexer.{read; buffer; get_win; close; _} = + let incr_expr LexerLib.{read; buffer; get_win; close; _} = let supplier = I.lexer_lexbuf_to_supplier (read ~log) buffer and failure = failure get_win in let parser = Incr.interactive_expr buffer.Lexing.lex_curr_p in diff --git a/src/passes/1-parser/shared/ParserAPI.mli b/src/passes/1-parser/shared/ParserAPI.mli index 74390f5f0..90d51c392 100644 --- a/src/passes/1-parser/shared/ParserAPI.mli +++ b/src/passes/1-parser/shared/ParserAPI.mli @@ -55,8 +55,8 @@ module type PARSER = end module Make (IO: IO) - (Lexer: LexerLib.S) - (Parser: PARSER with type token = Lexer.Token.token) + (Lexer: Lexer.S) + (Parser: PARSER with type token = Lexer.token) (ParErr: sig val message : int -> string end) : sig (* WARNING: The following parsers may all raise [Lexer.Error] *) @@ -78,8 +78,8 @@ module Make (IO: IO) exception Point of error - val incr_contract : Lexer.instance -> Parser.ast - val incr_expr : Lexer.instance -> Parser.expr + val incr_contract : Lexer.token LexerLib.instance -> Parser.ast + val incr_expr : Lexer.token LexerLib.instance -> Parser.expr val format_error : ?offsets:bool -> [`Point | `Byte] -> error -> string Region.reg diff --git a/src/passes/1-parser/shared/ParserUnit.ml b/src/passes/1-parser/shared/ParserUnit.ml index fd860cd56..d34c183d4 100644 --- a/src/passes/1-parser/shared/ParserUnit.ml +++ b/src/passes/1-parser/shared/ParserUnit.ml @@ -37,7 +37,7 @@ module type Printer = val print_expr : state -> expr -> unit end -module Make (Lexer: LexerLib.S) +module Make (Lexer: Lexer.S) (AST: sig type t type expr end) (Parser: ParserAPI.PARSER with type ast = AST.t @@ -89,12 +89,12 @@ module Make (Lexer: LexerLib.S) ParserLog.mk_state ~offsets:SubIO.options#offsets ~mode:SubIO.options#mode ~buffer:output in - let close () = lexer_inst.Lexer.close () in + let close () = 
lexer_inst.LexerLib.close () in let expr = try if SubIO.options#mono then - let tokeniser = lexer_inst.Lexer.read ~log - and lexbuf = lexer_inst.Lexer.buffer + let tokeniser = lexer_inst.LexerLib.read ~log + and lexbuf = lexer_inst.LexerLib.buffer in Front.mono_expr tokeniser lexbuf else Front.incr_expr lexer_inst @@ -124,12 +124,12 @@ module Make (Lexer: LexerLib.S) ParserLog.mk_state ~offsets:SubIO.options#offsets ~mode:SubIO.options#mode ~buffer:output in - let close () = lexer_inst.Lexer.close () in + let close () = lexer_inst.LexerLib.close () in let ast = try if SubIO.options#mono then - let tokeniser = lexer_inst.Lexer.read ~log - and lexbuf = lexer_inst.Lexer.buffer + let tokeniser = lexer_inst.LexerLib.read ~log + and lexbuf = lexer_inst.LexerLib.buffer in Front.mono_contract tokeniser lexbuf else Front.incr_contract lexer_inst @@ -163,10 +163,18 @@ module Make (Lexer: LexerLib.S) | exception Lexer.Error err -> let file = - lexer_inst.Lexer.buffer.Lexing.lex_curr_p.Lexing.pos_fname in - let error = - Lexer.format_error ~offsets:SubIO.options#offsets - SubIO.options#mode err ~file:(file <> "") + lexer_inst.LexerLib.buffer.Lexing.lex_curr_p.Lexing.pos_fname in + let error = Lexer.format_error + ~offsets:SubIO.options#offsets + SubIO.options#mode err ~file:(file <> "") + in Stdlib.Error error + + | exception Lexer.Token.Error err -> + let file = + lexer_inst.LexerLib.buffer.Lexing.lex_curr_p.Lexing.pos_fname in + let error = Lexer.Token.format_error + ~offsets:SubIO.options#offsets + SubIO.options#mode err ~file:(file <> "") in Stdlib.Error error (* Incremental API of Menhir *) @@ -181,7 +189,7 @@ module Make (Lexer: LexerLib.S) | exception Parser.Error -> let invalid, valid_opt = - match lexer_inst.Lexer.get_win () with + match lexer_inst.LexerLib.get_win () with LexerLib.Nil -> assert false (* Safe: There is always at least EOF. 
*) | LexerLib.One invalid -> invalid, None @@ -205,8 +213,8 @@ module Make (Lexer: LexerLib.S) (* Parsing a contract *) let gen_parser options input parser = - match Lexer.lexbuf_from_input input with - Stdlib.Error (Lexer.File_opening msg) -> + match LexerLib.lexbuf_from_input input with + Stdlib.Error (LexerLib.File_opening msg) -> Stdlib.Error (Region.wrap_ghost msg) | Ok (lexbuf, close) -> (* Preprocessing the input source *) @@ -224,48 +232,53 @@ module Make (Lexer: LexerLib.S) (* Lexing and parsing the preprocessed input source *) let () = close () in - let input' = Lexer.String (Buffer.contents buffer) in - match Lexer.open_token_stream ?line:options#line - ?block:options#block - input' + let input' = LexerLib.String (Buffer.contents buffer) in + match LexerLib.open_token_stream + ~init:Lexer.init + ~scan:Lexer.scan + ~token_to_region:Lexer.Token.to_region + ~style:Lexer.Token.check_right_context + ?line:options#line + ?block:options#block + input' with Ok instance -> let open Lexing in - instance.Lexer.buffer.lex_curr_p <- - {instance.Lexer.buffer.lex_curr_p with pos_fname=file}; + instance.LexerLib.buffer.lex_curr_p <- + {instance.LexerLib.buffer.lex_curr_p with pos_fname=file}; apply instance parser - | Stdlib.Error (Lexer.File_opening msg) -> + | Stdlib.Error (LexerLib.File_opening msg) -> Stdlib.Error (Region.wrap_ghost msg) (* Parsing a contract in a file *) let contract_in_file (source : string) = let options = SubIO.make ~input:(Some source) ~expr:false - in gen_parser options (Lexer.File source) parse_contract + in gen_parser options (LexerLib.File source) parse_contract (* Parsing a contract in a string *) let contract_in_string (source : string) = let options = SubIO.make ~input:None ~expr:false in - gen_parser options (Lexer.String source) parse_contract + gen_parser options (LexerLib.String source) parse_contract (* Parsing a contract in stdin *) let contract_in_stdin () = let options = SubIO.make ~input:None ~expr:false in - gen_parser options 
(Lexer.Channel stdin) parse_contract + gen_parser options (LexerLib.Channel stdin) parse_contract (* Parsing an expression in a string *) let expr_in_string (source : string) = let options = SubIO.make ~input:None ~expr:true in - gen_parser options (Lexer.String source) parse_expr + gen_parser options (LexerLib.String source) parse_expr (* Parsing an expression in stdin *) let expr_in_stdin () = let options = SubIO.make ~input:None ~expr:true in - gen_parser options (Lexer.Channel stdin) parse_expr + gen_parser options (LexerLib.Channel stdin) parse_expr (* Preprocess only *) diff --git a/src/passes/1-parser/shared/ParserUnit.mli b/src/passes/1-parser/shared/ParserUnit.mli index 90e43813a..a9456ab8c 100644 --- a/src/passes/1-parser/shared/ParserUnit.mli +++ b/src/passes/1-parser/shared/ParserUnit.mli @@ -43,7 +43,7 @@ module type Printer = (* Main functor to make the parser *) -module Make (Lexer : LexerLib.S) +module Make (Lexer : Lexer.S) (AST : sig type t type expr end) (Parser : ParserAPI.PARSER with type ast = AST.t diff --git a/src/test/lexer/negative_byte_sequence.ligo b/src/test/lexer/negative_byte_sequence.ligo index b70da4d24..32546a9a7 100644 --- a/src/test/lexer/negative_byte_sequence.ligo +++ b/src/test/lexer/negative_byte_sequence.ligo @@ -1 +1 @@ -const a: string = -0x222; \ No newline at end of file +const a: string = - (**) 0x2222 diff --git a/src/test/lexer/negative_byte_sequence.mligo b/src/test/lexer/negative_byte_sequence.mligo index 2494567f4..f285e02b5 100644 --- a/src/test/lexer/negative_byte_sequence.mligo +++ b/src/test/lexer/negative_byte_sequence.mligo @@ -1 +1 @@ -let a = -0x222; \ No newline at end of file +let a = - (**) 0x2222 diff --git a/src/test/lexer/negative_byte_sequence.religo b/src/test/lexer/negative_byte_sequence.religo index 2494567f4..e94f1f335 100644 --- a/src/test/lexer/negative_byte_sequence.religo +++ b/src/test/lexer/negative_byte_sequence.religo @@ -1 +1 @@ -let a = -0x222; \ No newline at end of file +let a = - 
/**/ 0x2222; diff --git a/src/test/lexer/reserved_name.ligo b/src/test/lexer/reserved_name.ligo deleted file mode 100644 index 96e261baf..000000000 --- a/src/test/lexer/reserved_name.ligo +++ /dev/null @@ -1 +0,0 @@ -let arguments = 1; \ No newline at end of file diff --git a/src/test/lexer/reserved_name.religo b/src/test/lexer/reserved_name.religo index 8803e9edf..b60e445bb 100644 --- a/src/test/lexer/reserved_name.religo +++ b/src/test/lexer/reserved_name.religo @@ -1 +1 @@ -let end = 1; \ No newline at end of file +let end = 1;