2020-06-23 11:52:39 +02:00

642 lines
22 KiB
OCaml

(* Lexer specification for LIGO, to be processed by [ocamllex]. *)
{
[@@@warning "-42"]
module Region = Simple_utils.Region
module Pos = Simple_utils.Pos
(* START HEADER *)
(* TOKENS *)
(* The signature [TOKEN] exports an abstract type [token], so a lexer
can be a functor over tokens. Consequently, generic functions to
construct tokens are provided. Note predicate [is_eof], which
caracterises the virtual token for end-of-file, because it requires
special handling. *)
type lexeme = string
module type TOKEN =
sig
type token
(* Errors *)
type int_err = Non_canonical_zero
type ident_err = Reserved_name
type nat_err = Invalid_natural
| Non_canonical_zero_nat
type sym_err = Invalid_symbol
type attr_err = Invalid_attribute
(* Injections *)
val mk_int : lexeme -> Region.t -> (token, int_err) result
val mk_nat : lexeme -> Region.t -> (token, nat_err) result
val mk_mutez : lexeme -> Region.t -> (token, int_err) result
val mk_ident : lexeme -> Region.t -> (token, ident_err) result
val mk_sym : lexeme -> Region.t -> (token, sym_err) result
val mk_string : lexeme -> Region.t -> token
val mk_verbatim : lexeme -> Region.t -> token
val mk_bytes : lexeme -> Region.t -> token
val mk_constr : lexeme -> Region.t -> token
val mk_attr : string -> lexeme -> Region.t -> (token, attr_err) result
val mk_lang : lexeme Region.reg -> Region.t -> token
val eof : Region.t -> token
(* Predicates *)
val is_eof : token -> bool
(* Projections *)
val to_lexeme : token -> lexeme
val to_string : token -> ?offsets:bool -> [`Byte | `Point] -> string
val to_region : token -> Region.t
(* Style *)
type error
val error_to_string : error -> string
exception Error of error Region.reg
val format_error :
?offsets:bool -> [`Byte | `Point] ->
error Region.reg -> file:bool -> string Region.reg
val check_right_context :
token ->
(Lexing.lexbuf -> (Markup.t list * token) option) ->
Lexing.lexbuf ->
unit
end
(* The functorised interface
Note that the module parameter [Token] is re-exported as a
submodule in [S].
*)
module type S =
sig
module Token : TOKEN
type token = Token.token
val scan :
token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state
type error
val error_to_string : error -> string
exception Error of error Region.reg
val format_error :
?offsets:bool -> [`Byte | `Point] ->
error Region.reg -> file:bool -> string Region.reg
end
module Make (Token : TOKEN) : (S with module Token = Token) =
struct
module Token = Token
type token = Token.token
(* ERRORS *)
type error =
Invalid_utf8_sequence
| Unexpected_character of char
| Undefined_escape_sequence
| Unterminated_string
| Unterminated_verbatim
| Unterminated_comment of string
| Non_canonical_zero
| Broken_string
| Invalid_character_in_string
| Reserved_name of string
| Invalid_symbol
| Invalid_natural
| Invalid_attribute
let sprintf = Printf.sprintf
let error_to_string = function
Invalid_utf8_sequence ->
"Invalid UTF-8 sequence."
| Unexpected_character c ->
sprintf "Unexpected character '%s'." (Char.escaped c)
| Undefined_escape_sequence ->
"Undefined escape sequence.\n\
Hint: Remove or replace the sequence."
| Unterminated_string ->
"Unterminated string.\n\
Hint: Close with double quotes."
| Unterminated_verbatim ->
"Unterminated verbatim.\n\
Hint: Close with \"|}\"."
| Unterminated_comment ending ->
sprintf "Unterminated comment.\n\
Hint: Close with \"%s\"." ending
| Non_canonical_zero ->
"Non-canonical zero.\n\
Hint: Use 0."
| Broken_string ->
"The string starting here is interrupted by a line break.\n\
Hint: Remove the break, close the string before or insert a \
backslash."
| Invalid_character_in_string ->
"Invalid character in string.\n\
Hint: Remove or replace the character."
| Reserved_name s ->
sprintf "Reserved name: \"%s\".\n\
Hint: Change the name." s
| Invalid_symbol ->
"Invalid symbol.\n\
Hint: Check the LIGO syntax you use."
| Invalid_natural ->
"Invalid natural number."
| Invalid_attribute ->
"Invalid attribute."
exception Error of error Region.reg
let format_error ?(offsets=true) mode Region.{region; value} ~file =
let msg = error_to_string value
and reg = region#to_string ~file ~offsets mode in
let value = sprintf "Lexical error %s:\n%s\n" reg msg
in Region.{value; region}
let fail region value = raise (Error Region.{region; value})
(* TOKENS *)
(* Making tokens *)
let mk_string (thread, state) =
let start = thread#opening#start in
let stop = state#pos in
let region = Region.make ~start ~stop in
let lexeme = thread#to_string in
let token = Token.mk_string lexeme region
in state#enqueue token
let mk_verbatim (thread, state) =
let start = thread#opening#start in
let stop = state#pos in
let region = Region.make ~start ~stop in
let lexeme = thread#to_string in
let token = Token.mk_verbatim lexeme region
in state#enqueue token
let mk_bytes bytes state buffer =
let region, _, state = state#sync buffer in
let token = Token.mk_bytes bytes region
in state#enqueue token
let mk_int state buffer =
let region, lexeme, state = state#sync buffer in
match Token.mk_int lexeme region with
Ok token -> state#enqueue token
| Error Token.Non_canonical_zero ->
fail region Non_canonical_zero
let mk_nat state buffer =
let region, lexeme, state = state#sync buffer in
match Token.mk_nat lexeme region with
Ok token -> state#enqueue token
| Error Token.Non_canonical_zero_nat ->
fail region Non_canonical_zero
| Error Token.Invalid_natural ->
fail region Invalid_natural
let mk_mutez state buffer =
let region, lexeme, state = state#sync buffer in
match Token.mk_mutez lexeme region with
Ok token -> state#enqueue token
| Error Token.Non_canonical_zero ->
fail region Non_canonical_zero
let mk_tez state buffer =
let region, lexeme, state = state#sync buffer in
let lexeme = Str.string_before lexeme (String.index lexeme 't') in
let lexeme = Z.mul (Z.of_int 1_000_000) (Z.of_string lexeme) in
match Token.mk_mutez (Z.to_string lexeme ^ "mutez") region with
Ok token -> state#enqueue token
| Error Token.Non_canonical_zero ->
fail region Non_canonical_zero
let format_tez s =
match String.index s '.' with
index ->
let len = String.length s in
let integral = Str.first_chars s index
and fractional = Str.last_chars s (len-index-1) in
let num = Z.of_string (integral ^ fractional)
and den = Z.of_string ("1" ^ String.make (len-index-1) '0')
and million = Q.of_string "1000000" in
let mutez = Q.make num den |> Q.mul million in
let should_be_1 = Q.den mutez in
if Z.equal Z.one should_be_1 then Some (Q.num mutez) else None
| exception Not_found -> assert false
let mk_tez_decimal state buffer =
let region, lexeme, state = state#sync buffer in
let lexeme = Str.(global_replace (regexp "_") "" lexeme) in
let lexeme = Str.string_before lexeme (String.index lexeme 't') in
match format_tez lexeme with
None -> assert false
| Some tz ->
match Token.mk_mutez (Z.to_string tz ^ "mutez") region with
Ok token -> state#enqueue token
| Error Token.Non_canonical_zero ->
fail region Non_canonical_zero
let mk_ident state buffer =
let region, lexeme, state = state#sync buffer in
match Token.mk_ident lexeme region with
Ok token -> state#enqueue token
| Error Token.Reserved_name -> fail region (Reserved_name lexeme)
let mk_attr header attr state buffer =
let region, _, state = state#sync buffer in
match Token.mk_attr header attr region with
Ok token -> state#enqueue token
| Error Token.Invalid_attribute ->
fail region Invalid_attribute
let mk_constr state buffer =
let region, lexeme, state = state#sync buffer in
let token = Token.mk_constr lexeme region
in state#enqueue token
let mk_lang lang state buffer =
let region, _, state = state#sync buffer in
let start = region#start#shift_bytes 1 in
let stop = region#stop in
let lang_reg = Region.make ~start ~stop in
let lang = Region.{value=lang; region=lang_reg} in
let token = Token.mk_lang lang region
in state#enqueue token
let mk_sym state buffer =
let region, lexeme, state = state#sync buffer in
match Token.mk_sym lexeme region with
Ok token -> state#enqueue token
| Error Token.Invalid_symbol -> fail region Invalid_symbol
let mk_eof state buffer =
let region, _, state = state#sync buffer in
let token = Token.eof region
in state#enqueue token
(* END HEADER *)
}
(* START LEXER DEFINITION *)
(* Named regular expressions *)
let utf8_bom = "\xEF\xBB\xBF" (* Byte Order Mark for UTF-8 *)
let nl = ['\n' '\r'] | "\r\n"
let blank = ' ' | '\t'
let digit = ['0'-'9']
let natural = digit | digit (digit | '_')* digit
let decimal = natural '.' natural
let small = ['a'-'z']
let capital = ['A'-'Z']
let letter = small | capital
let ident = small (letter | '_' | digit)*
let constr = capital (letter | '_' | digit)*
let attr = ident | constr
let hexa_digit = digit | ['A'-'F' 'a'-'f']
let byte = hexa_digit hexa_digit
let byte_seq = byte | byte (byte | '_')* byte
let bytes = "0x" (byte_seq? as seq)
let esc = "\\n" | "\\\"" | "\\\\" | "\\b"
| "\\r" | "\\t" | "\\x" byte
(* Symbols *)
let common_sym = ';' | ',' | '(' | ')' | '[' | ']' | '{' | '}'
| '=' | ':' | '|' | "->" | '.' | '_' | '^'
| '+' | '-' | '*' | '/' | '<' | "<=" | '>' | ">="
let pascaligo_sym = "=/=" | '#' | ":="
let cameligo_sym = "<>" | "::" | "||" | "&&"
let reasonligo_sym = '!' | "=>" | "!=" | "==" | "++" | "..." | "||" | "&&"
let symbol = common_sym | pascaligo_sym | cameligo_sym | reasonligo_sym
(* Comments *)
let pascaligo_block_comment_opening = "(*"
let pascaligo_block_comment_closing = "*)"
let pascaligo_line_comment = "//"
let cameligo_block_comment_opening = "(*"
let cameligo_block_comment_closing = "*)"
let cameligo_line_comment = "//"
let reasonligo_block_comment_opening = "/*"
let reasonligo_block_comment_closing = "*/"
let reasonligo_line_comment = "//"
let block_comment_openings =
pascaligo_block_comment_opening
| cameligo_block_comment_opening
| reasonligo_block_comment_opening
let block_comment_closings =
pascaligo_block_comment_closing
| cameligo_block_comment_closing
| reasonligo_block_comment_closing
let line_comments =
pascaligo_line_comment
| cameligo_line_comment
| reasonligo_line_comment
(* #include files *)
let string = [^'"' '\\' '\n']* (* For strings of #include *)
(* RULES *)
(* Except for the first rule [init], all rules bear a name starting
with "scan".
All have a parameter [state] that they thread through their
recursive calls. The rules for the structured constructs (strings
and comments) have an extra parameter of type [thread] to record
the location where they start, and their contents (see above).
*)
rule init state = parse
utf8_bom { scan (state#push_bom lexbuf) lexbuf }
| _ { LexerLib.rollback lexbuf; scan state lexbuf }
and scan state = parse
nl { scan (state#push_newline lexbuf) lexbuf }
| ' '+ { scan (state#push_space lexbuf) lexbuf }
| '\t'+ { scan (state#push_tabs lexbuf) lexbuf }
| ident { mk_ident state lexbuf }
| constr { mk_constr state lexbuf }
| bytes { mk_bytes seq state lexbuf }
| natural 'n' { mk_nat state lexbuf }
| natural "mutez" { mk_mutez state lexbuf }
| natural "tz"
| natural "tez" { mk_tez state lexbuf }
| decimal "tz"
| decimal "tez" { mk_tez_decimal state lexbuf }
| natural { mk_int state lexbuf }
| symbol { mk_sym state lexbuf }
| eof { mk_eof state lexbuf }
| "[@" (attr as a) "]" { mk_attr "[@" a state lexbuf }
| "[@@" (attr as a) "]" { mk_attr "[@@" a state lexbuf }
| "[%" (attr as l) { mk_lang l state lexbuf }
(* Management of #include preprocessing directives
An input LIGO program may contain preprocessing directives, and
the entry modules (named *Main.ml) run the preprocessor on them,
as if using the GNU C preprocessor in traditional mode:
https://gcc.gnu.org/onlinedocs/cpp/Traditional-Mode.html
The main interest in using a preprocessor is that it can stand
for a poor man's (flat) module system for LIGO thanks to #include
directives, and the equivalent of the traditional mode leaves the
markup undisturbed.
Contrary to the C preprocessor, our preprocessor does not
generate #line resulting from processing #include directives deal
with system file headers and thus have to be ignored for our
purpose. Moreover, these #line directives may also carry some
additional flags:
https://gcc.gnu.org/onlinedocs/cpp/Preprocessor-Output.html
of which 1 and 2 indicate, respectively, the start of a new file
and the return from a file (after its inclusion has been
processed).
*)
| '#' blank* (natural as line) blank+ '"' (string as file) '"' {
let _, _, state = state#sync lexbuf in
let flags, state = scan_flags state [] lexbuf in
let () = ignore flags in
let line = int_of_string line
and file = Filename.basename file in
let pos = state#pos#set ~file ~line ~offset:0 in
let state = state#set_pos pos in
scan state lexbuf }
(* String *)
| '"' { let opening, _, state = state#sync lexbuf in
let thread = LexerLib.mk_thread opening in
scan_string thread state lexbuf |> mk_string }
| "{|" { let opening, _, state = state#sync lexbuf in
let thread = LexerLib.mk_thread opening in
scan_verbatim thread state lexbuf |> mk_verbatim }
(* Comments *)
| block_comment_openings {
let lexeme = Lexing.lexeme lexbuf in
match state#block with
Some block when block#opening = lexeme ->
let opening, _, state = state#sync lexbuf in
let thread = LexerLib.mk_thread opening in
let thread = thread#push_string lexeme in
let thread, state = scan_block block thread state lexbuf
in scan (state#push_block thread) lexbuf
| Some _ | None -> (* Not a comment for this LIGO syntax *)
let n = String.length lexeme in
let () = LexerLib.rollback lexbuf in
scan (scan_n_sym n state lexbuf) lexbuf }
| line_comments {
let lexeme = Lexing.lexeme lexbuf in
match state#line with
Some line when line = lexeme ->
let opening, _, state = state#sync lexbuf in
let thread = LexerLib.mk_thread opening in
let thread = thread#push_string lexeme in
let thread, state = scan_line thread state lexbuf
in scan (state#push_line thread) lexbuf
| Some _ | None -> (* Not a comment for this LIGO syntax *)
let n = String.length lexeme in
let () = LexerLib.rollback lexbuf in
scan (scan_n_sym n state lexbuf) lexbuf }
| _ as c { let region, _, _ = state#sync lexbuf
in fail region (Unexpected_character c) }
(* Scanning a series of symbols *)
and scan_n_sym n state = parse
symbol { let state = mk_sym state lexbuf in
if n = 1 then state else scan_n_sym (n-1) state lexbuf }
(* Scanning #include flag *)
and scan_flags state acc = parse
blank+ { let _, _, state = state#sync lexbuf
in scan_flags state acc lexbuf }
| natural as code { let _, _, state = state#sync lexbuf in
let acc = int_of_string code :: acc
in scan_flags state acc lexbuf }
| nl { List.rev acc, state#push_newline lexbuf }
| eof { let _, _, state = state#sync lexbuf
in List.rev acc, state }
(* Finishing a string *)
and scan_string thread state = parse
nl { fail thread#opening Broken_string }
| eof { fail thread#opening Unterminated_string }
| ['\t' '\r' '\b']
{ let region, _, _ = state#sync lexbuf
in fail region Invalid_character_in_string }
| '"' { let _, _, state = state#sync lexbuf
in thread, state }
| esc { let _, lexeme, state = state#sync lexbuf in
let thread = thread#push_string lexeme
in scan_string thread state lexbuf }
| '\\' _ { let region, _, _ = state#sync lexbuf
in fail region Undefined_escape_sequence }
| _ as c { let _, _, state = state#sync lexbuf in
scan_string (thread#push_char c) state lexbuf }
and scan_verbatim thread state = parse
| eof { fail thread#opening Unterminated_verbatim}
| "|}" { let _, _, state = state#sync lexbuf
in thread, state }
| _ as c { let _, _, state = state#sync lexbuf in
scan_verbatim (thread#push_char c) state lexbuf }
(* Finishing a block comment
(For Emacs: ("(*") The lexing of block comments must take care of
embedded block comments that may occur within, as well as strings,
so no substring "*/" or "*)" may inadvertently close the
block. This is the purpose of the first case of the scanner
[scan_block].
*)
and scan_block block thread state = parse
'"' | block_comment_openings {
let lexeme = Lexing.lexeme lexbuf in
if block#opening = lexeme || lexeme = "\""
then let opening = thread#opening in
let opening', _, state = state#sync lexbuf in
let thread = thread#push_string lexeme in
let thread = thread#set_opening opening' in
let next = if lexeme = "\"" then scan_string
else scan_block block in
let thread, state = next thread state lexbuf in
let thread = thread#set_opening opening
in scan_block block thread state lexbuf
else let () = LexerLib.rollback lexbuf in
let n = String.length lexeme in
let state = scan_n_sym n state lexbuf
in scan_block block thread state lexbuf }
| block_comment_closings {
let lexeme = Lexing.lexeme lexbuf in
if block#closing = lexeme
then let _, _, state = state#sync lexbuf
in thread#push_string lexeme, state
else let () = LexerLib.rollback lexbuf in
let n = String.length lexeme in
let state = scan_n_sym n state lexbuf
in scan_block block thread state lexbuf }
| nl as nl {
let () = Lexing.new_line lexbuf
and state = state#set_pos (state#pos#new_line nl)
and thread = thread#push_string nl in
scan_block block thread state lexbuf }
| eof { let err = Unterminated_comment (block#closing)
in fail thread#opening err }
| _ { let () = LexerLib.rollback lexbuf in
let len = thread#length in
let thread, status = scan_utf8 block thread state lexbuf in
let delta = thread#length - len in
let pos = state#pos#shift_one_uchar delta in
match status with
Stdlib.Ok () ->
scan_block block thread (state#set_pos pos) lexbuf
| Error error ->
let region = Region.make ~start:state#pos ~stop:pos
in fail region error }
and scan_utf8 block thread state = parse
eof { let err = Unterminated_comment block#closing
in fail thread#opening err }
| _ as c { let thread = thread#push_char c in
let lexeme = Lexing.lexeme lexbuf in
let () = state#supply (Bytes.of_string lexeme) 0 1 in
match Uutf.decode state#decoder with
`Uchar _ -> thread, Stdlib.Ok ()
| `Malformed _ -> thread, Stdlib.Error Invalid_utf8_sequence
| `Await -> scan_utf8 block thread state lexbuf
| `End -> assert false }
(* Finishing a line comment *)
and scan_line thread state = parse
nl as nl { let () = Lexing.new_line lexbuf
and thread = thread#push_string nl
and state = state#set_pos (state#pos#new_line nl)
in thread, state }
| eof { thread, state }
| _ { let () = LexerLib.rollback lexbuf in
let len = thread#length in
let thread,
status = scan_utf8_inline thread state lexbuf in
let delta = thread#length - len in
let pos = state#pos#shift_one_uchar delta in
match status with
Stdlib.Ok () ->
scan_line thread (state#set_pos pos) lexbuf
| Error error ->
let region = Region.make ~start:state#pos ~stop:pos
in fail region error }
and scan_utf8_inline thread state = parse
eof { thread, Stdlib.Ok () }
| _ as c { let thread = thread#push_char c in
let lexeme = Lexing.lexeme lexbuf in
let () = state#supply (Bytes.of_string lexeme) 0 1 in
match Uutf.decode state#decoder with
`Uchar _ -> thread, Stdlib.Ok ()
| `Malformed _ -> thread, Stdlib.Error Invalid_utf8_sequence
| `Await -> scan_utf8_inline thread state lexbuf
| `End -> assert false }
(* END LEXER DEFINITION *)
{
(* START TRAILER *)
let scan =
let first_call = ref true in
fun state lexbuf ->
if !first_call
then (first_call := false; init state lexbuf)
else scan state lexbuf
end (* of functor [Make] in HEADER *)
(* END TRAILER *)
}