(* Lexer specification for Ligo, to be processed by [ocamllex] *) { (* START HEADER *) (* Shorthands *) type lexeme = string let sprintf = Printf.sprintf module SMap = Utils.String.Map module SSet = Utils.String.Set (* Hack to roll back one lexeme in the current semantic action *) (* let rollback buffer = let open Lexing in let len = String.length (lexeme buffer) in let pos_cnum = buffer.lex_curr_p.pos_cnum - len in buffer.lex_curr_pos <- buffer.lex_curr_pos - len; buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum} *) (* TOKENS *) type t = (* Literals *) String of lexeme Region.reg | Bytes of (lexeme * MBytes.t) Region.reg | Int of (lexeme * Z.t) Region.reg | Ident of lexeme Region.reg | Constr of lexeme Region.reg (* Symbols *) | SEMI of Region.t | COMMA of Region.t | LPAR of Region.t | RPAR of Region.t | LBRACE of Region.t | RBRACE of Region.t | LBRACKET of Region.t | RBRACKET of Region.t | CONS of Region.t | VBAR of Region.t | ARROW of Region.t | ASGNMNT of Region.t | EQUAL of Region.t | COLON of Region.t | OR of Region.t | AND of Region.t | LT of Region.t | LEQ of Region.t | GT of Region.t | GEQ of Region.t | NEQ of Region.t | PLUS of Region.t | MINUS of Region.t | SLASH of Region.t | TIMES of Region.t | DOT of Region.t | WILD of Region.t | CAT of Region.t (* Keywords *) | Begin of Region.t | Const of Region.t | Down of Region.t | Fail of Region.t | If of Region.t | In of Region.t | Is of Region.t | For of Region.t | Function of Region.t | Parameter of Region.t | Storage of Region.t | Type of Region.t | Of of Region.t | Operations of Region.t | Var of Region.t | End of Region.t | Then of Region.t | Else of Region.t | Match of Region.t | Null of Region.t | Procedure of Region.t | Record of Region.t | Step of Region.t | To of Region.t | Mod of Region.t | Not of Region.t | While of Region.t | With of Region.t (* Types *) (* | T_address of Region.t (* "address" *) | T_big_map of Region.t (* "big_map" *) | T_bool of Region.t (* "bool" *) | T_bytes of Region.t (* "bytes" *) | T_contract of Region.t (* "contract" *) | T_int of Region.t (* "int" *) | T_key of Region.t (* "key" *) | T_key_hash of Region.t (* "key_hash" *) | T_list of Region.t (* "list" *) | T_map of Region.t (* "map" *) | T_mutez of Region.t (* "mutez" *) | T_nat of Region.t (* "nat" *) | T_operation of Region.t (* "operation" *) | T_option of Region.t (* "option" *) | T_set of Region.t (* "set" *) | T_signature of Region.t (* "signature" *) | T_string of Region.t (* "string" *) | T_timestamp of Region.t (* "timestamp" *) | T_unit of Region.t (* "unit" *) *) (* Data constructors *) | C_False of Region.t (* "False" *) | C_None of Region.t (* "None" *) | C_Some of Region.t (* "Some" *) | C_True of Region.t (* "True" *) | C_Unit of Region.t (* "Unit" *) (* Virtual tokens *) | EOF of Region.t type token = t let proj_token = function (* Literals *) String Region.{region; value} -> region, sprintf "String %s" value | Bytes Region.{region; value = s,b} -> region, sprintf "Bytes (\"%s\", \"0x%s\")" s (MBytes.to_hex b |> Hex.to_string) | Int Region.{region; value = s,n} -> region, sprintf "Int (\"%s\", %s)" s (Z.to_string n) | Ident Region.{region; value} -> region, sprintf "Ident \"%s\"" value | Constr Region.{region; value} -> region, sprintf "Constr \"%s\"" value (* Symbols *) | SEMI region -> region, "SEMI" | COMMA region -> region, "COMMA" | LPAR region -> region, "LPAR" | RPAR region -> region, "RPAR" | LBRACE region -> region, "LBRACE" | RBRACE region -> region, "RBRACE" | LBRACKET region -> region, "LBRACKET" | RBRACKET region -> region, "RBRACKET" | CONS region -> region, "CONS" | VBAR region -> region, "VBAR" | ARROW region -> region, "ARROW" | ASGNMNT region -> region, "ASGNMNT" | EQUAL region -> region, "EQUAL" | COLON region -> region, "COLON" | OR region -> region, "OR" | AND region -> region, "AND" | LT region -> region, "LT" | LEQ region -> region, "LEQ" | GT region -> region, "GT" | GEQ region -> region, "GEQ" | NEQ region -> region, "NEQ" | PLUS region -> region, "PLUS" | MINUS region -> region, "MINUS" | SLASH region -> region, "SLASH" | TIMES region -> region, "TIMES" | DOT region -> region, "DOT" | WILD region -> region, "WILD" | CAT region -> region, "CAT" (* Keywords *) | Begin region -> region, "Begin" | Const region -> region, "Const" | Down region -> region, "Down" | Fail region -> region, "Fail" | If region -> region, "If" | In region -> region, "In" | Is region -> region, "Is" | For region -> region, "For" | Function region -> region, "Function" | Parameter region -> region, "Parameter" | Storage region -> region, "Storage" | Type region -> region, "Type" | Of region -> region, "Of" | Operations region -> region, "Operations" | Var region -> region, "Var" | End region -> region, "End" | Then region -> region, "Then" | Else region -> region, "Else" | Match region -> region, "Match" | Null region -> region, "Null" | Procedure region -> region, "Procedure" | Record region -> region, "Record" | Step region -> region, "Step" | To region -> region, "To" | Mod region -> region, "Mod" | Not region -> region, "Not" | While region -> region, "While" | With region -> region, "With" (* Data *) | C_False region -> region, "C_False" | C_None region -> region, "C_None" | C_Some region -> region, "C_Some" | C_True region -> region, "C_True" | C_Unit region -> region, "C_Unit" (* Virtual tokens *) | EOF region -> region, "EOF" let to_lexeme = function (* Literals *) String s -> s.Region.value | Bytes b -> fst b.Region.value | Int i -> fst i.Region.value | Ident id | Constr id -> id.Region.value (* Symbols *) | SEMI _ -> ";" | COMMA _ -> "," | LPAR _ -> "(" | RPAR _ -> ")" | LBRACE _ -> "{" | RBRACE _ -> "}" | LBRACKET _ -> "[" | RBRACKET _ -> "]" | CONS _ -> "<:" | VBAR _ -> "|" | ARROW _ -> "->" | ASGNMNT _ -> ":=" | EQUAL _ -> "=" | COLON _ -> ":" | OR _ -> "||" | AND _ -> "&&" | LT _ -> "<" | LEQ _ -> "<=" | GT _ -> ">" | GEQ _ -> ">=" | NEQ _ -> "=/=" | PLUS _ -> "+" | MINUS _ -> "-" | SLASH _ -> "/" | TIMES _ -> "*" | DOT _ -> "." | WILD _ -> "_" | CAT _ -> "^" (* Keywords *) | Begin _ -> "begin" | Const _ -> "const" | Down _ -> "down" | Fail _ -> "fail" | If _ -> "if" | In _ -> "in" | Is _ -> "is" | For _ -> "for" | Function _ -> "function" | Parameter _ -> "parameter" | Storage _ -> "storage" | Type _ -> "type" | Of _ -> "of" | Operations _ -> "operations" | Var _ -> "var" | End _ -> "end" | Then _ -> "then" | Else _ -> "else" | Match _ -> "match" | Null _ -> "null" | Procedure _ -> "procedure" | Record _ -> "record" | Step _ -> "step" | To _ -> "to" | Mod _ -> "mod" | Not _ -> "not" | While _ -> "while" | With _ -> "with" (* Data constructors *) | C_False _ -> "False" | C_None _ -> "None" | C_Some _ -> "Some" | C_True _ -> "True" | C_Unit _ -> "Unit" (* Virtual tokens *) | EOF _ -> "" let to_string token ?(offsets=true) mode = let region, val_str = proj_token token in let reg_str = region#compact ~offsets mode in sprintf "%s: %s" reg_str val_str let to_region token = proj_token token |> fst (* LEXIS *) let keywords = [ (fun reg -> Begin reg); (fun reg -> Const reg); (fun reg -> Down reg); (fun reg -> Fail reg); (fun reg -> If reg); (fun reg -> In reg); (fun reg -> Is reg); (fun reg -> For reg); (fun reg -> Function reg); (fun reg -> Parameter reg); (fun reg -> Storage reg); (fun reg -> Type reg); (fun reg -> Of reg); (fun reg -> Operations reg); (fun reg -> Var reg); (fun reg -> End reg); (fun reg -> Then reg); (fun reg -> Else reg); (fun reg -> Match reg); (fun reg -> Null reg); (fun reg -> Procedure reg); (fun reg -> Record reg); (fun reg -> Step reg); (fun reg -> To reg); (fun reg -> Mod reg); (fun reg -> Not reg); (fun reg -> While reg); (fun reg -> With reg) ] let reserved = let open SSet in empty |> add "and" |> add "as" |> add "asr" |> add "assert" |> add "class" |> add "constraint" |> add "do" |> add "done" |> add "downto" |> add "exception" |> add "external" |> add "false" |> add "fun" |> add "functor" |> add "include" |> add "inherit" |> add "initializer" |> add "land" |> add "lazy" |> add "let" |> add "lor" |> add "lsl" |> add "lsr" |> add "lxor" |> add "method" |> add "module" |> add "mutable" |> add "new" |> add "nonrec" |> add "object" |> add "open" |> add "or" |> add "private" |> add "rec" |> add "sig" |> add "struct" |> add "true" |> add "try" |> add "val" |> add "virtual" |> add "when" let constructors = [ (fun reg -> C_False reg); (fun reg -> C_None reg); (fun reg -> C_Some reg); (fun reg -> C_True reg); (fun reg -> C_Unit reg) ] let add map (key, value) = SMap.add key value map let mk_map mk_key list = let apply map value = add map (mk_key value, value) in List.fold_left apply SMap.empty list type lexis = { kwd : (Region.t -> token) SMap.t; cstr : (Region.t -> token) SMap.t; res : SSet.t } let lexicon : lexis = let build list = mk_map (fun f -> to_lexeme (f Region.ghost)) list in {kwd = build keywords; cstr = build constructors; res = reserved} (* Identifiers *) type ident_err = Reserved_name (* END HEADER *) } (* START LEXER DEFINITION *) (* Named regular expressions *) let small = ['a'-'z'] let capital = ['A'-'Z'] let letter = small | capital let digit = ['0'-'9'] let ident = small (letter | '_' | digit)* let constr = capital (letter | '_' | digit)* (* Rules *) rule scan_ident region lexicon = parse (ident as value) eof { if SSet.mem value lexicon.res then Error Reserved_name else Ok (match SMap.find_opt value lexicon.kwd with Some mk_kwd -> mk_kwd region | None -> Ident Region.{region; value}) } and scan_constr region lexicon = parse (constr as value) eof { match SMap.find_opt value lexicon.cstr with Some mk_cstr -> mk_cstr region | None -> Constr Region.{region; value} } (* END LEXER DEFINITION *) { (* START TRAILER *) (* Smart constructors (injections) *) let mk_string lexeme region = String Region.{region; value=lexeme} let mk_bytes lexeme region = let norm = Str.(global_replace (regexp "_") "" lexeme) in let value = lexeme, MBytes.of_hex (Hex.of_string norm) in Bytes Region.{region; value} type int_err = Non_canonical_zero let mk_int lexeme region = let z = Str.(global_replace (regexp "_") "" lexeme) |> Z.of_string in if Z.equal z Z.zero && lexeme <> "0" then Error Non_canonical_zero else Ok (Int Region.{region; value = lexeme, z}) let eof region = EOF region let mk_sym lexeme region = match lexeme with ";" -> SEMI region | "," -> COMMA region | "(" -> LPAR region | ")" -> RPAR region | "{" -> LBRACE region | "}" -> RBRACE region | "[" -> LBRACKET region | "]" -> RBRACKET region | "<:" -> CONS region | "|" -> VBAR region | "->" -> ARROW region | ":=" -> ASGNMNT region | "=" -> EQUAL region | ":" -> COLON region | "||" -> OR region | "&&" -> AND region | "<" -> LT region | "<=" -> LEQ region | ">" -> GT region | ">=" -> GEQ region | "=/=" -> NEQ region | "+" -> PLUS region | "-" -> MINUS region | "/" -> SLASH region | "*" -> TIMES region | "." -> DOT region | "_" -> WILD region | "^" -> CAT region | _ -> assert false (* Identifiers *) let mk_ident' lexeme region lexicon = Lexing.from_string lexeme |> scan_ident region lexicon let mk_ident lexeme region = mk_ident' lexeme region lexicon (* Constructors *) let mk_constr' lexeme region lexicon = Lexing.from_string lexeme |> scan_constr region lexicon let mk_constr lexeme region = mk_constr' lexeme region lexicon (* Predicates *) let is_string = function String _ -> true | _ -> false let is_bytes = function Bytes _ -> true | _ -> false let is_int = function Int _ -> true | _ -> false let is_ident = function Ident _ -> true | _ -> false let is_kwd = function | Begin _ | Const _ | Down _ | Fail _ | If _ | In _ | Is _ | For _ | Function _ | Parameter _ | Storage _ | Type _ | Of _ | Operations _ | Var _ | End _ | Then _ | Else _ | Match _ | Null _ | Procedure _ | Record _ | Step _ | To _ | Mod _ | Not _ | While _ | With _ -> true | _ -> false let is_constr = function Constr _ | C_False _ | C_None _ | C_Some _ | C_True _ | C_Unit _ -> true | _ -> false let is_sym = function SEMI _ | COMMA _ | LPAR _ | RPAR _ | LBRACE _ | RBRACE _ | LBRACKET _ | RBRACKET _ | CONS _ | VBAR _ | ARROW _ | ASGNMNT _ | EQUAL _ | COLON _ | OR _ | AND _ | LT _ | LEQ _ | GT _ | GEQ _ | NEQ _ | PLUS _ | MINUS _ | SLASH _ | TIMES _ | DOT _ | WILD _ | CAT _ -> true | _ -> false let is_eof = function EOF _ -> true | _ -> false (* END TRAILER *) }