367 lines
11 KiB
OCaml
367 lines
11 KiB
OCaml
|
|
||
|
{
|
||
|
|
||
|
open Concrete_parser
|
||
|
|
||
|
open Script_located_ir
|
||
|
|
||
|
let count_nl s =
|
||
|
let c = ref 0 in
|
||
|
for i = 0 to String.length s - 1 do
|
||
|
if Compare.Char.(s.[i] = '\010') then
|
||
|
incr c
|
||
|
done;
|
||
|
!c
|
||
|
|
||
|
let update_loc lexbuf nl indent =
|
||
|
let open Lexing in
|
||
|
let lcp = lexbuf.lex_curr_p in
|
||
|
lexbuf.lex_curr_p <- { lcp with
|
||
|
pos_lnum = lcp.pos_lnum + nl;
|
||
|
pos_bol = lcp.pos_cnum - indent;
|
||
|
}
|
||
|
|
||
|
let may_update_loc lexbuf nl indent =
|
||
|
if Compare.Int.(nl <> 0) then update_loc lexbuf nl indent
|
||
|
|
||
|
let start_offset lexbuf =
|
||
|
let open Lexing in
|
||
|
let lsp = lexbuf.lex_start_p in
|
||
|
lsp.pos_cnum - lsp.pos_bol
|
||
|
|
||
|
let end_offset lexbuf =
|
||
|
let open Lexing in
|
||
|
let lcp = lexbuf.lex_curr_p in
|
||
|
lcp.pos_cnum - lcp.pos_bol
|
||
|
|
||
|
let curr_location lexbuf =
|
||
|
lexbuf.Lexing.lex_start_p, lexbuf.Lexing.lex_curr_p
|
||
|
|
||
|
let pos pos =
|
||
|
Lexing.(pos.pos_lnum, pos.pos_cnum - pos.pos_bol)
|
||
|
|
||
|
let pos2 (start, stop) =
|
||
|
pos start, pos stop
|
||
|
|
||
|
(* To translate escape sequences *)
|
||
|
|
||
|
let char_for_backslash = function
|
||
|
| 'n' -> '\010'
|
||
|
| 'r' -> '\013'
|
||
|
| 'b' -> '\008'
|
||
|
| 't' -> '\009'
|
||
|
| c -> c
|
||
|
|
||
|
let char_for_decimal_code lexbuf i =
|
||
|
let c = 100 * (int_of_char(Lexing.lexeme_char lexbuf i) - 48) +
|
||
|
10 * (int_of_char(Lexing.lexeme_char lexbuf (i+1)) - 48) +
|
||
|
(int_of_char(Lexing.lexeme_char lexbuf (i+2)) - 48) in
|
||
|
if Compare.Int.(c < 0 || c > 255) then
|
||
|
raise @@ Illegal_escape (pos2 (curr_location lexbuf), Lexing.lexeme lexbuf)
|
||
|
else char_of_int c
|
||
|
|
||
|
let char_for_hexadecimal_code lexbuf i =
|
||
|
let d1 = int_of_char (Lexing.lexeme_char lexbuf i) in
|
||
|
let val1 = if Compare.Int.(d1 >= 97) then d1 - 87
|
||
|
else if Compare.Int.(d1 >= 65) then d1 - 55
|
||
|
else d1 - 48
|
||
|
in
|
||
|
let d2 = int_of_char (Lexing.lexeme_char lexbuf (i+1)) in
|
||
|
let val2 = if Compare.Int.(d2 >= 97) then d2 - 87
|
||
|
else if Compare.Int.(d2 >= 65) then d2 - 55
|
||
|
else d2 - 48
|
||
|
in
|
||
|
char_of_int (val1 * 16 + val2)
|
||
|
|
||
|
(** Lexer state *)
|
||
|
|
||
|
type state = {
|
||
|
mutable indent_stack:
|
||
|
(int * [`Indent | `Open of (char * (Lexing.position * Lexing.position)) ]) list;
|
||
|
mutable buffer: Concrete_parser.token list;
|
||
|
mutable string_buff: bytes;
|
||
|
mutable string_index: int;
|
||
|
mutable string_start_loc: Lexing.position * Lexing.position;
|
||
|
mutable comment_start_loc: (Lexing.position * Lexing.position) list;
|
||
|
}
|
||
|
|
||
|
let init_state () = {
|
||
|
indent_stack = [];
|
||
|
buffer = [];
|
||
|
string_index = 0;
|
||
|
string_buff = Bytes.create 256;
|
||
|
string_start_loc = Lexing.dummy_pos, Lexing.dummy_pos;
|
||
|
comment_start_loc = [];
|
||
|
}
|
||
|
|
||
|
|
||
|
(** String helpers *)
|
||
|
|
||
|
let reset_string_buffer st =
|
||
|
st.string_buff <- Bytes.create 256;
|
||
|
st.string_index <- 0
|
||
|
|
||
|
let store_string_char st c =
|
||
|
if st.string_index >= Bytes.length st.string_buff then begin
|
||
|
let new_buff = Bytes.create (Bytes.length (st.string_buff) * 2) in
|
||
|
Bytes.blit st.string_buff 0 new_buff 0 (Bytes.length st.string_buff);
|
||
|
st.string_buff <- new_buff
|
||
|
end;
|
||
|
Bytes.set st.string_buff st.string_index c;
|
||
|
st.string_index <- st.string_index + 1
|
||
|
|
||
|
let store_string st s =
|
||
|
for i = 0 to String.length s - 1 do
|
||
|
store_string_char st s.[i];
|
||
|
done
|
||
|
|
||
|
let store_lexeme st lexbuf =
|
||
|
store_string st (Lexing.lexeme lexbuf)
|
||
|
|
||
|
let get_stored_string st =
|
||
|
let s = Bytes.sub st.string_buff 0 st.string_index in
|
||
|
st.string_buff <- Bytes.create 256;
|
||
|
Bytes.to_string s
|
||
|
|
||
|
|
||
|
(** Indentation helpers *)
|
||
|
|
||
|
let first_token st =
|
||
|
match st.indent_stack with
|
||
|
| [] -> true
|
||
|
| _ :: _ -> false
|
||
|
|
||
|
let starting_offset (start, _) =
|
||
|
let open Lexing in
|
||
|
start.pos_cnum - start.pos_bol
|
||
|
|
||
|
let rec pop_indent st loc xs i =
|
||
|
match xs with
|
||
|
| [] -> assert false
|
||
|
| ((x, _) :: _) as xs when Compare.Int.(x = i) ->
|
||
|
st.indent_stack <- xs;
|
||
|
[NEWLINE]
|
||
|
| (x, `Indent) :: xs ->
|
||
|
if Compare.Int.(x > i) then
|
||
|
DEDENT :: pop_indent st loc xs i
|
||
|
else
|
||
|
raise @@ Invalid_indentation (pos2 loc)
|
||
|
| (_, `Open (c, opener_loc)) :: _ ->
|
||
|
let opener_offset = starting_offset opener_loc in
|
||
|
if Compare.Int.(i > opener_offset) then
|
||
|
raise @@ Invalid_indentation_in_block (pos2 loc, c, pos2 opener_loc)
|
||
|
else
|
||
|
raise @@ Unclosed (pos2 loc, c, pos2 opener_loc)
|
||
|
|
||
|
let indent_token st loc =
|
||
|
let i = starting_offset loc in
|
||
|
match st.indent_stack with
|
||
|
| (x, `Indent) :: xs when Compare.Int.(x > i) ->
|
||
|
DEDENT :: pop_indent st loc xs i;
|
||
|
| (x, `Open (c, opener_loc)) :: _ when Compare.Int.(x > i) ->
|
||
|
let opener_offset = starting_offset opener_loc in
|
||
|
if Compare.Int.(i > opener_offset) then
|
||
|
raise @@ Invalid_indentation_in_block (pos2 loc, c, pos2 opener_loc)
|
||
|
else
|
||
|
raise @@ Unclosed (pos2 loc, c, pos2 opener_loc)
|
||
|
| (x, _) :: _ when Compare.Int.(x = i) ->
|
||
|
[NEWLINE]
|
||
|
| [] | (_, _) :: _ (* when Compare.Int.(x < i) *) ->
|
||
|
st.indent_stack <- (i, `Indent) :: st.indent_stack;
|
||
|
[INDENT]
|
||
|
|
||
|
let open_block st opener opener_loc token_offset =
|
||
|
let opener_offset = starting_offset opener_loc in
|
||
|
if Compare.Int.(token_offset <= opener_offset) then
|
||
|
raise @@ Invalid_indentation_after_opener (pos2 opener_loc, opener) ;
|
||
|
st.indent_stack <-
|
||
|
(token_offset, `Open (opener, opener_loc)) :: st.indent_stack;
|
||
|
match opener with
|
||
|
| '{' -> [LBRACE]
|
||
|
| '(' -> [LPAREN]
|
||
|
| _ -> assert false
|
||
|
|
||
|
let close_block st bol closer closer_loc =
|
||
|
let closer_offset = starting_offset closer_loc in
|
||
|
let rec pop xs =
|
||
|
match xs with
|
||
|
| [] -> raise @@ Unopened (pos2 closer_loc, closer)
|
||
|
| (_, `Indent) :: xs -> DEDENT :: pop xs
|
||
|
| (_, `Open (opener, opener_loc)) :: xs ->
|
||
|
let opener_offset = starting_offset opener_loc in
|
||
|
if bol && Compare.Int.(opener_offset <> closer_offset) then
|
||
|
raise @@
|
||
|
Unaligned_closer (pos2 closer_loc, opener, closer, pos2 opener_loc) ;
|
||
|
st.indent_stack <- xs;
|
||
|
[ match opener, closer with
|
||
|
| '{', '}' -> RBRACE
|
||
|
| '(', ')' -> RPAREN
|
||
|
| _ ->
|
||
|
raise @@ Unclosed (pos2 closer_loc, opener, pos2 opener_loc) ]
|
||
|
in
|
||
|
pop st.indent_stack
|
||
|
|
||
|
}
|
||
|
|
||
|
let eol_comment = '#' [^ '\010'] *
|
||
|
let newline = eol_comment ? ('\010' | "\013\010" )
|
||
|
let space = [' ']
|
||
|
let firstidentchar = ['A'-'Z' 'a'-'z' '_']
|
||
|
let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
|
||
|
let decimal_literal =
|
||
|
['0'-'9'] ['0'-'9' '_']*
|
||
|
let hex_literal =
|
||
|
'0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']['0'-'9' 'A'-'F' 'a'-'f' '_']*
|
||
|
let oct_literal =
|
||
|
'0' ['o' 'O'] ['0'-'7'] ['0'-'7' '_']*
|
||
|
let bin_literal =
|
||
|
'0' ['b' 'B'] ['0'-'1'] ['0'-'1' '_']*
|
||
|
let int_literal =
|
||
|
'-' ? ( decimal_literal | hex_literal | oct_literal | bin_literal)
|
||
|
|
||
|
rule indent_tokens st nl = parse
|
||
|
|
||
|
| space { indent_tokens st nl lexbuf }
|
||
|
| newline { Lexing.new_line lexbuf; indent_tokens st (nl + 1) lexbuf }
|
||
|
|
||
|
| ""
|
||
|
{ let bol = nl <> 0 || first_token st in
|
||
|
if bol then indent_token st (curr_location lexbuf) else [] }
|
||
|
|
||
|
| "/*"
|
||
|
{ st.comment_start_loc <- [curr_location lexbuf];
|
||
|
comment st nl lexbuf }
|
||
|
|
||
|
| ('{' | '(' as opener)
|
||
|
{ let opener_loc = curr_location lexbuf in
|
||
|
let token_offset = next_token_indent st lexbuf in
|
||
|
let bol = nl <> 0 || first_token st in
|
||
|
let prefix =
|
||
|
if bol then indent_token st opener_loc else [] in
|
||
|
prefix @ open_block st opener opener_loc token_offset }
|
||
|
|
||
|
| ('}' | ')' as closer)
|
||
|
{ let closer_loc = curr_location lexbuf in
|
||
|
let bol = Compare.Int.(nl <> 0) in
|
||
|
close_block st bol closer closer_loc }
|
||
|
|
||
|
| eof
|
||
|
{ List.map
|
||
|
(function
|
||
|
| (_, `Indent) -> DEDENT
|
||
|
| (_, `Open (c, loc)) ->
|
||
|
raise @@ Unclosed (pos2 (curr_location lexbuf), c, pos2 loc))
|
||
|
st.indent_stack
|
||
|
@ [EOF]
|
||
|
}
|
||
|
|
||
|
and comment st nl = parse
|
||
|
|
||
|
| "/*" { st.comment_start_loc <-
|
||
|
curr_location lexbuf :: st.comment_start_loc;
|
||
|
comment st nl lexbuf }
|
||
|
|
||
|
| "*/" { match st.comment_start_loc with
|
||
|
| [] -> assert false
|
||
|
| [_] -> indent_tokens st nl lexbuf
|
||
|
| _ :: xs -> st.comment_start_loc <- xs; comment st nl lexbuf }
|
||
|
|
||
|
| "\"" { st.string_start_loc <- curr_location lexbuf;
|
||
|
let nl =
|
||
|
try string st nl lexbuf
|
||
|
with Unterminated_string str_start ->
|
||
|
match st.comment_start_loc with
|
||
|
| [] -> assert false
|
||
|
| loc :: _ ->
|
||
|
let start = List.hd (List.rev st.comment_start_loc) in
|
||
|
raise @@
|
||
|
Unterminated_string_in_comment (pos2 loc, pos2 start, str_start)
|
||
|
in
|
||
|
comment st nl lexbuf }
|
||
|
|
||
|
| newline { Lexing.new_line lexbuf; comment st (nl+1) lexbuf }
|
||
|
|
||
|
| eof { match st.comment_start_loc with
|
||
|
| [] -> assert false
|
||
|
| loc :: _ ->
|
||
|
let start = List.hd (List.rev st.comment_start_loc) in
|
||
|
raise @@ Unterminated_comment (pos2 loc, pos2 start) }
|
||
|
|
||
|
| _ { comment st nl lexbuf }
|
||
|
|
||
|
|
||
|
(** Eat spacings and return the next token offset. *)
|
||
|
and next_token_indent st = parse
|
||
|
|
||
|
| space { next_token_indent st lexbuf }
|
||
|
|
||
|
| newline { Lexing.new_line lexbuf; next_token_indent st lexbuf }
|
||
|
|
||
|
| "" { end_offset lexbuf }
|
||
|
|
||
|
(** The lexer for non-indentation tokens.
|
||
|
It should not care about 'space', 'newline', '{}()' nor comments. *)
|
||
|
and raw_token st = parse
|
||
|
|
||
|
| ";" { SEMICOLON }
|
||
|
|
||
|
| firstidentchar identchar *
|
||
|
{ PRIM (String.lowercase_ascii (Lexing.lexeme lexbuf)) }
|
||
|
|
||
|
| int_literal
|
||
|
{ INT (Lexing.lexeme lexbuf) }
|
||
|
|
||
|
| "\""
|
||
|
{ reset_string_buffer st;
|
||
|
let string_start = lexbuf.Lexing.lex_start_p in
|
||
|
st.string_start_loc <- curr_location lexbuf;
|
||
|
ignore (string st 0 lexbuf);
|
||
|
lexbuf.Lexing.lex_start_p <- string_start;
|
||
|
STRING (get_stored_string st) }
|
||
|
|
||
|
| _
|
||
|
{ raise (Illegal_character (pos2 (curr_location lexbuf),
|
||
|
Lexing.lexeme_char lexbuf 0))
|
||
|
}
|
||
|
|
||
|
and string st nl = parse
|
||
|
'"'
|
||
|
{ nl }
|
||
|
| '\\' newline ([' ' '\t'] * as space)
|
||
|
{ update_loc lexbuf 1 (String.length space);
|
||
|
string st nl lexbuf
|
||
|
}
|
||
|
| '\\' ['\\' '\'' '"' 'n' 't' 'b' 'r' ' ']
|
||
|
{ store_string_char st (char_for_backslash(Lexing.lexeme_char lexbuf 1));
|
||
|
string st nl lexbuf }
|
||
|
| '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
|
||
|
{ store_string_char st (char_for_decimal_code lexbuf 1);
|
||
|
string st nl lexbuf }
|
||
|
| '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F']
|
||
|
{ store_string_char st (char_for_hexadecimal_code lexbuf 2);
|
||
|
string st nl lexbuf }
|
||
|
| newline
|
||
|
{ match st.comment_start_loc with
|
||
|
| [] -> raise @@ Newline_in_string (pos2 (curr_location lexbuf))
|
||
|
| _ -> Lexing.new_line lexbuf; string st (nl+1) lexbuf }
|
||
|
| eof
|
||
|
{ raise @@ Unterminated_string (pos2 st.string_start_loc) }
|
||
|
| _
|
||
|
{ store_string_char st (Lexing.lexeme_char lexbuf 0);
|
||
|
string st nl lexbuf }
|
||
|
|
||
|
|
||
|
{
|
||
|
|
||
|
let rec token st lexbuf =
|
||
|
match st.buffer with
|
||
|
| tok :: tokens ->
|
||
|
st.buffer <- tokens;
|
||
|
tok
|
||
|
| [] ->
|
||
|
match indent_tokens st 0 lexbuf with
|
||
|
| [] -> raw_token st lexbuf
|
||
|
| _ :: _ as tokens -> st.buffer <- tokens; token st lexbuf
|
||
|
|
||
|
}
|