{
open Concrete_parser
open Script_located_ir

(* Indentation-aware lexer producing [Concrete_parser] tokens.

   The entry point is [token] (defined in the trailer).  Layout is turned
   into explicit [INDENT], [DEDENT] and [NEWLINE] tokens by the
   [indent_tokens] rule, using a stack of columns kept in the lexer
   [state]; every other token comes from [raw_token]. *)

(** Number of line-feed characters (code 10) found in [s]. *)
let count_nl s =
  let c = ref 0 in
  for i = 0 to String.length s - 1 do
    if Compare.Char.(s.[i] = '\010') then incr c
  done;
  !c

(** Advance the current position of [lexbuf] by [nl] lines and move the
    beginning-of-line marker so that the current column is [indent]. *)
let update_loc lexbuf nl indent =
  let open Lexing in
  let lcp = lexbuf.lex_curr_p in
  lexbuf.lex_curr_p <-
    { lcp with
      pos_lnum = lcp.pos_lnum + nl;
      pos_bol = lcp.pos_cnum - indent;
    }

(** Same as [update_loc], but does nothing when [nl] is zero. *)
let may_update_loc lexbuf nl indent =
  if Compare.Int.(nl <> 0) then update_loc lexbuf nl indent

(** Column of the first character of the current lexeme. *)
let start_offset lexbuf =
  let open Lexing in
  let lsp = lexbuf.lex_start_p in
  lsp.pos_cnum - lsp.pos_bol

(** Column just past the last character of the current lexeme. *)
let end_offset lexbuf =
  let open Lexing in
  let lcp = lexbuf.lex_curr_p in
  lcp.pos_cnum - lcp.pos_bol

(** Start and end positions of the current lexeme. *)
let curr_location lexbuf =
  lexbuf.Lexing.lex_start_p, lexbuf.Lexing.lex_curr_p

(** Convert a [Lexing.position] into a [(line, column)] pair. *)
let pos pos = Lexing.(pos.pos_lnum, pos.pos_cnum - pos.pos_bol)

(** Apply [pos] to both ends of a location. *)
let pos2 (start, stop) = pos start, pos stop

(* To translate escape sequences *)

(** Character denoted by the letter following a backslash; any letter
    other than n, r, b or t stands for itself. *)
let char_for_backslash = function
  | 'n' -> '\010'
  | 'r' -> '\013'
  | 'b' -> '\008'
  | 't' -> '\009'
  | c -> c

(** Character whose code is given by the three decimal digits found at
    indexes [i], [i+1] and [i+2] of the current lexeme.
    Raises [Illegal_escape] when the code is outside of 0..255. *)
let char_for_decimal_code lexbuf i =
  (* 48 is the code of the digit zero. *)
  let c =
    100 * (int_of_char (Lexing.lexeme_char lexbuf i) - 48)
    + 10 * (int_of_char (Lexing.lexeme_char lexbuf (i + 1)) - 48)
    + (int_of_char (Lexing.lexeme_char lexbuf (i + 2)) - 48)
  in
  if Compare.Int.(c < 0 || c > 255) then
    raise @@ Illegal_escape (pos2 (curr_location lexbuf), Lexing.lexeme lexbuf)
  else char_of_int c

(** Character whose code is given by the two hexadecimal digits found at
    indexes [i] and [i+1] of the current lexeme. *)
let char_for_hexadecimal_code lexbuf i =
  (* Codes from 97 up are lowercase letters, from 65 up are uppercase
     letters, anything else is treated as a decimal digit. *)
  let digit_value d =
    if Compare.Int.(d >= 97) then d - 87
    else if Compare.Int.(d >= 65) then d - 55
    else d - 48
  in
  let val1 = digit_value (int_of_char (Lexing.lexeme_char lexbuf i)) in
  let val2 = digit_value (int_of_char (Lexing.lexeme_char lexbuf (i + 1))) in
  char_of_int ((val1 * 16) + val2)

(** Lexer state *)
type state = {
  (* Stack of layout contexts, innermost first.  Each entry holds a
     column and says whether it was pushed by an indented line or by an
     opening brace or parenthesis (kept with its location). *)
  mutable indent_stack:
    (int * [`Indent | `Open of (char * (Lexing.position * Lexing.position))])
    list;
  (* Tokens already produced and waiting to be returned by [token]. *)
  mutable buffer: Concrete_parser.token list;
  (* Accumulator for the string literal being lexed, and the number of
     characters stored in it so far. *)
  mutable string_buff: bytes;
  mutable string_index: int;
  (* Location of the quote that opened the string being lexed. *)
  mutable string_start_loc: Lexing.position * Lexing.position;
  (* Locations of the openers of the comments currently open, innermost
     first; empty when the lexer is not inside a comment. *)
  mutable comment_start_loc: (Lexing.position * Lexing.position) list;
}

(** Fresh state: empty stacks and an empty 256-byte string buffer. *)
let init_state () = {
  indent_stack = [];
  buffer = [];
  string_index = 0;
  string_buff = Bytes.create 256;
  string_start_loc = Lexing.dummy_pos, Lexing.dummy_pos;
  comment_start_loc = [];
}

(** String helpers *)

(** Discard the accumulated string and start again from an empty
    256-byte buffer. *)
let reset_string_buffer st =
  st.string_buff <- Bytes.create 256;
  st.string_index <- 0

(** Append [c] to the string buffer, doubling its size when it is full.
    Relies on [string_index] never exceeding the buffer length. *)
let store_string_char st c =
  if st.string_index >= Bytes.length st.string_buff then begin
    let new_buff = Bytes.create (Bytes.length st.string_buff * 2) in
    Bytes.blit st.string_buff 0 new_buff 0 (Bytes.length st.string_buff);
    st.string_buff <- new_buff
  end;
  Bytes.set st.string_buff st.string_index c;
  st.string_index <- st.string_index + 1

(** Append every character of [s] to the string buffer. *)
let store_string st s =
  for i = 0 to String.length s - 1 do
    store_string_char st s.[i]
  done

(** Append the current lexeme to the string buffer. *)
let store_lexeme st lexbuf =
  store_string st (Lexing.lexeme lexbuf)

(** Return the accumulated string and reset the buffer.

    The index is reset together with the buffer: leaving it untouched
    while shrinking the buffer back to 256 bytes would let a later
    [store_string_char] write past the end of the grown buffer. *)
let get_stored_string st =
  let s = Bytes.sub st.string_buff 0 st.string_index in
  reset_string_buffer st;
  Bytes.to_string s

(** Indentation helpers *)

(** True as long as nothing has been pushed on the indentation stack. *)
let first_token st =
  match st.indent_stack with
  | [] -> true
  | _ :: _ -> false

(** Column at which a location starts. *)
let starting_offset (start, _) =
  let open Lexing in
  start.pos_cnum - start.pos_bol

(** Pop entries of [xs] until one sits exactly at column [i], emitting a
    [DEDENT] per popped [`Indent] entry followed by a final [NEWLINE].
    The stack stored in [st] is only updated once the matching entry is
    found.

    Raises [Invalid_indentation] when column [i] matches no enclosing
    level (including when the stack is exhausted), and
    [Invalid_indentation_in_block] or [Unclosed] when an unclosed opener
    is reached first. *)
let rec pop_indent st loc xs i =
  match xs with
  | [] ->
      (* Reachable when the very first token is indented and a later
         line starts further to the left: report it as a layout error
         rather than failing an assertion. *)
      raise @@ Invalid_indentation (pos2 loc)
  | ((x, _) :: _) as xs when Compare.Int.(x = i) ->
      st.indent_stack <- xs;
      [NEWLINE]
  | (x, `Indent) :: xs ->
      if Compare.Int.(x > i) then DEDENT :: pop_indent st loc xs i
      else raise @@ Invalid_indentation (pos2 loc)
  | (_, `Open (c, opener_loc)) :: _ ->
      let opener_offset = starting_offset opener_loc in
      if Compare.Int.(i > opener_offset) then
        raise @@ Invalid_indentation_in_block (pos2 loc, c, pos2 opener_loc)
      else raise @@ Unclosed (pos2 loc, c, pos2 opener_loc)

(** Layout tokens to emit in front of a token that starts a line at
    location [loc]: [INDENT] when it is deeper than the current level
    (the new level is pushed), [NEWLINE] when it is at the same level,
    and a run of [DEDENT]s closed by a [NEWLINE] when it is shallower.
    Raises the same exceptions as [pop_indent]. *)
let indent_token st loc =
  let i = starting_offset loc in
  match st.indent_stack with
  | (x, `Indent) :: xs when Compare.Int.(x > i) ->
      DEDENT :: pop_indent st loc xs i
  | (x, `Open (c, opener_loc)) :: _ when Compare.Int.(x > i) ->
      let opener_offset = starting_offset opener_loc in
      if Compare.Int.(i > opener_offset) then
        raise @@ Invalid_indentation_in_block (pos2 loc, c, pos2 opener_loc)
      else raise @@ Unclosed (pos2 loc, c, pos2 opener_loc)
  | (x, _) :: _ when Compare.Int.(x = i) ->
      [NEWLINE]
  | [] | (_, _) :: _ (* when Compare.Int.(x < i) *) ->
      st.indent_stack <- (i, `Indent) :: st.indent_stack;
      [INDENT]

(** Register the opening delimiter [opener] found at [opener_loc], whose
    first inner token sits at column [token_offset], and return the
    matching token.
    Raises [Invalid_indentation_after_opener] unless the inner token is
    strictly to the right of the opener. *)
let open_block st opener opener_loc token_offset =
  let opener_offset = starting_offset opener_loc in
  if Compare.Int.(token_offset <= opener_offset) then
    raise @@ Invalid_indentation_after_opener (pos2 opener_loc, opener);
  st.indent_stack <-
    (token_offset, `Open (opener, opener_loc)) :: st.indent_stack;
  match opener with
  | '{' -> [LBRACE]
  | '(' -> [LPAREN]
  | _ -> assert false

(** Close the innermost open block with [closer] found at [closer_loc]:
    one [DEDENT] per indentation level still open inside the block, then
    the closing token.  [bol] tells whether the closer starts its line.

    Raises [Unopened] when no block is open, [Unaligned_closer] when a
    closer starting a line is not in the column of its opener, and
    [Unclosed] when the closer does not match the opener. *)
let close_block st bol closer closer_loc =
  let closer_offset = starting_offset closer_loc in
  let rec pop xs =
    match xs with
    | [] -> raise @@ Unopened (pos2 closer_loc, closer)
    | (_, `Indent) :: xs -> DEDENT :: pop xs
    | (_, `Open (opener, opener_loc)) :: xs ->
        let opener_offset = starting_offset opener_loc in
        if bol && Compare.Int.(opener_offset <> closer_offset) then
          raise @@
          Unaligned_closer (pos2 closer_loc, opener, closer, pos2 opener_loc);
        st.indent_stack <- xs;
        [ match opener, closer with
          | '{', '}' -> RBRACE
          | '(', ')' -> RPAREN
          | _ -> raise @@ Unclosed (pos2 closer_loc, opener, pos2 opener_loc) ]
  in
  pop st.indent_stack
}

let eol_comment = '#' [^ '\010'] *
let newline = eol_comment ? ('\010' | "\013\010" )
let space = [' ']
let firstidentchar = ['A'-'Z' 'a'-'z' '_']
let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
let decimal_literal = ['0'-'9'] ['0'-'9' '_']*
let hex_literal =
  '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']['0'-'9' 'A'-'F' 'a'-'f' '_']*
let oct_literal = '0' ['o' 'O'] ['0'-'7'] ['0'-'7' '_']*
let bin_literal = '0' ['b' 'B'] ['0'-'1'] ['0'-'1' '_']*
let int_literal =
  '-' ? ( decimal_literal | hex_literal | oct_literal | bin_literal)

(** Layout lexer.  Skips spaces, line ends and comments, counting in
    [nl] the line ends crossed, and returns the layout tokens that come
    before the next raw token (possibly none). *)
rule indent_tokens st nl = parse
  | space { indent_tokens st nl lexbuf }
  | newline { Lexing.new_line lexbuf; indent_tokens st (nl + 1) lexbuf }
  | "" { (* The next token starts a line if a line end was crossed or if
            it is the first token seen. *)
         let bol = Compare.Int.(nl <> 0) || first_token st in
         if bol then indent_token st (curr_location lexbuf) else [] }
  | "/*" { st.comment_start_loc <- [curr_location lexbuf];
           comment st nl lexbuf }
  | ('{' | '(' as opener)
      { let opener_loc = curr_location lexbuf in
        (* Look ahead to the column of the first token of the block. *)
        let token_offset = next_token_indent st lexbuf in
        let bol = Compare.Int.(nl <> 0) || first_token st in
        let prefix = if bol then indent_token st opener_loc else [] in
        prefix @ open_block st opener opener_loc token_offset }
  | ('}' | ')' as closer)
      { let closer_loc = curr_location lexbuf in
        let bol = Compare.Int.(nl <> 0) in
        close_block st bol closer closer_loc }
  | eof
      { (* Close every indentation level still open; an opener left on
           the stack is an error. *)
        List.map
          (function
            | (_, `Indent) -> DEDENT
            | (_, `Open (c, loc)) ->
                raise @@ Unclosed (pos2 (curr_location lexbuf), c, pos2 loc))
          st.indent_stack
        @ [EOF] }

(** Body of a (possibly nested) comment.  Resumes [indent_tokens] once
    the outermost comment is closed. *)
and comment st nl = parse
  | "/*" { st.comment_start_loc <-
             curr_location lexbuf :: st.comment_start_loc;
           comment st nl lexbuf }
  | "*/" { match st.comment_start_loc with
           | [] -> assert false
           | [_] ->
               (* Outermost comment closed: forget it, otherwise the
                  string rule would keep believing it is inside a
                  comment and stop rejecting raw line ends. *)
               st.comment_start_loc <- [];
               indent_tokens st nl lexbuf
           | _ :: xs ->
               st.comment_start_loc <- xs;
               comment st nl lexbuf }
  | "\"" { (* The contents of a string inside a comment are thrown away;
              start from an empty buffer so that they do not pile up. *)
           reset_string_buffer st;
           st.string_start_loc <- curr_location lexbuf;
           let nl =
             try string st nl lexbuf
             with Unterminated_string str_start ->
               match st.comment_start_loc with
               | [] -> assert false
               | loc :: _ ->
                   (* [loc] is the innermost comment opener, [start] the
                      outermost one. *)
                   let start = List.hd (List.rev st.comment_start_loc) in
                   raise @@
                   Unterminated_string_in_comment
                     (pos2 loc, pos2 start, str_start)
           in
           comment st nl lexbuf }
  | newline { Lexing.new_line lexbuf; comment st (nl + 1) lexbuf }
  | eof { match st.comment_start_loc with
          | [] -> assert false
          | loc :: _ ->
              let start = List.hd (List.rev st.comment_start_loc) in
              raise @@ Unterminated_comment (pos2 loc, pos2 start) }
  | _ { comment st nl lexbuf }

(** Eat spacings and return the next token offset. *)
and next_token_indent st = parse
  | space { next_token_indent st lexbuf }
  | newline { Lexing.new_line lexbuf; next_token_indent st lexbuf }
  | "" { end_offset lexbuf }

(** The lexer for non-indentation tokens.
    It should not care about 'space', 'newline', '{}()' nor comments. *)
and raw_token st = parse
  | ";" { SEMICOLON }
  | firstidentchar identchar * { PRIM (Lexing.lexeme lexbuf) }
  | int_literal { INT (Lexing.lexeme lexbuf) }
  | "\""
      { reset_string_buffer st;
        let string_start = lexbuf.Lexing.lex_start_p in
        st.string_start_loc <- curr_location lexbuf;
        ignore (string st 0 lexbuf);
        (* Make the token location start at the opening quote. *)
        lexbuf.Lexing.lex_start_p <- string_start;
        STRING (get_stored_string st) }
  | _
      { raise
          (Illegal_character
             (pos2 (curr_location lexbuf), Lexing.lexeme_char lexbuf 0)) }

(** Body of a string literal, up to and including the closing quote.
    Decoded characters are appended to the string buffer of [st]; the
    result is [nl] plus the number of raw line ends crossed.

    Raises [Newline_in_string] on a raw line end when not inside a
    comment, and [Unterminated_string] at end of input. *)
and string st nl = parse
  | '"' { nl }
  | '\\' newline ([' ' '\t'] * as space)
      { (* Escaped line end: skip it together with the leading blanks of
           the next line. *)
        update_loc lexbuf 1 (String.length space);
        string st nl lexbuf }
  | '\\' ['\\' '\'' '"' 'n' 't' 'b' 'r' ' ']
      { store_string_char st
          (char_for_backslash (Lexing.lexeme_char lexbuf 1));
        string st nl lexbuf }
  | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
      { store_string_char st (char_for_decimal_code lexbuf 1);
        string st nl lexbuf }
  | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F']
      { store_string_char st (char_for_hexadecimal_code lexbuf 2);
        string st nl lexbuf }
  | newline
      { match st.comment_start_loc with
        | [] -> raise @@ Newline_in_string (pos2 (curr_location lexbuf))
        | _ ->
            (* Inside a comment a string may span several lines; the
               line end itself is not stored. *)
            Lexing.new_line lexbuf;
            string st (nl + 1) lexbuf }
  | eof { raise @@ Unterminated_string (pos2 st.string_start_loc) }
  | _ { store_string_char st (Lexing.lexeme_char lexbuf 0);
        string st nl lexbuf }

{
  (** Next token of [lexbuf].  Buffered layout tokens are returned
      first; otherwise the layout lexer runs, and a raw token is read
      when it produces nothing. *)
  let rec token st lexbuf =
    match st.buffer with
    | tok :: tokens ->
        st.buffer <- tokens;
        tok
    | [] ->
        match indent_tokens st 0 lexbuf with
        | [] -> raw_token st lexbuf
        | _ :: _ as tokens ->
            st.buffer <- tokens;
            token st lexbuf
}