396 lines
13 KiB
OCaml
396 lines
13 KiB
OCaml
module Region = Simple_utils.Region
|
|
module Pos = Simple_utils.Pos
|
|
|
|
(* LEXER ENGINE *)
|
|
|
|
(* Resetting the file name, line number and horizontal offset in the lexing buffer
|
|
|
|
   The call [reset ?file ?line ?offset buffer] modifies in-place the lexing
|
|
buffer [buffer] so the lexing engine records that the file
|
|
associated with [buffer] is named [file], and the current line is
|
|
[line]. This function is useful when lexing a file that has been
|
|
previously preprocessed by the C preprocessor, in which case the
|
|
argument [file] is the name of the file that was preprocessed,
|
|
_not_ the preprocessed file (of which the user is not normally
|
|
aware). By default, the [line] argument is [1].
|
|
*)
|
|
|
|
(* [reset_file ~file buffer] records in [buffer] that the source
   being lexed is the file named [file]. *)

let reset_file ~file buffer =
  let open Lexing in
  let curr = buffer.lex_curr_p in
  buffer.lex_curr_p <- {curr with pos_fname = file}
|
|
|
|
(* [reset_line ~line buffer] records in [buffer] that the current
   line is [line]. The line number must be non-negative. *)

let reset_line ~line buffer =
  let () = assert (line >= 0) in
  let open Lexing in
  let curr = buffer.lex_curr_p in
  buffer.lex_curr_p <- {curr with pos_lnum = line}
|
|
|
|
(* [reset_offset ~offset buffer] sets the current character number in
   [buffer] to [offset] bytes past the beginning of the current line.
   The offset must be non-negative. *)

let reset_offset ~offset buffer =
  let () = assert (offset >= 0) in
  let open Lexing in
  let curr = buffer.lex_curr_p in
  buffer.lex_curr_p <- {curr with pos_cnum = curr.pos_bol + offset}
|
|
|
|
(* [reset ?file ?line ?offset buffer] applies in-place to [buffer]
   whichever of the three resets were requested; absent arguments
   leave the corresponding field of the buffer untouched. *)

let reset ?file ?line ?offset buffer =
  (match file with
     None -> ()
   | Some file -> reset_file ~file buffer);
  (match line with
     None -> ()
   | Some line -> reset_line ~line buffer);
  match offset with
    None -> ()
  | Some offset -> reset_offset ~offset buffer
|
|
|
|
(* Rolling back one lexeme _within the current semantic action_ *)
|
|
|
|
(* [rollback buffer] puts the lexing engine back by the width of the
   current lexeme, so it will be scanned again. Only meaningful
   within the semantic action that matched that lexeme. *)

let rollback buffer =
  let open Lexing in
  let width = String.length (lexeme buffer) in
  let () = buffer.lex_curr_pos <- buffer.lex_curr_pos - width in
  let cnum = buffer.lex_curr_p.pos_cnum - width in
  buffer.lex_curr_p <- {buffer.lex_curr_p with pos_cnum = cnum}
|
|
|
|
(* Utility types *)
|
|
|
|
type file_path = string  (* Name of a source file *)
type lexeme = string     (* Concrete text matched by a scanning rule *)
|
|
|
|
(* THREAD FOR STRUCTURED CONSTRUCTS (STRINGS, COMMENTS) *)
|
|
|
|
(* A [thread] accumulates the characters of a structured construct
   (string literal or comment) while it is being scanned. The field
   [opening] is the region of the construct's opening marker;
   [length] is the number of characters gathered so far, and [acc]
   holds those characters in reverse order. All updates are
   functional: [push_char], [push_string] and [set_opening] return a
   new thread; [to_string] renders the accumulated characters in
   source order. *)

type thread = <
  opening : Region.t;
  length : int;
  acc : char list;
  to_string : string;
  push_char : char -> thread;
  push_string : string -> thread;
  set_opening : Region.t -> thread
>
|
|
|
|
(* [mk_thread region lexeme] makes a thread whose opening region is
   [region] and whose initial contents are the characters of
   [lexeme]. *)

let mk_thread region lexeme : thread =
  (* The call [explode s a] is the list made by pushing the characters
     in the string [s] on top of [a], in reverse order. For example,
     [explode "ba" ['c';'d'] = ['a'; 'b'; 'c'; 'd']].

     This version is tail-recursive: consing [s.[0]], [s.[1]], ...
     onto [acc] in increasing index order yields the characters of
     [s] reversed on top of [acc], without risking a stack overflow
     on very long lexemes (e.g. huge comments or string literals),
     unlike a naive right-to-left recursion. *)

  let explode s acc =
    let rec push i acc =
      if i = String.length s then acc
      else push (i+1) (s.[i] :: acc)
    in push 0 acc in
  object
    val opening = region
    method opening = opening

    val length = String.length lexeme
    method length = length

    (* Characters accumulated so far, in reverse order *)
    val acc = explode lexeme []
    method acc = acc

    method set_opening opening = {< opening; length; acc >}

    method push_char char =
      {< opening; length=length+1; acc=char::acc >}

    method push_string str =
      {< opening;
         length = length + String.length str;
         acc = explode str acc >}

    (* The value of [thread#to_string] is a string of length
       [thread#length] containing the [thread#length] characters in
       the list [thread#acc], in reverse order. For instance,
       [thread#to_string = "abc"] if [thread#length = 3] and
       [thread#acc = ['c';'b';'a']]. *)

    method to_string =
      let bytes = Bytes.make length ' ' in
      (* [fill] walks [acc] (reversed characters) while the index [i]
         walks the byte buffer from the end backwards, so the result
         is in source order. The recursive call is in tail position. *)
      let rec fill i = function
        [] -> bytes
      | char::l -> Bytes.set bytes i char; fill (i-1) l
      in fill (length-1) acc |> Bytes.to_string
  end
|
|
|
|
(* STATE *)
|
|
|
|
(* Scanning the lexing buffer for tokens (and markup, as a
|
|
side-effect).
|
|
|
|
Because we want the lexer to have access to the right lexical
|
|
context of a recognised lexeme (to enforce stylistic constraints or
|
|
report special error patterns), we need to keep a hidden reference
|
|
to a queue of recognised lexical units (that is, tokens and markup)
|
|
that acts as a mutable state between the calls to [read]. When
|
|
[read] is called, that queue is examined first and, if it contains
|
|
at least one token, that token is returned; otherwise, the lexing
|
|
buffer is scanned for at least one more new token. That is the
|
|
general principle: we put a high-level buffer (our queue) on top of
|
|
the low-level lexing buffer.
|
|
|
|
One tricky and important detail is that we must make any parser
|
|
generated by Menhir (and calling [read]) believe that the last
|
|
region of the input source that was matched indeed corresponds to
|
|
the returned token, despite that many tokens and markup may have
|
|
been matched since it was actually read from the input. In other
|
|
words, the parser requests a token that is taken from the
|
|
high-level buffer, but the parser requests the source regions from
|
|
the _low-level_ lexing buffer, and they may disagree if more than
|
|
one token has actually been recognised.
|
|
|
|
Consequently, in order to maintain a consistent view for the
|
|
parser, we have to patch some fields of the lexing buffer, namely
|
|
[lex_start_p] and [lex_curr_p], as these fields are read by parsers
|
|
generated by Menhir when querying source positions (regions). This
|
|
is the purpose of the function [patch_buffer]. After reading one or
|
|
more tokens and markup by the scanning rule [scan], we have to save
|
|
in the hidden reference [buf_reg] the region of the source that was
|
|
matched by [scan]. This atomic sequence of patching, scanning and
|
|
saving is implemented by the _function_ [scan] (beware: it shadows
|
|
the scanning rule [scan]). The function [patch_buffer] is, of
|
|
course, also called just before returning the token, so the parser
|
|
has a view of the lexing buffer consistent with the token.
|
|
|
|
Note that an additional reference [first_call] is needed to
|
|
distinguish the first call to the function [scan], as the first
|
|
scanning rule is actually [init] (which can handle the BOM), not
|
|
[scan].
|
|
*)
|
|
|
|
(* A window over the last tokens returned to the parser: [Nil] before
   any token has been read, [One t] after the first, and afterwards
   [Two (current, previous)] — see [slide_token] in [mk_state].
   Presumably used by clients for context-sensitive error reporting;
   confirm with callers of [get_win]. *)

type 'token window =
  Nil
| One of 'token
| Two of 'token * 'token
|
|
|
|
(* The logical state threaded through the scanner, on top of the
   low-level [Lexing.lexbuf]. All update methods are functional: they
   return a new state. *)

type 'token state = <
  (* Queue of recognised lexical units: each token is paired with the
     markup (comments, whitespace, newlines) that preceded it. *)
  units : (Markup.t list * 'token) FQueue.t;

  (* Markup accumulated since the last token was enqueued; [enqueue]
     flushes it into [units]. *)
  markup : Markup.t list;

  window : 'token window;  (* last one or two tokens returned *)
  last : Region.t;         (* region of the last token read *)
  pos : Pos.t;             (* current position in the source *)

  (* UTF-8 decoding state: a manual-source Uutf decoder and the
     function used to feed it bytes (see [open_token_stream]). *)
  decoder : Uutf.decoder;
  supply : Bytes.t -> int -> int -> unit;

  (* Comment delimiters; [None] means the syntax has no block (resp.
     line) comments. Presumably configured from CLI options
     (EvalOpt) — confirm with callers. *)
  block : EvalOpt.block_comment option;
  line : EvalOpt.line_comment option;

  enqueue : 'token -> 'token state;       (* push token + pending markup *)
  set_units : (Markup.t list * 'token) FQueue.t -> 'token state;
  set_last : Region.t -> 'token state;
  set_pos : Pos.t -> 'token state;
  slide_token : 'token -> 'token state;   (* slide the token window *)

  (* [sync buffer] advances [pos] past the current lexeme and returns
     the lexeme's region, the lexeme, and the updated state. *)
  sync : Lexing.lexbuf -> Region.t * lexeme * 'token state;

  (* Commit a piece of markup to the pending [markup] list. *)
  push_newline : Lexing.lexbuf -> 'token state;
  push_line : thread -> 'token state;
  push_block : thread -> 'token state;
  push_space : Lexing.lexbuf -> 'token state;
  push_tabs : Lexing.lexbuf -> 'token state;
  push_bom : Lexing.lexbuf -> 'token state;
  push_markup : Markup.t -> 'token state;
>
|
|
|
|
(* [mk_state] builds the initial lexer state as an object; every
   "setter" performs a functional update ({< ... >}), returning a new
   state that shares the unchanged fields. *)

let mk_state ~units ~markup ~window ~last ~pos ~decoder ~supply
             ?block ?line () : _ state =
  object (self)
    val units = units
    method units = units
    val markup = markup
    method markup = markup
    val window = window
    method window = window
    val last = last
    method last = last
    val pos = pos
    method pos = pos
    method decoder = decoder
    method supply = supply
    method block = block
    method line = line

    (* Appends [token] together with the markup collected since the
       previous token, then clears the pending markup. *)
    method enqueue token =
      {< units = FQueue.enq (markup, token) units;
         markup = [] >}

    method set_units units = {< units = units >}
    method set_last region = {< last = region >}
    method set_pos pos = {< pos = pos >}

    (* Slides the two-token window: the new token becomes the current
       one and the previous current token is kept as second. *)
    method slide_token token =
      match self#window with
        Nil -> {< window = One token >}
      | One t | Two (t,_) -> {< window = Two (token,t) >}

    (* Advances [pos] past the lexeme just matched in [buffer] and
       returns its region, the lexeme itself and the updated state. *)
    method sync buffer =
      let lex = Lexing.lexeme buffer in
      let len = String.length lex in
      let start = pos in
      let stop = start#shift_bytes len in
      let state = {< pos = stop >}
      in Region.make ~start ~stop, lex, state

    (* MARKUP *)

    (* Committing markup to the current logical state *)

    method push_markup unit = {< markup = unit :: markup >}

    (* Records a newline both in the low-level lexing buffer (for
       Menhir-visible positions) and in the logical position [pos]. *)
    method push_newline buffer =
      let () = Lexing.new_line buffer in
      let value = Lexing.lexeme buffer in
      let start = self#pos in
      let stop = start#new_line value in
      let region = Region.make ~start ~stop in
      let unit = Markup.Newline Region.{region; value}
      in {< pos = stop; markup = unit::markup >}

    (* Commits a complete line comment held in [thread]; its region
       spans from the comment's opening to the current position. *)
    method push_line thread =
      let start = thread#opening#start in
      let region = Region.make ~start ~stop:self#pos
      and value = thread#to_string in
      let unit = Markup.LineCom Region.{region; value}
      in {< markup = unit::markup >}

    (* Same as [push_line], for a complete block comment. *)
    method push_block thread =
      let start = thread#opening#start in
      let region = Region.make ~start ~stop:self#pos
      and value = thread#to_string in
      let unit = Markup.BlockCom Region.{region; value}
      in {< markup = unit::markup >}

    (* Commits a run of spaces; the markup value is its length. *)
    method push_space buffer =
      let region, lex, state = self#sync buffer in
      let value = String.length lex in
      let unit = Markup.Space Region.{region; value}
      in state#push_markup unit

    (* Commits a run of tabulations; the markup value is its length. *)
    method push_tabs buffer =
      let region, lex, state = self#sync buffer in
      let value = String.length lex in
      let unit = Markup.Tabs Region.{region; value}
      in state#push_markup unit

    (* Commits a byte-order mark matched at the start of the input. *)
    method push_bom buffer =
      let region, value, state = self#sync buffer in
      let unit = Markup.BOM Region.{region; value}
      in state#push_markup unit
  end
|
|
|
|
(* LEXER INSTANCE *)
|
|
|
|
(* The possible sources for the lexer: a named file, an in-memory
   string, an already-open channel, or a pre-built lexing buffer. *)

type input =
  File of file_path
| String of string
| Channel of in_channel
| Buffer of Lexing.lexbuf
|
|
|
|
(* A logger is called by [read] with the markup preceding a token and
   the token itself, each time a token is handed to the parser. *)

type 'token logger = Markup.t list -> 'token -> unit

(* A lexer instance: the [read] function to be given to a parser,
   plus accessors over the hidden state (see [open_token_stream]). *)

type 'token instance = {
  input : input;                 (* the source being lexed *)
  read : log:('token logger) -> Lexing.lexbuf -> 'token;
  buffer : Lexing.lexbuf;        (* the low-level lexing buffer *)
  get_win : unit -> 'token window;   (* last tokens returned *)
  get_pos : unit -> Pos.t;           (* current logical position *)
  get_last : unit -> Region.t;       (* region of the last token *)
  get_file : unit -> file_path;      (* source file name ("" if none) *)
  close : unit -> unit           (* closes the input channel, if any *)
}

(* Failure to open the source file *)

type open_err = File_opening of string
|
|
|
|
(* [lexbuf_from_input input] makes a lexing buffer from [input],
   paired with a closing function (a no-op unless a channel has to be
   closed). Opening a file may fail with [Error (File_opening msg)];
   in that case the file name is also recorded in the buffer. *)

let lexbuf_from_input = function
  File path -> (
    try
      let chan = open_in path in
      let lexbuf = Lexing.from_channel chan in
      let () =
        let open Lexing in
        lexbuf.lex_curr_p <- {lexbuf.lex_curr_p with pos_fname=path}
      in Ok (lexbuf, fun () -> close_in chan)
    with Sys_error msg -> Stdlib.Error (File_opening msg))
| String s -> Ok (Lexing.from_string s, fun () -> ())
| Channel chan -> Ok (Lexing.from_channel chan, fun () -> close_in chan)
| Buffer b -> Ok (b, fun () -> ())
|
|
|
|
(* [open_token_stream] builds a lexer instance around the scanning
   rules [init] (first call only, so it can handle a BOM) and [scan].
   The hidden references [state], [buf_reg] and [first_call] carry
   the high-level token queue and the bookkeeping needed to keep the
   low-level buffer consistent with the tokens handed to the parser
   (see the STATE comment above). *)

let open_token_stream ?line ?block ~init ~scan
                      ~token_to_region ~style input =
  let file_path = match input with
                    File path -> path
                  | _ -> "" in
  let pos = Pos.min ~file:file_path in
  (* [buf_reg] is the last region matched by the scanning rules;
     [first_call] distinguishes the first call (rule [init]). *)
  let buf_reg = ref (pos#byte, pos#byte)
  and first_call = ref true
  and decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in
  let supply = Uutf.Manual.src decoder in
  let state = ref (mk_state
                     ~units:FQueue.empty
                     ~last:Region.ghost
                     ~window:Nil
                     ~pos
                     ~markup:[]
                     ~decoder
                     ~supply
                     ?block
                     ?line
                     ()) in
  (* Accessors over the hidden state, exported in the instance. *)
  let get_pos () = !state#pos
  and get_last () = !state#last
  and get_win () = !state#window
  and get_file () = file_path in

  (* [patch_buffer] overwrites the positions Menhir reads from the
     lexing buffer, so they match the token actually returned, not
     the last lexeme scanned. The file name is preserved. *)
  let patch_buffer (start, stop) buffer =
    let open Lexing in
    let file_path = buffer.lex_curr_p.pos_fname in
    buffer.lex_start_p <- {start with pos_fname = file_path};
    buffer.lex_curr_p <- {stop with pos_fname = file_path}

  and save_region buffer =
    buf_reg := Lexing.(buffer.lex_start_p, buffer.lex_curr_p) in

  (* Atomic patch / scan / save sequence (see the STATE comment);
     this function shadows the scanning rule [scan]. *)
  let scan' init scan buffer =
    patch_buffer !buf_reg buffer;
    (if !first_call
     then (state := init !state buffer; first_call := false)
     else state := scan !state buffer);
    save_region buffer in

  (* Scans ahead and peeks the next token without removing its
     markup pairing from the queue; used by the [style] checker. *)
  let next_token init scan buffer =
    scan' init scan buffer;
    match FQueue.peek !state#units with
      None -> None
    | Some (units, ext_token) ->
        state := !state#set_units units; Some ext_token in

  (* [read] dequeues one token, logging its markup, updating the
     window and last region, running the stylistic check, and
     patching the buffer so the parser sees this token's region.
     If the queue is empty, it scans for more units and retries. *)
  let rec read init scan ~token_to_region ~style ~log buffer =
    match FQueue.deq !state#units with
      None ->
        scan' init scan buffer;
        read init scan ~token_to_region ~style ~log buffer
    | Some (units, (left_mark, token)) ->
        log left_mark token;
        state := ((!state#set_units units)
                    #set_last (token_to_region token))
                   #slide_token token;
        style token (next_token init scan) buffer;
        patch_buffer (token_to_region token)#byte_pos buffer;
        token in

  match lexbuf_from_input input with
    Ok (buffer, close) ->
      (* Record the file name in the buffer when lexing a file. *)
      let () =
        match input with
          File path when path <> "" -> reset ~file:path buffer
        | _ -> () in
      let instance = {
        read = read init scan ~token_to_region ~style;
        input; buffer; get_win; get_pos; get_last; get_file; close}
      in Ok instance
  | Error _ as e -> e
|