From 6ce6ebfec3c9d1f4f5f7ae9ce0716e85fa5e7a33 Mon Sep 17 00:00:00 2001 From: Christian Rinderknecht Date: Tue, 28 Apr 2020 21:17:34 +0200 Subject: [PATCH] Simplified the interface of the lexer by not exporting the scanner [init] (which reads the BOM, if any). --- NOTE(review): the reference [first_call] used to be allocated once per call to [open_token_stream] (LexerLib.ml); with this patch it is allocated once inside the functor [Make] (Lexer.mll). As a result, only the very first call to [scan] of a given lexer instance is dispatched to [init], whichever token stream it belongs to. Please confirm that no stream opened later with the same lexer instance needs the BOM handling. src/passes/1-parser/shared/Lexer.mli | 4 +-- src/passes/1-parser/shared/Lexer.mll | 12 +++++-- src/passes/1-parser/shared/LexerLib.ml | 40 ++++++++++-------------- src/passes/1-parser/shared/LexerLib.mli | 14 ++++----- src/passes/1-parser/shared/LexerLog.ml | 1 - src/passes/1-parser/shared/LexerUnit.ml | 1 - src/passes/1-parser/shared/ParserUnit.ml | 1 - 7 files changed, 33 insertions(+), 40 deletions(-) diff --git a/src/passes/1-parser/shared/Lexer.mli b/src/passes/1-parser/shared/Lexer.mli index c923e1505..d3993fec8 100644 --- a/src/passes/1-parser/shared/Lexer.mli +++ b/src/passes/1-parser/shared/Lexer.mli @@ -119,10 +119,8 @@ module type S = module Token : TOKEN type token = Token.token - (* The scanner [init] is meant to be called first to read the - BOM. Then [scan] is called. 
*) + (* The scanner (its first call also reads the BOM, if any) *) - val init : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state (* Errors (specific to the generic lexer, not to the tokens) *) diff --git a/src/passes/1-parser/shared/Lexer.mll b/src/passes/1-parser/shared/Lexer.mll index feb179b8a..dcfe3d91c 100644 --- a/src/passes/1-parser/shared/Lexer.mll +++ b/src/passes/1-parser/shared/Lexer.mll @@ -85,8 +85,8 @@ module type S = module Token : TOKEN type token = Token.token - val init : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state - val scan : token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state + val scan : + token LexerLib.state -> Lexing.lexbuf -> token LexerLib.state type error @@ -601,6 +601,14 @@ and scan_utf8_inline thread state = parse { (* START TRAILER *) + +let scan = + let first_call = ref true in + fun state lexbuf -> + if !first_call + then (first_call := false; init state lexbuf) + else scan state lexbuf + end (* of functor [Make] in HEADER *) (* END TRAILER *) } diff --git a/src/passes/1-parser/shared/LexerLib.ml b/src/passes/1-parser/shared/LexerLib.ml index 308e228df..e6ba62b9e 100644 --- a/src/passes/1-parser/shared/LexerLib.ml +++ b/src/passes/1-parser/shared/LexerLib.ml @@ -151,11 +151,6 @@ let mk_thread region lexeme : thread = the scanning rule [scan]). The function [patch_buffer] is, of course, also called just before returning the token, so the parser has a view of the lexing buffer consistent with the token. - - Note that an additional reference [first_call] is needed to - distinguish the first call to the function [scan], as the first - scanning rule is actually [init] (which can handle the BOM), not - [scan]. 
*) type 'token window = @@ -319,14 +314,13 @@ let lexbuf_from_input = function in Ok (lexbuf, close) with Sys_error msg -> Stdlib.Error (File_opening msg) -let open_token_stream ?line ?block ~init ~scan +let open_token_stream ?line ?block ~scan ~token_to_region ~style input = let file_path = match input with File path -> path | _ -> "" in let pos = Pos.min ~file:file_path in let buf_reg = ref (pos#byte, pos#byte) - and first_call = ref true and decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in let supply = Uutf.Manual.src decoder in let state = ref (mk_state @@ -354,33 +348,31 @@ let open_token_stream ?line ?block ~init ~scan and save_region buffer = buf_reg := Lexing.(buffer.lex_start_p, buffer.lex_curr_p) in - let scan' init scan buffer = + let scan' scan buffer = patch_buffer !buf_reg buffer; - (if !first_call - then (state := init !state buffer; first_call := false) - else state := scan !state buffer); + state := scan !state buffer; save_region buffer in - let next_token init scan buffer = - scan' init scan buffer; + let next_token scan buffer = + scan' scan buffer; match FQueue.peek !state#units with None -> None | Some (units, ext_token) -> state := !state#set_units units; Some ext_token in - let rec read init scan ~token_to_region ~style ~log buffer = + let rec read scan ~token_to_region ~style ~log buffer = match FQueue.deq !state#units with None -> - scan' init scan buffer; - read init scan ~token_to_region ~style ~log buffer + scan' scan buffer; + read scan ~token_to_region ~style ~log buffer | Some (units, (left_mark, token)) -> - log left_mark token; - state := ((!state#set_units units) - #set_last (token_to_region token)) - #slide_token token; - style token (next_token init scan) buffer; - patch_buffer (token_to_region token)#byte_pos buffer; - token in + log left_mark token; + state := ((!state#set_units units) + #set_last (token_to_region token)) + #slide_token token; + style token (next_token scan) buffer; + patch_buffer (token_to_region 
token)#byte_pos buffer; + token in match lexbuf_from_input input with Ok (buffer, close) -> @@ -389,7 +381,7 @@ let open_token_stream ?line ?block ~init ~scan File path when path <> "" -> reset ~file:path buffer | _ -> () in let instance = { - read = read init scan ~token_to_region ~style; + read = read scan ~token_to_region ~style; input; buffer; get_win; get_pos; get_last; get_file; close} in Ok instance | Error _ as e -> e diff --git a/src/passes/1-parser/shared/LexerLib.mli b/src/passes/1-parser/shared/LexerLib.mli index c5749b717..95f580f2a 100644 --- a/src/passes/1-parser/shared/LexerLib.mli +++ b/src/passes/1-parser/shared/LexerLib.mli @@ -155,17 +155,16 @@ type 'token state = < The type [window] is a two-token window, that is, a buffer that contains the last recognised token, and the penultimate (if any). - The call [read ?line ?block ~init ~scan ~token_to_region ~style + The call [read ?line ?block ~scan ~token_to_region ~style input] evaluates in a lexer (also known as a tokeniser or scanner) whose type is [log:('token logger) -> Lexing.lexbuf -> 'token], and suitable for a parser generated by Menhir. The argument labelled [log] is a logger, that is, it may print a token and its left - markup to a given channel, at the caller's discretion. The argument - labelled [~init] is the scanner to be called first, usually for - reading the BOM, then [scan] is used for the following calls. The - function labelled [~style] is used to check stylistic constraints - on the tokens and the markup between them. -*) + markup to a given channel, at the caller's discretion. The function + labelled [~scan] is the main scanner of the lexer. The function + labelled [~style] is used to check stylistic constraints on the + tokens and the markup between them. 
+ *) type input = File of file_path @@ -194,7 +193,6 @@ val lexbuf_from_input : val open_token_stream : ?line:EvalOpt.line_comment -> ?block:EvalOpt.block_comment -> - init:('token state -> Lexing.lexbuf -> 'token state) -> scan:('token state -> Lexing.lexbuf -> 'token state) -> token_to_region:('token -> Region.t) -> style:('token -> diff --git a/src/passes/1-parser/shared/LexerLog.ml b/src/passes/1-parser/shared/LexerLog.ml index 6fe02527c..8545774ee 100644 --- a/src/passes/1-parser/shared/LexerLog.ml +++ b/src/passes/1-parser/shared/LexerLog.ml @@ -69,7 +69,6 @@ module Make (Lexer: Lexer.S) : (S with module Lexer = Lexer) = ~token_to_region ~style input command : (unit, string Region.reg) Stdlib.result = match LexerLib.open_token_stream - ~init:Lexer.init ~scan:Lexer.scan ~token_to_region ~style diff --git a/src/passes/1-parser/shared/LexerUnit.ml b/src/passes/1-parser/shared/LexerUnit.ml index 836c8db98..c6ab7c527 100644 --- a/src/passes/1-parser/shared/LexerUnit.ml +++ b/src/passes/1-parser/shared/LexerUnit.ml @@ -43,7 +43,6 @@ module Make (IO: IO) (Lexer: Lexer.S) = match LexerLib.open_token_stream ?line:IO.options#line ?block:IO.options#block - ~init:Lexer.init ~scan:Lexer.scan ~token_to_region:Lexer.Token.to_region ~style:Lexer.Token.check_right_context diff --git a/src/passes/1-parser/shared/ParserUnit.ml b/src/passes/1-parser/shared/ParserUnit.ml index d34c183d4..dfaa888c7 100644 --- a/src/passes/1-parser/shared/ParserUnit.ml +++ b/src/passes/1-parser/shared/ParserUnit.ml @@ -234,7 +234,6 @@ module Make (Lexer: Lexer.S) let () = close () in let input' = LexerLib.String (Buffer.contents buffer) in match LexerLib.open_token_stream - ~init:Lexer.init ~scan:Lexer.scan ~token_to_region:Lexer.Token.to_region ~style:Lexer.Token.check_right_context