Data_encoding: optimize binary serialization

This commit is contained in:
Milo Davis 2017-12-04 16:35:48 +01:00 committed by Benjamin Canou
parent b2918c1387
commit 82dcd5c179
8 changed files with 470 additions and 46 deletions

View File

@ -36,6 +36,15 @@ let apply ?(error=No_case_matched) fs v =
| None -> loop fs in
loop fs
let apply_map ?(error=No_case_matched) f fs v =
let rec loop = function
| [] -> raise error
| x :: fs ->
match (f x) v with
| Some l -> l
| None -> loop fs in
loop fs
module Size = struct
let bool = 1
let int8 = 1
@ -131,7 +140,7 @@ type 'a desc =
| Float : float desc
| Bytes : Kind.length -> MBytes.t desc
| String : Kind.length -> string desc
| String_enum : Kind.length * (string * 'a) list -> 'a desc
| String_enum : ('a, string * int) Hashtbl.t * 'a array -> 'a desc
| Array : 'a t -> 'a array desc
| List : 'a t -> 'a list desc
| Obj : 'a field -> 'a desc
@ -211,6 +220,9 @@ let range_to_size ~minimum ~maximum : integer =
then signed_range_to_size minimum maximum
else unsigned_range_to_size (maximum - minimum)
let enum_size arr =
unsigned_range_to_size (Array.length arr)
type 'a encoding = 'a t
let rec classify : type a. a t -> Kind.t = fun e ->
@ -234,7 +246,8 @@ let rec classify : type a. a t -> Kind.t = fun e ->
(* Tagged *)
| Bytes kind -> (kind :> Kind.t)
| String kind -> (kind :> Kind.t)
| String_enum (kind, _) -> (kind :> Kind.t)
| String_enum (_, cases) ->
`Fixed (integer_to_size (enum_size cases))
| Obj (Opt (kind, _, _)) -> (kind :> Kind.t)
| Objs (kind, _, _) -> kind
| Tups (kind, _, _) -> kind
@ -384,7 +397,7 @@ module Json = struct
| RangedFloat { minimum; maximum } -> ranged_float ~minimum ~maximum "rangedFloat"
| String _ -> string (* TODO: check length *)
| Bytes _ -> bytes_jsont (* TODO check length *)
| String_enum (_, l) -> string_enum l
| String_enum (tbl, _) -> string_enum (Hashtbl.fold (fun a (str, _) acc -> (str, a) :: acc) tbl [])
| Array e -> array (get_json e)
| List e -> list (get_json e)
| Obj f -> obj1 (field_json f)
@ -487,7 +500,6 @@ module Encoding = struct
let list e =
check_not_variable "a list" e ;
make @@ List e
let string_enum l = make @@ String_enum (`Variable, l)
end
let dynamic_size e =
@ -520,11 +532,15 @@ module Encoding = struct
let array e = dynamic_size (Variable.array e)
let list e = dynamic_size (Variable.list e)
let string_enum cases =
let arr = Array.of_list (List.map snd cases) in
let tbl = Hashtbl.create (Array.length arr) in
List.iteri (fun ind (str, a) -> Hashtbl.add tbl a (str, ind)) cases ;
make @@ String_enum (tbl, arr)
let conv proj inj ?schema encoding =
make @@ Conv { proj ; inj ; encoding ; schema }
let string_enum l = dynamic_size (Variable.string_enum l)
let describe ?title ?description encoding =
match title, description with
| None, None -> encoding
@ -827,7 +843,8 @@ module Binary = struct
| RangedFloat _ -> fun _ -> Size.float
| Bytes `Fixed n -> fun _ -> n
| String `Fixed n -> fun _ -> n
| String_enum (`Fixed n, _) -> fun _ -> n
| String_enum (_, arr) ->
fun _ -> integer_to_size @@ enum_size arr
| Objs (`Fixed n, _, _) -> fun _ -> n
| Tups (`Fixed n, _, _) -> fun _ -> n
| Union (`Fixed n, _, _) -> fun _ -> n
@ -841,8 +858,9 @@ module Binary = struct
let length2 = length e2 in
fun (v1, v2) -> length1 v1 + length2 v2
| Union (`Dynamic, sz, cases) ->
let tag_size = tag_size sz in
let case_length (Case { encoding = e ; proj }) =
let length v = tag_size sz + length e v in
let length v = tag_size + length e v in
fun v -> Option.map ~f:length (proj v) in
apply (List.map case_length cases)
| Mu (`Dynamic, _name, self) ->
@ -854,13 +872,6 @@ module Binary = struct
| Ignore -> fun _ -> 0
| Bytes `Variable -> MBytes.length
| String `Variable -> String.length
| String_enum (`Variable, l) -> begin
fun v ->
try
let l = List.map (fun (x,y) -> (y,x)) l in
String.length (List.assoc v l)
with Not_found -> raise No_case_matched
end
| Array e ->
let length = length e in
fun v ->
@ -1023,6 +1034,80 @@ module Binary = struct
end
module BufferedWriter = struct
let int8 v buf =
if (v < - (1 lsl 7) || v >= 1 lsl 7) then
invalid_arg "Data_encoding.Binary.Writer.int8" ;
MBytes_buffer.write_int8 buf v
let uint8 v buf =
if (v < 0 || v >= 1 lsl 8) then
invalid_arg "Data_encoding.Binary.Writer.uint8" ;
MBytes_buffer.write_int8 buf v
let char v buf =
MBytes_buffer.write_char buf v
let bool v buf =
uint8 (if v then 255 else 0) buf
let int16 v buf =
if (v < - (1 lsl 15) || v >= 1 lsl 15) then
invalid_arg "Data_encoding.Binary.Writer.int16" ;
MBytes_buffer.write_int16 buf v
let uint16 v buf =
if (v < 0 || v >= 1 lsl 16) then
invalid_arg "Data_encoding.Binary.Writer.uint16" ;
MBytes_buffer.write_int16 buf v
let int31 v buf =
MBytes_buffer.write_int32 buf (Int32.of_int v)
let int32 v buf =
MBytes_buffer.write_int32 buf v
let int64 v buf =
MBytes_buffer.write_int64 buf v
(** write a float64 (double) **)
let float v buf =
MBytes_buffer.write_double buf v
let fixed_kind_bytes length s buf =
MBytes_buffer.write_mbytes buf s 0 length
let variable_length_bytes s buf =
let length = MBytes.length s in
MBytes_buffer.write_mbytes buf s 0 length
let fixed_kind_string length s buf =
if String.length s <> length then invalid_arg "fixed_kind_string";
MBytes_buffer.write_string_data buf s
let variable_length_string s buf =
MBytes_buffer.write_string_data buf s
let write_tag = function
| `Uint8 -> uint8
| `Uint16 -> uint16
end
let rec assoc_snd target = function
| [] -> raise No_case_matched
| (value, hd) :: tl ->
if hd = target
then value
else assoc_snd target tl
let get_string_enum_case tbl v =
try
snd (Hashtbl.find tbl v)
with _ ->
raise No_case_matched
let rec write_rec
: type a. a t -> a -> MBytes.t -> int -> int = fun e ->
let open Writer in
@ -1065,13 +1150,14 @@ module Binary = struct
| String `Variable -> variable_length_string
| Array t -> array (write_rec t)
| List t -> list (write_rec t)
| String_enum (kind, l) -> begin
fun v ->
try
let l = List.map (fun (x,y) -> (y,x)) l in
write_rec (make @@ String kind) (List.assoc v l)
with Not_found -> raise No_case_matched
end
| String_enum (tbl, arr) ->
(fun v ->
let value = get_string_enum_case tbl v in
match enum_size arr with
| `Int64 -> int64 (Int64.of_int value)
| `Uint16 -> uint16 value
| `Uint8 -> uint8 value
| `Int32 -> int32 (Int32.of_int value))
| Obj (Req (_, e)) -> write_rec e
| Obj (Opt (`Dynamic, _, e)) ->
let write = write_rec e in
@ -1100,16 +1186,112 @@ module Binary = struct
int32 (Int32.of_int @@ length v) buf ofs |> write v buf
| Delayed f -> write_rec (f ())
let rec write_rec_buffer
: type a. a encoding -> a -> MBytes_buffer.t -> unit =
fun encoding value buffer ->
let open BufferedWriter in
match encoding.encoding with
| Null -> ()
| Empty -> ()
| Constant _ -> ()
| Ignore -> ()
| Bool -> bool value buffer
| Int8 -> int8 value buffer
| Uint8 -> uint8 value buffer
| Int16 -> int16 value buffer
| Uint16 -> uint16 value buffer
| Int31 -> int31 value buffer
| Int32 -> int32 value buffer
| Int64 -> int64 value buffer
| Float -> float value buffer
| Bytes (`Fixed n) -> fixed_kind_bytes n value buffer
| String (`Fixed n) -> fixed_kind_string n value buffer
| Bytes `Variable -> variable_length_bytes value buffer
| String `Variable -> variable_length_string value buffer
| Array t -> Array.iter (fun x -> write_rec_buffer t x buffer) value
| List t -> List.iter (fun x -> write_rec_buffer t x buffer) value
| RangedInt { minimum ; maximum } ->
if value < minimum || value > maximum
then invalid_arg (Printf.sprintf "Integer %d not in range [%d, %d]."
value minimum maximum) ;
let value = if minimum >= 0 then value - minimum else value in
begin
match range_to_size ~minimum ~maximum with
| `Uint16 -> uint16 value buffer
| `Uint8 -> uint8 value buffer
| `Int8 -> int8 value buffer
| `Int64 -> int64 (Int64.of_int value) buffer
| `Int16 -> int16 value buffer
| `Int32 -> int32 (Int32.of_int value) buffer
end
| RangedFloat { minimum ; maximum } ->
if value < minimum || value > maximum
then invalid_arg (Printf.sprintf "Float %f not in range [%f, %f]."
value minimum maximum) ;
float value buffer
| String_enum (tbl, arr) ->
(match enum_size arr with
| `Uint16 -> BufferedWriter.uint16
| `Uint8 -> BufferedWriter.uint8
| `Int64 -> (fun x -> BufferedWriter.int64 (Int64.of_int x))
| `Int32 -> (fun x -> BufferedWriter.int32 (Int32.of_int x)))
(get_string_enum_case tbl value)
buffer
| Obj (Req (_, e)) -> write_rec_buffer e value buffer
| Obj (Opt (`Dynamic, _, e)) ->
(match value with
| None -> int8 0 buffer
| Some x ->
begin
int8 1 buffer ;
write_rec_buffer e x buffer
end)
| Obj (Opt (`Variable, _, e)) ->
(match value with
| None -> ()
| Some x -> write_rec_buffer e x buffer)
| Obj (Dft (_, e, _)) -> write_rec_buffer e value buffer
| Objs (_, e1, e2) ->
let v1, v2 = value in
write_rec_buffer e1 v1 buffer ;
write_rec_buffer e2 v2 buffer
| Tup e -> write_rec_buffer e value buffer
| Tups (_, e1, e2) ->
let v1, v2 = value in
write_rec_buffer e1 v1 buffer ;
write_rec_buffer e2 v2 buffer
| Conv { encoding = e; proj } ->
write_rec_buffer e (proj value) buffer
| Describe { encoding = e } -> write_rec_buffer e value buffer
| Def { encoding = e } -> write_rec_buffer e value buffer
| Splitted { encoding = e } -> write_rec_buffer e value buffer
| Union (_, sz, cases) ->
let rec write_case = function
| [] -> raise No_case_matched
| Case { tag = Json_only } :: tl -> write_case tl
| Case { encoding = e ; proj ; tag = Tag tag } :: tl ->
begin
match proj value with
| None -> write_case tl
| Some data ->
write_tag sz tag buffer ;
write_rec_buffer e data buffer
end in
write_case cases
| Mu (_, _, self) ->
write_rec_buffer (self encoding) value buffer
| Dynamic_size e ->
MBytes_buffer.write_sized buffer (fun () -> write_rec_buffer e value buffer)
| Delayed f -> write_rec_buffer (f ()) value buffer
let write t v buf ofs =
try Some (write_rec t v buf ofs)
with _ -> None
let to_bytes t v =
let length = length t v in
let bytes = MBytes.create length in
let ofs = write_rec t v bytes 0 in
assert(ofs = length);
bytes
let bytes = MBytes_buffer.create () in
write_rec_buffer t v bytes ;
MBytes_buffer.to_mbytes bytes
let to_bytes_list ?(copy_blocks=false) block_sz t v =
assert (block_sz > 0);
@ -1288,11 +1470,17 @@ module Binary = struct
| String (`Fixed n) -> fixed_length_string n
| Bytes `Variable -> fun buf ofs len -> fixed_length_bytes len buf ofs len
| String `Variable -> fun buf ofs len -> fixed_length_string len buf ofs len
| String_enum (kind, l) -> begin
fun buf ofs len ->
let ofs, str = read_rec (make @@ (String kind)) buf ofs len in
try ofs, List.assoc str l
with Not_found -> raise (Unexpected_enum (str, List.map fst l))
| String_enum (_, arr) -> begin
fun buf ofs a ->
let ofs, ind =
match enum_size arr with
| `Uint8 -> uint8 buf ofs a
| `Uint16 -> uint16 buf ofs a
| `Int64 -> let ofs, i64 = int64 buf ofs a in (ofs, Int64.to_int i64)
| `Int32 -> let ofs, i64 = int32 buf ofs a in (ofs, Int32.to_int i64) in
if ind >= Array.length arr
then raise No_case_matched
else (ofs, arr.(ind))
end
| Array e -> array (read_rec e)
| List e -> list (read_rec e)
@ -1563,9 +1751,13 @@ module Binary = struct
| String `Variable ->
next_path path (fst (fixed_length_string len buf))
| String_enum (kind, _) -> (* ! approx! *)
data_checker path (make @@ (String kind)) buf len
| String_enum (_, arr) ->
next_path path
(match enum_size arr with
| `Int64 -> fst @@ int64 buf
| `Uint16 -> fst @@ uint16 buf
| `Uint8 -> fst @@ uint8 buf
| `Int32 -> fst @@ int32 buf)
| Array e ->
let p = P_list { path ; encoding = e ; base_ofs = buf.ofs ;
data_len = len ; nb_elts_read = 0 } in

View File

@ -73,11 +73,6 @@ let substring src srcoff len =
include EndianBigstring.BigEndian
let of_float f =
let buf = create 8 in
set_float buf 0 f;
buf
module LE = struct
include EndianBigstring.LittleEndian
end

View File

@ -14,7 +14,7 @@
open Bigarray
(* Arrays are of characters, represented as uint8's, in row-major layout. *)
(** Arrays are of characters, represented as uint8's, in row-major layout. *)
type t = (char, int8_unsigned_elt, c_layout) Array1.t
val create: int -> t
@ -109,8 +109,6 @@ val set_float: t -> int -> float -> unit
val set_double: t -> int -> float -> unit
(** [set_double buff i v] writes [v] to [buff] at offset [i] *)
val of_float: float -> t
module LE: sig
(** Functions reading according to Little Endian byte order *)

View File

@ -0,0 +1,89 @@
(**************************************************************************)
(* *)
(* Copyright (c) 2014 - 2017. *)
(* Dynamic Ledger Solutions, Inc. <contact@tezos.com> *)
(* *)
(* All rights reserved. No warranty, explicit or implicit, provided. *)
(* *)
(**************************************************************************)
type t = { mutable buffer : MBytes.t ;
mutable offset : int }
let create ?(initial_size=4096) () =
if initial_size <= 0
then invalid_arg "MBytes_buffer size must be greater than zero" ;
{ buffer = MBytes.create initial_size ;
offset = 0 }
let resize ?at_least_size buf =
let new_buf =
MBytes.create
(2 * (MBytes.length buf.buffer + Option.unopt ~default:0 at_least_size)) in
MBytes.blit buf.buffer 0 new_buf 0 buf.offset ;
buf.buffer <- new_buf
let resize_if_necessary buf bytes =
if buf.offset + bytes > MBytes.length buf.buffer
then resize ~at_least_size:bytes buf
else ()
let write_mbytes dst src srcoff len =
resize_if_necessary dst len ;
MBytes.blit src srcoff dst.buffer dst.offset len ;
dst.offset <- dst.offset + len
let write_char buf char =
resize_if_necessary buf 1 ;
MBytes.set_char buf.buffer buf.offset char ;
buf.offset <- buf.offset + 1
let write_int8 buf int =
resize_if_necessary buf 1 ;
MBytes.set_int8 buf.buffer buf.offset int ;
buf.offset <- buf.offset + 1
let write_int16 buf int =
resize_if_necessary buf 2 ;
MBytes.set_int16 buf.buffer buf.offset int ;
buf.offset <- buf.offset + 2
let write_int32 buf int =
resize_if_necessary buf 4 ;
MBytes.set_int32 buf.buffer buf.offset int ;
buf.offset <- buf.offset + 4
let write_int64 buf int =
resize_if_necessary buf 8 ;
MBytes.set_int64 buf.buffer buf.offset int ;
buf.offset <- buf.offset + 8
let write_float buf float =
resize_if_necessary buf 4 ;
MBytes.set_float buf.buffer buf.offset float ;
buf.offset <- buf.offset + 4
let write_double buf float =
resize_if_necessary buf 8 ;
MBytes.set_double buf.buffer buf.offset float ;
buf.offset <- buf.offset + 8
let write_string_data buf str =
let len = String.length str in
resize_if_necessary buf len ;
MBytes.blit_from_string str 0 buf.buffer buf.offset len ;
buf.offset <- buf.offset + len
let write_sized buffer (writer : unit -> unit) =
let initial_offset = buffer.offset in
write_int32 buffer Int32.zero ;
writer () ;
let ending_offset = buffer.offset in
let size = ending_offset - initial_offset - 4 in
let size32 = Int32.of_int size in
if (Int32.to_int size32) <> size
then failwith "Tried to write more than [Int32.max_int] bytes in sized location"
else MBytes.set_int32 buffer.buffer initial_offset size32
let to_mbytes buf =
MBytes.sub buf.buffer 0 buf.offset

View File

@ -0,0 +1,63 @@
(**************************************************************************)
(* *)
(* Copyright (c) 2014 - 2017. *)
(* Dynamic Ledger Solutions, Inc. <contact@tezos.com> *)
(* *)
(* All rights reserved. No warranty, explicit or implicit, provided. *)
(* *)
(**************************************************************************)
(** Low-level byte growing byte vector. Internally uses operations in
{!MBytes}. The vector is resized as needed.
All operations append to the end of the vector. **)
(** Type of vectors *)
type t
(** Create a vector with an initial size of [initial_size]. *)
val create : ?initial_size:int -> unit -> t
(** [write_mbytes t src src_offset len]
Copy a sequence of len bytes from [src] to [t] starting at [src_offset]. *)
val write_mbytes : t -> MBytes.t -> int -> int -> unit
(** Write a character to the vector *)
val write_char : t -> char -> unit
(** Write an 8-bit signed integer to the vector *)
val write_int8 : t -> int -> unit
(** Write an 16-bit signed integer to the vector *)
val write_int16 : t -> int -> unit
(** Write an 32-bit signed integer to the vector *)
val write_int32 : t -> int32 -> unit
(** Write an 64-bit signed integer to the vector *)
val write_int64 : t -> int64 -> unit
(** Write a single-precision float to the vector *)
val write_float : t -> float -> unit
(** Write a double-precision float to the vector *)
val write_double : t -> float -> unit
(** Write the characters from a string to the vector.
This does not copy the size. *)
val write_string_data : t -> string -> unit
(** Write a sized amount of data to the vector.
Size is a 32 bit integer.
Do not use [to_mbytes] in the writer function. *)
val write_sized : t -> (unit -> unit) -> unit
(** Convert the buffer to mBytes.
As all operations are append only, you can continue to add to the
buffer after this function has been called.
The buffer may be more than twice as large as necessary to contain
the written data. Calling [MBytes.copy] will create a new buffer
that is exactly the required size. *)
val to_mbytes : t -> MBytes.t

View File

@ -5,7 +5,9 @@
test_lwt_pipe
test_merkle
test_stream_data_encoding
test_utils))
test_utils
bench_data_encoding
test_mbytes_buffer))
(libraries (tezos-base test_lib))
(flags (:standard -w -9-32
-safe-string
@ -17,7 +19,8 @@
test_lwt_pipe.exe
test_merkle.exe
test_stream_data_encoding.exe
test_utils.exe))))
test_utils.exe
test_mbytes_buffer.exe))))
(alias
((name runtest_data_encoding)
@ -39,13 +42,18 @@
((name runtest_utils)
(action (run ${exe:test_utils.exe}))))
(alias
((name runtest_mbytes_buffer)
(action (run ${exe:test_mbytes_buffer.exe}))))
(alias
((name runtest)
(deps ((alias runtest_data_encoding)
(alias runtest_lwt_pipe)
(alias runtest_merkle)
(alias runtest_stream_data_encoding)
(alias runtest_utils)))))
(alias runtest_utils)
(alias runtest_mbytes_buffer)))))
(alias
((name runtest_indent)

View File

@ -364,6 +364,53 @@ let test_out_of_range _ =
assert_exception enc_float 100.1 ;
Lwt.return_unit
let test_string_enum_boundary _ =
let open Data_encoding in
let entries = List.rev_map (fun x -> string_of_int x, x) (0 -- 254) in
let run_test cases =
List.iter (fun (_, num) ->
let enc = string_enum cases in
let encoded = Data_encoding.Binary.to_bytes enc num in
let decoded = Data_encoding.Binary.of_bytes_exn enc encoded in
Assert.equal num decoded) cases in
run_test entries ;
let entries2 = (("255", 255) :: entries) in
run_test entries2 ;
run_test (("256", 256) :: entries2) ;
Lwt.return_unit
(** Generate encodings of the encoding and the randomized generator *)
let test_generator ?(iterations=50) encoding generator =
for _ = 0 to iterations - 1 do
let encode = generator () in
let bytes = Data_encoding.Binary.to_bytes encoding encode in
let decode = Data_encoding.Binary.of_bytes_exn encoding bytes in
Assert.equal encode decode
done ;
Lwt.return ()
let rec make_int_list acc len () =
if len = 0
then acc
else make_int_list (Random.int64 Int64.max_int :: acc) (len - 1) ()
let test_randomized_int_list _ =
test_generator Data_encoding.(list int64) (make_int_list [] 100)
let test_randomized_string_list _ =
test_generator (list string) (fun () -> List.map Int64.to_string (make_int_list [] 100 ()))
let test_randomized_variant_list _ =
test_generator (list (result (option string) string))
(fun () ->
List.map
(fun x ->
let str = Int64.to_string x in
if Random.bool ()
then if Random.bool () then Ok (Some str) else Ok None
else Error str)
(make_int_list [] 100 ()))
let tests = [
"simple", test_simple_values ;
"json", test_json ;
@ -373,7 +420,12 @@ let tests = [
"tags", test_tag_errors ;
"wrapped_binary", test_wrapped_binary ;
"out_of_range", test_out_of_range ;
"string_enum_boundary", test_string_enum_boundary ;
"randomized_int_list", test_randomized_int_list ;
"randomized_string_list", test_randomized_string_list ;
"randomized_variant_list", test_randomized_variant_list ;
]
let () =
Random.init 100 ;
Test.run "data_encoding." (List.map (fun (s, f) -> s, wrap_test f) tests)

View File

@ -0,0 +1,27 @@
(**************************************************************************)
(* *)
(* Copyright (c) 2014 - 2017. *)
(* Dynamic Ledger Solutions, Inc. <contact@tezos.com> *)
(* *)
(* All rights reserved. No warranty, explicit or implicit, provided. *)
(* *)
(**************************************************************************)
let hex_of_buffer buf =
Hex_encode.hex_of_bytes (MBytes_buffer.to_mbytes buf)
let assert_hex_eq buf =
Assert.equal ~prn:(fun x -> x) (hex_of_buffer buf)
let () =
let buf1 = MBytes_buffer.create ~initial_size:1 () in
MBytes_buffer.write_char buf1 'a' ;
assert_hex_eq buf1 "61" ;
MBytes_buffer.write_char buf1 'Q' ;
assert_hex_eq buf1 "6151" ;
MBytes_buffer.write_int32 buf1 Int32.max_int ;
assert_hex_eq buf1 "61517fffffff" ;
let buf2 = MBytes_buffer.create ~initial_size:1 () in
MBytes_buffer.write_sized buf2 (fun () -> MBytes_buffer.write_string_data buf2 "abc") ;
assert_hex_eq buf2 "00000003616263"