1
1
use std:: collections:: { BTreeMap , HashMap , HashSet } ;
2
- use std:: io;
2
+ use std:: io:: { self , Read , Write } ;
3
3
use std:: net:: Ipv6Addr ;
4
4
5
5
use columnar:: MonotonicallyMappableToU128 ;
@@ -56,7 +56,9 @@ impl CompactDoc {
56
56
57
57
/// Adding a facet to the document.
58
58
pub fn add_facet < F > ( & mut self , field : Field , path : F )
59
- where Facet : From < F > {
59
+ where
60
+ Facet : From < F > ,
61
+ {
60
62
let facet = Facet :: from ( path) ;
61
63
self . add_leaf_field_value ( field, ReferenceValueLeaf :: Facet ( facet. encoded_str ( ) ) ) ;
62
64
}
@@ -255,7 +257,9 @@ impl Eq for CompactDoc {}
255
257
256
258
impl DocumentDeserialize for CompactDoc {
257
259
fn deserialize < ' de , D > ( mut deserializer : D ) -> Result < Self , DeserializeError >
258
- where D : DocumentDeserializer < ' de > {
260
+ where
261
+ D : DocumentDeserializer < ' de > ,
262
+ {
259
263
let mut doc = CompactDoc :: default ( ) ;
260
264
// TODO: Deserializing into OwnedValue is wasteful. The deserializer should be able to work
261
265
// on slices and referenced data.
@@ -285,30 +289,14 @@ impl<'a> Value<'a> for CompactDocValue<'a> {
285
289
#[ derive( Debug , Clone ) ]
286
290
/// A container to store tantivy Value
287
291
struct CompactDocContainer {
288
- /// A list of nodes, that are used to store the values of the document.
289
- ///
290
- /// ## Note on the design
291
- /// We could use just a single vec `node_data` to store the nodes, but this would have a
292
- /// downside. node_data has flexible sized elements compared to `nodes`. So when creating
293
- /// a vec or document, we can reserve space for all od the direct child nodes upfront, and then
294
- /// write into the nodes array without resizing. This is not possible with `node_data`. So
295
- /// we use `nodes` to store the references to the actual data in `node_data`.
296
- /// There would be 2 ways to use node_data instead of nodes:
297
- /// - Instead of storing start and len for arrays and objects, we could a list of node pointers
298
- /// to the bytes in node_data. This would require more memory.
299
- /// - A two layer approach, where we when receiving an array/object, we would process the first
300
- /// level
301
- /// subnodes and store the pos and len in and then handle then deeper levels. I don't like the
302
- /// added complexity of this approach.
303
- nodes : mediumvec:: Vec32 < ValueAddr > ,
304
- /// The `node_data` is a vec of bytes, where each value is serialized into bytes and stored. It
305
- /// includes all the data of the document.
292
+ /// `node_data` is a vec of bytes, where each value is serialized into bytes and stored. It
293
+ /// includes all the data of the document and also metadata like where the nodes are located
294
+ /// in an object or array.
306
295
node_data : mediumvec:: Vec32 < u8 > ,
307
296
}
308
297
impl Default for CompactDocContainer {
309
298
fn default ( ) -> Self {
310
299
Self {
311
- nodes : mediumvec:: Vec32 :: with_capacity ( 4 ) ,
312
300
// This should be at the lower end of the payload of a document
313
301
// 512 byte is pretty small
314
302
node_data : mediumvec:: Vec32 :: with_capacity ( 512 ) ,
@@ -323,6 +311,21 @@ pub struct ValueAddr {
323
311
type_id : ValueType ,
324
312
val : Addr , // this is the address, except for bool and null, which are inlined
325
313
}
314
+ impl BinarySerializable for ValueAddr {
315
+ fn serialize < W : Write + ?Sized > ( & self , writer : & mut W ) -> io:: Result < ( ) > {
316
+ ( self . type_id as u8 ) . serialize ( writer) ?;
317
+ self . val . 0 . serialize ( writer)
318
+ }
319
+
320
+ fn deserialize < R : Read > ( reader : & mut R ) -> io:: Result < Self > {
321
+ let type_id = ValueType :: deserialize ( reader) ?;
322
+ let addr: [ u8 ; 3 ] = <[ u8 ; 3 ] >:: deserialize ( reader) ?;
323
+ Ok ( ValueAddr {
324
+ type_id,
325
+ val : Addr ( addr) ,
326
+ } )
327
+ }
328
+ }
326
329
impl std:: fmt:: Debug for ValueAddr {
327
330
fn fmt ( & self , f : & mut std:: fmt:: Formatter < ' _ > ) -> std:: fmt:: Result {
328
331
f. write_fmt ( format_args ! (
@@ -400,6 +403,26 @@ pub enum ValueType {
400
403
Array = 12 ,
401
404
}
402
405
406
+ impl BinarySerializable for ValueType {
407
+ fn serialize < W : Write + ?Sized > ( & self , writer : & mut W ) -> io:: Result < ( ) > {
408
+ ( * self as u8 ) . serialize ( writer) ?;
409
+ Ok ( ( ) )
410
+ }
411
+
412
+ fn deserialize < R : Read > ( reader : & mut R ) -> io:: Result < Self > {
413
+ let num = u8:: deserialize ( reader) ?;
414
+ let type_id = if ( 0 ..=12 ) . contains ( & num) {
415
+ unsafe { std:: mem:: transmute ( num) }
416
+ } else {
417
+ return Err ( io:: Error :: new (
418
+ io:: ErrorKind :: InvalidData ,
419
+ format ! ( "Invalid value type id: {}" , num) ,
420
+ ) ) ;
421
+ } ;
422
+ Ok ( type_id)
423
+ }
424
+ }
425
+
403
426
impl < ' a , V : Value < ' a > > From < & ReferenceValue < ' a , V > > for ValueType {
404
427
fn from ( value : & ReferenceValue < ' a , V > ) -> Self {
405
428
match value {
@@ -474,9 +497,10 @@ impl CompactDocContainer {
474
497
let mut positions = Vec :: new ( ) ;
475
498
for elem in elements {
476
499
let ref_elem = self . add_value ( elem) ;
477
- let position = self . nodes . len ( ) as u32 ;
500
+ let position = self . node_data . len ( ) as u32 ;
478
501
write_u32_vint ( position, & mut positions) . expect ( "in memory can't fail" ) ;
479
- self . nodes . push ( ref_elem) ;
502
+ write_into ( & mut self . node_data , ref_elem) ;
503
+ // self.nodes.push(ref_elem);
480
504
}
481
505
ValueAddr :: new ( type_id, write_bytes_into ( & mut self . node_data , & positions) )
482
506
}
@@ -485,10 +509,10 @@ impl CompactDocContainer {
485
509
for ( key, value) in entries {
486
510
let ref_key = self . add_value_leaf ( ReferenceValueLeaf :: Str ( key) ) ;
487
511
let ref_value = self . add_value ( value) ;
488
- let position = self . nodes . len ( ) as u32 ;
512
+ let position = self . node_data . len ( ) as u32 ;
489
513
write_u32_vint ( position, & mut positions) . expect ( "in memory can't fail" ) ;
490
- self . nodes . push ( ref_key) ;
491
- self . nodes . push ( ref_value) ;
514
+ write_into ( & mut self . node_data , ref_key) ;
515
+ write_into ( & mut self . node_data , ref_value) ;
492
516
}
493
517
ValueAddr :: new ( type_id, write_bytes_into ( & mut self . node_data , & positions) )
494
518
}
@@ -619,12 +643,15 @@ impl<'a> Iterator for CompactDocObjectIter<'a> {
619
643
type Item = ( & ' a str , CompactDocValue < ' a > ) ;
620
644
621
645
fn next ( & mut self ) -> Option < Self :: Item > {
622
- if self . positions_slice . len ( ) > 0 {
646
+ if ! self . positions_slice . is_empty ( ) {
623
647
let key_index = read_u32_vint ( & mut self . positions_slice ) as usize ;
624
- let key = self . container . extract_str ( self . container . nodes [ key_index] ) ;
648
+ let position = & mut & self . container . node_data [ key_index..] ;
649
+ let key_addr = ValueAddr :: deserialize ( position) . ok ( ) ?;
650
+ let key = self . container . extract_str ( key_addr) ;
651
+ let value = ValueAddr :: deserialize ( position) . ok ( ) ?;
625
652
let value = CompactDocValue {
626
653
container : self . container ,
627
- value : self . container . nodes [ key_index + 1 ] ,
654
+ value,
628
655
} ;
629
656
return Some ( ( key, value) ) ;
630
657
}
@@ -653,11 +680,12 @@ impl<'a> Iterator for CompactDocArrayIter<'a> {
653
680
type Item = CompactDocValue < ' a > ;
654
681
655
682
fn next ( & mut self ) -> Option < Self :: Item > {
656
- if self . positions_slice . len ( ) > 0 {
683
+ if ! self . positions_slice . is_empty ( ) {
657
684
let key_index = read_u32_vint ( & mut self . positions_slice ) as usize ;
685
+ let value = ValueAddr :: deserialize ( & mut & self . container . node_data [ key_index..] ) . ok ( ) ?;
658
686
let value = CompactDocValue {
659
687
container : self . container ,
660
- value : self . container . nodes [ key_index ] ,
688
+ value,
661
689
} ;
662
690
return Some ( value) ;
663
691
}
0 commit comments