Skip to content

Commit bd64dd2

Browse files
committed
remove nodes array
1 parent 8904d6e commit bd64dd2

File tree

4 files changed

+85
-62
lines changed

4 files changed

+85
-62
lines changed

common/src/lib.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,7 @@ pub use json_path_writer::JsonPathWriter;
2121
pub use ownedbytes::{OwnedBytes, StableDeref};
2222
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
2323
pub use vint::{
24-
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32,
25-
vint_32_iterator as u32_vint_iterator, write_u32_vint, VInt, VIntU128,
24+
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, VIntU128,
2625
};
2726
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};
2827

common/src/serialize.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,23 @@ impl FixedSize for u32 {
120120
const SIZE_IN_BYTES: usize = 4;
121121
}
122122

123+
impl BinarySerializable for [u8; 3] {
124+
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
125+
writer.write_u8(self[0])?;
126+
writer.write_u8(self[1])?;
127+
writer.write_u8(self[2])?;
128+
Ok(())
129+
}
130+
131+
fn deserialize<R: Read>(reader: &mut R) -> io::Result<[u8; 3]> {
132+
Ok([reader.read_u8()?, reader.read_u8()?, reader.read_u8()?])
133+
}
134+
}
135+
136+
impl FixedSize for [u8; 3] {
137+
const SIZE_IN_BYTES: usize = 3;
138+
}
139+
123140
impl BinarySerializable for u16 {
124141
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
125142
writer.write_u16::<Endianness>(*self)
@@ -347,6 +364,12 @@ pub mod test {
347364
assert_eq!(serialize_test(String::from("富士さん見える。")), 1 + 3 * 8);
348365
}
349366

367+
#[test]
368+
fn test_serialize_3bytes() {
369+
let bytes: [u8; 3] = [1, 2, 3];
370+
assert_eq!(serialize_test(bytes), 3);
371+
}
372+
350373
#[test]
351374
fn test_serialize_vec() {
352375
assert_eq!(serialize_test(Vec::<u8>::new()), 1);

common/src/vint.rs

Lines changed: 1 addition & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -157,20 +157,6 @@ pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()>
157157
writer.write_all(data)
158158
}
159159

160-
/// Iterates over all u32 encoded vints in the slice
161-
pub fn vint_32_iterator(data: &[u8]) -> impl Iterator<Item = u32> + '_ {
162-
let mut slice = data;
163-
std::iter::from_fn(move || {
164-
if slice.is_empty() {
165-
None
166-
} else {
167-
let (value, vlen) = read_u32_vint_no_advance(&slice);
168-
slice = &slice[vlen..];
169-
Some(value)
170-
}
171-
})
172-
}
173-
174160
impl VInt {
175161
pub fn val(&self) -> u64 {
176162
self.0
@@ -236,8 +222,7 @@ impl BinarySerializable for VInt {
236222
#[cfg(test)]
237223
mod tests {
238224

239-
use super::{serialize_vint_u32, u32_vint_iterator, BinarySerializable, VInt};
240-
use crate::write_u32_vint;
225+
use super::{serialize_vint_u32, BinarySerializable, VInt};
241226

242227
fn aux_test_vint(val: u64) {
243228
let mut v = [14u8; 10];
@@ -256,18 +241,6 @@ mod tests {
256241
assert_eq!(val, serdeser_val.0);
257242
}
258243

259-
#[test]
260-
fn vint_iterator_test() {
261-
let mut out = Vec::new();
262-
263-
let values: Vec<u32> = (0..100).collect();
264-
for i in values.iter() {
265-
write_u32_vint(*i, &mut out).unwrap();
266-
}
267-
let deser: Vec<u32> = u32_vint_iterator(&out).collect();
268-
assert_eq!(values, deser);
269-
}
270-
271244
#[test]
272245
fn test_vint() {
273246
aux_test_vint(0);

src/schema/document/default_document.rs

Lines changed: 60 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use std::collections::{BTreeMap, HashMap, HashSet};
2-
use std::io;
2+
use std::io::{self, Read, Write};
33
use std::net::Ipv6Addr;
44

55
use columnar::MonotonicallyMappableToU128;
@@ -56,7 +56,9 @@ impl CompactDoc {
5656

5757
/// Adding a facet to the document.
5858
pub fn add_facet<F>(&mut self, field: Field, path: F)
59-
where Facet: From<F> {
59+
where
60+
Facet: From<F>,
61+
{
6062
let facet = Facet::from(path);
6163
self.add_leaf_field_value(field, ReferenceValueLeaf::Facet(facet.encoded_str()));
6264
}
@@ -255,7 +257,9 @@ impl Eq for CompactDoc {}
255257

256258
impl DocumentDeserialize for CompactDoc {
257259
fn deserialize<'de, D>(mut deserializer: D) -> Result<Self, DeserializeError>
258-
where D: DocumentDeserializer<'de> {
260+
where
261+
D: DocumentDeserializer<'de>,
262+
{
259263
let mut doc = CompactDoc::default();
260264
// TODO: Deserializing into OwnedValue is wasteful. The deserializer should be able to work
261265
// on slices and referenced data.
@@ -285,30 +289,14 @@ impl<'a> Value<'a> for CompactDocValue<'a> {
285289
#[derive(Debug, Clone)]
286290
/// A container to store tantivy Value
287291
struct CompactDocContainer {
288-
/// A list of nodes, that are used to store the values of the document.
289-
///
290-
/// ## Note on the design
291-
/// We could use just a single vec `node_data` to store the nodes, but this would have a
292-
/// downside. node_data has flexible sized elements compared to `nodes`. So when creating
293-
/// a vec or document, we can reserve space for all of the direct child nodes upfront, and then
294-
/// write into the nodes array without resizing. This is not possible with `node_data`. So
295-
/// we use `nodes` to store the references to the actual data in `node_data`.
296-
/// There would be 2 ways to use node_data instead of nodes:
297-
/// - Instead of storing start and len for arrays and objects, we could store a list of node pointers
298-
/// to the bytes in node_data. This would require more memory.
299-
/// - A two layer approach, where we when receiving an array/object, we would process the first
300-
/// level
301-
/// subnodes and store the pos and len, and then handle the deeper levels. I don't like the
302-
/// added complexity of this approach.
303-
nodes: mediumvec::Vec32<ValueAddr>,
304-
/// The `node_data` is a vec of bytes, where each value is serialized into bytes and stored. It
305-
/// includes all the data of the document.
292+
/// `node_data` is a vec of bytes, where each value is serialized into bytes and stored. It
293+
/// includes all the data of the document and also metadata like where the nodes are located
294+
/// in an object or array.
306295
node_data: mediumvec::Vec32<u8>,
307296
}
308297
impl Default for CompactDocContainer {
309298
fn default() -> Self {
310299
Self {
311-
nodes: mediumvec::Vec32::with_capacity(4),
312300
// This should be at the lower end of the payload of a document
313301
// 512 bytes is pretty small
314302
node_data: mediumvec::Vec32::with_capacity(512),
@@ -323,6 +311,21 @@ pub struct ValueAddr {
323311
type_id: ValueType,
324312
val: Addr, // this is the address, except for bool and null, which are inlined
325313
}
314+
impl BinarySerializable for ValueAddr {
315+
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
316+
(self.type_id as u8).serialize(writer)?;
317+
self.val.0.serialize(writer)
318+
}
319+
320+
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
321+
let type_id = ValueType::deserialize(reader)?;
322+
let addr: [u8; 3] = <[u8; 3]>::deserialize(reader)?;
323+
Ok(ValueAddr {
324+
type_id,
325+
val: Addr(addr),
326+
})
327+
}
328+
}
326329
impl std::fmt::Debug for ValueAddr {
327330
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
328331
f.write_fmt(format_args!(
@@ -400,6 +403,26 @@ pub enum ValueType {
400403
Array = 12,
401404
}
402405

406+
impl BinarySerializable for ValueType {
407+
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
408+
(*self as u8).serialize(writer)?;
409+
Ok(())
410+
}
411+
412+
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
413+
let num = u8::deserialize(reader)?;
414+
let type_id = if (0..=12).contains(&num) {
415+
unsafe { std::mem::transmute(num) }
416+
} else {
417+
return Err(io::Error::new(
418+
io::ErrorKind::InvalidData,
419+
format!("Invalid value type id: {}", num),
420+
));
421+
};
422+
Ok(type_id)
423+
}
424+
}
425+
403426
impl<'a, V: Value<'a>> From<&ReferenceValue<'a, V>> for ValueType {
404427
fn from(value: &ReferenceValue<'a, V>) -> Self {
405428
match value {
@@ -474,9 +497,10 @@ impl CompactDocContainer {
474497
let mut positions = Vec::new();
475498
for elem in elements {
476499
let ref_elem = self.add_value(elem);
477-
let position = self.nodes.len() as u32;
500+
let position = self.node_data.len() as u32;
478501
write_u32_vint(position, &mut positions).expect("in memory can't fail");
479-
self.nodes.push(ref_elem);
502+
write_into(&mut self.node_data, ref_elem);
503+
// self.nodes.push(ref_elem);
480504
}
481505
ValueAddr::new(type_id, write_bytes_into(&mut self.node_data, &positions))
482506
}
@@ -485,10 +509,10 @@ impl CompactDocContainer {
485509
for (key, value) in entries {
486510
let ref_key = self.add_value_leaf(ReferenceValueLeaf::Str(key));
487511
let ref_value = self.add_value(value);
488-
let position = self.nodes.len() as u32;
512+
let position = self.node_data.len() as u32;
489513
write_u32_vint(position, &mut positions).expect("in memory can't fail");
490-
self.nodes.push(ref_key);
491-
self.nodes.push(ref_value);
514+
write_into(&mut self.node_data, ref_key);
515+
write_into(&mut self.node_data, ref_value);
492516
}
493517
ValueAddr::new(type_id, write_bytes_into(&mut self.node_data, &positions))
494518
}
@@ -619,12 +643,15 @@ impl<'a> Iterator for CompactDocObjectIter<'a> {
619643
type Item = (&'a str, CompactDocValue<'a>);
620644

621645
fn next(&mut self) -> Option<Self::Item> {
622-
if self.positions_slice.len() > 0 {
646+
if !self.positions_slice.is_empty() {
623647
let key_index = read_u32_vint(&mut self.positions_slice) as usize;
624-
let key = self.container.extract_str(self.container.nodes[key_index]);
648+
let position = &mut &self.container.node_data[key_index..];
649+
let key_addr = ValueAddr::deserialize(position).ok()?;
650+
let key = self.container.extract_str(key_addr);
651+
let value = ValueAddr::deserialize(position).ok()?;
625652
let value = CompactDocValue {
626653
container: self.container,
627-
value: self.container.nodes[key_index + 1],
654+
value,
628655
};
629656
return Some((key, value));
630657
}
@@ -653,11 +680,12 @@ impl<'a> Iterator for CompactDocArrayIter<'a> {
653680
type Item = CompactDocValue<'a>;
654681

655682
fn next(&mut self) -> Option<Self::Item> {
656-
if self.positions_slice.len() > 0 {
683+
if !self.positions_slice.is_empty() {
657684
let key_index = read_u32_vint(&mut self.positions_slice) as usize;
685+
let value = ValueAddr::deserialize(&mut &self.container.node_data[key_index..]).ok()?;
658686
let value = CompactDocValue {
659687
container: self.container,
660-
value: self.container.nodes[key_index],
688+
value,
661689
};
662690
return Some(value);
663691
}

0 commit comments

Comments
 (0)