author | Valentin Popov <valentin@popov.link> | 2024-01-08 00:21:28 +0300
committer | Valentin Popov <valentin@popov.link> | 2024-01-08 00:21:28 +0300
commit | 1b6a04ca5504955c571d1c97504fb45ea0befee4 (patch)
tree | 7579f518b23313e8a9748a88ab6173d5e030b227 /vendor/weezl/src/encode.rs
parent | 5ecd8cf2cba827454317368b68571df0d13d7842 (diff)
download | fparkan-1b6a04ca5504955c571d1c97504fb45ea0befee4.tar.xz, fparkan-1b6a04ca5504955c571d1c97504fb45ea0befee4.zip
Initial vendor packages
Signed-off-by: Valentin Popov <valentin@popov.link>
Diffstat (limited to 'vendor/weezl/src/encode.rs')
-rw-r--r-- | vendor/weezl/src/encode.rs | 1126
1 file changed, 1126 insertions, 0 deletions
diff --git a/vendor/weezl/src/encode.rs b/vendor/weezl/src/encode.rs new file mode 100644 index 0000000..492b18c --- /dev/null +++ b/vendor/weezl/src/encode.rs @@ -0,0 +1,1126 @@ +//! A module for all encoding needs. +use crate::error::{BufferResult, LzwError, LzwStatus, VectorResult}; +use crate::{BitOrder, Code, StreamBuf, MAX_CODESIZE, MAX_ENTRIES, STREAM_BUF_SIZE}; + +use crate::alloc::{boxed::Box, vec::Vec}; +#[cfg(feature = "std")] +use crate::error::StreamResult; +#[cfg(feature = "std")] +use std::io::{self, BufRead, Write}; + +/// The state for encoding data with an LZW algorithm. +/// +/// The same structure can be utilized with streams as well as your own buffers and driver logic. +/// It may even be possible to mix them if you are sufficiently careful not to lose any written +/// data in the process. +/// +/// This is a sans-IO implementation, meaning that it only contains the state of the encoder and +/// the caller will provide buffers for input and output data when calling the basic +/// [`encode_bytes`] method. Nevertheless, a number of _adapters_ are provided in the `into_*` +/// methods for enoding with a particular style of common IO. +/// +/// * [`encode`] for encoding once without any IO-loop. +/// * [`into_async`] for encoding with the `futures` traits for asynchronous IO. +/// * [`into_stream`] for encoding with the standard `io` traits. +/// * [`into_vec`] for in-memory encoding. +/// +/// [`encode_bytes`]: #method.encode_bytes +/// [`encode`]: #method.encode +/// [`into_async`]: #method.into_async +/// [`into_stream`]: #method.into_stream +/// [`into_vec`]: #method.into_vec +pub struct Encoder { + /// Internally dispatch via a dynamic trait object. This did not have any significant + /// performance impact as we batch data internally and this pointer does not change after + /// creation! + state: Box<dyn Stateful + Send + 'static>, +} + +/// A encoding stream sink. +/// +/// See [`Encoder::into_stream`] on how to create this type. +/// +/// [`Encoder::into_stream`]: struct.Encoder.html#method.into_stream +#[cfg_attr( + not(feature = "std"), + deprecated = "This type is only useful with the `std` feature." +)] +#[cfg_attr(not(feature = "std"), allow(dead_code))] +pub struct IntoStream<'d, W> { + encoder: &'d mut Encoder, + writer: W, + buffer: Option<StreamBuf<'d>>, + default_size: usize, +} + +/// An async decoding sink. +/// +/// See [`Encoder::into_async`] on how to create this type. +/// +/// [`Encoder::into_async`]: struct.Encoder.html#method.into_async +#[cfg(feature = "async")] +pub struct IntoAsync<'d, W> { + encoder: &'d mut Encoder, + writer: W, + buffer: Option<StreamBuf<'d>>, + default_size: usize, +} + +/// A encoding sink into a vector. +/// +/// See [`Encoder::into_vec`] on how to create this type. +/// +/// [`Encoder::into_vec`]: struct.Encoder.html#method.into_vec +pub struct IntoVec<'d> { + encoder: &'d mut Encoder, + vector: &'d mut Vec<u8>, +} + +trait Stateful { + fn advance(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult; + fn mark_ended(&mut self) -> bool; + /// Reset the state tracking if end code has been written. + fn restart(&mut self); + /// Reset the encoder to the beginning, dropping all buffers etc. + fn reset(&mut self); +} + +struct EncodeState<B: Buffer> { + /// The configured minimal code size. + min_size: u8, + /// The current encoding symbol tree. + tree: Tree, + /// If we have pushed the end code. + has_ended: bool, + /// If tiff then bumps are a single code sooner. 
+ is_tiff: bool, + /// The code corresponding to the currently read characters. + current_code: Code, + /// The clear code for resetting the dictionary. + clear_code: Code, + /// The bit buffer for encoding. + buffer: B, +} + +struct MsbBuffer { + /// The current code length. + code_size: u8, + /// The buffer bits. + buffer: u64, + /// The number of valid buffer bits. + bits_in_buffer: u8, +} + +struct LsbBuffer { + /// The current code length. + code_size: u8, + /// The buffer bits. + buffer: u64, + /// The number of valid buffer bits. + bits_in_buffer: u8, +} + +trait Buffer { + fn new(size: u8) -> Self; + /// Reset the code size in the buffer. + fn reset(&mut self, min_size: u8); + /// Apply effects of a Clear Code. + fn clear(&mut self, min_size: u8); + /// Insert a code into the buffer. + fn buffer_code(&mut self, code: Code); + /// Push bytes if the buffer space is getting small. + fn push_out(&mut self, out: &mut &mut [u8]) -> bool; + /// Flush all full bytes, returning if at least one more byte remains. + fn flush_out(&mut self, out: &mut &mut [u8]) -> bool; + /// Pad the buffer to a full byte. + fn buffer_pad(&mut self); + /// Increase the maximum code size. + fn bump_code_size(&mut self); + /// Return the maximum code with the current code size. + fn max_code(&self) -> Code; + /// Return the current code size in bits. + fn code_size(&self) -> u8; +} + +/// One tree node for at most each code. +/// To avoid using too much memory we keep nodes with few successors in optimized form. This form +/// doesn't offer lookup by indexing but instead does a linear search. +#[derive(Default)] +struct Tree { + simples: Vec<Simple>, + complex: Vec<Full>, + keys: Vec<CompressedKey>, +} + +#[derive(Clone, Copy)] +enum FullKey { + NoSuccessor, + Simple(u16), + Full(u16), +} + +#[derive(Clone, Copy)] +struct CompressedKey(u16); + +const SHORT: usize = 16; + +#[derive(Clone, Copy)] +struct Simple { + codes: [Code; SHORT], + chars: [u8; SHORT], + count: u8, +} + +#[derive(Clone, Copy)] +struct Full { + char_continuation: [Code; 256], +} + +impl Encoder { + /// Create a new encoder with the specified bit order and symbol size. + /// + /// The algorithm for dynamically increasing the code symbol bit width is compatible with the + /// original specification. In particular you will need to specify an `Lsb` bit oder to encode + /// the data portion of a compressed `gif` image. + /// + /// # Panics + /// + /// The `size` needs to be in the interval `2..=12`. + pub fn new(order: BitOrder, size: u8) -> Self { + type Boxed = Box<dyn Stateful + Send + 'static>; + super::assert_encode_size(size); + let state = match order { + BitOrder::Lsb => Box::new(EncodeState::<LsbBuffer>::new(size)) as Boxed, + BitOrder::Msb => Box::new(EncodeState::<MsbBuffer>::new(size)) as Boxed, + }; + + Encoder { state } + } + + /// Create a TIFF compatible encoder with the specified bit order and symbol size. + /// + /// The algorithm for dynamically increasing the code symbol bit width is compatible with the + /// TIFF specification, which is a misinterpretation of the original algorithm for increasing + /// the code size. It switches one symbol sooner. + /// + /// # Panics + /// + /// The `size` needs to be in the interval `2..=12`. 
+ pub fn with_tiff_size_switch(order: BitOrder, size: u8) -> Self { + type Boxed = Box<dyn Stateful + Send + 'static>; + super::assert_encode_size(size); + let state = match order { + BitOrder::Lsb => { + let mut state = Box::new(EncodeState::<LsbBuffer>::new(size)); + state.is_tiff = true; + state as Boxed + } + BitOrder::Msb => { + let mut state = Box::new(EncodeState::<MsbBuffer>::new(size)); + state.is_tiff = true; + state as Boxed + } + }; + + Encoder { state } + } + + /// Encode some bytes from `inp` into `out`. + /// + /// See [`into_stream`] for high-level functions (this interface is only available with the + /// `std` feature) and [`finish`] for marking the input data as complete. + /// + /// When some input byte is invalid, i.e. is not smaller than `1 << size`, then that byte and + /// all following ones will _not_ be consumed and the `status` of the result will signal an + /// error. The result will also indicate that all bytes up to but not including the offending + /// byte have been consumed. You may try again with a fixed byte. + /// + /// [`into_stream`]: #method.into_stream + /// [`finish`]: #method.finish + pub fn encode_bytes(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult { + self.state.advance(inp, out) + } + + /// Encode a single chunk of data. + /// + /// This method will add an end marker to the encoded chunk. + /// + /// This is a convenience wrapper around [`into_vec`]. Use the `into_vec` adapter to customize + /// buffer size, to supply an existing vector, to control whether an end marker is required, or + /// to preserve partial data in the case of a decoding error. + /// + /// [`into_vec`]: #into_vec + /// + /// # Example + /// + /// ``` + /// use weezl::{BitOrder, encode::Encoder}; + /// + /// let data = b"Hello, world"; + /// let encoded = Encoder::new(BitOrder::Msb, 9) + /// .encode(data) + /// .expect("All bytes valid for code size"); + /// ``` + pub fn encode(&mut self, data: &[u8]) -> Result<Vec<u8>, LzwError> { + let mut output = Vec::new(); + self.into_vec(&mut output).encode_all(data).status?; + Ok(output) + } + + /// Construct a encoder into a writer. + #[cfg(feature = "std")] + pub fn into_stream<W: Write>(&mut self, writer: W) -> IntoStream<'_, W> { + IntoStream { + encoder: self, + writer, + buffer: None, + default_size: STREAM_BUF_SIZE, + } + } + + /// Construct a encoder into an async writer. + #[cfg(feature = "async")] + pub fn into_async<W: futures::io::AsyncWrite>(&mut self, writer: W) -> IntoAsync<'_, W> { + IntoAsync { + encoder: self, + writer, + buffer: None, + default_size: STREAM_BUF_SIZE, + } + } + + /// Construct an encoder into a vector. + /// + /// All encoded data is appended and the vector is __not__ cleared. + /// + /// Compared to `into_stream` this interface allows a high-level access to encoding without + /// requires the `std`-feature. Also, it can make full use of the extra buffer control that the + /// special target exposes. + pub fn into_vec<'lt>(&'lt mut self, vec: &'lt mut Vec<u8>) -> IntoVec<'lt> { + IntoVec { + encoder: self, + vector: vec, + } + } + + /// Mark the encoding as in the process of finishing. + /// + /// The next following call to `encode_bytes` which is able to consume the complete input will + /// also try to emit an end code. It's not recommended, but also not unsound, to use different + /// byte slices in different calls from this point forward and thus to 'delay' the actual end + /// of the data stream. The behaviour after the end marker has been written is unspecified but + /// sound. 
+ pub fn finish(&mut self) { + self.state.mark_ended(); + } + + /// Undo marking this data stream as ending. + /// FIXME: clarify how this interacts with padding introduced after end code. + #[allow(dead_code)] + pub(crate) fn restart(&mut self) { + self.state.restart() + } + + /// Reset all internal state. + /// + /// This produce an encoder as if just constructed with `new` but taking slightly less work. In + /// particular it will not deallocate any internal allocations. It will also avoid some + /// duplicate setup work. + pub fn reset(&mut self) { + self.state.reset() + } +} + +#[cfg(feature = "std")] +impl<'d, W: Write> IntoStream<'d, W> { + /// Encode data from a reader. + /// + /// This will drain the supplied reader. It will not encode an end marker after all data has + /// been processed. + pub fn encode(&mut self, read: impl BufRead) -> StreamResult { + self.encode_part(read, false) + } + + /// Encode data from a reader and an end marker. + pub fn encode_all(mut self, read: impl BufRead) -> StreamResult { + self.encode_part(read, true) + } + + /// Set the size of the intermediate encode buffer. + /// + /// A buffer of this size is allocated to hold one part of the encoded stream when no buffer is + /// available and any encoding method is called. No buffer is allocated if `set_buffer` has + /// been called. The buffer is reused. + /// + /// # Panics + /// This method panics if `size` is `0`. + pub fn set_buffer_size(&mut self, size: usize) { + assert_ne!(size, 0, "Attempted to set empty buffer"); + self.default_size = size; + } + + /// Use a particular buffer as an intermediate encode buffer. + /// + /// Calling this sets or replaces the buffer. When a buffer has been set then it is used + /// instead of a dynamically allocating a buffer. Note that the size of the buffer is relevant + /// for efficient encoding as there is additional overhead from `write` calls each time the + /// buffer has been filled. + /// + /// # Panics + /// This method panics if the `buffer` is empty. 
+ pub fn set_buffer(&mut self, buffer: &'d mut [u8]) { + assert_ne!(buffer.len(), 0, "Attempted to set empty buffer"); + self.buffer = Some(StreamBuf::Borrowed(buffer)); + } + + fn encode_part(&mut self, mut read: impl BufRead, finish: bool) -> StreamResult { + let IntoStream { + encoder, + writer, + buffer, + default_size, + } = self; + enum Progress { + Ok, + Done, + } + + let mut bytes_read = 0; + let mut bytes_written = 0; + + let read_bytes = &mut bytes_read; + let write_bytes = &mut bytes_written; + + let outbuf: &mut [u8] = + match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } { + StreamBuf::Borrowed(slice) => &mut *slice, + StreamBuf::Owned(vec) => &mut *vec, + }; + assert!(!outbuf.is_empty()); + + let once = move || { + let data = read.fill_buf()?; + + if data.is_empty() { + if finish { + encoder.finish(); + } else { + return Ok(Progress::Done); + } + } + + let result = encoder.encode_bytes(data, &mut outbuf[..]); + *read_bytes += result.consumed_in; + *write_bytes += result.consumed_out; + read.consume(result.consumed_in); + + let done = result.status.map_err(|err| { + io::Error::new(io::ErrorKind::InvalidData, &*format!("{:?}", err)) + })?; + + if let LzwStatus::Done = done { + writer.write_all(&outbuf[..result.consumed_out])?; + return Ok(Progress::Done); + } + + if let LzwStatus::NoProgress = done { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "No more data but no end marker detected", + )); + } + + writer.write_all(&outbuf[..result.consumed_out])?; + Ok(Progress::Ok) + }; + + let status = core::iter::repeat_with(once) + // scan+fuse can be replaced with map_while + .scan((), |(), result| match result { + Ok(Progress::Ok) => Some(Ok(())), + Err(err) => Some(Err(err)), + Ok(Progress::Done) => None, + }) + .fuse() + .collect(); + + StreamResult { + bytes_read, + bytes_written, + status, + } + } +} + +impl IntoVec<'_> { + /// Encode data from a slice. + pub fn encode(&mut self, read: &[u8]) -> VectorResult { + self.encode_part(read, false) + } + + /// Decode data from a reader, adding an end marker. + pub fn encode_all(mut self, read: &[u8]) -> VectorResult { + self.encode_part(read, true) + } + + fn grab_buffer(&mut self) -> (&mut [u8], &mut Encoder) { + const CHUNK_SIZE: usize = 1 << 12; + let decoder = &mut self.encoder; + let length = self.vector.len(); + + // Use the vector to do overflow checks and w/e. + self.vector.reserve(CHUNK_SIZE); + // FIXME: encoding into uninit buffer? + self.vector.resize(length + CHUNK_SIZE, 0u8); + + (&mut self.vector[length..], decoder) + } + + fn encode_part(&mut self, part: &[u8], finish: bool) -> VectorResult { + let mut result = VectorResult { + consumed_in: 0, + consumed_out: 0, + status: Ok(LzwStatus::Ok), + }; + + enum Progress { + Ok, + Done, + } + + // Converting to mutable refs to move into the `once` closure. + let read_bytes = &mut result.consumed_in; + let write_bytes = &mut result.consumed_out; + let mut data = part; + + // A 64 MB buffer is quite large but should get alloc_zeroed. + // Note that the decoded size can be up to quadratic in code block. + let once = move || { + // Grab a new output buffer. + let (outbuf, encoder) = self.grab_buffer(); + + if finish { + encoder.finish(); + } + + // Decode as much of the buffer as fits. + let result = encoder.encode_bytes(data, &mut outbuf[..]); + // Do the bookkeeping and consume the buffer. 
+ *read_bytes += result.consumed_in; + *write_bytes += result.consumed_out; + data = &data[result.consumed_in..]; + + let unfilled = outbuf.len() - result.consumed_out; + let filled = self.vector.len() - unfilled; + self.vector.truncate(filled); + + // Handle the status in the result. + let done = result.status?; + if let LzwStatus::Done = done { + Ok(Progress::Done) + } else { + Ok(Progress::Ok) + } + }; + + // Decode chunks of input data until we're done. + let status: Result<(), _> = core::iter::repeat_with(once) + // scan+fuse can be replaced with map_while + .scan((), |(), result| match result { + Ok(Progress::Ok) => Some(Ok(())), + Err(err) => Some(Err(err)), + Ok(Progress::Done) => None, + }) + .fuse() + .collect(); + + if let Err(err) = status { + result.status = Err(err); + } + + result + } +} + +// This is implemented in a separate file, so that 1.34.2 does not parse it. Otherwise, it would +// trip over the usage of await, which is a reserved keyword in that edition/version. It only +// contains an impl block. +#[cfg(feature = "async")] +#[path = "encode_into_async.rs"] +mod impl_encode_into_async; + +impl<B: Buffer> EncodeState<B> { + fn new(min_size: u8) -> Self { + let clear_code = 1 << min_size; + let mut tree = Tree::default(); + tree.init(min_size); + let mut state = EncodeState { + min_size, + tree, + has_ended: false, + is_tiff: false, + current_code: clear_code, + clear_code, + buffer: B::new(min_size), + }; + state.buffer_code(clear_code); + state + } +} + +impl<B: Buffer> Stateful for EncodeState<B> { + fn advance(&mut self, mut inp: &[u8], mut out: &mut [u8]) -> BufferResult { + let c_in = inp.len(); + let c_out = out.len(); + let mut status = Ok(LzwStatus::Ok); + + 'encoding: loop { + if self.push_out(&mut out) { + break; + } + + if inp.is_empty() && self.has_ended { + let end = self.end_code(); + if self.current_code != end { + if self.current_code != self.clear_code { + self.buffer_code(self.current_code); + + // When reading this code, the decoder will add an extra entry to its table + // before reading th end code. Thusly, it may increase its code size based + // on this additional entry. + if self.tree.keys.len() + usize::from(self.is_tiff) + > usize::from(self.buffer.max_code()) + && self.buffer.code_size() < MAX_CODESIZE + { + self.buffer.bump_code_size(); + } + } + self.buffer_code(end); + self.current_code = end; + self.buffer_pad(); + } + + break; + } + + let mut next_code = None; + let mut bytes = inp.iter(); + while let Some(&byte) = bytes.next() { + if self.min_size < 8 && byte >= 1 << self.min_size { + status = Err(LzwError::InvalidCode); + break 'encoding; + } + + inp = bytes.as_slice(); + match self.tree.iterate(self.current_code, byte) { + Ok(code) => self.current_code = code, + Err(_) => { + next_code = Some(self.current_code); + + self.current_code = u16::from(byte); + break; + } + } + } + + match next_code { + // No more bytes, no code produced. 
+ None => break, + Some(code) => { + self.buffer_code(code); + + if self.tree.keys.len() + usize::from(self.is_tiff) + > usize::from(self.buffer.max_code()) + 1 + && self.buffer.code_size() < MAX_CODESIZE + { + self.buffer.bump_code_size(); + } + + if self.tree.keys.len() > MAX_ENTRIES { + self.buffer_code(self.clear_code); + self.tree.reset(self.min_size); + self.buffer.clear(self.min_size); + } + } + } + } + + if inp.is_empty() && self.current_code == self.end_code() { + if !self.flush_out(&mut out) { + status = Ok(LzwStatus::Done); + } + } + + BufferResult { + consumed_in: c_in - inp.len(), + consumed_out: c_out - out.len(), + status, + } + } + + fn mark_ended(&mut self) -> bool { + core::mem::replace(&mut self.has_ended, true) + } + + fn restart(&mut self) { + self.has_ended = false; + } + + fn reset(&mut self) { + self.restart(); + self.current_code = self.clear_code; + self.tree.reset(self.min_size); + self.buffer.reset(self.min_size); + self.buffer_code(self.clear_code); + } +} + +impl<B: Buffer> EncodeState<B> { + fn push_out(&mut self, out: &mut &mut [u8]) -> bool { + self.buffer.push_out(out) + } + + fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { + self.buffer.flush_out(out) + } + + fn end_code(&self) -> Code { + self.clear_code + 1 + } + + fn buffer_pad(&mut self) { + self.buffer.buffer_pad(); + } + + fn buffer_code(&mut self, code: Code) { + self.buffer.buffer_code(code); + } +} + +impl Buffer for MsbBuffer { + fn new(min_size: u8) -> Self { + MsbBuffer { + code_size: min_size + 1, + buffer: 0, + bits_in_buffer: 0, + } + } + + fn reset(&mut self, min_size: u8) { + self.code_size = min_size + 1; + self.buffer = 0; + self.bits_in_buffer = 0; + } + + fn clear(&mut self, min_size: u8) { + self.code_size = min_size + 1; + } + + fn buffer_code(&mut self, code: Code) { + let shift = 64 - self.bits_in_buffer - self.code_size; + self.buffer |= u64::from(code) << shift; + self.bits_in_buffer += self.code_size; + } + + fn push_out(&mut self, out: &mut &mut [u8]) -> bool { + if self.bits_in_buffer + 2 * self.code_size < 64 { + return false; + } + + self.flush_out(out) + } + + fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { + let want = usize::from(self.bits_in_buffer / 8); + let count = want.min((*out).len()); + let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count); + *out = tail; + + for b in bytes { + *b = ((self.buffer & 0xff00_0000_0000_0000) >> 56) as u8; + self.buffer <<= 8; + self.bits_in_buffer -= 8; + } + + count < want + } + + fn buffer_pad(&mut self) { + let to_byte = self.bits_in_buffer.wrapping_neg() & 0x7; + self.bits_in_buffer += to_byte; + } + + fn bump_code_size(&mut self) { + self.code_size += 1; + } + + fn max_code(&self) -> Code { + (1 << self.code_size) - 1 + } + + fn code_size(&self) -> u8 { + self.code_size + } +} + +impl Buffer for LsbBuffer { + fn new(min_size: u8) -> Self { + LsbBuffer { + code_size: min_size + 1, + buffer: 0, + bits_in_buffer: 0, + } + } + + fn reset(&mut self, min_size: u8) { + self.code_size = min_size + 1; + self.buffer = 0; + self.bits_in_buffer = 0; + } + + fn clear(&mut self, min_size: u8) { + self.code_size = min_size + 1; + } + + fn buffer_code(&mut self, code: Code) { + self.buffer |= u64::from(code) << self.bits_in_buffer; + self.bits_in_buffer += self.code_size; + } + + fn push_out(&mut self, out: &mut &mut [u8]) -> bool { + if self.bits_in_buffer + 2 * self.code_size < 64 { + return false; + } + + self.flush_out(out) + } + + fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { + let want = 
usize::from(self.bits_in_buffer / 8); + let count = want.min((*out).len()); + let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count); + *out = tail; + + for b in bytes { + *b = (self.buffer & 0x0000_0000_0000_00ff) as u8; + self.buffer >>= 8; + self.bits_in_buffer -= 8; + } + + count < want + } + + fn buffer_pad(&mut self) { + let to_byte = self.bits_in_buffer.wrapping_neg() & 0x7; + self.bits_in_buffer += to_byte; + } + + fn bump_code_size(&mut self) { + self.code_size += 1; + } + + fn max_code(&self) -> Code { + (1 << self.code_size) - 1 + } + + fn code_size(&self) -> u8 { + self.code_size + } +} + +impl Tree { + fn init(&mut self, min_size: u8) { + // We need a way to represent the state of a currently empty buffer. We use the clear code + // for this, thus create one complex mapping that leads to the one-char base codes. + self.keys + .resize((1 << min_size) + 2, FullKey::NoSuccessor.into()); + self.complex.push(Full { + char_continuation: [0; 256], + }); + let map_of_begin = self.complex.last_mut().unwrap(); + for ch in 0u16..256 { + map_of_begin.char_continuation[usize::from(ch)] = ch; + } + self.keys[1 << min_size] = FullKey::Full(0).into(); + } + + fn reset(&mut self, min_size: u8) { + self.simples.clear(); + self.keys.truncate((1 << min_size) + 2); + // Keep entry for clear code. + self.complex.truncate(1); + // The first complex is not changed.. + for k in self.keys[..(1 << min_size) + 2].iter_mut() { + *k = FullKey::NoSuccessor.into(); + } + self.keys[1 << min_size] = FullKey::Full(0).into(); + } + + fn at_key(&self, code: Code, ch: u8) -> Option<Code> { + let key = self.keys[usize::from(code)]; + match FullKey::from(key) { + FullKey::NoSuccessor => None, + FullKey::Simple(idx) => { + let nexts = &self.simples[usize::from(idx)]; + let successors = nexts + .codes + .iter() + .zip(nexts.chars.iter()) + .take(usize::from(nexts.count)); + for (&scode, &sch) in successors { + if sch == ch { + return Some(scode); + } + } + + None + } + FullKey::Full(idx) => { + let full = &self.complex[usize::from(idx)]; + let precode = full.char_continuation[usize::from(ch)]; + if usize::from(precode) < MAX_ENTRIES { + Some(precode) + } else { + None + } + } + } + } + + /// Iterate to the next char. + /// Return Ok when it was already in the tree or creates a new entry for it and returns Err. 
+ fn iterate(&mut self, code: Code, ch: u8) -> Result<Code, Code> { + if let Some(next) = self.at_key(code, ch) { + Ok(next) + } else { + Err(self.append(code, ch)) + } + } + + fn append(&mut self, code: Code, ch: u8) -> Code { + let next: Code = self.keys.len() as u16; + let key = self.keys[usize::from(code)]; + // TODO: with debug assertions, check for non-existence + match FullKey::from(key) { + FullKey::NoSuccessor => { + let new_key = FullKey::Simple(self.simples.len() as u16); + self.simples.push(Simple::default()); + let simples = self.simples.last_mut().unwrap(); + simples.codes[0] = next; + simples.chars[0] = ch; + simples.count = 1; + self.keys[usize::from(code)] = new_key.into(); + } + FullKey::Simple(idx) if usize::from(self.simples[usize::from(idx)].count) < SHORT => { + let nexts = &mut self.simples[usize::from(idx)]; + let nidx = usize::from(nexts.count); + nexts.chars[nidx] = ch; + nexts.codes[nidx] = next; + nexts.count += 1; + } + FullKey::Simple(idx) => { + let new_key = FullKey::Full(self.complex.len() as u16); + let simples = &self.simples[usize::from(idx)]; + self.complex.push(Full { + char_continuation: [Code::max_value(); 256], + }); + let full = self.complex.last_mut().unwrap(); + for (&pch, &pcont) in simples.chars.iter().zip(simples.codes.iter()) { + full.char_continuation[usize::from(pch)] = pcont; + } + self.keys[usize::from(code)] = new_key.into(); + } + FullKey::Full(idx) => { + let full = &mut self.complex[usize::from(idx)]; + full.char_continuation[usize::from(ch)] = next; + } + } + self.keys.push(FullKey::NoSuccessor.into()); + next + } +} + +impl Default for FullKey { + fn default() -> Self { + FullKey::NoSuccessor + } +} + +impl Default for Simple { + fn default() -> Self { + Simple { + codes: [0; SHORT], + chars: [0; SHORT], + count: 0, + } + } +} + +impl From<CompressedKey> for FullKey { + fn from(CompressedKey(key): CompressedKey) -> Self { + match (key >> MAX_CODESIZE) & 0xf { + 0 => FullKey::Full(key & 0xfff), + 1 => FullKey::Simple(key & 0xfff), + _ => FullKey::NoSuccessor, + } + } +} + +impl From<FullKey> for CompressedKey { + fn from(full: FullKey) -> Self { + CompressedKey(match full { + FullKey::NoSuccessor => 0x2000, + FullKey::Simple(code) => 0x1000 | code, + FullKey::Full(code) => code, + }) + } +} + +#[cfg(test)] +mod tests { + use super::{BitOrder, Encoder, LzwError, LzwStatus}; + use crate::alloc::vec::Vec; + use crate::decode::Decoder; + #[cfg(feature = "std")] + use crate::StreamBuf; + + #[test] + fn invalid_input_rejected() { + const BIT_LEN: u8 = 2; + let ref input = [0, 1 << BIT_LEN /* invalid */, 0]; + let ref mut target = [0u8; 128]; + let mut encoder = Encoder::new(BitOrder::Msb, BIT_LEN); + + encoder.finish(); + // We require simulation of normality, that is byte-for-byte compression. + let result = encoder.encode_bytes(input, target); + assert!(if let Err(LzwError::InvalidCode) = result.status { + true + } else { + false + }); + assert_eq!(result.consumed_in, 1); + + let fixed = encoder.encode_bytes(&[1, 0], &mut target[result.consumed_out..]); + assert!(if let Ok(LzwStatus::Done) = fixed.status { + true + } else { + false + }); + assert_eq!(fixed.consumed_in, 2); + + // Okay, now test we actually fixed it. + let ref mut compare = [0u8; 4]; + let mut todo = &target[..result.consumed_out + fixed.consumed_out]; + let mut free = &mut compare[..]; + let mut decoder = Decoder::new(BitOrder::Msb, BIT_LEN); + + // Decode with up to 16 rounds, far too much but inconsequential. 
+ for _ in 0..16 { + if decoder.has_ended() { + break; + } + + let result = decoder.decode_bytes(todo, free); + assert!(result.status.is_ok()); + todo = &todo[result.consumed_in..]; + free = &mut free[result.consumed_out..]; + } + + let remaining = { free }.len(); + let len = compare.len() - remaining; + assert_eq!(todo, &[]); + assert_eq!(compare[..len], [0, 1, 0]); + } + + #[test] + #[should_panic] + fn invalid_code_size_low() { + let _ = Encoder::new(BitOrder::Msb, 1); + } + + #[test] + #[should_panic] + fn invalid_code_size_high() { + let _ = Encoder::new(BitOrder::Msb, 14); + } + + fn make_decoded() -> Vec<u8> { + const FILE: &'static [u8] = + include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/Cargo.lock")); + return Vec::from(FILE); + } + + #[test] + #[cfg(feature = "std")] + fn into_stream_buffer_no_alloc() { + let encoded = make_decoded(); + let mut encoder = Encoder::new(BitOrder::Msb, 8); + + let mut output = vec![]; + let mut buffer = [0; 512]; + let mut istream = encoder.into_stream(&mut output); + istream.set_buffer(&mut buffer[..]); + istream.encode(&encoded[..]).status.unwrap(); + + match istream.buffer { + Some(StreamBuf::Borrowed(_)) => {} + None => panic!("Decoded without buffer??"), + Some(StreamBuf::Owned(_)) => panic!("Unexpected buffer allocation"), + } + } + + #[test] + #[cfg(feature = "std")] + fn into_stream_buffer_small_alloc() { + struct WriteTap<W: std::io::Write>(W); + const BUF_SIZE: usize = 512; + + impl<W: std::io::Write> std::io::Write for WriteTap<W> { + fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> { + assert!(buf.len() <= BUF_SIZE); + self.0.write(buf) + } + fn flush(&mut self) -> std::io::Result<()> { + self.0.flush() + } + } + + let encoded = make_decoded(); + let mut encoder = Encoder::new(BitOrder::Msb, 8); + + let mut output = vec![]; + let mut istream = encoder.into_stream(WriteTap(&mut output)); + istream.set_buffer_size(512); + istream.encode(&encoded[..]).status.unwrap(); + + match istream.buffer { + Some(StreamBuf::Owned(vec)) => assert!(vec.len() <= BUF_SIZE), + Some(StreamBuf::Borrowed(_)) => panic!("Unexpected borrowed buffer, where from?"), + None => panic!("Decoded without buffer??"), + } + } + + #[test] + #[cfg(feature = "std")] + fn reset() { + let encoded = make_decoded(); + let mut encoder = Encoder::new(BitOrder::Msb, 8); + let mut reference = None; + + for _ in 0..2 { + let mut output = vec![]; + let mut buffer = [0; 512]; + let mut istream = encoder.into_stream(&mut output); + istream.set_buffer(&mut buffer[..]); + istream.encode_all(&encoded[..]).status.unwrap(); + + encoder.reset(); + if let Some(reference) = &reference { + assert_eq!(output, *reference); + } else { + reference = Some(output); + } + } + } +} |
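The doc comments in the diff above describe two main ways to drive this encoder: the one-shot `encode` convenience method and the `into_stream` adapter over `std::io::Write`. As a quick reference, here is a small, self-contained usage sketch assembled only from APIs shown in this file (`Encoder::new`, `encode`, `into_stream`, `encode_all`); it assumes the crate's default `std` feature is enabled and is an illustration, not part of the vendored sources.

```rust
use weezl::{encode::Encoder, BitOrder};

fn main() {
    let data = b"Hello, world";

    // One-shot convenience API: allocates the output vector internally and
    // appends an end marker, as documented on `Encoder::encode`.
    let compressed = Encoder::new(BitOrder::Msb, 9)
        .encode(data)
        .expect("all bytes are valid for a 9-bit code size");

    // Streaming API: the same sans-IO state machine driven through any
    // `std::io::Write` sink; `&[u8]` implements `BufRead`, so the input
    // slice can serve directly as the reader.
    let mut encoder = Encoder::new(BitOrder::Msb, 9);
    let mut streamed = Vec::new();
    encoder
        .into_stream(&mut streamed)
        .encode_all(&data[..])
        .status
        .expect("writing into a Vec<u8> does not fail");

    // Both paths run the same encoder state machine from a fresh state,
    // so their outputs are expected to match.
    assert_eq!(compressed, streamed);
}
```

For incremental use without the adapters, `encode_bytes` plus an explicit `finish` call covers the same ground, as the sans-IO description in the module documentation above explains.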