diff options
Diffstat (limited to 'vendor/weezl/src')
-rw-r--r-- | vendor/weezl/src/decode.rs | 1333 | ||||
-rw-r--r-- | vendor/weezl/src/decode_into_async.rs | 143 | ||||
-rw-r--r-- | vendor/weezl/src/encode.rs | 1126 | ||||
-rw-r--r-- | vendor/weezl/src/encode_into_async.rs | 142 | ||||
-rw-r--r-- | vendor/weezl/src/error.rs | 72 | ||||
-rw-r--r-- | vendor/weezl/src/lib.rs | 146 |
6 files changed, 2962 insertions, 0 deletions
diff --git a/vendor/weezl/src/decode.rs b/vendor/weezl/src/decode.rs new file mode 100644 index 0000000..641a3a8 --- /dev/null +++ b/vendor/weezl/src/decode.rs @@ -0,0 +1,1333 @@ +//! A module for all decoding needs. +#[cfg(feature = "std")] +use crate::error::StreamResult; +use crate::error::{BufferResult, LzwError, LzwStatus, VectorResult}; +use crate::{BitOrder, Code, StreamBuf, MAX_CODESIZE, MAX_ENTRIES, STREAM_BUF_SIZE}; + +use crate::alloc::{boxed::Box, vec, vec::Vec}; +#[cfg(feature = "std")] +use std::io::{self, BufRead, Write}; + +/// The state for decoding data with an LZW algorithm. +/// +/// The same structure can be utilized with streams as well as your own buffers and driver logic. +/// It may even be possible to mix them if you are sufficiently careful not to lose or skip any +/// already decode data in the process. +/// +/// This is a sans-IO implementation, meaning that it only contains the state of the decoder and +/// the caller will provide buffers for input and output data when calling the basic +/// [`decode_bytes`] method. Nevertheless, a number of _adapters_ are provided in the `into_*` +/// methods for decoding with a particular style of common IO. +/// +/// * [`decode`] for decoding once without any IO-loop. +/// * [`into_async`] for decoding with the `futures` traits for asynchronous IO. +/// * [`into_stream`] for decoding with the standard `io` traits. +/// * [`into_vec`] for in-memory decoding. +/// +/// [`decode_bytes`]: #method.decode_bytes +/// [`decode`]: #method.decode +/// [`into_async`]: #method.into_async +/// [`into_stream`]: #method.into_stream +/// [`into_vec`]: #method.into_vec +pub struct Decoder { + state: Box<dyn Stateful + Send + 'static>, +} + +/// A decoding stream sink. +/// +/// See [`Decoder::into_stream`] on how to create this type. +/// +/// [`Decoder::into_stream`]: struct.Decoder.html#method.into_stream +#[cfg_attr( + not(feature = "std"), + deprecated = "This type is only useful with the `std` feature." +)] +#[cfg_attr(not(feature = "std"), allow(dead_code))] +pub struct IntoStream<'d, W> { + decoder: &'d mut Decoder, + writer: W, + buffer: Option<StreamBuf<'d>>, + default_size: usize, +} + +/// An async decoding sink. +/// +/// See [`Decoder::into_async`] on how to create this type. +/// +/// [`Decoder::into_async`]: struct.Decoder.html#method.into_async +#[cfg(feature = "async")] +pub struct IntoAsync<'d, W> { + decoder: &'d mut Decoder, + writer: W, + buffer: Option<StreamBuf<'d>>, + default_size: usize, +} + +/// A decoding sink into a vector. +/// +/// See [`Decoder::into_vec`] on how to create this type. +/// +/// [`Decoder::into_vec`]: struct.Decoder.html#method.into_vec +pub struct IntoVec<'d> { + decoder: &'d mut Decoder, + vector: &'d mut Vec<u8>, +} + +trait Stateful { + fn advance(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult; + fn has_ended(&self) -> bool; + /// Ignore an end code and continue decoding (no implied reset). + fn restart(&mut self); + /// Reset the decoder to the beginning, dropping all buffers etc. + fn reset(&mut self); +} + +#[derive(Clone)] +struct Link { + prev: Code, + byte: u8, +} + +#[derive(Default)] +struct MsbBuffer { + /// A buffer of individual bits. The oldest code is kept in the high-order bits. + bit_buffer: u64, + /// A precomputed mask for this code. + code_mask: u16, + /// The current code size. + code_size: u8, + /// The number of bits in the buffer. + bits: u8, +} + +#[derive(Default)] +struct LsbBuffer { + /// A buffer of individual bits. The oldest code is kept in the high-order bits. + bit_buffer: u64, + /// A precomputed mask for this code. + code_mask: u16, + /// The current code size. + code_size: u8, + /// The number of bits in the buffer. + bits: u8, +} + +trait CodeBuffer { + fn new(min_size: u8) -> Self; + fn reset(&mut self, min_size: u8); + fn bump_code_size(&mut self); + /// Retrieve the next symbol, refilling if necessary. + fn next_symbol(&mut self, inp: &mut &[u8]) -> Option<Code>; + /// Refill the internal buffer. + fn refill_bits(&mut self, inp: &mut &[u8]); + /// Get the next buffered code word. + fn get_bits(&mut self) -> Option<Code>; + fn max_code(&self) -> Code; + fn code_size(&self) -> u8; +} + +struct DecodeState<CodeBuffer> { + /// The original minimum code size. + min_size: u8, + /// The table of decoded codes. + table: Table, + /// The buffer of decoded data. + buffer: Buffer, + /// The link which we are still decoding and its original code. + last: Option<(Code, Link)>, + /// The next code entry. + next_code: Code, + /// Code to reset all tables. + clear_code: Code, + /// Code to signal the end of the stream. + end_code: Code, + /// A stored flag if the end code has already appeared. + has_ended: bool, + /// If tiff then bumps are a single code sooner. + is_tiff: bool, + /// Do we allow stream to start without an explicit reset code? + implicit_reset: bool, + /// The buffer for decoded words. + code_buffer: CodeBuffer, +} + +struct Buffer { + bytes: Box<[u8]>, + read_mark: usize, + write_mark: usize, +} + +struct Table { + inner: Vec<Link>, + depths: Vec<u16>, +} + +impl Decoder { + /// Create a new decoder with the specified bit order and symbol size. + /// + /// The algorithm for dynamically increasing the code symbol bit width is compatible with the + /// original specification. In particular you will need to specify an `Lsb` bit oder to decode + /// the data portion of a compressed `gif` image. + /// + /// # Panics + /// + /// The `size` needs to be in the interval `0..=12`. + pub fn new(order: BitOrder, size: u8) -> Self { + type Boxed = Box<dyn Stateful + Send + 'static>; + super::assert_decode_size(size); + let state = match order { + BitOrder::Lsb => Box::new(DecodeState::<LsbBuffer>::new(size)) as Boxed, + BitOrder::Msb => Box::new(DecodeState::<MsbBuffer>::new(size)) as Boxed, + }; + + Decoder { state } + } + + /// Create a TIFF compatible decoder with the specified bit order and symbol size. + /// + /// The algorithm for dynamically increasing the code symbol bit width is compatible with the + /// TIFF specification, which is a misinterpretation of the original algorithm for increasing + /// the code size. It switches one symbol sooner. + /// + /// # Panics + /// + /// The `size` needs to be in the interval `0..=12`. + pub fn with_tiff_size_switch(order: BitOrder, size: u8) -> Self { + type Boxed = Box<dyn Stateful + Send + 'static>; + super::assert_decode_size(size); + let state = match order { + BitOrder::Lsb => { + let mut state = Box::new(DecodeState::<LsbBuffer>::new(size)); + state.is_tiff = true; + state as Boxed + } + BitOrder::Msb => { + let mut state = Box::new(DecodeState::<MsbBuffer>::new(size)); + state.is_tiff = true; + state as Boxed + } + }; + + Decoder { state } + } + + /// Decode some bytes from `inp` and write result to `out`. + /// + /// This will consume a prefix of the input buffer and write decoded output into a prefix of + /// the output buffer. See the respective fields of the return value for the count of consumed + /// and written bytes. For the next call You should have adjusted the inputs accordingly. + /// + /// The call will try to decode and write as many bytes of output as available. It will be + /// much more optimized (and avoid intermediate buffering) if it is allowed to write a large + /// contiguous chunk at once. + /// + /// See [`into_stream`] for high-level functions (that are only available with the `std` + /// feature). + /// + /// [`into_stream`]: #method.into_stream + pub fn decode_bytes(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult { + self.state.advance(inp, out) + } + + /// Decode a single chunk of lzw encoded data. + /// + /// This method requires the data to contain an end marker, and returns an error otherwise. + /// + /// This is a convenience wrapper around [`into_vec`]. Use the `into_vec` adapter to customize + /// buffer size, to supply an existing vector, to control whether an end marker is required, or + /// to preserve partial data in the case of a decoding error. + /// + /// [`into_vec`]: #into_vec + /// + /// # Example + /// + /// ``` + /// use weezl::{BitOrder, decode::Decoder}; + /// + /// // Encoded that was created with an encoder. + /// let data = b"\x80\x04\x81\x94l\x1b\x06\xf0\xb0 \x1d\xc6\xf1\xc8l\x19 \x10"; + /// let decoded = Decoder::new(BitOrder::Msb, 9) + /// .decode(data) + /// .unwrap(); + /// assert_eq!(decoded, b"Hello, world"); + /// ``` + pub fn decode(&mut self, data: &[u8]) -> Result<Vec<u8>, LzwError> { + let mut output = vec![]; + self.into_vec(&mut output).decode_all(data).status?; + Ok(output) + } + + /// Construct a decoder into a writer. + #[cfg(feature = "std")] + pub fn into_stream<W: Write>(&mut self, writer: W) -> IntoStream<'_, W> { + IntoStream { + decoder: self, + writer, + buffer: None, + default_size: STREAM_BUF_SIZE, + } + } + + /// Construct a decoder into an async writer. + #[cfg(feature = "async")] + pub fn into_async<W: futures::io::AsyncWrite>(&mut self, writer: W) -> IntoAsync<'_, W> { + IntoAsync { + decoder: self, + writer, + buffer: None, + default_size: STREAM_BUF_SIZE, + } + } + + /// Construct a decoder into a vector. + /// + /// All decoded data is appended and the vector is __not__ cleared. + /// + /// Compared to `into_stream` this interface allows a high-level access to decoding without + /// requires the `std`-feature. Also, it can make full use of the extra buffer control that the + /// special target exposes. + pub fn into_vec<'lt>(&'lt mut self, vec: &'lt mut Vec<u8>) -> IntoVec<'lt> { + IntoVec { + decoder: self, + vector: vec, + } + } + + /// Check if the decoding has finished. + /// + /// No more output is produced beyond the end code that marked the finish of the stream. The + /// decoder may have read additional bytes, including padding bits beyond the last code word + /// but also excess bytes provided. + pub fn has_ended(&self) -> bool { + self.state.has_ended() + } + + /// Ignore an end code and continue. + /// + /// This will _not_ reset any of the inner code tables and not have the effect of a clear code. + /// It will instead continue as if the end code had not been present. If no end code has + /// occurred then this is a no-op. + /// + /// You can test if an end code has occurred with [`has_ended`](#method.has_ended). + /// FIXME: clarify how this interacts with padding introduced after end code. + #[allow(dead_code)] + pub(crate) fn restart(&mut self) { + self.state.restart(); + } + + /// Reset all internal state. + /// + /// This produce a decoder as if just constructed with `new` but taking slightly less work. In + /// particular it will not deallocate any internal allocations. It will also avoid some + /// duplicate setup work. + pub fn reset(&mut self) { + self.state.reset(); + } +} + +#[cfg(feature = "std")] +impl<'d, W: Write> IntoStream<'d, W> { + /// Decode data from a reader. + /// + /// This will read data until the stream is empty or an end marker is reached. + pub fn decode(&mut self, read: impl BufRead) -> StreamResult { + self.decode_part(read, false) + } + + /// Decode data from a reader, requiring an end marker. + pub fn decode_all(mut self, read: impl BufRead) -> StreamResult { + self.decode_part(read, true) + } + + /// Set the size of the intermediate decode buffer. + /// + /// A buffer of this size is allocated to hold one part of the decoded stream when no buffer is + /// available and any decoding method is called. No buffer is allocated if `set_buffer` has + /// been called. The buffer is reused. + /// + /// # Panics + /// This method panics if `size` is `0`. + pub fn set_buffer_size(&mut self, size: usize) { + assert_ne!(size, 0, "Attempted to set empty buffer"); + self.default_size = size; + } + + /// Use a particular buffer as an intermediate decode buffer. + /// + /// Calling this sets or replaces the buffer. When a buffer has been set then it is used + /// instead of dynamically allocating a buffer. Note that the size of the buffer is critical + /// for efficient decoding. Some optimization techniques require the buffer to hold one or more + /// previous decoded words. There is also additional overhead from `write` calls each time the + /// buffer has been filled. + /// + /// # Panics + /// This method panics if the `buffer` is empty. + pub fn set_buffer(&mut self, buffer: &'d mut [u8]) { + assert_ne!(buffer.len(), 0, "Attempted to set empty buffer"); + self.buffer = Some(StreamBuf::Borrowed(buffer)); + } + + fn decode_part(&mut self, mut read: impl BufRead, must_finish: bool) -> StreamResult { + let IntoStream { + decoder, + writer, + buffer, + default_size, + } = self; + + enum Progress { + Ok, + Done, + } + + let mut bytes_read = 0; + let mut bytes_written = 0; + + // Converting to mutable refs to move into the `once` closure. + let read_bytes = &mut bytes_read; + let write_bytes = &mut bytes_written; + + let outbuf: &mut [u8] = + match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } { + StreamBuf::Borrowed(slice) => &mut *slice, + StreamBuf::Owned(vec) => &mut *vec, + }; + assert!(!outbuf.is_empty()); + + let once = move || { + // Try to grab one buffer of input data. + let data = read.fill_buf()?; + + // Decode as much of the buffer as fits. + let result = decoder.decode_bytes(data, &mut outbuf[..]); + // Do the bookkeeping and consume the buffer. + *read_bytes += result.consumed_in; + *write_bytes += result.consumed_out; + read.consume(result.consumed_in); + + // Handle the status in the result. + let done = result.status.map_err(|err| { + io::Error::new(io::ErrorKind::InvalidData, &*format!("{:?}", err)) + })?; + + // Check if we had any new data at all. + if let LzwStatus::NoProgress = done { + debug_assert_eq!( + result.consumed_out, 0, + "No progress means we have not decoded any data" + ); + // In particular we did not finish decoding. + if must_finish { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "No more data but no end marker detected", + )); + } else { + return Ok(Progress::Done); + } + } + + // And finish by writing our result. + // TODO: we may lose data on error (also on status error above) which we might want to + // deterministically handle so that we don't need to restart everything from scratch as + // the only recovery strategy. Any changes welcome. + writer.write_all(&outbuf[..result.consumed_out])?; + + Ok(if let LzwStatus::Done = done { + Progress::Done + } else { + Progress::Ok + }) + }; + + // Decode chunks of input data until we're done. + let status = core::iter::repeat_with(once) + // scan+fuse can be replaced with map_while + .scan((), |(), result| match result { + Ok(Progress::Ok) => Some(Ok(())), + Err(err) => Some(Err(err)), + Ok(Progress::Done) => None, + }) + .fuse() + .collect(); + + StreamResult { + bytes_read, + bytes_written, + status, + } + } +} + +impl IntoVec<'_> { + /// Decode data from a slice. + /// + /// This will read data until the slice is empty or an end marker is reached. + pub fn decode(&mut self, read: &[u8]) -> VectorResult { + self.decode_part(read, false) + } + + /// Decode data from a slice, requiring an end marker. + pub fn decode_all(mut self, read: &[u8]) -> VectorResult { + self.decode_part(read, true) + } + + fn grab_buffer(&mut self) -> (&mut [u8], &mut Decoder) { + const CHUNK_SIZE: usize = 1 << 12; + let decoder = &mut self.decoder; + let length = self.vector.len(); + + // Use the vector to do overflow checks and w/e. + self.vector.reserve(CHUNK_SIZE); + // FIXME: decoding into uninit buffer? + self.vector.resize(length + CHUNK_SIZE, 0u8); + + (&mut self.vector[length..], decoder) + } + + fn decode_part(&mut self, part: &[u8], must_finish: bool) -> VectorResult { + let mut result = VectorResult { + consumed_in: 0, + consumed_out: 0, + status: Ok(LzwStatus::Ok), + }; + + enum Progress { + Ok, + Done, + } + + // Converting to mutable refs to move into the `once` closure. + let read_bytes = &mut result.consumed_in; + let write_bytes = &mut result.consumed_out; + let mut data = part; + + // A 64 MB buffer is quite large but should get alloc_zeroed. + // Note that the decoded size can be up to quadratic in code block. + let once = move || { + // Grab a new output buffer. + let (outbuf, decoder) = self.grab_buffer(); + + // Decode as much of the buffer as fits. + let result = decoder.decode_bytes(data, &mut outbuf[..]); + // Do the bookkeeping and consume the buffer. + *read_bytes += result.consumed_in; + *write_bytes += result.consumed_out; + data = &data[result.consumed_in..]; + + let unfilled = outbuf.len() - result.consumed_out; + let filled = self.vector.len() - unfilled; + self.vector.truncate(filled); + + // Handle the status in the result. + match result.status { + Err(err) => Err(err), + Ok(LzwStatus::NoProgress) if must_finish => Err(LzwError::InvalidCode), + Ok(LzwStatus::NoProgress) | Ok(LzwStatus::Done) => Ok(Progress::Done), + Ok(LzwStatus::Ok) => Ok(Progress::Ok), + } + }; + + // Decode chunks of input data until we're done. + let status: Result<(), _> = core::iter::repeat_with(once) + // scan+fuse can be replaced with map_while + .scan((), |(), result| match result { + Ok(Progress::Ok) => Some(Ok(())), + Err(err) => Some(Err(err)), + Ok(Progress::Done) => None, + }) + .fuse() + .collect(); + + if let Err(err) = status { + result.status = Err(err); + } + + result + } +} + +// This is implemented in a separate file, so that 1.34.2 does not parse it. Otherwise, it would +// trip over the usage of await, which is a reserved keyword in that edition/version. It only +// contains an impl block. +#[cfg(feature = "async")] +#[path = "decode_into_async.rs"] +mod impl_decode_into_async; + +impl<C: CodeBuffer> DecodeState<C> { + fn new(min_size: u8) -> Self { + DecodeState { + min_size, + table: Table::new(), + buffer: Buffer::new(), + last: None, + clear_code: 1 << min_size, + end_code: (1 << min_size) + 1, + next_code: (1 << min_size) + 2, + has_ended: false, + is_tiff: false, + implicit_reset: true, + code_buffer: CodeBuffer::new(min_size), + } + } + + fn init_tables(&mut self) { + self.code_buffer.reset(self.min_size); + self.next_code = (1 << self.min_size) + 2; + self.table.init(self.min_size); + } + + fn reset_tables(&mut self) { + self.code_buffer.reset(self.min_size); + self.next_code = (1 << self.min_size) + 2; + self.table.clear(self.min_size); + } +} + +impl<C: CodeBuffer> Stateful for DecodeState<C> { + fn has_ended(&self) -> bool { + self.has_ended + } + + fn restart(&mut self) { + self.has_ended = false; + } + + fn reset(&mut self) { + self.table.init(self.min_size); + self.buffer.read_mark = 0; + self.buffer.write_mark = 0; + self.last = None; + self.restart(); + self.code_buffer = CodeBuffer::new(self.min_size); + } + + fn advance(&mut self, mut inp: &[u8], mut out: &mut [u8]) -> BufferResult { + // Skip everything if there is nothing to do. + if self.has_ended { + return BufferResult { + consumed_in: 0, + consumed_out: 0, + status: Ok(LzwStatus::Done), + }; + } + + // Rough description: + // We will fill the output slice as much as possible until either there is no more symbols + // to decode or an end code has been reached. This requires an internal buffer to hold a + // potential tail of the word corresponding to the last symbol. This tail will then be + // decoded first before continuing with the regular decoding. The same buffer is required + // to persist some symbol state across calls. + // + // We store the words corresponding to code symbols in an index chain, bytewise, where we + // push each decoded symbol. (TODO: wuffs shows some success with 8-byte units). This chain + // is traversed for each symbol when it is decoded and bytes are placed directly into the + // output slice. In the special case (new_code == next_code) we use an existing decoded + // version that is present in either the out bytes of this call or in buffer to copy the + // repeated prefix slice. + // TODO: I played with a 'decoding cache' to remember the position of long symbols and + // avoid traversing the chain, doing a copy of memory instead. It did however not lead to + // a serious improvement. It's just unlikely to both have a long symbol and have that + // repeated twice in the same output buffer. + // + // You will also find the (to my knowledge novel) concept of a _decoding burst_ which + // gained some >~10% speedup in tests. This is motivated by wanting to use out-of-order + // execution as much as possible and for this reason have the least possible stress on + // branch prediction. Our decoding table already gives us a lookahead on symbol lengths but + // only for re-used codes, not novel ones. This lookahead also makes the loop termination + // when restoring each byte of the code word perfectly predictable! So a burst is a chunk + // of code words which are all independent of each other, have known lengths _and_ are + // guaranteed to fit into the out slice without requiring a buffer. One burst can be + // decoded in an extremely tight loop. + // + // TODO: since words can be at most (1 << MAX_CODESIZE) = 4096 bytes long we could avoid + // that intermediate buffer at the expense of not always filling the output buffer + // completely. Alternatively we might follow its chain of precursor states twice. This may + // be even cheaper if we store more than one byte per link so it really should be + // evaluated. + // TODO: if the caller was required to provide the previous last word we could also avoid + // the buffer for cases where we need it to restore the next code! This could be built + // backwards compatible by only doing it after an opt-in call that enables the behaviour. + + // Record initial lengths for the result that is returned. + let o_in = inp.len(); + let o_out = out.len(); + + // The code_link is the previously decoded symbol. + // It's used to link the new code back to its predecessor. + let mut code_link = None; + // The status, which is written to on an invalid code. + let mut status = Ok(LzwStatus::Ok); + + match self.last.take() { + // No last state? This is the first code after a reset? + None => { + match self.next_symbol(&mut inp) { + // Plainly invalid code. + Some(code) if code > self.next_code => status = Err(LzwError::InvalidCode), + // next_code would require an actual predecessor. + Some(code) if code == self.next_code => status = Err(LzwError::InvalidCode), + // No more symbols available and nothing decoded yet. + // Assume that we didn't make progress, this may get reset to Done if we read + // some bytes from the input. + None => status = Ok(LzwStatus::NoProgress), + // Handle a valid code. + Some(init_code) => { + if init_code == self.clear_code { + self.init_tables(); + } else if init_code == self.end_code { + self.has_ended = true; + status = Ok(LzwStatus::Done); + } else if self.table.is_empty() { + if self.implicit_reset { + self.init_tables(); + + self.buffer.fill_reconstruct(&self.table, init_code); + let link = self.table.at(init_code).clone(); + code_link = Some((init_code, link)); + } else { + // We require an explicit reset. + status = Err(LzwError::InvalidCode); + } + } else { + // Reconstruct the first code in the buffer. + self.buffer.fill_reconstruct(&self.table, init_code); + let link = self.table.at(init_code).clone(); + code_link = Some((init_code, link)); + } + } + } + } + // Move the tracking state to the stack. + Some(tup) => code_link = Some(tup), + }; + + // Track an empty `burst` (see below) means we made no progress. + let mut burst_required_for_progress = false; + // Restore the previous state, if any. + if let Some((code, link)) = code_link.take() { + code_link = Some((code, link)); + let remain = self.buffer.buffer(); + // Check if we can fully finish the buffer. + if remain.len() > out.len() { + if out.is_empty() { + status = Ok(LzwStatus::NoProgress); + } else { + out.copy_from_slice(&remain[..out.len()]); + self.buffer.consume(out.len()); + out = &mut []; + } + } else if remain.is_empty() { + status = Ok(LzwStatus::NoProgress); + burst_required_for_progress = true; + } else { + let consumed = remain.len(); + out[..consumed].copy_from_slice(remain); + self.buffer.consume(consumed); + out = &mut out[consumed..]; + burst_required_for_progress = false; + } + } + + // The tracking state for a burst. + // These are actually initialized later but compiler wasn't smart enough to fully optimize + // out the init code so that appears outside th loop. + // TODO: maybe we can make it part of the state but it's dubious if that really gives a + // benefit over stack usage? Also the slices stored here would need some treatment as we + // can't infect the main struct with a lifetime. + let mut burst = [0; 6]; + let mut bytes = [0u16; 6]; + let mut target: [&mut [u8]; 6] = Default::default(); + // A special reference to out slice which holds the last decoded symbol. + let mut last_decoded: Option<&[u8]> = None; + + while let Some((mut code, mut link)) = code_link.take() { + if out.is_empty() && !self.buffer.buffer().is_empty() { + code_link = Some((code, link)); + break; + } + + let mut burst_size = 0; + // Ensure the code buffer is full, we're about to request some codes. + // Note that this also ensures at least one code is in the buffer if any input is left. + self.refill_bits(&mut inp); + // A burst is a sequence of decodes that are completely independent of each other. This + // is the case if neither is an end code, a clear code, or a next code, i.e. we have + // all of them in the decoding table and thus known their depths, and additionally if + // we can decode them directly into the output buffer. + for b in &mut burst { + // TODO: does it actually make a perf difference to avoid reading new bits here? + *b = match self.get_bits() { + None => break, + Some(code) => code, + }; + + // We can commit the previous burst code, and will take a slice from the output + // buffer. This also avoids the bounds check in the tight loop later. + if burst_size > 0 { + let len = bytes[burst_size - 1]; + let (into, tail) = out.split_at_mut(usize::from(len)); + target[burst_size - 1] = into; + out = tail; + } + + // Check that we don't overflow the code size with all codes we burst decode. + if let Some(potential_code) = self.next_code.checked_add(burst_size as u16) { + burst_size += 1; + if potential_code == self.code_buffer.max_code() - Code::from(self.is_tiff) { + break; + } + } else { + // next_code overflowed + break; + } + + // A burst code can't be special. + if *b == self.clear_code || *b == self.end_code || *b >= self.next_code { + break; + } + + // Read the code length and check that we can decode directly into the out slice. + let len = self.table.depths[usize::from(*b)]; + if out.len() < usize::from(len) { + break; + } + + bytes[burst_size - 1] = len; + } + + // No code left, and no more bytes to fill the buffer. + if burst_size == 0 { + if burst_required_for_progress { + status = Ok(LzwStatus::NoProgress); + } + code_link = Some((code, link)); + break; + } + + burst_required_for_progress = false; + // Note that the very last code in the burst buffer doesn't actually belong to the + // burst itself. TODO: sometimes it could, we just don't differentiate between the + // breaks and a loop end condition above. That may be a speed advantage? + let (&new_code, burst) = burst[..burst_size].split_last().unwrap(); + + // The very tight loop for restoring the actual burst. + for (&burst, target) in burst.iter().zip(&mut target[..burst_size - 1]) { + let cha = self.table.reconstruct(burst, target); + // TODO: this pushes into a Vec, maybe we can make this cleaner. + // Theoretically this has a branch and llvm tends to be flaky with code layout for + // the case of requiring an allocation (which can't occur in practice). + let new_link = self.table.derive(&link, cha, code); + self.next_code += 1; + code = burst; + link = new_link; + } + + // Update the slice holding the last decoded word. + if let Some(new_last) = target[..burst_size - 1].last_mut() { + let slice = core::mem::replace(new_last, &mut []); + last_decoded = Some(&*slice); + } + + // Now handle the special codes. + if new_code == self.clear_code { + self.reset_tables(); + last_decoded = None; + continue; + } + + if new_code == self.end_code { + self.has_ended = true; + status = Ok(LzwStatus::Done); + last_decoded = None; + break; + } + + if new_code > self.next_code { + status = Err(LzwError::InvalidCode); + last_decoded = None; + break; + } + + let required_len = if new_code == self.next_code { + self.table.depths[usize::from(code)] + 1 + } else { + self.table.depths[usize::from(new_code)] + }; + + let cha; + let is_in_buffer; + // Check if we will need to store our current state into the buffer. + if usize::from(required_len) > out.len() { + is_in_buffer = true; + if new_code == self.next_code { + // last_decoded will be Some if we have restored any code into the out slice. + // Otherwise it will still be present in the buffer. + if let Some(last) = last_decoded.take() { + self.buffer.bytes[..last.len()].copy_from_slice(last); + self.buffer.write_mark = last.len(); + self.buffer.read_mark = last.len(); + } + + cha = self.buffer.fill_cscsc(); + } else { + // Restore the decoded word into the buffer. + last_decoded = None; + cha = self.buffer.fill_reconstruct(&self.table, new_code); + } + } else { + is_in_buffer = false; + let (target, tail) = out.split_at_mut(usize::from(required_len)); + out = tail; + + if new_code == self.next_code { + // Reconstruct high. + let source = match last_decoded.take() { + Some(last) => last, + None => &self.buffer.bytes[..self.buffer.write_mark], + }; + cha = source[0]; + target[..source.len()].copy_from_slice(source); + target[source.len()..][0] = source[0]; + } else { + cha = self.table.reconstruct(new_code, target); + } + + // A new decoded word. + last_decoded = Some(target); + } + + let new_link; + // Each newly read code creates one new code/link based on the preceding code if we + // have enough space to put it there. + if !self.table.is_full() { + let link = self.table.derive(&link, cha, code); + + if self.next_code == self.code_buffer.max_code() - Code::from(self.is_tiff) + && self.code_buffer.code_size() < MAX_CODESIZE + { + self.bump_code_size(); + } + + self.next_code += 1; + new_link = link; + } else { + // It's actually quite likely that the next code will be a reset but just in case. + // FIXME: this path hasn't been tested very well. + new_link = link.clone(); + } + + // store the information on the decoded word. + code_link = Some((new_code, new_link)); + + // Can't make any more progress with decoding. + if is_in_buffer { + break; + } + } + + // We need to store the last word into the buffer in case the first code in the next + // iteration is the next_code. + if let Some(tail) = last_decoded { + self.buffer.bytes[..tail.len()].copy_from_slice(tail); + self.buffer.write_mark = tail.len(); + self.buffer.read_mark = tail.len(); + } + + // Ensure we don't indicate that no progress was made if we read some bytes from the input + // (which is progress). + if o_in > inp.len() { + if let Ok(LzwStatus::NoProgress) = status { + status = Ok(LzwStatus::Ok); + } + } + + // Store the code/link state. + self.last = code_link; + + BufferResult { + consumed_in: o_in.wrapping_sub(inp.len()), + consumed_out: o_out.wrapping_sub(out.len()), + status, + } + } +} + +impl<C: CodeBuffer> DecodeState<C> { + fn next_symbol(&mut self, inp: &mut &[u8]) -> Option<Code> { + self.code_buffer.next_symbol(inp) + } + + fn bump_code_size(&mut self) { + self.code_buffer.bump_code_size() + } + + fn refill_bits(&mut self, inp: &mut &[u8]) { + self.code_buffer.refill_bits(inp) + } + + fn get_bits(&mut self) -> Option<Code> { + self.code_buffer.get_bits() + } +} + +impl CodeBuffer for MsbBuffer { + fn new(min_size: u8) -> Self { + MsbBuffer { + code_size: min_size + 1, + code_mask: (1u16 << (min_size + 1)) - 1, + bit_buffer: 0, + bits: 0, + } + } + + fn reset(&mut self, min_size: u8) { + self.code_size = min_size + 1; + self.code_mask = (1 << self.code_size) - 1; + } + + fn next_symbol(&mut self, inp: &mut &[u8]) -> Option<Code> { + if self.bits < self.code_size { + self.refill_bits(inp); + } + + self.get_bits() + } + + fn bump_code_size(&mut self) { + self.code_size += 1; + self.code_mask = (self.code_mask << 1) | 1; + } + + fn refill_bits(&mut self, inp: &mut &[u8]) { + let wish_count = (64 - self.bits) / 8; + let mut buffer = [0u8; 8]; + let new_bits = match inp.get(..usize::from(wish_count)) { + Some(bytes) => { + buffer[..usize::from(wish_count)].copy_from_slice(bytes); + *inp = &inp[usize::from(wish_count)..]; + wish_count * 8 + } + None => { + let new_bits = inp.len() * 8; + buffer[..inp.len()].copy_from_slice(inp); + *inp = &[]; + new_bits as u8 + } + }; + self.bit_buffer |= u64::from_be_bytes(buffer) >> self.bits; + self.bits += new_bits; + } + + fn get_bits(&mut self) -> Option<Code> { + if self.bits < self.code_size { + return None; + } + + let mask = u64::from(self.code_mask); + let rotbuf = self.bit_buffer.rotate_left(self.code_size.into()); + self.bit_buffer = rotbuf & !mask; + self.bits -= self.code_size; + Some((rotbuf & mask) as u16) + } + + fn max_code(&self) -> Code { + self.code_mask + } + + fn code_size(&self) -> u8 { + self.code_size + } +} + +impl CodeBuffer for LsbBuffer { + fn new(min_size: u8) -> Self { + LsbBuffer { + code_size: min_size + 1, + code_mask: (1u16 << (min_size + 1)) - 1, + bit_buffer: 0, + bits: 0, + } + } + + fn reset(&mut self, min_size: u8) { + self.code_size = min_size + 1; + self.code_mask = (1 << self.code_size) - 1; + } + + fn next_symbol(&mut self, inp: &mut &[u8]) -> Option<Code> { + if self.bits < self.code_size { + self.refill_bits(inp); + } + + self.get_bits() + } + + fn bump_code_size(&mut self) { + self.code_size += 1; + self.code_mask = (self.code_mask << 1) | 1; + } + + fn refill_bits(&mut self, inp: &mut &[u8]) { + let wish_count = (64 - self.bits) / 8; + let mut buffer = [0u8; 8]; + let new_bits = match inp.get(..usize::from(wish_count)) { + Some(bytes) => { + buffer[..usize::from(wish_count)].copy_from_slice(bytes); + *inp = &inp[usize::from(wish_count)..]; + wish_count * 8 + } + None => { + let new_bits = inp.len() * 8; + buffer[..inp.len()].copy_from_slice(inp); + *inp = &[]; + new_bits as u8 + } + }; + self.bit_buffer |= u64::from_be_bytes(buffer).swap_bytes() << self.bits; + self.bits += new_bits; + } + + fn get_bits(&mut self) -> Option<Code> { + if self.bits < self.code_size { + return None; + } + + let mask = u64::from(self.code_mask); + let code = self.bit_buffer & mask; + self.bit_buffer >>= self.code_size; + self.bits -= self.code_size; + Some(code as u16) + } + + fn max_code(&self) -> Code { + self.code_mask + } + + fn code_size(&self) -> u8 { + self.code_size + } +} + +impl Buffer { + fn new() -> Self { + Buffer { + bytes: vec![0; MAX_ENTRIES].into_boxed_slice(), + read_mark: 0, + write_mark: 0, + } + } + + /// When encoding a sequence `cScSc` where `c` is any character and `S` is any string + /// this results in two codes `AB`, `A` encoding `cS` and `B` encoding `cSc`. Supposing + /// the buffer is already filled with the reconstruction of `A`, we can easily fill it + /// with the reconstruction of `B`. + fn fill_cscsc(&mut self) -> u8 { + self.bytes[self.write_mark] = self.bytes[0]; + self.write_mark += 1; + self.read_mark = 0; + self.bytes[0] + } + + // Fill the buffer by decoding from the table + fn fill_reconstruct(&mut self, table: &Table, code: Code) -> u8 { + self.write_mark = 0; + self.read_mark = 0; + let depth = table.depths[usize::from(code)]; + let mut memory = core::mem::replace(&mut self.bytes, Box::default()); + + let out = &mut memory[..usize::from(depth)]; + let last = table.reconstruct(code, out); + + self.bytes = memory; + self.write_mark = usize::from(depth); + last + } + + fn buffer(&self) -> &[u8] { + &self.bytes[self.read_mark..self.write_mark] + } + + fn consume(&mut self, amt: usize) { + self.read_mark += amt; + } +} + +impl Table { + fn new() -> Self { + Table { + inner: Vec::with_capacity(MAX_ENTRIES), + depths: Vec::with_capacity(MAX_ENTRIES), + } + } + + fn clear(&mut self, min_size: u8) { + let static_count = usize::from(1u16 << u16::from(min_size)) + 2; + self.inner.truncate(static_count); + self.depths.truncate(static_count); + } + + fn init(&mut self, min_size: u8) { + self.inner.clear(); + self.depths.clear(); + for i in 0..(1u16 << u16::from(min_size)) { + self.inner.push(Link::base(i as u8)); + self.depths.push(1); + } + // Clear code. + self.inner.push(Link::base(0)); + self.depths.push(0); + // End code. + self.inner.push(Link::base(0)); + self.depths.push(0); + } + + fn at(&self, code: Code) -> &Link { + &self.inner[usize::from(code)] + } + + fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + fn is_full(&self) -> bool { + self.inner.len() >= MAX_ENTRIES + } + + fn derive(&mut self, from: &Link, byte: u8, prev: Code) -> Link { + let link = from.derive(byte, prev); + let depth = self.depths[usize::from(prev)] + 1; + self.inner.push(link.clone()); + self.depths.push(depth); + link + } + + fn reconstruct(&self, code: Code, out: &mut [u8]) -> u8 { + let mut code_iter = code; + let table = &self.inner[..=usize::from(code)]; + let len = code_iter; + for ch in out.iter_mut().rev() { + //(code, cha) = self.table[k as usize]; + // Note: This could possibly be replaced with an unchecked array access if + // - value is asserted to be < self.next_code() in push + // - min_size is asserted to be < MAX_CODESIZE + let entry = &table[usize::from(code_iter)]; + code_iter = core::cmp::min(len, entry.prev); + *ch = entry.byte; + } + out[0] + } +} + +impl Link { + fn base(byte: u8) -> Self { + Link { prev: 0, byte } + } + + // TODO: this has self type to make it clear we might depend on the old in a future + // optimization. However, that has no practical purpose right now. + fn derive(&self, byte: u8, prev: Code) -> Self { + Link { prev, byte } + } +} + +#[cfg(test)] +mod tests { + use crate::alloc::vec::Vec; + #[cfg(feature = "std")] + use crate::StreamBuf; + use crate::{decode::Decoder, BitOrder}; + + #[test] + fn invalid_code_size_low() { + let _ = Decoder::new(BitOrder::Msb, 0); + let _ = Decoder::new(BitOrder::Msb, 1); + } + + #[test] + #[should_panic] + fn invalid_code_size_high() { + let _ = Decoder::new(BitOrder::Msb, 14); + } + + fn make_encoded() -> Vec<u8> { + const FILE: &'static [u8] = include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/benches/binary-8-msb.lzw" + )); + return Vec::from(FILE); + } + + #[test] + #[cfg(feature = "std")] + fn into_stream_buffer_no_alloc() { + let encoded = make_encoded(); + let mut decoder = Decoder::new(BitOrder::Msb, 8); + + let mut output = vec![]; + let mut buffer = [0; 512]; + let mut istream = decoder.into_stream(&mut output); + istream.set_buffer(&mut buffer[..]); + istream.decode(&encoded[..]).status.unwrap(); + + match istream.buffer { + Some(StreamBuf::Borrowed(_)) => {} + None => panic!("Decoded without buffer??"), + Some(StreamBuf::Owned(_)) => panic!("Unexpected buffer allocation"), + } + } + + #[test] + #[cfg(feature = "std")] + fn into_stream_buffer_small_alloc() { + struct WriteTap<W: std::io::Write>(W); + const BUF_SIZE: usize = 512; + + impl<W: std::io::Write> std::io::Write for WriteTap<W> { + fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> { + assert!(buf.len() <= BUF_SIZE); + self.0.write(buf) + } + fn flush(&mut self) -> std::io::Result<()> { + self.0.flush() + } + } + + let encoded = make_encoded(); + let mut decoder = Decoder::new(BitOrder::Msb, 8); + + let mut output = vec![]; + let mut istream = decoder.into_stream(WriteTap(&mut output)); + istream.set_buffer_size(512); + istream.decode(&encoded[..]).status.unwrap(); + + match istream.buffer { + Some(StreamBuf::Owned(vec)) => assert!(vec.len() <= BUF_SIZE), + Some(StreamBuf::Borrowed(_)) => panic!("Unexpected borrowed buffer, where from?"), + None => panic!("Decoded without buffer??"), + } + } + + #[test] + #[cfg(feature = "std")] + fn reset() { + let encoded = make_encoded(); + let mut decoder = Decoder::new(BitOrder::Msb, 8); + let mut reference = None; + + for _ in 0..2 { + let mut output = vec![]; + let mut buffer = [0; 512]; + let mut istream = decoder.into_stream(&mut output); + istream.set_buffer(&mut buffer[..]); + istream.decode_all(&encoded[..]).status.unwrap(); + + decoder.reset(); + if let Some(reference) = &reference { + assert_eq!(output, *reference); + } else { + reference = Some(output); + } + } + } +} diff --git a/vendor/weezl/src/decode_into_async.rs b/vendor/weezl/src/decode_into_async.rs new file mode 100644 index 0000000..e39a26f --- /dev/null +++ b/vendor/weezl/src/decode_into_async.rs @@ -0,0 +1,143 @@ +use crate::decode::IntoAsync; +use crate::error::LzwStatus; +use crate::error::StreamResult; +use crate::StreamBuf; +use std::io; + +impl<'d, W: futures::io::AsyncWrite + core::marker::Unpin> IntoAsync<'d, W> { + /// Decode data from a reader. + /// + /// This will read data until the stream is empty or an end marker is reached. + pub async fn decode(&mut self, read: impl futures::io::AsyncBufRead) -> StreamResult { + self.decode_part(read, false).await + } + + /// Decode data from a reader, requiring an end marker. + pub async fn decode_all(mut self, read: impl futures::io::AsyncBufRead) -> StreamResult { + self.decode_part(read, true).await + } + + /// Set the size of the intermediate decode buffer. + /// + /// A buffer of this size is allocated to hold one part of the decoded stream when no buffer is + /// available and any decoding method is called. No buffer is allocated if `set_buffer` has + /// been called. The buffer is reused. + /// + /// # Panics + /// This method panics if `size` is `0`. + pub fn set_buffer_size(&mut self, size: usize) { + assert_ne!(size, 0, "Attempted to set empty buffer"); + self.default_size = size; + } + + /// Use a particular buffer as an intermediate decode buffer. + /// + /// Calling this sets or replaces the buffer. When a buffer has been set then it is used + /// instead of dynamically allocating a buffer. Note that the size of the buffer is critical + /// for efficient decoding. Some optimization techniques require the buffer to hold one or more + /// previous decoded words. There is also additional overhead from `write` calls each time the + /// buffer has been filled. + /// + /// # Panics + /// This method panics if the `buffer` is empty. + pub fn set_buffer(&mut self, buffer: &'d mut [u8]) { + assert_ne!(buffer.len(), 0, "Attempted to set empty buffer"); + self.buffer = Some(StreamBuf::Borrowed(buffer)); + } + + async fn decode_part( + &mut self, + read: impl futures::io::AsyncBufRead, + must_finish: bool, + ) -> StreamResult { + use futures::io::AsyncBufReadExt; + use futures::io::AsyncWriteExt; + + let IntoAsync { + decoder, + writer, + buffer, + default_size, + } = self; + + futures::pin_mut!(read); + let mut read: core::pin::Pin<_> = read; + + let mut bytes_read = 0; + let mut bytes_written = 0; + + // Converting to mutable refs to move into the `once` closure. + let read_bytes = &mut bytes_read; + let write_bytes = &mut bytes_written; + + let outbuf: &mut [u8] = + match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } { + StreamBuf::Borrowed(slice) => &mut *slice, + StreamBuf::Owned(vec) => &mut *vec, + }; + assert!(!outbuf.is_empty()); + + let status = loop { + // Try to grab one buffer of input data. + let mut filler = read.as_mut(); + let data = match filler.fill_buf().await { + Ok(buf) => buf, + Err(err) => break Err(err), + }; + + // Decode as much of the buffer as fits. + let result = decoder.decode_bytes(data, &mut outbuf[..]); + // Do the bookkeeping and consume the buffer. + *read_bytes += result.consumed_in; + *write_bytes += result.consumed_out; + read.as_mut().consume(result.consumed_in); + + // Handle an error status in the result. + let status = match result.status { + Ok(ok) => ok, + Err(err) => { + break Err(io::Error::new( + io::ErrorKind::InvalidData, + &*format!("{:?}", err), + )); + } + }; + + // Check if we had any new data at all. + if let LzwStatus::NoProgress = status { + debug_assert_eq!( + result.consumed_out, 0, + "No progress means we have not decoded any data" + ); + // In particular we did not finish decoding. + if must_finish { + break Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "No more data but no end marker detected", + )); + } else { + break Ok(()); + } + } + + // And finish by writing our result. + // TODO: we may lose data on error (also on status error above) which we might want to + // deterministically handle so that we don't need to restart everything from scratch as + // the only recovery strategy. Any changes welcome. + match writer.write_all(&outbuf[..result.consumed_out]).await { + Ok(_) => {} + Err(err) => break Err(err), + } + + if let LzwStatus::Done = status { + break Ok(()); + } + }; + + StreamResult { + bytes_read, + bytes_written, + status, + } + } +} diff --git a/vendor/weezl/src/encode.rs b/vendor/weezl/src/encode.rs new file mode 100644 index 0000000..492b18c --- /dev/null +++ b/vendor/weezl/src/encode.rs @@ -0,0 +1,1126 @@ +//! A module for all encoding needs. +use crate::error::{BufferResult, LzwError, LzwStatus, VectorResult}; +use crate::{BitOrder, Code, StreamBuf, MAX_CODESIZE, MAX_ENTRIES, STREAM_BUF_SIZE}; + +use crate::alloc::{boxed::Box, vec::Vec}; +#[cfg(feature = "std")] +use crate::error::StreamResult; +#[cfg(feature = "std")] +use std::io::{self, BufRead, Write}; + +/// The state for encoding data with an LZW algorithm. +/// +/// The same structure can be utilized with streams as well as your own buffers and driver logic. +/// It may even be possible to mix them if you are sufficiently careful not to lose any written +/// data in the process. +/// +/// This is a sans-IO implementation, meaning that it only contains the state of the encoder and +/// the caller will provide buffers for input and output data when calling the basic +/// [`encode_bytes`] method. Nevertheless, a number of _adapters_ are provided in the `into_*` +/// methods for enoding with a particular style of common IO. +/// +/// * [`encode`] for encoding once without any IO-loop. +/// * [`into_async`] for encoding with the `futures` traits for asynchronous IO. +/// * [`into_stream`] for encoding with the standard `io` traits. +/// * [`into_vec`] for in-memory encoding. +/// +/// [`encode_bytes`]: #method.encode_bytes +/// [`encode`]: #method.encode +/// [`into_async`]: #method.into_async +/// [`into_stream`]: #method.into_stream +/// [`into_vec`]: #method.into_vec +pub struct Encoder { + /// Internally dispatch via a dynamic trait object. This did not have any significant + /// performance impact as we batch data internally and this pointer does not change after + /// creation! + state: Box<dyn Stateful + Send + 'static>, +} + +/// A encoding stream sink. +/// +/// See [`Encoder::into_stream`] on how to create this type. +/// +/// [`Encoder::into_stream`]: struct.Encoder.html#method.into_stream +#[cfg_attr( + not(feature = "std"), + deprecated = "This type is only useful with the `std` feature." +)] +#[cfg_attr(not(feature = "std"), allow(dead_code))] +pub struct IntoStream<'d, W> { + encoder: &'d mut Encoder, + writer: W, + buffer: Option<StreamBuf<'d>>, + default_size: usize, +} + +/// An async decoding sink. +/// +/// See [`Encoder::into_async`] on how to create this type. +/// +/// [`Encoder::into_async`]: struct.Encoder.html#method.into_async +#[cfg(feature = "async")] +pub struct IntoAsync<'d, W> { + encoder: &'d mut Encoder, + writer: W, + buffer: Option<StreamBuf<'d>>, + default_size: usize, +} + +/// A encoding sink into a vector. +/// +/// See [`Encoder::into_vec`] on how to create this type. +/// +/// [`Encoder::into_vec`]: struct.Encoder.html#method.into_vec +pub struct IntoVec<'d> { + encoder: &'d mut Encoder, + vector: &'d mut Vec<u8>, +} + +trait Stateful { + fn advance(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult; + fn mark_ended(&mut self) -> bool; + /// Reset the state tracking if end code has been written. + fn restart(&mut self); + /// Reset the encoder to the beginning, dropping all buffers etc. + fn reset(&mut self); +} + +struct EncodeState<B: Buffer> { + /// The configured minimal code size. + min_size: u8, + /// The current encoding symbol tree. + tree: Tree, + /// If we have pushed the end code. + has_ended: bool, + /// If tiff then bumps are a single code sooner. + is_tiff: bool, + /// The code corresponding to the currently read characters. + current_code: Code, + /// The clear code for resetting the dictionary. + clear_code: Code, + /// The bit buffer for encoding. + buffer: B, +} + +struct MsbBuffer { + /// The current code length. + code_size: u8, + /// The buffer bits. + buffer: u64, + /// The number of valid buffer bits. + bits_in_buffer: u8, +} + +struct LsbBuffer { + /// The current code length. + code_size: u8, + /// The buffer bits. + buffer: u64, + /// The number of valid buffer bits. + bits_in_buffer: u8, +} + +trait Buffer { + fn new(size: u8) -> Self; + /// Reset the code size in the buffer. + fn reset(&mut self, min_size: u8); + /// Apply effects of a Clear Code. + fn clear(&mut self, min_size: u8); + /// Insert a code into the buffer. + fn buffer_code(&mut self, code: Code); + /// Push bytes if the buffer space is getting small. + fn push_out(&mut self, out: &mut &mut [u8]) -> bool; + /// Flush all full bytes, returning if at least one more byte remains. + fn flush_out(&mut self, out: &mut &mut [u8]) -> bool; + /// Pad the buffer to a full byte. + fn buffer_pad(&mut self); + /// Increase the maximum code size. + fn bump_code_size(&mut self); + /// Return the maximum code with the current code size. + fn max_code(&self) -> Code; + /// Return the current code size in bits. + fn code_size(&self) -> u8; +} + +/// One tree node for at most each code. +/// To avoid using too much memory we keep nodes with few successors in optimized form. This form +/// doesn't offer lookup by indexing but instead does a linear search. +#[derive(Default)] +struct Tree { + simples: Vec<Simple>, + complex: Vec<Full>, + keys: Vec<CompressedKey>, +} + +#[derive(Clone, Copy)] +enum FullKey { + NoSuccessor, + Simple(u16), + Full(u16), +} + +#[derive(Clone, Copy)] +struct CompressedKey(u16); + +const SHORT: usize = 16; + +#[derive(Clone, Copy)] +struct Simple { + codes: [Code; SHORT], + chars: [u8; SHORT], + count: u8, +} + +#[derive(Clone, Copy)] +struct Full { + char_continuation: [Code; 256], +} + +impl Encoder { + /// Create a new encoder with the specified bit order and symbol size. + /// + /// The algorithm for dynamically increasing the code symbol bit width is compatible with the + /// original specification. In particular you will need to specify an `Lsb` bit oder to encode + /// the data portion of a compressed `gif` image. + /// + /// # Panics + /// + /// The `size` needs to be in the interval `2..=12`. + pub fn new(order: BitOrder, size: u8) -> Self { + type Boxed = Box<dyn Stateful + Send + 'static>; + super::assert_encode_size(size); + let state = match order { + BitOrder::Lsb => Box::new(EncodeState::<LsbBuffer>::new(size)) as Boxed, + BitOrder::Msb => Box::new(EncodeState::<MsbBuffer>::new(size)) as Boxed, + }; + + Encoder { state } + } + + /// Create a TIFF compatible encoder with the specified bit order and symbol size. + /// + /// The algorithm for dynamically increasing the code symbol bit width is compatible with the + /// TIFF specification, which is a misinterpretation of the original algorithm for increasing + /// the code size. It switches one symbol sooner. + /// + /// # Panics + /// + /// The `size` needs to be in the interval `2..=12`. + pub fn with_tiff_size_switch(order: BitOrder, size: u8) -> Self { + type Boxed = Box<dyn Stateful + Send + 'static>; + super::assert_encode_size(size); + let state = match order { + BitOrder::Lsb => { + let mut state = Box::new(EncodeState::<LsbBuffer>::new(size)); + state.is_tiff = true; + state as Boxed + } + BitOrder::Msb => { + let mut state = Box::new(EncodeState::<MsbBuffer>::new(size)); + state.is_tiff = true; + state as Boxed + } + }; + + Encoder { state } + } + + /// Encode some bytes from `inp` into `out`. + /// + /// See [`into_stream`] for high-level functions (this interface is only available with the + /// `std` feature) and [`finish`] for marking the input data as complete. + /// + /// When some input byte is invalid, i.e. is not smaller than `1 << size`, then that byte and + /// all following ones will _not_ be consumed and the `status` of the result will signal an + /// error. The result will also indicate that all bytes up to but not including the offending + /// byte have been consumed. You may try again with a fixed byte. + /// + /// [`into_stream`]: #method.into_stream + /// [`finish`]: #method.finish + pub fn encode_bytes(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult { + self.state.advance(inp, out) + } + + /// Encode a single chunk of data. + /// + /// This method will add an end marker to the encoded chunk. + /// + /// This is a convenience wrapper around [`into_vec`]. Use the `into_vec` adapter to customize + /// buffer size, to supply an existing vector, to control whether an end marker is required, or + /// to preserve partial data in the case of a decoding error. + /// + /// [`into_vec`]: #into_vec + /// + /// # Example + /// + /// ``` + /// use weezl::{BitOrder, encode::Encoder}; + /// + /// let data = b"Hello, world"; + /// let encoded = Encoder::new(BitOrder::Msb, 9) + /// .encode(data) + /// .expect("All bytes valid for code size"); + /// ``` + pub fn encode(&mut self, data: &[u8]) -> Result<Vec<u8>, LzwError> { + let mut output = Vec::new(); + self.into_vec(&mut output).encode_all(data).status?; + Ok(output) + } + + /// Construct a encoder into a writer. + #[cfg(feature = "std")] + pub fn into_stream<W: Write>(&mut self, writer: W) -> IntoStream<'_, W> { + IntoStream { + encoder: self, + writer, + buffer: None, + default_size: STREAM_BUF_SIZE, + } + } + + /// Construct a encoder into an async writer. + #[cfg(feature = "async")] + pub fn into_async<W: futures::io::AsyncWrite>(&mut self, writer: W) -> IntoAsync<'_, W> { + IntoAsync { + encoder: self, + writer, + buffer: None, + default_size: STREAM_BUF_SIZE, + } + } + + /// Construct an encoder into a vector. + /// + /// All encoded data is appended and the vector is __not__ cleared. + /// + /// Compared to `into_stream` this interface allows a high-level access to encoding without + /// requires the `std`-feature. Also, it can make full use of the extra buffer control that the + /// special target exposes. + pub fn into_vec<'lt>(&'lt mut self, vec: &'lt mut Vec<u8>) -> IntoVec<'lt> { + IntoVec { + encoder: self, + vector: vec, + } + } + + /// Mark the encoding as in the process of finishing. + /// + /// The next following call to `encode_bytes` which is able to consume the complete input will + /// also try to emit an end code. It's not recommended, but also not unsound, to use different + /// byte slices in different calls from this point forward and thus to 'delay' the actual end + /// of the data stream. The behaviour after the end marker has been written is unspecified but + /// sound. + pub fn finish(&mut self) { + self.state.mark_ended(); + } + + /// Undo marking this data stream as ending. + /// FIXME: clarify how this interacts with padding introduced after end code. + #[allow(dead_code)] + pub(crate) fn restart(&mut self) { + self.state.restart() + } + + /// Reset all internal state. + /// + /// This produce an encoder as if just constructed with `new` but taking slightly less work. In + /// particular it will not deallocate any internal allocations. It will also avoid some + /// duplicate setup work. + pub fn reset(&mut self) { + self.state.reset() + } +} + +#[cfg(feature = "std")] +impl<'d, W: Write> IntoStream<'d, W> { + /// Encode data from a reader. + /// + /// This will drain the supplied reader. It will not encode an end marker after all data has + /// been processed. + pub fn encode(&mut self, read: impl BufRead) -> StreamResult { + self.encode_part(read, false) + } + + /// Encode data from a reader and an end marker. + pub fn encode_all(mut self, read: impl BufRead) -> StreamResult { + self.encode_part(read, true) + } + + /// Set the size of the intermediate encode buffer. + /// + /// A buffer of this size is allocated to hold one part of the encoded stream when no buffer is + /// available and any encoding method is called. No buffer is allocated if `set_buffer` has + /// been called. The buffer is reused. + /// + /// # Panics + /// This method panics if `size` is `0`. + pub fn set_buffer_size(&mut self, size: usize) { + assert_ne!(size, 0, "Attempted to set empty buffer"); + self.default_size = size; + } + + /// Use a particular buffer as an intermediate encode buffer. + /// + /// Calling this sets or replaces the buffer. When a buffer has been set then it is used + /// instead of a dynamically allocating a buffer. Note that the size of the buffer is relevant + /// for efficient encoding as there is additional overhead from `write` calls each time the + /// buffer has been filled. + /// + /// # Panics + /// This method panics if the `buffer` is empty. + pub fn set_buffer(&mut self, buffer: &'d mut [u8]) { + assert_ne!(buffer.len(), 0, "Attempted to set empty buffer"); + self.buffer = Some(StreamBuf::Borrowed(buffer)); + } + + fn encode_part(&mut self, mut read: impl BufRead, finish: bool) -> StreamResult { + let IntoStream { + encoder, + writer, + buffer, + default_size, + } = self; + enum Progress { + Ok, + Done, + } + + let mut bytes_read = 0; + let mut bytes_written = 0; + + let read_bytes = &mut bytes_read; + let write_bytes = &mut bytes_written; + + let outbuf: &mut [u8] = + match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } { + StreamBuf::Borrowed(slice) => &mut *slice, + StreamBuf::Owned(vec) => &mut *vec, + }; + assert!(!outbuf.is_empty()); + + let once = move || { + let data = read.fill_buf()?; + + if data.is_empty() { + if finish { + encoder.finish(); + } else { + return Ok(Progress::Done); + } + } + + let result = encoder.encode_bytes(data, &mut outbuf[..]); + *read_bytes += result.consumed_in; + *write_bytes += result.consumed_out; + read.consume(result.consumed_in); + + let done = result.status.map_err(|err| { + io::Error::new(io::ErrorKind::InvalidData, &*format!("{:?}", err)) + })?; + + if let LzwStatus::Done = done { + writer.write_all(&outbuf[..result.consumed_out])?; + return Ok(Progress::Done); + } + + if let LzwStatus::NoProgress = done { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "No more data but no end marker detected", + )); + } + + writer.write_all(&outbuf[..result.consumed_out])?; + Ok(Progress::Ok) + }; + + let status = core::iter::repeat_with(once) + // scan+fuse can be replaced with map_while + .scan((), |(), result| match result { + Ok(Progress::Ok) => Some(Ok(())), + Err(err) => Some(Err(err)), + Ok(Progress::Done) => None, + }) + .fuse() + .collect(); + + StreamResult { + bytes_read, + bytes_written, + status, + } + } +} + +impl IntoVec<'_> { + /// Encode data from a slice. + pub fn encode(&mut self, read: &[u8]) -> VectorResult { + self.encode_part(read, false) + } + + /// Decode data from a reader, adding an end marker. + pub fn encode_all(mut self, read: &[u8]) -> VectorResult { + self.encode_part(read, true) + } + + fn grab_buffer(&mut self) -> (&mut [u8], &mut Encoder) { + const CHUNK_SIZE: usize = 1 << 12; + let decoder = &mut self.encoder; + let length = self.vector.len(); + + // Use the vector to do overflow checks and w/e. + self.vector.reserve(CHUNK_SIZE); + // FIXME: encoding into uninit buffer? + self.vector.resize(length + CHUNK_SIZE, 0u8); + + (&mut self.vector[length..], decoder) + } + + fn encode_part(&mut self, part: &[u8], finish: bool) -> VectorResult { + let mut result = VectorResult { + consumed_in: 0, + consumed_out: 0, + status: Ok(LzwStatus::Ok), + }; + + enum Progress { + Ok, + Done, + } + + // Converting to mutable refs to move into the `once` closure. + let read_bytes = &mut result.consumed_in; + let write_bytes = &mut result.consumed_out; + let mut data = part; + + // A 64 MB buffer is quite large but should get alloc_zeroed. + // Note that the decoded size can be up to quadratic in code block. + let once = move || { + // Grab a new output buffer. + let (outbuf, encoder) = self.grab_buffer(); + + if finish { + encoder.finish(); + } + + // Decode as much of the buffer as fits. + let result = encoder.encode_bytes(data, &mut outbuf[..]); + // Do the bookkeeping and consume the buffer. + *read_bytes += result.consumed_in; + *write_bytes += result.consumed_out; + data = &data[result.consumed_in..]; + + let unfilled = outbuf.len() - result.consumed_out; + let filled = self.vector.len() - unfilled; + self.vector.truncate(filled); + + // Handle the status in the result. + let done = result.status?; + if let LzwStatus::Done = done { + Ok(Progress::Done) + } else { + Ok(Progress::Ok) + } + }; + + // Decode chunks of input data until we're done. + let status: Result<(), _> = core::iter::repeat_with(once) + // scan+fuse can be replaced with map_while + .scan((), |(), result| match result { + Ok(Progress::Ok) => Some(Ok(())), + Err(err) => Some(Err(err)), + Ok(Progress::Done) => None, + }) + .fuse() + .collect(); + + if let Err(err) = status { + result.status = Err(err); + } + + result + } +} + +// This is implemented in a separate file, so that 1.34.2 does not parse it. Otherwise, it would +// trip over the usage of await, which is a reserved keyword in that edition/version. It only +// contains an impl block. +#[cfg(feature = "async")] +#[path = "encode_into_async.rs"] +mod impl_encode_into_async; + +impl<B: Buffer> EncodeState<B> { + fn new(min_size: u8) -> Self { + let clear_code = 1 << min_size; + let mut tree = Tree::default(); + tree.init(min_size); + let mut state = EncodeState { + min_size, + tree, + has_ended: false, + is_tiff: false, + current_code: clear_code, + clear_code, + buffer: B::new(min_size), + }; + state.buffer_code(clear_code); + state + } +} + +impl<B: Buffer> Stateful for EncodeState<B> { + fn advance(&mut self, mut inp: &[u8], mut out: &mut [u8]) -> BufferResult { + let c_in = inp.len(); + let c_out = out.len(); + let mut status = Ok(LzwStatus::Ok); + + 'encoding: loop { + if self.push_out(&mut out) { + break; + } + + if inp.is_empty() && self.has_ended { + let end = self.end_code(); + if self.current_code != end { + if self.current_code != self.clear_code { + self.buffer_code(self.current_code); + + // When reading this code, the decoder will add an extra entry to its table + // before reading th end code. Thusly, it may increase its code size based + // on this additional entry. + if self.tree.keys.len() + usize::from(self.is_tiff) + > usize::from(self.buffer.max_code()) + && self.buffer.code_size() < MAX_CODESIZE + { + self.buffer.bump_code_size(); + } + } + self.buffer_code(end); + self.current_code = end; + self.buffer_pad(); + } + + break; + } + + let mut next_code = None; + let mut bytes = inp.iter(); + while let Some(&byte) = bytes.next() { + if self.min_size < 8 && byte >= 1 << self.min_size { + status = Err(LzwError::InvalidCode); + break 'encoding; + } + + inp = bytes.as_slice(); + match self.tree.iterate(self.current_code, byte) { + Ok(code) => self.current_code = code, + Err(_) => { + next_code = Some(self.current_code); + + self.current_code = u16::from(byte); + break; + } + } + } + + match next_code { + // No more bytes, no code produced. + None => break, + Some(code) => { + self.buffer_code(code); + + if self.tree.keys.len() + usize::from(self.is_tiff) + > usize::from(self.buffer.max_code()) + 1 + && self.buffer.code_size() < MAX_CODESIZE + { + self.buffer.bump_code_size(); + } + + if self.tree.keys.len() > MAX_ENTRIES { + self.buffer_code(self.clear_code); + self.tree.reset(self.min_size); + self.buffer.clear(self.min_size); + } + } + } + } + + if inp.is_empty() && self.current_code == self.end_code() { + if !self.flush_out(&mut out) { + status = Ok(LzwStatus::Done); + } + } + + BufferResult { + consumed_in: c_in - inp.len(), + consumed_out: c_out - out.len(), + status, + } + } + + fn mark_ended(&mut self) -> bool { + core::mem::replace(&mut self.has_ended, true) + } + + fn restart(&mut self) { + self.has_ended = false; + } + + fn reset(&mut self) { + self.restart(); + self.current_code = self.clear_code; + self.tree.reset(self.min_size); + self.buffer.reset(self.min_size); + self.buffer_code(self.clear_code); + } +} + +impl<B: Buffer> EncodeState<B> { + fn push_out(&mut self, out: &mut &mut [u8]) -> bool { + self.buffer.push_out(out) + } + + fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { + self.buffer.flush_out(out) + } + + fn end_code(&self) -> Code { + self.clear_code + 1 + } + + fn buffer_pad(&mut self) { + self.buffer.buffer_pad(); + } + + fn buffer_code(&mut self, code: Code) { + self.buffer.buffer_code(code); + } +} + +impl Buffer for MsbBuffer { + fn new(min_size: u8) -> Self { + MsbBuffer { + code_size: min_size + 1, + buffer: 0, + bits_in_buffer: 0, + } + } + + fn reset(&mut self, min_size: u8) { + self.code_size = min_size + 1; + self.buffer = 0; + self.bits_in_buffer = 0; + } + + fn clear(&mut self, min_size: u8) { + self.code_size = min_size + 1; + } + + fn buffer_code(&mut self, code: Code) { + let shift = 64 - self.bits_in_buffer - self.code_size; + self.buffer |= u64::from(code) << shift; + self.bits_in_buffer += self.code_size; + } + + fn push_out(&mut self, out: &mut &mut [u8]) -> bool { + if self.bits_in_buffer + 2 * self.code_size < 64 { + return false; + } + + self.flush_out(out) + } + + fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { + let want = usize::from(self.bits_in_buffer / 8); + let count = want.min((*out).len()); + let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count); + *out = tail; + + for b in bytes { + *b = ((self.buffer & 0xff00_0000_0000_0000) >> 56) as u8; + self.buffer <<= 8; + self.bits_in_buffer -= 8; + } + + count < want + } + + fn buffer_pad(&mut self) { + let to_byte = self.bits_in_buffer.wrapping_neg() & 0x7; + self.bits_in_buffer += to_byte; + } + + fn bump_code_size(&mut self) { + self.code_size += 1; + } + + fn max_code(&self) -> Code { + (1 << self.code_size) - 1 + } + + fn code_size(&self) -> u8 { + self.code_size + } +} + +impl Buffer for LsbBuffer { + fn new(min_size: u8) -> Self { + LsbBuffer { + code_size: min_size + 1, + buffer: 0, + bits_in_buffer: 0, + } + } + + fn reset(&mut self, min_size: u8) { + self.code_size = min_size + 1; + self.buffer = 0; + self.bits_in_buffer = 0; + } + + fn clear(&mut self, min_size: u8) { + self.code_size = min_size + 1; + } + + fn buffer_code(&mut self, code: Code) { + self.buffer |= u64::from(code) << self.bits_in_buffer; + self.bits_in_buffer += self.code_size; + } + + fn push_out(&mut self, out: &mut &mut [u8]) -> bool { + if self.bits_in_buffer + 2 * self.code_size < 64 { + return false; + } + + self.flush_out(out) + } + + fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { + let want = usize::from(self.bits_in_buffer / 8); + let count = want.min((*out).len()); + let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count); + *out = tail; + + for b in bytes { + *b = (self.buffer & 0x0000_0000_0000_00ff) as u8; + self.buffer >>= 8; + self.bits_in_buffer -= 8; + } + + count < want + } + + fn buffer_pad(&mut self) { + let to_byte = self.bits_in_buffer.wrapping_neg() & 0x7; + self.bits_in_buffer += to_byte; + } + + fn bump_code_size(&mut self) { + self.code_size += 1; + } + + fn max_code(&self) -> Code { + (1 << self.code_size) - 1 + } + + fn code_size(&self) -> u8 { + self.code_size + } +} + +impl Tree { + fn init(&mut self, min_size: u8) { + // We need a way to represent the state of a currently empty buffer. We use the clear code + // for this, thus create one complex mapping that leads to the one-char base codes. + self.keys + .resize((1 << min_size) + 2, FullKey::NoSuccessor.into()); + self.complex.push(Full { + char_continuation: [0; 256], + }); + let map_of_begin = self.complex.last_mut().unwrap(); + for ch in 0u16..256 { + map_of_begin.char_continuation[usize::from(ch)] = ch; + } + self.keys[1 << min_size] = FullKey::Full(0).into(); + } + + fn reset(&mut self, min_size: u8) { + self.simples.clear(); + self.keys.truncate((1 << min_size) + 2); + // Keep entry for clear code. + self.complex.truncate(1); + // The first complex is not changed.. + for k in self.keys[..(1 << min_size) + 2].iter_mut() { + *k = FullKey::NoSuccessor.into(); + } + self.keys[1 << min_size] = FullKey::Full(0).into(); + } + + fn at_key(&self, code: Code, ch: u8) -> Option<Code> { + let key = self.keys[usize::from(code)]; + match FullKey::from(key) { + FullKey::NoSuccessor => None, + FullKey::Simple(idx) => { + let nexts = &self.simples[usize::from(idx)]; + let successors = nexts + .codes + .iter() + .zip(nexts.chars.iter()) + .take(usize::from(nexts.count)); + for (&scode, &sch) in successors { + if sch == ch { + return Some(scode); + } + } + + None + } + FullKey::Full(idx) => { + let full = &self.complex[usize::from(idx)]; + let precode = full.char_continuation[usize::from(ch)]; + if usize::from(precode) < MAX_ENTRIES { + Some(precode) + } else { + None + } + } + } + } + + /// Iterate to the next char. + /// Return Ok when it was already in the tree or creates a new entry for it and returns Err. + fn iterate(&mut self, code: Code, ch: u8) -> Result<Code, Code> { + if let Some(next) = self.at_key(code, ch) { + Ok(next) + } else { + Err(self.append(code, ch)) + } + } + + fn append(&mut self, code: Code, ch: u8) -> Code { + let next: Code = self.keys.len() as u16; + let key = self.keys[usize::from(code)]; + // TODO: with debug assertions, check for non-existence + match FullKey::from(key) { + FullKey::NoSuccessor => { + let new_key = FullKey::Simple(self.simples.len() as u16); + self.simples.push(Simple::default()); + let simples = self.simples.last_mut().unwrap(); + simples.codes[0] = next; + simples.chars[0] = ch; + simples.count = 1; + self.keys[usize::from(code)] = new_key.into(); + } + FullKey::Simple(idx) if usize::from(self.simples[usize::from(idx)].count) < SHORT => { + let nexts = &mut self.simples[usize::from(idx)]; + let nidx = usize::from(nexts.count); + nexts.chars[nidx] = ch; + nexts.codes[nidx] = next; + nexts.count += 1; + } + FullKey::Simple(idx) => { + let new_key = FullKey::Full(self.complex.len() as u16); + let simples = &self.simples[usize::from(idx)]; + self.complex.push(Full { + char_continuation: [Code::max_value(); 256], + }); + let full = self.complex.last_mut().unwrap(); + for (&pch, &pcont) in simples.chars.iter().zip(simples.codes.iter()) { + full.char_continuation[usize::from(pch)] = pcont; + } + self.keys[usize::from(code)] = new_key.into(); + } + FullKey::Full(idx) => { + let full = &mut self.complex[usize::from(idx)]; + full.char_continuation[usize::from(ch)] = next; + } + } + self.keys.push(FullKey::NoSuccessor.into()); + next + } +} + +impl Default for FullKey { + fn default() -> Self { + FullKey::NoSuccessor + } +} + +impl Default for Simple { + fn default() -> Self { + Simple { + codes: [0; SHORT], + chars: [0; SHORT], + count: 0, + } + } +} + +impl From<CompressedKey> for FullKey { + fn from(CompressedKey(key): CompressedKey) -> Self { + match (key >> MAX_CODESIZE) & 0xf { + 0 => FullKey::Full(key & 0xfff), + 1 => FullKey::Simple(key & 0xfff), + _ => FullKey::NoSuccessor, + } + } +} + +impl From<FullKey> for CompressedKey { + fn from(full: FullKey) -> Self { + CompressedKey(match full { + FullKey::NoSuccessor => 0x2000, + FullKey::Simple(code) => 0x1000 | code, + FullKey::Full(code) => code, + }) + } +} + +#[cfg(test)] +mod tests { + use super::{BitOrder, Encoder, LzwError, LzwStatus}; + use crate::alloc::vec::Vec; + use crate::decode::Decoder; + #[cfg(feature = "std")] + use crate::StreamBuf; + + #[test] + fn invalid_input_rejected() { + const BIT_LEN: u8 = 2; + let ref input = [0, 1 << BIT_LEN /* invalid */, 0]; + let ref mut target = [0u8; 128]; + let mut encoder = Encoder::new(BitOrder::Msb, BIT_LEN); + + encoder.finish(); + // We require simulation of normality, that is byte-for-byte compression. + let result = encoder.encode_bytes(input, target); + assert!(if let Err(LzwError::InvalidCode) = result.status { + true + } else { + false + }); + assert_eq!(result.consumed_in, 1); + + let fixed = encoder.encode_bytes(&[1, 0], &mut target[result.consumed_out..]); + assert!(if let Ok(LzwStatus::Done) = fixed.status { + true + } else { + false + }); + assert_eq!(fixed.consumed_in, 2); + + // Okay, now test we actually fixed it. + let ref mut compare = [0u8; 4]; + let mut todo = &target[..result.consumed_out + fixed.consumed_out]; + let mut free = &mut compare[..]; + let mut decoder = Decoder::new(BitOrder::Msb, BIT_LEN); + + // Decode with up to 16 rounds, far too much but inconsequential. + for _ in 0..16 { + if decoder.has_ended() { + break; + } + + let result = decoder.decode_bytes(todo, free); + assert!(result.status.is_ok()); + todo = &todo[result.consumed_in..]; + free = &mut free[result.consumed_out..]; + } + + let remaining = { free }.len(); + let len = compare.len() - remaining; + assert_eq!(todo, &[]); + assert_eq!(compare[..len], [0, 1, 0]); + } + + #[test] + #[should_panic] + fn invalid_code_size_low() { + let _ = Encoder::new(BitOrder::Msb, 1); + } + + #[test] + #[should_panic] + fn invalid_code_size_high() { + let _ = Encoder::new(BitOrder::Msb, 14); + } + + fn make_decoded() -> Vec<u8> { + const FILE: &'static [u8] = + include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/Cargo.lock")); + return Vec::from(FILE); + } + + #[test] + #[cfg(feature = "std")] + fn into_stream_buffer_no_alloc() { + let encoded = make_decoded(); + let mut encoder = Encoder::new(BitOrder::Msb, 8); + + let mut output = vec![]; + let mut buffer = [0; 512]; + let mut istream = encoder.into_stream(&mut output); + istream.set_buffer(&mut buffer[..]); + istream.encode(&encoded[..]).status.unwrap(); + + match istream.buffer { + Some(StreamBuf::Borrowed(_)) => {} + None => panic!("Decoded without buffer??"), + Some(StreamBuf::Owned(_)) => panic!("Unexpected buffer allocation"), + } + } + + #[test] + #[cfg(feature = "std")] + fn into_stream_buffer_small_alloc() { + struct WriteTap<W: std::io::Write>(W); + const BUF_SIZE: usize = 512; + + impl<W: std::io::Write> std::io::Write for WriteTap<W> { + fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> { + assert!(buf.len() <= BUF_SIZE); + self.0.write(buf) + } + fn flush(&mut self) -> std::io::Result<()> { + self.0.flush() + } + } + + let encoded = make_decoded(); + let mut encoder = Encoder::new(BitOrder::Msb, 8); + + let mut output = vec![]; + let mut istream = encoder.into_stream(WriteTap(&mut output)); + istream.set_buffer_size(512); + istream.encode(&encoded[..]).status.unwrap(); + + match istream.buffer { + Some(StreamBuf::Owned(vec)) => assert!(vec.len() <= BUF_SIZE), + Some(StreamBuf::Borrowed(_)) => panic!("Unexpected borrowed buffer, where from?"), + None => panic!("Decoded without buffer??"), + } + } + + #[test] + #[cfg(feature = "std")] + fn reset() { + let encoded = make_decoded(); + let mut encoder = Encoder::new(BitOrder::Msb, 8); + let mut reference = None; + + for _ in 0..2 { + let mut output = vec![]; + let mut buffer = [0; 512]; + let mut istream = encoder.into_stream(&mut output); + istream.set_buffer(&mut buffer[..]); + istream.encode_all(&encoded[..]).status.unwrap(); + + encoder.reset(); + if let Some(reference) = &reference { + assert_eq!(output, *reference); + } else { + reference = Some(output); + } + } + } +} diff --git a/vendor/weezl/src/encode_into_async.rs b/vendor/weezl/src/encode_into_async.rs new file mode 100644 index 0000000..6973540 --- /dev/null +++ b/vendor/weezl/src/encode_into_async.rs @@ -0,0 +1,142 @@ +use crate::encode::IntoAsync; +use crate::error::LzwStatus; +use crate::error::StreamResult; +use crate::StreamBuf; +use std::io; + +impl<'d, W: futures::io::AsyncWrite + core::marker::Unpin> IntoAsync<'d, W> { + /// Encode data from a reader. + /// + /// This will drain the supplied reader. It will not encode an end marker after all data has + /// been processed. + pub async fn encode(&mut self, read: impl futures::io::AsyncBufRead) -> StreamResult { + self.encode_part(read, false).await + } + + /// Encode data from a reader and an end marker. + pub async fn encode_all(mut self, read: impl futures::io::AsyncBufRead) -> StreamResult { + self.encode_part(read, true).await + } + + /// Set the size of the intermediate decode buffer. + /// + /// A buffer of this size is allocated to hold one part of the decoded stream when no buffer is + /// available and any decoding method is called. No buffer is allocated if `set_buffer` has + /// been called. The buffer is reused. + /// + /// # Panics + /// This method panics if `size` is `0`. + pub fn set_buffer_size(&mut self, size: usize) { + assert_ne!(size, 0, "Attempted to set empty buffer"); + self.default_size = size; + } + + /// Use a particular buffer as an intermediate decode buffer. + /// + /// Calling this sets or replaces the buffer. When a buffer has been set then it is used + /// instead of dynamically allocating a buffer. Note that the size of the buffer is critical + /// for efficient decoding. Some optimization techniques require the buffer to hold one or more + /// previous decoded words. There is also additional overhead from `write` calls each time the + /// buffer has been filled. + /// + /// # Panics + /// This method panics if the `buffer` is empty. + pub fn set_buffer(&mut self, buffer: &'d mut [u8]) { + assert_ne!(buffer.len(), 0, "Attempted to set empty buffer"); + self.buffer = Some(StreamBuf::Borrowed(buffer)); + } + + async fn encode_part( + &mut self, + read: impl futures::io::AsyncBufRead, + finish: bool, + ) -> StreamResult { + use futures::io::AsyncBufReadExt; + use futures::io::AsyncWriteExt; + + let IntoAsync { + encoder, + writer, + buffer, + default_size, + } = self; + + futures::pin_mut!(read); + let mut read: core::pin::Pin<_> = read; + + let mut bytes_read = 0; + let mut bytes_written = 0; + + // Converting to mutable refs to move into the `once` closure. + let read_bytes = &mut bytes_read; + let write_bytes = &mut bytes_written; + + let outbuf: &mut [u8] = + match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } { + StreamBuf::Borrowed(slice) => &mut *slice, + StreamBuf::Owned(vec) => &mut *vec, + }; + assert!(!outbuf.is_empty()); + + let status = loop { + // Try to grab one buffer of input data. + let mut filler = read.as_mut(); + let data = match filler.fill_buf().await { + Ok(buf) => buf, + Err(err) => break Err(err), + }; + + if data.is_empty() { + if finish { + encoder.finish(); + } else { + break Ok(()); + } + } + + // Decode as much of the buffer as fits. + let result = encoder.encode_bytes(data, &mut outbuf[..]); + // Do the bookkeeping and consume the buffer. + *read_bytes += result.consumed_in; + *write_bytes += result.consumed_out; + read.as_mut().consume(result.consumed_in); + + // Handle an error status in the result. + let done = match result.status { + Ok(ok) => ok, + Err(err) => { + break Err(io::Error::new( + io::ErrorKind::InvalidData, + &*format!("{:?}", err), + )); + } + }; + + if let LzwStatus::Done = done { + break writer.write_all(&outbuf[..result.consumed_out]).await; + } + + if let LzwStatus::NoProgress = done { + break Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "No more data but no end marker detected", + )); + } + + // And finish by writing our result. + // TODO: we may lose data on error (also on status error above) which we might want to + // deterministically handle so that we don't need to restart everything from scratch as + // the only recovery strategy. Any changes welcome. + match writer.write_all(&outbuf[..result.consumed_out]).await { + Ok(_) => {} + Err(err) => break Err(err), + } + }; + + StreamResult { + bytes_read, + bytes_written, + status, + } + } +} diff --git a/vendor/weezl/src/error.rs b/vendor/weezl/src/error.rs new file mode 100644 index 0000000..38dd95c --- /dev/null +++ b/vendor/weezl/src/error.rs @@ -0,0 +1,72 @@ +/// The result of a coding operation on a pair of buffer. +#[must_use = "Contains a status with potential error information"] +pub struct BufferResult { + /// The number of bytes consumed from the input buffer. + pub consumed_in: usize, + /// The number of bytes written into the output buffer. + pub consumed_out: usize, + /// The status after returning from the write call. + pub status: Result<LzwStatus, LzwError>, +} + +/// The result of a coding operation into a vector. +#[must_use = "Contains a status with potential error information"] +pub struct VectorResult { + /// The number of bytes consumed from the input buffer. + pub consumed_in: usize, + /// The number of bytes written into the output buffer. + pub consumed_out: usize, + /// The status after returning from the write call. + pub status: Result<LzwStatus, LzwError>, +} + +/// The result of coding into an output stream. +#[cfg(feature = "std")] +#[must_use = "Contains a status with potential error information"] +pub struct StreamResult { + /// The total number of bytes consumed from the reader. + pub bytes_read: usize, + /// The total number of bytes written into the writer. + pub bytes_written: usize, + /// The possible error that occurred. + /// + /// Note that when writing into streams it is not in general possible to recover from an error. + pub status: std::io::Result<()>, +} + +/// The status after successful coding of an LZW stream. +#[derive(Debug, Clone, Copy)] +pub enum LzwStatus { + /// Everything went well. + Ok, + /// No bytes were read or written and no internal state advanced. + /// + /// If this is returned but your application can not provide more input data then decoding is + /// definitely stuck for good and it should stop trying and report some error of its own. In + /// other situations this may be used as a signal to refill an internal buffer. + NoProgress, + /// No more data will be produced because an end marker was reached. + Done, +} + +/// The error kind after unsuccessful coding of an LZW stream. +#[derive(Debug, Clone, Copy)] +pub enum LzwError { + /// The input contained an invalid code. + /// + /// For decompression this refers to a code larger than those currently known through the prior + /// decoding stages. For compression this refers to a byte that has no code representation due + /// to being larger than permitted by the `size` parameter given to the Encoder. + InvalidCode, +} + +impl core::fmt::Display for LzwError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + match self { + LzwError::InvalidCode => f.write_str("invalid code in LZW stream"), + } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for LzwError {} diff --git a/vendor/weezl/src/lib.rs b/vendor/weezl/src/lib.rs new file mode 100644 index 0000000..3286eb9 --- /dev/null +++ b/vendor/weezl/src/lib.rs @@ -0,0 +1,146 @@ +//! # LZW decoder and encoder +//! +//! This crates provides an `Encoder` and a `Decoder` in their respective modules. The code words +//! are written from and to bit byte slices (or streams) where it is possible to write either the +//! most or least significant bits first. The maximum possible code size is 12 bits, the smallest +//! available code size is 2 bits. +//! +//! ## Example +//! +//! These two code blocks show the compression and corresponding decompression. Note that you must +//! use the same arguments to `Encoder` and `Decoder`, otherwise the decoding might fail or produce +//! bad results. +//! +#![cfg_attr(feature = "std", doc = "```")] +#![cfg_attr(not(feature = "std"), doc = "```ignore")] +//! use weezl::{BitOrder, encode::Encoder}; +//! +//! let data = b"Hello, world"; +//! let compressed = Encoder::new(BitOrder::Msb, 9) +//! .encode(data) +//! .unwrap(); +//! ``` +//! +#![cfg_attr(feature = "std", doc = "```")] +#![cfg_attr(not(feature = "std"), doc = "```ignore")] +//! use weezl::{BitOrder, decode::Decoder}; +//! # let compressed = b"\x80\x04\x81\x94l\x1b\x06\xf0\xb0 \x1d\xc6\xf1\xc8l\x19 \x10".to_vec(); +//! # let data = b"Hello, world"; +//! +//! let decompressed = Decoder::new(BitOrder::Msb, 9) +//! .decode(&compressed) +//! .unwrap(); +//! assert_eq!(decompressed, data); +//! ``` +//! +//! ## LZW Details +//! +//! The de- and encoder expect the LZW stream to start with a clear code and end with an +//! end code which are defined as follows: +//! +//! * `CLEAR_CODE == 1 << min_code_size` +//! * `END_CODE == CLEAR_CODE + 1` +//! +//! For optimal performance, all buffers and input and output slices should be as large as possible +//! and at least 2048 bytes long. This extends to input streams which should have similarly sized +//! buffers. This library uses Rust's standard allocation interfaces (`Box` and `Vec` to be +//! precise). Since there are no ways to handle allocation errors it is not recommended to operate +//! it on 16-bit targets. +//! +//! ## Allocations and standard library +//! +//! The main algorithm can be used in `no_std` as well, although it requires an allocator. This +//! restriction might be lifted at a later stage. For this you should deactivate the `std` feature. +//! The main interfaces stay intact but the `into_stream` combinator is no available. +#![cfg_attr(not(feature = "std"), no_std)] +#![forbid(unsafe_code)] +#![forbid(missing_docs)] + +#[cfg(all(feature = "alloc", not(feature = "std")))] +extern crate alloc; +#[cfg(all(feature = "alloc", feature = "std"))] +use std as alloc; + +pub(crate) const MAX_CODESIZE: u8 = 12; +pub(crate) const MAX_ENTRIES: usize = 1 << MAX_CODESIZE as usize; + +/// Alias for a LZW code point +pub(crate) type Code = u16; + +/// A default buffer size for encoding/decoding buffer. +/// +/// Note that this is larger than the default size for buffers (usually 4K) since each code word +/// can expand to multiple bytes. Expanding one buffer would yield multiple and require a costly +/// break in the decoding loop. Note that the decoded size can be up to quadratic in code block. +pub(crate) const STREAM_BUF_SIZE: usize = 1 << 24; + +/// The order of bits in bytes. +#[derive(Clone, Copy, Debug)] +pub enum BitOrder { + /// The most significant bit is processed first. + Msb, + /// The least significant bit is processed first. + Lsb, +} + +/// An owned or borrowed buffer for stream operations. +#[cfg(feature = "alloc")] +pub(crate) enum StreamBuf<'d> { + Borrowed(&'d mut [u8]), + Owned(crate::alloc::vec::Vec<u8>), +} + +#[cold] +fn assert_decode_size(size: u8) { + assert!( + size <= MAX_CODESIZE, + "Maximum code size 12 required, got {}", + size + ); +} + +#[cold] +fn assert_encode_size(size: u8) { + assert!(size >= 2, "Minimum code size 2 required, got {}", size); + assert!( + size <= MAX_CODESIZE, + "Maximum code size 12 required, got {}", + size + ); +} + +#[cfg(feature = "alloc")] +pub mod decode; +#[cfg(feature = "alloc")] +pub mod encode; +mod error; + +#[cfg(feature = "std")] +pub use self::error::StreamResult; +pub use self::error::{BufferResult, LzwError, LzwStatus}; + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use crate::decode::Decoder; + use crate::encode::Encoder; + + #[cfg(feature = "std")] + use crate::{decode, encode}; + + #[test] + fn stable_send() { + fn must_be_send<T: Send + 'static>() {} + must_be_send::<Decoder>(); + must_be_send::<Encoder>(); + + #[cfg(feature = "std")] + fn _send_and_lt<'lt, T: Send + 'lt>() {} + + // Check that the inference `W: Send + 'd` => `IntoStream: Send + 'd` works. + #[cfg(feature = "std")] + fn _all_send_writer<'d, W: std::io::Write + Send + 'd>() { + _send_and_lt::<'d, decode::IntoStream<'d, W>>(); + _send_and_lt::<'d, encode::IntoStream<'d, W>>(); + } + } +} |