diff options
Diffstat (limited to 'vendor/weezl/src')
-rw-r--r-- | vendor/weezl/src/decode.rs | 1333 | ||||
-rw-r--r-- | vendor/weezl/src/decode_into_async.rs | 143 | ||||
-rw-r--r-- | vendor/weezl/src/encode.rs | 1126 | ||||
-rw-r--r-- | vendor/weezl/src/encode_into_async.rs | 142 | ||||
-rw-r--r-- | vendor/weezl/src/error.rs | 72 | ||||
-rw-r--r-- | vendor/weezl/src/lib.rs | 146 |
6 files changed, 0 insertions, 2962 deletions
diff --git a/vendor/weezl/src/decode.rs b/vendor/weezl/src/decode.rs deleted file mode 100644 index 641a3a8..0000000 --- a/vendor/weezl/src/decode.rs +++ /dev/null @@ -1,1333 +0,0 @@ -//! A module for all decoding needs. -#[cfg(feature = "std")] -use crate::error::StreamResult; -use crate::error::{BufferResult, LzwError, LzwStatus, VectorResult}; -use crate::{BitOrder, Code, StreamBuf, MAX_CODESIZE, MAX_ENTRIES, STREAM_BUF_SIZE}; - -use crate::alloc::{boxed::Box, vec, vec::Vec}; -#[cfg(feature = "std")] -use std::io::{self, BufRead, Write}; - -/// The state for decoding data with an LZW algorithm. -/// -/// The same structure can be utilized with streams as well as your own buffers and driver logic. -/// It may even be possible to mix them if you are sufficiently careful not to lose or skip any -/// already decode data in the process. -/// -/// This is a sans-IO implementation, meaning that it only contains the state of the decoder and -/// the caller will provide buffers for input and output data when calling the basic -/// [`decode_bytes`] method. Nevertheless, a number of _adapters_ are provided in the `into_*` -/// methods for decoding with a particular style of common IO. -/// -/// * [`decode`] for decoding once without any IO-loop. -/// * [`into_async`] for decoding with the `futures` traits for asynchronous IO. -/// * [`into_stream`] for decoding with the standard `io` traits. -/// * [`into_vec`] for in-memory decoding. -/// -/// [`decode_bytes`]: #method.decode_bytes -/// [`decode`]: #method.decode -/// [`into_async`]: #method.into_async -/// [`into_stream`]: #method.into_stream -/// [`into_vec`]: #method.into_vec -pub struct Decoder { - state: Box<dyn Stateful + Send + 'static>, -} - -/// A decoding stream sink. -/// -/// See [`Decoder::into_stream`] on how to create this type. -/// -/// [`Decoder::into_stream`]: struct.Decoder.html#method.into_stream -#[cfg_attr( - not(feature = "std"), - deprecated = "This type is only useful with the `std` feature." -)] -#[cfg_attr(not(feature = "std"), allow(dead_code))] -pub struct IntoStream<'d, W> { - decoder: &'d mut Decoder, - writer: W, - buffer: Option<StreamBuf<'d>>, - default_size: usize, -} - -/// An async decoding sink. -/// -/// See [`Decoder::into_async`] on how to create this type. -/// -/// [`Decoder::into_async`]: struct.Decoder.html#method.into_async -#[cfg(feature = "async")] -pub struct IntoAsync<'d, W> { - decoder: &'d mut Decoder, - writer: W, - buffer: Option<StreamBuf<'d>>, - default_size: usize, -} - -/// A decoding sink into a vector. -/// -/// See [`Decoder::into_vec`] on how to create this type. -/// -/// [`Decoder::into_vec`]: struct.Decoder.html#method.into_vec -pub struct IntoVec<'d> { - decoder: &'d mut Decoder, - vector: &'d mut Vec<u8>, -} - -trait Stateful { - fn advance(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult; - fn has_ended(&self) -> bool; - /// Ignore an end code and continue decoding (no implied reset). - fn restart(&mut self); - /// Reset the decoder to the beginning, dropping all buffers etc. - fn reset(&mut self); -} - -#[derive(Clone)] -struct Link { - prev: Code, - byte: u8, -} - -#[derive(Default)] -struct MsbBuffer { - /// A buffer of individual bits. The oldest code is kept in the high-order bits. - bit_buffer: u64, - /// A precomputed mask for this code. - code_mask: u16, - /// The current code size. - code_size: u8, - /// The number of bits in the buffer. - bits: u8, -} - -#[derive(Default)] -struct LsbBuffer { - /// A buffer of individual bits. The oldest code is kept in the high-order bits. - bit_buffer: u64, - /// A precomputed mask for this code. - code_mask: u16, - /// The current code size. - code_size: u8, - /// The number of bits in the buffer. - bits: u8, -} - -trait CodeBuffer { - fn new(min_size: u8) -> Self; - fn reset(&mut self, min_size: u8); - fn bump_code_size(&mut self); - /// Retrieve the next symbol, refilling if necessary. - fn next_symbol(&mut self, inp: &mut &[u8]) -> Option<Code>; - /// Refill the internal buffer. - fn refill_bits(&mut self, inp: &mut &[u8]); - /// Get the next buffered code word. - fn get_bits(&mut self) -> Option<Code>; - fn max_code(&self) -> Code; - fn code_size(&self) -> u8; -} - -struct DecodeState<CodeBuffer> { - /// The original minimum code size. - min_size: u8, - /// The table of decoded codes. - table: Table, - /// The buffer of decoded data. - buffer: Buffer, - /// The link which we are still decoding and its original code. - last: Option<(Code, Link)>, - /// The next code entry. - next_code: Code, - /// Code to reset all tables. - clear_code: Code, - /// Code to signal the end of the stream. - end_code: Code, - /// A stored flag if the end code has already appeared. - has_ended: bool, - /// If tiff then bumps are a single code sooner. - is_tiff: bool, - /// Do we allow stream to start without an explicit reset code? - implicit_reset: bool, - /// The buffer for decoded words. - code_buffer: CodeBuffer, -} - -struct Buffer { - bytes: Box<[u8]>, - read_mark: usize, - write_mark: usize, -} - -struct Table { - inner: Vec<Link>, - depths: Vec<u16>, -} - -impl Decoder { - /// Create a new decoder with the specified bit order and symbol size. - /// - /// The algorithm for dynamically increasing the code symbol bit width is compatible with the - /// original specification. In particular you will need to specify an `Lsb` bit oder to decode - /// the data portion of a compressed `gif` image. - /// - /// # Panics - /// - /// The `size` needs to be in the interval `0..=12`. - pub fn new(order: BitOrder, size: u8) -> Self { - type Boxed = Box<dyn Stateful + Send + 'static>; - super::assert_decode_size(size); - let state = match order { - BitOrder::Lsb => Box::new(DecodeState::<LsbBuffer>::new(size)) as Boxed, - BitOrder::Msb => Box::new(DecodeState::<MsbBuffer>::new(size)) as Boxed, - }; - - Decoder { state } - } - - /// Create a TIFF compatible decoder with the specified bit order and symbol size. - /// - /// The algorithm for dynamically increasing the code symbol bit width is compatible with the - /// TIFF specification, which is a misinterpretation of the original algorithm for increasing - /// the code size. It switches one symbol sooner. - /// - /// # Panics - /// - /// The `size` needs to be in the interval `0..=12`. - pub fn with_tiff_size_switch(order: BitOrder, size: u8) -> Self { - type Boxed = Box<dyn Stateful + Send + 'static>; - super::assert_decode_size(size); - let state = match order { - BitOrder::Lsb => { - let mut state = Box::new(DecodeState::<LsbBuffer>::new(size)); - state.is_tiff = true; - state as Boxed - } - BitOrder::Msb => { - let mut state = Box::new(DecodeState::<MsbBuffer>::new(size)); - state.is_tiff = true; - state as Boxed - } - }; - - Decoder { state } - } - - /// Decode some bytes from `inp` and write result to `out`. - /// - /// This will consume a prefix of the input buffer and write decoded output into a prefix of - /// the output buffer. See the respective fields of the return value for the count of consumed - /// and written bytes. For the next call You should have adjusted the inputs accordingly. - /// - /// The call will try to decode and write as many bytes of output as available. It will be - /// much more optimized (and avoid intermediate buffering) if it is allowed to write a large - /// contiguous chunk at once. - /// - /// See [`into_stream`] for high-level functions (that are only available with the `std` - /// feature). - /// - /// [`into_stream`]: #method.into_stream - pub fn decode_bytes(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult { - self.state.advance(inp, out) - } - - /// Decode a single chunk of lzw encoded data. - /// - /// This method requires the data to contain an end marker, and returns an error otherwise. - /// - /// This is a convenience wrapper around [`into_vec`]. Use the `into_vec` adapter to customize - /// buffer size, to supply an existing vector, to control whether an end marker is required, or - /// to preserve partial data in the case of a decoding error. - /// - /// [`into_vec`]: #into_vec - /// - /// # Example - /// - /// ``` - /// use weezl::{BitOrder, decode::Decoder}; - /// - /// // Encoded that was created with an encoder. - /// let data = b"\x80\x04\x81\x94l\x1b\x06\xf0\xb0 \x1d\xc6\xf1\xc8l\x19 \x10"; - /// let decoded = Decoder::new(BitOrder::Msb, 9) - /// .decode(data) - /// .unwrap(); - /// assert_eq!(decoded, b"Hello, world"); - /// ``` - pub fn decode(&mut self, data: &[u8]) -> Result<Vec<u8>, LzwError> { - let mut output = vec![]; - self.into_vec(&mut output).decode_all(data).status?; - Ok(output) - } - - /// Construct a decoder into a writer. - #[cfg(feature = "std")] - pub fn into_stream<W: Write>(&mut self, writer: W) -> IntoStream<'_, W> { - IntoStream { - decoder: self, - writer, - buffer: None, - default_size: STREAM_BUF_SIZE, - } - } - - /// Construct a decoder into an async writer. - #[cfg(feature = "async")] - pub fn into_async<W: futures::io::AsyncWrite>(&mut self, writer: W) -> IntoAsync<'_, W> { - IntoAsync { - decoder: self, - writer, - buffer: None, - default_size: STREAM_BUF_SIZE, - } - } - - /// Construct a decoder into a vector. - /// - /// All decoded data is appended and the vector is __not__ cleared. - /// - /// Compared to `into_stream` this interface allows a high-level access to decoding without - /// requires the `std`-feature. Also, it can make full use of the extra buffer control that the - /// special target exposes. - pub fn into_vec<'lt>(&'lt mut self, vec: &'lt mut Vec<u8>) -> IntoVec<'lt> { - IntoVec { - decoder: self, - vector: vec, - } - } - - /// Check if the decoding has finished. - /// - /// No more output is produced beyond the end code that marked the finish of the stream. The - /// decoder may have read additional bytes, including padding bits beyond the last code word - /// but also excess bytes provided. - pub fn has_ended(&self) -> bool { - self.state.has_ended() - } - - /// Ignore an end code and continue. - /// - /// This will _not_ reset any of the inner code tables and not have the effect of a clear code. - /// It will instead continue as if the end code had not been present. If no end code has - /// occurred then this is a no-op. - /// - /// You can test if an end code has occurred with [`has_ended`](#method.has_ended). - /// FIXME: clarify how this interacts with padding introduced after end code. - #[allow(dead_code)] - pub(crate) fn restart(&mut self) { - self.state.restart(); - } - - /// Reset all internal state. - /// - /// This produce a decoder as if just constructed with `new` but taking slightly less work. In - /// particular it will not deallocate any internal allocations. It will also avoid some - /// duplicate setup work. - pub fn reset(&mut self) { - self.state.reset(); - } -} - -#[cfg(feature = "std")] -impl<'d, W: Write> IntoStream<'d, W> { - /// Decode data from a reader. - /// - /// This will read data until the stream is empty or an end marker is reached. - pub fn decode(&mut self, read: impl BufRead) -> StreamResult { - self.decode_part(read, false) - } - - /// Decode data from a reader, requiring an end marker. - pub fn decode_all(mut self, read: impl BufRead) -> StreamResult { - self.decode_part(read, true) - } - - /// Set the size of the intermediate decode buffer. - /// - /// A buffer of this size is allocated to hold one part of the decoded stream when no buffer is - /// available and any decoding method is called. No buffer is allocated if `set_buffer` has - /// been called. The buffer is reused. - /// - /// # Panics - /// This method panics if `size` is `0`. - pub fn set_buffer_size(&mut self, size: usize) { - assert_ne!(size, 0, "Attempted to set empty buffer"); - self.default_size = size; - } - - /// Use a particular buffer as an intermediate decode buffer. - /// - /// Calling this sets or replaces the buffer. When a buffer has been set then it is used - /// instead of dynamically allocating a buffer. Note that the size of the buffer is critical - /// for efficient decoding. Some optimization techniques require the buffer to hold one or more - /// previous decoded words. There is also additional overhead from `write` calls each time the - /// buffer has been filled. - /// - /// # Panics - /// This method panics if the `buffer` is empty. - pub fn set_buffer(&mut self, buffer: &'d mut [u8]) { - assert_ne!(buffer.len(), 0, "Attempted to set empty buffer"); - self.buffer = Some(StreamBuf::Borrowed(buffer)); - } - - fn decode_part(&mut self, mut read: impl BufRead, must_finish: bool) -> StreamResult { - let IntoStream { - decoder, - writer, - buffer, - default_size, - } = self; - - enum Progress { - Ok, - Done, - } - - let mut bytes_read = 0; - let mut bytes_written = 0; - - // Converting to mutable refs to move into the `once` closure. - let read_bytes = &mut bytes_read; - let write_bytes = &mut bytes_written; - - let outbuf: &mut [u8] = - match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } { - StreamBuf::Borrowed(slice) => &mut *slice, - StreamBuf::Owned(vec) => &mut *vec, - }; - assert!(!outbuf.is_empty()); - - let once = move || { - // Try to grab one buffer of input data. - let data = read.fill_buf()?; - - // Decode as much of the buffer as fits. - let result = decoder.decode_bytes(data, &mut outbuf[..]); - // Do the bookkeeping and consume the buffer. - *read_bytes += result.consumed_in; - *write_bytes += result.consumed_out; - read.consume(result.consumed_in); - - // Handle the status in the result. - let done = result.status.map_err(|err| { - io::Error::new(io::ErrorKind::InvalidData, &*format!("{:?}", err)) - })?; - - // Check if we had any new data at all. - if let LzwStatus::NoProgress = done { - debug_assert_eq!( - result.consumed_out, 0, - "No progress means we have not decoded any data" - ); - // In particular we did not finish decoding. - if must_finish { - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "No more data but no end marker detected", - )); - } else { - return Ok(Progress::Done); - } - } - - // And finish by writing our result. - // TODO: we may lose data on error (also on status error above) which we might want to - // deterministically handle so that we don't need to restart everything from scratch as - // the only recovery strategy. Any changes welcome. - writer.write_all(&outbuf[..result.consumed_out])?; - - Ok(if let LzwStatus::Done = done { - Progress::Done - } else { - Progress::Ok - }) - }; - - // Decode chunks of input data until we're done. - let status = core::iter::repeat_with(once) - // scan+fuse can be replaced with map_while - .scan((), |(), result| match result { - Ok(Progress::Ok) => Some(Ok(())), - Err(err) => Some(Err(err)), - Ok(Progress::Done) => None, - }) - .fuse() - .collect(); - - StreamResult { - bytes_read, - bytes_written, - status, - } - } -} - -impl IntoVec<'_> { - /// Decode data from a slice. - /// - /// This will read data until the slice is empty or an end marker is reached. - pub fn decode(&mut self, read: &[u8]) -> VectorResult { - self.decode_part(read, false) - } - - /// Decode data from a slice, requiring an end marker. - pub fn decode_all(mut self, read: &[u8]) -> VectorResult { - self.decode_part(read, true) - } - - fn grab_buffer(&mut self) -> (&mut [u8], &mut Decoder) { - const CHUNK_SIZE: usize = 1 << 12; - let decoder = &mut self.decoder; - let length = self.vector.len(); - - // Use the vector to do overflow checks and w/e. - self.vector.reserve(CHUNK_SIZE); - // FIXME: decoding into uninit buffer? - self.vector.resize(length + CHUNK_SIZE, 0u8); - - (&mut self.vector[length..], decoder) - } - - fn decode_part(&mut self, part: &[u8], must_finish: bool) -> VectorResult { - let mut result = VectorResult { - consumed_in: 0, - consumed_out: 0, - status: Ok(LzwStatus::Ok), - }; - - enum Progress { - Ok, - Done, - } - - // Converting to mutable refs to move into the `once` closure. - let read_bytes = &mut result.consumed_in; - let write_bytes = &mut result.consumed_out; - let mut data = part; - - // A 64 MB buffer is quite large but should get alloc_zeroed. - // Note that the decoded size can be up to quadratic in code block. - let once = move || { - // Grab a new output buffer. - let (outbuf, decoder) = self.grab_buffer(); - - // Decode as much of the buffer as fits. - let result = decoder.decode_bytes(data, &mut outbuf[..]); - // Do the bookkeeping and consume the buffer. - *read_bytes += result.consumed_in; - *write_bytes += result.consumed_out; - data = &data[result.consumed_in..]; - - let unfilled = outbuf.len() - result.consumed_out; - let filled = self.vector.len() - unfilled; - self.vector.truncate(filled); - - // Handle the status in the result. - match result.status { - Err(err) => Err(err), - Ok(LzwStatus::NoProgress) if must_finish => Err(LzwError::InvalidCode), - Ok(LzwStatus::NoProgress) | Ok(LzwStatus::Done) => Ok(Progress::Done), - Ok(LzwStatus::Ok) => Ok(Progress::Ok), - } - }; - - // Decode chunks of input data until we're done. - let status: Result<(), _> = core::iter::repeat_with(once) - // scan+fuse can be replaced with map_while - .scan((), |(), result| match result { - Ok(Progress::Ok) => Some(Ok(())), - Err(err) => Some(Err(err)), - Ok(Progress::Done) => None, - }) - .fuse() - .collect(); - - if let Err(err) = status { - result.status = Err(err); - } - - result - } -} - -// This is implemented in a separate file, so that 1.34.2 does not parse it. Otherwise, it would -// trip over the usage of await, which is a reserved keyword in that edition/version. It only -// contains an impl block. -#[cfg(feature = "async")] -#[path = "decode_into_async.rs"] -mod impl_decode_into_async; - -impl<C: CodeBuffer> DecodeState<C> { - fn new(min_size: u8) -> Self { - DecodeState { - min_size, - table: Table::new(), - buffer: Buffer::new(), - last: None, - clear_code: 1 << min_size, - end_code: (1 << min_size) + 1, - next_code: (1 << min_size) + 2, - has_ended: false, - is_tiff: false, - implicit_reset: true, - code_buffer: CodeBuffer::new(min_size), - } - } - - fn init_tables(&mut self) { - self.code_buffer.reset(self.min_size); - self.next_code = (1 << self.min_size) + 2; - self.table.init(self.min_size); - } - - fn reset_tables(&mut self) { - self.code_buffer.reset(self.min_size); - self.next_code = (1 << self.min_size) + 2; - self.table.clear(self.min_size); - } -} - -impl<C: CodeBuffer> Stateful for DecodeState<C> { - fn has_ended(&self) -> bool { - self.has_ended - } - - fn restart(&mut self) { - self.has_ended = false; - } - - fn reset(&mut self) { - self.table.init(self.min_size); - self.buffer.read_mark = 0; - self.buffer.write_mark = 0; - self.last = None; - self.restart(); - self.code_buffer = CodeBuffer::new(self.min_size); - } - - fn advance(&mut self, mut inp: &[u8], mut out: &mut [u8]) -> BufferResult { - // Skip everything if there is nothing to do. - if self.has_ended { - return BufferResult { - consumed_in: 0, - consumed_out: 0, - status: Ok(LzwStatus::Done), - }; - } - - // Rough description: - // We will fill the output slice as much as possible until either there is no more symbols - // to decode or an end code has been reached. This requires an internal buffer to hold a - // potential tail of the word corresponding to the last symbol. This tail will then be - // decoded first before continuing with the regular decoding. The same buffer is required - // to persist some symbol state across calls. - // - // We store the words corresponding to code symbols in an index chain, bytewise, where we - // push each decoded symbol. (TODO: wuffs shows some success with 8-byte units). This chain - // is traversed for each symbol when it is decoded and bytes are placed directly into the - // output slice. In the special case (new_code == next_code) we use an existing decoded - // version that is present in either the out bytes of this call or in buffer to copy the - // repeated prefix slice. - // TODO: I played with a 'decoding cache' to remember the position of long symbols and - // avoid traversing the chain, doing a copy of memory instead. It did however not lead to - // a serious improvement. It's just unlikely to both have a long symbol and have that - // repeated twice in the same output buffer. - // - // You will also find the (to my knowledge novel) concept of a _decoding burst_ which - // gained some >~10% speedup in tests. This is motivated by wanting to use out-of-order - // execution as much as possible and for this reason have the least possible stress on - // branch prediction. Our decoding table already gives us a lookahead on symbol lengths but - // only for re-used codes, not novel ones. This lookahead also makes the loop termination - // when restoring each byte of the code word perfectly predictable! So a burst is a chunk - // of code words which are all independent of each other, have known lengths _and_ are - // guaranteed to fit into the out slice without requiring a buffer. One burst can be - // decoded in an extremely tight loop. - // - // TODO: since words can be at most (1 << MAX_CODESIZE) = 4096 bytes long we could avoid - // that intermediate buffer at the expense of not always filling the output buffer - // completely. Alternatively we might follow its chain of precursor states twice. This may - // be even cheaper if we store more than one byte per link so it really should be - // evaluated. - // TODO: if the caller was required to provide the previous last word we could also avoid - // the buffer for cases where we need it to restore the next code! This could be built - // backwards compatible by only doing it after an opt-in call that enables the behaviour. - - // Record initial lengths for the result that is returned. - let o_in = inp.len(); - let o_out = out.len(); - - // The code_link is the previously decoded symbol. - // It's used to link the new code back to its predecessor. - let mut code_link = None; - // The status, which is written to on an invalid code. - let mut status = Ok(LzwStatus::Ok); - - match self.last.take() { - // No last state? This is the first code after a reset? - None => { - match self.next_symbol(&mut inp) { - // Plainly invalid code. - Some(code) if code > self.next_code => status = Err(LzwError::InvalidCode), - // next_code would require an actual predecessor. - Some(code) if code == self.next_code => status = Err(LzwError::InvalidCode), - // No more symbols available and nothing decoded yet. - // Assume that we didn't make progress, this may get reset to Done if we read - // some bytes from the input. - None => status = Ok(LzwStatus::NoProgress), - // Handle a valid code. - Some(init_code) => { - if init_code == self.clear_code { - self.init_tables(); - } else if init_code == self.end_code { - self.has_ended = true; - status = Ok(LzwStatus::Done); - } else if self.table.is_empty() { - if self.implicit_reset { - self.init_tables(); - - self.buffer.fill_reconstruct(&self.table, init_code); - let link = self.table.at(init_code).clone(); - code_link = Some((init_code, link)); - } else { - // We require an explicit reset. - status = Err(LzwError::InvalidCode); - } - } else { - // Reconstruct the first code in the buffer. - self.buffer.fill_reconstruct(&self.table, init_code); - let link = self.table.at(init_code).clone(); - code_link = Some((init_code, link)); - } - } - } - } - // Move the tracking state to the stack. - Some(tup) => code_link = Some(tup), - }; - - // Track an empty `burst` (see below) means we made no progress. - let mut burst_required_for_progress = false; - // Restore the previous state, if any. - if let Some((code, link)) = code_link.take() { - code_link = Some((code, link)); - let remain = self.buffer.buffer(); - // Check if we can fully finish the buffer. - if remain.len() > out.len() { - if out.is_empty() { - status = Ok(LzwStatus::NoProgress); - } else { - out.copy_from_slice(&remain[..out.len()]); - self.buffer.consume(out.len()); - out = &mut []; - } - } else if remain.is_empty() { - status = Ok(LzwStatus::NoProgress); - burst_required_for_progress = true; - } else { - let consumed = remain.len(); - out[..consumed].copy_from_slice(remain); - self.buffer.consume(consumed); - out = &mut out[consumed..]; - burst_required_for_progress = false; - } - } - - // The tracking state for a burst. - // These are actually initialized later but compiler wasn't smart enough to fully optimize - // out the init code so that appears outside th loop. - // TODO: maybe we can make it part of the state but it's dubious if that really gives a - // benefit over stack usage? Also the slices stored here would need some treatment as we - // can't infect the main struct with a lifetime. - let mut burst = [0; 6]; - let mut bytes = [0u16; 6]; - let mut target: [&mut [u8]; 6] = Default::default(); - // A special reference to out slice which holds the last decoded symbol. - let mut last_decoded: Option<&[u8]> = None; - - while let Some((mut code, mut link)) = code_link.take() { - if out.is_empty() && !self.buffer.buffer().is_empty() { - code_link = Some((code, link)); - break; - } - - let mut burst_size = 0; - // Ensure the code buffer is full, we're about to request some codes. - // Note that this also ensures at least one code is in the buffer if any input is left. - self.refill_bits(&mut inp); - // A burst is a sequence of decodes that are completely independent of each other. This - // is the case if neither is an end code, a clear code, or a next code, i.e. we have - // all of them in the decoding table and thus known their depths, and additionally if - // we can decode them directly into the output buffer. - for b in &mut burst { - // TODO: does it actually make a perf difference to avoid reading new bits here? - *b = match self.get_bits() { - None => break, - Some(code) => code, - }; - - // We can commit the previous burst code, and will take a slice from the output - // buffer. This also avoids the bounds check in the tight loop later. - if burst_size > 0 { - let len = bytes[burst_size - 1]; - let (into, tail) = out.split_at_mut(usize::from(len)); - target[burst_size - 1] = into; - out = tail; - } - - // Check that we don't overflow the code size with all codes we burst decode. - if let Some(potential_code) = self.next_code.checked_add(burst_size as u16) { - burst_size += 1; - if potential_code == self.code_buffer.max_code() - Code::from(self.is_tiff) { - break; - } - } else { - // next_code overflowed - break; - } - - // A burst code can't be special. - if *b == self.clear_code || *b == self.end_code || *b >= self.next_code { - break; - } - - // Read the code length and check that we can decode directly into the out slice. - let len = self.table.depths[usize::from(*b)]; - if out.len() < usize::from(len) { - break; - } - - bytes[burst_size - 1] = len; - } - - // No code left, and no more bytes to fill the buffer. - if burst_size == 0 { - if burst_required_for_progress { - status = Ok(LzwStatus::NoProgress); - } - code_link = Some((code, link)); - break; - } - - burst_required_for_progress = false; - // Note that the very last code in the burst buffer doesn't actually belong to the - // burst itself. TODO: sometimes it could, we just don't differentiate between the - // breaks and a loop end condition above. That may be a speed advantage? - let (&new_code, burst) = burst[..burst_size].split_last().unwrap(); - - // The very tight loop for restoring the actual burst. - for (&burst, target) in burst.iter().zip(&mut target[..burst_size - 1]) { - let cha = self.table.reconstruct(burst, target); - // TODO: this pushes into a Vec, maybe we can make this cleaner. - // Theoretically this has a branch and llvm tends to be flaky with code layout for - // the case of requiring an allocation (which can't occur in practice). - let new_link = self.table.derive(&link, cha, code); - self.next_code += 1; - code = burst; - link = new_link; - } - - // Update the slice holding the last decoded word. - if let Some(new_last) = target[..burst_size - 1].last_mut() { - let slice = core::mem::replace(new_last, &mut []); - last_decoded = Some(&*slice); - } - - // Now handle the special codes. - if new_code == self.clear_code { - self.reset_tables(); - last_decoded = None; - continue; - } - - if new_code == self.end_code { - self.has_ended = true; - status = Ok(LzwStatus::Done); - last_decoded = None; - break; - } - - if new_code > self.next_code { - status = Err(LzwError::InvalidCode); - last_decoded = None; - break; - } - - let required_len = if new_code == self.next_code { - self.table.depths[usize::from(code)] + 1 - } else { - self.table.depths[usize::from(new_code)] - }; - - let cha; - let is_in_buffer; - // Check if we will need to store our current state into the buffer. - if usize::from(required_len) > out.len() { - is_in_buffer = true; - if new_code == self.next_code { - // last_decoded will be Some if we have restored any code into the out slice. - // Otherwise it will still be present in the buffer. - if let Some(last) = last_decoded.take() { - self.buffer.bytes[..last.len()].copy_from_slice(last); - self.buffer.write_mark = last.len(); - self.buffer.read_mark = last.len(); - } - - cha = self.buffer.fill_cscsc(); - } else { - // Restore the decoded word into the buffer. - last_decoded = None; - cha = self.buffer.fill_reconstruct(&self.table, new_code); - } - } else { - is_in_buffer = false; - let (target, tail) = out.split_at_mut(usize::from(required_len)); - out = tail; - - if new_code == self.next_code { - // Reconstruct high. - let source = match last_decoded.take() { - Some(last) => last, - None => &self.buffer.bytes[..self.buffer.write_mark], - }; - cha = source[0]; - target[..source.len()].copy_from_slice(source); - target[source.len()..][0] = source[0]; - } else { - cha = self.table.reconstruct(new_code, target); - } - - // A new decoded word. - last_decoded = Some(target); - } - - let new_link; - // Each newly read code creates one new code/link based on the preceding code if we - // have enough space to put it there. - if !self.table.is_full() { - let link = self.table.derive(&link, cha, code); - - if self.next_code == self.code_buffer.max_code() - Code::from(self.is_tiff) - && self.code_buffer.code_size() < MAX_CODESIZE - { - self.bump_code_size(); - } - - self.next_code += 1; - new_link = link; - } else { - // It's actually quite likely that the next code will be a reset but just in case. - // FIXME: this path hasn't been tested very well. - new_link = link.clone(); - } - - // store the information on the decoded word. - code_link = Some((new_code, new_link)); - - // Can't make any more progress with decoding. - if is_in_buffer { - break; - } - } - - // We need to store the last word into the buffer in case the first code in the next - // iteration is the next_code. - if let Some(tail) = last_decoded { - self.buffer.bytes[..tail.len()].copy_from_slice(tail); - self.buffer.write_mark = tail.len(); - self.buffer.read_mark = tail.len(); - } - - // Ensure we don't indicate that no progress was made if we read some bytes from the input - // (which is progress). - if o_in > inp.len() { - if let Ok(LzwStatus::NoProgress) = status { - status = Ok(LzwStatus::Ok); - } - } - - // Store the code/link state. - self.last = code_link; - - BufferResult { - consumed_in: o_in.wrapping_sub(inp.len()), - consumed_out: o_out.wrapping_sub(out.len()), - status, - } - } -} - -impl<C: CodeBuffer> DecodeState<C> { - fn next_symbol(&mut self, inp: &mut &[u8]) -> Option<Code> { - self.code_buffer.next_symbol(inp) - } - - fn bump_code_size(&mut self) { - self.code_buffer.bump_code_size() - } - - fn refill_bits(&mut self, inp: &mut &[u8]) { - self.code_buffer.refill_bits(inp) - } - - fn get_bits(&mut self) -> Option<Code> { - self.code_buffer.get_bits() - } -} - -impl CodeBuffer for MsbBuffer { - fn new(min_size: u8) -> Self { - MsbBuffer { - code_size: min_size + 1, - code_mask: (1u16 << (min_size + 1)) - 1, - bit_buffer: 0, - bits: 0, - } - } - - fn reset(&mut self, min_size: u8) { - self.code_size = min_size + 1; - self.code_mask = (1 << self.code_size) - 1; - } - - fn next_symbol(&mut self, inp: &mut &[u8]) -> Option<Code> { - if self.bits < self.code_size { - self.refill_bits(inp); - } - - self.get_bits() - } - - fn bump_code_size(&mut self) { - self.code_size += 1; - self.code_mask = (self.code_mask << 1) | 1; - } - - fn refill_bits(&mut self, inp: &mut &[u8]) { - let wish_count = (64 - self.bits) / 8; - let mut buffer = [0u8; 8]; - let new_bits = match inp.get(..usize::from(wish_count)) { - Some(bytes) => { - buffer[..usize::from(wish_count)].copy_from_slice(bytes); - *inp = &inp[usize::from(wish_count)..]; - wish_count * 8 - } - None => { - let new_bits = inp.len() * 8; - buffer[..inp.len()].copy_from_slice(inp); - *inp = &[]; - new_bits as u8 - } - }; - self.bit_buffer |= u64::from_be_bytes(buffer) >> self.bits; - self.bits += new_bits; - } - - fn get_bits(&mut self) -> Option<Code> { - if self.bits < self.code_size { - return None; - } - - let mask = u64::from(self.code_mask); - let rotbuf = self.bit_buffer.rotate_left(self.code_size.into()); - self.bit_buffer = rotbuf & !mask; - self.bits -= self.code_size; - Some((rotbuf & mask) as u16) - } - - fn max_code(&self) -> Code { - self.code_mask - } - - fn code_size(&self) -> u8 { - self.code_size - } -} - -impl CodeBuffer for LsbBuffer { - fn new(min_size: u8) -> Self { - LsbBuffer { - code_size: min_size + 1, - code_mask: (1u16 << (min_size + 1)) - 1, - bit_buffer: 0, - bits: 0, - } - } - - fn reset(&mut self, min_size: u8) { - self.code_size = min_size + 1; - self.code_mask = (1 << self.code_size) - 1; - } - - fn next_symbol(&mut self, inp: &mut &[u8]) -> Option<Code> { - if self.bits < self.code_size { - self.refill_bits(inp); - } - - self.get_bits() - } - - fn bump_code_size(&mut self) { - self.code_size += 1; - self.code_mask = (self.code_mask << 1) | 1; - } - - fn refill_bits(&mut self, inp: &mut &[u8]) { - let wish_count = (64 - self.bits) / 8; - let mut buffer = [0u8; 8]; - let new_bits = match inp.get(..usize::from(wish_count)) { - Some(bytes) => { - buffer[..usize::from(wish_count)].copy_from_slice(bytes); - *inp = &inp[usize::from(wish_count)..]; - wish_count * 8 - } - None => { - let new_bits = inp.len() * 8; - buffer[..inp.len()].copy_from_slice(inp); - *inp = &[]; - new_bits as u8 - } - }; - self.bit_buffer |= u64::from_be_bytes(buffer).swap_bytes() << self.bits; - self.bits += new_bits; - } - - fn get_bits(&mut self) -> Option<Code> { - if self.bits < self.code_size { - return None; - } - - let mask = u64::from(self.code_mask); - let code = self.bit_buffer & mask; - self.bit_buffer >>= self.code_size; - self.bits -= self.code_size; - Some(code as u16) - } - - fn max_code(&self) -> Code { - self.code_mask - } - - fn code_size(&self) -> u8 { - self.code_size - } -} - -impl Buffer { - fn new() -> Self { - Buffer { - bytes: vec![0; MAX_ENTRIES].into_boxed_slice(), - read_mark: 0, - write_mark: 0, - } - } - - /// When encoding a sequence `cScSc` where `c` is any character and `S` is any string - /// this results in two codes `AB`, `A` encoding `cS` and `B` encoding `cSc`. Supposing - /// the buffer is already filled with the reconstruction of `A`, we can easily fill it - /// with the reconstruction of `B`. - fn fill_cscsc(&mut self) -> u8 { - self.bytes[self.write_mark] = self.bytes[0]; - self.write_mark += 1; - self.read_mark = 0; - self.bytes[0] - } - - // Fill the buffer by decoding from the table - fn fill_reconstruct(&mut self, table: &Table, code: Code) -> u8 { - self.write_mark = 0; - self.read_mark = 0; - let depth = table.depths[usize::from(code)]; - let mut memory = core::mem::replace(&mut self.bytes, Box::default()); - - let out = &mut memory[..usize::from(depth)]; - let last = table.reconstruct(code, out); - - self.bytes = memory; - self.write_mark = usize::from(depth); - last - } - - fn buffer(&self) -> &[u8] { - &self.bytes[self.read_mark..self.write_mark] - } - - fn consume(&mut self, amt: usize) { - self.read_mark += amt; - } -} - -impl Table { - fn new() -> Self { - Table { - inner: Vec::with_capacity(MAX_ENTRIES), - depths: Vec::with_capacity(MAX_ENTRIES), - } - } - - fn clear(&mut self, min_size: u8) { - let static_count = usize::from(1u16 << u16::from(min_size)) + 2; - self.inner.truncate(static_count); - self.depths.truncate(static_count); - } - - fn init(&mut self, min_size: u8) { - self.inner.clear(); - self.depths.clear(); - for i in 0..(1u16 << u16::from(min_size)) { - self.inner.push(Link::base(i as u8)); - self.depths.push(1); - } - // Clear code. - self.inner.push(Link::base(0)); - self.depths.push(0); - // End code. - self.inner.push(Link::base(0)); - self.depths.push(0); - } - - fn at(&self, code: Code) -> &Link { - &self.inner[usize::from(code)] - } - - fn is_empty(&self) -> bool { - self.inner.is_empty() - } - - fn is_full(&self) -> bool { - self.inner.len() >= MAX_ENTRIES - } - - fn derive(&mut self, from: &Link, byte: u8, prev: Code) -> Link { - let link = from.derive(byte, prev); - let depth = self.depths[usize::from(prev)] + 1; - self.inner.push(link.clone()); - self.depths.push(depth); - link - } - - fn reconstruct(&self, code: Code, out: &mut [u8]) -> u8 { - let mut code_iter = code; - let table = &self.inner[..=usize::from(code)]; - let len = code_iter; - for ch in out.iter_mut().rev() { - //(code, cha) = self.table[k as usize]; - // Note: This could possibly be replaced with an unchecked array access if - // - value is asserted to be < self.next_code() in push - // - min_size is asserted to be < MAX_CODESIZE - let entry = &table[usize::from(code_iter)]; - code_iter = core::cmp::min(len, entry.prev); - *ch = entry.byte; - } - out[0] - } -} - -impl Link { - fn base(byte: u8) -> Self { - Link { prev: 0, byte } - } - - // TODO: this has self type to make it clear we might depend on the old in a future - // optimization. However, that has no practical purpose right now. - fn derive(&self, byte: u8, prev: Code) -> Self { - Link { prev, byte } - } -} - -#[cfg(test)] -mod tests { - use crate::alloc::vec::Vec; - #[cfg(feature = "std")] - use crate::StreamBuf; - use crate::{decode::Decoder, BitOrder}; - - #[test] - fn invalid_code_size_low() { - let _ = Decoder::new(BitOrder::Msb, 0); - let _ = Decoder::new(BitOrder::Msb, 1); - } - - #[test] - #[should_panic] - fn invalid_code_size_high() { - let _ = Decoder::new(BitOrder::Msb, 14); - } - - fn make_encoded() -> Vec<u8> { - const FILE: &'static [u8] = include_bytes!(concat!( - env!("CARGO_MANIFEST_DIR"), - "/benches/binary-8-msb.lzw" - )); - return Vec::from(FILE); - } - - #[test] - #[cfg(feature = "std")] - fn into_stream_buffer_no_alloc() { - let encoded = make_encoded(); - let mut decoder = Decoder::new(BitOrder::Msb, 8); - - let mut output = vec![]; - let mut buffer = [0; 512]; - let mut istream = decoder.into_stream(&mut output); - istream.set_buffer(&mut buffer[..]); - istream.decode(&encoded[..]).status.unwrap(); - - match istream.buffer { - Some(StreamBuf::Borrowed(_)) => {} - None => panic!("Decoded without buffer??"), - Some(StreamBuf::Owned(_)) => panic!("Unexpected buffer allocation"), - } - } - - #[test] - #[cfg(feature = "std")] - fn into_stream_buffer_small_alloc() { - struct WriteTap<W: std::io::Write>(W); - const BUF_SIZE: usize = 512; - - impl<W: std::io::Write> std::io::Write for WriteTap<W> { - fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> { - assert!(buf.len() <= BUF_SIZE); - self.0.write(buf) - } - fn flush(&mut self) -> std::io::Result<()> { - self.0.flush() - } - } - - let encoded = make_encoded(); - let mut decoder = Decoder::new(BitOrder::Msb, 8); - - let mut output = vec![]; - let mut istream = decoder.into_stream(WriteTap(&mut output)); - istream.set_buffer_size(512); - istream.decode(&encoded[..]).status.unwrap(); - - match istream.buffer { - Some(StreamBuf::Owned(vec)) => assert!(vec.len() <= BUF_SIZE), - Some(StreamBuf::Borrowed(_)) => panic!("Unexpected borrowed buffer, where from?"), - None => panic!("Decoded without buffer??"), - } - } - - #[test] - #[cfg(feature = "std")] - fn reset() { - let encoded = make_encoded(); - let mut decoder = Decoder::new(BitOrder::Msb, 8); - let mut reference = None; - - for _ in 0..2 { - let mut output = vec![]; - let mut buffer = [0; 512]; - let mut istream = decoder.into_stream(&mut output); - istream.set_buffer(&mut buffer[..]); - istream.decode_all(&encoded[..]).status.unwrap(); - - decoder.reset(); - if let Some(reference) = &reference { - assert_eq!(output, *reference); - } else { - reference = Some(output); - } - } - } -} diff --git a/vendor/weezl/src/decode_into_async.rs b/vendor/weezl/src/decode_into_async.rs deleted file mode 100644 index e39a26f..0000000 --- a/vendor/weezl/src/decode_into_async.rs +++ /dev/null @@ -1,143 +0,0 @@ -use crate::decode::IntoAsync; -use crate::error::LzwStatus; -use crate::error::StreamResult; -use crate::StreamBuf; -use std::io; - -impl<'d, W: futures::io::AsyncWrite + core::marker::Unpin> IntoAsync<'d, W> { - /// Decode data from a reader. - /// - /// This will read data until the stream is empty or an end marker is reached. - pub async fn decode(&mut self, read: impl futures::io::AsyncBufRead) -> StreamResult { - self.decode_part(read, false).await - } - - /// Decode data from a reader, requiring an end marker. - pub async fn decode_all(mut self, read: impl futures::io::AsyncBufRead) -> StreamResult { - self.decode_part(read, true).await - } - - /// Set the size of the intermediate decode buffer. - /// - /// A buffer of this size is allocated to hold one part of the decoded stream when no buffer is - /// available and any decoding method is called. No buffer is allocated if `set_buffer` has - /// been called. The buffer is reused. - /// - /// # Panics - /// This method panics if `size` is `0`. - pub fn set_buffer_size(&mut self, size: usize) { - assert_ne!(size, 0, "Attempted to set empty buffer"); - self.default_size = size; - } - - /// Use a particular buffer as an intermediate decode buffer. - /// - /// Calling this sets or replaces the buffer. When a buffer has been set then it is used - /// instead of dynamically allocating a buffer. Note that the size of the buffer is critical - /// for efficient decoding. Some optimization techniques require the buffer to hold one or more - /// previous decoded words. There is also additional overhead from `write` calls each time the - /// buffer has been filled. - /// - /// # Panics - /// This method panics if the `buffer` is empty. - pub fn set_buffer(&mut self, buffer: &'d mut [u8]) { - assert_ne!(buffer.len(), 0, "Attempted to set empty buffer"); - self.buffer = Some(StreamBuf::Borrowed(buffer)); - } - - async fn decode_part( - &mut self, - read: impl futures::io::AsyncBufRead, - must_finish: bool, - ) -> StreamResult { - use futures::io::AsyncBufReadExt; - use futures::io::AsyncWriteExt; - - let IntoAsync { - decoder, - writer, - buffer, - default_size, - } = self; - - futures::pin_mut!(read); - let mut read: core::pin::Pin<_> = read; - - let mut bytes_read = 0; - let mut bytes_written = 0; - - // Converting to mutable refs to move into the `once` closure. - let read_bytes = &mut bytes_read; - let write_bytes = &mut bytes_written; - - let outbuf: &mut [u8] = - match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } { - StreamBuf::Borrowed(slice) => &mut *slice, - StreamBuf::Owned(vec) => &mut *vec, - }; - assert!(!outbuf.is_empty()); - - let status = loop { - // Try to grab one buffer of input data. - let mut filler = read.as_mut(); - let data = match filler.fill_buf().await { - Ok(buf) => buf, - Err(err) => break Err(err), - }; - - // Decode as much of the buffer as fits. - let result = decoder.decode_bytes(data, &mut outbuf[..]); - // Do the bookkeeping and consume the buffer. - *read_bytes += result.consumed_in; - *write_bytes += result.consumed_out; - read.as_mut().consume(result.consumed_in); - - // Handle an error status in the result. - let status = match result.status { - Ok(ok) => ok, - Err(err) => { - break Err(io::Error::new( - io::ErrorKind::InvalidData, - &*format!("{:?}", err), - )); - } - }; - - // Check if we had any new data at all. - if let LzwStatus::NoProgress = status { - debug_assert_eq!( - result.consumed_out, 0, - "No progress means we have not decoded any data" - ); - // In particular we did not finish decoding. - if must_finish { - break Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "No more data but no end marker detected", - )); - } else { - break Ok(()); - } - } - - // And finish by writing our result. - // TODO: we may lose data on error (also on status error above) which we might want to - // deterministically handle so that we don't need to restart everything from scratch as - // the only recovery strategy. Any changes welcome. - match writer.write_all(&outbuf[..result.consumed_out]).await { - Ok(_) => {} - Err(err) => break Err(err), - } - - if let LzwStatus::Done = status { - break Ok(()); - } - }; - - StreamResult { - bytes_read, - bytes_written, - status, - } - } -} diff --git a/vendor/weezl/src/encode.rs b/vendor/weezl/src/encode.rs deleted file mode 100644 index 492b18c..0000000 --- a/vendor/weezl/src/encode.rs +++ /dev/null @@ -1,1126 +0,0 @@ -//! A module for all encoding needs. -use crate::error::{BufferResult, LzwError, LzwStatus, VectorResult}; -use crate::{BitOrder, Code, StreamBuf, MAX_CODESIZE, MAX_ENTRIES, STREAM_BUF_SIZE}; - -use crate::alloc::{boxed::Box, vec::Vec}; -#[cfg(feature = "std")] -use crate::error::StreamResult; -#[cfg(feature = "std")] -use std::io::{self, BufRead, Write}; - -/// The state for encoding data with an LZW algorithm. -/// -/// The same structure can be utilized with streams as well as your own buffers and driver logic. -/// It may even be possible to mix them if you are sufficiently careful not to lose any written -/// data in the process. -/// -/// This is a sans-IO implementation, meaning that it only contains the state of the encoder and -/// the caller will provide buffers for input and output data when calling the basic -/// [`encode_bytes`] method. Nevertheless, a number of _adapters_ are provided in the `into_*` -/// methods for enoding with a particular style of common IO. -/// -/// * [`encode`] for encoding once without any IO-loop. -/// * [`into_async`] for encoding with the `futures` traits for asynchronous IO. -/// * [`into_stream`] for encoding with the standard `io` traits. -/// * [`into_vec`] for in-memory encoding. -/// -/// [`encode_bytes`]: #method.encode_bytes -/// [`encode`]: #method.encode -/// [`into_async`]: #method.into_async -/// [`into_stream`]: #method.into_stream -/// [`into_vec`]: #method.into_vec -pub struct Encoder { - /// Internally dispatch via a dynamic trait object. This did not have any significant - /// performance impact as we batch data internally and this pointer does not change after - /// creation! - state: Box<dyn Stateful + Send + 'static>, -} - -/// A encoding stream sink. -/// -/// See [`Encoder::into_stream`] on how to create this type. -/// -/// [`Encoder::into_stream`]: struct.Encoder.html#method.into_stream -#[cfg_attr( - not(feature = "std"), - deprecated = "This type is only useful with the `std` feature." -)] -#[cfg_attr(not(feature = "std"), allow(dead_code))] -pub struct IntoStream<'d, W> { - encoder: &'d mut Encoder, - writer: W, - buffer: Option<StreamBuf<'d>>, - default_size: usize, -} - -/// An async decoding sink. -/// -/// See [`Encoder::into_async`] on how to create this type. -/// -/// [`Encoder::into_async`]: struct.Encoder.html#method.into_async -#[cfg(feature = "async")] -pub struct IntoAsync<'d, W> { - encoder: &'d mut Encoder, - writer: W, - buffer: Option<StreamBuf<'d>>, - default_size: usize, -} - -/// A encoding sink into a vector. -/// -/// See [`Encoder::into_vec`] on how to create this type. -/// -/// [`Encoder::into_vec`]: struct.Encoder.html#method.into_vec -pub struct IntoVec<'d> { - encoder: &'d mut Encoder, - vector: &'d mut Vec<u8>, -} - -trait Stateful { - fn advance(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult; - fn mark_ended(&mut self) -> bool; - /// Reset the state tracking if end code has been written. - fn restart(&mut self); - /// Reset the encoder to the beginning, dropping all buffers etc. - fn reset(&mut self); -} - -struct EncodeState<B: Buffer> { - /// The configured minimal code size. - min_size: u8, - /// The current encoding symbol tree. - tree: Tree, - /// If we have pushed the end code. - has_ended: bool, - /// If tiff then bumps are a single code sooner. - is_tiff: bool, - /// The code corresponding to the currently read characters. - current_code: Code, - /// The clear code for resetting the dictionary. - clear_code: Code, - /// The bit buffer for encoding. - buffer: B, -} - -struct MsbBuffer { - /// The current code length. - code_size: u8, - /// The buffer bits. - buffer: u64, - /// The number of valid buffer bits. - bits_in_buffer: u8, -} - -struct LsbBuffer { - /// The current code length. - code_size: u8, - /// The buffer bits. - buffer: u64, - /// The number of valid buffer bits. - bits_in_buffer: u8, -} - -trait Buffer { - fn new(size: u8) -> Self; - /// Reset the code size in the buffer. - fn reset(&mut self, min_size: u8); - /// Apply effects of a Clear Code. - fn clear(&mut self, min_size: u8); - /// Insert a code into the buffer. - fn buffer_code(&mut self, code: Code); - /// Push bytes if the buffer space is getting small. - fn push_out(&mut self, out: &mut &mut [u8]) -> bool; - /// Flush all full bytes, returning if at least one more byte remains. - fn flush_out(&mut self, out: &mut &mut [u8]) -> bool; - /// Pad the buffer to a full byte. - fn buffer_pad(&mut self); - /// Increase the maximum code size. - fn bump_code_size(&mut self); - /// Return the maximum code with the current code size. - fn max_code(&self) -> Code; - /// Return the current code size in bits. - fn code_size(&self) -> u8; -} - -/// One tree node for at most each code. -/// To avoid using too much memory we keep nodes with few successors in optimized form. This form -/// doesn't offer lookup by indexing but instead does a linear search. -#[derive(Default)] -struct Tree { - simples: Vec<Simple>, - complex: Vec<Full>, - keys: Vec<CompressedKey>, -} - -#[derive(Clone, Copy)] -enum FullKey { - NoSuccessor, - Simple(u16), - Full(u16), -} - -#[derive(Clone, Copy)] -struct CompressedKey(u16); - -const SHORT: usize = 16; - -#[derive(Clone, Copy)] -struct Simple { - codes: [Code; SHORT], - chars: [u8; SHORT], - count: u8, -} - -#[derive(Clone, Copy)] -struct Full { - char_continuation: [Code; 256], -} - -impl Encoder { - /// Create a new encoder with the specified bit order and symbol size. - /// - /// The algorithm for dynamically increasing the code symbol bit width is compatible with the - /// original specification. In particular you will need to specify an `Lsb` bit oder to encode - /// the data portion of a compressed `gif` image. - /// - /// # Panics - /// - /// The `size` needs to be in the interval `2..=12`. - pub fn new(order: BitOrder, size: u8) -> Self { - type Boxed = Box<dyn Stateful + Send + 'static>; - super::assert_encode_size(size); - let state = match order { - BitOrder::Lsb => Box::new(EncodeState::<LsbBuffer>::new(size)) as Boxed, - BitOrder::Msb => Box::new(EncodeState::<MsbBuffer>::new(size)) as Boxed, - }; - - Encoder { state } - } - - /// Create a TIFF compatible encoder with the specified bit order and symbol size. - /// - /// The algorithm for dynamically increasing the code symbol bit width is compatible with the - /// TIFF specification, which is a misinterpretation of the original algorithm for increasing - /// the code size. It switches one symbol sooner. - /// - /// # Panics - /// - /// The `size` needs to be in the interval `2..=12`. - pub fn with_tiff_size_switch(order: BitOrder, size: u8) -> Self { - type Boxed = Box<dyn Stateful + Send + 'static>; - super::assert_encode_size(size); - let state = match order { - BitOrder::Lsb => { - let mut state = Box::new(EncodeState::<LsbBuffer>::new(size)); - state.is_tiff = true; - state as Boxed - } - BitOrder::Msb => { - let mut state = Box::new(EncodeState::<MsbBuffer>::new(size)); - state.is_tiff = true; - state as Boxed - } - }; - - Encoder { state } - } - - /// Encode some bytes from `inp` into `out`. - /// - /// See [`into_stream`] for high-level functions (this interface is only available with the - /// `std` feature) and [`finish`] for marking the input data as complete. - /// - /// When some input byte is invalid, i.e. is not smaller than `1 << size`, then that byte and - /// all following ones will _not_ be consumed and the `status` of the result will signal an - /// error. The result will also indicate that all bytes up to but not including the offending - /// byte have been consumed. You may try again with a fixed byte. - /// - /// [`into_stream`]: #method.into_stream - /// [`finish`]: #method.finish - pub fn encode_bytes(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult { - self.state.advance(inp, out) - } - - /// Encode a single chunk of data. - /// - /// This method will add an end marker to the encoded chunk. - /// - /// This is a convenience wrapper around [`into_vec`]. Use the `into_vec` adapter to customize - /// buffer size, to supply an existing vector, to control whether an end marker is required, or - /// to preserve partial data in the case of a decoding error. - /// - /// [`into_vec`]: #into_vec - /// - /// # Example - /// - /// ``` - /// use weezl::{BitOrder, encode::Encoder}; - /// - /// let data = b"Hello, world"; - /// let encoded = Encoder::new(BitOrder::Msb, 9) - /// .encode(data) - /// .expect("All bytes valid for code size"); - /// ``` - pub fn encode(&mut self, data: &[u8]) -> Result<Vec<u8>, LzwError> { - let mut output = Vec::new(); - self.into_vec(&mut output).encode_all(data).status?; - Ok(output) - } - - /// Construct a encoder into a writer. - #[cfg(feature = "std")] - pub fn into_stream<W: Write>(&mut self, writer: W) -> IntoStream<'_, W> { - IntoStream { - encoder: self, - writer, - buffer: None, - default_size: STREAM_BUF_SIZE, - } - } - - /// Construct a encoder into an async writer. - #[cfg(feature = "async")] - pub fn into_async<W: futures::io::AsyncWrite>(&mut self, writer: W) -> IntoAsync<'_, W> { - IntoAsync { - encoder: self, - writer, - buffer: None, - default_size: STREAM_BUF_SIZE, - } - } - - /// Construct an encoder into a vector. - /// - /// All encoded data is appended and the vector is __not__ cleared. - /// - /// Compared to `into_stream` this interface allows a high-level access to encoding without - /// requires the `std`-feature. Also, it can make full use of the extra buffer control that the - /// special target exposes. - pub fn into_vec<'lt>(&'lt mut self, vec: &'lt mut Vec<u8>) -> IntoVec<'lt> { - IntoVec { - encoder: self, - vector: vec, - } - } - - /// Mark the encoding as in the process of finishing. - /// - /// The next following call to `encode_bytes` which is able to consume the complete input will - /// also try to emit an end code. It's not recommended, but also not unsound, to use different - /// byte slices in different calls from this point forward and thus to 'delay' the actual end - /// of the data stream. The behaviour after the end marker has been written is unspecified but - /// sound. - pub fn finish(&mut self) { - self.state.mark_ended(); - } - - /// Undo marking this data stream as ending. - /// FIXME: clarify how this interacts with padding introduced after end code. - #[allow(dead_code)] - pub(crate) fn restart(&mut self) { - self.state.restart() - } - - /// Reset all internal state. - /// - /// This produce an encoder as if just constructed with `new` but taking slightly less work. In - /// particular it will not deallocate any internal allocations. It will also avoid some - /// duplicate setup work. - pub fn reset(&mut self) { - self.state.reset() - } -} - -#[cfg(feature = "std")] -impl<'d, W: Write> IntoStream<'d, W> { - /// Encode data from a reader. - /// - /// This will drain the supplied reader. It will not encode an end marker after all data has - /// been processed. - pub fn encode(&mut self, read: impl BufRead) -> StreamResult { - self.encode_part(read, false) - } - - /// Encode data from a reader and an end marker. - pub fn encode_all(mut self, read: impl BufRead) -> StreamResult { - self.encode_part(read, true) - } - - /// Set the size of the intermediate encode buffer. - /// - /// A buffer of this size is allocated to hold one part of the encoded stream when no buffer is - /// available and any encoding method is called. No buffer is allocated if `set_buffer` has - /// been called. The buffer is reused. - /// - /// # Panics - /// This method panics if `size` is `0`. - pub fn set_buffer_size(&mut self, size: usize) { - assert_ne!(size, 0, "Attempted to set empty buffer"); - self.default_size = size; - } - - /// Use a particular buffer as an intermediate encode buffer. - /// - /// Calling this sets or replaces the buffer. When a buffer has been set then it is used - /// instead of a dynamically allocating a buffer. Note that the size of the buffer is relevant - /// for efficient encoding as there is additional overhead from `write` calls each time the - /// buffer has been filled. - /// - /// # Panics - /// This method panics if the `buffer` is empty. - pub fn set_buffer(&mut self, buffer: &'d mut [u8]) { - assert_ne!(buffer.len(), 0, "Attempted to set empty buffer"); - self.buffer = Some(StreamBuf::Borrowed(buffer)); - } - - fn encode_part(&mut self, mut read: impl BufRead, finish: bool) -> StreamResult { - let IntoStream { - encoder, - writer, - buffer, - default_size, - } = self; - enum Progress { - Ok, - Done, - } - - let mut bytes_read = 0; - let mut bytes_written = 0; - - let read_bytes = &mut bytes_read; - let write_bytes = &mut bytes_written; - - let outbuf: &mut [u8] = - match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } { - StreamBuf::Borrowed(slice) => &mut *slice, - StreamBuf::Owned(vec) => &mut *vec, - }; - assert!(!outbuf.is_empty()); - - let once = move || { - let data = read.fill_buf()?; - - if data.is_empty() { - if finish { - encoder.finish(); - } else { - return Ok(Progress::Done); - } - } - - let result = encoder.encode_bytes(data, &mut outbuf[..]); - *read_bytes += result.consumed_in; - *write_bytes += result.consumed_out; - read.consume(result.consumed_in); - - let done = result.status.map_err(|err| { - io::Error::new(io::ErrorKind::InvalidData, &*format!("{:?}", err)) - })?; - - if let LzwStatus::Done = done { - writer.write_all(&outbuf[..result.consumed_out])?; - return Ok(Progress::Done); - } - - if let LzwStatus::NoProgress = done { - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "No more data but no end marker detected", - )); - } - - writer.write_all(&outbuf[..result.consumed_out])?; - Ok(Progress::Ok) - }; - - let status = core::iter::repeat_with(once) - // scan+fuse can be replaced with map_while - .scan((), |(), result| match result { - Ok(Progress::Ok) => Some(Ok(())), - Err(err) => Some(Err(err)), - Ok(Progress::Done) => None, - }) - .fuse() - .collect(); - - StreamResult { - bytes_read, - bytes_written, - status, - } - } -} - -impl IntoVec<'_> { - /// Encode data from a slice. - pub fn encode(&mut self, read: &[u8]) -> VectorResult { - self.encode_part(read, false) - } - - /// Decode data from a reader, adding an end marker. - pub fn encode_all(mut self, read: &[u8]) -> VectorResult { - self.encode_part(read, true) - } - - fn grab_buffer(&mut self) -> (&mut [u8], &mut Encoder) { - const CHUNK_SIZE: usize = 1 << 12; - let decoder = &mut self.encoder; - let length = self.vector.len(); - - // Use the vector to do overflow checks and w/e. - self.vector.reserve(CHUNK_SIZE); - // FIXME: encoding into uninit buffer? - self.vector.resize(length + CHUNK_SIZE, 0u8); - - (&mut self.vector[length..], decoder) - } - - fn encode_part(&mut self, part: &[u8], finish: bool) -> VectorResult { - let mut result = VectorResult { - consumed_in: 0, - consumed_out: 0, - status: Ok(LzwStatus::Ok), - }; - - enum Progress { - Ok, - Done, - } - - // Converting to mutable refs to move into the `once` closure. - let read_bytes = &mut result.consumed_in; - let write_bytes = &mut result.consumed_out; - let mut data = part; - - // A 64 MB buffer is quite large but should get alloc_zeroed. - // Note that the decoded size can be up to quadratic in code block. - let once = move || { - // Grab a new output buffer. - let (outbuf, encoder) = self.grab_buffer(); - - if finish { - encoder.finish(); - } - - // Decode as much of the buffer as fits. - let result = encoder.encode_bytes(data, &mut outbuf[..]); - // Do the bookkeeping and consume the buffer. - *read_bytes += result.consumed_in; - *write_bytes += result.consumed_out; - data = &data[result.consumed_in..]; - - let unfilled = outbuf.len() - result.consumed_out; - let filled = self.vector.len() - unfilled; - self.vector.truncate(filled); - - // Handle the status in the result. - let done = result.status?; - if let LzwStatus::Done = done { - Ok(Progress::Done) - } else { - Ok(Progress::Ok) - } - }; - - // Decode chunks of input data until we're done. - let status: Result<(), _> = core::iter::repeat_with(once) - // scan+fuse can be replaced with map_while - .scan((), |(), result| match result { - Ok(Progress::Ok) => Some(Ok(())), - Err(err) => Some(Err(err)), - Ok(Progress::Done) => None, - }) - .fuse() - .collect(); - - if let Err(err) = status { - result.status = Err(err); - } - - result - } -} - -// This is implemented in a separate file, so that 1.34.2 does not parse it. Otherwise, it would -// trip over the usage of await, which is a reserved keyword in that edition/version. It only -// contains an impl block. -#[cfg(feature = "async")] -#[path = "encode_into_async.rs"] -mod impl_encode_into_async; - -impl<B: Buffer> EncodeState<B> { - fn new(min_size: u8) -> Self { - let clear_code = 1 << min_size; - let mut tree = Tree::default(); - tree.init(min_size); - let mut state = EncodeState { - min_size, - tree, - has_ended: false, - is_tiff: false, - current_code: clear_code, - clear_code, - buffer: B::new(min_size), - }; - state.buffer_code(clear_code); - state - } -} - -impl<B: Buffer> Stateful for EncodeState<B> { - fn advance(&mut self, mut inp: &[u8], mut out: &mut [u8]) -> BufferResult { - let c_in = inp.len(); - let c_out = out.len(); - let mut status = Ok(LzwStatus::Ok); - - 'encoding: loop { - if self.push_out(&mut out) { - break; - } - - if inp.is_empty() && self.has_ended { - let end = self.end_code(); - if self.current_code != end { - if self.current_code != self.clear_code { - self.buffer_code(self.current_code); - - // When reading this code, the decoder will add an extra entry to its table - // before reading th end code. Thusly, it may increase its code size based - // on this additional entry. - if self.tree.keys.len() + usize::from(self.is_tiff) - > usize::from(self.buffer.max_code()) - && self.buffer.code_size() < MAX_CODESIZE - { - self.buffer.bump_code_size(); - } - } - self.buffer_code(end); - self.current_code = end; - self.buffer_pad(); - } - - break; - } - - let mut next_code = None; - let mut bytes = inp.iter(); - while let Some(&byte) = bytes.next() { - if self.min_size < 8 && byte >= 1 << self.min_size { - status = Err(LzwError::InvalidCode); - break 'encoding; - } - - inp = bytes.as_slice(); - match self.tree.iterate(self.current_code, byte) { - Ok(code) => self.current_code = code, - Err(_) => { - next_code = Some(self.current_code); - - self.current_code = u16::from(byte); - break; - } - } - } - - match next_code { - // No more bytes, no code produced. - None => break, - Some(code) => { - self.buffer_code(code); - - if self.tree.keys.len() + usize::from(self.is_tiff) - > usize::from(self.buffer.max_code()) + 1 - && self.buffer.code_size() < MAX_CODESIZE - { - self.buffer.bump_code_size(); - } - - if self.tree.keys.len() > MAX_ENTRIES { - self.buffer_code(self.clear_code); - self.tree.reset(self.min_size); - self.buffer.clear(self.min_size); - } - } - } - } - - if inp.is_empty() && self.current_code == self.end_code() { - if !self.flush_out(&mut out) { - status = Ok(LzwStatus::Done); - } - } - - BufferResult { - consumed_in: c_in - inp.len(), - consumed_out: c_out - out.len(), - status, - } - } - - fn mark_ended(&mut self) -> bool { - core::mem::replace(&mut self.has_ended, true) - } - - fn restart(&mut self) { - self.has_ended = false; - } - - fn reset(&mut self) { - self.restart(); - self.current_code = self.clear_code; - self.tree.reset(self.min_size); - self.buffer.reset(self.min_size); - self.buffer_code(self.clear_code); - } -} - -impl<B: Buffer> EncodeState<B> { - fn push_out(&mut self, out: &mut &mut [u8]) -> bool { - self.buffer.push_out(out) - } - - fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { - self.buffer.flush_out(out) - } - - fn end_code(&self) -> Code { - self.clear_code + 1 - } - - fn buffer_pad(&mut self) { - self.buffer.buffer_pad(); - } - - fn buffer_code(&mut self, code: Code) { - self.buffer.buffer_code(code); - } -} - -impl Buffer for MsbBuffer { - fn new(min_size: u8) -> Self { - MsbBuffer { - code_size: min_size + 1, - buffer: 0, - bits_in_buffer: 0, - } - } - - fn reset(&mut self, min_size: u8) { - self.code_size = min_size + 1; - self.buffer = 0; - self.bits_in_buffer = 0; - } - - fn clear(&mut self, min_size: u8) { - self.code_size = min_size + 1; - } - - fn buffer_code(&mut self, code: Code) { - let shift = 64 - self.bits_in_buffer - self.code_size; - self.buffer |= u64::from(code) << shift; - self.bits_in_buffer += self.code_size; - } - - fn push_out(&mut self, out: &mut &mut [u8]) -> bool { - if self.bits_in_buffer + 2 * self.code_size < 64 { - return false; - } - - self.flush_out(out) - } - - fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { - let want = usize::from(self.bits_in_buffer / 8); - let count = want.min((*out).len()); - let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count); - *out = tail; - - for b in bytes { - *b = ((self.buffer & 0xff00_0000_0000_0000) >> 56) as u8; - self.buffer <<= 8; - self.bits_in_buffer -= 8; - } - - count < want - } - - fn buffer_pad(&mut self) { - let to_byte = self.bits_in_buffer.wrapping_neg() & 0x7; - self.bits_in_buffer += to_byte; - } - - fn bump_code_size(&mut self) { - self.code_size += 1; - } - - fn max_code(&self) -> Code { - (1 << self.code_size) - 1 - } - - fn code_size(&self) -> u8 { - self.code_size - } -} - -impl Buffer for LsbBuffer { - fn new(min_size: u8) -> Self { - LsbBuffer { - code_size: min_size + 1, - buffer: 0, - bits_in_buffer: 0, - } - } - - fn reset(&mut self, min_size: u8) { - self.code_size = min_size + 1; - self.buffer = 0; - self.bits_in_buffer = 0; - } - - fn clear(&mut self, min_size: u8) { - self.code_size = min_size + 1; - } - - fn buffer_code(&mut self, code: Code) { - self.buffer |= u64::from(code) << self.bits_in_buffer; - self.bits_in_buffer += self.code_size; - } - - fn push_out(&mut self, out: &mut &mut [u8]) -> bool { - if self.bits_in_buffer + 2 * self.code_size < 64 { - return false; - } - - self.flush_out(out) - } - - fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { - let want = usize::from(self.bits_in_buffer / 8); - let count = want.min((*out).len()); - let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count); - *out = tail; - - for b in bytes { - *b = (self.buffer & 0x0000_0000_0000_00ff) as u8; - self.buffer >>= 8; - self.bits_in_buffer -= 8; - } - - count < want - } - - fn buffer_pad(&mut self) { - let to_byte = self.bits_in_buffer.wrapping_neg() & 0x7; - self.bits_in_buffer += to_byte; - } - - fn bump_code_size(&mut self) { - self.code_size += 1; - } - - fn max_code(&self) -> Code { - (1 << self.code_size) - 1 - } - - fn code_size(&self) -> u8 { - self.code_size - } -} - -impl Tree { - fn init(&mut self, min_size: u8) { - // We need a way to represent the state of a currently empty buffer. We use the clear code - // for this, thus create one complex mapping that leads to the one-char base codes. - self.keys - .resize((1 << min_size) + 2, FullKey::NoSuccessor.into()); - self.complex.push(Full { - char_continuation: [0; 256], - }); - let map_of_begin = self.complex.last_mut().unwrap(); - for ch in 0u16..256 { - map_of_begin.char_continuation[usize::from(ch)] = ch; - } - self.keys[1 << min_size] = FullKey::Full(0).into(); - } - - fn reset(&mut self, min_size: u8) { - self.simples.clear(); - self.keys.truncate((1 << min_size) + 2); - // Keep entry for clear code. - self.complex.truncate(1); - // The first complex is not changed.. - for k in self.keys[..(1 << min_size) + 2].iter_mut() { - *k = FullKey::NoSuccessor.into(); - } - self.keys[1 << min_size] = FullKey::Full(0).into(); - } - - fn at_key(&self, code: Code, ch: u8) -> Option<Code> { - let key = self.keys[usize::from(code)]; - match FullKey::from(key) { - FullKey::NoSuccessor => None, - FullKey::Simple(idx) => { - let nexts = &self.simples[usize::from(idx)]; - let successors = nexts - .codes - .iter() - .zip(nexts.chars.iter()) - .take(usize::from(nexts.count)); - for (&scode, &sch) in successors { - if sch == ch { - return Some(scode); - } - } - - None - } - FullKey::Full(idx) => { - let full = &self.complex[usize::from(idx)]; - let precode = full.char_continuation[usize::from(ch)]; - if usize::from(precode) < MAX_ENTRIES { - Some(precode) - } else { - None - } - } - } - } - - /// Iterate to the next char. - /// Return Ok when it was already in the tree or creates a new entry for it and returns Err. - fn iterate(&mut self, code: Code, ch: u8) -> Result<Code, Code> { - if let Some(next) = self.at_key(code, ch) { - Ok(next) - } else { - Err(self.append(code, ch)) - } - } - - fn append(&mut self, code: Code, ch: u8) -> Code { - let next: Code = self.keys.len() as u16; - let key = self.keys[usize::from(code)]; - // TODO: with debug assertions, check for non-existence - match FullKey::from(key) { - FullKey::NoSuccessor => { - let new_key = FullKey::Simple(self.simples.len() as u16); - self.simples.push(Simple::default()); - let simples = self.simples.last_mut().unwrap(); - simples.codes[0] = next; - simples.chars[0] = ch; - simples.count = 1; - self.keys[usize::from(code)] = new_key.into(); - } - FullKey::Simple(idx) if usize::from(self.simples[usize::from(idx)].count) < SHORT => { - let nexts = &mut self.simples[usize::from(idx)]; - let nidx = usize::from(nexts.count); - nexts.chars[nidx] = ch; - nexts.codes[nidx] = next; - nexts.count += 1; - } - FullKey::Simple(idx) => { - let new_key = FullKey::Full(self.complex.len() as u16); - let simples = &self.simples[usize::from(idx)]; - self.complex.push(Full { - char_continuation: [Code::max_value(); 256], - }); - let full = self.complex.last_mut().unwrap(); - for (&pch, &pcont) in simples.chars.iter().zip(simples.codes.iter()) { - full.char_continuation[usize::from(pch)] = pcont; - } - self.keys[usize::from(code)] = new_key.into(); - } - FullKey::Full(idx) => { - let full = &mut self.complex[usize::from(idx)]; - full.char_continuation[usize::from(ch)] = next; - } - } - self.keys.push(FullKey::NoSuccessor.into()); - next - } -} - -impl Default for FullKey { - fn default() -> Self { - FullKey::NoSuccessor - } -} - -impl Default for Simple { - fn default() -> Self { - Simple { - codes: [0; SHORT], - chars: [0; SHORT], - count: 0, - } - } -} - -impl From<CompressedKey> for FullKey { - fn from(CompressedKey(key): CompressedKey) -> Self { - match (key >> MAX_CODESIZE) & 0xf { - 0 => FullKey::Full(key & 0xfff), - 1 => FullKey::Simple(key & 0xfff), - _ => FullKey::NoSuccessor, - } - } -} - -impl From<FullKey> for CompressedKey { - fn from(full: FullKey) -> Self { - CompressedKey(match full { - FullKey::NoSuccessor => 0x2000, - FullKey::Simple(code) => 0x1000 | code, - FullKey::Full(code) => code, - }) - } -} - -#[cfg(test)] -mod tests { - use super::{BitOrder, Encoder, LzwError, LzwStatus}; - use crate::alloc::vec::Vec; - use crate::decode::Decoder; - #[cfg(feature = "std")] - use crate::StreamBuf; - - #[test] - fn invalid_input_rejected() { - const BIT_LEN: u8 = 2; - let ref input = [0, 1 << BIT_LEN /* invalid */, 0]; - let ref mut target = [0u8; 128]; - let mut encoder = Encoder::new(BitOrder::Msb, BIT_LEN); - - encoder.finish(); - // We require simulation of normality, that is byte-for-byte compression. - let result = encoder.encode_bytes(input, target); - assert!(if let Err(LzwError::InvalidCode) = result.status { - true - } else { - false - }); - assert_eq!(result.consumed_in, 1); - - let fixed = encoder.encode_bytes(&[1, 0], &mut target[result.consumed_out..]); - assert!(if let Ok(LzwStatus::Done) = fixed.status { - true - } else { - false - }); - assert_eq!(fixed.consumed_in, 2); - - // Okay, now test we actually fixed it. - let ref mut compare = [0u8; 4]; - let mut todo = &target[..result.consumed_out + fixed.consumed_out]; - let mut free = &mut compare[..]; - let mut decoder = Decoder::new(BitOrder::Msb, BIT_LEN); - - // Decode with up to 16 rounds, far too much but inconsequential. - for _ in 0..16 { - if decoder.has_ended() { - break; - } - - let result = decoder.decode_bytes(todo, free); - assert!(result.status.is_ok()); - todo = &todo[result.consumed_in..]; - free = &mut free[result.consumed_out..]; - } - - let remaining = { free }.len(); - let len = compare.len() - remaining; - assert_eq!(todo, &[]); - assert_eq!(compare[..len], [0, 1, 0]); - } - - #[test] - #[should_panic] - fn invalid_code_size_low() { - let _ = Encoder::new(BitOrder::Msb, 1); - } - - #[test] - #[should_panic] - fn invalid_code_size_high() { - let _ = Encoder::new(BitOrder::Msb, 14); - } - - fn make_decoded() -> Vec<u8> { - const FILE: &'static [u8] = - include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/Cargo.lock")); - return Vec::from(FILE); - } - - #[test] - #[cfg(feature = "std")] - fn into_stream_buffer_no_alloc() { - let encoded = make_decoded(); - let mut encoder = Encoder::new(BitOrder::Msb, 8); - - let mut output = vec![]; - let mut buffer = [0; 512]; - let mut istream = encoder.into_stream(&mut output); - istream.set_buffer(&mut buffer[..]); - istream.encode(&encoded[..]).status.unwrap(); - - match istream.buffer { - Some(StreamBuf::Borrowed(_)) => {} - None => panic!("Decoded without buffer??"), - Some(StreamBuf::Owned(_)) => panic!("Unexpected buffer allocation"), - } - } - - #[test] - #[cfg(feature = "std")] - fn into_stream_buffer_small_alloc() { - struct WriteTap<W: std::io::Write>(W); - const BUF_SIZE: usize = 512; - - impl<W: std::io::Write> std::io::Write for WriteTap<W> { - fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> { - assert!(buf.len() <= BUF_SIZE); - self.0.write(buf) - } - fn flush(&mut self) -> std::io::Result<()> { - self.0.flush() - } - } - - let encoded = make_decoded(); - let mut encoder = Encoder::new(BitOrder::Msb, 8); - - let mut output = vec![]; - let mut istream = encoder.into_stream(WriteTap(&mut output)); - istream.set_buffer_size(512); - istream.encode(&encoded[..]).status.unwrap(); - - match istream.buffer { - Some(StreamBuf::Owned(vec)) => assert!(vec.len() <= BUF_SIZE), - Some(StreamBuf::Borrowed(_)) => panic!("Unexpected borrowed buffer, where from?"), - None => panic!("Decoded without buffer??"), - } - } - - #[test] - #[cfg(feature = "std")] - fn reset() { - let encoded = make_decoded(); - let mut encoder = Encoder::new(BitOrder::Msb, 8); - let mut reference = None; - - for _ in 0..2 { - let mut output = vec![]; - let mut buffer = [0; 512]; - let mut istream = encoder.into_stream(&mut output); - istream.set_buffer(&mut buffer[..]); - istream.encode_all(&encoded[..]).status.unwrap(); - - encoder.reset(); - if let Some(reference) = &reference { - assert_eq!(output, *reference); - } else { - reference = Some(output); - } - } - } -} diff --git a/vendor/weezl/src/encode_into_async.rs b/vendor/weezl/src/encode_into_async.rs deleted file mode 100644 index 6973540..0000000 --- a/vendor/weezl/src/encode_into_async.rs +++ /dev/null @@ -1,142 +0,0 @@ -use crate::encode::IntoAsync; -use crate::error::LzwStatus; -use crate::error::StreamResult; -use crate::StreamBuf; -use std::io; - -impl<'d, W: futures::io::AsyncWrite + core::marker::Unpin> IntoAsync<'d, W> { - /// Encode data from a reader. - /// - /// This will drain the supplied reader. It will not encode an end marker after all data has - /// been processed. - pub async fn encode(&mut self, read: impl futures::io::AsyncBufRead) -> StreamResult { - self.encode_part(read, false).await - } - - /// Encode data from a reader and an end marker. - pub async fn encode_all(mut self, read: impl futures::io::AsyncBufRead) -> StreamResult { - self.encode_part(read, true).await - } - - /// Set the size of the intermediate decode buffer. - /// - /// A buffer of this size is allocated to hold one part of the decoded stream when no buffer is - /// available and any decoding method is called. No buffer is allocated if `set_buffer` has - /// been called. The buffer is reused. - /// - /// # Panics - /// This method panics if `size` is `0`. - pub fn set_buffer_size(&mut self, size: usize) { - assert_ne!(size, 0, "Attempted to set empty buffer"); - self.default_size = size; - } - - /// Use a particular buffer as an intermediate decode buffer. - /// - /// Calling this sets or replaces the buffer. When a buffer has been set then it is used - /// instead of dynamically allocating a buffer. Note that the size of the buffer is critical - /// for efficient decoding. Some optimization techniques require the buffer to hold one or more - /// previous decoded words. There is also additional overhead from `write` calls each time the - /// buffer has been filled. - /// - /// # Panics - /// This method panics if the `buffer` is empty. - pub fn set_buffer(&mut self, buffer: &'d mut [u8]) { - assert_ne!(buffer.len(), 0, "Attempted to set empty buffer"); - self.buffer = Some(StreamBuf::Borrowed(buffer)); - } - - async fn encode_part( - &mut self, - read: impl futures::io::AsyncBufRead, - finish: bool, - ) -> StreamResult { - use futures::io::AsyncBufReadExt; - use futures::io::AsyncWriteExt; - - let IntoAsync { - encoder, - writer, - buffer, - default_size, - } = self; - - futures::pin_mut!(read); - let mut read: core::pin::Pin<_> = read; - - let mut bytes_read = 0; - let mut bytes_written = 0; - - // Converting to mutable refs to move into the `once` closure. - let read_bytes = &mut bytes_read; - let write_bytes = &mut bytes_written; - - let outbuf: &mut [u8] = - match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } { - StreamBuf::Borrowed(slice) => &mut *slice, - StreamBuf::Owned(vec) => &mut *vec, - }; - assert!(!outbuf.is_empty()); - - let status = loop { - // Try to grab one buffer of input data. - let mut filler = read.as_mut(); - let data = match filler.fill_buf().await { - Ok(buf) => buf, - Err(err) => break Err(err), - }; - - if data.is_empty() { - if finish { - encoder.finish(); - } else { - break Ok(()); - } - } - - // Decode as much of the buffer as fits. - let result = encoder.encode_bytes(data, &mut outbuf[..]); - // Do the bookkeeping and consume the buffer. - *read_bytes += result.consumed_in; - *write_bytes += result.consumed_out; - read.as_mut().consume(result.consumed_in); - - // Handle an error status in the result. - let done = match result.status { - Ok(ok) => ok, - Err(err) => { - break Err(io::Error::new( - io::ErrorKind::InvalidData, - &*format!("{:?}", err), - )); - } - }; - - if let LzwStatus::Done = done { - break writer.write_all(&outbuf[..result.consumed_out]).await; - } - - if let LzwStatus::NoProgress = done { - break Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "No more data but no end marker detected", - )); - } - - // And finish by writing our result. - // TODO: we may lose data on error (also on status error above) which we might want to - // deterministically handle so that we don't need to restart everything from scratch as - // the only recovery strategy. Any changes welcome. - match writer.write_all(&outbuf[..result.consumed_out]).await { - Ok(_) => {} - Err(err) => break Err(err), - } - }; - - StreamResult { - bytes_read, - bytes_written, - status, - } - } -} diff --git a/vendor/weezl/src/error.rs b/vendor/weezl/src/error.rs deleted file mode 100644 index 38dd95c..0000000 --- a/vendor/weezl/src/error.rs +++ /dev/null @@ -1,72 +0,0 @@ -/// The result of a coding operation on a pair of buffer. -#[must_use = "Contains a status with potential error information"] -pub struct BufferResult { - /// The number of bytes consumed from the input buffer. - pub consumed_in: usize, - /// The number of bytes written into the output buffer. - pub consumed_out: usize, - /// The status after returning from the write call. - pub status: Result<LzwStatus, LzwError>, -} - -/// The result of a coding operation into a vector. -#[must_use = "Contains a status with potential error information"] -pub struct VectorResult { - /// The number of bytes consumed from the input buffer. - pub consumed_in: usize, - /// The number of bytes written into the output buffer. - pub consumed_out: usize, - /// The status after returning from the write call. - pub status: Result<LzwStatus, LzwError>, -} - -/// The result of coding into an output stream. -#[cfg(feature = "std")] -#[must_use = "Contains a status with potential error information"] -pub struct StreamResult { - /// The total number of bytes consumed from the reader. - pub bytes_read: usize, - /// The total number of bytes written into the writer. - pub bytes_written: usize, - /// The possible error that occurred. - /// - /// Note that when writing into streams it is not in general possible to recover from an error. - pub status: std::io::Result<()>, -} - -/// The status after successful coding of an LZW stream. -#[derive(Debug, Clone, Copy)] -pub enum LzwStatus { - /// Everything went well. - Ok, - /// No bytes were read or written and no internal state advanced. - /// - /// If this is returned but your application can not provide more input data then decoding is - /// definitely stuck for good and it should stop trying and report some error of its own. In - /// other situations this may be used as a signal to refill an internal buffer. - NoProgress, - /// No more data will be produced because an end marker was reached. - Done, -} - -/// The error kind after unsuccessful coding of an LZW stream. -#[derive(Debug, Clone, Copy)] -pub enum LzwError { - /// The input contained an invalid code. - /// - /// For decompression this refers to a code larger than those currently known through the prior - /// decoding stages. For compression this refers to a byte that has no code representation due - /// to being larger than permitted by the `size` parameter given to the Encoder. - InvalidCode, -} - -impl core::fmt::Display for LzwError { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - match self { - LzwError::InvalidCode => f.write_str("invalid code in LZW stream"), - } - } -} - -#[cfg(feature = "std")] -impl std::error::Error for LzwError {} diff --git a/vendor/weezl/src/lib.rs b/vendor/weezl/src/lib.rs deleted file mode 100644 index 3286eb9..0000000 --- a/vendor/weezl/src/lib.rs +++ /dev/null @@ -1,146 +0,0 @@ -//! # LZW decoder and encoder -//! -//! This crates provides an `Encoder` and a `Decoder` in their respective modules. The code words -//! are written from and to bit byte slices (or streams) where it is possible to write either the -//! most or least significant bits first. The maximum possible code size is 12 bits, the smallest -//! available code size is 2 bits. -//! -//! ## Example -//! -//! These two code blocks show the compression and corresponding decompression. Note that you must -//! use the same arguments to `Encoder` and `Decoder`, otherwise the decoding might fail or produce -//! bad results. -//! -#![cfg_attr(feature = "std", doc = "```")] -#![cfg_attr(not(feature = "std"), doc = "```ignore")] -//! use weezl::{BitOrder, encode::Encoder}; -//! -//! let data = b"Hello, world"; -//! let compressed = Encoder::new(BitOrder::Msb, 9) -//! .encode(data) -//! .unwrap(); -//! ``` -//! -#![cfg_attr(feature = "std", doc = "```")] -#![cfg_attr(not(feature = "std"), doc = "```ignore")] -//! use weezl::{BitOrder, decode::Decoder}; -//! # let compressed = b"\x80\x04\x81\x94l\x1b\x06\xf0\xb0 \x1d\xc6\xf1\xc8l\x19 \x10".to_vec(); -//! # let data = b"Hello, world"; -//! -//! let decompressed = Decoder::new(BitOrder::Msb, 9) -//! .decode(&compressed) -//! .unwrap(); -//! assert_eq!(decompressed, data); -//! ``` -//! -//! ## LZW Details -//! -//! The de- and encoder expect the LZW stream to start with a clear code and end with an -//! end code which are defined as follows: -//! -//! * `CLEAR_CODE == 1 << min_code_size` -//! * `END_CODE == CLEAR_CODE + 1` -//! -//! For optimal performance, all buffers and input and output slices should be as large as possible -//! and at least 2048 bytes long. This extends to input streams which should have similarly sized -//! buffers. This library uses Rust's standard allocation interfaces (`Box` and `Vec` to be -//! precise). Since there are no ways to handle allocation errors it is not recommended to operate -//! it on 16-bit targets. -//! -//! ## Allocations and standard library -//! -//! The main algorithm can be used in `no_std` as well, although it requires an allocator. This -//! restriction might be lifted at a later stage. For this you should deactivate the `std` feature. -//! The main interfaces stay intact but the `into_stream` combinator is no available. -#![cfg_attr(not(feature = "std"), no_std)] -#![forbid(unsafe_code)] -#![forbid(missing_docs)] - -#[cfg(all(feature = "alloc", not(feature = "std")))] -extern crate alloc; -#[cfg(all(feature = "alloc", feature = "std"))] -use std as alloc; - -pub(crate) const MAX_CODESIZE: u8 = 12; -pub(crate) const MAX_ENTRIES: usize = 1 << MAX_CODESIZE as usize; - -/// Alias for a LZW code point -pub(crate) type Code = u16; - -/// A default buffer size for encoding/decoding buffer. -/// -/// Note that this is larger than the default size for buffers (usually 4K) since each code word -/// can expand to multiple bytes. Expanding one buffer would yield multiple and require a costly -/// break in the decoding loop. Note that the decoded size can be up to quadratic in code block. -pub(crate) const STREAM_BUF_SIZE: usize = 1 << 24; - -/// The order of bits in bytes. -#[derive(Clone, Copy, Debug)] -pub enum BitOrder { - /// The most significant bit is processed first. - Msb, - /// The least significant bit is processed first. - Lsb, -} - -/// An owned or borrowed buffer for stream operations. -#[cfg(feature = "alloc")] -pub(crate) enum StreamBuf<'d> { - Borrowed(&'d mut [u8]), - Owned(crate::alloc::vec::Vec<u8>), -} - -#[cold] -fn assert_decode_size(size: u8) { - assert!( - size <= MAX_CODESIZE, - "Maximum code size 12 required, got {}", - size - ); -} - -#[cold] -fn assert_encode_size(size: u8) { - assert!(size >= 2, "Minimum code size 2 required, got {}", size); - assert!( - size <= MAX_CODESIZE, - "Maximum code size 12 required, got {}", - size - ); -} - -#[cfg(feature = "alloc")] -pub mod decode; -#[cfg(feature = "alloc")] -pub mod encode; -mod error; - -#[cfg(feature = "std")] -pub use self::error::StreamResult; -pub use self::error::{BufferResult, LzwError, LzwStatus}; - -#[cfg(all(test, feature = "alloc"))] -mod tests { - use crate::decode::Decoder; - use crate::encode::Encoder; - - #[cfg(feature = "std")] - use crate::{decode, encode}; - - #[test] - fn stable_send() { - fn must_be_send<T: Send + 'static>() {} - must_be_send::<Decoder>(); - must_be_send::<Encoder>(); - - #[cfg(feature = "std")] - fn _send_and_lt<'lt, T: Send + 'lt>() {} - - // Check that the inference `W: Send + 'd` => `IntoStream: Send + 'd` works. - #[cfg(feature = "std")] - fn _all_send_writer<'d, W: std::io::Write + Send + 'd>() { - _send_and_lt::<'d, decode::IntoStream<'d, W>>(); - _send_and_lt::<'d, encode::IntoStream<'d, W>>(); - } - } -} |