diff options
Diffstat (limited to 'src/frame/header.rs')
-rw-r--r-- | src/frame/header.rs | 412 |
1 files changed, 412 insertions, 0 deletions
diff --git a/src/frame/header.rs b/src/frame/header.rs new file mode 100644 index 0000000..1513c11 --- /dev/null +++ b/src/frame/header.rs @@ -0,0 +1,412 @@ +use twox_hash::XxHash32; + +use super::Error; +use std::{ + convert::TryInto, + fmt::Debug, + hash::Hasher, + io, + io::{Read, Write}, +}; + +const FLG_RESERVED_MASK: u8 = 0b00000010; +const FLG_VERSION_MASK: u8 = 0b11000000; +const FLG_SUPPORTED_VERSION_BITS: u8 = 0b01000000; + +const FLG_INDEPENDENT_BLOCKS: u8 = 0b00100000; +const FLG_BLOCK_CHECKSUMS: u8 = 0b00010000; +const FLG_CONTENT_SIZE: u8 = 0b00001000; +const FLG_CONTENT_CHECKSUM: u8 = 0b00000100; +const FLG_DICTIONARY_ID: u8 = 0b00000001; + +const BD_RESERVED_MASK: u8 = !BD_BLOCK_SIZE_MASK; +const BD_BLOCK_SIZE_MASK: u8 = 0b01110000; +const BD_BLOCK_SIZE_MASK_RSHIFT: u8 = 4; + +const BLOCK_UNCOMPRESSED_SIZE_BIT: u32 = 0x80000000; + +const LZ4F_MAGIC_NUMBER: u32 = 0x184D2204; +pub(crate) const LZ4F_LEGACY_MAGIC_NUMBER: u32 = 0x184C2102; +const LZ4F_SKIPPABLE_MAGIC_RANGE: std::ops::RangeInclusive<u32> = 0x184D2A50..=0x184D2A5F; + +pub(crate) const MAGIC_NUMBER_SIZE: usize = 4; +pub(crate) const MIN_FRAME_INFO_SIZE: usize = 7; +pub(crate) const MAX_FRAME_INFO_SIZE: usize = 19; +pub(crate) const BLOCK_INFO_SIZE: usize = 4; + +#[derive(Clone, Copy, PartialEq, Debug)] +/// Different predefines blocksizes to choose when compressing data. +#[derive(Default)] +pub enum BlockSize { + /// Will detect optimal frame size based on the size of the first write call + #[default] + Auto = 0, + /// The default block size. + Max64KB = 4, + /// 256KB block size. + Max256KB = 5, + /// 1MB block size. + Max1MB = 6, + /// 4MB block size. + Max4MB = 7, + /// 8MB block size. + Max8MB = 8, +} + +impl BlockSize { + /// Try to find optimal size based on passed buffer length. + pub(crate) fn from_buf_length(buf_len: usize) -> Self { + let mut blocksize = BlockSize::Max4MB; + + for candidate in [BlockSize::Max256KB, BlockSize::Max64KB] { + if buf_len > candidate.get_size() { + return blocksize; + } + blocksize = candidate; + } + BlockSize::Max64KB + } + pub(crate) fn get_size(&self) -> usize { + match self { + BlockSize::Auto => unreachable!(), + BlockSize::Max64KB => 64 * 1024, + BlockSize::Max256KB => 256 * 1024, + BlockSize::Max1MB => 1024 * 1024, + BlockSize::Max4MB => 4 * 1024 * 1024, + BlockSize::Max8MB => 8 * 1024 * 1024, + } + } +} + +#[derive(Clone, Copy, PartialEq, Debug)] +/// The two `BlockMode` operations that can be set on (`FrameInfo`)[FrameInfo] +#[derive(Default)] +pub enum BlockMode { + /// Every block is compressed independently. The default. + #[default] + Independent, + /// Blocks can reference data from previous blocks. + /// + /// Effective when the stream contains small blocks. + Linked, +} + +// From: https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md +// +// General Structure of LZ4 Frame format +// ------------------------------------- +// +// | MagicNb | F. Descriptor | Block | (...) | EndMark | C. Checksum | +// |:-------:|:-------------:| ----- | ----- | ------- | ----------- | +// | 4 bytes | 3-15 bytes | | | 4 bytes | 0-4 bytes | +// +// Frame Descriptor +// ---------------- +// +// | FLG | BD | (Content Size) | (Dictionary ID) | HC | +// | ------- | ------- |:--------------:|:---------------:| ------- | +// | 1 byte | 1 byte | 0 - 8 bytes | 0 - 4 bytes | 1 byte | +// +// __FLG byte__ +// +// | BitNb | 7-6 | 5 | 4 | 3 | 2 | 1 | 0 | +// | ------- |-------|-------|----------|------|----------|----------|------| +// |FieldName|Version|B.Indep|B.Checksum|C.Size|C.Checksum|*Reserved*|DictID| +// +// __BD byte__ +// +// | BitNb | 7 | 6-5-4 | 3-2-1-0 | +// | ------- | -------- | ------------- | -------- | +// |FieldName|*Reserved*| Block MaxSize |*Reserved*| +// +// Data Blocks +// ----------- +// +// | Block Size | data | (Block Checksum) | +// |:----------:| ------ |:----------------:| +// | 4 bytes | | 0 - 4 bytes | +// +#[derive(Debug, Default, Clone)] +/// The metadata for de/compressing with lz4 frame format. +pub struct FrameInfo { + /// If set, includes the total uncompressed size of data in the frame. + pub content_size: Option<u64>, + /// The identifier for the dictionary that must be used to correctly decode data. + /// The compressor and the decompressor must use exactly the same dictionary. + /// + /// Note that this is currently unsupported and for this reason it's not pub. + pub(crate) dict_id: Option<u32>, + /// The maximum uncompressed size of each data block. + pub block_size: BlockSize, + /// The block mode. + pub block_mode: BlockMode, + /// If set, includes a checksum for each data block in the frame. + pub block_checksums: bool, + /// If set, includes a content checksum to verify that the full frame contents have been + /// decoded correctly. + pub content_checksum: bool, + /// If set, use the legacy frame format + pub legacy_frame: bool, +} + +impl FrameInfo { + /// Create a new `FrameInfo`. + pub fn new() -> Self { + Self::default() + } + + /// Whether to include the total uncompressed size of data in the frame. + pub fn content_size(mut self, content_size: Option<u64>) -> Self { + self.content_size = content_size; + self + } + + /// The maximum uncompressed size of each data block. + pub fn block_size(mut self, block_size: BlockSize) -> Self { + self.block_size = block_size; + self + } + + /// The block mode. + pub fn block_mode(mut self, block_mode: BlockMode) -> Self { + self.block_mode = block_mode; + self + } + + /// If set, includes a checksum for each data block in the frame. + pub fn block_checksums(mut self, block_checksums: bool) -> Self { + self.block_checksums = block_checksums; + self + } + + /// If set, includes a content checksum to verify that the full frame contents have been + /// decoded correctly. + pub fn content_checksum(mut self, content_checksum: bool) -> Self { + self.content_checksum = content_checksum; + self + } + + /// If set, use the legacy frame format. + pub fn legacy_frame(mut self, legacy_frame: bool) -> Self { + self.legacy_frame = legacy_frame; + self + } + + pub(crate) fn read_size(input: &[u8]) -> Result<usize, Error> { + let mut required = MIN_FRAME_INFO_SIZE; + let magic_num = u32::from_le_bytes(input[0..4].try_into().unwrap()); + if magic_num == LZ4F_LEGACY_MAGIC_NUMBER { + return Ok(MAGIC_NUMBER_SIZE); + } + + if input.len() < required { + return Ok(required); + } + + if LZ4F_SKIPPABLE_MAGIC_RANGE.contains(&magic_num) { + return Ok(8); + } + if magic_num != LZ4F_MAGIC_NUMBER { + return Err(Error::WrongMagicNumber); + } + + if input[4] & FLG_CONTENT_SIZE != 0 { + required += 8; + } + if input[4] & FLG_DICTIONARY_ID != 0 { + required += 4 + } + Ok(required) + } + + pub(crate) fn write_size(&self) -> usize { + let mut required = MIN_FRAME_INFO_SIZE; + if self.content_size.is_some() { + required += 8; + } + if self.dict_id.is_some() { + required += 4; + } + required + } + + pub(crate) fn write(&self, output: &mut [u8]) -> Result<usize, Error> { + let write_size = self.write_size(); + if output.len() < write_size { + return Err(Error::IoError(io::ErrorKind::UnexpectedEof.into())); + } + let mut buffer = [0u8; MAX_FRAME_INFO_SIZE]; + assert!(write_size <= buffer.len()); + buffer[0..4].copy_from_slice(&LZ4F_MAGIC_NUMBER.to_le_bytes()); + buffer[4] = FLG_SUPPORTED_VERSION_BITS; + if self.block_checksums { + buffer[4] |= FLG_BLOCK_CHECKSUMS; + } + if self.content_checksum { + buffer[4] |= FLG_CONTENT_CHECKSUM; + } + if self.block_mode == BlockMode::Independent { + buffer[4] |= FLG_INDEPENDENT_BLOCKS; + } + buffer[5] = (self.block_size as u8) << BD_BLOCK_SIZE_MASK_RSHIFT; + + // Optional section + let mut offset = 6; + if let Some(size) = self.content_size { + buffer[4] |= FLG_CONTENT_SIZE; + buffer[offset..offset + 8].copy_from_slice(&size.to_le_bytes()); + offset += 8; + } + if let Some(dict_id) = self.dict_id { + buffer[4] |= FLG_DICTIONARY_ID; + buffer[offset..offset + 4].copy_from_slice(&dict_id.to_le_bytes()); + offset += 4; + } + + // Header checksum + let mut hasher = XxHash32::with_seed(0); + hasher.write(&buffer[4..offset]); + let header_checksum = (hasher.finish() >> 8) as u8; + buffer[offset] = header_checksum; + offset += 1; + + debug_assert_eq!(offset, write_size); + output[..write_size].copy_from_slice(&buffer[..write_size]); + Ok(write_size) + } + + pub(crate) fn read(mut input: &[u8]) -> Result<FrameInfo, Error> { + let original_input = input; + // 4 byte Magic + let magic_num = { + let mut buffer = [0u8; 4]; + input.read_exact(&mut buffer)?; + u32::from_le_bytes(buffer) + }; + if magic_num == LZ4F_LEGACY_MAGIC_NUMBER { + return Ok(FrameInfo { + block_size: BlockSize::Max8MB, + legacy_frame: true, + ..FrameInfo::default() + }); + } + if LZ4F_SKIPPABLE_MAGIC_RANGE.contains(&magic_num) { + let mut buffer = [0u8; 4]; + input.read_exact(&mut buffer)?; + let user_data_len = u32::from_le_bytes(buffer); + return Err(Error::SkippableFrame(user_data_len)); + } + if magic_num != LZ4F_MAGIC_NUMBER { + return Err(Error::WrongMagicNumber); + } + + // fixed size section + let [flg_byte, bd_byte] = { + let mut buffer = [0u8, 0]; + input.read_exact(&mut buffer)?; + buffer + }; + + if flg_byte & FLG_VERSION_MASK != FLG_SUPPORTED_VERSION_BITS { + // version is always 01 + return Err(Error::UnsupportedVersion(flg_byte & FLG_VERSION_MASK)); + } + + if flg_byte & FLG_RESERVED_MASK != 0 || bd_byte & BD_RESERVED_MASK != 0 { + return Err(Error::ReservedBitsSet); + } + + let block_mode = if flg_byte & FLG_INDEPENDENT_BLOCKS != 0 { + BlockMode::Independent + } else { + BlockMode::Linked + }; + let content_checksum = flg_byte & FLG_CONTENT_CHECKSUM != 0; + let block_checksums = flg_byte & FLG_BLOCK_CHECKSUMS != 0; + + let block_size = match (bd_byte & BD_BLOCK_SIZE_MASK) >> BD_BLOCK_SIZE_MASK_RSHIFT { + i @ 0..=3 => return Err(Error::UnsupportedBlocksize(i)), + 4 => BlockSize::Max64KB, + 5 => BlockSize::Max256KB, + 6 => BlockSize::Max1MB, + 7 => BlockSize::Max4MB, + _ => unreachable!(), + }; + + // var len section + let mut content_size = None; + if flg_byte & FLG_CONTENT_SIZE != 0 { + let mut buffer = [0u8; 8]; + input.read_exact(&mut buffer).unwrap(); + content_size = Some(u64::from_le_bytes(buffer)); + } + + let mut dict_id = None; + if flg_byte & FLG_DICTIONARY_ID != 0 { + let mut buffer = [0u8; 4]; + input.read_exact(&mut buffer)?; + dict_id = Some(u32::from_le_bytes(buffer)); + } + + // 1 byte header checksum + let expected_checksum = { + let mut buffer = [0u8; 1]; + input.read_exact(&mut buffer)?; + buffer[0] + }; + let mut hasher = XxHash32::with_seed(0); + hasher.write(&original_input[4..original_input.len() - input.len() - 1]); + let header_hash = (hasher.finish() >> 8) as u8; + if header_hash != expected_checksum { + return Err(Error::HeaderChecksumError); + } + + Ok(FrameInfo { + content_size, + dict_id, + block_size, + block_mode, + block_checksums, + content_checksum, + legacy_frame: false, + }) + } +} + +#[derive(Debug)] +pub(crate) enum BlockInfo { + Compressed(u32), + Uncompressed(u32), + EndMark, +} + +impl BlockInfo { + pub(crate) fn read(mut input: &[u8]) -> Result<Self, Error> { + let mut size_buffer = [0u8; 4]; + input.read_exact(&mut size_buffer)?; + let size = u32::from_le_bytes(size_buffer); + if size == 0 { + Ok(BlockInfo::EndMark) + } else if size & BLOCK_UNCOMPRESSED_SIZE_BIT != 0 { + Ok(BlockInfo::Uncompressed(size & !BLOCK_UNCOMPRESSED_SIZE_BIT)) + } else { + Ok(BlockInfo::Compressed(size)) + } + } + + pub(crate) fn write(&self, mut output: &mut [u8]) -> Result<usize, Error> { + let value = match self { + BlockInfo::Compressed(len) if *len == 0 => return Err(Error::InvalidBlockInfo), + BlockInfo::Compressed(len) | BlockInfo::Uncompressed(len) + if *len & BLOCK_UNCOMPRESSED_SIZE_BIT != 0 => + { + return Err(Error::InvalidBlockInfo) + } + BlockInfo::Compressed(len) => *len, + BlockInfo::Uncompressed(len) => *len | BLOCK_UNCOMPRESSED_SIZE_BIT, + BlockInfo::EndMark => 0, + }; + output.write_all(&value.to_le_bytes())?; + Ok(4) + } +} |