Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions parquet-variant/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ rust-version = { workspace = true }

[dependencies]
arrow-schema = "55.1.0"
chrono = { workspace = true }

[lib]


294 changes: 251 additions & 43 deletions parquet-variant/src/decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
// specific language governing permissions and limitations
// under the License.
use arrow_schema::ArrowError;
use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};
use std::array::TryFromSliceError;

use crate::utils::{array_from_slice, first_byte_from_slice, string_from_slice};
use crate::utils::{array_from_slice, slice_from_slice, string_from_slice};

#[derive(Debug, Clone, Copy)]
pub enum VariantBasicType {
Expand All @@ -33,7 +34,18 @@ pub enum VariantPrimitiveType {
BooleanTrue = 1,
BooleanFalse = 2,
Int8 = 3,
// TODO: Add types for the rest of primitives, once API is agreed upon
Int16 = 4,
Int32 = 5,
Int64 = 6,
Double = 7,
Decimal4 = 8,
Decimal8 = 9,
Decimal16 = 10,
Date = 11,
TimestampMicros = 12,
TimestampNtzMicros = 13,
Float = 14,
Binary = 15,
String = 16,
}

Expand Down Expand Up @@ -64,7 +76,18 @@ impl TryFrom<u8> for VariantPrimitiveType {
1 => Ok(VariantPrimitiveType::BooleanTrue),
2 => Ok(VariantPrimitiveType::BooleanFalse),
3 => Ok(VariantPrimitiveType::Int8),
// TODO: Add types for the rest, once API is agreed upon
4 => Ok(VariantPrimitiveType::Int16),
5 => Ok(VariantPrimitiveType::Int32),
6 => Ok(VariantPrimitiveType::Int64),
7 => Ok(VariantPrimitiveType::Double),
8 => Ok(VariantPrimitiveType::Decimal4),
9 => Ok(VariantPrimitiveType::Decimal8),
10 => Ok(VariantPrimitiveType::Decimal16),
11 => Ok(VariantPrimitiveType::Date),
12 => Ok(VariantPrimitiveType::TimestampMicros),
13 => Ok(VariantPrimitiveType::TimestampNtzMicros),
14 => Ok(VariantPrimitiveType::Float),
15 => Ok(VariantPrimitiveType::Binary),
16 => Ok(VariantPrimitiveType::String),
_ => Err(ArrowError::InvalidArgumentError(format!(
"unknown primitive type: {}",
Expand All @@ -73,10 +96,10 @@ impl TryFrom<u8> for VariantPrimitiveType {
}
}
}
/// Extract the primitive type from a Variant value-header byte
pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, ArrowError> {
/// Extract the primitive type from a Variant value-metadata byte
pub(crate) fn get_primitive_type(metadata: u8) -> Result<VariantPrimitiveType, ArrowError> {
// last 6 bits contain the primitive-type, see spec
VariantPrimitiveType::try_from(header >> 2)
VariantPrimitiveType::try_from(metadata >> 2)
}

/// To be used in `map_err` when unpacking an integer from a slice of bytes.
Expand All @@ -85,23 +108,103 @@ fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError {
}

/// Decodes an Int8 from the value section of a variant.
pub(crate) fn decode_int8(value: &[u8]) -> Result<i8, ArrowError> {
let value = i8::from_le_bytes(array_from_slice(value, 1)?);
pub(crate) fn decode_int8(data: &[u8]) -> Result<i8, ArrowError> {
Ok(i8::from_le_bytes(array_from_slice(data, 0)?))
}

/// Decodes an Int16 from the value section of a variant.
pub(crate) fn decode_int16(data: &[u8]) -> Result<i16, ArrowError> {
Ok(i16::from_le_bytes(array_from_slice(data, 0)?))
}

/// Decodes an Int32 from the value section of a variant.
pub(crate) fn decode_int32(data: &[u8]) -> Result<i32, ArrowError> {
Ok(i32::from_le_bytes(array_from_slice(data, 0)?))
}

/// Decodes an Int64 from the value section of a variant.
pub(crate) fn decode_int64(data: &[u8]) -> Result<i64, ArrowError> {
Ok(i64::from_le_bytes(array_from_slice(data, 0)?))
}

/// Decodes a Decimal4 from the value section of a variant.
pub(crate) fn decode_decimal4(data: &[u8]) -> Result<(i32, u8), ArrowError> {
let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
let integer = i32::from_le_bytes(array_from_slice(data, 1)?);
Ok((integer, scale))
}

/// Decodes a Decimal8 from the value section of a variant.
pub(crate) fn decode_decimal8(data: &[u8]) -> Result<(i64, u8), ArrowError> {
let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
let integer = i64::from_le_bytes(array_from_slice(data, 1)?);
Ok((integer, scale))
}

/// Decodes a Decimal16 from the value section of a variant.
pub(crate) fn decode_decimal16(data: &[u8]) -> Result<(i128, u8), ArrowError> {
let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
let integer = i128::from_le_bytes(array_from_slice(data, 1)?);
Ok((integer, scale))
}

/// Decodes a Float from the value section of a variant.
pub(crate) fn decode_float(data: &[u8]) -> Result<f32, ArrowError> {
Ok(f32::from_le_bytes(array_from_slice(data, 0)?))
}

/// Decodes a Double from the value section of a variant.
pub(crate) fn decode_double(data: &[u8]) -> Result<f64, ArrowError> {
Ok(f64::from_le_bytes(array_from_slice(data, 0)?))
}

/// Decodes a Date from the value section of a variant.
pub(crate) fn decode_date(data: &[u8]) -> Result<NaiveDate, ArrowError> {
let days_since_epoch = i32::from_le_bytes(array_from_slice(data, 0)?);
let value = DateTime::UNIX_EPOCH + Duration::days(i64::from(days_since_epoch));
Ok(value.date_naive())
}

/// Decodes a TimestampMicros from the value section of a variant.
pub(crate) fn decode_timestamp_micros(data: &[u8]) -> Result<DateTime<Utc>, ArrowError> {
let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
DateTime::from_timestamp_micros(micros_since_epoch).ok_or_else(|| {
ArrowError::CastError(format!(
"Could not cast `{micros_since_epoch}` microseconds into a DateTime<Utc>"
))
})
}

/// Decodes a TimestampNtzMicros from the value section of a variant.
pub(crate) fn decode_timestampntz_micros(data: &[u8]) -> Result<NaiveDateTime, ArrowError> {
let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
DateTime::from_timestamp_micros(micros_since_epoch)
.ok_or_else(|| {
ArrowError::CastError(format!(
"Could not cast `{micros_since_epoch}` microseconds into a NaiveDateTime"
))
})
.map(|v| v.naive_utc())
}

/// Decodes a Binary from the value section of a variant.
pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> {
let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
let value = slice_from_slice(data, 4..4 + len)?;
Ok(value)
}

/// Decodes a long string from the value section of a variant.
pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> {
let len = u32::from_le_bytes(array_from_slice(value, 1)?) as usize;
let string = string_from_slice(value, 5..5 + len)?;
pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> {
let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
let string = string_from_slice(data, 4..4 + len)?;
Ok(string)
}

/// Decodes a short string from the value section of a variant.
pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> {
let len = (first_byte_from_slice(value)? >> 2) as usize;

let string = string_from_slice(value, 1..1 + len)?;
pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) -> Result<&str, ArrowError> {
let len = (metadata >> 2) as usize;
let string = string_from_slice(data, 0..len)?;
Ok(string)
}

Expand All @@ -111,47 +214,152 @@ mod tests {

#[test]
fn test_i8() -> Result<(), ArrowError> {
let value = [
3 << 2, // Primitive type for i8
42,
];
let result = decode_int8(&value)?;
let data = [0x2a];
let result = decode_int8(&data)?;
assert_eq!(result, 42);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also change this to 0x2a?

Ok(())
}

#[test]
fn test_short_string() -> Result<(), ArrowError> {
let value = [
1 | 5 << 2, // Basic type for short string | length of short string
b'H',
b'e',
b'l',
b'l',
b'o',
b'o',
fn test_i16() -> Result<(), ArrowError> {
let data = [0xd2, 0x04];
let result = decode_int16(&data)?;
assert_eq!(result, 1234);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

0x04d2?

Writing the number as hex makes clear that the test is about getting the bytes where they belong. Otherwise, most humans will have to go to their calculator to check the hex for 1234.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it makes sense to have input be raw bytes and output be the cast rust datatype. For example with the test_date case, it asserts that the output is a NaiveDate rather than raw bytes.

Ok(())
}

#[test]
fn test_i32() -> Result<(), ArrowError> {
let data = [0x40, 0xe2, 0x01, 0x00];
let result = decode_int32(&data)?;
assert_eq!(result, 123456);
Ok(())
}

#[test]
fn test_i64() -> Result<(), ArrowError> {
let data = [0x15, 0x81, 0xe9, 0x7d, 0xf4, 0x10, 0x22, 0x11];
let result = decode_int64(&data)?;
assert_eq!(result, 1234567890123456789);
Ok(())
}

#[test]
fn test_decimal4() -> Result<(), ArrowError> {
let data = [
0x02, // Scale
0xd2, 0x04, 0x00, 0x00, // Integer
];
let result = decode_short_string(&value)?;
let result = decode_decimal4(&data)?;
assert_eq!(result, (1234, 2));
Ok(())
}

#[test]
fn test_decimal8() -> Result<(), ArrowError> {
let data = [
0x02, // Scale
0xd2, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, // Integer
];
let result = decode_decimal8(&data)?;
assert_eq!(result, (1234567890, 2));
Ok(())
}

#[test]
fn test_decimal16() -> Result<(), ArrowError> {
let data = [
0x02, // Scale
0xd2, 0xb6, 0x23, 0xc0, 0xf4, 0x10, 0x22, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, // Integer
];
let result = decode_decimal16(&data)?;
assert_eq!(result, (1234567891234567890, 2));
Ok(())
}

#[test]
fn test_float() -> Result<(), ArrowError> {
let data = [0x06, 0x2c, 0x93, 0x4e];
let result = decode_float(&data)?;
assert_eq!(result, 1234567890.1234);
Ok(())
}

#[test]
fn test_double() -> Result<(), ArrowError> {
let data = [0xc9, 0xe5, 0x87, 0xb4, 0x80, 0x65, 0xd2, 0x41];
let result = decode_double(&data)?;
assert_eq!(result, 1234567890.1234);
Ok(())
}

#[test]
fn test_date() -> Result<(), ArrowError> {
let data = [0xe2, 0x4e, 0x0, 0x0];
let result = decode_date(&data)?;
assert_eq!(result, NaiveDate::from_ymd_opt(2025, 4, 16).unwrap());
Ok(())
}

#[test]
fn test_timestamp_micros() -> Result<(), ArrowError> {
let data = [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00];
let result = decode_timestamp_micros(&data)?;
assert_eq!(
result,
NaiveDate::from_ymd_opt(2025, 4, 16)
.unwrap()
.and_hms_milli_opt(16, 34, 56, 780)
.unwrap()
.and_utc()
);
Ok(())
}

#[test]
fn test_timestampntz_micros() -> Result<(), ArrowError> {
let data = [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00];
let result = decode_timestampntz_micros(&data)?;
assert_eq!(
result,
NaiveDate::from_ymd_opt(2025, 4, 16)
.unwrap()
.and_hms_milli_opt(16, 34, 56, 780)
.unwrap()
);
Ok(())
}

#[test]
fn test_binary() -> Result<(), ArrowError> {
let data = [
0x09, 0, 0, 0, // Length of binary data, 4-byte little-endian
0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe,
];
let result = decode_binary(&data)?;
assert_eq!(
result,
[0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe]
);
Ok(())
}

#[test]
fn test_short_string() -> Result<(), ArrowError> {
let data = [b'H', b'e', b'l', b'l', b'o', b'o'];
let result = decode_short_string(1 | 5 << 2, &data)?;
assert_eq!(result, "Hello");
Ok(())
}

#[test]
fn test_string() -> Result<(), ArrowError> {
let value = [
16 << 2, // Basic type for short string | length of short string
5,
0,
0,
0, // Length of string
b'H',
b'e',
b'l',
b'l',
b'o',
b'o',
let data = [
0x05, 0, 0, 0, // Length of string, 4-byte little-endian
b'H', b'e', b'l', b'l', b'o', b'o',
];
let result = decode_long_string(&value)?;
let result = decode_long_string(&data)?;
assert_eq!(result, "Hello");
Ok(())
}
Expand Down
Loading
Loading