-
Notifications
You must be signed in to change notification settings - Fork 1.1k
[Variant] Implement read support for remaining primitive types #7644
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
512b73a
d354c95
e0918d1
2eb1d22
f926138
aa64f33
a4c48c3
8b40263
85036ad
ffa65fc
f6a6a7b
2f27c38
9a89cc7
9fb65ac
1292140
81ee289
8e0985c
b62d776
932d925
346c264
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,7 +32,6 @@ rust-version = { workspace = true } | |
|
|
||
| [dependencies] | ||
| arrow-schema = "55.1.0" | ||
| chrono = { workspace = true } | ||
|
|
||
| [lib] | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,9 +15,10 @@ | |
| // specific language governing permissions and limitations | ||
| // under the License. | ||
| use arrow_schema::ArrowError; | ||
| use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc}; | ||
| use std::array::TryFromSliceError; | ||
|
|
||
| use crate::utils::{array_from_slice, first_byte_from_slice, string_from_slice}; | ||
| use crate::utils::{array_from_slice, slice_from_slice, string_from_slice}; | ||
|
|
||
| #[derive(Debug, Clone, Copy)] | ||
| pub enum VariantBasicType { | ||
|
|
@@ -33,7 +34,18 @@ pub enum VariantPrimitiveType { | |
| BooleanTrue = 1, | ||
| BooleanFalse = 2, | ||
| Int8 = 3, | ||
| // TODO: Add types for the rest of primitives, once API is agreed upon | ||
| Int16 = 4, | ||
| Int32 = 5, | ||
| Int64 = 6, | ||
| Double = 7, | ||
| Decimal4 = 8, | ||
| Decimal8 = 9, | ||
| Decimal16 = 10, | ||
| Date = 11, | ||
| TimestampMicros = 12, | ||
| TimestampNtzMicros = 13, | ||
| Float = 14, | ||
| Binary = 15, | ||
| String = 16, | ||
| } | ||
|
|
||
|
|
@@ -64,7 +76,18 @@ impl TryFrom<u8> for VariantPrimitiveType { | |
| 1 => Ok(VariantPrimitiveType::BooleanTrue), | ||
| 2 => Ok(VariantPrimitiveType::BooleanFalse), | ||
| 3 => Ok(VariantPrimitiveType::Int8), | ||
| // TODO: Add types for the rest, once API is agreed upon | ||
| 4 => Ok(VariantPrimitiveType::Int16), | ||
| 5 => Ok(VariantPrimitiveType::Int32), | ||
| 6 => Ok(VariantPrimitiveType::Int64), | ||
| 7 => Ok(VariantPrimitiveType::Double), | ||
| 8 => Ok(VariantPrimitiveType::Decimal4), | ||
| 9 => Ok(VariantPrimitiveType::Decimal8), | ||
| 10 => Ok(VariantPrimitiveType::Decimal16), | ||
| 11 => Ok(VariantPrimitiveType::Date), | ||
| 12 => Ok(VariantPrimitiveType::TimestampMicros), | ||
| 13 => Ok(VariantPrimitiveType::TimestampNtzMicros), | ||
| 14 => Ok(VariantPrimitiveType::Float), | ||
| 15 => Ok(VariantPrimitiveType::Binary), | ||
| 16 => Ok(VariantPrimitiveType::String), | ||
| _ => Err(ArrowError::InvalidArgumentError(format!( | ||
| "unknown primitive type: {}", | ||
|
|
@@ -73,10 +96,10 @@ impl TryFrom<u8> for VariantPrimitiveType { | |
| } | ||
| } | ||
| } | ||
| /// Extract the primitive type from a Variant value-header byte | ||
| pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, ArrowError> { | ||
| /// Extract the primitive type from a Variant value-metadata byte | ||
| pub(crate) fn get_primitive_type(metadata: u8) -> Result<VariantPrimitiveType, ArrowError> { | ||
| // last 6 bits contain the primitive-type, see spec | ||
| VariantPrimitiveType::try_from(header >> 2) | ||
| VariantPrimitiveType::try_from(metadata >> 2) | ||
| } | ||
|
|
||
| /// To be used in `map_err` when unpacking an integer from a slice of bytes. | ||
|
|
@@ -85,23 +108,103 @@ fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { | |
| } | ||
|
|
||
| /// Decodes an Int8 from the value section of a variant. | ||
| pub(crate) fn decode_int8(value: &[u8]) -> Result<i8, ArrowError> { | ||
| let value = i8::from_le_bytes(array_from_slice(value, 1)?); | ||
| pub(crate) fn decode_int8(data: &[u8]) -> Result<i8, ArrowError> { | ||
| Ok(i8::from_le_bytes(array_from_slice(data, 0)?)) | ||
| } | ||
|
|
||
| /// Decodes an Int16 from the value section of a variant. | ||
| pub(crate) fn decode_int16(data: &[u8]) -> Result<i16, ArrowError> { | ||
| Ok(i16::from_le_bytes(array_from_slice(data, 0)?)) | ||
| } | ||
|
|
||
| /// Decodes an Int32 from the value section of a variant. | ||
| pub(crate) fn decode_int32(data: &[u8]) -> Result<i32, ArrowError> { | ||
| Ok(i32::from_le_bytes(array_from_slice(data, 0)?)) | ||
| } | ||
|
|
||
| /// Decodes an Int64 from the value section of a variant. | ||
| pub(crate) fn decode_int64(data: &[u8]) -> Result<i64, ArrowError> { | ||
| Ok(i64::from_le_bytes(array_from_slice(data, 0)?)) | ||
| } | ||
|
|
||
| /// Decodes a Decimal4 from the value section of a variant. | ||
| pub(crate) fn decode_decimal4(data: &[u8]) -> Result<(i32, u8), ArrowError> { | ||
| let scale = u8::from_le_bytes(array_from_slice(data, 0)?); | ||
| let integer = i32::from_le_bytes(array_from_slice(data, 1)?); | ||
| Ok((integer, scale)) | ||
| } | ||
|
|
||
| /// Decodes a Decimal8 from the value section of a variant. | ||
| pub(crate) fn decode_decimal8(data: &[u8]) -> Result<(i64, u8), ArrowError> { | ||
| let scale = u8::from_le_bytes(array_from_slice(data, 0)?); | ||
| let integer = i64::from_le_bytes(array_from_slice(data, 1)?); | ||
| Ok((integer, scale)) | ||
| } | ||
|
|
||
| /// Decodes a Decimal16 from the value section of a variant. | ||
| pub(crate) fn decode_decimal16(data: &[u8]) -> Result<(i128, u8), ArrowError> { | ||
| let scale = u8::from_le_bytes(array_from_slice(data, 0)?); | ||
| let integer = i128::from_le_bytes(array_from_slice(data, 1)?); | ||
| Ok((integer, scale)) | ||
| } | ||
|
|
||
| /// Decodes a Float from the value section of a variant. | ||
| pub(crate) fn decode_float(data: &[u8]) -> Result<f32, ArrowError> { | ||
| Ok(f32::from_le_bytes(array_from_slice(data, 0)?)) | ||
| } | ||
|
|
||
| /// Decodes a Double from the value section of a variant. | ||
| pub(crate) fn decode_double(data: &[u8]) -> Result<f64, ArrowError> { | ||
| Ok(f64::from_le_bytes(array_from_slice(data, 0)?)) | ||
| } | ||
|
|
||
| /// Decodes a Date from the value section of a variant. | ||
| pub(crate) fn decode_date(data: &[u8]) -> Result<NaiveDate, ArrowError> { | ||
| let days_since_epoch = i32::from_le_bytes(array_from_slice(data, 0)?); | ||
| let value = DateTime::UNIX_EPOCH + Duration::days(i64::from(days_since_epoch)); | ||
| Ok(value.date_naive()) | ||
| } | ||
|
|
||
| /// Decodes a TimestampMicros from the value section of a variant. | ||
| pub(crate) fn decode_timestamp_micros(data: &[u8]) -> Result<DateTime<Utc>, ArrowError> { | ||
| let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?); | ||
| DateTime::from_timestamp_micros(micros_since_epoch).ok_or_else(|| { | ||
| ArrowError::CastError(format!( | ||
| "Could not cast `{micros_since_epoch}` microseconds into a DateTime<Utc>" | ||
| )) | ||
| }) | ||
| } | ||
|
|
||
| /// Decodes a TimestampNtzMicros from the value section of a variant. | ||
| pub(crate) fn decode_timestampntz_micros(data: &[u8]) -> Result<NaiveDateTime, ArrowError> { | ||
| let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?); | ||
| DateTime::from_timestamp_micros(micros_since_epoch) | ||
| .ok_or_else(|| { | ||
| ArrowError::CastError(format!( | ||
| "Could not cast `{micros_since_epoch}` microseconds into a NaiveDateTime" | ||
| )) | ||
| }) | ||
| .map(|v| v.naive_utc()) | ||
| } | ||
|
|
||
| /// Decodes a Binary from the value section of a variant. | ||
| pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> { | ||
| let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize; | ||
| let value = slice_from_slice(data, 4..4 + len)?; | ||
| Ok(value) | ||
| } | ||
|
|
||
| /// Decodes a long string from the value section of a variant. | ||
| pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { | ||
| let len = u32::from_le_bytes(array_from_slice(value, 1)?) as usize; | ||
| let string = string_from_slice(value, 5..5 + len)?; | ||
| pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> { | ||
| let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize; | ||
| let string = string_from_slice(data, 4..4 + len)?; | ||
| Ok(string) | ||
| } | ||
|
|
||
| /// Decodes a short string from the value section of a variant. | ||
| pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> { | ||
| let len = (first_byte_from_slice(value)? >> 2) as usize; | ||
|
|
||
| let string = string_from_slice(value, 1..1 + len)?; | ||
| pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) -> Result<&str, ArrowError> { | ||
| let len = (metadata >> 2) as usize; | ||
| let string = string_from_slice(data, 0..len)?; | ||
| Ok(string) | ||
| } | ||
|
|
||
|
|
@@ -111,47 +214,152 @@ mod tests { | |
|
|
||
| #[test] | ||
| fn test_i8() -> Result<(), ArrowError> { | ||
| let value = [ | ||
| 3 << 2, // Primitive type for i8 | ||
| 42, | ||
| ]; | ||
| let result = decode_int8(&value)?; | ||
| let data = [0x2a]; | ||
| let result = decode_int8(&data)?; | ||
| assert_eq!(result, 42); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we also change this to 0x2a? |
||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_short_string() -> Result<(), ArrowError> { | ||
| let value = [ | ||
| 1 | 5 << 2, // Basic type for short string | length of short string | ||
| b'H', | ||
| b'e', | ||
| b'l', | ||
| b'l', | ||
| b'o', | ||
| b'o', | ||
| fn test_i16() -> Result<(), ArrowError> { | ||
| let data = [0xd2, 0x04]; | ||
| let result = decode_int16(&data)?; | ||
| assert_eq!(result, 1234); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 0x04d2? Writing the number as hex makes clear that the test is about getting the bytes where they belong. Otherwise, most humans will have to go to their calculator to check the hex for 1234.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it makes sense to have input be raw bytes and output be the cast rust datatype. For example with the |
||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_i32() -> Result<(), ArrowError> { | ||
| let data = [0x40, 0xe2, 0x01, 0x00]; | ||
| let result = decode_int32(&data)?; | ||
| assert_eq!(result, 123456); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_i64() -> Result<(), ArrowError> { | ||
| let data = [0x15, 0x81, 0xe9, 0x7d, 0xf4, 0x10, 0x22, 0x11]; | ||
| let result = decode_int64(&data)?; | ||
| assert_eq!(result, 1234567890123456789); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_decimal4() -> Result<(), ArrowError> { | ||
superserious-dev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| let data = [ | ||
| 0x02, // Scale | ||
| 0xd2, 0x04, 0x00, 0x00, // Integer | ||
| ]; | ||
| let result = decode_short_string(&value)?; | ||
| let result = decode_decimal4(&data)?; | ||
| assert_eq!(result, (1234, 2)); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_decimal8() -> Result<(), ArrowError> { | ||
| let data = [ | ||
| 0x02, // Scale | ||
| 0xd2, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, // Integer | ||
| ]; | ||
| let result = decode_decimal8(&data)?; | ||
| assert_eq!(result, (1234567890, 2)); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_decimal16() -> Result<(), ArrowError> { | ||
| let data = [ | ||
| 0x02, // Scale | ||
| 0xd2, 0xb6, 0x23, 0xc0, 0xf4, 0x10, 0x22, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
| 0x00, 0x00, // Integer | ||
| ]; | ||
| let result = decode_decimal16(&data)?; | ||
| assert_eq!(result, (1234567891234567890, 2)); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_float() -> Result<(), ArrowError> { | ||
| let data = [0x06, 0x2c, 0x93, 0x4e]; | ||
| let result = decode_float(&data)?; | ||
| assert_eq!(result, 1234567890.1234); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_double() -> Result<(), ArrowError> { | ||
| let data = [0xc9, 0xe5, 0x87, 0xb4, 0x80, 0x65, 0xd2, 0x41]; | ||
| let result = decode_double(&data)?; | ||
| assert_eq!(result, 1234567890.1234); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_date() -> Result<(), ArrowError> { | ||
| let data = [0xe2, 0x4e, 0x0, 0x0]; | ||
| let result = decode_date(&data)?; | ||
| assert_eq!(result, NaiveDate::from_ymd_opt(2025, 4, 16).unwrap()); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_timestamp_micros() -> Result<(), ArrowError> { | ||
| let data = [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00]; | ||
| let result = decode_timestamp_micros(&data)?; | ||
| assert_eq!( | ||
| result, | ||
| NaiveDate::from_ymd_opt(2025, 4, 16) | ||
| .unwrap() | ||
| .and_hms_milli_opt(16, 34, 56, 780) | ||
| .unwrap() | ||
| .and_utc() | ||
| ); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_timestampntz_micros() -> Result<(), ArrowError> { | ||
| let data = [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00]; | ||
| let result = decode_timestampntz_micros(&data)?; | ||
| assert_eq!( | ||
| result, | ||
| NaiveDate::from_ymd_opt(2025, 4, 16) | ||
| .unwrap() | ||
| .and_hms_milli_opt(16, 34, 56, 780) | ||
| .unwrap() | ||
| ); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_binary() -> Result<(), ArrowError> { | ||
| let data = [ | ||
| 0x09, 0, 0, 0, // Length of binary data, 4-byte little-endian | ||
| 0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, | ||
| ]; | ||
| let result = decode_binary(&data)?; | ||
| assert_eq!( | ||
| result, | ||
| [0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe] | ||
| ); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_short_string() -> Result<(), ArrowError> { | ||
| let data = [b'H', b'e', b'l', b'l', b'o', b'o']; | ||
| let result = decode_short_string(1 | 5 << 2, &data)?; | ||
| assert_eq!(result, "Hello"); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_string() -> Result<(), ArrowError> { | ||
| let value = [ | ||
| 16 << 2, // Basic type for short string | length of short string | ||
| 5, | ||
| 0, | ||
| 0, | ||
| 0, // Length of string | ||
| b'H', | ||
| b'e', | ||
| b'l', | ||
| b'l', | ||
| b'o', | ||
| b'o', | ||
| let data = [ | ||
| 0x05, 0, 0, 0, // Length of string, 4-byte little-endian | ||
| b'H', b'e', b'l', b'l', b'o', b'o', | ||
| ]; | ||
| let result = decode_long_string(&value)?; | ||
| let result = decode_long_string(&data)?; | ||
| assert_eq!(result, "Hello"); | ||
| Ok(()) | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.