Skip to content

Commit

Permalink
Remove Chrono dependency (#38)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jefffrey committed Nov 12, 2023
1 parent fd1bc01 commit f19cc7b
Show file tree
Hide file tree
Showing 8 changed files with 42 additions and 110 deletions.
3 changes: 0 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ rust-version = "1.70"
[dependencies]
arrow = { version = "47.0", features = ["prettyprint"] }
bytes = "1.4"
chrono = "0.4"
fallible-streaming-iterator = { version = "0.1" }
flate2 = "1"
futures = { version = "0.3", default-features = false, features = ["std"] }
Expand Down
37 changes: 17 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# datafusion-orc
Implementation of ORC file format
Implementation of ORC file format read/write with Arrow in-memory format

[![test](https://github.com/datafusion-contrib/datafusion-orc/actions/workflows/ci.yml/badge.svg)](https://github.com/datafusion-contrib/datafusion-orc/actions/workflows/ci.yml)
[![codecov](https://codecov.io/gh/WenyXu/orc-rs/branch/main/graph/badge.svg?token=2CSHZX02XM)](https://codecov.io/gh/WenyXu/orc-rs)
Expand All @@ -16,22 +16,22 @@ Read [Apache ORC](https://orc.apache.org/) in Rust.

## Current Support

| Column Encoding           | Read | Write | Rust Type             | Arrow DataType          |
| ------------------------- | ---- | ----- | --------------------- | ----------------------- |
| SmallInt, Int, BigInt     | ✓    |       | i16, i32, i64         | Int16, Int32, Int64     |
| Float, Double             | ✓    |       | f32, f64              | Float32, Float64        |
| String, Char, and VarChar | ✓    |       | string                | Utf8                    |
| Boolean                   | ✓    |       | bool                  | Boolean                 |
| TinyInt                   | ✓    |       | i8                    | Int8                    |
| Binary                    | ✓    |       | Vec\<u8\>             | Binary                  |
| Decimal                   | ✗    |       |                       |                         |
| Date                      | ✓    |       | chrono::NaiveDate     | Date32                  |
| Timestamp                 | ✓    |       | chrono::NaiveDateTime | Timestamp(Nanosecond,_) |
| Timestamp instant         | ✗    |       |                       |                         |
| Struct                    | ✓    |       |                       | Struct                  |
| List                      | ✓    |       |                       | List                    |
| Map                       | ✓    |       |                       | Map                     |
| Union                     | ✗    |       |                       |                         |
| Column Encoding           | Read | Write | Arrow DataType          |
| ------------------------- | ---- | ----- | ----------------------- |
| SmallInt, Int, BigInt     | ✓    |       | Int16, Int32, Int64     |
| Float, Double             | ✓    |       | Float32, Float64        |
| String, Char, and VarChar | ✓    |       | Utf8                    |
| Boolean                   | ✓    |       | Boolean                 |
| TinyInt                   | ✓    |       | Int8                    |
| Binary                    | ✓    |       | Binary                  |
| Decimal                   | ✗    |       |                         |
| Date                      | ✓    |       | Date32                  |
| Timestamp                 | ✓    |       | Timestamp(Nanosecond,_) |
| Timestamp instant         | ✗    |       |                         |
| Struct                    | ✓    |       | Struct                  |
| List                      | ✓    |       | List                    |
| Map                       | ✓    |       | Map                     |
| Union                     | ✗    |       |                         |


## Compression Support
Expand All @@ -45,6 +45,3 @@ Read [Apache ORC](https://orc.apache.org/) in Rust.
| LZ4 |||
| ZSTD |||




17 changes: 7 additions & 10 deletions src/arrow_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ use arrow::datatypes::{Field, TimeUnit};
use arrow::error::ArrowError;
use arrow::record_batch::{RecordBatch, RecordBatchReader};
use bytes::Bytes;
use chrono::{Datelike, NaiveDate, NaiveDateTime};
use snafu::{OptionExt, ResultExt};

use self::column::list::{new_list_iter, ListDecoder};
Expand All @@ -30,7 +29,6 @@ use self::column::tinyint::new_i8_iter;
use self::column::Column;
use crate::arrow_reader::column::binary::new_binary_iterator;
use crate::arrow_reader::column::boolean::new_boolean_iter;
use crate::arrow_reader::column::date::{new_date_iter, UNIX_EPOCH_FROM_CE};
use crate::arrow_reader::column::float::{new_f32_iter, new_f64_iter};
use crate::arrow_reader::column::int::new_i64_iter;
use crate::arrow_reader::column::string::StringDecoder;
Expand Down Expand Up @@ -152,8 +150,8 @@ pub enum Decoder {
Boolean(NullableIterator<bool>),
Float32(NullableIterator<f32>),
Float64(NullableIterator<f64>),
Timestamp(NullableIterator<NaiveDateTime>),
Date(NullableIterator<NaiveDate>),
Timestamp(NullableIterator<i64>),
Date(NullableIterator<i64>),
String(StringDecoder),
Binary(NullableIterator<Vec<u8>>),
Struct(StructDecoder),
Expand Down Expand Up @@ -476,8 +474,7 @@ impl Decoder {
.downcast_mut::<TimestampNanosecondBuilder>()
.unwrap();
for value in values {
builder
.append_option(value.map(|value| value.timestamp_nanos_opt().unwrap()));
builder.append_option(value);
}
}
}
Expand All @@ -490,10 +487,10 @@ impl Decoder {
.downcast_mut::<Date32Builder>()
.unwrap();

// Dates are just signed integers indicating no. of days since epoch
// Same as for Arrow, so no conversion needed
for value in values {
builder.append_option(
value.map(|value| value.num_days_from_ce() - UNIX_EPOCH_FROM_CE),
);
builder.append_option(value.map(|v| v as i32));
}
}
}
Expand Down Expand Up @@ -752,7 +749,7 @@ pub fn reader_factory(col: &Column, stripe: &Stripe) -> Result<Decoder> {
crate::proto::r#type::Kind::Struct => Decoder::Struct(new_struct_iter(col, stripe)?),
crate::proto::r#type::Kind::Union => todo!(),
crate::proto::r#type::Kind::Decimal => todo!(),
crate::proto::r#type::Kind::Date => Decoder::Date(new_date_iter(col, stripe)?),
crate::proto::r#type::Kind::Date => Decoder::Date(new_i64_iter(col, stripe)?),
crate::proto::r#type::Kind::Varchar => Decoder::String(StringDecoder::new(col, stripe)?),
crate::proto::r#type::Kind::Char => Decoder::String(StringDecoder::new(col, stripe)?),
crate::proto::r#type::Kind::TimestampInstant => todo!(),
Expand Down
1 change: 0 additions & 1 deletion src/arrow_reader/column.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ use crate::reader::Reader;

pub mod binary;
pub mod boolean;
pub mod date;
pub mod float;
pub mod int;
pub mod list;
Expand Down
51 changes: 0 additions & 51 deletions src/arrow_reader/column/date.rs

This file was deleted.

36 changes: 18 additions & 18 deletions src/arrow_reader/column/timestamp.rs
Original file line number Diff line number Diff line change
@@ -1,37 +1,40 @@
use chrono::NaiveDateTime;
use snafu::OptionExt;

use crate::arrow_reader::column::present::new_present_iter;
use crate::arrow_reader::column::{Column, NullableIterator};
use crate::arrow_reader::Stripe;
use crate::error::{InvalidTimestampSnafu, Result};
use crate::error::Result;
use crate::proto::stream::Kind;
use crate::reader::decode::{get_direct_signed_rle_reader, get_direct_unsigned_rle_reader};

// TIMESTAMP_BASE is 1 January 2015, the base value for all timestamp values.
const TIMESTAMP_BASE: i64 = 1420070400;
// This records the number of seconds since 1 January 1970 (epoch) for the base,
// since Arrow uses the epoch as the base instead.
const TIMESTAMP_BASE_SECONDS_SINCE_EPOCH: i64 = 1_420_070_400;
const NANOSECONDS_IN_SECOND: i64 = 1_000_000_000;

pub struct TimestampIterator {
data: Box<dyn Iterator<Item = Result<i64>> + Send>,
secondary: Box<dyn Iterator<Item = Result<u64>> + Send>,
}

impl TimestampIterator {
fn iter_next(&mut self) -> Result<Option<NaiveDateTime>> {
fn iter_next(&mut self) -> Result<Option<i64>> {
let next = match (self.data.next(), self.secondary.next()) {
(Some(data), Some(nanos)) => {
let data = data?;
(Some(seconds_since_orc_base), Some(nanos)) => {
let data = seconds_since_orc_base?;
let mut nanos = nanos?;
// last 3 bits indicate how many trailing zeros were truncated
let zeros = nanos & 0x7;
nanos >>= 3;
// multiply by powers of 10 to get back the trailing zeros
if zeros != 0 {
nanos *= 10_u64.pow(zeros as u32 + 1);
}
let timestamp =
NaiveDateTime::from_timestamp_opt(data + TIMESTAMP_BASE, nanos as u32)
.context(InvalidTimestampSnafu)?;

Some(timestamp)
// convert into nanoseconds since epoch, which Arrow uses as native representation
// of timestamps
let nanoseconds_since_epoch = (data + TIMESTAMP_BASE_SECONDS_SINCE_EPOCH)
* NANOSECONDS_IN_SECOND
+ (nanos as i64);
Some(nanoseconds_since_epoch)
}
// TODO: throw error for mismatched stream lengths?
_ => None,
Expand All @@ -41,17 +44,14 @@ impl TimestampIterator {
}

impl Iterator for TimestampIterator {
type Item = Result<NaiveDateTime>;
type Item = Result<i64>;

fn next(&mut self) -> Option<Self::Item> {
self.iter_next().transpose()
}
}

pub fn new_timestamp_iter(
column: &Column,
stripe: &Stripe,
) -> Result<NullableIterator<NaiveDateTime>> {
pub fn new_timestamp_iter(column: &Column, stripe: &Stripe) -> Result<NullableIterator<i64>> {
let present = new_present_iter(column, stripe)?.collect::<Result<Vec<_>>>()?;

let reader = stripe.stream_map.get(column, Kind::Data)?;
Expand Down
6 changes: 0 additions & 6 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,6 @@ pub enum Error {
encoding: proto::column_encoding::Kind,
},

#[snafu(display("Failed to convert to timestamp"))]
InvalidTimestamp { location: Location },

#[snafu(display("Failed to convert to date"))]
InvalidDate { location: Location },

#[snafu(display("Failed to add day to a date"))]
AddDays { location: Location },

Expand Down

0 comments on commit f19cc7b

Please sign in to comment.