Skip to content

Commit

Permalink
fixed an issue with dimension separators (#19)
Browse files Browse the repository at this point in the history
* fixed an issue with dimension separators

* linter fix
  • Loading branch information
maximedion2 committed May 9, 2024
1 parent 96bce2a commit f852355
Show file tree
Hide file tree
Showing 6 changed files with 168 additions and 46 deletions.
2 changes: 1 addition & 1 deletion src/async_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ where
pos,
&cols,
self.meta.get_real_dims(pos),
self.meta.get_separators(),
self.meta.get_chunk_patterns(),
)
.await;

Expand Down
73 changes: 56 additions & 17 deletions src/async_reader/zarr_read_async.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use std::collections::HashMap;
use std::fs::{read, read_to_string};
use std::sync::Arc;

use crate::reader::metadata::ChunkSeparator;
use crate::reader::metadata::{ChunkPattern, ChunkSeparator};
use crate::reader::{ZarrError, ZarrResult};
use crate::reader::{ZarrInMemoryChunk, ZarrStoreMetadata};

Expand All @@ -40,7 +40,7 @@ pub trait ZarrReadAsync<'a> {
position: &'a [usize],
cols: &'a [String],
real_dims: Vec<usize>,
separators: HashMap<String, ChunkSeparator>,
patterns: HashMap<String, ChunkPattern>,
) -> ZarrResult<ZarrInMemoryChunk>;
}

Expand Down Expand Up @@ -99,26 +99,49 @@ impl<'a> ZarrReadAsync<'a> for ZarrPath {
position: &'a [usize],
cols: &'a [String],
real_dims: Vec<usize>,
separators: HashMap<String, ChunkSeparator>,
patterns: HashMap<String, ChunkPattern>,
) -> ZarrResult<ZarrInMemoryChunk> {
let mut chunk = ZarrInMemoryChunk::new(real_dims);
for var in cols {
let s: Vec<String> = position.iter().map(|i| i.to_string()).collect();
let separator = separators
let pattern = patterns
.get(var.as_str())
.ok_or(ZarrError::InvalidMetadata(
"Could not find separator for column".to_string(),
))?;

let p = match separator {
ChunkSeparator::Period => self.location.child(var.to_string()).child(s.join(".")),
ChunkSeparator::Slash => {
let mut partial_path = self.location.child(var.to_string()).child("c");
for idx in s {
partial_path = partial_path.child(idx);
let p = match pattern {
ChunkPattern {
separator: sep,
c_prefix: false,
} => match sep {
ChunkSeparator::Period => {
self.location.child(var.to_string()).child(s.join("."))
}
partial_path
}
ChunkSeparator::Slash => {
let mut path = self.location.child(var.to_string());
for idx in s {
path = path.child(idx);
}
path
}
},
ChunkPattern {
separator: sep,
c_prefix: true,
} => match sep {
ChunkSeparator::Period => self
.location
.child(var.to_string())
.child("c.".to_string() + &s.join(".")),
ChunkSeparator::Slash => {
let mut path = self.location.child(var.to_string()).child("c");
for idx in s {
path = path.child(idx);
}
path
}
},
};
let get_res = self.store.get(&p).await?;
let data = match get_res.payload {
Expand Down Expand Up @@ -165,7 +188,10 @@ mod zarr_read_async_tests {
&ZarrArrayMetadata::new(
2,
ZarrDataType::UInt(1),
ChunkSeparator::Period,
ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false
},
None,
vec![ZarrCodec::Bytes(Endianness::Little)],
)
Expand All @@ -175,7 +201,10 @@ mod zarr_read_async_tests {
&ZarrArrayMetadata::new(
2,
ZarrDataType::Float(8),
ChunkSeparator::Period,
ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false
},
None,
vec![ZarrCodec::Bytes(Endianness::Little)],
)
Expand All @@ -197,7 +226,7 @@ mod zarr_read_async_tests {
&pos,
meta.get_columns(),
meta.get_real_dims(&pos),
meta.get_separators(),
meta.get_chunk_patterns(),
)
.await
.unwrap();
Expand All @@ -214,7 +243,12 @@ mod zarr_read_async_tests {
let col_proj = ZarrProjection::skip(vec!["float_data".to_string()]);
let cols = col_proj.apply_selection(meta.get_columns()).unwrap();
let chunk = store
.get_zarr_chunk(&pos, &cols, meta.get_real_dims(&pos), meta.get_separators())
.get_zarr_chunk(
&pos,
&cols,
meta.get_real_dims(&pos),
meta.get_chunk_patterns(),
)
.await
.unwrap();
assert_eq!(
Expand All @@ -226,7 +260,12 @@ mod zarr_read_async_tests {
let col_proj = ZarrProjection::keep(vec!["float_data".to_string()]);
let cols = col_proj.apply_selection(meta.get_columns()).unwrap();
let chunk = store
.get_zarr_chunk(&pos, &cols, meta.get_real_dims(&pos), meta.get_separators())
.get_zarr_chunk(
&pos,
&cols,
meta.get_real_dims(&pos),
meta.get_chunk_patterns(),
)
.await
.unwrap();
assert_eq!(
Expand Down
2 changes: 1 addition & 1 deletion src/datafusion/table_factory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ use datafusion::{

use super::table_provider::{ListingZarrTableConfig, ListingZarrTableOptions, ZarrTableProvider};

struct ZarrListingTableFactory {}
pub struct ZarrListingTableFactory {}

#[async_trait]
impl TableProviderFactory for ZarrListingTableFactory {
Expand Down
83 changes: 69 additions & 14 deletions src/reader/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,20 @@ impl FromStr for ChunkSeparator {
}
}

// Struct for the chunk file pattern
#[derive(Debug, PartialEq, Clone)]
pub struct ChunkPattern {
pub(crate) separator: ChunkSeparator,
pub(crate) c_prefix: bool,
}

/// The metadata for a single zarr array, which holds various parameters
/// for the data stored in the array.
#[derive(Debug, PartialEq, Clone)]
pub struct ZarrArrayMetadata {
zarr_format: u8,
data_type: ZarrDataType,
chunk_separator: ChunkSeparator,
chunk_pattern: ChunkPattern,
sharding_options: Option<ShardingOptions>,
codecs: Vec<ZarrCodec>,
}
Expand All @@ -165,14 +172,14 @@ impl ZarrArrayMetadata {
pub(crate) fn new(
zarr_format: u8,
data_type: ZarrDataType,
chunk_separator: ChunkSeparator,
chunk_pattern: ChunkPattern,
sharding_options: Option<ShardingOptions>,
codecs: Vec<ZarrCodec>,
) -> Self {
Self {
zarr_format,
data_type,
chunk_separator,
chunk_pattern,
sharding_options,
codecs,
}
Expand All @@ -186,8 +193,8 @@ impl ZarrArrayMetadata {
self.sharding_options.clone()
}

pub(crate) fn get_separator(&self) -> ChunkSeparator {
self.chunk_separator.clone()
pub(crate) fn get_chunk_pattern(&self) -> ChunkPattern {
self.chunk_pattern.clone()
}
}

Expand Down Expand Up @@ -248,6 +255,22 @@ fn extract_string_from_json(map: &Value, key: &str, err_str: &str) -> ZarrResult
Ok(res.to_string())
}

fn extract_optional_string_from_json(
map: &Value,
key: &str,
err_str: &str,
) -> ZarrResult<Option<String>> {
let res = map.get(key);
if let Some(val) = res {
let val = val
.as_str()
.ok_or(ZarrError::InvalidMetadata(err_str.to_string()))?;
return Ok(Some(val.to_string()));
}

Ok(None)
}

fn extract_u64_from_json(map: &Value, key: &str, err_str: &str) -> ZarrResult<u64> {
let res = map
.get(key)
Expand Down Expand Up @@ -361,6 +384,12 @@ impl ZarrStoreMetadata {
let dtype = extract_string_from_json(&meta_map, "dtype", error_string)?;
let data_type = extract_type_v2(&dtype)?;

// parse dimenstion separator
let error_string = "error parsing metadata dimension separator";
let maybe_sep =
extract_optional_string_from_json(&meta_map, "dimension_separator", error_string)?;
let dim_separator = maybe_sep.unwrap_or(".".to_string());

// parse endianness
let endianness = match dtype.chars().next().unwrap() {
'<' | '|' => Endianness::Little,
Expand Down Expand Up @@ -407,7 +436,10 @@ impl ZarrStoreMetadata {
let array_meta = ZarrArrayMetadata {
zarr_format: 2,
data_type,
chunk_separator: ChunkSeparator::Period,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::from_str(&dim_separator)?,
c_prefix: false,
},
sharding_options: None,
codecs,
};
Expand Down Expand Up @@ -623,6 +655,11 @@ impl ZarrStoreMetadata {
let (_, config) = extract_config(chunk_key_encoding)?;
let chunk_key_encoding = extract_string_from_json(config, "separator", error_string)?;
let chunk_key_encoding = ChunkSeparator::from_str(&chunk_key_encoding)?;
let c_prefix = chunk_key_encoding == ChunkSeparator::Slash;
let chunk_key_encoding = ChunkPattern {
separator: chunk_key_encoding,
c_prefix,
};

// codecs
let codec_configs = meta_map
Expand Down Expand Up @@ -766,12 +803,12 @@ impl ZarrStoreMetadata {
self.chunks.as_ref().unwrap()
}

pub(crate) fn get_separators(&self) -> HashMap<String, ChunkSeparator> {
pub(crate) fn get_chunk_patterns(&self) -> HashMap<String, ChunkPattern> {
let mut m = HashMap::new();
for col in &self.columns {
m.insert(
col.to_string(),
self.get_array_meta(col).unwrap().get_separator(),
self.get_array_meta(col).unwrap().get_chunk_pattern(),
);
}

Expand Down Expand Up @@ -857,7 +894,10 @@ mod zarr_metadata_v3_tests {
ZarrArrayMetadata {
zarr_format: 2,
data_type: ZarrDataType::Int(4),
chunk_separator: ChunkSeparator::Period,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false,
},
sharding_options: None,
codecs: vec![
ZarrCodec::Bytes(Endianness::Little),
Expand All @@ -876,7 +916,10 @@ mod zarr_metadata_v3_tests {
ZarrArrayMetadata {
zarr_format: 2,
data_type: ZarrDataType::TimeStamp(8, "ms".to_string()),
chunk_separator: ChunkSeparator::Period,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false,
},
sharding_options: None,
codecs: vec![
ZarrCodec::Bytes(Endianness::Little),
Expand All @@ -895,7 +938,10 @@ mod zarr_metadata_v3_tests {
ZarrArrayMetadata {
zarr_format: 2,
data_type: ZarrDataType::Bool,
chunk_separator: ChunkSeparator::Period,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false,
},
sharding_options: None,
codecs: vec![
ZarrCodec::Transpose(vec![1, 0]),
Expand All @@ -909,7 +955,10 @@ mod zarr_metadata_v3_tests {
ZarrArrayMetadata {
zarr_format: 2,
data_type: ZarrDataType::FixedLengthString(112),
chunk_separator: ChunkSeparator::Period,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false,
},
sharding_options: None,
codecs: vec![
ZarrCodec::Bytes(Endianness::Big),
Expand Down Expand Up @@ -1077,7 +1126,10 @@ mod zarr_metadata_v3_tests {
ZarrArrayMetadata {
zarr_format: 3,
data_type: ZarrDataType::Int(4),
chunk_separator: ChunkSeparator::Slash,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::Slash,
c_prefix: true,
},
sharding_options: None,
codecs: vec![
ZarrCodec::Bytes(Endianness::Little),
Expand Down Expand Up @@ -1143,7 +1195,10 @@ mod zarr_metadata_v3_tests {
ZarrArrayMetadata {
zarr_format: 3,
data_type: ZarrDataType::Int(4),
chunk_separator: ChunkSeparator::Period,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false,
},
sharding_options: Some(ShardingOptions::new(
vec![4, 4],
vec![2, 2],
Expand Down
2 changes: 1 addition & 1 deletion src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ impl<T: ZarrRead> ZarrIterator for ZarrStore<T> {
pos,
&cols,
self.meta.get_real_dims(pos),
self.meta.get_separators(),
self.meta.get_chunk_patterns(),
);
self.curr_chunk += 1;
Some(chnk)
Expand Down
Loading

0 comments on commit f852355

Please sign in to comment.