Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Out of spec, message: combined patch width and value width cannot exceed the size of the integer type being decoded #97

Open
progval opened this issue Jun 20, 2024 · 0 comments

Comments

@progval
Copy link
Contributor

progval commented Jun 20, 2024

this check:

if (patch_bit_width + value_bit_width) > (N::BYTE_SIZE * 8) {
return OutOfSpecSnafu {
msg: "combined patch width and value width cannot exceed the size of the integer type being decoded",
}
.fail();
}

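can be triggered while reading files created with pyorc (in the case below, the two widths add up to 40 + 30 = 70 bits, which exceeds the 64 bits of the integer type being decoded). To reproduce: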

wget https://softwareheritage.s3.amazonaws.com/graph/2024-05-16/orc/revision/revision-0c45576a-59f7-48d1-a9a8-2e5c64098905.orc

(sorry, it's 4GB. I don't have a smaller example on hand)

then check out 28e911b (a commit from #96, because that is the only way to avoid hitting an overflow crash before reaching this bug), and apply this patch:

diff --git a/Cargo.toml b/Cargo.toml
index 2af67e1..ecec249 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -69,6 +70,10 @@ required-features = ["datafusion"]
 # Some issue when publishing and path isn't specified, so adding here
 path = "./examples/datafusion_integration.rs"
 
+[[example]]
+name = "repro"
+required-features = ["cli"]
+
 [[bin]]
 name = "orc-metadata"
 required-features = ["cli"]
diff --git a/src/reader/decode/rle_v2/patched_base.rs b/src/reader/decode/rle_v2/patched_base.rs
index c33815b..c149ea4 100644
--- a/src/reader/decode/rle_v2/patched_base.rs
+++ b/src/reader/decode/rle_v2/patched_base.rs
@@ -53,6 +53,7 @@ impl<N: NInt, R: Read> RleReaderV2<N, R> {
             .fail();
         }
         if (patch_bit_width + value_bit_width) > (N::BYTE_SIZE * 8) {
+            eprintln!("patch_bit_width= {} value_bit_width= {} N::BYTE_SIZE= {}", patch_bit_width, value_bit_width, N::BYTE_SIZE);
             return OutOfSpecSnafu {
                 msg: "combined patch width and value width cannot exceed the size of the integer type being decoded",
             }

then run this code with ./revision-0c45576a-59f7-48d1-a9a8-2e5c64098905.orc as parameter:

use std::fs::File;
use std::path::PathBuf;
use std::sync::Arc;

use anyhow::{Context, Result};
use arrow::datatypes::{DataType, Decimal128Type, DecimalType, Schema};
use orc_rust::arrow_reader::ArrowReaderBuilder;
use orc_rust::projection::ProjectionMask;
//use rayon::prelude::*;

/// Builds a new schema from `schema` in which every field whose data type is
/// `DataType::Timestamp` is re-typed to
/// `DataType::Decimal128(Decimal128Type::MAX_SCALE as _, 9)`.
///
/// All other fields are cloned unchanged, and field order is preserved.
/// The result is wrapped in an `Arc`.
fn transform_schema(schema: &Schema) -> Arc<Schema> {
    let retyped_fields = schema
        .fields()
        .iter()
        .map(|field| {
            // Each item is a shared reference to a field; take an owned copy
            // so it can be consumed by `with_data_type`.
            let owned = (**field).clone();
            if matches!(field.data_type(), DataType::Timestamp(_, _)) {
                // Alternative considered by the author of the repro: keep a
                // timestamp type with microsecond resolution instead.
                owned.with_data_type(DataType::Decimal128(Decimal128Type::MAX_SCALE as _, 9))
            } else {
                owned
            }
        })
        .collect::<Vec<_>>();

    Arc::new(Schema::new(retyped_fields))
}

/// Reproduction driver: for each path given on the command line, opens the
/// file as ORC, projects the `date` root column, swaps in the schema produced
/// by `transform_schema`, and decodes every batch (1024 rows per batch).
///
/// Decoded batches are discarded; the only purpose is to trigger decoding.
///
/// # Errors
///
/// Stops at the first failure: opening the file, building the reader, or
/// decoding a chunk. Chunk failures carry context naming the chunk index and
/// the file path.
pub fn main() -> Result<()> {
    // Collect the arguments up front (rather than iterating lazily) so that
    // argument handling completes before any file is touched.
    let args: Vec<String> = std::env::args().skip(1).collect();

    for arg in args {
        let file_path = PathBuf::from(arg);
        println!("reading {}", file_path.display());

        let file = File::open(&file_path)?;
        let builder = ArrowReaderBuilder::try_new(file)?;

        // Restrict decoding to the single root column named "date".
        let date_only = ProjectionMask::named_roots(
            builder.file_metadata().root_data_type(),
            ["date"].as_slice(),
        );
        let builder = builder.with_projection(date_only).with_batch_size(1024);

        // The schema is derived from the already-projected builder.
        let target_schema = transform_schema(&builder.schema());
        let reader = builder.with_schema(target_schema).build();

        for (i, chunk) in reader.enumerate() {
            // The batch itself is not inspected; only decode errors matter.
            let _chunk = chunk.with_context(|| {
                format!("Could not read chunk {} of {}", i, file_path.display())
            })?;
        }
    }

    Ok(())
}

which prints:

reading /srv/softwareheritage/ssd/data/vlorentz/datasets/2024-05-16/orc/revision/revision-0c45576a-59f7-48d1-a9a8-2e5c64098905.orc
patch_bit_width= 40 value_bit_width= 30 N::BYTE_SIZE= 8
Error: Could not read chunk 29525 of /srv/softwareheritage/ssd/data/vlorentz/datasets/2024-05-16/orc/revision/revision-0c45576a-59f7-48d1-a9a8-2e5c64098905.orc

Caused by:
    0: External error: Out of spec, message: combined patch width and value width cannot exceed the size of the integer type being decoded
    1: Out of spec, message: combined patch width and value width cannot exceed the size of the integer type being decoded

Stack backtrace:
   0: anyhow::context::<impl anyhow::Context<T,E> for core::result::Result<T,E>>::with_context
   1: repro::main
   2: std::sys_common::backtrace::__rust_begin_short_backtrace
   3: std::rt::lang_start::{{closure}}
   4: core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ops/function.rs:284:13
   5: std::panicking::try::do_call
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:552:40
   6: std::panicking::try
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:516:19
   7: std::panic::catch_unwind
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panic.rs:142:14
   8: std::rt::lang_start_internal::{{closure}}
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/rt.rs:148:48
   9: std::panicking::try::do_call
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:552:40
  10: std::panicking::try
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:516:19
  11: std::panic::catch_unwind
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panic.rs:142:14
  12: std::rt::lang_start_internal
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/rt.rs:148:20
  13: main
  14: __libc_start_main
             at ./csu/../csu/libc-start.c:308:16
  15: _start
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
1 participant