# orcxx-rs

Rust wrapper for the official C++ library for Apache ORC.

It uses a submodule pointing to an Apache ORC release, builds its C++ part (including vendored protobuf, lz4, zstd, ...), and links against that.

The `orcxx_derive` crate provides a custom `derive` macro.

## `orcxx_derive` examples

### `RowIterator` API

```rust
extern crate orcxx;
extern crate orcxx_derive;

use std::num::NonZeroU64;

use orcxx::deserialize::{OrcDeserialize, OrcStruct};
use orcxx::row_iterator::RowIterator;
use orcxx::reader;
use orcxx_derive::OrcDeserialize;

// Define structure
#[derive(OrcDeserialize, Clone, Default, Debug, PartialEq, Eq)]
struct Test1 {
    long1: Option<i64>,
}

// Open file
let orc_path = "../orcxx/orc/examples/TestOrcFile.test1.orc";
let input_stream = reader::InputStream::from_local_file(orc_path).expect("Could not open .orc");
let reader = reader::Reader::new(input_stream).expect("Could not read .orc");

// Number of rows requested per batch
let batch_size = NonZeroU64::new(1024).unwrap();

// Collect every row of the file into a vector of structures
let mut rows: Vec<Option<Test1>> = RowIterator::new(&reader, batch_size)
    .expect("Could not open ORC file")
    .expect("Unexpected schema")
    .collect();

assert_eq!(
    rows,
    vec![
        Some(Test1 {
            long1: Some(9223372036854775807)
        }),
        Some(Test1 {
            long1: Some(9223372036854775807)
        })
    ]
);
```

### Loop API

`RowIterator` clones structures before yielding them. This can be avoided by looping and writing directly to a buffer:

```rust
extern crate orcxx;
extern crate orcxx_derive;

use orcxx::deserialize::{CheckableKind, OrcDeserialize, OrcStruct};
use orcxx::reader;
use orcxx_derive::OrcDeserialize;

// Define structure
#[derive(OrcDeserialize, Default, Debug, PartialEq, Eq)]
struct Test1 {
    long1: Option<i64>,
}

// Open file
let orc_path = "../orcxx/orc/examples/TestOrcFile.test1.orc";
let input_stream = reader::InputStream::from_local_file(orc_path).expect("Could not open .orc");
let reader = reader::Reader::new(input_stream).expect("Could not read .orc");

// Only read columns we need
let options = reader::RowReaderOptions::default().include_names(Test1::columns());

let mut row_reader = reader.row_reader(&options).expect("Could not open ORC file");

// Fail early if the selected columns do not match the structure
Test1::check_kind(&row_reader.selected_kind()).expect("Unexpected schema");

let mut rows: Vec<Option<Test1>> = Vec::new();

// Allocate work buffer
let mut batch = row_reader.row_batch(1024);

// Read structs until the end
while row_reader.read_into(&mut batch) {
    let new_rows = Option::<Test1>::from_vector_batch(&batch.borrow()).unwrap();
    rows.extend(new_rows);
}

assert_eq!(
    rows,
    vec![
        Some(Test1 {
            long1: Some(9223372036854775807)
        }),
        Some(Test1 {
            long1: Some(9223372036854775807)
        })
    ]
);
```

### Nested structures

The above two examples also work with nested structures:

```rust
extern crate orcxx;
extern crate orcxx_derive;

use orcxx_derive::OrcDeserialize;

// Top-level structure; `list` holds nested `Test1ItemOption` structures
#[derive(OrcDeserialize, Default, Debug, PartialEq)]
struct Test1Option {
    boolean1: Option<bool>,
    byte1: Option<i8>,
    short1: Option<i16>,
    int1: Option<i32>,
    long1: Option<i64>,
    float1: Option<f32>,
    double1: Option<f64>,
    bytes1: Option<Vec<u8>>,
    string1: Option<String>,
    list: Option<Vec<Option<Test1ItemOption>>>,
}

// Structure of each item of `Test1Option::list`
#[derive(OrcDeserialize, Default, Debug, PartialEq)]
struct Test1ItemOption {
    int1: Option<i32>,
    string1: Option<String>,
}
```

## `orcxx` examples

### `ColumnTree` API

Columns can also be read directly, without writing their values to structures. This is particularly useful for reading files whose schema is not known at compile time.

### Low-level API

This reads batches directly from the C++ library, and leaves it to the Rust code to dynamically cast base vectors to more specific types (here, string vectors).

```rust
extern crate orcxx;
extern crate orcxx_derive;

use orcxx::reader;
use orcxx::vector::ColumnVectorBatch;

let input_stream = reader::InputStream::from_local_file("../orcxx/orc/examples/TestOrcFile.test1.orc")
    .expect("Could not open");

let reader = reader::Reader::new(input_stream).expect("Could not read");

println!("{:#?}", reader.kind()); // Prints the type of columns in the file

let mut row_reader = reader.row_reader(&reader::RowReaderOptions::default()).unwrap();
let mut batch = row_reader.row_batch(1024);

let mut total_elements = 0;
let mut all_strings: Vec<String> = Vec::new();
while row_reader.read_into(&mut batch) {
    total_elements += (&batch).num_elements();

    // The top-level vector of the batch is cast to a vector of structures
    let struct_vector = batch.borrow().try_into_structs().unwrap();
    let vectors = struct_vector.fields();

    for vector in vectors {
        match vector.try_into_strings() {
            Ok(string_vector) => {
                for s in string_vector.iter() {
                    // Missing values are recorded as "<null>"
                    all_strings.push(
                        std::str::from_utf8(s.unwrap_or(b"<null>"))
                        .unwrap().to_owned())
                }
            }
            // The cast to a string vector failed; skip this column
            Err(_) => {}
        }
    }
}

assert_eq!(total_elements, 2);
assert_eq!(
    all_strings,
    vec!["\0\u{1}\u{2}\u{3}\u{4}", "", "hi", "bye"]
        .iter()
        .map(|s| s.to_owned())
        .collect::<Vec<_>>()
);
```