Rust wrapper for the official C++ library for Apache ORC.
It uses a submodule pointing to an Apache ORC release, builds its C++ part (including vendored protobuf, lz4, zstd, ...), and links against that.
The `orcxx_derive` crate provides a custom `derive` macro.

## `orcxx_derive` examples

### `RowIterator` API

```rust
extern crate orcxx;
extern crate orcxx_derive;
use std::num::NonZeroU64;
use orcxx::deserialize::{OrcDeserialize, OrcStruct};
use orcxx::row_iterator::RowIterator;
use orcxx::reader;
use orcxx_derive::OrcDeserialize;
// Define structure
struct Test1 {
long1: Option
// Open file
let orc_path = "../orcxx/orc/examples/TestOrcFile.test1.orc";
let input_stream = reader::InputStream::from_local_file(orc_path).expect("Could not open .orc");
let reader = reader::Reader::new(input_stream).expect("Could not read .orc");
let batchsize = NonZeroU64::new(1024).unwrap(); let mut rows: Vecsize) .expect("Could not open ORC file") .expect("Unexpected schema") .collect();
assert_eq!(
    rows,
    vec![
        Some(Test1 { long1: Some(9223372036854775807) }),
        Some(Test1 { long1: Some(9223372036854775807) })
    ]
);
```
`RowIterator` clones structures before yielding them. This can be avoided by looping
and writing directly to a buffer:

```rust
extern crate orcxx;
extern crate orcxx_derive;

use orcxx::deserialize::{CheckableKind, OrcDeserialize, OrcStruct};
use orcxx::reader;
use orcxx_derive::OrcDeserialize;
// Define structure
struct Test1 {
long1: Option
// Open file
let orc_path = "../orcxx/orc/examples/TestOrcFile.test1.orc";
let input_stream = reader::InputStream::from_local_file(orc_path).expect("Could not open .orc");
let reader = reader::Reader::new(input_stream).expect("Could not read .orc");

// Only read columns we need
let options = reader::RowReaderOptions::default().include_names(Test1::columns());
let mut rowreader = reader.rowreader(&options).expect("Could not open ORC file"); Test1::checkkind(&rowreader.selected_kind()).expect("Unexpected schema");
let mut rows: Vec
// Allocate work buffer
let mut batch = rowreader.rowbatch(1024);
// Read structs until the end
while rowreader.readinto(&mut batch) {
let newrows = Option::
assert_eq!(
    rows,
    vec![
        Some(Test1 { long1: Some(9223372036854775807) }),
        Some(Test1 { long1: Some(9223372036854775807) })
    ]
);
```
The above two examples also work with nested structures:

```rust
extern crate orcxx;
extern crate orcxx_derive;

use orcxx_derive::OrcDeserialize;
struct Test1Option {
boolean1: Option
struct Test1ItemOption {
int1: Option
```

## `orcxx` examples

Columns can also be read directly without writing their values to structures. This is particularly useful for reading files whose schema is not known at compile time.

This reads batches directly from the C++ library and leaves it to the Rust code to dynamically cast base vectors to more specific types (here, string vectors).

```rust
extern crate orcxx;
extern crate orcxx_derive;

use orcxx::reader;
use orcxx::vector::ColumnVectorBatch;
let input_stream = reader::InputStream::from_local_file("../orcxx/orc/examples/TestOrcFile.test1.orc")
    .expect("Could not open");

let reader = reader::Reader::new(input_stream).expect("Could not read");
println!("{:#?}", reader.kind()); // Prints the type of columns in the file
let mut rowreader = reader.rowreader(&reader::RowReaderOptions::default()).unwrap(); let mut batch = rowreader.rowbatch(1024);
let mut totalelements = 0;
let mut all_strings: Vec
let struct_vector = batch.borrow().try_into_structs().unwrap();
let vectors = struct_vector.fields();
for vector in vectors {
match vector.try_into_strings() {
Ok(string_vector) => {
for s in string_vector.iter() {
all_strings.push(
std::str::from_utf8(s.unwrap_or(b"<null>"))
.unwrap().to_owned())
}
}
Err(e) => {}
}
}
}
assert_eq!(totalelements, 2);
assert_eq!(
all_strings,
vec!["\0\u{1}\u{2}\u{3}\u{4}", "", "hi", "bye"]
.iter()
.map(|s| s.to_owned())
.collect::