// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "arrow/dataset/dataset.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"

namespace arrow {
namespace dataset {
// ----------------------------------------------------------------------
// Computing partition values

// TODO(wesm): API for computing partition keys derived from raw
// values. For example, year(value) or hash_function(value) instead of
// simply value, so a dataset with a timestamp column might group all
// data with year 2009 in the same partition

// /// \brief
// class ScalarTransform {
//  public:
//   virtual Status Transform(const std::shared_ptr<Scalar>& input,
//                            std::shared_ptr<Scalar>* output) const = 0;
// };

// class PartitionField {
//  public:
//  private:
//   std::string field_name_;
// };

// ----------------------------------------------------------------------
// Partition identifiers

/// \brief A partition-level identifier consisting of the partition
/// field names and their corresponding scalar values
///
/// TODO(wesm): Is this general enough? What other kinds of partition
/// keys exist and do we need to support them?
class PartitionKey {
 public:
  const std::vector<std::string>& fields() const { return fields_; }
  const std::vector<std::shared_ptr<Scalar>>& values() const { return values_; }

 private:
  std::vector<std::string> fields_;
  std::vector<std::shared_ptr<Scalar>> values_;
};

/// \brief Intermediate data structure for data parsed from a string
/// partition identifier.
///
/// For example, the identifier "foo=5" might be parsed with a single
/// "foo" field and the value 5. A more complex identifier might be
/// written as "foo=5,bar=2", which would yield two fields and two
/// values.
///
/// Some partition schemes may store the field names in a metadata
/// store instead of in file paths, for example
/// dataset_root/2009/11/... could be used when the partition fields
/// are "year" and "month"
struct PartitionKeyData {
  std::vector<std::string> fields;
  std::vector<std::shared_ptr<Scalar>> values;
};
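
// Illustrative sketch (not part of the API): how the "foo=5,bar=2"
// identifier described above would be represented once parsed. The
// scalar values shown are placeholders; their concrete Scalar types
// depend on the partition scheme doing the parsing.
//
//   PartitionKeyData data;
//   data.fields = {"foo", "bar"};
//   data.values = {/* Scalar for 5 */, /* Scalar for 2 */};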

// ----------------------------------------------------------------------
// Partition schemes

/// \brief Interface for partition schemes, which describe how partition
/// key information is encoded in file or directory paths and how to
/// parse it back into a PartitionKeyData
class ARROW_DS_EXPORT PartitionScheme {
 public:
  virtual ~PartitionScheme() = default;

  /// \brief The name identifying the kind of partition scheme
  virtual std::string name() const = 0;

  /// \brief Return true if the path is consistent with this partition scheme
  virtual bool PathMatchesScheme(const std::string& path) const = 0;

  /// \brief Parse partition fields and values from a path
  virtual Status ParseKey(const std::string& path, PartitionKeyData* out) const = 0;
};
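
// Illustrative sketch (not part of the API): a hypothetical scheme for
// paths of the form dataset_root/$year/$month/... where the field names
// are known up front rather than stored in the path. Everything below is
// an assumption, shown only to clarify how the interface is meant to be
// implemented.
//
// class YearMonthPartitionScheme : public PartitionScheme {
//  public:
//   std::string name() const override { return "year_month"; }
//   bool PathMatchesScheme(const std::string& path) const override;
//   Status ParseKey(const std::string& path, PartitionKeyData* out) const override;
// };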

/// \brief Multi-level, directory-based partitioning scheme originating
/// from Apache Hive, with all data files stored in the leaf
/// directories. Data is partitioned by static values of a particular
/// column in the schema. Partition keys are represented in the form
/// $key=$value in directory names
class ARROW_DS_EXPORT HivePartitionScheme : public PartitionScheme {
 public:
  std::string name() const override;

  /// \brief Return true if the path conforms to the $key=$value
  /// directory naming convention of this scheme
  bool PathMatchesScheme(const std::string& path) const override;

  Status ParseKey(const std::string& path, PartitionKeyData* out) const override;
};
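
// Illustrative sketch (not part of the API): expected use of the Hive
// scheme on a $key=$value style path. The exact path form accepted and
// the Scalar types of the parsed values are assumptions; they depend on
// the implementation.
//
//   HivePartitionScheme scheme;
//   PartitionKeyData data;
//   if (scheme.PathMatchesScheme("key1=abc/key2=5/0.parquet")) {
//     RETURN_NOT_OK(scheme.ParseKey("key1=abc/key2=5/0.parquet", &data));
//     // data.fields == {"key1", "key2"}
//     // data.values == Scalars for "abc" and 5
//   }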

// ----------------------------------------------------------------------
//
// Partitioned datasets come in different forms. Here is an example of
// a Hive-style partitioned dataset:
//
// dataset_root/
//   key1=$k1_v1/
//     key2=$k2_v1/
//       0.parquet
//       1.parquet
//       2.parquet
//       3.parquet
//     key2=$k2_v2/
//       0.parquet
//       1.parquet
//   key1=$k1_v2/
//     key2=$k2_v1/
//       0.parquet
//       1.parquet
//     key2=$k2_v2/
//       0.parquet
//       1.parquet
//       2.parquet
//
// In this case, the dataset has 11 fragments (11 files) to be
// scanned, or potentially more if it is configured to split Parquet
// files at the row group level
class ARROW_DS_EXPORT Partition : public DataSource {
 public:
  std::string type() const override;

  /// \brief The key for this partition source; may be nullptr,
  /// e.g. for the top-level partitioned source container
  virtual const PartitionKey* key() const = 0;

  virtual std::unique_ptr<DataFragmentIterator> GetFragments(
      const DataSelector& selector) = 0;
};

/// \brief Simple implementation of Partition, which consists of a
/// partition identifier, subpartitions, and some data fragments
class ARROW_DS_EXPORT SimplePartition : public Partition {
 public:
  SimplePartition(std::unique_ptr<PartitionKey> partition_key,
                  DataFragmentVector&& data_fragments, PartitionVector&& subpartitions,
                  std::shared_ptr<ScanOptions> scan_options = NULLPTR)
      : key_(std::move(partition_key)),
        data_fragments_(std::move(data_fragments)),
        subpartitions_(std::move(subpartitions)),
        scan_options_(std::move(scan_options)) {}

  const PartitionKey* key() const override { return key_.get(); }

  int num_subpartitions() const { return static_cast<int>(subpartitions_.size()); }

  int num_data_fragments() const { return static_cast<int>(data_fragments_.size()); }

  const PartitionVector& subpartitions() const { return subpartitions_; }

  const DataFragmentVector& data_fragments() const { return data_fragments_; }

  std::unique_ptr<DataFragmentIterator> GetFragments(
      const DataSelector& selector) override;

 private:
  std::unique_ptr<PartitionKey> key_;

  /// \brief Data fragments belonging to this partition level. In some
  /// partition schemes, such as Hive-style, data fragments occur only
  /// in the partition leaves, so this member is mutually exclusive
  /// with subpartitions_
  std::vector<std::shared_ptr<DataFragment>> data_fragments_;

  /// \brief Child partitions of this partition
  std::vector<std::shared_ptr<Partition>> subpartitions_;

  /// \brief Default scan options to use for data fragments
  std::shared_ptr<ScanOptions> scan_options_;
};
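
// Illustrative sketch (not part of the API): assembling one leaf of the
// Hive-style layout shown above as a SimplePartition. MakePartitionKey and
// CollectParquetFragments are hypothetical helpers; in practice the key and
// fragments would come from a PartitionScheme and a file discovery step.
//
//   DataFragmentVector fragments = CollectParquetFragments(...);  // hypothetical
//   auto leaf = std::make_shared<SimplePartition>(
//       MakePartitionKey("key2", k2_v1_scalar),  // hypothetical helper
//       std::move(fragments),
//       /*subpartitions=*/PartitionVector{});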

/// \brief A Partition implementation that produces its fragments lazily
/// from an input iterator
class ARROW_DS_EXPORT LazyPartition : public Partition {
 public:
  const PartitionKey* key() const override;

  std::unique_ptr<DataFragmentIterator> GetFragments(
      const DataSelector& selector) override;

  // TODO(wesm): Iterate over subpartitions

 protected:
  std::unique_ptr<PartitionIterator> partition_iter_;

  // By default, once this source is consumed using GetFragments, it
  // cannot be consumed again. Setting this to true caches the
  // manifest so that the partition can be scanned multiple times
  bool cache_manifest_ = false;
};
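
// Illustrative sketch (not part of the API): the intended consumption
// model described on cache_manifest_ above. Whether a second call is
// valid without caching is an assumption drawn from that comment.
//
//   auto fragments = lazy_partition->GetFragments(selector);
//   // A second call to GetFragments is only expected to work if the
//   // manifest was cached (cache_manifest_ == true).
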
} // namespace dataset
} // namespace arrow