mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-01-10 15:22:56 +00:00
* feat: support to copy from orc format * test: add copy from orc test * chore: add license header * refactor: remove unimplemented macro * chore: apply suggestions from CR * chore: bump orc-rust to 0.2.3
104 lines
3.3 KiB
Python
104 lines
3.3 KiB
Python
# Copyright 2023 Greptime Team
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import random
|
|
import datetime
|
|
import pyorc
|
|
|
|
# Column-oriented fixture: each key is a column name and each list holds that
# column's five row values (None marks a null cell). The key prefixes
# "double", "bigint", "timestamp" and "date" are significant: infer_schema()
# uses them to choose the ORC column type.
data = {
    "double_a": [1.0, 2.0, 3.0, 4.0, 5.0],
    "a": [1.0, 2.0, None, 4.0, 5.0],
    "b": [True, False, None, True, False],
    "str_direct": ["a", "cccccc", None, "ddd", "ee"],
    "d": ["a", "bb", None, "ccc", "ddd"],
    "e": ["ddd", "cc", None, "bb", "a"],
    "f": ["aaaaa", "bbbbb", None, "ccccc", "ddddd"],
    "int_short_repeated": [5, 5, None, 5, 5],
    "int_neg_short_repeated": [-5, -5, None, -5, -5],
    "int_delta": [1, 2, None, 4, 5],
    "int_neg_delta": [5, 4, None, 2, 1],
    "int_direct": [1, 6, None, 3, 2],
    "int_neg_direct": [-1, -6, None, -3, -2],
    "bigint_direct": [1, 6, None, 3, 2],
    "bigint_neg_direct": [-1, -6, None, -3, -2],
    "bigint_other": [5, -5, 1, 5, 5],
    "utf8_increase": ["a", "bb", "ccc", "dddd", "eeeee"],
    "utf8_decrease": ["eeeee", "dddd", "ccc", "bb", "a"],
    "timestamp_simple": [
        datetime.datetime(2023, 4, 1, 20, 15, 30, 2000),
        # Nanosecond epoch value scaled down to (fractional) seconds.
        datetime.datetime.fromtimestamp(
            int('1629617204525777000') / 1000000000
        ),
        datetime.datetime(2023, 1, 1),
        datetime.datetime(2023, 2, 1),
        datetime.datetime(2023, 3, 1),
    ],
    "date_simple": [
        datetime.date(2023, 4, 1),
        datetime.date(2023, 3, 1),
        datetime.date(2023, 1, 1),
        datetime.date(2023, 2, 1),
        datetime.date(2023, 3, 1),
    ],
}
|
|
|
|
def infer_schema(data):
    """Build an ORC ``struct<...>`` schema string describing *data*.

    Each key of *data* becomes one struct field. The field type is chosen
    from the exact Python type of the column's first non-``None`` value
    (``float``, ``int``, ``bool`` or ``str``). Columns whose sample has any
    other type fall back to the key prefix (``timestamp*`` or ``date*``).
    Finally, a key starting with ``double`` or ``bigint`` overrides whatever
    type was inferred.

    Args:
        data: Mapping of column name to a sequence of column values.

    Returns:
        The schema string, e.g. ``"struct<a:float,b:boolean>"``. An empty
        mapping yields ``"struct<>"``.

    Raises:
        NotImplementedError: If a column's type can be inferred neither
            from its values nor from its name.
    """
    # Exact-type lookup: ``bool`` must not be caught by the ``int`` entry
    # even though it is a subclass of ``int``.
    type_names = {float: "float", int: "int", bool: "boolean", str: "string"}

    fields = []
    for key, values in data.items():
        # Skip nulls when sampling: a None cell says nothing about the
        # column type, and columns may legitimately start with one.
        sample = next((v for v in values if v is not None), None)
        orc_type = type_names.get(type(sample))

        if orc_type is None:
            if key.startswith("timestamp"):
                orc_type = "timestamp"
            elif key.startswith("date"):
                orc_type = "date"
            else:
                raise NotImplementedError(
                    f"cannot infer ORC type for column {key!r} "
                    f"(sample type {type(sample).__name__})"
                )

        # Name-based overrides take precedence over the inferred type.
        if key.startswith("double"):
            orc_type = "double"
        elif key.startswith("bigint"):
            orc_type = "bigint"

        fields.append(f"{key}:{orc_type}")

    # Joining avoids the trailing-comma strip, which mangled the result
    # ("struct>") when there were no columns at all.
    return "struct<" + ",".join(fields) + ">"
|
|
|
|
|
|
|
|
def _write(
    schema: str,
    data,
    file_name: str,
    compression=pyorc.CompressionKind.NONE,
    dict_key_size_threshold=0.0,
):
    """Write column-oriented *data* to an ORC file, then read it back.

    Args:
        schema: ORC schema string passed to ``pyorc.Writer``.
        data: Mapping of column name to a sequence of values. Rows are
            assembled by taking the i-th value of every column, in mapping
            order; the row count is the length of the first column.
        file_name: Path of the ORC file to create (overwritten if present).
        compression: ``pyorc.CompressionKind`` forwarded to the writer.
        dict_key_size_threshold: Forwarded to the writer unchanged.
    """
    columns = list(data.values())
    # An empty mapping has no first column to measure; write zero rows.
    num_rows = len(columns[0]) if columns else 0

    # Context managers guarantee the writer is finalised and the file handle
    # is flushed and closed (even on error) before the file is reopened.
    with open(file_name, "wb") as output:
        with pyorc.Writer(
            output,
            schema,
            dict_key_size_threshold=dict_key_size_threshold,
            # use a small number to ensure that compression crosses value boundaries
            compression_block_size=32,
            compression=compression,
        ) as writer:
            for index in range(num_rows):
                writer.write(tuple(column[index] for column in columns))

    # Sanity check: iterate over every row so an unreadable file fails here.
    with open(file_name, "rb") as f:
        list(pyorc.Reader(f))
|
|
|
|
|
|
# Generate the fixture: derive the schema from `data` and write "test.orc"
# into the current working directory.
_write(infer_schema(data), data, "test.orc")
|