From e45fdb2a89b17fd295f33165d5beb5061875bdc3 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 19 Aug 2025 13:14:36 +0200 Subject: [PATCH 01/26] Add pyarrow-stubs --- python/pyarrow/__init__.pyi | 656 ++ python/pyarrow/__lib_pxi/__init__.pyi | 0 python/pyarrow/__lib_pxi/array.pyi | 4274 +++++++++ python/pyarrow/__lib_pxi/benchmark.pyi | 1 + python/pyarrow/__lib_pxi/builder.pyi | 89 + python/pyarrow/__lib_pxi/compat.pyi | 5 + python/pyarrow/__lib_pxi/config.pyi | 41 + python/pyarrow/__lib_pxi/device.pyi | 88 + python/pyarrow/__lib_pxi/error.pyi | 53 + python/pyarrow/__lib_pxi/io.pyi | 1474 ++++ python/pyarrow/__lib_pxi/ipc.pyi | 705 ++ python/pyarrow/__lib_pxi/memory.pyi | 174 + python/pyarrow/__lib_pxi/pandas_shim.pyi | 51 + python/pyarrow/__lib_pxi/scalar.pyi | 1017 +++ python/pyarrow/__lib_pxi/table.pyi | 5617 ++++++++++++ python/pyarrow/__lib_pxi/tensor.pyi | 688 ++ python/pyarrow/__lib_pxi/types.pyi | 4413 ++++++++++ python/pyarrow/_azurefs.pyi | 74 + python/pyarrow/_compute.pyi | 1721 ++++ python/pyarrow/_csv.pyi | 641 ++ python/pyarrow/_cuda.pyi | 556 ++ python/pyarrow/_dataset.pyi | 2301 +++++ python/pyarrow/_dataset_orc.pyi | 6 + python/pyarrow/_dataset_parquet.pyi | 314 + .../pyarrow/_dataset_parquet_encryption.pyi | 85 + python/pyarrow/_feather.pyi | 29 + python/pyarrow/_flight.pyi | 1380 +++ python/pyarrow/_fs.pyi | 1005 +++ python/pyarrow/_gcsfs.pyi | 83 + python/pyarrow/_hdfs.pyi | 75 + python/pyarrow/_json.pyi | 169 + python/pyarrow/_orc.pyi | 56 + python/pyarrow/_parquet.pyi | 445 + python/pyarrow/_parquet_encryption.pyi | 67 + python/pyarrow/_s3fs.pyi | 74 + python/pyarrow/_stubs_typing.pyi | 80 + python/pyarrow/_substrait.pyi | 39 + python/pyarrow/acero.pyi | 85 + python/pyarrow/benchmark.pyi | 3 + python/pyarrow/cffi.pyi | 4 + python/pyarrow/compute.pyi | 7779 +++++++++++++++++ python/pyarrow/csv.pyi | 27 + python/pyarrow/cuda.pyi | 25 + python/pyarrow/dataset.pyi | 229 + python/pyarrow/feather.pyi | 50 + python/pyarrow/flight.pyi | 95 + python/pyarrow/fs.pyi | 77 + python/pyarrow/gandiva.pyi | 65 + python/pyarrow/interchange/__init__.pyi | 0 python/pyarrow/interchange/buffer.pyi | 58 + python/pyarrow/interchange/column.pyi | 252 + python/pyarrow/interchange/dataframe.pyi | 102 + python/pyarrow/interchange/from_dataframe.pyi | 244 + python/pyarrow/ipc.pyi | 123 + python/pyarrow/json.pyi | 3 + python/pyarrow/lib.pyi | 106 + python/pyarrow/orc.pyi | 279 + python/pyarrow/pandas_compat.pyi | 54 + python/pyarrow/parquet/__init__.pyi | 1 + python/pyarrow/parquet/core.pyi | 2061 +++++ python/pyarrow/parquet/encryption.pyi | 15 + python/pyarrow/substrait.pyi | 21 + python/pyarrow/types.pyi | 194 + python/pyarrow/util.pyi | 27 + 64 files changed, 40525 insertions(+) create mode 100644 python/pyarrow/__init__.pyi create mode 100644 python/pyarrow/__lib_pxi/__init__.pyi create mode 100644 python/pyarrow/__lib_pxi/array.pyi create mode 100644 python/pyarrow/__lib_pxi/benchmark.pyi create mode 100644 python/pyarrow/__lib_pxi/builder.pyi create mode 100644 python/pyarrow/__lib_pxi/compat.pyi create mode 100644 python/pyarrow/__lib_pxi/config.pyi create mode 100644 python/pyarrow/__lib_pxi/device.pyi create mode 100644 python/pyarrow/__lib_pxi/error.pyi create mode 100644 python/pyarrow/__lib_pxi/io.pyi create mode 100644 python/pyarrow/__lib_pxi/ipc.pyi create mode 100644 python/pyarrow/__lib_pxi/memory.pyi create mode 100644 python/pyarrow/__lib_pxi/pandas_shim.pyi create mode 100644 python/pyarrow/__lib_pxi/scalar.pyi create mode 100644 python/pyarrow/__lib_pxi/table.pyi create 
mode 100644 python/pyarrow/__lib_pxi/tensor.pyi create mode 100644 python/pyarrow/__lib_pxi/types.pyi create mode 100644 python/pyarrow/_azurefs.pyi create mode 100644 python/pyarrow/_compute.pyi create mode 100644 python/pyarrow/_csv.pyi create mode 100644 python/pyarrow/_cuda.pyi create mode 100644 python/pyarrow/_dataset.pyi create mode 100644 python/pyarrow/_dataset_orc.pyi create mode 100644 python/pyarrow/_dataset_parquet.pyi create mode 100644 python/pyarrow/_dataset_parquet_encryption.pyi create mode 100644 python/pyarrow/_feather.pyi create mode 100644 python/pyarrow/_flight.pyi create mode 100644 python/pyarrow/_fs.pyi create mode 100644 python/pyarrow/_gcsfs.pyi create mode 100644 python/pyarrow/_hdfs.pyi create mode 100644 python/pyarrow/_json.pyi create mode 100644 python/pyarrow/_orc.pyi create mode 100644 python/pyarrow/_parquet.pyi create mode 100644 python/pyarrow/_parquet_encryption.pyi create mode 100644 python/pyarrow/_s3fs.pyi create mode 100644 python/pyarrow/_stubs_typing.pyi create mode 100644 python/pyarrow/_substrait.pyi create mode 100644 python/pyarrow/acero.pyi create mode 100644 python/pyarrow/benchmark.pyi create mode 100644 python/pyarrow/cffi.pyi create mode 100644 python/pyarrow/compute.pyi create mode 100644 python/pyarrow/csv.pyi create mode 100644 python/pyarrow/cuda.pyi create mode 100644 python/pyarrow/dataset.pyi create mode 100644 python/pyarrow/feather.pyi create mode 100644 python/pyarrow/flight.pyi create mode 100644 python/pyarrow/fs.pyi create mode 100644 python/pyarrow/gandiva.pyi create mode 100644 python/pyarrow/interchange/__init__.pyi create mode 100644 python/pyarrow/interchange/buffer.pyi create mode 100644 python/pyarrow/interchange/column.pyi create mode 100644 python/pyarrow/interchange/dataframe.pyi create mode 100644 python/pyarrow/interchange/from_dataframe.pyi create mode 100644 python/pyarrow/ipc.pyi create mode 100644 python/pyarrow/json.pyi create mode 100644 python/pyarrow/lib.pyi create mode 100644 python/pyarrow/orc.pyi create mode 100644 python/pyarrow/pandas_compat.pyi create mode 100644 python/pyarrow/parquet/__init__.pyi create mode 100644 python/pyarrow/parquet/core.pyi create mode 100644 python/pyarrow/parquet/encryption.pyi create mode 100644 python/pyarrow/substrait.pyi create mode 100644 python/pyarrow/types.pyi create mode 100644 python/pyarrow/util.pyi diff --git a/python/pyarrow/__init__.pyi b/python/pyarrow/__init__.pyi new file mode 100644 index 00000000000..8a0d1e870c5 --- /dev/null +++ b/python/pyarrow/__init__.pyi @@ -0,0 +1,656 @@ +# ruff: noqa: F401, I001, E402 +__version__: str + +import pyarrow.lib as _lib + +_gc_enabled: bool + +from pyarrow.lib import ( + BuildInfo, + RuntimeInfo, + set_timezone_db_path, + MonthDayNano, + VersionInfo, + cpp_build_info, + cpp_version, + cpp_version_info, + runtime_info, + cpu_count, + set_cpu_count, + enable_signal_handlers, + io_thread_count, + set_io_thread_count, +) + +def show_versions() -> None: ... +def show_info() -> None: ... +def _module_is_available(module: str) -> bool: ... +def _filesystem_is_available(fs: str) -> bool: ... 
+ +from pyarrow.lib import ( + null, + bool_, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, + time32, + time64, + timestamp, + date32, + date64, + duration, + month_day_nano_interval, + float16, + float32, + float64, + binary, + string, + utf8, + binary_view, + string_view, + large_binary, + large_string, + large_utf8, + decimal32, + decimal64, + decimal128, + decimal256, + list_, + large_list, + list_view, + large_list_view, + map_, + struct, + union, + sparse_union, + dense_union, + dictionary, + run_end_encoded, + json_, + uuid, + fixed_shape_tensor, + bool8, + opaque, + field, + type_for_alias, + DataType, + DictionaryType, + StructType, + ListType, + LargeListType, + FixedSizeListType, + ListViewType, + LargeListViewType, + MapType, + UnionType, + SparseUnionType, + DenseUnionType, + TimestampType, + Time32Type, + Time64Type, + DurationType, + FixedSizeBinaryType, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + BaseExtensionType, + ExtensionType, + RunEndEncodedType, + FixedShapeTensorType, + Bool8Type, + UuidType, + JsonType, + OpaqueType, + PyExtensionType, + UnknownExtensionType, + register_extension_type, + unregister_extension_type, + DictionaryMemo, + KeyValueMetadata, + Field, + Schema, + schema, + unify_schemas, + Array, + Tensor, + array, + chunked_array, + record_batch, + nulls, + repeat, + SparseCOOTensor, + SparseCSRMatrix, + SparseCSCMatrix, + SparseCSFTensor, + infer_type, + from_numpy_dtype, + NullArray, + NumericArray, + IntegerArray, + FloatingPointArray, + BooleanArray, + Int8Array, + UInt8Array, + Int16Array, + UInt16Array, + Int32Array, + UInt32Array, + Int64Array, + UInt64Array, + HalfFloatArray, + FloatArray, + DoubleArray, + ListArray, + LargeListArray, + FixedSizeListArray, + ListViewArray, + LargeListViewArray, + MapArray, + UnionArray, + BinaryArray, + StringArray, + LargeBinaryArray, + LargeStringArray, + BinaryViewArray, + StringViewArray, + FixedSizeBinaryArray, + DictionaryArray, + Date32Array, + Date64Array, + TimestampArray, + Time32Array, + Time64Array, + DurationArray, + MonthDayNanoIntervalArray, + Decimal32Array, + Decimal64Array, + Decimal128Array, + Decimal256Array, + StructArray, + ExtensionArray, + RunEndEncodedArray, + FixedShapeTensorArray, + Bool8Array, + UuidArray, + JsonArray, + OpaqueArray, + scalar, + NA, + _NULL as NULL, + Scalar, + NullScalar, + BooleanScalar, + Int8Scalar, + Int16Scalar, + Int32Scalar, + Int64Scalar, + UInt8Scalar, + UInt16Scalar, + UInt32Scalar, + UInt64Scalar, + HalfFloatScalar, + FloatScalar, + DoubleScalar, + Decimal32Scalar, + Decimal64Scalar, + Decimal128Scalar, + Decimal256Scalar, + ListScalar, + LargeListScalar, + FixedSizeListScalar, + ListViewScalar, + LargeListViewScalar, + Date32Scalar, + Date64Scalar, + Time32Scalar, + Time64Scalar, + TimestampScalar, + DurationScalar, + MonthDayNanoIntervalScalar, + BinaryScalar, + LargeBinaryScalar, + BinaryViewScalar, + StringScalar, + LargeStringScalar, + StringViewScalar, + FixedSizeBinaryScalar, + DictionaryScalar, + MapScalar, + StructScalar, + UnionScalar, + RunEndEncodedScalar, + ExtensionScalar, + Bool8Scalar, + UuidScalar, + JsonScalar, + OpaqueScalar, +) + +# Buffers, allocation +from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager + +from pyarrow.lib import ( + Buffer, + ResizableBuffer, + foreign_buffer, + py_buffer, + Codec, + compress, + decompress, + allocate_buffer, +) + +from pyarrow.lib import ( + MemoryPool, + LoggingMemoryPool, + ProxyMemoryPool, + 
total_allocated_bytes, + set_memory_pool, + default_memory_pool, + system_memory_pool, + jemalloc_memory_pool, + mimalloc_memory_pool, + logging_memory_pool, + proxy_memory_pool, + log_memory_allocations, + jemalloc_set_decay_ms, + supported_memory_backends, +) + +# I/O +from pyarrow.lib import ( + NativeFile, + PythonFile, + BufferedInputStream, + BufferedOutputStream, + CacheOptions, + CompressedInputStream, + CompressedOutputStream, + TransformInputStream, + transcoding_input_stream, + FixedSizeBufferWriter, + BufferReader, + BufferOutputStream, + OSFile, + MemoryMappedFile, + memory_map, + create_memory_map, + MockOutputStream, + input_stream, + output_stream, + have_libhdfs, +) + +from pyarrow.lib import ( + ChunkedArray, + RecordBatch, + Table, + table, + concat_arrays, + concat_tables, + TableGroupBy, + RecordBatchReader, +) + +# Exceptions +from pyarrow.lib import ( + ArrowCancelled, + ArrowCapacityError, + ArrowException, + ArrowKeyError, + ArrowIndexError, + ArrowInvalid, + ArrowIOError, + ArrowMemoryError, + ArrowNotImplementedError, + ArrowTypeError, + ArrowSerializationError, +) + +from pyarrow.ipc import serialize_pandas, deserialize_pandas +import pyarrow.ipc as ipc + +import pyarrow.types as types + +# ---------------------------------------------------------------------- +# Deprecations + +from pyarrow.util import _deprecate_api, _deprecate_class + +from pyarrow.ipc import ( + Message, + MessageReader, + MetadataVersion, + RecordBatchFileReader, + RecordBatchFileWriter, + RecordBatchStreamReader, + RecordBatchStreamWriter, +) + +# ---------------------------------------------------------------------- +# Returning absolute path to the pyarrow include directory (if bundled, e.g. in +# wheels) +def get_include() -> str: ... +def _get_pkg_config_executable() -> str: ... +def _has_pkg_config(pkgname: str) -> bool: ... +def _read_pkg_config_variable(pkgname: str, cli_args: list[str]) -> str: ... +def get_libraries() -> list[str]: ... +def create_library_symlinks() -> None: ... +def get_library_dirs() -> list[str]: ... 
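The build helpers declared just above (get_include, get_libraries, get_library_dirs) return the values a native extension feeds to its compiler and linker flags. A minimal sketch, not part of this patch, of inspecting them; the exact library names depend on the installed wheel:

    import pyarrow as pa

    # Paths and names a setup.py would pass to Extension(include_dirs=...,
    # libraries=..., library_dirs=...) when compiling against the bundled Arrow.
    print(pa.get_include())       # include directory shipped with the wheel
    print(pa.get_libraries())     # library names, e.g. ["arrow", "arrow_python"]
    print(pa.get_library_dirs())  # directories containing those libraries
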
+ +__all__ = [ + "__version__", + "_lib", + "_gc_enabled", + "BuildInfo", + "RuntimeInfo", + "set_timezone_db_path", + "MonthDayNano", + "VersionInfo", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "cpu_count", + "set_cpu_count", + "enable_signal_handlers", + "io_thread_count", + "set_io_thread_count", + "show_versions", + "show_info", + "_module_is_available", + "_filesystem_is_available", + "null", + "bool_", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "time32", + "time64", + "timestamp", + "date32", + "date64", + "duration", + "month_day_nano_interval", + "float16", + "float32", + "float64", + "binary", + "string", + "utf8", + "binary_view", + "string_view", + "large_binary", + "large_string", + "large_utf8", + "decimal32", + "decimal64", + "decimal128", + "decimal256", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "struct", + "union", + "sparse_union", + "dense_union", + "dictionary", + "run_end_encoded", + "json_", + "uuid", + "fixed_shape_tensor", + "bool8", + "opaque", + "field", + "type_for_alias", + "DataType", + "DictionaryType", + "StructType", + "ListType", + "LargeListType", + "FixedSizeListType", + "ListViewType", + "LargeListViewType", + "MapType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", + "Decimal128Type", + "Decimal256Type", + "BaseExtensionType", + "ExtensionType", + "RunEndEncodedType", + "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", + "PyExtensionType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "DictionaryMemo", + "KeyValueMetadata", + "Field", + "Schema", + "schema", + "unify_schemas", + "Array", + "Tensor", + "array", + "chunked_array", + "record_batch", + "nulls", + "repeat", + "SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", + "infer_type", + "from_numpy_dtype", + "NullArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "BooleanArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "ListArray", + "LargeListArray", + "FixedSizeListArray", + "ListViewArray", + "LargeListViewArray", + "MapArray", + "UnionArray", + "BinaryArray", + "StringArray", + "LargeBinaryArray", + "LargeStringArray", + "BinaryViewArray", + "StringViewArray", + "FixedSizeBinaryArray", + "DictionaryArray", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "StructArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "RunEndEncodedArray", + "FixedShapeTensorArray", + "scalar", + "NA", + "NULL", + "Scalar", + "NullScalar", + "BooleanScalar", + "Int8Scalar", + "Int16Scalar", + "Int32Scalar", + "Int64Scalar", + "UInt8Scalar", + "UInt16Scalar", + "UInt32Scalar", + "UInt64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", + "Decimal128Scalar", + "Decimal256Scalar", + "ListScalar", + "LargeListScalar", + "FixedSizeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "Date32Scalar", + "Date64Scalar", + "Time32Scalar", + 
"Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "BinaryViewScalar", + "StringScalar", + "LargeStringScalar", + "StringViewScalar", + "FixedSizeBinaryScalar", + "DictionaryScalar", + "MapScalar", + "StructScalar", + "UnionScalar", + "RunEndEncodedScalar", + "ExtensionScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", + "DeviceAllocationType", + "Device", + "MemoryManager", + "default_cpu_memory_manager", + "Buffer", + "ResizableBuffer", + "foreign_buffer", + "py_buffer", + "Codec", + "compress", + "decompress", + "allocate_buffer", + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "total_allocated_bytes", + "set_memory_pool", + "default_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "logging_memory_pool", + "proxy_memory_pool", + "log_memory_allocations", + "jemalloc_set_decay_ms", + "supported_memory_backends", + "NativeFile", + "PythonFile", + "BufferedInputStream", + "BufferedOutputStream", + "CacheOptions", + "CompressedInputStream", + "CompressedOutputStream", + "TransformInputStream", + "transcoding_input_stream", + "FixedSizeBufferWriter", + "BufferReader", + "BufferOutputStream", + "OSFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "MockOutputStream", + "input_stream", + "output_stream", + "have_libhdfs", + "ChunkedArray", + "RecordBatch", + "Table", + "table", + "concat_arrays", + "concat_tables", + "TableGroupBy", + "RecordBatchReader", + "ArrowCancelled", + "ArrowCapacityError", + "ArrowException", + "ArrowKeyError", + "ArrowIndexError", + "ArrowInvalid", + "ArrowIOError", + "ArrowMemoryError", + "ArrowNotImplementedError", + "ArrowTypeError", + "ArrowSerializationError", + "serialize_pandas", + "deserialize_pandas", + "ipc", + "types", + "_deprecate_api", + "_deprecate_class", + "Message", + "MessageReader", + "MetadataVersion", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "get_include", + "_get_pkg_config_executable", + "_has_pkg_config", + "_read_pkg_config_variable", + "get_libraries", + "create_library_symlinks", + "get_library_dirs", +] diff --git a/python/pyarrow/__lib_pxi/__init__.pyi b/python/pyarrow/__lib_pxi/__init__.pyi new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pyarrow/__lib_pxi/array.pyi b/python/pyarrow/__lib_pxi/array.pyi new file mode 100644 index 00000000000..ec1cda30a88 --- /dev/null +++ b/python/pyarrow/__lib_pxi/array.pyi @@ -0,0 +1,4274 @@ +import datetime as dt +import sys + +from collections.abc import Callable +from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import ( + Any, + Generic, + Iterable, + Iterator, + Literal, + TypeVar, + overload, +) + +import numpy as np +import pandas as pd + +from pandas.core.dtypes.base import ExtensionDtype +from pyarrow._compute import CastOptions +from pyarrow._stubs_typing import ( + ArrayLike, + Indices, + Mask, + Order, + SupportArrowArray, + SupportArrowDeviceArray, +) +from pyarrow.lib import ( + Buffer, + Device, + MemoryManager, + MemoryPool, + MonthDayNano, + Tensor, + _Weakrefable, +) +from typing_extensions import deprecated + +from . 
import scalar, types +from .device import DeviceAllocationType +from .scalar import NullableCollection, Scalar +from .types import ( + DataType, + Field, + MapType, + _AsPyType, + _BasicDataType, + _BasicValueT, + _DataTypeT, + _IndexT, + _RunEndType, + _Size, +) + +@overload +def array( + values: NullableCollection[bool], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BooleanArray: ... +@overload +def array( + values: NullableCollection[int], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int64Array: ... +@overload +def array( + values: NullableCollection[float], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DoubleArray: ... +@overload +def array( + values: NullableCollection[Decimal], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Decimal128Array: ... +@overload +def array( + values: NullableCollection[dict[str, Any]], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StructArray: ... +@overload +def array( + values: NullableCollection[dt.date], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Date32Array: ... +@overload +def array( + values: NullableCollection[dt.time], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time64Array[Literal["us"]]: ... +@overload +def array( + values: NullableCollection[dt.timedelta], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["us"]]: ... +@overload +def array( + values: NullableCollection[MonthDayNano], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalArray: ... +@overload +def array( + values: NullableCollection[str], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def array( + values: NullableCollection[bytes], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def array( + values: NullableCollection[list[Any]], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> ListArray[Any]: ... 
+@overload +def array( + values: NullableCollection[_ScalarT], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[_ScalarT]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["null"] | types.NullType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> NullArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["bool", "boolean"] | types.BoolType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BooleanArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i1", "int8"] | types.Int8Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int8Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i2", "int16"] | types.Int16Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int16Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i4", "int32"] | types.Int32Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int32Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i8", "int64"] | types.Int64Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int64Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u1", "uint8"] | types.UInt8Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> UInt8Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u2", "uint16"] | types.UInt16Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> UInt16Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u4", "uint32"] | types.Uint32Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> UInt32Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u8", "uint64"] | types.UInt64Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> UInt64Array: ... 
+@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f2", "halffloat", "float16"] | types.Float16Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> HalfFloatArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f4", "float", "float32"] | types.Float32Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> FloatArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f8", "double", "float64"] | types.Float64Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DoubleArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string", "str", "utf8"] | types.StringType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary"] | types.BinaryType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> LargeStringArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_binary"] | types.LargeBinaryType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary_view"] | types.BinaryViewType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BinaryViewArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string_view"] | types.StringViewType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StringViewArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date32", "date32[day]"] | types.Date32Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Date32Array: ... 
+@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date64", "date64[ms]"] | types.Date64Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Date64Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time32Array[Literal["s"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time32Array[Literal["ms"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time64Array[Literal["us"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time64Array[Literal["ns"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[Literal["s"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[Literal["ms"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[Literal["us"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[s]"] | types.DurationType[Literal["s"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["s"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["ms"]]: ... 
+@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[us]"] | types.DurationType[Literal["us"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["us"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["ns"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: _DataTypeT, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[Scalar[_DataTypeT]]: ... +def array(*args, **kawrgs): + """ + Create pyarrow.Array instance from a Python object. + + Parameters + ---------- + obj : sequence, iterable, ndarray, pandas.Series, Arrow-compatible array + If both type and size are specified may be a single use iterable. If + not strongly-typed, Arrow type will be inferred for resulting array. + Any Arrow-compatible array that implements the Arrow PyCapsule Protocol + (has an ``__arrow_c_array__`` or ``__arrow_c_device_array__`` method) + can be passed as well. + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred from + the data. + mask : array[bool], optional + Indicate which values are null (True) or not null (False). + size : int64, optional + Size of the elements. If the input is larger than size bail at this + length. For iterators, if size is larger than the input iterator this + will be treated as a "max size", but will involve an initial allocation + of size followed by a resize to the actual size (so if you know the + exact size specifying it correctly will give you better performance). + from_pandas : bool, default None + Use pandas's semantics for inferring nulls from values in + ndarray-like data. If passed, the mask tasks precedence, but + if a value is unmasked (not-null), but still null according to + pandas semantics, then it is null. Defaults to False if not + passed explicitly by user, or True if a pandas object is + passed in. + safe : bool, default True + Check for overflows or other unsafe conversions. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. + + Returns + ------- + array : pyarrow.Array or pyarrow.ChunkedArray + A ChunkedArray instead of an Array is returned if: + + - the object data overflowed binary storage. + - the object's ``__arrow_array__`` protocol method returned a chunked + array. + + Notes + ----- + Timezone will be preserved in the returned array for timezone-aware data, + else no timezone will be returned for naive timestamps. + Internally, UTC values are stored for timezone-aware data with the + timezone set in the data type. 
+ + Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by + default converted as MonthDayNanoIntervalArray. relativedelta leapdays + are ignored as are all absolute fields on both objects. datetime.timedelta + can also be converted to MonthDayNanoIntervalArray but this requires + passing MonthDayNanoIntervalType explicitly. + + Converting to dictionary array will promote to a wider integer type for + indices if the number of distinct values cannot be represented, even if + the index type was explicitly set. This means that if there are more than + 127 values the returned dictionary array's index type will be at least + pa.int16() even if pa.int8() was passed to the function. Note that an + explicit index type will not be demoted even if it is wider than required. + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> pa.array(pd.Series([1, 2])) + + [ + 1, + 2 + ] + + >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string())) + + ... + -- dictionary: + [ + "a", + "b" + ] + -- indices: + [ + 0, + 1, + 0 + ] + + >>> import numpy as np + >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool)) + + [ + 1, + null + ] + + >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())) + >>> arr.type.index_type + DataType(int16) + """ + +@overload +def asarray(values: NullableCollection[bool]) -> BooleanArray: ... +@overload +def asarray(values: NullableCollection[int]) -> Int64Array: ... +@overload +def asarray(values: NullableCollection[float]) -> DoubleArray: ... +@overload +def asarray(values: NullableCollection[Decimal]) -> Decimal128Array: ... +@overload +def asarray(values: NullableCollection[dict[str, Any]]) -> StructArray: ... +@overload +def asarray(values: NullableCollection[dt.date]) -> Date32Array: ... +@overload +def asarray(values: NullableCollection[dt.time]) -> Time64Array: ... +@overload +def asarray(values: NullableCollection[dt.timedelta]) -> DurationArray: ... +@overload +def asarray(values: NullableCollection[MonthDayNano]) -> MonthDayNanoIntervalArray: ... +@overload +def asarray(values: NullableCollection[list[Any]]) -> ListArray[Any]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["null"] | types.NullType, +) -> NullArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["bool", "boolean"] | types.BoolType, +) -> BooleanArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i1", "int8"] | types.Int8Type, +) -> Int8Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i2", "int16"] | types.Int16Type, +) -> Int16Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i4", "int32"] | types.Int32Type, +) -> Int32Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i8", "int64"] | types.Int64Type, +) -> Int64Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u1", "uint8"] | types.UInt8Type, +) -> UInt8Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u2", "uint16"] | types.UInt16Type, +) -> UInt16Array: ... 
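A small illustration, not part of the stub itself, of the return types the array() overloads above resolve to when the element type of the input is statically known; the runtime types printed at the end mirror the annotations:

    import pyarrow as pa

    ints = pa.array([1, 2, None])                      # stubs: Int64Array (int64 inferred)
    text = pa.array(["a", "b"])                        # stubs: StringArray
    flags = pa.array([True, False], type=pa.bool_())   # explicit type: BooleanArray
    print(ints.type, text.type, flags.type)            # int64 string bool
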
+@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u4", "uint32"] | types.Uint32Type, +) -> UInt32Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u8", "uint64"] | types.UInt64Type, +) -> UInt64Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f2", "halffloat", "float16"] | types.Float16Type, +) -> HalfFloatArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f4", "float", "float32"] | types.Float32Type, +) -> FloatArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f8", "double", "float64"] | types.Float64Type, +) -> DoubleArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string", "str", "utf8"] | types.StringType, +) -> StringArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary"] | types.BinaryType, +) -> BinaryArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, +) -> LargeStringArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_binary"] | types.LargeBinaryType, +) -> LargeBinaryArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary_view"] | types.BinaryViewType, +) -> BinaryViewArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string_view"] | types.StringViewType, +) -> StringViewArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date32", "date32[day]"] | types.Date32Type, +) -> Date32Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date64", "date64[ms]"] | types.Date64Type, +) -> Date64Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], +) -> Time32Array[Literal["s"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], +) -> Time32Array[Literal["ms"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], +) -> Time64Array[Literal["us"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], +) -> Time64Array[Literal["ns"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], +) -> TimestampArray[Literal["s"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], +) -> TimestampArray[Literal["ms"]]: ... 
+@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], +) -> TimestampArray[Literal["us"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ns]"] | types.TimestampType[Literal["ns"]], +) -> TimestampArray[Literal["ns"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[s]"] | types.DurationType[Literal["s"]], +) -> DurationArray[Literal["s"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], +) -> DurationArray[Literal["ms"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[us]"] | types.DurationType[Literal["us"]], +) -> DurationArray[Literal["us"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], +) -> DurationArray[Literal["ns"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, +) -> MonthDayNanoIntervalArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: _DataTypeT, +) -> Array[Scalar[_DataTypeT]]: ... +def asarray(*args, **kwargs): + """ + Convert to pyarrow.Array, inferring type if not provided. + + Parameters + ---------- + values : array-like + This can be a sequence, numpy.ndarray, pyarrow.Array or + pyarrow.ChunkedArray. If a ChunkedArray is passed, the output will be + a ChunkedArray, otherwise the output will be a Array. + type : string or DataType + Explicitly construct the array with this type. Attempt to cast if + indicated type is different. + + Returns + ------- + arr : Array or ChunkedArray + """ + +@overload +def nulls(size: int, memory_pool: MemoryPool | None = None) -> NullArray: ... +@overload +def nulls( + size: int, type: types.NullType | None, memory_pool: MemoryPool | None = None +) -> NullArray: ... +@overload +def nulls( + size: int, type: types.BoolType, memory_pool: MemoryPool | None = None +) -> BooleanArray: ... +@overload +def nulls(size: int, type: types.Int8Type, memory_pool: MemoryPool | None = None) -> Int8Array: ... +@overload +def nulls( + size: int, type: types.Int16Type, memory_pool: MemoryPool | None = None +) -> Int16Array: ... +@overload +def nulls( + size: int, type: types.Int32Type, memory_pool: MemoryPool | None = None +) -> Int32Array: ... +@overload +def nulls( + size: int, type: types.Int64Type, memory_pool: MemoryPool | None = None +) -> Int64Array: ... +@overload +def nulls( + size: int, type: types.UInt8Type, memory_pool: MemoryPool | None = None +) -> UInt8Array: ... +@overload +def nulls( + size: int, type: types.UInt16Type, memory_pool: MemoryPool | None = None +) -> UInt16Array: ... +@overload +def nulls( + size: int, type: types.Uint32Type, memory_pool: MemoryPool | None = None +) -> UInt32Array: ... +@overload +def nulls( + size: int, type: types.UInt64Type, memory_pool: MemoryPool | None = None +) -> UInt64Array: ... +@overload +def nulls( + size: int, type: types.Float16Type, memory_pool: MemoryPool | None = None +) -> HalfFloatArray: ... 
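As a usage note for the asarray() helper documented above (its docstring carries no example): it accepts any array-like and, when a type is given, casts to it. A brief sketch, assuming NumPy is available:

    import numpy as np
    import pyarrow as pa

    a = pa.asarray(np.arange(3))                   # dtype-driven inference (int64 on most platforms)
    b = pa.asarray([1, 2, 3], type=pa.float64())   # cast to the requested type
    print(a.type, b.type)                          # e.g. int64 double
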
+@overload +def nulls( + size: int, type: types.Float32Type, memory_pool: MemoryPool | None = None +) -> FloatArray: ... +@overload +def nulls( + size: int, type: types.Float64Type, memory_pool: MemoryPool | None = None +) -> DoubleArray: ... +@overload +def nulls( + size: int, type: types.Decimal32Type, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def nulls( + size: int, type: types.Decimal64Type, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def nulls( + size: int, type: types.Decimal128Type, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def nulls( + size: int, type: types.Decimal256Type, memory_pool: MemoryPool | None = None +) -> Decimal256Array: ... +@overload +def nulls( + size: int, type: types.Date32Type, memory_pool: MemoryPool | None = None +) -> Date32Array: ... +@overload +def nulls( + size: int, type: types.Date64Type, memory_pool: MemoryPool | None = None +) -> Date64Array: ... +@overload +def nulls( + size: int, type: types.Time32Type[types._Time32Unit], memory_pool: MemoryPool | None = None +) -> Time32Array[types._Time32Unit]: ... +@overload +def nulls( + size: int, type: types.Time64Type[types._Time64Unit], memory_pool: MemoryPool | None = None +) -> Time64Array[types._Time64Unit]: ... +@overload +def nulls( + size: int, + type: types.TimestampType[types._Unit, types._Tz], + memory_pool: MemoryPool | None = None, +) -> TimestampArray[types._Unit, types._Tz]: ... +@overload +def nulls( + size: int, type: types.DurationType[types._Unit], memory_pool: MemoryPool | None = None +) -> DurationArray[types._Unit]: ... +@overload +def nulls( + size: int, type: types.MonthDayNanoIntervalType, memory_pool: MemoryPool | None = None +) -> MonthDayNanoIntervalArray: ... +@overload +def nulls( + size: int, + type: types.BinaryType, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def nulls( + size: int, + type: types.LargeBinaryType, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryArray: ... +@overload +def nulls( + size: int, + type: types.FixedSizeBinaryType, + memory_pool: MemoryPool | None = None, +) -> FixedSizeBinaryArray: ... +@overload +def nulls( + size: int, + type: types.StringType, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def nulls( + size: int, + type: types.LargeStringType, + memory_pool: MemoryPool | None = None, +) -> LargeStringArray: ... +@overload +def nulls( + size: int, + type: types.BinaryViewType, + memory_pool: MemoryPool | None = None, +) -> BinaryViewArray: ... +@overload +def nulls( + size: int, + type: types.StringViewType, + memory_pool: MemoryPool | None = None, +) -> StringViewArray: ... +@overload +def nulls( + size: int, + type: types.LargeListType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> LargeListArray[_DataTypeT]: ... +@overload +def nulls( + size: int, + type: types.ListViewType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> ListViewArray[_DataTypeT]: ... +@overload +def nulls( + size: int, + type: types.LargeListViewType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> LargeListViewArray[_DataTypeT]: ... +@overload +def nulls( + size: int, + type: types.FixedSizeListType[_DataTypeT, _Size], + memory_pool: MemoryPool | None = None, +) -> FixedSizeListArray[_DataTypeT, _Size]: ... +@overload +def nulls( + size: int, + type: types.ListType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... 
+@overload +def nulls( + size: int, + type: types.StructType, + memory_pool: MemoryPool | None = None, +) -> StructArray: ... +@overload +def nulls( + size: int, + type: types.MapType[_MapKeyT, _MapItemT], + memory_pool: MemoryPool | None = None, +) -> MapArray[_MapKeyT, _MapItemT]: ... +@overload +def nulls( + size: int, + type: types.DictionaryType[_IndexT, _BasicValueT], + memory_pool: MemoryPool | None = None, +) -> DictionaryArray[_IndexT, _BasicValueT]: ... +@overload +def nulls( + size: int, + type: types.RunEndEncodedType[_RunEndType, _BasicValueT], + memory_pool: MemoryPool | None = None, +) -> RunEndEncodedArray[_RunEndType, _BasicValueT]: ... +@overload +def nulls( + size: int, + type: types.UnionType, + memory_pool: MemoryPool | None = None, +) -> UnionArray: ... +@overload +def nulls( + size: int, + type: types.FixedShapeTensorType[types._ValueT], + memory_pool: MemoryPool | None = None, +) -> FixedShapeTensorArray[Any]: ... +@overload +def nulls( + size: int, + type: types.Bool8Type, + memory_pool: MemoryPool | None = None, +) -> Bool8Array: ... +@overload +def nulls( + size: int, + type: types.UuidType, + memory_pool: MemoryPool | None = None, +) -> UuidArray[Any]: ... +@overload +def nulls( + size: int, + type: types.JsonType, + memory_pool: MemoryPool | None = None, +) -> JsonArray[Any]: ... +@overload +def nulls( + size: int, + type: types.OpaqueType, + memory_pool: MemoryPool | None = None, +) -> OpaqueArray[Any]: ... +@overload +def nulls( + size: int, + type: types.ExtensionType, + memory_pool: MemoryPool | None = None, +) -> ExtensionArray[Any]: ... +def nulls(*args, **kwargs): + """ + Create a strongly-typed Array instance with all elements null. + + Parameters + ---------- + size : int + Array length. + type : pyarrow.DataType, default None + Explicit type for the array. By default use NullType. + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + + Returns + ------- + arr : Array + + Examples + -------- + >>> import pyarrow as pa + >>> pa.nulls(10) + + 10 nulls + + >>> pa.nulls(3, pa.uint32()) + + [ + null, + null, + null + ] + """ + +@overload +def repeat( + value: None | scalar.NullScalar, size: int, memory_pool: MemoryPool | None = None +) -> NullArray: ... +@overload +def repeat( # type: ignore[overload-overlap] + value: bool | scalar.BooleanScalar, size: int, memory_pool: MemoryPool | None = None +) -> BooleanArray: ... +@overload +def repeat( + value: scalar.Int8Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int8Array: ... +@overload +def repeat( + value: scalar.Int16Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int16Array: ... +@overload +def repeat( + value: scalar.Int32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int32Array: ... +@overload +def repeat( + value: int | scalar.Int64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int64Array: ... +@overload +def repeat( + value: scalar.UInt8Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt8Array: ... +@overload +def repeat( + value: scalar.UInt16Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt16Array: ... +@overload +def repeat( + value: scalar.UInt32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt32Array: ... +@overload +def repeat( + value: scalar.UInt64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt64Array: ... 
+@overload +def repeat( + value: scalar.HalfFloatScalar, size: int, memory_pool: MemoryPool | None = None +) -> HalfFloatArray: ... +@overload +def repeat( + value: scalar.FloatScalar, size: int, memory_pool: MemoryPool | None = None +) -> FloatArray: ... +@overload +def repeat( + value: float | scalar.DoubleScalar, size: int, memory_pool: MemoryPool | None = None +) -> DoubleArray: ... +@overload +def repeat( + value: Decimal | scalar.Decimal32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal32Array: ... +@overload +def repeat( + value: scalar.Decimal64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal64Array: ... +@overload +def repeat( + value: scalar.Decimal128Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def repeat( + value: scalar.Decimal256Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal256Array: ... +@overload +def repeat( + value: dt.date | scalar.Date32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Date32Array: ... +@overload +def repeat( + value: scalar.Date64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Date64Array: ... +@overload +def repeat( + value: scalar.Time32Scalar[types._Time32Unit], size: int, memory_pool: MemoryPool | None = None +) -> Time32Array[types._Time32Unit]: ... +@overload +def repeat( + value: dt.time | scalar.Time64Scalar[types._Time64Unit], + size: int, + memory_pool: MemoryPool | None = None, +) -> Time64Array[types._Time64Unit]: ... +@overload +def repeat( + value: scalar.TimestampScalar[types._Unit, types._Tz], + size: int, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[types._Unit, types._Tz]: ... +@overload +def repeat( + value: dt.timedelta | scalar.DurationScalar[types._Unit], + size: int, + memory_pool: MemoryPool | None = None, +) -> DurationArray[types._Unit]: ... +@overload +def repeat( # pyright: ignore[reportOverlappingOverload] + value: MonthDayNano | scalar.MonthDayNanoIntervalScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalArray: ... +@overload +def repeat( + value: bytes | scalar.BinaryScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def repeat( + value: scalar.LargeBinaryScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryArray: ... +@overload +def repeat( + value: scalar.FixedSizeBinaryScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> FixedSizeBinaryArray: ... +@overload +def repeat( + value: str | scalar.StringScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def repeat( + value: scalar.LargeStringScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeStringArray: ... +@overload +def repeat( + value: scalar.BinaryViewScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> BinaryViewArray: ... +@overload +def repeat( + value: scalar.StringViewScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> StringViewArray: ... +@overload +def repeat( + value: list[Any] | tuple[Any] | scalar.ListScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... +@overload +def repeat( + value: scalar.FixedSizeListScalar[_DataTypeT, _Size], + size: int, + memory_pool: MemoryPool | None = None, +) -> FixedSizeListArray[_DataTypeT, _Size]: ... 
+@overload +def repeat( + value: scalar.LargeListScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeListArray[_DataTypeT]: ... +@overload +def repeat( + value: scalar.ListViewScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> ListViewArray[_DataTypeT]: ... +@overload +def repeat( + value: scalar.LargeListViewScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeListViewArray[_DataTypeT]: ... +@overload +def repeat( + value: dict[str, Any] | scalar.StructScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> StructArray: ... +@overload +def repeat( + value: scalar.MapScalar[_MapKeyT, _MapItemT], + size: int, + memory_pool: MemoryPool | None = None, +) -> MapArray[_MapKeyT, _MapItemT]: ... +@overload +def repeat( + value: scalar.DictionaryScalar[_IndexT, _BasicValueT], + size: int, + memory_pool: MemoryPool | None = None, +) -> DictionaryArray[_IndexT, _BasicValueT]: ... +@overload +def repeat( + value: scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT], + size: int, + memory_pool: MemoryPool | None = None, +) -> RunEndEncodedArray[_RunEndType, _BasicValueT]: ... +@overload +def repeat( + value: scalar.UnionScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> UnionArray: ... +@overload +def repeat( + value: scalar.FixedShapeTensorScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> FixedShapeTensorArray[Any]: ... +@overload +def repeat( + value: scalar.Bool8Scalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> Bool8Array: ... +@overload +def repeat( + value: scalar.UuidScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> UuidArray[Any]: ... +@overload +def repeat( + value: scalar.JsonScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> JsonArray[Any]: ... +@overload +def repeat( + value: scalar.OpaqueScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> OpaqueArray[Any]: ... +@overload +def repeat( + value: scalar.ExtensionScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> ExtensionArray[Any]: ... +def repeat(*args, **kwargs): + """ + Create an Array instance whose slots are the given scalar. + + Parameters + ---------- + value : Scalar-like object + Either a pyarrow.Scalar or any python object coercible to a Scalar. + size : int + Number of times to repeat the scalar in the output Array. + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + + Returns + ------- + arr : Array + + Examples + -------- + >>> import pyarrow as pa + >>> pa.repeat(10, 3) + + [ + 10, + 10, + 10 + ] + + >>> pa.repeat([1, 2], 2) + + [ + [ + 1, + 2 + ], + [ + 1, + 2 + ] + ] + + >>> pa.repeat("string", 3) + + [ + "string", + "string", + "string" + ] + + >>> pa.repeat(pa.scalar({"a": 1, "b": [1, 2]}), 2) + + -- is_valid: all not null + -- child 0 type: int64 + [ + 1, + 1 + ] + -- child 1 type: list + [ + [ + 1, + 2 + ], + [ + 1, + 2 + ] + ] + """ + +def infer_type(values: Iterable[Any], mask: Mask, from_pandas: bool = False) -> DataType: + """ + Attempt to infer Arrow data type that can hold the passed Python + sequence type in an Array object + + Parameters + ---------- + values : array-like + Sequence to infer type from. + mask : ndarray (bool type), optional + Optional exclusion mask where True marks null, False non-null. + from_pandas : bool, default False + Use pandas's NA/null sentinel values for type inference. 
+ + Returns + ------- + type : DataType + """ + +class ArrayStatistics(_Weakrefable): + """ + The class for statistics of an array. + """ + @property + def null_count(self) -> int: + """ + The number of nulls. + """ + @property + def distinct_count(self) -> int: + """ + The number of distinct values. + """ + @property + def min(self) -> Any: + """ + The minimum value. + """ + @property + def is_min_exact(self) -> bool: + """ + Whether the minimum value is an exact value or not. + """ + @property + def max(self) -> Any: + """ + The maximum value. + """ + + @property + def is_max_exact(self) -> bool: + """ + Whether the maximum value is an exact value or not. + """ + +_ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series) + +class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): + def to_pandas( + self, + memory_pool: MemoryPool | None = None, + categories: list | None = None, + strings_to_categorical: bool = False, + zero_copy_only: bool = False, + integer_object_nulls: bool = False, + date_as_object: bool = True, + timestamp_as_object: bool = False, + use_threads: bool = True, + deduplicate_objects: bool = True, + ignore_metadata: bool = False, + safe: bool = True, + split_blocks: bool = False, + self_destruct: bool = False, + maps_as_pydicts: Literal["None", "lossy", "strict"] | None = None, + types_mapper: Callable[[DataType], ExtensionDtype | None] | None = None, + coerce_temporal_nanoseconds: bool = False, + ) -> _ConvertAs: + """ + Convert to a pandas-compatible NumPy array or DataFrame, as appropriate + + Parameters + ---------- + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + categories : list, default empty + List of fields that should be returned as pandas.Categorical. Only + applies to table-like data structures. + strings_to_categorical : bool, default False + Encode string (UTF8) and binary types to pandas.Categorical. + zero_copy_only : bool, default False + Raise an ArrowException if this function call would require copying + the underlying data. + integer_object_nulls : bool, default False + Cast integers with nulls to objects + date_as_object : bool, default True + Cast dates to objects. If False, convert to datetime64 dtype with + the equivalent time unit (if supported). Note: in pandas version + < 2.0, only datetime64[ns] conversion is supported. + timestamp_as_object : bool, default False + Cast non-nanosecond timestamps (np.datetime64) to objects. This is + useful in pandas version 1.x if you have timestamps that don't fit + in the normal date range of nanosecond timestamps (1678 CE-2262 CE). + Non-nanosecond timestamps are supported in pandas version 2.0. + If False, all timestamps are converted to datetime64 dtype. + use_threads : bool, default True + Whether to parallelize the conversion using multiple threads. + deduplicate_objects : bool, default True + Do not create multiple copies Python objects when created, to save + on memory use. Conversion will be slower. + ignore_metadata : bool, default False + If True, do not use the 'pandas' metadata to reconstruct the + DataFrame index, if present + safe : bool, default True + For certain data types, a cast is needed in order to store the + data in a pandas DataFrame or Series (e.g. timestamps are always + stored as nanoseconds in pandas). This option controls whether it + is a safe cast or not. 
+ split_blocks : bool, default False + If True, generate one internal "block" for each column when + creating a pandas.DataFrame from a RecordBatch or Table. While this + can temporarily reduce memory note that various pandas operations + can trigger "consolidation" which may balloon memory use. + self_destruct : bool, default False + EXPERIMENTAL: If True, attempt to deallocate the originating Arrow + memory while converting the Arrow object to pandas. If you use the + object after calling to_pandas with this option it will crash your + program. + + Note that you may not see always memory usage improvements. For + example, if multiple columns share an underlying allocation, + memory can't be freed until all columns are converted. + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. + types_mapper : function, default None + A function mapping a pyarrow DataType to a pandas ExtensionDtype. + This can be used to override the default pandas type for conversion + of built-in pyarrow types or in absence of pandas_metadata in the + Table schema. The function receives a pyarrow DataType and is + expected to return a pandas ExtensionDtype or ``None`` if the + default conversion should be used for that type. If you have + a dictionary mapping, you can pass ``dict.get`` as function. + coerce_temporal_nanoseconds : bool, default False + Only applicable to pandas version >= 2.0. + A legacy option to coerce date32, date64, duration, and timestamp + time units to nanoseconds when converting to pandas. This is the + default behavior in pandas version 1.x. Set this option to True if + you'd like to use this coercion when using pandas version >= 2.0 + for backwards compatibility (not recommended otherwise). + + Returns + ------- + pandas.Series or pandas.DataFrame depending on type of object + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + + Convert a Table to pandas DataFrame: + + >>> table = pa.table( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... 
) + >>> table.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + >>> isinstance(table.to_pandas(), pd.DataFrame) + True + + Convert a RecordBatch to pandas DataFrame: + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"]) + >>> batch + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + >>> isinstance(batch.to_pandas(), pd.DataFrame) + True + + Convert a Chunked Array to pandas Series: + + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_pandas() + 0 2 + 1 2 + 2 4 + 3 4 + 4 5 + 5 100 + dtype: int64 + >>> isinstance(n_legs.to_pandas(), pd.Series) + True + """ + +_CastAs = TypeVar("_CastAs", bound=DataType) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) +_ScalarT = TypeVar("_ScalarT", bound=Scalar) + +class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + """ + The base class for all Arrow arrays. + """ + + def diff(self, other: Self) -> str: + """ + Compare contents of this array against another one. + + Return a string containing the result of diffing this array + (on the left side) against the other array (on the right side). + + Parameters + ---------- + other : Array + The other array to compare this array with. + + Returns + ------- + diff : str + A human-readable printout of the differences. + + Examples + -------- + >>> import pyarrow as pa + >>> left = pa.array(["one", "two", "three"]) + >>> right = pa.array(["two", None, "two-and-a-half", "three"]) + >>> print(left.diff(right)) # doctest: +SKIP + + @@ -0, +0 @@ + -"one" + @@ -2, +1 @@ + +null + +"two-and-a-half" + """ + def cast( + self, + target_type: _CastAs, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_CastAs]]: + """ + Cast array values to another data type + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, default None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + + Returns + ------- + cast : Array + """ + def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: + """ + Return zero-copy "view" of array as another data type. + + The data types must have compatible columnar buffer layouts + + Parameters + ---------- + target_type : DataType + Type to construct view as. + + Returns + ------- + view : Array + """ + def sum(self, **kwargs) -> _Scalar_co: + """ + Sum the values in a numerical array. + + See :func:`pyarrow.compute.sum` for full usage. + + Parameters + ---------- + **kwargs : dict, optional + Options to pass to :func:`pyarrow.compute.sum`. + + Returns + ------- + sum : Scalar + A scalar containing the sum value. + """ + @property + def type(self: Array[Scalar[_DataTypeT]]) -> _DataTypeT: ... + def unique(self) -> Self: + """ + Compute distinct elements in array. 
+ + Returns + ------- + unique : Array + An array of the same data type, with deduplicated elements. + """ + def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: + """ + Compute dictionary-encoded representation of array. + + See :func:`pyarrow.compute.dictionary_encode` for full usage. + + Parameters + ---------- + null_encoding : str, default "mask" + How to handle null entries. + + Returns + ------- + encoded : DictionaryArray + A dictionary-encoded version of this array. + """ + def value_count(self) -> StructArray: + """ + Compute counts of unique elements in array. + + Returns + ------- + StructArray + An array of structs + """ + @overload + @staticmethod + def from_pandas( + obj: pd.Series | np.ndarray | ArrayLike, + *, + mask: Mask | None = None, + type: _DataTypeT, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_DataTypeT]]: ... + @overload + @staticmethod + def from_pandas( + obj: pd.Series | np.ndarray | ArrayLike, + *, + mask: Mask | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar]: ... + @staticmethod + def from_pandas(*args, **kwargs): + """ + Convert pandas.Series to an Arrow Array. + + This method uses Pandas semantics about what values indicate + nulls. See pyarrow.array for more general conversion from arrays or + sequences to Arrow arrays. + + Parameters + ---------- + obj : ndarray, pandas.Series, array-like + mask : array (boolean), optional + Indicate which values are null (True) or not null (False). + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred + from the data. + safe : bool, default True + Check for overflows or other unsafe conversions. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. + + Notes + ----- + Localized timestamps will currently be returned as UTC (pandas's native + representation). Timezone-naive data will be implicitly interpreted as + UTC. + + Returns + ------- + array : pyarrow.Array or pyarrow.ChunkedArray + ChunkedArray is returned if object data overflows binary buffer. + """ + @staticmethod + def from_buffers( + type: _DataTypeT, + length: int, + buffers: list[Buffer], + null_count: int = -1, + offset=0, + children: NullableCollection[Array[Scalar[_DataTypeT]]] | None = None, + ) -> Array[Scalar[_DataTypeT]]: + """ + Construct an Array from a sequence of buffers. + + The concrete type returned depends on the datatype. + + Parameters + ---------- + type : DataType + The value type of the array. + length : int + The number of values in the array. + buffers : List[Buffer] + The buffers backing this array. + null_count : int, default -1 + The number of null entries in the array. Negative value means that + the null count is not known. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + children : List[Array], default None + Nested type children with length matching type.num_fields. + + Returns + ------- + array : Array + """ + @property + def null_count(self) -> int: ... + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the array. + + In other words, the sum of bytes from all buffer + ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will be counted multiple times. 
+ + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the array. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + """ + def __sizeof__(self) -> int: ... + def __iter__(self) -> Iterator[_Scalar_co]: ... + def to_string( + self, + *, + indent: int = 2, + top_level_indent: int = 0, + window: int = 10, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: + """ + Render a "pretty-printed" string representation of the Array. + + Note: for data on a non-CPU device, the full array is copied to CPU + memory. + + Parameters + ---------- + indent : int, default 2 + How much to indent the internal items in the string to + the right, by default ``2``. + top_level_indent : int, default 0 + How much to indent right the entire content of the array, + by default ``0``. + window : int + How many primitive items to preview at the begin and end + of the array when the array is bigger than the window. + The other items will be ellipsed. + container_window : int + How many container items (such as a list in a list array) + to preview at the begin and end of the array when the array + is bigger than the window. + skip_new_lines : bool + If the array should be rendered as a single line of text + or if each element should be on its own line. + """ + format = to_string + def equals(self, other: Self) -> bool: ... + def __len__(self) -> int: ... + def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: + """ + Return BooleanArray indicating the null values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array + """ + def is_nan(self) -> BooleanArray: + """ + Return BooleanArray indicating the NaN values. + + Returns + ------- + array : boolean Array + """ + def is_valid(self) -> BooleanArray: + """ + Return BooleanArray indicating the non-null values. + """ + def fill_null( + self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType + ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: + """ + See :func:`pyarrow.compute.fill_null` for usage. + + Parameters + ---------- + fill_value : any + The replacement value for null entries. + + Returns + ------- + result : Array + A new array with nulls replaced by the given value. + """ + @overload + def __getitem__(self, key: int) -> _Scalar_co: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + def __getitem__(self, key): + """ + Slice or return value at given index + + Parameters + ---------- + key : integer or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + value : Scalar (index) or Array (slice) + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this array. + + Parameters + ---------- + offset : int, default 0 + Offset from start of array to slice. + length : int, default None + Length of slice (default is until end of Array starting from + offset). + + Returns + ------- + sliced : Array + An array with the same datatype, containing the sliced values. 
+ """ + def take(self, indices: Indices) -> Self: + """ + Select values from an array. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the array whose values will be returned. + + Returns + ------- + taken : Array + An array with the same datatype, containing the taken values. + """ + def drop_null(self) -> Self: + """ + Remove missing values from an array. + """ + def filter( + self, + mask: Mask, + *, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + ) -> Self: + """ + Select values from an array. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the array with. + null_selection_behavior : str, default "drop" + How nulls in the mask should be handled. + + Returns + ------- + filtered : Array + An array of the same type, with only the elements selected by + the boolean mask. + """ + @overload + def index( + self: Array[_ScalarT], + value: _ScalarT, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> scalar.Int64Scalar: ... + @overload + def index( + self: Array[Scalar[_BasicDataType[_AsPyType]]], + value: _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> scalar.Int64Scalar: ... + def index(self, *args, **kwargs): + """ + Find the first index of a value. + + See :func:`pyarrow.compute.index` for full usage. + + Parameters + ---------- + value : Scalar or object + The value to look for in the array. + start : int, optional + The start index where to look for `value`. + end : int, optional + The end index where to look for `value`. + memory_pool : MemoryPool, optional + A memory pool for potential memory allocations. + + Returns + ------- + index : Int64Scalar + The index of the value in the array (-1 if not found). + """ + def sort(self, order: Order = "ascending", **kwargs) -> Self: + """ + Sort the Array + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : Array + """ + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... + def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> np.ndarray: + """ + Return a NumPy view or copy of this array. + + By default, tries to return a view of this array. This is only + supported for primitive arrays with the same memory layout as NumPy + (i.e. integers, floating point, ..) and without any nulls. + + For the extension arrays, this method simply delegates to the + underlying storage array. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. 
+ + Returns + ------- + array : numpy.ndarray + """ + def to_pylist( + self: Array[Scalar[_BasicDataType[_AsPyType]]], + *, + map_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[_AsPyType | None]: + """ + Convert to a list of native Python objects. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + + Returns + ------- + lst : list + """ + tolist = to_pylist + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + @property + def offset(self) -> int: + """ + A relative position into another array's data. + + The purpose is to enable zero-copy slicing. This value defaults to zero + but must be applied on all operations with the physical storage + buffers. + """ + def buffers(self) -> list[Buffer | None]: + """ + Return a list of Buffer objects pointing to this array's physical + storage. + + To correctly interpret these buffers, you need to also apply the offset + multiplied with the size of the stored data type. + """ + def copy_to(self, destination: MemoryManager | Device) -> Self: + """ + Construct a copy of the array with all buffers on destination + device. + + This method recursively copies the array's buffers and those of its + children onto the destination MemoryManager device and returns the + new Array. + + Parameters + ---------- + destination : pyarrow.MemoryManager or pyarrow.Device + The destination device to copy the array to. + + Returns + ------- + Array + """ + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: + """ + Export to a C ArrowArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the array type + is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: + """ + Import Array from a C ArrowArray struct, given its pointer + and the imported array type. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArray struct. + type: DataType or int + Either a DataType object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_array__(self, requested_schema=None) -> Any: + """ + Get a pair of PyCapsules containing a C ArrowArray representation of the object. 
+ + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the array to this data type. + If None, the array will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowArray, + respectively. + """ + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: + """ + Export to a C ArrowDeviceArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the array type + is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: + """ + Import Array from a C ArrowDeviceArray struct, given its pointer + and the imported array type. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + type: DataType or int + Either a DataType object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + + def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: + """ + Get a pair of PyCapsules containing a C ArrowDeviceArray representation + of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the array to this data type. + If None, the array will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + kwargs + Currently no additional keyword arguments are supported, but + this method will accept any keyword with a value of ``None`` + for compatibility with future keywords. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, + respectively. + """ + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def __dlpack__(self, stream: int | None = None) -> Any: + """Export a primitive array as a DLPack capsule. + + Parameters + ---------- + stream : int, optional + A Python integer representing a pointer to a stream. Currently not supported. + Stream is provided by the consumer to the producer to instruct the producer + to ensure that operations can safely be performed on the array. + + Returns + ------- + capsule : PyCapsule + A DLPack capsule for the array, pointing to a DLManagedTensor. + """ + def __dlpack_device__(self) -> tuple[int, int]: + """ + Return the DLPack device tuple this arrays resides on. + + Returns + ------- + tuple : Tuple[int, int] + Tuple with index specifying the type of the device (where + CPU = 1, see cpp/src/arrow/c/dpack_abi.h) and index of the + device which is 0 by default for CPU. + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + The device type where the array resides. 
+ + Returns + ------- + DeviceAllocationType + """ + + @property + def is_cpu(self) -> bool: + """ + Whether the array is CPU-accessible. + """ + @property + def statistics(self) -> ArrayStatistics | None: + """ + Statistics of the array. + """ + +class NullArray(Array[scalar.NullScalar]): ... + +class BooleanArray(Array[scalar.BooleanScalar]): + @property + def false_count(self) -> int: ... + @property + def true_count(self) -> int: ... + +class NumericArray(Array[_ScalarT]): ... +class IntegerArray(NumericArray[_ScalarT]): ... +class FloatingPointArray(NumericArray[_ScalarT]): ... +class Int8Array(IntegerArray[scalar.Int8Scalar]): ... +class UInt8Array(IntegerArray[scalar.UInt8Scalar]): ... +class Int16Array(IntegerArray[scalar.Int16Scalar]): ... +class UInt16Array(IntegerArray[scalar.UInt16Scalar]): ... +class Int32Array(IntegerArray[scalar.Int32Scalar]): ... +class UInt32Array(IntegerArray[scalar.UInt32Scalar]): ... +class Int64Array(IntegerArray[scalar.Int64Scalar]): ... +class UInt64Array(IntegerArray[scalar.UInt64Scalar]): ... +class Date32Array(NumericArray[scalar.Date32Scalar]): ... +class Date64Array(NumericArray[scalar.Date64Scalar]): ... +class TimestampArray(NumericArray[scalar.TimestampScalar[types._Unit, types._Tz]]): ... +class Time32Array(NumericArray[scalar.Time32Scalar[types._Time32Unit]]): ... +class Time64Array(NumericArray[scalar.Time64Scalar[types._Time64Unit]]): ... +class DurationArray(NumericArray[scalar.DurationScalar[types._Unit]]): ... +class MonthDayNanoIntervalArray(Array[scalar.MonthDayNanoIntervalScalar]): ... +class HalfFloatArray(FloatingPointArray[scalar.HalfFloatScalar]): ... +class FloatArray(FloatingPointArray[scalar.FloatScalar]): ... +class DoubleArray(FloatingPointArray[scalar.DoubleScalar]): ... +class FixedSizeBinaryArray(Array[scalar.FixedSizeBinaryScalar]): ... +class Decimal32Array(FixedSizeBinaryArray): ... +class Decimal64Array(FixedSizeBinaryArray): ... +class Decimal128Array(FixedSizeBinaryArray): ... +class Decimal256Array(FixedSizeBinaryArray): ... + +class BaseListArray(Array[_ScalarT]): + def flatten(self, recursive: bool = False) -> Array: ... + def value_parent_indices(self) -> Int64Array: ... + def value_lengths(self) -> Int32Array: ... + +class ListArray(BaseListArray[_ScalarT]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[int], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.Int64Type]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[float], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.Float64Type]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[str], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.StringType]]: ... 
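+    # NOTE (illustrative, not part of the stub API): the from_arrays()
+    # overloads above and below narrow the list element type from the `values`
+    # argument. Assuming these stubs, a type checker should infer, e.g.:
+    #
+    #     import pyarrow as pa
+    #     la = pa.ListArray.from_arrays([0, 2, 4], [1, 2, 3, 4])
+    #     # expected static type: ListArray[ListScalar[Int64Type]]
+    #     # runtime type: list<item: int64>, i.e. [[1, 2], [3, 4]]
+    #
+    # Runtime behaviour follows the docstring below (nulls in `offsets` become
+    # null lists).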
+ @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[bytes], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.BinaryType]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list, + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: Array | list, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct ListArray from arrays of int32 offsets and values. + + Parameters + ---------- + offsets : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_array : ListArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 2, 4]) + >>> pa.ListArray.from_arrays(offsets, values) + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + >>> # nulls in the offsets array become null lists + >>> offsets = pa.array([0, None, 2, 4]) + >>> pa.ListArray.from_arrays(offsets, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + @property + def values(self) -> Array: + """ + Return the underlying array of values which backs the ListArray + ignoring the array's offset. + + If any of the list elements are null, but are backed by a + non-empty sub-list, those elements will be included in the + output. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's offset. + + Returns + ------- + values : Array + + See Also + -------- + ListArray.flatten : ... + + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, 4, None, 6]]) + >>> array.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + If an array is sliced, the slice still uses the same + underlying data as the original array, just with an + offset. Since values ignores the offset, the values are the + same: + + >>> sliced = array.slice(1, 2) + >>> sliced + + [ + null, + [ + 3, + 4, + null, + 6 + ] + ] + >>> sliced.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + """ + @property + def offsets(self) -> Int32Array: + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, 4, 5]]) + >>> array.offsets + + [ + 0, + 2, + 2, + 5 + ] + """ + +class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListArray[_DataTypeT]: ... 
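+    # NOTE (illustrative, not part of the stub API): LargeListArray mirrors
+    # ListArray but uses 64-bit offsets, hence the Int64Array offsets in the
+    # overloads above and below. A minimal runtime sketch assuming these stubs:
+    #
+    #     import pyarrow as pa
+    #     values = pa.array([1, 2, 3, 4])
+    #     offsets = pa.array([0, 2, 4], type=pa.int64())
+    #     lla = pa.LargeListArray.from_arrays(offsets, values)
+    #     # expected static type: LargeListArray[Int64Type]
+    #     # runtime type: large_list<item: int64>, i.e. [[1, 2], [3, 4]]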
+ @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListArray[_DataTypeT]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct LargeListArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_array : LargeListArray + """ + @property + def values(self) -> Array: + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + If any of the list elements are null, but are backed by a + non-empty sub-list, those elements will be included in the + output. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... + + Examples + -------- + + The values include null elements from the sub-lists: + + >>> import pyarrow as pa + >>> array = pa.array( + ... [[1, 2], None, [3, 4, None, 6]], + ... type=pa.large_list(pa.int32()), + ... ) + >>> array.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + If an array is sliced, the slice still uses the same + underlying data as the original array, just with an + offset. Since values ignores the offset, the values are the + same: + + >>> sliced = array.slice(1, 2) + >>> sliced + + [ + null, + [ + 3, + 4, + null, + 6 + ] + ] + >>> sliced.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + """ + @property + def offsets(self) -> Int64Array: + """ + Return the list offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + """ + +class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListViewArray[_DataTypeT]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array, + values: Array, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListViewArray[_DataTypeT]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct ListViewArray from arrays of int32 offsets, sizes, and values. + + Parameters + ---------- + offsets : Array (int32 type) + sizes : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). 
+ + Returns + ------- + list_view_array : ListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + @property + def values(self) -> Array: + """ + Return the underlying array of values which backs the ListViewArray + ignoring the array's offset and sizes. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + Examples + -------- + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + @property + def offsets(self) -> Int32Array: + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + @property + def sizes(self) -> Int32Array: + """ + Return the list sizes as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + sizes : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + +class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListViewArray[_DataTypeT]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListViewArray[_DataTypeT]: ... 
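+    # NOTE (illustrative, not part of the stubs proper): list-view arrays are
+    # addressed by independent offsets *and* sizes, so views may overlap or be
+    # out of order. A runtime sketch matching the from_arrays() docstring below:
+    #
+    #     import pyarrow as pa
+    #     values = pa.array([1, 2, 3, 4])
+    #     offsets = pa.array([0, 1, 2], type=pa.int64())
+    #     sizes = pa.array([2, 2, 2], type=pa.int64())
+    #     arr = pa.LargeListViewArray.from_arrays(offsets, sizes, values)
+    #     # -> [[1, 2], [2, 3], [3, 4]]; the views share the same values buffer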
+ @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct LargeListViewArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + sizes : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_view_array : LargeListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + @property + def values(self) -> Array: + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... + + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + @property + def offsets(self) -> Int64Array: + """ + Return the list view offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + @property + def sizes(self) -> Int64Array: + """ + Return the list view sizes as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. 
+ + Returns + ------- + sizes : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + +class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _Size]]): + @overload + @classmethod + def from_arrays( + cls, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + mask: Mask | None = None, + ) -> FixedSizeListArray[_DataTypeT, None]: ... + @overload + @classmethod + def from_arrays( + cls, + values: Array[Scalar[_DataTypeT]], + limit_size: _Size, + *, + type: None = None, + mask: Mask | None = None, + ) -> FixedSizeListArray[_DataTypeT, _Size]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct FixedSizeListArray from array of values and a list length. + + Parameters + ---------- + values : Array (any type) + list_size : int + The fixed length of the lists. + type : DataType, optional + If not specified, a default ListType with the values' type and + `list_size` length is used. + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + + Returns + ------- + FixedSizeListArray + + Examples + -------- + + Create from a values array and a list size: + + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> arr = pa.FixedSizeListArray.from_arrays(values, 2) + >>> arr + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + + Or create from a values array, list size and matching type: + + >>> typ = pa.list_(pa.field("values", pa.int64()), 2) + >>> arr = pa.FixedSizeListArray.from_arrays(values, type=typ) + >>> arr + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + """ + @property + def values(self) -> BaseListArray[scalar.ListScalar[_DataTypeT]]: + """ + Return the underlying array of values which backs the + FixedSizeListArray. + + Note even null elements are included. + + Compare with :meth:`flatten`, which returns only the non-null + sub-list values. + + Returns + ------- + values : Array + + See Also + -------- + FixedSizeListArray.flatten : ... + + Examples + -------- + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, None]], type=pa.list_(pa.int32(), 2)) + >>> array.values + + [ + 1, + 2, + null, + null, + 3, + null + ] + + """ + +_MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) +_MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) + +class MapArray(ListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + keys: Array[Scalar[_MapKeyT]], + items: Array[Scalar[_MapItemT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> MapArray[_MapKeyT, _MapItemT]: ... + @overload + @classmethod + def from_arrays( # pyright: ignore[reportIncompatibleMethodOverride] + cls, + offsets: Int64Array, + values: Array, + *, + type: MapType[_MapKeyT, _MapItemT], + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> MapArray[_MapKeyT, _MapItemT]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): # pyright: ignore[reportIncompatibleMethodOverride] + """ + Construct MapArray from arrays of int32 offsets and key, item arrays. 
+ + Parameters + ---------- + offsets : array-like or sequence (int32 type) + keys : array-like or sequence (any type) + items : array-like or sequence (any type) + type : DataType, optional + If not specified, a default MapArray with the keys' and items' type is used. + pool : MemoryPool + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + map_array : MapArray + + Examples + -------- + First, let's understand the structure of our dataset when viewed in a rectangular data model. + The total of 5 respondents answered the question "How much did you like the movie x?". + The value -1 in the integer array means that the value is missing. The boolean array + represents the null bitmask corresponding to the missing values in the integer array. + + >>> import pyarrow as pa + >>> movies_rectangular = np.ma.masked_array( + ... [[10, -1, -1], [8, 4, 5], [-1, 10, 3], [-1, -1, -1], [-1, -1, -1]], + ... [ + ... [False, True, True], + ... [False, False, False], + ... [True, False, False], + ... [True, True, True], + ... [True, True, True], + ... ], + ... ) + + To represent the same data with the MapArray and from_arrays, the data is + formed like this: + + >>> offsets = [ + ... 0, # -- row 1 start + ... 1, # -- row 2 start + ... 4, # -- row 3 start + ... 6, # -- row 4 start + ... 6, # -- row 5 start + ... 6, # -- row 5 end + ... ] + >>> movies = [ + ... "Dark Knight", # ---------------------------------- row 1 + ... "Dark Knight", + ... "Meet the Parents", + ... "Superman", # -- row 2 + ... "Meet the Parents", + ... "Superman", # ----------------- row 3 + ... ] + >>> likings = [ + ... 10, # -------- row 1 + ... 8, + ... 4, + ... 5, # --- row 2 + ... 10, + ... 3, # ------ row 3 + ... ] + >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() + 0 [(Dark Knight, 10)] + 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... + 2 [(Meet the Parents, 10), (Superman, 3)] + 3 [] + 4 [] + dtype: object + + If the data in the empty rows needs to be marked as missing, it's possible + to do so by modifying the offsets argument, so that we specify `None` as + the starting positions of the rows we want marked as missing. The end row + offset still has to refer to the existing value from keys (and values): + + >>> offsets = [ + ... 0, # ----- row 1 start + ... 1, # ----- row 2 start + ... 4, # ----- row 3 start + ... None, # -- row 4 start + ... None, # -- row 5 start + ... 6, # ----- row 5 end + ... ] + >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() + 0 [(Dark Knight, 10)] + 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... + 2 [(Meet the Parents, 10), (Superman, 3)] + 3 None + 4 None + dtype: object + """ + @property + def keys(self) -> Array: + """Flattened array of keys across all maps in array""" + @property + def items(self) -> Array: + """Flattened array of items across all maps in array""" + +class UnionArray(Array[scalar.UnionScalar]): + @deprecated("Use fields() instead") + def child(self, pos: int) -> Field: + """ + DEPRECATED, use field() instead. + + Parameters + ---------- + pos : int + The physical index of the union child field (not its type code). + + Returns + ------- + field : pyarrow.Field + The given child field. + """ + def field(self, pos: int) -> Array: + """ + Return the given child field as an individual array. + + For sparse unions, the returned array has its offset, length, + and null count adjusted. + + For dense unions, the returned array is unchanged. 
+ + Parameters + ---------- + pos : int + The physical index of the union child field (not its type code). + + Returns + ------- + field : Array + The given child field. + """ + @property + def type_codes(self) -> Int8Array: + """Get the type codes array.""" + @property + def offsets(self) -> Int32Array: + """ + Get the value offsets array (dense arrays only). + + Does not account for any slice offset. + """ + @staticmethod + def from_dense( + type: Int8Array, + value_offsets: Int32Array, + children: NullableCollection[Array], + field_names: list[str] | None = None, + type_codes: Int8Array | None = None, + ) -> UnionArray: + """ + Construct dense UnionArray from arrays of int8 types, int32 offsets and + children arrays + + Parameters + ---------- + types : Array (int8 type) + value_offsets : Array (int32 type) + children : list + field_names : list + type_codes : list + + Returns + ------- + union_array : UnionArray + """ + @staticmethod + def from_sparse( + types: Int8Array, + children: NullableCollection[Array], + field_names: list[str] | None = None, + type_codes: Int8Array | None = None, + ) -> UnionArray: + """ + Construct sparse UnionArray from arrays of int8 types and children + arrays + + Parameters + ---------- + types : Array (int8 type) + children : list + field_names : list + type_codes : list + + Returns + ------- + union_array : UnionArray + """ + +class StringArray(Array[scalar.StringScalar]): + @staticmethod + def from_buffers( # type: ignore[override] + length: int, + value_offsets: Buffer, + data: Buffer, + null_bitmap: Buffer | None = None, + null_count: int | None = -1, + offset: int | None = 0, + ) -> StringArray: + """ + Construct a StringArray from value_offsets and data buffers. + If there are nulls in the data, also a null_bitmap and the matching + null_count must be passed. + + Parameters + ---------- + length : int + value_offsets : Buffer + data : Buffer + null_bitmap : Buffer, optional + null_count : int, default 0 + offset : int, default 0 + + Returns + ------- + string_array : StringArray + """ + +class LargeStringArray(Array[scalar.LargeStringScalar]): + @staticmethod + def from_buffers( # type: ignore[override] + length: int, + value_offsets: Buffer, + data: Buffer, + null_bitmap: Buffer | None = None, + null_count: int | None = -1, + offset: int | None = 0, + ) -> StringArray: + """ + Construct a LargeStringArray from value_offsets and data buffers. + If there are nulls in the data, also a null_bitmap and the matching + null_count must be passed. + + Parameters + ---------- + length : int + value_offsets : Buffer + data : Buffer + null_bitmap : Buffer, optional + null_count : int, default 0 + offset : int, default 0 + + Returns + ------- + string_array : StringArray + """ + +class StringViewArray(Array[scalar.StringViewScalar]): ... + +class BinaryArray(Array[scalar.BinaryScalar]): + @property + def total_values_length(self) -> int: + """ + The number of bytes from beginning to end of the data buffer addressed + by the offsets of this BinaryArray. + """ + +class LargeBinaryArray(Array[scalar.LargeBinaryScalar]): + @property + def total_values_length(self) -> int: + """ + The number of bytes from beginning to end of the data buffer addressed + by the offsets of this LargeBinaryArray. + """ + +class BinaryViewArray(Array[scalar.BinaryViewScalar]): ... + +class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): + def dictionary_encode(self) -> Self: ... 
# type: ignore[override] + def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: + """ + Decodes the DictionaryArray to an Array. + """ + @property + def indices(self) -> Array[Scalar[_IndexT]]: ... + @property + def dictionary(self) -> Array[Scalar[_BasicValueT]]: ... + @staticmethod + def from_buffers( # type: ignore[override] + type: _BasicValueT, + length: int, + buffers: list[Buffer], + dictionary: Array | np.ndarray | pd.Series, + null_count: int = -1, + offset: int = 0, + ) -> DictionaryArray[Any, _BasicValueT]: + """ + Construct a DictionaryArray from buffers. + + Parameters + ---------- + type : pyarrow.DataType + length : int + The number of values in the array. + buffers : List[Buffer] + The buffers backing the indices array. + dictionary : pyarrow.Array, ndarray or pandas.Series + The array of values referenced by the indices. + null_count : int, default -1 + The number of null entries in the indices array. Negative value means that + the null count is not known. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + + Returns + ------- + dict_array : DictionaryArray + """ + @staticmethod + def from_arrays( + indices: Indices, + dictionary: Array | np.ndarray | pd.Series, + mask: np.ndarray | pd.Series | BooleanArray | None = None, + ordered: bool = False, + from_pandas: bool = False, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> DictionaryArray: + """ + Construct a DictionaryArray from indices and values. + + Parameters + ---------- + indices : pyarrow.Array, numpy.ndarray or pandas.Series, int type + Non-negative integers referencing the dictionary values by zero + based index. + dictionary : pyarrow.Array, ndarray or pandas.Series + The array of values referenced by the indices. + mask : ndarray or pandas.Series, bool type + True values indicate that indices are actually null. + ordered : bool, default False + Set to True if the category values are ordered. + from_pandas : bool, default False + If True, the indices should be treated as though they originated in + a pandas.Categorical (null encoded as -1). + safe : bool, default True + If True, check that the dictionary indices are in range. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise uses default pool. + + Returns + ------- + dict_array : DictionaryArray + """ + +class StructArray(Array[scalar.StructScalar]): + def field(self, index: int | str) -> Array: + """ + Retrieves the child array belonging to field. + + Parameters + ---------- + index : Union[int, str] + Index / position or name of the field. + + Returns + ------- + result : Array + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: + """ + Return one individual array for each field in the struct. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Returns + ------- + result : List[Array] + """ + @staticmethod + def from_arrays( + arrays: Iterable[Array], + names: list[str] | None = None, + fields: list[Field] | None = None, + mask=None, + memory_pool: MemoryPool | None = None, + type: types.StructType | None = None, + ) -> StructArray: + """ + Construct StructArray from collection of arrays representing + each field in the struct. + + Either field names, field instances or a struct type must be passed. 
+ + Parameters + ---------- + arrays : sequence of Array + names : List[str] (optional) + Field names for each struct child. + fields : List[Field] (optional) + Field instances for each struct child. + mask : pyarrow.Array[bool] (optional) + Indicate which values are null (True) or not null (False). + memory_pool : MemoryPool (optional) + For memory allocations, if required, otherwise uses default pool. + type : pyarrow.StructType (optional) + Struct type for name and type of each child. + + Returns + ------- + result : StructArray + """ + def sort(self, order: Order = "ascending", by: str | None = None, **kwargs) -> StructArray: + """ + Sort the StructArray + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + by : str or None, default None + If to sort the array by one of its fields + or by the whole array. + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : StructArray + """ + +class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT]]): + @overload + @staticmethod + def from_arrays( + run_ends: Int16Array, + values: Array, + type: DataType | None = None, + ) -> RunEndEncodedArray[types.Int16Type, _BasicValueT]: ... + @overload + @staticmethod + def from_arrays( + run_ends: Int32Array, + values: Array, + type: DataType | None = None, + ) -> RunEndEncodedArray[types.Int32Type, _BasicValueT]: ... + @overload + @staticmethod + def from_arrays( + run_ends: Int64Array, + values: Array, + type: DataType | None = None, + ) -> RunEndEncodedArray[types.Int64Type, _BasicValueT]: ... + @staticmethod + def from_arrays(*args, **kwargs): + """ + Construct RunEndEncodedArray from run_ends and values arrays. + + Parameters + ---------- + run_ends : Array (int16, int32, or int64 type) + The run_ends array. + values : Array (any type) + The values array. + type : pyarrow.DataType, optional + The run_end_encoded(run_end_type, value_type) array type. + + Returns + ------- + RunEndEncodedArray + """ + @staticmethod + def from_buffers( # pyright: ignore[reportIncompatibleMethodOverride] + type: DataType, + length: int, + buffers: list[Buffer], + null_count: int = -1, + offset=0, + children: tuple[Array, Array] | None = None, + ) -> RunEndEncodedArray[Any, _BasicValueT]: + """ + Construct a RunEndEncodedArray from all the parameters that make up an + Array. + + RunEndEncodedArrays do not have buffers, only children arrays, but this + implementation is needed to satisfy the Array interface. + + Parameters + ---------- + type : DataType + The run_end_encoded(run_end_type, value_type) type. + length : int + The logical length of the run-end encoded array. Expected to match + the last value of the run_ends array (children[0]) minus the offset. + buffers : List[Buffer] + Empty List or [None]. + null_count : int, default -1 + The number of null entries in the array. Run-end encoded arrays + are specified to not have valid bits and null_count always equals 0. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + children : List[Array] + Nested type children containing the run_ends and values arrays. + + Returns + ------- + RunEndEncodedArray + """ + @property + def run_ends(self) -> Array[scalar.Scalar[_RunEndType]]: + """ + An array holding the logical indexes of each run-end. + + The physical offset to the array is applied. 
+ """ + @property + def values(self) -> Array[scalar.Scalar[_BasicValueT]]: + """ + An array holding the values of each run. + + The physical offset to the array is applied. + """ + def find_physical_offset(self) -> int: + """ + Find the physical offset of this REE array. + + This is the offset of the run that contains the value of the first + logical element of this array considering its offset. + + This function uses binary-search, so it has a O(log N) cost. + """ + def find_physical_length(self) -> int: + """ + Find the physical length of this REE array. + + The physical length of an REE is the number of physical values (and + run-ends) necessary to represent the logical range of values from offset + to length. + + This function uses binary-search, so it has a O(log N) cost. + """ + +_ArrayT = TypeVar("_ArrayT", bound=Array) + +class ExtensionArray(Array[scalar.ExtensionScalar], Generic[_ArrayT]): + @property + def storage(self) -> Any: ... + @staticmethod + def from_storage(typ: types.BaseExtensionType, storage: _ArrayT) -> ExtensionArray[_ArrayT]: + """ + Construct ExtensionArray from type and storage array. + + Parameters + ---------- + typ : DataType + The extension type for the result array. + storage : Array + The underlying storage for the result array. + + Returns + ------- + ext_array : ExtensionArray + """ + +class JsonArray(ExtensionArray[_ArrayT]): + """ + Concrete class for Arrow arrays of JSON data type. + + This does not guarantee that the JSON data actually + is valid JSON. + + Examples + -------- + Define the extension type for JSON array + + >>> import pyarrow as pa + >>> json_type = pa.json_(pa.large_utf8()) + + Create an extension array + + >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] + >>> storage = pa.array(arr, pa.large_utf8()) + >>> pa.ExtensionArray.from_storage(json_type, storage) + + [ + null, + "{ "id":30, "values":["a", "b"] }" + ] + """ + +class UuidArray(ExtensionArray[_ArrayT]): ... + +class FixedShapeTensorArray(ExtensionArray[_ArrayT]): + """ + Concrete class for fixed shape tensor extension arrays. + + Examples + -------- + Define the extension type for tensor array + + >>> import pyarrow as pa + >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) + + Create an extension array + + >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] + >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) + >>> pa.ExtensionArray.from_storage(tensor_type, storage) + + [ + [ + 1, + 2, + 3, + 4 + ], + [ + 10, + 20, + 30, + 40 + ], + [ + 100, + 200, + 300, + 400 + ] + ] + """ + + def to_numpy_ndarray(self) -> np.ndarray: + """ + Convert fixed shape tensor extension array to a multi-dimensional numpy.ndarray. + + The resulting ndarray will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + Ndarray representing tensors in the fixed shape tensor array concatenated + along the first dimension. + """ + def to_tensor(self) -> Tensor: + """ + Convert fixed shape tensor extension array to a pyarrow.Tensor. + + The resulting Tensor will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. 
+ + Returns + ------- + pyarrow.Tensor + Tensor representing tensors in the fixed shape tensor array concatenated + along the first dimension. + """ + + @classmethod + def from_numpy_ndarray(cls, obj: np.ndarray) -> Self: + """ + Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. + The first dimension of ndarray will become the length of the fixed + shape tensor array. + If input array data is not contiguous a copy will be made. + + Parameters + ---------- + obj : numpy.ndarray + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32) + >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + + [ + [ + 1, + 2, + 3, + 4, + 5, + 6 + ], + [ + 1, + 2, + 3, + 4, + 5, + 6 + ] + ] + """ + +class OpaqueArray(ExtensionArray[_ArrayT]): + """ + Concrete class for opaque extension arrays. + + Examples + -------- + Define the extension type for an opaque array + + >>> import pyarrow as pa + >>> opaque_type = pa.opaque( + ... pa.binary(), + ... type_name="geometry", + ... vendor_name="postgis", + ... ) + + Create an extension array + + >>> arr = [None, b"data"] + >>> storage = pa.array(arr, pa.binary()) + >>> pa.ExtensionArray.from_storage(opaque_type, storage) + + [ + null, + 64617461 + ] + """ + +class Bool8Array(ExtensionArray): + """ + Concrete class for bool8 extension arrays. + + Examples + -------- + Define the extension type for an bool8 array + + >>> import pyarrow as pa + >>> bool8_type = pa.bool8() + + Create an extension array + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> pa.ExtensionArray.from_storage(bool8_type, storage) + + [ + -1, + 0, + 1, + 2, + null + ] + """ + + def to_numpy(self, zero_copy_only: bool = ..., writable: bool = ...) -> np.ndarray: + """ + Return a NumPy bool view or copy of this array. + + By default, tries to return a view of this array. This is only + supported for arrays without any nulls. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + + Returns + ------- + array : numpy.ndarray + """ + @classmethod + def from_storage(cls, storage: Int8Array) -> Self: # type: ignore[override] + """ + Construct Bool8Array from Int8Array storage. + + Parameters + ---------- + storage : Int8Array + The underlying storage for the result array. + + Returns + ------- + bool8_array : Bool8Array + """ + @classmethod + def from_numpy(cls, obj: np.ndarray) -> Self: + """ + Convert numpy array to a bool8 extension array without making a copy. + The input array must be 1-dimensional, with either bool_ or int8 dtype. + + Parameters + ---------- + obj : numpy.ndarray + + Returns + ------- + bool8_array : Bool8Array + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array([True, False, True], dtype=np.bool_) + >>> pa.Bool8Array.from_numpy(arr) + + [ + 1, + 0, + 1 + ] + """ + +def concat_arrays(arrays: Iterable[_ArrayT], memory_pool: MemoryPool | None = None) -> _ArrayT: + """ + Concatenate the given arrays. + + The contents of the input arrays are copied into the returned array. 
+ + Raises + ------ + ArrowInvalid + If not all of the arrays have the same type. + + Parameters + ---------- + arrays : iterable of pyarrow.Array + Arrays to concatenate, must be identically typed. + memory_pool : MemoryPool, default None + For memory allocations. If None, the default pool is used. + + Examples + -------- + >>> import pyarrow as pa + >>> arr1 = pa.array([2, 4, 5, 100]) + >>> arr2 = pa.array([2, 4]) + >>> pa.concat_arrays([arr1, arr2]) + + [ + 2, + 4, + 5, + 100, + 2, + 4 + ] + + """ + +def _empty_array(type: _DataTypeT) -> Array[scalar.Scalar[_DataTypeT]]: + """ + Create empty array of the given type. + """ + +__all__ = [ + "array", + "asarray", + "nulls", + "repeat", + "infer_type", + "_PandasConvertible", + "Array", + "NullArray", + "BooleanArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "FixedSizeBinaryArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "BaseListArray", + "ListArray", + "LargeListArray", + "ListViewArray", + "LargeListViewArray", + "FixedSizeListArray", + "MapArray", + "UnionArray", + "StringArray", + "LargeStringArray", + "StringViewArray", + "BinaryArray", + "LargeBinaryArray", + "BinaryViewArray", + "DictionaryArray", + "StructArray", + "RunEndEncodedArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "FixedShapeTensorArray", + "concat_arrays", + "_empty_array", +] diff --git a/python/pyarrow/__lib_pxi/benchmark.pyi b/python/pyarrow/__lib_pxi/benchmark.pyi new file mode 100644 index 00000000000..66981bf0f51 --- /dev/null +++ b/python/pyarrow/__lib_pxi/benchmark.pyi @@ -0,0 +1 @@ +def benchmark_PandasObjectIsNull(list) -> None: ... # noqa: N802 diff --git a/python/pyarrow/__lib_pxi/builder.pyi b/python/pyarrow/__lib_pxi/builder.pyi new file mode 100644 index 00000000000..4a0e9ca4708 --- /dev/null +++ b/python/pyarrow/__lib_pxi/builder.pyi @@ -0,0 +1,89 @@ +from typing import Iterable + +from pyarrow.lib import MemoryPool, _Weakrefable + +from .array import StringArray, StringViewArray + +class StringBuilder(_Weakrefable): + """ + Builder class for UTF8 strings. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string'). + """ + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | None): + """ + Append a single value to the builder. + + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + def append_values(self, values: Iterable[str | bytes | None]): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + def finish(self) -> StringArray: + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... 
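+
+# Illustrative usage sketch for the builder documented above (comment only, so
+# it does not affect the stub; everything other than the StringBuilder API is
+# arbitrary example data):
+#
+#     from pyarrow.lib import StringBuilder
+#
+#     builder = StringBuilder()
+#     builder.append("foo")
+#     builder.append(None)                  # appends a null
+#     builder.append_values(["bar", "baz"])
+#     arr = builder.finish()                # StringArray of length 4, one null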
+ +class StringViewBuilder(_Weakrefable): + """ + Builder class for UTF8 string views. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string_view'). + """ + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | None): + """ + Append a single value to the builder. + + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + def append_values(self, values: Iterable[str | bytes | None]): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + def finish(self) -> StringViewArray: + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + +__all__ = ["StringBuilder", "StringViewBuilder"] diff --git a/python/pyarrow/__lib_pxi/compat.pyi b/python/pyarrow/__lib_pxi/compat.pyi new file mode 100644 index 00000000000..ae667be453e --- /dev/null +++ b/python/pyarrow/__lib_pxi/compat.pyi @@ -0,0 +1,5 @@ +def encode_file_path(path: str | bytes) -> bytes: ... +def tobytes(o: str | bytes) -> bytes: ... +def frombytes(o: bytes, *, safe: bool = False): ... + +__all__ = ["encode_file_path", "tobytes", "frombytes"] diff --git a/python/pyarrow/__lib_pxi/config.pyi b/python/pyarrow/__lib_pxi/config.pyi new file mode 100644 index 00000000000..166e10c9734 --- /dev/null +++ b/python/pyarrow/__lib_pxi/config.pyi @@ -0,0 +1,41 @@ +from typing import NamedTuple + +class VersionInfo(NamedTuple): + major: int + minor: int + patch: int + +class BuildInfo(NamedTuple): + version: str + version_info: VersionInfo + so_version: str + full_so_version: str + compiler_id: str + compiler_version: str + compiler_flags: str + git_id: str + git_description: str + package_kind: str + build_type: str + +class RuntimeInfo(NamedTuple): + simd_level: str + detected_simd_level: str + +cpp_build_info: BuildInfo +cpp_version: str +cpp_version_info: VersionInfo + +def runtime_info() -> RuntimeInfo: ... +def set_timezone_db_path(path: str) -> None: ... + +__all__ = [ + "VersionInfo", + "BuildInfo", + "RuntimeInfo", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "set_timezone_db_path", +] diff --git a/python/pyarrow/__lib_pxi/device.pyi b/python/pyarrow/__lib_pxi/device.pyi new file mode 100644 index 00000000000..d1b9f39eedd --- /dev/null +++ b/python/pyarrow/__lib_pxi/device.pyi @@ -0,0 +1,88 @@ +import enum + +from pyarrow.lib import _Weakrefable + +class DeviceAllocationType(enum.Flag): + CPU = enum.auto() + CUDA = enum.auto() + CUDA_HOST = enum.auto() + OPENCL = enum.auto() + VULKAN = enum.auto() + METAL = enum.auto() + VPI = enum.auto() + ROCM = enum.auto() + ROCM_HOST = enum.auto() + EXT_DEV = enum.auto() + CUDA_MANAGED = enum.auto() + ONEAPI = enum.auto() + WEBGPU = enum.auto() + HEXAGON = enum.auto() + +class Device(_Weakrefable): + """ + Abstract interface for hardware devices + + This object represents a device with access to some memory spaces. + When handling a Buffer or raw memory address, it allows deciding in which + context the raw memory address should be interpreted + (e.g. 
CPU-accessible memory, or embedded memory on some particular GPU). + """ + + @property + def type_name(self) -> str: + """ + A shorthand for this device's type. + """ + @property + def device_id(self) -> int: + """ + A device ID to identify this device if there are multiple of this type. + + If there is no "device_id" equivalent (such as for the main CPU device on + non-numa systems) returns -1. + """ + @property + def is_cpu(self) -> bool: + """ + Whether this device is the main CPU device. + + This shorthand method is very useful when deciding whether a memory address + is CPU-accessible. + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + Return the DeviceAllocationType of this device. + """ + +class MemoryManager(_Weakrefable): + """ + An object that provides memory management primitives. + + A MemoryManager is always tied to a particular Device instance. + It can also have additional parameters (such as a MemoryPool to + allocate CPU memory). + + """ + @property + def device(self) -> Device: + """ + The device this MemoryManager is tied to. + """ + @property + def is_cpu(self) -> bool: + """ + Whether this MemoryManager is tied to the main CPU device. + + This shorthand method is very useful when deciding whether a memory + address is CPU-accessible. + """ + +def default_cpu_memory_manager() -> MemoryManager: + """ + Return the default CPU MemoryManager instance. + + The returned singleton instance uses the default MemoryPool. + """ + +__all__ = ["DeviceAllocationType", "Device", "MemoryManager", "default_cpu_memory_manager"] diff --git a/python/pyarrow/__lib_pxi/error.pyi b/python/pyarrow/__lib_pxi/error.pyi new file mode 100644 index 00000000000..981ed51e680 --- /dev/null +++ b/python/pyarrow/__lib_pxi/error.pyi @@ -0,0 +1,53 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +class ArrowException(Exception): ... +class ArrowInvalid(ValueError, ArrowException): ... +class ArrowMemoryError(MemoryError, ArrowException): ... +class ArrowKeyError(KeyError, ArrowException): ... +class ArrowTypeError(TypeError, ArrowException): ... +class ArrowNotImplementedError(NotImplementedError, ArrowException): ... +class ArrowCapacityError(ArrowException): ... +class ArrowIndexError(IndexError, ArrowException): ... +class ArrowSerializationError(ArrowException): ... + +class ArrowCancelled(ArrowException): + signum: int | None + def __init__(self, message: str, signum: int | None = None) -> None: ... + +ArrowIOError = IOError + +class StopToken: ... + +def enable_signal_handlers(enable: bool) -> None: ... + +have_signal_refcycle: bool + +class SignalStopHandler: + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, exc_tb) -> None: ... + def __dealloc__(self) -> None: ... + @property + def stop_token(self) -> StopToken: ... 
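+
+# Illustrative sketch (comment only): because the classes above also inherit
+# from the corresponding builtin exceptions, Arrow errors can be caught either
+# by their Arrow name or by the builtin base class:
+#
+#     import pyarrow as pa
+#
+#     try:
+#         pa.array(["not an int"], type=pa.int64())
+#     except pa.ArrowInvalid:   # an `except ValueError` clause would also match
+#         ...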
+ +__all__ = [ + "ArrowException", + "ArrowInvalid", + "ArrowMemoryError", + "ArrowKeyError", + "ArrowTypeError", + "ArrowNotImplementedError", + "ArrowCapacityError", + "ArrowIndexError", + "ArrowSerializationError", + "ArrowCancelled", + "ArrowIOError", + "StopToken", + "enable_signal_handlers", + "have_signal_refcycle", + "SignalStopHandler", +] diff --git a/python/pyarrow/__lib_pxi/io.pyi b/python/pyarrow/__lib_pxi/io.pyi new file mode 100644 index 00000000000..d882fd79d57 --- /dev/null +++ b/python/pyarrow/__lib_pxi/io.pyi @@ -0,0 +1,1474 @@ +import sys + +from collections.abc import Callable +from io import IOBase + +from _typeshed import StrPath + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from typing import Any, Literal, SupportsIndex, overload + +from pyarrow._stubs_typing import Compression, SupportPyBuffer +from pyarrow.lib import MemoryPool, _Weakrefable + +from .device import Device, DeviceAllocationType, MemoryManager +from .types import KeyValueMetadata + +def have_libhdfs() -> bool: + """ + Return true if HDFS (HadoopFileSystem) library is set up correctly. + """ + +def io_thread_count() -> int: + """ + Return the number of threads to use for I/O operations. + + Many operations, such as scanning a dataset, will implicitly make + use of this pool. The number of threads is set to a fixed value at + startup. It can be modified at runtime by calling + :func:`set_io_thread_count()`. + + See Also + -------- + set_io_thread_count : Modify the size of this pool. + cpu_count : The analogous function for the CPU thread pool. + """ + +def set_io_thread_count(count: int) -> None: + """ + Set the number of threads to use for I/O operations. + + Many operations, such as scanning a dataset, will implicitly make + use of this pool. + + Parameters + ---------- + count : int + The max number of threads that may be used for I/O. + Must be positive. + + See Also + -------- + io_thread_count : Get the size of this pool. + set_cpu_count : The analogous function for the CPU thread pool. + """ + +Mode: TypeAlias = Literal["rb", "wb", "rb+", "ab"] + +class NativeFile(_Weakrefable): + """ + The base class for all Arrow streams. + + Streams are either readable, writable, or both. + They optionally support seeking. + + While this class exposes methods to read or write data from Python, the + primary intent of using a Arrow stream is to pass it to other Arrow + facilities that will make use of it, such as Arrow IPC routines. + + Be aware that there are subtle differences with regular Python files, + e.g. destroying a writable Arrow stream without closing it explicitly + will not flush any pending data. + """ + + _default_chunk_size: int + + def __enter__(self) -> Self: ... + def __exit__(self, *args) -> None: ... + @property + def mode(self) -> Mode: + """ + The file mode. Currently instances of NativeFile may support: + + * rb: binary read + * wb: binary write + * rb+: binary read and write + * ab: binary append + """ + def readable(self) -> bool: ... + def seekable(self) -> bool: ... + def isatty(self) -> bool: ... + def fileno(self) -> int: ... + @property + def closed(self) -> bool: ... + def close(self) -> None: ... 
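+
+    # Illustrative sketch (comment only; "data.bin" is an arbitrary example
+    # path): NativeFile subclasses are usually used as context managers so the
+    # stream is flushed and closed on exit:
+    #
+    #     import pyarrow as pa
+    #
+    #     with pa.OSFile("data.bin", "wb") as f:
+    #         f.write(b"hello")
+    #     with pa.OSFile("data.bin", "rb") as f:
+    #         f.read()   # b'hello'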
+ def size(self) -> int: + """ + Return file size + """ + def metadata(self) -> KeyValueMetadata: + """ + Return file metadata + """ + def tell(self) -> int: + """ + Return current stream position + """ + def seek(self, position: int, whence: int = 0) -> int: + """ + Change current file stream position + + Parameters + ---------- + position : int + Byte offset, interpreted relative to value of whence argument + whence : int, default 0 + Point of reference for seek offset + + Notes + ----- + Values of whence: + * 0 -- start of stream (the default); offset should be zero or positive + * 1 -- current stream position; offset may be negative + * 2 -- end of stream; offset is usually negative + + Returns + ------- + int + The new absolute stream position. + """ + def flush(self) -> None: + """ + Flush the stream, if applicable. + + An error is raised if stream is not writable. + """ + def write(self, data: bytes | SupportPyBuffer) -> int: + """ + Write data to the file. + + Parameters + ---------- + data : bytes-like object or exporter of buffer protocol + + Returns + ------- + int + nbytes: number of bytes written + """ + def read(self, nbytes: int | None = None) -> bytes: + """ + Read and return up to n bytes. + + If *nbytes* is None, then the entire remaining file contents are read. + + Parameters + ---------- + nbytes : int, default None + + Returns + ------- + data : bytes + """ + def get_stream(self, file_offset: int, nbytes: int) -> Self: + """ + Return an input stream that reads a file segment independent of the + state of the file. + + Allows reading portions of a random access file as an input stream + without interfering with each other. + + Parameters + ---------- + file_offset : int + nbytes : int + + Returns + ------- + stream : NativeFile + """ + def read_at(self) -> bytes: + """ + Read indicated number of bytes at offset from the file + + Parameters + ---------- + nbytes : int + offset : int + + Returns + ------- + data : bytes + """ + def read1(self) -> bytes: + """Read and return up to n bytes. + + Unlike read(), if *nbytes* is None then a chunk is read, not the + entire file. + + Parameters + ---------- + nbytes : int, default None + The maximum number of bytes to read. + + Returns + ------- + data : bytes + """ + def readall(self) -> bytes: ... + def readinto(self, b: SupportPyBuffer) -> int: + """ + Read into the supplied buffer + + Parameters + ---------- + b : buffer-like object + A writable buffer object (such as a bytearray). + + Returns + ------- + written : int + number of bytes written + """ + + def readline(self, size: int | None = None) -> bytes: + """Read and return a line of bytes from the file. + + If size is specified, read at most size bytes. + + Line terminator is always b"\\n". + + Parameters + ---------- + size : int + maximum number of bytes read + """ + def readlines(self, hint: int | None = None) -> list[bytes]: + """Read lines of the file + + Parameters + ---------- + hint : int + maximum number of bytes read until we stop + """ + def __iter__(self) -> Self: ... + def __next__(self) -> bytes: ... + def read_buffer(self, nbytes: int | None = None) -> Buffer: + """ + Read from buffer. + + Parameters + ---------- + nbytes : int, optional + maximum number of bytes read + """ + def truncate(self) -> None: ... + def writelines(self, lines: list[bytes]): + """ + Write lines to the file. 
+ + Parameters + ---------- + lines : iterable + Iterable of bytes-like objects or exporters of buffer protocol + """ + def download(self, stream_or_path: StrPath | IOBase, buffer_size: int | None = None) -> None: + """ + Read this file completely to a local path or destination stream. + + This method first seeks to the beginning of the file. + + Parameters + ---------- + stream_or_path : str or file-like object + If a string, a local file path to write to; otherwise, + should be a writable stream. + buffer_size : int, optional + The buffer size to use for data transfers. + """ + def upload(self, stream: IOBase, buffer_size: int | None) -> None: + """ + Write from a source stream to this file. + + Parameters + ---------- + stream : file-like object + Source stream to pipe to this file. + buffer_size : int, optional + The buffer size to use for data transfers. + """ + +# ---------------------------------------------------------------------- +# Python file-like objects + +class PythonFile(NativeFile): + """ + A stream backed by a Python file object. + + This class allows using Python file objects with arbitrary Arrow + functions, including functions written in another language than Python. + + As a downside, there is a non-zero redirection cost in translating + Arrow stream calls to Python method calls. Furthermore, Python's + Global Interpreter Lock may limit parallelism in some situations. + + Examples + -------- + >>> import io + >>> import pyarrow as pa + >>> pa.PythonFile(io.BytesIO()) + + + Create a stream for writing: + + >>> buf = io.BytesIO() + >>> f = pa.PythonFile(buf, mode="w") + >>> f.writable() + True + >>> f.write(b"PythonFile") + 10 + >>> buf.getvalue() + b'PythonFile' + >>> f.close() + >>> f + + + Create a stream for reading: + + >>> buf = io.BytesIO(b"PythonFile") + >>> f = pa.PythonFile(buf, mode="r") + >>> f.mode + 'rb' + >>> f.read() + b'PythonFile' + >>> f + + >>> f.close() + >>> f + + """ + def __init__(self, handle: IOBase, mode: Literal["r", "w"] | None = None) -> None: ... + def truncate(self, pos: int | None = None) -> None: + """ + Parameters + ---------- + pos : int, optional + """ + +class MemoryMappedFile(NativeFile): + """ + A stream that represents a memory-mapped file. + + Supports 'r', 'r+', 'w' modes. + + Examples + -------- + Create a new file with memory map: + + >>> import pyarrow as pa + >>> mmap = pa.create_memory_map("example_mmap.dat", 10) + >>> mmap + + >>> mmap.close() + + Open an existing file with memory map: + + >>> with pa.memory_map("example_mmap.dat") as mmap: + ... mmap + + """ + @classmethod + def create(cls, path: str, size: int) -> Self: + """ + Create a MemoryMappedFile + + Parameters + ---------- + path : str + Where to create the file. + size : int + Size of the memory mapped file. + """ + def _open(self, path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r"): ... + def resize(self, new_size: int) -> None: + """ + Resize the map and underlying file. + + Parameters + ---------- + new_size : new size in bytes + """ + +def memory_map( + path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r" +) -> MemoryMappedFile: + """ + Open memory map at file path. Size of the memory map cannot change. + + Parameters + ---------- + path : str + mode : {'r', 'r+', 'w'}, default 'r' + Whether the file is opened for reading ('r'), writing ('w') + or both ('r+'). 
+ + Returns + ------- + mmap : MemoryMappedFile + + Examples + -------- + Reading from a memory map without any memory allocation or copying: + + >>> import pyarrow as pa + >>> with pa.output_stream("example_mmap.txt") as stream: + ... stream.write(b"Constructing a buffer referencing the mapped memory") + 51 + >>> with pa.memory_map("example_mmap.txt") as mmap: + ... mmap.read_at(6, 45) + b'memory' + """ + +create_memory_map = MemoryMappedFile.create + +class OSFile(NativeFile): + """ + A stream backed by a regular file descriptor. + + Examples + -------- + Create a new file to write to: + + >>> import pyarrow as pa + >>> with pa.OSFile("example_osfile.arrow", mode="w") as f: + ... f.writable() + ... f.write(b"OSFile") + ... f.seekable() + True + 6 + False + + Open the file to read: + + >>> with pa.OSFile("example_osfile.arrow", mode="r") as f: + ... f.mode + ... f.read() + 'rb' + b'OSFile' + + Open the file to append: + + >>> with pa.OSFile("example_osfile.arrow", mode="ab") as f: + ... f.mode + ... f.write(b" is super!") + 'ab' + 10 + >>> with pa.OSFile("example_osfile.arrow") as f: + ... f.read() + b'OSFile is super!' + + Inspect created OSFile: + + >>> pa.OSFile("example_osfile.arrow") + + """ + def __init__( + self, + path: str, + mode: Literal["r", "rb", "w", "wb", "a", "ab"], + memory_pool: MemoryPool | None = None, + ) -> None: ... + +class FixedSizeBufferWriter(NativeFile): + """ + A stream writing to a Arrow buffer. + + Examples + -------- + Create a stream to write to ``pyarrow.Buffer``: + + >>> import pyarrow as pa + >>> buf = pa.allocate_buffer(5) + >>> with pa.output_stream(buf) as stream: + ... stream.write(b"abcde") + ... stream + 5 + + + Inspect the buffer: + + >>> buf.to_pybytes() + b'abcde' + >>> buf + + """ + def __init__(self, buffer: Buffer) -> None: ... + def set_memcopy_threads(self, num_threads: int) -> None: ... + def set_memcopy_blocksize(self, blocksize: int) -> None: ... + def set_memcopy_threshold(self, threshold: int) -> None: ... + +# ---------------------------------------------------------------------- +# Arrow buffers + +class Buffer(_Weakrefable): + """ + The base class for all Arrow buffers. + + A buffer represents a contiguous memory area. Many buffers will own + their memory, though not all of them do. + """ + def __len__(self) -> int: ... + def _assert_cpu(self) -> None: ... + @property + def size(self) -> int: + """ + The buffer size in bytes. + """ + @property + def address(self) -> int: + """ + The buffer's address, as an integer. + + The returned address may point to CPU or device memory. + Use `is_cpu()` to disambiguate. + """ + def hex(self) -> bytes: + """ + Compute hexadecimal representation of the buffer. + + Returns + ------- + : bytes + """ + @property + def is_mutable(self) -> bool: + """ + Whether the buffer is mutable. + """ + @property + def is_cpu(self) -> bool: + """ + Whether the buffer is CPU-accessible. + """ + @property + def device(self) -> Device: + """ + The device where the buffer resides. + + Returns + ------- + Device + """ + @property + def memory_manager(self) -> MemoryManager: + """ + The memory manager associated with the buffer. + + Returns + ------- + MemoryManager + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + The device type where the buffer resides. + + Returns + ------- + DeviceAllocationType + """ + @property + def parent(self) -> Buffer | None: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + @overload + def __getitem__(self, key: int) -> int: ... 
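+
+    # Illustrative sketch (comment only): buffers are typically created from
+    # Python bytes-like objects with pa.py_buffer() and sliced without copying:
+    #
+    #     import pyarrow as pa
+    #
+    #     buf = pa.py_buffer(b"abcdef")
+    #     view = buf[2:4]        # zero-copy, equivalent to buf.slice(2, 2)
+    #     view.to_pybytes()      # b'cd'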
+ def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Slice this buffer. Memory is not copied. + + You can also use the Python slice notation ``buffer[start:stop]``. + + Parameters + ---------- + offset : int, default 0 + Offset from start of buffer to slice. + length : int, default None + Length of slice (default is until end of Buffer starting from + offset). + + Returns + ------- + sliced : Buffer + A logical view over this buffer. + """ + def equals(self, other: Self) -> bool: + """ + Determine if two buffers contain exactly the same data. + + Parameters + ---------- + other : Buffer + + Returns + ------- + are_equal : bool + True if buffer contents and size are equal + """ + def __reduce_ex__(self, protocol: SupportsIndex) -> str | tuple[Any, ...]: ... + def to_pybytes(self) -> bytes: + """ + Return this buffer as a Python bytes object. Memory is copied. + """ + def __buffer__(self, flags: int, /) -> memoryview: ... + +class ResizableBuffer(Buffer): + """ + A base class for buffers that can be resized. + """ + + def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: + """ + Resize buffer to indicated size. + + Parameters + ---------- + new_size : int + New size of buffer (padding may be added internally). + shrink_to_fit : bool, default False + If this is true, the buffer is shrunk when new_size is less + than the current size. + If this is false, the buffer is never shrunk. + """ + +@overload +def allocate_buffer(size: int, memory_pool: MemoryPool | None = None) -> Buffer: ... +@overload +def allocate_buffer( + size: int, memory_pool: MemoryPool | None, resizable: Literal[False] +) -> Buffer: ... +@overload +def allocate_buffer( + size: int, memory_pool: MemoryPool | None, resizable: Literal[True] +) -> ResizableBuffer: ... +def allocate_buffer(*args, **kwargs): + """ + Allocate a mutable buffer. + + Parameters + ---------- + size : int + Number of bytes to allocate (plus internal padding) + memory_pool : MemoryPool, optional + The pool to allocate memory from. + If not given, the default memory pool is used. + resizable : bool, default False + If true, the returned buffer is resizable. + + Returns + ------- + buffer : Buffer or ResizableBuffer + """ + +# ---------------------------------------------------------------------- +# Arrow Stream +class BufferOutputStream(NativeFile): + """ + An output stream that writes to a resizable buffer. + + The buffer is produced as a result when ``getvalue()`` is called. + + Examples + -------- + Create an output stream, write data to it and finalize it with + ``getvalue()``: + + >>> import pyarrow as pa + >>> f = pa.BufferOutputStream() + >>> f.write(b"pyarrow.Buffer") + 14 + >>> f.closed + False + >>> f.getvalue() + + >>> f.closed + True + """ + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def getvalue(self) -> Buffer: + """ + Finalize output stream and return result as pyarrow.Buffer. + + Returns + ------- + value : Buffer + """ + +class MockOutputStream(NativeFile): ... + +class BufferReader(NativeFile): + """ + Zero-copy reader from objects convertible to Arrow buffer. + + Parameters + ---------- + obj : Python bytes or pyarrow.Buffer + + Examples + -------- + Create an Arrow input stream and inspect it: + + >>> import pyarrow as pa + >>> data = b"reader data" + >>> buf = memoryview(data) + >>> with pa.input_stream(buf) as stream: + ... stream.size() + ... stream.read(6) + ... stream.seek(7) + ... 
stream.read(15)
+    11
+    b'reader'
+    7
+    b'data'
+    """
+    def __init__(self, obj) -> None: ...
+
+class CompressedInputStream(NativeFile):
+    """
+    An input stream wrapper which decompresses data on the fly.
+
+    Parameters
+    ----------
+    stream : string, path, pyarrow.NativeFile, or file-like object
+        Input stream object to wrap with the compression.
+    compression : str
+        The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd").
+
+    Examples
+    --------
+    Create an output stream which compresses the data:
+
+    >>> import pyarrow as pa
+    >>> data = b"Compressed stream"
+    >>> raw = pa.BufferOutputStream()
+    >>> with pa.CompressedOutputStream(raw, "gzip") as compressed:
+    ...     compressed.write(data)
+    17
+
+    Create an input stream with decompression referencing the
+    buffer with compressed data:
+
+    >>> cdata = raw.getvalue()
+    >>> with pa.input_stream(cdata, compression="gzip") as compressed:
+    ...     compressed.read()
+    b'Compressed stream'
+
+    which actually translates to the use of ``BufferReader`` and
+    ``CompressedInputStream``:
+
+    >>> raw = pa.BufferReader(cdata)
+    >>> with pa.CompressedInputStream(raw, "gzip") as compressed:
+    ...     compressed.read()
+    b'Compressed stream'
+    """
+
+    def __init__(
+        self,
+        stream: StrPath | NativeFile | IOBase,
+        compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"],
+    ) -> None: ...
+
+class CompressedOutputStream(NativeFile):
+    """
+    An output stream wrapper which compresses data on the fly.
+
+    Parameters
+    ----------
+    stream : string, path, pyarrow.NativeFile, or file-like object
+        Input stream object to wrap with the compression.
+    compression : str
+        The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd").
+
+    Examples
+    --------
+    Create an output stream which compresses the data:
+
+    >>> import pyarrow as pa
+    >>> data = b"Compressed stream"
+    >>> raw = pa.BufferOutputStream()
+    >>> with pa.CompressedOutputStream(raw, "gzip") as compressed:
+    ...     compressed.write(data)
+    17
+    """
+    def __init__(
+        self,
+        stream: StrPath | NativeFile | IOBase,
+        compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"],
+    ) -> None: ...
+
+class BufferedInputStream(NativeFile):
+    """
+    An input stream that performs buffered reads from
+    an unbuffered input stream, which can mitigate the overhead
+    of many small reads in some cases.
+
+    Parameters
+    ----------
+    stream : NativeFile
+        The input stream to wrap with the buffer
+    buffer_size : int
+        Size of the temporary read buffer.
+    memory_pool : MemoryPool
+        The memory pool used to allocate the buffer.
+    """
+    def __init__(
+        self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None
+    ) -> None: ...
+    def detach(self) -> NativeFile:
+        """
+        Release the raw InputStream.
+        Further operations on this stream are invalid.
+
+        Returns
+        -------
+        raw : NativeFile
+            The underlying raw input stream
+        """
+
+class BufferedOutputStream(NativeFile):
+    """
+    An output stream that performs buffered writes to
+    an unbuffered output stream, which can mitigate the overhead
+    of many small writes in some cases.
+
+    Parameters
+    ----------
+    stream : NativeFile
+        The writable output stream to wrap with the buffer
+    buffer_size : int
+        Size of the buffer that should be added.
+    memory_pool : MemoryPool
+        The memory pool used to allocate the buffer.
+    """
+    def __init__(
+        self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None
+    ) -> None: ...
+    def detach(self) -> NativeFile:
+        """
+        Flush any buffered writes and release the raw OutputStream.
+ Further operations on this stream are invalid. + + Returns + ------- + raw : NativeFile + The underlying raw output stream. + """ + +class TransformInputStream(NativeFile): + """ + Transform an input stream. + + Parameters + ---------- + stream : NativeFile + The stream to transform. + transform_func : callable + The transformation to apply. + """ + def __init__(self, stream: NativeFile, transform_func: Callable[[Buffer], Any]) -> None: ... + +class Transcoder: + def __init__(self, decoder, encoder) -> None: ... + def __call__(self, buf: Buffer): ... + +def transcoding_input_stream( + stream: NativeFile, src_encoding: str, dest_encoding: str +) -> TransformInputStream: + """ + Add a transcoding transformation to the stream. + Incoming data will be decoded according to ``src_encoding`` and + then re-encoded according to ``dest_encoding``. + + Parameters + ---------- + stream : NativeFile + The stream to which the transformation should be applied. + src_encoding : str + The codec to use when reading data. + dest_encoding : str + The codec to use for emitted data. + """ + +def py_buffer(obj: SupportPyBuffer) -> Buffer: + """ + Construct an Arrow buffer from a Python bytes-like or buffer-like object + + Parameters + ---------- + obj : object + the object from which the buffer should be constructed. + """ + +def foreign_buffer(address: int, size: int, base: Any | None = None) -> Buffer: + """ + Construct an Arrow buffer with the given *address* and *size*. + + The buffer will be optionally backed by the Python *base* object, if given. + The *base* object will be kept alive as long as this buffer is alive, + including across language boundaries (for example if the buffer is + referenced by C++ code). + + Parameters + ---------- + address : int + The starting address of the buffer. The address can + refer to both device or host memory but it must be + accessible from device after mapping it with + `get_device_address` method. + size : int + The size of device buffer in bytes. + base : {None, object} + Object that owns the referenced memory. + """ + +def as_buffer(o: Buffer | SupportPyBuffer) -> Buffer: ... + +# --------------------------------------------------------------------- + +class CacheOptions(_Weakrefable): + """ + Cache options for a pre-buffered fragment scan. + + Parameters + ---------- + hole_size_limit : int, default 8KiB + The maximum distance in bytes between two consecutive ranges; beyond + this value, ranges are not combined. + range_size_limit : int, default 32MiB + The maximum size in bytes of a combined range; if combining two + consecutive ranges would produce a range of a size greater than this, + they are not combined + lazy : bool, default True + lazy = false: request all byte ranges when PreBuffer or WillNeed is called. + lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader + needs them. + lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the + range that is currently being read. + prefetch_limit : int, default 0 + The maximum number of ranges to be prefetched. This is only used for + lazy cache to asynchronously read some ranges after reading the target + range. + """ + + hole_size_limit: int + range_size_limit: int + lazy: bool + prefetch_limit: int + def __init__( + self, + *, + hole_size_limit: int | None = None, + range_size_limit: int | None = None, + lazy: bool = True, + prefetch_limit: int = 0, + ) -> None: ... 
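+
+    # Illustrative sketch (comment only; the numbers are arbitrary): construct
+    # cache options to tune read coalescing, typically for pre-buffered scans
+    # of remote files:
+    #
+    #     import pyarrow as pa
+    #
+    #     opts = pa.CacheOptions(
+    #         hole_size_limit=4 * 2**10,     # merge ranges closer than 4 KiB
+    #         range_size_limit=16 * 2**20,   # cap merged ranges at 16 MiB
+    #         lazy=True,
+    #     )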
+ @classmethod + def from_network_metrics( + cls, + time_to_first_byte_millis: int, + transfer_bandwidth_mib_per_sec: int, + ideal_bandwidth_utilization_frac: float = 0.9, + max_ideal_request_size_mib: int = 64, + ) -> Self: + """ + Create suitable CacheOptions based on provided network metrics. + + Typically this will be used with object storage solutions like Amazon S3, + Google Cloud Storage and Azure Blob Storage. + + Parameters + ---------- + time_to_first_byte_millis : int + Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call + setup latency of a new read request. The value is a positive integer. + transfer_bandwidth_mib_per_sec : int + Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a positive + integer. + ideal_bandwidth_utilization_frac : int, default 0.9 + Transfer bandwidth utilization fraction (per connection) to maximize the net + data load. The value is a positive float less than 1. + max_ideal_request_size_mib : int, default 64 + The maximum single data request size (in MiB) to maximize the net data load. + + Returns + ------- + CacheOptions + """ + +class Codec(_Weakrefable): + """ + Compression codec. + + Parameters + ---------- + compression : str + Type of compression codec to initialize, valid values are: 'gzip', + 'bz2', 'brotli', 'lz4' (or 'lz4_frame'), 'lz4_raw', 'zstd' and + 'snappy'. + compression_level : int, None + Optional parameter specifying how aggressively to compress. The + possible ranges and effect of this parameter depend on the specific + codec chosen. Higher values compress more but typically use more + resources (CPU/RAM). Some codecs support negative values. + + gzip + The compression_level maps to the memlevel parameter of + deflateInit2. Higher levels use more RAM but are faster + and should have higher compression ratios. + + bz2 + The compression level maps to the blockSize100k parameter of + the BZ2_bzCompressInit function. Higher levels use more RAM + but are faster and should have higher compression ratios. + + brotli + The compression level maps to the BROTLI_PARAM_QUALITY + parameter. Higher values are slower and should have higher + compression ratios. + + lz4/lz4_frame/lz4_raw + The compression level parameter is not supported and must + be None + + zstd + The compression level maps to the compressionLevel parameter + of ZSTD_initCStream. Negative values are supported. Higher + values are slower and should have higher compression ratios. + + snappy + The compression level parameter is not supported and must + be None + + + Raises + ------ + ValueError + If invalid compression value is passed. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.Codec.is_available("gzip") + True + >>> codec = pa.Codec("gzip") + >>> codec.name + 'gzip' + >>> codec.compression_level + 9 + """ + def __init__(self, compression: Compression, compression_level: int | None = None) -> None: ... + @classmethod + def detect(cls, path: StrPath) -> Self: + """ + Detect and instantiate compression codec based on file extension. + + Parameters + ---------- + path : str, path-like + File-path to detect compression from. + + Raises + ------ + TypeError + If the passed value is not path-like. + ValueError + If the compression can't be detected from the path. + + Returns + ------- + Codec + """ + @staticmethod + def is_available(compression: Compression) -> bool: + """ + Returns whether the compression support has been built and enabled. 
+ + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + + Returns + ------- + bool + """ + @staticmethod + def supports_compression_level(compression: Compression) -> int: + """ + Returns true if the compression level parameter is supported + for the given codec. + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ + @staticmethod + def default_compression_level(compression: Compression) -> int: + """ + Returns the compression level that Arrow will use for the codec if + None is specified. + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ + @staticmethod + def minimum_compression_level(compression: Compression) -> int: + """ + Returns the smallest valid value for the compression level + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ + @staticmethod + def maximum_compression_level(compression: Compression) -> int: + """ + Returns the largest valid value for the compression level + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ + @property + def name(self) -> Compression: + """Returns the name of the codec""" + @property + def compression_level(self) -> int: + """Returns the compression level parameter of the codec""" + @overload + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, + ) -> bytes: ... + def compress(self, *args, **kwargs): + """ + Compress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol + asbytes : bool, default False + Return result as Python bytes object, otherwise Buffer + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any + + Returns + ------- + compressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + @overload + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, + ) -> bytes: ... + def decompress(self, *args, **kwargs): + """ + Decompress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or memoryview-compatible object + decompressed_size : int, default None + Size of the decompressed result + asbytes : boolean, default False + Return result as Python bytes object, otherwise Buffer + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any. 
+ + Returns + ------- + uncompressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + +@overload +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, +) -> bytes: ... +def compress(*args, **kwargs): + """ + Compress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol + codec : str, default 'lz4' + Compression codec. + Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'} + asbytes : bool, default False + Return result as Python bytes object, otherwise Buffer. + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any. + + Returns + ------- + compressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + +@overload +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, +) -> bytes: ... +def decompress(*args, **kwargs): + """ + Decompress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or memoryview-compatible object + Input object to decompress data from. + decompressed_size : int, default None + Size of the decompressed result + codec : str, default 'lz4' + Compression codec. + Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'} + asbytes : bool, default False + Return result as Python bytes object, otherwise Buffer. + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any. + + Returns + ------- + uncompressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + +def input_stream( + source: StrPath | Buffer | IOBase, + compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", + buffer_size: int | None = None, +) -> BufferReader: + """ + Create an Arrow input stream. + + Parameters + ---------- + source : str, Path, buffer, or file-like object + The source to open for reading. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly decompression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. + Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). + buffer_size : int, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary read buffer. 
+ + Examples + -------- + Create a readable BufferReader (NativeFile) from a Buffer or a memoryview object: + + >>> import pyarrow as pa + >>> buf = memoryview(b"some data") + >>> with pa.input_stream(buf) as stream: + ... stream.read(4) + b'some' + + Create a readable OSFile (NativeFile) from a string or file path: + + >>> import gzip + >>> with gzip.open("example.gz", "wb") as f: + ... f.write(b"some data") + 9 + >>> with pa.input_stream("example.gz") as stream: + ... stream.read() + b'some data' + + Create a readable PythonFile (NativeFile) from a Python file object: + + >>> with open("example.txt", mode="w") as f: + ... f.write("some text") + 9 + >>> with pa.input_stream("example.txt") as stream: + ... stream.read(6) + b'some t' + """ + +def output_stream( + source: StrPath | Buffer | IOBase, + compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", + buffer_size: int | None = None, +) -> NativeFile: + """ + Create an Arrow output stream. + + Parameters + ---------- + source : str, Path, buffer, file-like object + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. + Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). + buffer_size : int, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + + Examples + -------- + Create a writable NativeFile from a pyarrow Buffer: + + >>> import pyarrow as pa + >>> data = b"buffer data" + >>> empty_obj = bytearray(11) + >>> buf = pa.py_buffer(empty_obj) + >>> with pa.output_stream(buf) as stream: + ... stream.write(data) + 11 + >>> with pa.input_stream(buf) as stream: + ... stream.read(6) + b'buffer' + + or from a memoryview object: + + >>> buf = memoryview(empty_obj) + >>> with pa.output_stream(buf) as stream: + ... stream.write(data) + 11 + >>> with pa.input_stream(buf) as stream: + ... stream.read() + b'buffer data' + + Create a writable NativeFile from a string or file path: + + >>> with pa.output_stream("example_second.txt") as stream: + ... stream.write(b"Write some data") + 15 + >>> with pa.input_stream("example_second.txt") as stream: + ...
stream.read() + b'Write some data' + """ + +__all__ = [ + "have_libhdfs", + "io_thread_count", + "set_io_thread_count", + "NativeFile", + "PythonFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "OSFile", + "FixedSizeBufferWriter", + "Buffer", + "ResizableBuffer", + "allocate_buffer", + "BufferOutputStream", + "MockOutputStream", + "BufferReader", + "CompressedInputStream", + "CompressedOutputStream", + "BufferedInputStream", + "BufferedOutputStream", + "TransformInputStream", + "Transcoder", + "transcoding_input_stream", + "py_buffer", + "foreign_buffer", + "as_buffer", + "CacheOptions", + "Codec", + "compress", + "decompress", + "input_stream", + "output_stream", +] diff --git a/python/pyarrow/__lib_pxi/ipc.pyi b/python/pyarrow/__lib_pxi/ipc.pyi new file mode 100644 index 00000000000..3d72892061e --- /dev/null +++ b/python/pyarrow/__lib_pxi/ipc.pyi @@ -0,0 +1,705 @@ +import enum +import sys + +from io import IOBase + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Iterable, Iterator, Literal, Mapping, NamedTuple + +import pandas as pd + +from pyarrow._stubs_typing import SupportArrowStream, SupportPyBuffer +from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable + +from .io import Buffer, Codec, NativeFile +from .types import DictionaryMemo, KeyValueMetadata + +class MetadataVersion(enum.IntEnum): + V1 = enum.auto() + V2 = enum.auto() + V3 = enum.auto() + V4 = enum.auto() + V5 = enum.auto() + +class WriteStats(NamedTuple): + """IPC write statistics + + Parameters + ---------- + num_messages : int + Number of messages. + num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. + num_replaced_dictionaries : int + Number of replaced dictionaries. + """ + + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + +class ReadStats(NamedTuple): + """IPC read statistics + + Parameters + ---------- + num_messages : int + Number of messages. + num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. + num_replaced_dictionaries : int + Number of replaced dictionaries. + """ + + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + +class IpcReadOptions(_Weakrefable): + """ + Serialization options for reading IPC format. + + Parameters + ---------- + ensure_native_endian : bool, default True + Whether to convert incoming data to platform-native endianness. + use_threads : bool + Whether to use the global CPU thread pool to parallelize any + computational tasks like decompression + included_fields : list + If empty (the default), return all deserialized fields. + If non-empty, the values are the indices of fields to read on + the top-level schema + """ + + ensure_native_endian: bool + use_threads: bool + included_fields: list[int] + def __init__( + self, + *, + ensure_native_endian: bool = True, + use_threads: bool = True, + included_fields: list[int] | None = None, + ) -> None: ... + +class IpcWriteOptions(_Weakrefable): + """ + Serialization options for the IPC format. 
+ + Parameters + ---------- + metadata_version : MetadataVersion, default MetadataVersion.V5 + The metadata version to write. V5 is the current and latest, + V4 is the pre-1.0 metadata version (with incompatible Union layout). + allow_64bit : bool, default False + If true, allow field lengths that don't fit in a signed 32-bit int. + use_legacy_format : bool, default False + Whether to use the pre-Arrow 0.15 IPC format. + compression : str, Codec, or None + compression codec to use for record batch buffers. + If None then batch buffers will be uncompressed. + Must be "lz4", "zstd" or None. + To specify a compression_level use `pyarrow.Codec` + use_threads : bool + Whether to use the global CPU thread pool to parallelize any + computational tasks like compression. + emit_dictionary_deltas : bool + Whether to emit dictionary deltas. Default is false for maximum + stream compatibility. + unify_dictionaries : bool + If true then calls to write_table will attempt to unify dictionaries + across all batches in the table. This can help avoid the need for + replacement dictionaries (which the file format does not support) + but requires computing the unified dictionary and then remapping + the indices arrays. + + This parameter is ignored when writing to the IPC stream format as + the IPC stream format can support replacement dictionaries. + """ + + metadata_version: MetadataVersion + allow_64bit: bool + use_legacy_format: bool + compression: Codec | Literal["lz4", "zstd"] | None + use_threads: bool + emit_dictionary_deltas: bool + unify_dictionaries: bool + def __init__( + self, + *, + metadata_version: MetadataVersion = MetadataVersion.V5, + allow_64bit: bool = False, + use_legacy_format: bool = False, + compression: Codec | Literal["lz4", "zstd"] | None = None, + use_threads: bool = True, + emit_dictionary_deltas: bool = False, + unify_dictionaries: bool = False, + ) -> None: ... + +class Message(_Weakrefable): + """ + Container for an Arrow IPC message with metadata and optional body + """ + + @property + def type(self) -> str: ... + @property + def metadata(self) -> Buffer: ... + @property + def metadata_version(self) -> MetadataVersion: ... + @property + def body(self) -> Buffer | None: ... + def equals(self, other: Message) -> bool: ... + def serialize_to( + self, sink: NativeFile, alignment: int = 8, memory_pool: MemoryPool | None = None + ): + """ + Write message to generic OutputStream + + Parameters + ---------- + sink : NativeFile + alignment : int, default 8 + Byte alignment for metadata and body + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + """ + def serialize(self, alignment: int = 8, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write message as encapsulated IPC message + + Parameters + ---------- + alignment : int, default 8 + Byte alignment for metadata and body + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + """ + +class MessageReader(_Weakrefable): + """ + Interface for reading Message objects from some source (like an + InputStream) + """ + @classmethod + def open_stream(cls, source: bytes | NativeFile | IOBase | SupportPyBuffer) -> Self: + """ + Open stream from source, if you want to use memory map use + MemoryMappedFile as source. + + Parameters + ---------- + source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object + A readable source, like an InputStream + """ + def __iter__(self) -> Self: ... 
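[Reviewer note, not part of the stub file] A minimal usage sketch of how these IPC declarations are expected to type-check at call sites. It assumes the runtime pyarrow.ipc convenience wrappers (new_stream, open_stream), which are not declared in this particular stub, and a build where the zstd codec is available.

import pyarrow as pa

# Write a small table to an in-memory IPC stream with zstd-compressed
# record batch buffers (IpcWriteOptions.compression).
table = pa.table({"x": [1, 2, 3]})
options = pa.ipc.IpcWriteOptions(compression="zstd")
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, table.schema, options=options) as writer:
    writer.write_table(table)   # _CRecordBatchWriter.write_table
    stats = writer.stats        # WriteStats named tuple

buf = sink.getvalue()

# Read the stream back; read_all() gathers all batches into a Table.
with pa.ipc.open_stream(buf) as reader:
    roundtripped = reader.read_all()

# MessageReader exposes the raw encapsulated IPC messages.
msg_reader = pa.ipc.MessageReader.open_stream(buf)
first = msg_reader.read_next_message()  # Message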
+ def read_next_message(self) -> Message: + """ + Read next Message from the stream. + + Raises + ------ + StopIteration + At end of stream + """ + __next__ = read_next_message + +# ---------------------------------------------------------------------- +# File and stream readers and writers + +class _CRecordBatchWriter(_Weakrefable): + """The base RecordBatchWriter wrapper. + + Provides common implementations of convenience methods. Should not + be instantiated directly by user code. + """ + def write(self, table_or_batch: Table | RecordBatch): + """ + Write RecordBatch or Table to stream. + + Parameters + ---------- + table_or_batch : {RecordBatch, Table} + """ + def write_batch( + self, + batch: RecordBatch, + custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None, + ): + """ + Write RecordBatch to stream. + + Parameters + ---------- + batch : RecordBatch + custom_metadata : mapping or KeyValueMetadata + Keys and values must be string-like / coercible to bytes + """ + def write_table(self, table: Table, max_chunksize: int | None = None) -> None: + """ + Write Table to stream in (contiguous) RecordBatch objects. + + Parameters + ---------- + table : Table + max_chunksize : int, default None + Maximum number of rows for RecordBatch chunks. Individual chunks may + be smaller depending on the chunk layout of individual columns. + """ + def close(self) -> None: + """ + Close stream and write end-of-stream 0 marker. + """ + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def stats(self) -> WriteStats: + """ + Current IPC write statistics. + """ + +class _RecordBatchStreamWriter(_CRecordBatchWriter): + def __dealloc__(self) -> None: ... + def _open(self, sink, schema: Schema, options: IpcWriteOptions = IpcWriteOptions()): ... + +class _ReadPandasMixin: + def read_pandas(self, **options) -> pd.DataFrame: + """ + Read contents of stream to a pandas.DataFrame. + + Read all record batches as a pyarrow.Table then convert it to a + pandas.DataFrame using Table.to_pandas. + + Parameters + ---------- + **options + Arguments to forward to :meth:`Table.to_pandas`. + + Returns + ------- + df : pandas.DataFrame + """ + +class RecordBatchReader(_Weakrefable): + """Base class for reading stream of record batches. + + Record batch readers function as iterators of record batches that also + provide the schema (without the need to get any batches). + + Warnings + -------- + Do not call this class's constructor directly, use one of the + ``RecordBatchReader.from_*`` functions instead. + + Notes + ----- + To import and export using the Arrow C stream interface, use the + ``_import_from_c`` and ``_export_to_c`` methods. However, keep in mind this + interface is intended for expert users. + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([("x", pa.int64())]) + >>> def iter_record_batches(): + ... for i in range(2): + ... yield pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], schema=schema) + >>> reader = pa.RecordBatchReader.from_batches(schema, iter_record_batches()) + >>> print(reader.schema) + x: int64 + >>> for batch in reader: + ... print(batch) + pyarrow.RecordBatch + x: int64 + ---- + x: [1,2,3] + pyarrow.RecordBatch + x: int64 + ---- + x: [1,2,3] + """ + + def __iter__(self) -> Self: ... + def read_next_batch(self) -> RecordBatch: + """ + Read next RecordBatch from the stream. + + Raises + ------ + StopIteration: + At end of stream. 
+ + Returns + ------- + RecordBatch + """ + __next__ = read_next_batch + @property + def schema(self) -> Schema: + """ + Shared schema of the record batches in the stream. + + Returns + ------- + Schema + """ + def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: + """ + Read next RecordBatch from the stream along with its custom metadata. + + Raises + ------ + StopIteration: + At end of stream. + + Returns + ------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + def iter_batches_with_custom_metadata( + self, + ) -> Iterator[RecordBatchWithMetadata]: + """ + Iterate over record batches from the stream along with their custom + metadata. + + Yields + ------ + RecordBatchWithMetadata + """ + def read_all(self) -> Table: + """ + Read all record batches as a pyarrow.Table. + + Returns + ------- + Table + """ + read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] + def close(self) -> None: + """ + Release any resources associated with the reader. + """ + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + def cast(self, target_schema: Schema) -> Self: + """ + Wrap this reader with one that casts each batch lazily as it is pulled. + Currently only a safe cast to target_schema is implemented. + + Parameters + ---------- + target_schema : Schema + Schema to cast to, the names and order of fields must match. + + Returns + ------- + RecordBatchReader + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowArrayStream struct, given its pointer. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArrayStream struct. + + Be careful: if you don't pass the ArrowArrayStream struct to a + consumer, array memory will leak. This is a low-level function + intended for expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: + """ + Import RecordBatchReader from a C ArrowArrayStream struct, + given its pointer. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArrayStream struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export to a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: + """ + Import RecordBatchReader from a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + stream: PyCapsule + A capsule containing a C ArrowArrayStream PyCapsule. + + Returns + ------- + RecordBatchReader + """ + @classmethod + def from_stream(cls, data: SupportArrowStream, schema: Schema | None = None) -> Self: + """ + Create RecordBatchReader from a Arrow-compatible stream object. + + This accepts objects implementing the Arrow PyCapsule Protocol for + streams, i.e. objects that have a ``__arrow_c_stream__`` method. + + Parameters + ---------- + data : Arrow-compatible stream object + Any object that implements the Arrow PyCapsule Protocol for + streams. + schema : Schema, default None + The schema to which the stream should be casted, if supported + by the stream object. 
+ + Returns + ------- + RecordBatchReader + """ + @classmethod + def from_batches(cls, schema: Schema, batches: Iterable[RecordBatch]) -> Self: + """ + Create RecordBatchReader from an iterable of batches. + + Parameters + ---------- + schema : Schema + The shared schema of the record batches + batches : Iterable[RecordBatch] + The batches that this reader will return. + + Returns + ------- + reader : RecordBatchReader + """ + +class _RecordBatchStreamReader(RecordBatchReader): + @property + def stats(self) -> ReadStats: + """ + Current IPC read statistics. + """ + +class _RecordBatchFileWriter(_RecordBatchStreamWriter): ... + +class RecordBatchWithMetadata(NamedTuple): + """RecordBatch with its custom metadata + + Parameters + ---------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + + batch: RecordBatch + custom_metadata: KeyValueMetadata + +class _RecordBatchFileReader(_Weakrefable): + @property + def num_record_batches(self) -> int: + """ + The number of record batches in the IPC file. + """ + def get_batch(self, i: int) -> RecordBatch: + """ + Read the record batch with the given index. + + Parameters + ---------- + i : int + The index of the record batch in the IPC file. + + Returns + ------- + batch : RecordBatch + """ + get_record_batch = get_batch + def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: + """ + Read the record batch with the given index along with + its custom metadata + + Parameters + ---------- + i : int + The index of the record batch in the IPC file. + + Returns + ------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + def read_all(self) -> Table: + """ + Read all record batches as a pyarrow.Table + """ + read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def schema(self) -> Schema: ... + @property + def stats(self) -> ReadStats: ... + +def get_tensor_size(tensor: Tensor) -> int: + """ + Return total size of serialized Tensor including metadata and padding. + + Parameters + ---------- + tensor : Tensor + The tensor for which we want to known the size. + """ + +def get_record_batch_size(batch: RecordBatch) -> int: + """ + Return total size of serialized RecordBatch including metadata and padding. + + Parameters + ---------- + batch : RecordBatch + The recordbatch for which we want to know the size. + """ + +def write_tensor(tensor: Tensor, dest: NativeFile) -> int: + """ + Write pyarrow.Tensor to pyarrow.NativeFile object its current position. + + Parameters + ---------- + tensor : pyarrow.Tensor + dest : pyarrow.NativeFile + + Returns + ------- + bytes_written : int + Total number of bytes written to the file + """ + +def read_tensor(source: NativeFile) -> Tensor: + """Read pyarrow.Tensor from pyarrow.NativeFile object from current + position. If the file source supports zero copy (e.g. a memory map), then + this operation does not allocate any memory. 
This function not assume that + the stream is aligned + + Parameters + ---------- + source : pyarrow.NativeFile + + Returns + ------- + tensor : Tensor + + """ + +def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: + """ + Read length-prefixed message from file or buffer-like object + + Parameters + ---------- + source : pyarrow.NativeFile, file-like object, or buffer-like object + + Returns + ------- + message : Message + """ + +def read_schema(obj: Buffer | Message, dictionary_memo: DictionaryMemo | None = None) -> Schema: + """ + Read Schema from message or buffer + + Parameters + ---------- + obj : buffer or Message + dictionary_memo : DictionaryMemo, optional + Needed to be able to reconstruct dictionary-encoded fields + with read_record_batch + + Returns + ------- + schema : Schema + """ + +def read_record_batch( + obj: Message | SupportPyBuffer, schema: Schema, dictionary_memo: DictionaryMemo | None = None +) -> RecordBatch: + """ + Read RecordBatch from message, given a known schema. If reading data from a + complete IPC stream, use ipc.open_stream instead + + Parameters + ---------- + obj : Message or Buffer-like + schema : Schema + dictionary_memo : DictionaryMemo, optional + If message contains dictionaries, must pass a populated + DictionaryMemo + + Returns + ------- + batch : RecordBatch + """ + +__all__ = [ + "MetadataVersion", + "WriteStats", + "ReadStats", + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "_CRecordBatchWriter", + "_RecordBatchStreamWriter", + "_ReadPandasMixin", + "RecordBatchReader", + "_RecordBatchStreamReader", + "_RecordBatchFileWriter", + "RecordBatchWithMetadata", + "_RecordBatchFileReader", + "get_tensor_size", + "get_record_batch_size", + "write_tensor", + "read_tensor", + "read_message", + "read_schema", + "read_record_batch", +] diff --git a/python/pyarrow/__lib_pxi/memory.pyi b/python/pyarrow/__lib_pxi/memory.pyi new file mode 100644 index 00000000000..57a3bb4f1b3 --- /dev/null +++ b/python/pyarrow/__lib_pxi/memory.pyi @@ -0,0 +1,174 @@ +from pyarrow.lib import _Weakrefable + +class MemoryPool(_Weakrefable): + """ + Base class for memory allocation. + + Besides tracking its number of allocated bytes, a memory pool also + takes care of the required 64-byte alignment for Arrow data. + """ + + def release_unused(self) -> None: + """ + Attempt to return to the OS any memory being held onto by the pool. + + This function should not be called except potentially for + benchmarking or debugging as it could be expensive and detrimental to + performance. + + This is best effort and may not have any effect on some memory pools + or in some situations (e.g. fragmentation). + """ + def bytes_allocated(self) -> int: + """ + Return the number of bytes that are currently allocated from this + memory pool. + """ + def total_bytes_allocated(self) -> int: + """ + Return the total number of bytes that have been allocated from this + memory pool. + """ + def max_memory(self) -> int | None: + """ + Return the peak memory allocation in this memory pool. + This can be an approximate number in multi-threaded applications. + + None is returned if the pool implementation doesn't know how to + compute this number. + """ + def num_allocations(self) -> int: + """ + Return the number of allocations or reallocations that were made + using this memory pool. + """ + def print_stats(self) -> None: + """ + Print statistics about this memory pool. + + The output format is implementation-specific. 
Not all memory pools + implement this method. + """ + @property + def backend_name(self) -> str: + """ + The name of the backend used by this MemoryPool (e.g. "jemalloc"). + """ + +class LoggingMemoryPool(MemoryPool): ... +class ProxyMemoryPool(MemoryPool): ... + +def default_memory_pool() -> MemoryPool: + """ + Return the process-global memory pool. + + Examples + -------- + >>> default_memory_pool() + + """ + +def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: + """ + Create and return a MemoryPool instance that redirects to the + *parent*, but with separate allocation statistics. + + Parameters + ---------- + parent : MemoryPool + The real memory pool that should be used for allocations. + """ + +def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: + """ + Create and return a MemoryPool instance that redirects to the + *parent*, but also dumps allocation logs on stderr. + + Parameters + ---------- + parent : MemoryPool + The real memory pool that should be used for allocations. + """ + +def system_memory_pool() -> MemoryPool: + """ + Return a memory pool based on the C malloc heap. + """ + +def jemalloc_memory_pool() -> MemoryPool: + """ + Return a memory pool based on the jemalloc heap. + + NotImplementedError is raised if jemalloc support is not enabled. + """ + +def mimalloc_memory_pool() -> MemoryPool: + """ + Return a memory pool based on the mimalloc heap. + + NotImplementedError is raised if mimalloc support is not enabled. + """ + +def set_memory_pool(pool: MemoryPool) -> None: + """ + Set the default memory pool. + + Parameters + ---------- + pool : MemoryPool + The memory pool that should be used by default. + """ + +def log_memory_allocations(enable: bool = True) -> None: + """ + Enable or disable memory allocator logging for debugging purposes + + Parameters + ---------- + enable : bool, default True + Pass False to disable logging + """ + +def total_allocated_bytes() -> int: + """ + Return the currently allocated bytes from the default memory pool. + Other memory pools may not be accounted for. + """ + +def jemalloc_set_decay_ms(decay_ms: int) -> None: + """ + Set arenas.dirty_decay_ms and arenas.muzzy_decay_ms to indicated number of + milliseconds. A value of 0 (the default) results in dirty / muzzy memory + pages being released right away to the OS, while a higher value will result + in a time-based decay. See the jemalloc docs for more information + + It's best to set this at the start of your application. + + Parameters + ---------- + decay_ms : int + Number of milliseconds to set for jemalloc decay conf parameters. 
Note + that this change will only affect future memory arenas + """ + +def supported_memory_backends() -> list[str]: + """ + Return a list of available memory pool backends + """ + +__all__ = [ + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "default_memory_pool", + "proxy_memory_pool", + "logging_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "set_memory_pool", + "log_memory_allocations", + "total_allocated_bytes", + "jemalloc_set_decay_ms", + "supported_memory_backends", +] diff --git a/python/pyarrow/__lib_pxi/pandas_shim.pyi b/python/pyarrow/__lib_pxi/pandas_shim.pyi new file mode 100644 index 00000000000..0e80fae4ebf --- /dev/null +++ b/python/pyarrow/__lib_pxi/pandas_shim.pyi @@ -0,0 +1,51 @@ +from types import ModuleType +from typing import Any, Iterable, TypeGuard + +import pandas as pd + +from numpy import dtype +from pandas.core.dtypes.base import ExtensionDtype + +class _PandasAPIShim: + has_sparse: bool + + def series(self, *args, **kwargs) -> pd.Series: ... + def data_frame(self, *args, **kwargs) -> pd.DataFrame: ... + @property + def have_pandas(self) -> bool: ... + @property + def compat(self) -> ModuleType: ... + @property + def pd(self) -> ModuleType: ... + def infer_dtype(self, obj: Iterable) -> str: ... + def pandas_dtype(self, dtype: str) -> dtype: ... + @property + def loose_version(self) -> Any: ... + @property + def version(self) -> str: ... + def is_v1(self) -> bool: ... + def is_ge_v21(self) -> bool: ... + def is_ge_v23(self) -> bool: ... + def is_ge_v3(self) -> bool: ... + @property + def categorical_type(self) -> type[pd.Categorical]: ... + @property + def datetimetz_type(self) -> type[pd.DatetimeTZDtype]: ... + @property + def extension_dtype(self) -> type[ExtensionDtype]: ... + def is_array_like( + self, obj: Any + ) -> TypeGuard[pd.Series | pd.Index | pd.Categorical | ExtensionDtype]: ... + def is_categorical(self, obj: Any) -> TypeGuard[pd.Categorical]: ... + def is_datetimetz(self, obj: Any) -> TypeGuard[pd.DatetimeTZDtype]: ... + def is_extension_array_dtype(self, obj: Any) -> TypeGuard[ExtensionDtype]: ... + def is_sparse(self, obj: Any) -> bool: ... + def is_data_frame(self, obj: Any) -> TypeGuard[pd.DataFrame]: ... + def is_series(self, obj: Any) -> TypeGuard[pd.Series]: ... + def is_index(self, obj: Any) -> TypeGuard[pd.Index]: ... + def get_values(self, obj: Any) -> bool: ... + def get_rangeindex_attribute(self, level, name): ... + +_pandas_api: _PandasAPIShim + +__all__ = ["_PandasAPIShim", "_pandas_api"] diff --git a/python/pyarrow/__lib_pxi/scalar.pyi b/python/pyarrow/__lib_pxi/scalar.pyi new file mode 100644 index 00000000000..81ab5012067 --- /dev/null +++ b/python/pyarrow/__lib_pxi/scalar.pyi @@ -0,0 +1,1017 @@ +import collections.abc +import datetime as dt +import sys + +from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Any, Generic, Iterator, Literal, Mapping, overload + +import numpy as np + +from pyarrow._compute import CastOptions +from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable +from typing_extensions import Protocol, TypeVar + +from . 
import types +from .types import ( + _AsPyType, + _DataTypeT, + _Time32Unit, + _Time64Unit, + _Tz, + _Unit, +) + +_AsPyTypeK = TypeVar("_AsPyTypeK") +_AsPyTypeV = TypeVar("_AsPyTypeV") +_DataType_co = TypeVar("_DataType_co", bound=types.DataType, covariant=True) + +class Scalar(_Weakrefable, Generic[_DataType_co]): + """ + The base class for scalars. + """ + @property + def type(self) -> _DataType_co: + """ + Data type of the Scalar object. + """ + @property + def is_valid(self) -> bool: + """ + Holds a valid (non-null) value. + """ + @overload + def cast( + self, + target_type: None, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Self: ... + @overload + def cast( + self, + target_type: _DataTypeT, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Scalar[_DataTypeT]: ... + def cast(self, *args, **kwargs): + """ + Cast scalar value to another data type. + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, default None + Type to cast scalar to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + + Returns + ------- + scalar : A Scalar of the given target data type. + """ + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + def equals(self, other: Scalar) -> bool: ... + def __hash__(self) -> int: ... + @overload + def as_py( + self: Scalar[types._BasicDataType[_AsPyType]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> _AsPyType: ... + @overload + def as_py( + self: Scalar[types.ListType[types._BasicDataType[_AsPyType]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[_AsPyType]: ... + @overload + def as_py( + self: Scalar[ + types.ListType[ + types.DictionaryType[types._IndexT, types._BasicDataType[_AsPyTypeV], Any] + ] + ], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[dict[int, _AsPyTypeV]]: ... + @overload + def as_py( + self: Scalar[ + types.ListType[types.DictionaryType[Any, types._BasicDataType[_AsPyTypeV], Any]], + ], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[dict[Any, _AsPyTypeV]]: ... + @overload + def as_py( + self: Scalar[types.ListType[types.DictionaryType[types._IndexT, Any, Any]],], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[dict[int, Any]]: ... + @overload + def as_py( + self: Scalar[types.StructType], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[dict[str, Any]]: ... + @overload + def as_py( + self: Scalar[ + types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]] + ], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[tuple[_AsPyTypeK, _AsPyTypeV]]: ... 
+ @overload + def as_py( + self: Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[tuple[Any, _AsPyTypeV]]: ... + @overload + def as_py( + self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[tuple[_AsPyTypeK, Any]]: ... + @overload + def as_py( + self: Scalar[Any], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> Any: ... + def as_py(self, *args, **kwargs): + """ + Return this value as a Python representation. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + """ + +_NULL: TypeAlias = None +NA = _NULL + +class NullScalar(Scalar[types.NullType]): ... +class BooleanScalar(Scalar[types.BoolType]): ... +class UInt8Scalar(Scalar[types.UInt8Type]): ... +class Int8Scalar(Scalar[types.Int8Type]): ... +class UInt16Scalar(Scalar[types.UInt16Type]): ... +class Int16Scalar(Scalar[types.Int16Type]): ... +class UInt32Scalar(Scalar[types.Uint32Type]): ... +class Int32Scalar(Scalar[types.Int32Type]): ... +class UInt64Scalar(Scalar[types.UInt64Type]): ... +class Int64Scalar(Scalar[types.Int64Type]): ... +class HalfFloatScalar(Scalar[types.Float16Type]): ... +class FloatScalar(Scalar[types.Float32Type]): ... +class DoubleScalar(Scalar[types.Float64Type]): ... +class Decimal32Scalar(Scalar[types.Decimal32Type[types._Precision, types._Scale]]): ... +class Decimal64Scalar(Scalar[types.Decimal64Type[types._Precision, types._Scale]]): ... +class Decimal128Scalar(Scalar[types.Decimal128Type[types._Precision, types._Scale]]): ... +class Decimal256Scalar(Scalar[types.Decimal256Type[types._Precision, types._Scale]]): ... +class Date32Scalar(Scalar[types.Date32Type]): ... + +class Date64Scalar(Scalar[types.Date64Type]): + @property + def value(self) -> dt.date | None: ... + +class Time32Scalar(Scalar[types.Time32Type[_Time32Unit]]): + @property + def value(self) -> dt.time | None: ... + +class Time64Scalar(Scalar[types.Time64Type[_Time64Unit]]): + @property + def value(self) -> dt.time | None: ... + +class TimestampScalar(Scalar[types.TimestampType[_Unit, _Tz]]): + @property + def value(self) -> int | None: ... + +class DurationScalar(Scalar[types.DurationType[_Unit]]): + @property + def value(self) -> dt.timedelta | None: ... + +class MonthDayNanoIntervalScalar(Scalar[types.MonthDayNanoIntervalType]): + @property + def value(self) -> MonthDayNano | None: ... + +class BinaryScalar(Scalar[types.BinaryType]): + def as_buffer(self) -> Buffer: ... + +class LargeBinaryScalar(Scalar[types.LargeBinaryType]): + def as_buffer(self) -> Buffer: ... + +class FixedSizeBinaryScalar(Scalar[types.FixedSizeBinaryType]): + def as_buffer(self) -> Buffer: ... + +class StringScalar(Scalar[types.StringType]): + def as_buffer(self) -> Buffer: ... + +class LargeStringScalar(Scalar[types.LargeStringType]): + def as_buffer(self) -> Buffer: ... 
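[Reviewer note, not part of the stub file] An illustrative sketch of how the Scalar generics and as_py overloads above are intended to be inferred by a type checker; the comments give expected inferences, not runtime assertions, and maps_as_pydicts follows the behaviour documented in Scalar.as_py.

import datetime as dt
import pyarrow as pa

s = pa.scalar("hello")              # expected: StringScalar
text = s.as_py()                    # expected: str
buf = s.as_buffer()                 # Buffer backing the string scalar

i = pa.scalar(42)                   # expected: Int64Scalar
f = i.cast(pa.float64())            # expected: Scalar[Float64Type] (DoubleScalar at runtime)

d = pa.scalar(dt.date(2020, 1, 1))  # expected: Date32Scalar

m = pa.scalar([("a", 1)], type=pa.map_(pa.string(), pa.int64()))
pairs = m.as_py()                            # association list: [('a', 1)]
as_dict = m.as_py(maps_as_pydicts="strict")  # dict form: {'a': 1}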
+ +class BinaryViewScalar(Scalar[types.BinaryViewType]): + def as_buffer(self) -> Buffer: ... + +class StringViewScalar(Scalar[types.StringViewType]): + def as_buffer(self) -> Buffer: ... + +class ListScalar(Scalar[types.ListType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataTypeT, types._Size]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class LargeListScalar(Scalar[types.LargeListType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class ListViewScalar(Scalar[types.ListViewType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class LargeListViewScalar(Scalar[types.LargeListViewType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class StructScalar(Scalar[types.StructType], collections.abc.Mapping[str, Scalar]): + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[str]: ... + def __getitem__(self, __key: str) -> Scalar[Any]: ... # type: ignore[override] + def _as_py_tuple(self) -> list[tuple[str, Any]]: ... + +class MapScalar(Scalar[types.MapType[types._K, types._ValueT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> tuple[Scalar[types._K], types._ValueT, Any]: ... + @overload + def __iter__( + self: Scalar[ + types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]] + ], + ) -> Iterator[tuple[_AsPyTypeK, _AsPyTypeV]]: ... + @overload + def __iter__( + self: Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]],], + ) -> Iterator[tuple[Any, _AsPyTypeV]]: ... + @overload + def __iter__( + self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any],], + ) -> Iterator[tuple[_AsPyTypeK, Any]]: ... + +class DictionaryScalar(Scalar[types.DictionaryType[types._IndexT, types._BasicValueT]]): + @property + def index(self) -> Scalar[types._IndexT]: ... + @property + def value(self) -> Scalar[types._BasicValueT]: ... + @property + def dictionary(self) -> Array: ... + +class RunEndEncodedScalar(Scalar[types.RunEndEncodedType[types._RunEndType, types._BasicValueT]]): + @property + def value(self) -> tuple[int, types._BasicValueT] | None: ... + +class UnionScalar(Scalar[types.UnionType]): + @property + def value(self) -> Any | None: ... + @property + def type_code(self) -> str: ... + +class ExtensionScalar(Scalar[types.ExtensionType]): + @property + def value(self) -> Any | None: ... + @staticmethod + def from_storage(typ: types.BaseExtensionType, value) -> ExtensionScalar: + """ + Construct ExtensionScalar from type and storage value. + + Parameters + ---------- + typ : DataType + The extension type for the result scalar. + value : object + The storage value for the result scalar. 
+ + Returns + ------- + ext_scalar : ExtensionScalar + """ + +class Bool8Scalar(Scalar[types.Bool8Type]): ... +class UuidScalar(Scalar[types.UuidType]): ... +class JsonScalar(Scalar[types.JsonType]): ... +class OpaqueScalar(Scalar[types.OpaqueType]): ... + +class FixedShapeTensorScalar(ExtensionScalar): + def to_numpy(self) -> np.ndarray: + """ + Convert fixed shape tensor scalar to a numpy.ndarray. + + The resulting ndarray's shape matches the permuted shape of the + fixed shape tensor scalar. + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + """ + def to_tensor(self) -> Tensor: + """ + Convert fixed shape tensor extension scalar to a pyarrow.Tensor, using shape + and strides derived from corresponding FixedShapeTensorType. + + The conversion is zero-copy. + + Returns + ------- + pyarrow.Tensor + Tensor represented stored in FixedShapeTensorScalar. + """ + +_V = TypeVar("_V") + +class NullableCollection(Protocol[_V]): # pyright: ignore[reportInvalidTypeVarUse] + def __iter__(self) -> Iterator[_V] | Iterator[_V | None]: ... + def __len__(self) -> int: ... + def __contains__(self, item: Any, /) -> bool: ... + +@overload +def scalar( + value: str, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StringScalar: ... +@overload +def scalar( + value: bytes, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BinaryScalar: ... +@overload +def scalar( # pyright: ignore[reportOverlappingOverload] + value: bool, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BooleanScalar: ... +@overload +def scalar( + value: int, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int64Scalar: ... +@overload +def scalar( + value: float, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DoubleScalar: ... +@overload +def scalar( + value: Decimal, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal128Scalar: ... +@overload +def scalar( # pyright: ignore[reportOverlappingOverload] + value: dt.datetime, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> TimestampScalar[Literal["us"]]: ... +@overload +def scalar( + value: dt.date, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Date32Scalar: ... +@overload +def scalar( + value: dt.time, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Time64Scalar[Literal["us"]]: ... +@overload +def scalar( + value: dt.timedelta, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DurationScalar[Literal["us"]]: ... +@overload +def scalar( # pyright: ignore[reportOverlappingOverload] + value: MonthDayNano, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalScalar: ... +@overload +def scalar( + value: Mapping[str, Any], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StructScalar: ... +@overload +def scalar( + value: NullableCollection[str], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.StringType]]: ... +@overload +def scalar( + value: NullableCollection[bytes], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.BinaryType]]: ... 
+@overload +def scalar( + value: NullableCollection[bool], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.BoolType]]: ... +@overload +def scalar( + value: NullableCollection[int], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Int64Type]]: ... +@overload +def scalar( + value: NullableCollection[float], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Float64Type]]: ... +@overload +def scalar( + value: NullableCollection[Decimal], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Decimal32Type]]: ... +@overload +def scalar( + value: NullableCollection[dt.datetime], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.TimestampType[Literal["us"]]]]: ... +@overload +def scalar( + value: NullableCollection[dt.date], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Date32Type]]: ... +@overload +def scalar( + value: NullableCollection[dt.time], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Time64Type[Literal["us"]]]]: ... +@overload +def scalar( + value: NullableCollection[dt.timedelta], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.DurationType[Literal["us"]]]]: ... +@overload +def scalar( + value: NullableCollection[MonthDayNano], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.MonthDayNanoIntervalType]]: ... +@overload +def scalar( + value: NullableCollection[Any], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[Any]: ... +@overload +def scalar( + value: Any, + type: types.NullType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> NullScalar: ... +@overload +def scalar( + value: Any, + type: types.BoolType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BooleanScalar: ... +@overload +def scalar( + value: Any, + type: types.UInt8Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt8Scalar: ... +@overload +def scalar( + value: Any, + type: types.Int8Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int8Scalar: ... +@overload +def scalar( + value: Any, + type: types.UInt16Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt16Scalar: ... +@overload +def scalar( + value: Any, + type: types.Int16Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int16Scalar: ... +@overload +def scalar( + value: Any, + type: types.Uint32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt32Scalar: ... +@overload +def scalar( + value: Any, + type: types.Int32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int32Scalar: ... +@overload +def scalar( + value: Any, + type: types.UInt64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt64Scalar: ... 
+@overload +def scalar( + value: Any, + type: types.Int64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int64Scalar: ... +@overload +def scalar( + value: Any, + type: types.Float16Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> HalfFloatScalar: ... +@overload +def scalar( + value: Any, + type: types.Float32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> FloatScalar: ... +@overload +def scalar( + value: Any, + type: types.Float64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DoubleScalar: ... +@overload +def scalar( + value: Any, + type: types.Date32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Date32Scalar: ... +@overload +def scalar( + value: Any, + type: types.Date64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Date64Scalar: ... +@overload +def scalar( + value: Any, + type: types.MonthDayNanoIntervalType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalScalar: ... +@overload +def scalar( + value: Any, + type: types.StringType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StringScalar: ... +@overload +def scalar( + value: Any, + type: types.LargeStringType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeStringScalar: ... +@overload +def scalar( + value: Any, + type: types.StringViewType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StringViewScalar: ... +@overload +def scalar( + value: Any, + type: types.BinaryType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BinaryScalar: ... +@overload +def scalar( + value: Any, + type: types.LargeBinaryType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryScalar: ... +@overload +def scalar( + value: Any, + type: types.BinaryViewType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BinaryViewScalar: ... +@overload +def scalar( + value: Any, + type: types.TimestampType[types._Unit, types._Tz], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> TimestampScalar[types._Unit, types._Tz]: ... +@overload +def scalar( + value: Any, + type: types.Time32Type[types._Time32Unit], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Time32Scalar[types._Time32Unit]: ... +@overload +def scalar( + value: Any, + type: types.Time64Type[types._Time64Unit], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Time64Scalar[types._Time64Unit]: ... +@overload +def scalar( + value: Any, + type: types.DurationType[types._Unit], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DurationScalar[types._Unit]: ... +@overload +def scalar( + value: Any, + type: types.Decimal32Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal32Scalar[types._Precision, types._Scale]: ... 
+@overload +def scalar( + value: Any, + type: types.Decimal64Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal64Scalar[types._Precision, types._Scale]: ... +@overload +def scalar( + value: Any, + type: types.Decimal128Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal128Scalar[types._Precision, types._Scale]: ... +@overload +def scalar( + value: Any, + type: types.Decimal256Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal256Scalar[types._Precision, types._Scale]: ... +@overload +def scalar( + value: Any, + type: types.ListType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.LargeListType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeListScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.ListViewType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListViewScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.LargeListViewType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeListViewScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.FixedSizeListType[_DataTypeT, types._Size], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> FixedSizeListScalar[_DataTypeT, types._Size]: ... +@overload +def scalar( + value: Any, + type: types.DictionaryType[types._IndexT, types._BasicValueT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DictionaryScalar[types._IndexT, types._BasicValueT]: ... +@overload +def scalar( + value: Any, + type: types.MapType[types._K, types._ValueT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> MapScalar[types._K, types._ValueT]: ... +@overload +def scalar( + value: Any, + type: types.StructType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StructScalar: ... +@overload +def scalar( + value: Any, + type: types.UnionType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UnionScalar: ... +@overload +def scalar( + value: Any, + type: types.RunEndEncodedType[types._RunEndType, types._BasicValueT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> RunEndEncodedScalar[types._RunEndType, types._BasicValueT]: ... +@overload +def scalar( + value: Any, + type: types.Bool8Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Bool8Scalar: ... +@overload +def scalar( + value: Any, + type: types.UuidType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UuidScalar: ... +@overload +def scalar( + value: Any, + type: types.JsonType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> JsonScalar: ... +@overload +def scalar( + value: Any, + type: types.OpaqueType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> OpaqueScalar: ... 
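[Reviewer note, not part of the stub file] The overload set for scalar() continues below with a generic fallback and the implementation signature. As a sketch of how the explicit type=-based overloads above are expected to resolve (expected inferences only, assuming the stub's overload order is honored by the checker):

from decimal import Decimal

import pyarrow as pa

pa.scalar(1, type=pa.int8())                              # expected: Int8Scalar
pa.scalar([1, 2], type=pa.list_(pa.int16()))              # expected: ListScalar[Int16Type]
pa.scalar({"x": 1}, type=pa.struct([("x", pa.int64())]))  # expected: StructScalar
pa.scalar(Decimal("1.23"), type=pa.decimal128(5, 2))      # expected: Decimal128Scalar
pa.scalar(1000, type=pa.timestamp("ms", tz="UTC"))        # expected: TimestampScalar ("ms" unit, tz-aware)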
+@overload +def scalar( + value: Any, + type: _DataTypeT, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Scalar[_DataTypeT]: ... +def scalar(*args, **kwargs): + """ + Create a pyarrow.Scalar instance from a Python object. + + Parameters + ---------- + value : Any + Python object coercible to arrow's type system. + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred from + the value. + from_pandas : bool, default None + Use pandas's semantics for inferring nulls from values in + ndarray-like data. Defaults to False if not passed explicitly by user, + or True if a pandas object is passed in. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. + + Returns + ------- + scalar : pyarrow.Scalar + + Examples + -------- + >>> import pyarrow as pa + + >>> pa.scalar(42) + + + >>> pa.scalar("string") + + + >>> pa.scalar([1, 2]) + + + >>> pa.scalar([1, 2], type=pa.list_(pa.int16())) + + """ + +__all__ = [ + "Scalar", + "_NULL", + "NA", + "NullScalar", + "BooleanScalar", + "UInt8Scalar", + "Int8Scalar", + "UInt16Scalar", + "Int16Scalar", + "UInt32Scalar", + "Int32Scalar", + "UInt64Scalar", + "Int64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", + "Decimal128Scalar", + "Decimal256Scalar", + "Date32Scalar", + "Date64Scalar", + "Time32Scalar", + "Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "FixedSizeBinaryScalar", + "StringScalar", + "LargeStringScalar", + "BinaryViewScalar", + "StringViewScalar", + "ListScalar", + "FixedSizeListScalar", + "LargeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "StructScalar", + "MapScalar", + "DictionaryScalar", + "RunEndEncodedScalar", + "UnionScalar", + "ExtensionScalar", + "FixedShapeTensorScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", + "scalar", +] diff --git a/python/pyarrow/__lib_pxi/table.pyi b/python/pyarrow/__lib_pxi/table.pyi new file mode 100644 index 00000000000..ffba4262e8c --- /dev/null +++ b/python/pyarrow/__lib_pxi/table.pyi @@ -0,0 +1,5617 @@ +import datetime as dt +import sys + +from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import ( + Any, + Collection, + Generator, + Generic, + Iterable, + Iterator, + Literal, + Mapping, + Sequence, + TypeVar, + overload, +) + +import numpy as np +import pandas as pd + +from numpy.typing import NDArray +from pyarrow._compute import ( + CastOptions, + CountOptions, + FunctionOptions, + ScalarAggregateOptions, + TDigestOptions, + VarianceOptions, +) +from pyarrow._stubs_typing import ( + Indices, + Mask, + NullEncoding, + NullSelectionBehavior, + Order, + SupportArrowArray, + SupportArrowDeviceArray, + SupportArrowStream, +) +from pyarrow.compute import ArrayOrChunkedArray, Expression +from pyarrow.interchange.dataframe import _PyArrowDataFrame +from pyarrow.lib import Device, MemoryManager, MemoryPool, MonthDayNano, Schema +from pyarrow.lib import Field as _Field + +from . 
import array, scalar, types +from .array import Array, NullableCollection, StructArray, _CastAs, _PandasConvertible +from .device import DeviceAllocationType +from .io import Buffer +from .ipc import RecordBatchReader +from .scalar import Int64Scalar, Scalar +from .tensor import Tensor +from .types import DataType, _AsPyType, _BasicDataType, _DataTypeT + +Field: TypeAlias = _Field[DataType] +_ScalarT = TypeVar("_ScalarT", bound=Scalar) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) + +_Aggregation: TypeAlias = Literal[ + "all", + "any", + "approximate_median", + "count", + "count_all", + "count_distinct", + "distinct", + "first", + "first_last", + "last", + "list", + "max", + "mean", + "min", + "min_max", + "one", + "product", + "stddev", + "sum", + "tdigest", + "variance", +] +_AggregationPrefixed: TypeAlias = Literal[ + "hash_all", + "hash_any", + "hash_approximate_median", + "hash_count", + "hash_count_all", + "hash_count_distinct", + "hash_distinct", + "hash_first", + "hash_first_last", + "hash_last", + "hash_list", + "hash_max", + "hash_mean", + "hash_min", + "hash_min_max", + "hash_one", + "hash_product", + "hash_stddev", + "hash_sum", + "hash_tdigest", + "hash_variance", +] +Aggregation: TypeAlias = _Aggregation | _AggregationPrefixed +AggregateOptions: TypeAlias = ( + ScalarAggregateOptions | CountOptions | TDigestOptions | VarianceOptions | FunctionOptions +) + +UnarySelector: TypeAlias = str +NullarySelector: TypeAlias = tuple[()] +NarySelector: TypeAlias = list[str] | tuple[str, ...] +ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector + +class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + """ + An array-like composed from a (possibly empty) collection of pyarrow.Arrays + + Warnings + -------- + Do not call this class's constructor directly. + + Examples + -------- + To construct a ChunkedArray object use :func:`pyarrow.chunked_array`: + + >>> import pyarrow as pa + >>> pa.chunked_array([], type=pa.int8()) + + [ + ... + ] + + >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> isinstance(pa.chunked_array([[2, 2, 4], [4, 5, 100]]), pa.ChunkedArray) + True + """ + + @property + def data(self) -> Self: ... + @property + def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: + """ + Return data type of a ChunkedArray. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.type + DataType(int64) + """ + def length(self) -> int: + """ + Return length of a ChunkedArray. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.length() + 6 + """ + __len__ = length + def to_string( + self, + *, + indent: int = 0, + window: int = 5, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: + """ + Render a "pretty-printed" string representation of the ChunkedArray + + Parameters + ---------- + indent : int + How much to indent right the content of the array, + by default ``0``. + window : int + How many items to preview within each chunk at the begin and end + of the chunk when the chunk is bigger than the window. + The other elements will be ellipsed. + container_window : int + How many chunks to preview at the begin and end + of the array when the array is bigger than the window. + The other elements will be ellipsed. + This setting also applies to list columns. 
+ skip_new_lines : bool + If the array should be rendered as a single line of text + or if each element should be on its own line. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_string(skip_new_lines=True) + '[[2,2,4],[4,5,100]]' + """ + format = to_string + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + @property + def null_count(self) -> int: + """ + Number of null entries + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.null_count + 1 + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the chunked array. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.nbytes + 49 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the chunked array. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.get_total_buffer_size() + 49 + """ + def __sizeof__(self) -> int: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + @overload + def __getitem__(self, key: int) -> _Scalar_co: ... + def __getitem__(self, key): + """ + Slice or return value at given index + + Parameters + ---------- + key : integer or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + value : Scalar (index) or ChunkedArray (slice) + """ + def getitem(self, i: int) -> Scalar: ... + def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[scalar.BooleanScalar]: + """ + Return boolean array indicating the null values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array or ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.is_null() + + [ + [ + false, + false, + false, + false, + true, + false + ] + ] + """ + def is_nan(self) -> ChunkedArray[scalar.BooleanScalar]: + """ + Return boolean array indicating the NaN values. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = pa.chunked_array([[2, np.nan, 4], [4, None, 100]]) + >>> arr.is_nan() + + [ + [ + false, + true, + false, + false, + null, + false + ] + ] + """ + def is_valid(self) -> ChunkedArray[scalar.BooleanScalar]: + """ + Return boolean array indicating the non-null values. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.is_valid() + + [ + [ + true, + true, + true + ], + [ + true, + false, + true + ] + ] + """ + def fill_null(self, fill_value: Scalar[_DataTypeT]) -> Self: + """ + Replace each null element in values with fill_value. + + See :func:`pyarrow.compute.fill_null` for full usage. + + Parameters + ---------- + fill_value : any + The replacement value for null entries. + + Returns + ------- + result : Array or ChunkedArray + A new array with nulls replaced by the given value. + + Examples + -------- + >>> import pyarrow as pa + >>> fill_value = pa.scalar(5, type=pa.int8()) + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.fill_null(fill_value) + + [ + [ + 2, + 2, + 4, + 4, + 5, + 100 + ] + ] + """ + def equals(self, other: Self) -> bool: + """ + Return whether the contents of two chunked arrays are equal. + + Parameters + ---------- + other : pyarrow.ChunkedArray + Chunked array to compare against. + + Returns + ------- + are_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) + ... ) + >>> n_legs.equals(n_legs) + True + >>> n_legs.equals(animals) + False + """ + def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: + """ + Return a NumPy copy of this array (experimental). + + Parameters + ---------- + zero_copy_only : bool, default False + Introduced for signature consistence with pyarrow.Array.to_numpy. + This must be False here since NumPy arrays' buffer must be contiguous. + + Returns + ------- + array : numpy.ndarray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_numpy() + array([ 2, 2, 4, 4, 5, 100]) + """ + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... + @overload + def cast( + self, + target_type: None = None, + safe: bool | None = None, + options: CastOptions | None = None, + ) -> Self: ... + @overload + def cast( + self, target_type: _CastAs, safe: bool | None = None, options: CastOptions | None = None + ) -> ChunkedArray[Scalar[_CastAs]]: ... + def cast(self, *args, **kwargs): + """ + Cast array values to another data type + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. 
+ options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + cast : Array or ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.type + DataType(int64) + + Change the data type of an array: + + >>> n_legs_seconds = n_legs.cast(pa.duration("s")) + >>> n_legs_seconds.type + DurationType(duration[s]) + """ + def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: + """ + Compute dictionary-encoded representation of array. + + See :func:`pyarrow.compute.dictionary_encode` for full usage. + + Parameters + ---------- + null_encoding : str, default "mask" + How to handle null entries. + + Returns + ------- + encoded : ChunkedArray + A dictionary-encoded version of this array. + + Examples + -------- + >>> import pyarrow as pa + >>> animals = pa.chunked_array( + ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) + ... ) + >>> animals.dictionary_encode() + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 3, + 4, + 5 + ] + ] + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: + """ + Flatten this ChunkedArray. If it has a struct type, the column is + flattened into one array per struct field. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : list of ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> c_arr = pa.chunked_array(n_legs.value_counts()) + >>> c_arr + + [ + -- is_valid: all not null + -- child 0 type: int64 + [ + 2, + 4, + 5, + 100 + ] + -- child 1 type: int64 + [ + 2, + 2, + 1, + 1 + ] + ] + >>> c_arr.flatten() + [ + [ + [ + 2, + 4, + 5, + 100 + ] + ], + [ + [ + 2, + 2, + 1, + 1 + ] + ]] + >>> c_arr.type + StructType(struct) + >>> n_legs.type + DataType(int64) + """ + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_Scalar_co]: + """ + Flatten this ChunkedArray into a single non-chunked array. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.combine_chunks() + + [ + 2, + 2, + 4, + 4, + 5, + 100 + ] + """ + def unique(self) -> ChunkedArray[_Scalar_co]: + """ + Compute distinct elements in array + + Returns + ------- + pyarrow.Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.unique() + + [ + 2, + 4, + 5, + 100 + ] + """ + def value_counts(self) -> StructArray: + """ + Compute counts of unique elements in array. 
+ + Returns + ------- + An array of structs + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.value_counts() + + -- is_valid: all not null + -- child 0 type: int64 + [ + 2, + 4, + 5, + 100 + ] + -- child 1 type: int64 + [ + 2, + 2, + 1, + 1 + ] + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this ChunkedArray + + Parameters + ---------- + offset : int, default 0 + Offset from start of array to slice + length : int, default None + Length of slice (default is until end of batch starting from + offset) + + Returns + ------- + sliced : ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.slice(2, 2) + + [ + [ + 4 + ], + [ + 4 + ] + ] + """ + def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop") -> Self: + """ + Select values from the chunked array. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the chunked array with. + null_selection_behavior : str, default "drop" + How nulls in the mask should be handled. + + Returns + ------- + filtered : Array or ChunkedArray + An array of the same type, with only the elements selected by + the boolean mask. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> mask = pa.array([True, False, None, True, False, True]) + >>> n_legs.filter(mask) + + [ + [ + 2 + ], + [ + 4, + 100 + ] + ] + >>> n_legs.filter(mask, null_selection_behavior="emit_null") + + [ + [ + 2, + null + ], + [ + 4, + 100 + ] + ] + """ + @overload + def index( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + value: Scalar[_DataTypeT] | _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + @overload + def index( + self, + value: Scalar[_DataTypeT], + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + def index(self, *args, **kwargs): + """ + Find the first index of a value. + + See :func:`pyarrow.compute.index` for full usage. + + Parameters + ---------- + value : Scalar or object + The value to look for in the array. + start : int, optional + The start index where to look for `value`. + end : int, optional + The end index where to look for `value`. + memory_pool : MemoryPool, optional + A memory pool for potential memory allocations. + + Returns + ------- + index : Int64Scalar + The index of the value in the array (-1 if not found). + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.index(4) + + >>> n_legs.index(4, start=3) + + """ + def take(self, indices: Indices) -> Self: + """ + Select values from the chunked array. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the array whose values will be returned. 
+ + Returns + ------- + taken : Array or ChunkedArray + An array with the same datatype, containing the taken values. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.take([1, 4, 5]) + + [ + [ + 2, + 5, + 100 + ] + ] + """ + def drop_null(self) -> Self: + """ + Remove missing values from a chunked array. + See :func:`pyarrow.compute.drop_null` for full description. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.drop_null() + + [ + [ + 2, + 2 + ], + [ + 4, + 5, + 100 + ] + ] + """ + def sort(self, order: Order = "ascending", **kwargs) -> Self: + """ + Sort the ChunkedArray + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : ChunkedArray + """ + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Unify dictionaries across all chunks. + + This method returns an equivalent chunked array, but where all + chunks share the same dictionary values. Dictionary indices are + transposed accordingly. + + If there are no dictionaries in the chunked array, it is returned + unchanged. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() + >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() + >>> c_arr = pa.chunked_array([arr_1, arr_2]) + >>> c_arr + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ] + ] + >>> c_arr.unify_dictionaries() + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 3, + 4, + 5 + ] + ] + """ + @property + def num_chunks(self) -> int: + """ + Number of underlying chunks. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs.num_chunks + 2 + """ + def chunk(self, i: int) -> ChunkedArray[_Scalar_co]: + """ + Select a chunk by its index. + + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs.chunk(1) + + [ + 4, + 5, + 100 + ] + """ + @property + def chunks(self) -> list[Array[_Scalar_co]]: + """ + Convert to a list of single-chunked arrays. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.chunks + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ]] + """ + @overload + def iterchunks( + self: ChunkedArray[scalar.NullScalar], + ) -> Generator[array.NullArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.BooleanScalar], + ) -> Generator[array.BooleanArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt8Scalar], + ) -> Generator[array.UInt8Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int8Scalar], + ) -> Generator[array.Int8Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt16Scalar], + ) -> Generator[array.UInt16Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int16Scalar], + ) -> Generator[array.Int16Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt32Scalar], + ) -> Generator[array.UInt32Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int32Scalar], + ) -> Generator[array.Int32Array, None, None]: + """ + Convert to an iterator of ChunkArrays. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> for i in n_legs.iterchunks(): + ... print(i.null_count) + 0 + 1 + + """ + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt64Scalar], + ) -> Generator[array.UInt64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int64Scalar], + ) -> Generator[array.Int64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.HalfFloatScalar], + ) -> Generator[array.HalfFloatArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.FloatScalar], + ) -> Generator[array.FloatArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.DoubleScalar], + ) -> Generator[array.DoubleArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal32Scalar], + ) -> Generator[array.Decimal32Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal64Scalar], + ) -> Generator[array.Decimal64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal128Scalar], + ) -> Generator[array.Decimal128Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal256Scalar], + ) -> Generator[array.Decimal256Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Date32Scalar], + ) -> Generator[array.Date32Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Date64Scalar], + ) -> Generator[array.Date64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Time32Scalar[types._Time32Unit]], + ) -> Generator[array.Time32Array[types._Time32Unit], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Time64Scalar[types._Time64Unit]], + ) -> Generator[array.Time64Array[types._Time64Unit], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.DurationScalar[types._Unit]], + ) -> Generator[array.DurationArray[types._Unit], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.MonthDayNanoIntervalScalar], + ) -> Generator[array.MonthDayNanoIntervalArray, None, None]: ... 
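These per-type overloads narrow the chunk type yielded by `iterchunks` (for example, int64 elements yield `Int64Array` chunks). A minimal runtime sketch of what the annotations describe, using only the public pyarrow API:

import pyarrow as pa

n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])  # element type inferred as int64
for chunk in n_legs.iterchunks():
    # each chunk is a pyarrow.Array (an Int64Array here), not a ChunkedArray,
    # so per-chunk properties such as null_count are available directly
    print(type(chunk).__name__, chunk.null_count)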
+ @overload + def iterchunks( + self: ChunkedArray[scalar.BinaryScalar], + ) -> Generator[array.BinaryArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeBinaryScalar], + ) -> Generator[array.LargeBinaryArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.FixedSizeBinaryScalar], + ) -> Generator[array.FixedSizeBinaryArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.StringScalar], + ) -> Generator[array.StringArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeStringScalar], + ) -> Generator[array.LargeStringArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.BinaryViewScalar], + ) -> Generator[array.BinaryViewArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.StringViewScalar], + ) -> Generator[array.StringViewArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.ListScalar[_DataTypeT]], + ) -> Generator[array.ListArray[scalar.ListScalar[_DataTypeT]], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.FixedSizeListScalar[_DataTypeT, types._Size]], + ) -> Generator[array.FixedSizeListArray[_DataTypeT, types._Size], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeListScalar[_DataTypeT]], + ) -> Generator[array.LargeListArray[_DataTypeT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeListViewScalar[_DataTypeT]], + ) -> Generator[array.LargeListViewArray[_DataTypeT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.StructScalar], + ) -> Generator[array.StructArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.MapScalar[array._MapKeyT, array._MapItemT]], + ) -> Generator[array.MapArray[array._MapKeyT, array._MapItemT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.DictionaryScalar[types._IndexT, types._BasicValueT]], + ) -> Generator[array.DictionaryArray[types._IndexT, types._BasicValueT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.RunEndEncodedScalar], + ) -> Generator[array.RunEndEncodedArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UnionScalar], + ) -> Generator[array.UnionArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Bool8Scalar], + ) -> Generator[array.Bool8Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UuidScalar], + ) -> Generator[array.UuidArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.JsonScalar], + ) -> Generator[array.JsonArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.OpaqueScalar], + ) -> Generator[array.OpaqueArray, None, None]: ... + def iterchunks(self): + """ + Convert to an iterator of ChunkArrays. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> for i in n_legs.iterchunks(): + ... print(i.null_count) + 0 + 1 + + """ + def __iter__(self) -> Iterator[_Scalar_co]: ... + def to_pylist( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[_AsPyType | None]: + """ + Convert to a list of native Python objects. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. 
+ The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.to_pylist() + [2, 2, 4, 4, None, 100] + """ + def __arrow_c_stream__(self, requested_schema=None) -> Any: + """ + Export to a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: + """ + Import ChunkedArray from a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + stream: PyCapsule + A capsule containing a C ArrowArrayStream PyCapsule. + + Returns + ------- + ChunkedArray + """ + @property + def is_cpu(self) -> bool: + """ + Whether all chunks in the ChunkedArray are CPU-accessible. + """ + +@overload +def chunked_array( + values: Iterable[NullableCollection[bool]], + type: None = None, +) -> ChunkedArray[scalar.BooleanScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[int]], + type: None = None, +) -> ChunkedArray[scalar.Int64Scalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[float]], + type: None = None, +) -> ChunkedArray[scalar.DoubleScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[Decimal]], + type: None = None, +) -> ChunkedArray[scalar.Decimal128Scalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dict[str, Any]]], + type: None = None, +) -> ChunkedArray[scalar.StructScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dt.datetime]], + type: None = None, +) -> ChunkedArray[scalar.TimestampScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dt.date]], + type: None = None, +) -> ChunkedArray[scalar.Date32Scalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dt.time]], + type: None = None, +) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dt.timedelta]], + type: None = None, +) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[MonthDayNano]], + type: None = None, +) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[str]], + type: None = None, +) -> ChunkedArray[scalar.StringScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[bytes]], + type: None = None, +) -> ChunkedArray[scalar.BinaryScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[list[Any]]], + type: None = None, +) -> ChunkedArray[scalar.ListScalar[Any]]: ... 
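The overloads above encode pyarrow's default type inference when no `type` is passed; the stubbed return types can be compared against the runtime behaviour with a small sketch like this (the printed types are pyarrow's defaults):

import datetime as dt
from decimal import Decimal

import pyarrow as pa

# Default inference used by chunked_array when type=None, matching the
# overload return types above (int64, double, string, date32, timestamp[us], ...).
print(pa.chunked_array([[1, 2, 3]]).type)                  # int64
print(pa.chunked_array([[1.5, 2.5]]).type)                 # double
print(pa.chunked_array([["a", "b"]]).type)                 # string
print(pa.chunked_array([[Decimal("1.10")]]).type)          # decimal128(3, 2)
print(pa.chunked_array([[dt.date(2020, 1, 1)]]).type)      # date32[day]
print(pa.chunked_array([[dt.datetime(2020, 1, 1)]]).type)  # timestamp[us]
print(pa.chunked_array([[dt.timedelta(seconds=1)]]).type)  # duration[us]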
+@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["null"] | types.NullType, +) -> ChunkedArray[scalar.NullScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["bool", "boolean"] | types.BoolType, +) -> ChunkedArray[scalar.BooleanScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["i1", "int8"] | types.Int8Type, +) -> ChunkedArray[scalar.Int8Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["i2", "int16"] | types.Int16Type, +) -> ChunkedArray[scalar.Int16Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["i4", "int32"] | types.Int32Type, +) -> ChunkedArray[scalar.Int32Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["i8", "int64"] | types.Int64Type, +) -> ChunkedArray[scalar.Int64Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["u1", "uint8"] | types.UInt8Type, +) -> ChunkedArray[scalar.UInt8Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["u2", "uint16"] | types.UInt16Type, +) -> ChunkedArray[scalar.UInt16Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["u4", "uint32"] | types.Uint32Type, +) -> ChunkedArray[scalar.UInt32Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["u8", "uint64"] | types.UInt64Type, +) -> ChunkedArray[scalar.UInt64Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["f2", "halffloat", "float16"] | types.Float16Type, +) -> ChunkedArray[scalar.HalfFloatScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["f4", "float", "float32"] | types.Float32Type, +) -> ChunkedArray[scalar.FloatScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["f8", "double", "float64"] | types.Float64Type, +) -> ChunkedArray[scalar.DoubleScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["string", "str", "utf8"] | types.StringType, +) -> ChunkedArray[scalar.StringScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["binary"] | types.BinaryType, +) -> ChunkedArray[scalar.BinaryScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, +) -> ChunkedArray[scalar.LargeStringScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["large_binary"] | types.LargeBinaryType, +) -> ChunkedArray[scalar.LargeBinaryScalar]: ... 
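The string literals in these overloads are the usual pyarrow type aliases; passing either the alias or the corresponding DataType object selects the same element type. A short sketch, assuming nothing beyond the documented `chunked_array` signature:

import pyarrow as pa

a = pa.chunked_array([[1, 2, 3]], type="i2")           # alias for pa.int16()
b = pa.chunked_array([[1, 2, 3]], type=pa.uint32())    # equivalent DataType form
c = pa.chunked_array([["x", "y"]], type="large_string")
print(a.type, b.type, c.type)                          # int16 uint32 large_string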
+@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["binary_view"] | types.BinaryViewType, +) -> ChunkedArray[scalar.BinaryViewScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["string_view"] | types.StringViewType, +) -> ChunkedArray[scalar.StringViewScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["date32", "date32[day]"] | types.Date32Type, +) -> ChunkedArray[scalar.Date32Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["date64", "date64[ms]"] | types.Date64Type, +) -> ChunkedArray[scalar.Date64Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], +) -> ChunkedArray[scalar.Time32Scalar[Literal["s"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], +) -> ChunkedArray[scalar.Time32Scalar[Literal["ms"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], +) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], +) -> ChunkedArray[scalar.Time64Scalar[Literal["ns"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], +) -> ChunkedArray[scalar.TimestampScalar[Literal["s"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], +) -> ChunkedArray[scalar.TimestampScalar[Literal["ms"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], +) -> ChunkedArray[scalar.TimestampScalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["timestamp[ns]"] | types.TimestampType[Literal["ns"]], +) -> ChunkedArray[scalar.TimestampScalar[Literal["ns"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["duration[s]"] | types.DurationType[Literal["s"]], +) -> ChunkedArray[scalar.DurationScalar[Literal["s"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], +) -> ChunkedArray[scalar.DurationScalar[Literal["ms"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["duration[us]"] | types.DurationType[Literal["us"]], +) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... 
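Temporal aliases carry their unit in the literal (for example "timestamp[us]" or "duration[ms]"), and the overloads narrow the scalar's unit parameter to match. A small sketch of the runtime side, with integer inputs interpreted in the requested unit:

import pyarrow as pa

ts = pa.chunked_array([[0, 1_000_000]], type="timestamp[us]")  # microseconds since epoch
du = pa.chunked_array([[5, 10]], type=pa.duration("ms"))
print(ts.type)  # timestamp[us]
print(du.type)  # duration[ms]
print(ts[0])    # 1970-01-01 00:00:00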
+@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], +) -> ChunkedArray[scalar.DurationScalar[Literal["ns"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any]] | SupportArrowStream | SupportArrowArray, + type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, +) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... +@overload +def chunked_array( + values: Iterable[Array[_ScalarT]], + type: None = None, +) -> ChunkedArray[_ScalarT]: ... +def chunked_array(value, type=None): + """ + Construct chunked array from list of array-like objects + + Parameters + ---------- + arrays : Array, list of Array, or array-like + Must all be the same data type. Can be empty only if type also passed. + Any Arrow-compatible array that implements the Arrow PyCapsule Protocol + (has an ``__arrow_c_array__`` or ``__arrow_c_stream__`` method) can be + passed as well. + type : DataType or string coercible to DataType + + Returns + ------- + ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> pa.chunked_array([], type=pa.int8()) + + [ + ... + ] + + >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + """ + +_ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any]) + +class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: + """ + Return the dataframe interchange object implementing the interchange protocol. + + Parameters + ---------- + nan_as_null : bool, default False + Whether to tell the DataFrame to overwrite null values in the data + with ``NaN`` (or ``NaT``). + allow_copy : bool, default True + Whether to allow memory copying when exporting. If set to False + it would cause non-zero-copy exports to fail. + + Returns + ------- + DataFrame interchange object + The object which consuming library can use to ingress the dataframe. + + Notes + ----- + Details on the interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + """ + @overload + def __getitem__(self, key: int | str) -> _ColumnT: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + def __getitem__(self, key): + """ + Slice or return column at given index or column name + + Parameters + ---------- + key : integer, str, or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + Array (from RecordBatch) or ChunkedArray (from Table) for column input. + RecordBatch or Table for slice input. + """ + def __len__(self) -> int: ... + def column(self, i: int | str) -> _ColumnT: + """ + Select single column from Table or RecordBatch. + + Parameters + ---------- + i : int or string + The index or name of the column to retrieve. + + Returns + ------- + column : Array (for RecordBatch) or ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... 
} + ... ) + >>> table = pa.Table.from_pandas(df) + + Select a column by numeric index: + + >>> table.column(0) + + [ + [ + 2, + 4, + 5, + 100 + ] + ] + + Select a column by its name: + + >>> table.column("animals") + + [ + [ + "Flamingo", + "Horse", + "Brittle stars", + "Centipede" + ] + ] + """ + @property + def column_names(self) -> list[str]: + """ + Names of the Table or RecordBatch columns. + + Returns + ------- + list of str + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=["n_legs", "animals"], + ... ) + >>> table.column_names + ['n_legs', 'animals'] + """ + @property + def columns(self) -> list[_ColumnT]: + """ + List of all columns in numerical order. + + Returns + ------- + columns : list of Array (for RecordBatch) or list of ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.columns + [ + [ + [ + null, + 4, + 5, + null + ] + ], + [ + [ + "Flamingo", + "Horse", + null, + "Centipede" + ] + ]] + """ + def drop_null(self) -> Self: + """ + Remove rows that contain missing values from a Table or RecordBatch. + + See :func:`pyarrow.compute.drop_null` for full usage. + + Returns + ------- + Table or RecordBatch + A tabular object with the same schema, with rows containing + no missing values. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [None, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", None, "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.drop_null() + pyarrow.Table + year: double + n_legs: int64 + animals: string + ---- + year: [[2022,2021]] + n_legs: [[4,100]] + animals: [["Horse","Centipede"]] + """ + def field(self, i: int | str) -> Field: + """ + Select a schema field by its column name or numeric index. + + Parameters + ---------- + i : int or string + The index or name of the field to retrieve. + + Returns + ------- + Field + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.field(0) + pyarrow.Field + >>> table.field(1) + pyarrow.Field + """ + @classmethod + def from_pydict( + cls, + mapping: Mapping[str, ArrayOrChunkedArray[Any] | list[Any] | np.ndarray], + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: + """ + Construct a Table or RecordBatch from Arrow arrays or columns. + + Parameters + ---------- + mapping : dict or Mapping + A mapping of strings to Arrays or Python lists. + schema : Schema, default None + If not passed, will be inferred from the Mapping values. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). 
+ + Returns + ------- + Table or RecordBatch + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> pydict = {"n_legs": n_legs, "animals": animals} + + Construct a Table from a dictionary of arrays: + + >>> pa.Table.from_pydict(pydict) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_pydict(pydict).schema + n_legs: int64 + animals: string + + Construct a Table from a dictionary of arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a dictionary of arrays with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.Table.from_pydict(pydict, schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + @classmethod + def from_pylist( + cls, + mapping: Sequence[Mapping[str, Any]], + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: + """ + Construct a Table or RecordBatch from list of rows / dictionaries. + + Parameters + ---------- + mapping : list of dicts of rows + A mapping of strings to row values. + schema : Schema, default None + If not passed, will be inferred from the first row of the + mapping values. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). + + Returns + ------- + Table or RecordBatch + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] + + Construct a Table from a list of rows: + + >>> pa.Table.from_pylist(pylist) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4]] + animals: [["Flamingo","Dog"]] + + Construct a Table from a list of rows with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pylist(pylist, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a list of rows with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.Table.from_pylist(pylist, schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + def itercolumns(self) -> Generator[_ColumnT, None, None]: + """ + Iterator over all columns in their numerical order. + + Yields + ------ + Array (for RecordBatch) or ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> for i in table.itercolumns(): + ... 
print(i.null_count) + 2 + 1 + """ + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def shape(self) -> tuple[int, int]: + """ + Dimensions of the table or record batch: (#rows, #columns). + + Returns + ------- + (int, int) + Number of rows and number of columns. + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table.shape + (4, 2) + """ + @property + def schema(self) -> Schema: ... + @property + def nbytes(self) -> int: ... + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: + """ + Sort the Table or RecordBatch by one or multiple columns. + + Parameters + ---------- + sorting : str or list[tuple(name, order)] + Name of the column to use to sort (ascending), or + a list of multiple sorting conditions where + each entry is a tuple with column name + and sorting order ("ascending" or "descending") + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + Table or RecordBatch + A new tabular object sorted according to the sort keys. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.sort_by("animal") + pyarrow.Table + year: int64 + n_legs: int64 + animal: string + ---- + year: [[2019,2021,2021,2020,2022,2022]] + n_legs: [[5,100,4,2,4,2]] + animal: [["Brittle stars","Centipede","Dog","Flamingo","Horse","Parrot"]] + """ + def take(self, indices: Indices) -> Self: + """ + Select rows from a Table or RecordBatch. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the tabular object whose rows will be returned. + + Returns + ------- + Table or RecordBatch + A tabular object with the same schema, containing the taken rows. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.take([1, 3]) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2022,2021]] + n_legs: [[4,100]] + animals: [["Horse","Centipede"]] + """ + def filter( + self, mask: Mask | Expression, null_selection_behavior: NullSelectionBehavior = "drop" + ) -> Self: + """ + Select rows from the table or record batch based on a boolean mask. + + The Table can be filtered based on a mask, which will be passed to + :func:`pyarrow.compute.filter` to perform the filtering, or it can + be filtered through a boolean :class:`.Expression` + + Parameters + ---------- + mask : Array or array-like or .Expression + The boolean mask or the :class:`.Expression` to filter the table with. + null_selection_behavior : str, default "drop" + How nulls in the mask should be handled, does nothing if + an :class:`.Expression` is used. 
+ + Returns + ------- + filtered : Table or RecordBatch + A tabular object of the same schema, with only the rows selected + by applied filtering + + Examples + -------- + Using a Table (works similarly for RecordBatch): + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Define an expression and select rows: + + >>> import pyarrow.compute as pc + >>> expr = pc.field("year") <= 2020 + >>> table.filter(expr) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2019]] + n_legs: [[2,5]] + animals: [["Flamingo","Brittle stars"]] + + Define a mask and select rows: + + >>> mask = [True, True, False, None] + >>> table.filter(mask) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022]] + n_legs: [[2,4]] + animals: [["Flamingo","Horse"]] + >>> table.filter(mask, null_selection_behavior="emit_null") + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,null]] + n_legs: [[2,4,null]] + animals: [["Flamingo","Horse",null]] + """ + def to_pydict( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> dict[str, list[Any]]: + """ + Convert the Table or RecordBatch to a dict or OrderedDict. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + + Returns + ------- + dict + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> table = pa.Table.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> table.to_pydict() + {'n_legs': [2, 2, 4, 4, 5, 100], 'animals': ['Flamingo', 'Parrot', ..., 'Centipede']} + """ + def to_pylist( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> list[dict[str, Any]]: + """ + Convert the Table or RecordBatch to a list of rows / dictionaries. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. 
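A minimal sketch of the `maps_as_pydicts` behaviour just described, for a single map column (the column name and values are made up for illustration):

import pyarrow as pa

t = pa.table({"m": pa.array([[("a", 1), ("b", 2)]], type=pa.map_(pa.string(), pa.int64()))})
print(t.to_pylist())                          # [{'m': [('a', 1), ('b', 2)]}]  association list
print(t.to_pylist(maps_as_pydicts="strict"))  # [{'m': {'a': 1, 'b': 2}}]      plain dict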
+ + Returns + ------- + list + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> data = [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]] + >>> table = pa.table(data, names=["n_legs", "animals"]) + >>> table.to_pylist() + [{'n_legs': 2, 'animals': 'Flamingo'}, {'n_legs': 4, 'animals': 'Horse'}, ... + """ + def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: + """ + Return human-readable string representation of Table or RecordBatch. + + Parameters + ---------- + show_metadata : bool, default False + Display Field-level and Schema-level KeyValueMetadata. + preview_cols : int, default 0 + Display values of the columns for the first N columns. + + Returns + ------- + str + """ + def remove_column(self, i: int) -> Self: ... + def drop_columns(self, columns: str | list[str]) -> Self: + """ + Drop one or more columns and return a new Table or RecordBatch. + + Parameters + ---------- + columns : str or list[str] + Field name(s) referencing existing column(s). + + Raises + ------ + KeyError + If any of the passed column names do not exist. + + Returns + ------- + Table or RecordBatch + A tabular object without the column(s). + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Drop one column: + + >>> table.drop_columns("animals") + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,4,5,100]] + + Drop one or more columns: + + >>> table.drop_columns(["n_legs", "animals"]) + pyarrow.Table + ... + ---- + """ + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: ... + def append_column( + self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: + """ + Append column at end of columns. + + Parameters + ---------- + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + Table or RecordBatch + New table or record batch with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Append column at the end: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.append_column("year", [year]) + pyarrow.Table + n_legs: int64 + animals: string + year: int64 + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + year: [[2021,2022,2019,2021]] + """ + +class RecordBatch(_Tabular[Array]): + """ + Batch of rows of columns of equal length + + Warnings + -------- + Do not call this class's constructor directly, use one of the + ``RecordBatch.from_*`` functions instead. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Constructing a RecordBatch from arrays: + + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Constructing a RecordBatch from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.RecordBatch.from_pandas(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_pandas(df).to_pandas() + year month day n_legs animals + 0 2020 3 1 2 Flamingo + 1 2022 5 5 4 Horse + 2 2021 7 9 5 Brittle stars + 3 2022 9 13 100 Centipede + + Constructing a RecordBatch from pylist: + + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] + >>> pa.RecordBatch.from_pylist(pylist).to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Dog + + You can also construct a RecordBatch using :func:`pyarrow.record_batch`: + + >>> pa.record_batch([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + >>> pa.record_batch(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + def replace_schema_metadata( + self, metadata: dict[str | bytes, str | bytes] | None = None + ) -> Self: + """ + Create shallow copy of record batch by replacing schema + key-value metadata with the indicated new metadata (which may be None, + which deletes any existing metadata + + Parameters + ---------- + metadata : dict, default None + + Returns + ------- + shallow_copy : RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + + Constructing a RecordBatch with schema and metadata: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64())], metadata={"n_legs": "Number of legs per animal"} + ... 
) + >>> batch = pa.RecordBatch.from_arrays([n_legs], schema=my_schema) + >>> batch.schema + n_legs: int64 + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Shallow copy of a RecordBatch with deleted schema metadata: + + >>> batch.replace_schema_metadata().schema + n_legs: int64 + """ + @property + def num_columns(self) -> int: + """ + Number of columns + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.num_columns + 2 + """ + + @property + def num_rows(self) -> int: + """ + Number of rows + + Due to the definition of a RecordBatch, all columns have the same + number of rows. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.num_rows + 6 + """ + @property + def schema(self) -> Schema: + """ + Schema of the RecordBatch and its columns + + Returns + ------- + pyarrow.Schema + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.schema + n_legs: int64 + animals: string + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the record batch. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.nbytes + 116 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the record batch + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.get_total_buffer_size() + 120 + """ + + def __sizeof__(self) -> int: ... + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: + """ + Add column to RecordBatch at position i. 
+ + A new record batch is returned with the column added, the original record batch + object is left unchanged. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + RecordBatch + New record batch with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + + Add column: + + >>> year = [2021, 2022, 2019, 2021] + >>> batch.add_column(0, "year", year) + pyarrow.RecordBatch + year: int64 + n_legs: int64 + animals: string + ---- + year: [2021,2022,2019,2021] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Original record batch is left unchanged: + + >>> batch + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + def remove_column(self, i: int) -> Self: + """ + Create new RecordBatch with the indicated column removed. + + Parameters + ---------- + i : int + Index of column to remove. + + Returns + ------- + Table + New record batch without the column. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch.remove_column(1) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,4,5,100] + """ + def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: + """ + Replace column in RecordBatch at position. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + RecordBatch + New record batch with the passed column set. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + + Replace a column: + + >>> year = [2021, 2022, 2019, 2021] + >>> batch.set_column(1, "year", year) + pyarrow.RecordBatch + n_legs: int64 + year: int64 + ---- + n_legs: [2,4,5,100] + year: [2021,2022,2019,2021] + """ + @overload + def rename_columns(self, names: list[str]) -> Self: ... + @overload + def rename_columns(self, names: dict[str, str]) -> Self: ... + def rename_columns(self, names): + """ + Create new record batch with columns renamed to provided names. + + Parameters + ---------- + names : list[str] or dict[str, str] + List of new column names or mapping of old column names to new column names. + + If a mapping of old to new column names is passed, then all columns which are + found to match a provided old column name will be renamed to the new column name. + If any column names are not found in the mapping, a KeyError will be raised. + + Raises + ------ + KeyError + If any of the column names passed in the names mapping do not exist. 
+ + Returns + ------- + RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> new_names = ["n", "name"] + >>> batch.rename_columns(new_names) + pyarrow.RecordBatch + n: int64 + name: string + ---- + n: [2,4,5,100] + name: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> new_names = {"n_legs": "n", "animals": "name"} + >>> batch.rename_columns(new_names) + pyarrow.RecordBatch + n: int64 + name: string + ---- + n: [2,4,5,100] + name: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write RecordBatch to Buffer as encapsulated IPC message, which does not + include a Schema. + + To reconstruct a RecordBatch from the encapsulated IPC message Buffer + returned by this function, a Schema must be passed separately. See + Examples. + + Parameters + ---------- + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> buf = batch.serialize() + >>> buf + + + Reconstruct RecordBatch from IPC message Buffer and original Schema + + >>> pa.ipc.read_record_batch(buf, batch.schema) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this RecordBatch + + Parameters + ---------- + offset : int, default 0 + Offset from start of record batch to slice + length : int, default None + Length of slice (default is until end of batch starting from + offset) + + Returns + ------- + sliced : RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + >>> batch.slice(offset=3).to_pandas() + n_legs animals + 0 4 Horse + 1 5 Brittle stars + 2 100 Centipede + >>> batch.slice(length=2).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + >>> batch.slice(offset=3, length=1).to_pandas() + n_legs animals + 0 4 Horse + """ + def equals(self, other: Self, check_metadata: bool = False) -> bool: + """ + Check if contents of two record batches are equal. + + Parameters + ---------- + other : pyarrow.RecordBatch + RecordBatch to compare against. + check_metadata : bool, default False + Whether schema metadata equality should be checked as well. + + Returns + ------- + are_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... 
) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch_0 = pa.record_batch([]) + >>> batch_1 = pa.RecordBatch.from_arrays( + ... [n_legs, animals], + ... names=["n_legs", "animals"], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> batch.equals(batch) + True + >>> batch.equals(batch_0) + False + >>> batch.equals(batch_1) + True + >>> batch.equals(batch_1, check_metadata=True) + False + """ + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: + """ + Select columns of the RecordBatch. + + Returns a new RecordBatch with the specified columns, and metadata + preserved. + + Parameters + ---------- + columns : list-like + The column names or integer indices to select. + + Returns + ------- + RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"]) + + Select columns my indices: + + >>> batch.select([1]) + pyarrow.RecordBatch + animals: string + ---- + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + + Select columns by names: + + >>> batch.select(["n_legs"]) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,2,4,4,5,100] + """ + def cast( + self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None + ) -> Self: + """ + Cast record batch values to another schema. + + Parameters + ---------- + target_schema : Schema + Schema to cast to, the names and order of fields must match. + safe : bool, default True + Check for overflows or other unsafe conversions. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... + + Define new schema and cast batch values: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] + ... ) + >>> batch.cast(target_schema=my_schema) + pyarrow.RecordBatch + n_legs: duration[s] + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + @classmethod + def from_arrays( + cls, + arrays: Collection[Array], + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: + """ + Construct a RecordBatch from multiple pyarrow.Arrays + + Parameters + ---------- + arrays : list of pyarrow.Array + One for each field in RecordBatch + names : list of str, optional + Names for the batch fields. If not passed, schema must be passed + schema : Schema, default None + Schema for the created batch. If not passed, names must be passed + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). + + Returns + ------- + pyarrow.RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... 
["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> names = ["n_legs", "animals"] + + Construct a RecordBatch from pyarrow Arrays using names: + + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Construct a RecordBatch from pyarrow Arrays using schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: list[str] | None = None, + ) -> Self: + """ + Convert pandas.DataFrame to an Arrow RecordBatch + + Parameters + ---------- + df : pandas.DataFrame + schema : pyarrow.Schema, optional + The expected schema of the RecordBatch. This can be used to + indicate the type of columns if we cannot infer it automatically. + If passed, the output will have exactly this schema. Columns + specified in the schema that are not found in the DataFrame columns + or its index will raise an error. Additional columns or index + levels in the DataFrame which are not specified in the schema will + be ignored. + preserve_index : bool, optional + Whether to store the index as an additional column in the resulting + ``RecordBatch``. The default of None will store the index as a + column, except for RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + nthreads : int, default None + If greater than 1, convert columns to Arrow in parallel using + indicated number of threads. By default, this follows + :func:`pyarrow.cpu_count` (may use up to system CPU count threads). + columns : list, optional + List of column to be converted. If None, use all columns. + + Returns + ------- + pyarrow.RecordBatch + + + Examples + -------- + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Convert pandas DataFrame to RecordBatch: + + >>> import pyarrow as pa + >>> pa.RecordBatch.from_pandas(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Convert pandas DataFrame to RecordBatch using schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... 
) + >>> pa.RecordBatch.from_pandas(df, schema=my_schema) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Convert pandas DataFrame to RecordBatch specifying columns: + + >>> pa.RecordBatch.from_pandas(df, columns=["n_legs"]) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,4,5,100] + """ + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] + ) -> Self: + """ + Construct a RecordBatch from a StructArray. + + Each field in the StructArray will become a column in the resulting + ``RecordBatch``. + + Parameters + ---------- + struct_array : StructArray + Array to construct the record batch from. + + Returns + ------- + pyarrow.RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> pa.RecordBatch.from_struct_array(struct).to_pandas() + animals n_legs year + 0 Parrot 2 NaN + 1 None 4 2022.0 + """ + def to_struct_array(self) -> StructArray: + """ + Convert to a struct array. + """ + def to_tensor( + self, + null_to_nan: bool = False, + row_major: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Tensor: + """ + Convert to a :class:`~pyarrow.Tensor`. + + RecordBatches that can be converted have fields of type signed or unsigned + integer or float, including all bit-widths. + + ``null_to_nan`` is ``False`` by default and this method will raise an error in case + any nulls are present. RecordBatches with nulls can be converted with ``null_to_nan`` + set to ``True``. In this case null values are converted to ``NaN`` and integer type + arrays are promoted to the appropriate float type. + + Parameters + ---------- + null_to_nan : bool, default False + Whether to write null values in the result as ``NaN``. + row_major : bool, default True + Whether resulting Tensor is row-major or column-major + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Examples + -------- + >>> import pyarrow as pa + >>> batch = pa.record_batch( + ... [ + ... pa.array([1, 2, 3, 4, None], type=pa.int32()), + ... pa.array([10, 20, 30, 40, None], type=pa.float32()), + ... ], + ... names=["a", "b"], + ... ) + + >>> batch + pyarrow.RecordBatch + a: int32 + b: float + ---- + a: [1,2,3,4,null] + b: [10,20,30,40,null] + + Convert a RecordBatch to row-major Tensor with null values + written as ``NaN``s + + >>> batch.to_tensor(null_to_nan=True) + + type: double + shape: (5, 2) + strides: (16, 8) + >>> batch.to_tensor(null_to_nan=True).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + + Convert a RecordBatch to column-major Tensor + + >>> batch.to_tensor(null_to_nan=True, row_major=False) + + type: double + shape: (5, 2) + strides: (8, 40) + >>> batch.to_tensor(null_to_nan=True, row_major=False).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + """ + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): + """ + Export to a C ArrowArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the record batch + schema is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. 
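+
+ A minimal round-trip sketch (illustrative only; it assumes the optional
+ ``pyarrow.cffi`` bindings are installed and exports the batch into
+ caller-allocated C structs, then imports it back):
+
+ >>> import pyarrow as pa
+ >>> from pyarrow.cffi import ffi
+ >>> batch = pa.record_batch({"n_legs": [2, 4]})
+ >>> c_array = ffi.new("struct ArrowArray*")
+ >>> c_schema = ffi.new("struct ArrowSchema*")
+ >>> array_ptr = int(ffi.cast("uintptr_t", c_array))
+ >>> schema_ptr = int(ffi.cast("uintptr_t", c_schema))
+ >>> batch._export_to_c(array_ptr, schema_ptr)
+ >>> pa.RecordBatch._import_from_c(array_ptr, schema_ptr).equals(batch)
+ True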
+ + Be careful: if you don't pass the ArrowArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: + """ + Import RecordBatch from a C ArrowArray struct, given its pointer + and the imported schema. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArray struct. + type: Schema or int + Either a Schema object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_array__(self, requested_schema=None): + """ + Get a pair of PyCapsules containing a C ArrowArray representation of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the batch to this schema. + If None, the batch will be returned as-is, with a schema matching the + one returned by :meth:`__arrow_c_schema__()`. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowArray, + respectively. + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the batch as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. + + Returns + ------- + PyCapsule + """ + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: + """ + Import RecordBatch from a pair of PyCapsules containing a C ArrowSchema + and ArrowArray, respectively. + + Parameters + ---------- + schema_capsule : PyCapsule + A PyCapsule containing a C ArrowSchema representation of the schema. + array_capsule : PyCapsule + A PyCapsule containing a C ArrowArray representation of the array. + + Returns + ------- + pyarrow.RecordBatch + """ + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: + """ + Export to a C ArrowDeviceArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the record batch + schema is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: + """ + Import RecordBatch from a C ArrowDeviceArray struct, given its pointer + and the imported schema. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + type: Schema or int + Either a Schema object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_device_array__(self, requested_schema=None, **kwargs): + """ + Get a pair of PyCapsules containing a C ArrowDeviceArray representation + of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. 
PyArrow will attempt to cast the batch to this data type. + If None, the batch will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + kwargs + Currently no additional keyword arguments are supported, but + this method will accept any keyword with a value of ``None`` + for compatibility with future keywords. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, + respectively. + """ + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: + """ + Import RecordBatch from a pair of PyCapsules containing a + C ArrowSchema and ArrowDeviceArray, respectively. + + Parameters + ---------- + schema_capsule : PyCapsule + A PyCapsule containing a C ArrowSchema representation of the schema. + array_capsule : PyCapsule + A PyCapsule containing a C ArrowDeviceArray representation of the array. + + Returns + ------- + pyarrow.RecordBatch + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + The device type where the arrays in the RecordBatch reside. + + Returns + ------- + DeviceAllocationType + """ + @property + def is_cpu(self) -> bool: + """ + Whether the RecordBatch's arrays are CPU-accessible. + """ + def copy_to(self, destination: MemoryManager | Device) -> Self: + """ + Copy the entire RecordBatch to destination device. + + This copies each column of the record batch to create + a new record batch where all underlying buffers for the columns have + been copied to the destination MemoryManager. + + Parameters + ---------- + destination : pyarrow.MemoryManager or pyarrow.Device + The destination device to copy the array to. + + Returns + ------- + RecordBatch + """ + +def table_to_blocks(options, table: Table, categories, extension_columns): ... + +JoinType: TypeAlias = Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", +] + +class Table(_Tabular[ChunkedArray[Any]]): + """ + A collection of top-level named, equal length Arrow arrays. + + Warnings + -------- + Do not call this class's constructor directly, use one of the ``from_*`` + methods instead. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from arrays: + + >>> pa.Table.from_arrays([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a RecordBatch: + + >>> batch = pa.record_batch([n_legs, animals], names=names) + >>> pa.Table.from_batches([batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> pa.Table.from_pandas(df) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a dictionary of arrays: + + >>> pydict = {"n_legs": n_legs, "animals": animals} + >>> pa.Table.from_pydict(pydict) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_pydict(pydict).schema + n_legs: int64 + animals: string + + Construct a Table from a dictionary of arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a list of rows: + + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"year": 2021, "animals": "Centipede"}] + >>> pa.Table.from_pylist(pylist) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,null]] + animals: [["Flamingo","Centipede"]] + + Construct a Table from a list of rows with pyarrow schema: + + >>> my_schema = pa.schema( + ... [ + ... pa.field("year", pa.int64()), + ... pa.field("n_legs", pa.int64()), + ... pa.field("animals", pa.string()), + ... ], + ... metadata={"year": "Year of entry"}, + ... ) + >>> pa.Table.from_pylist(pylist, schema=my_schema).schema + year: int64 + n_legs: int64 + animals: string + -- schema metadata -- + year: 'Year of entry' + + Construct a Table with :func:`pyarrow.table`: + + >>> pa.table([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this Table. + + Parameters + ---------- + offset : int, default 0 + Offset from start of table to slice. + length : int, default None + Length of slice (default is until end of table starting from + offset). + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + >>> table.slice(length=3) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019]] + n_legs: [[2,4,5]] + animals: [["Flamingo","Horse","Brittle stars"]] + >>> table.slice(offset=2) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2019,2021]] + n_legs: [[5,100]] + animals: [["Brittle stars","Centipede"]] + >>> table.slice(offset=2, length=1) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2019]] + n_legs: [[5]] + animals: [["Brittle stars"]] + """ + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: + """ + Select columns of the Table. + + Returns a new Table with the specified columns, and metadata + preserved. + + Parameters + ---------- + columns : list-like + The column names or integer indices to select. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.select([0, 1]) + pyarrow.Table + year: int64 + n_legs: int64 + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + >>> table.select(["year"]) + pyarrow.Table + year: int64 + ---- + year: [[2020,2022,2019,2021]] + """ + def replace_schema_metadata( + self, metadata: dict[str | bytes, str | bytes] | None = None + ) -> Self: + """ + Create shallow copy of table by replacing schema + key-value metadata with the indicated new metadata (which may be None), + which deletes any existing metadata. + + Parameters + ---------- + metadata : dict, default None + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Constructing a Table with pyarrow schema and metadata: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> table = pa.table(df, my_schema) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: ... + + Create a shallow copy of a Table with deleted schema metadata: + + >>> table.replace_schema_metadata().schema + n_legs: int64 + animals: string + + Create a shallow copy of a Table with new schema metadata: + + >>> metadata = {"animals": "Which animal"} + >>> table.replace_schema_metadata(metadata=metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + animals: 'Which animal' + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Flatten this Table. + + Each column with a struct type is flattened + into one column per struct field. Other columns are left unchanged. 
+ + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> month = pa.array([4, 6]) + >>> table = pa.Table.from_arrays([struct, month], names=["a", "month"]) + >>> table + pyarrow.Table + a: struct + child 0, animals: string + child 1, n_legs: int64 + child 2, year: int64 + month: int64 + ---- + a: [ + -- is_valid: all not null + -- child 0 type: string + ["Parrot",null] + -- child 1 type: int64 + [2,4] + -- child 2 type: int64 + [null,2022]] + month: [[4,6]] + + Flatten the columns with struct field: + + >>> table.flatten() + pyarrow.Table + a.animals: string + a.n_legs: int64 + a.year: int64 + month: int64 + ---- + a.animals: [["Parrot",null]] + a.n_legs: [[2,4]] + a.year: [[null,2022]] + month: [[4,6]] + """ + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Make a new table by combining the chunks this table has. + + All the underlying chunks in the ChunkedArray of each column are + concatenated into zero or one chunk. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] + ... ) + >>> names = ["n_legs", "animals"] + >>> table = pa.table([n_legs, animals], names=names) + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4],[4,5,100]] + animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] + >>> table.combine_chunks() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4,4,5,100]] + animals: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] + """ + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Unify dictionaries across all chunks. + + This method returns an equivalent table, but where all chunks of + each column share the same dictionary values. Dictionary indices + are transposed accordingly. + + Columns without dictionaries are returned unchanged. 
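+
+ A small sketch (in addition to the examples below) verifying that, after
+ unification, the chunks of a dictionary column share a single dictionary:
+
+ >>> import pyarrow as pa
+ >>> chunks = pa.chunked_array(
+ ... [pa.array(["a", "b"]).dictionary_encode(), pa.array(["b", "c"]).dictionary_encode()]
+ ... )
+ >>> unified = pa.table({"vals": chunks}).unify_dictionaries()
+ >>> col = unified.column("vals")
+ >>> col.chunk(0).dictionary.equals(col.chunk(1).dictionary)
+ True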
+ + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() + >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() + >>> c_arr = pa.chunked_array([arr_1, arr_2]) + >>> table = pa.table([c_arr], names=["animals"]) + >>> table + pyarrow.Table + animals: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Parrot","Dog"] -- indices: + [0,1,2], -- dictionary: + ["Horse","Brittle stars","Centipede"] -- indices: + [0,1,2]] + + Unify dictionaries across both chunks: + + >>> table.unify_dictionaries() + pyarrow.Table + animals: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: + [0,1,2], -- dictionary: + ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: + [3,4,5]] + """ + def equals(self, other: Self, check_metadata: bool = False) -> Self: + """ + Check if contents of two tables are equal. + + Parameters + ---------- + other : pyarrow.Table + Table to compare against. + check_metadata : bool, default False + Whether schema metadata equality should be checked as well. + + Returns + ------- + bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> names = ["n_legs", "animals"] + >>> table = pa.Table.from_arrays([n_legs, animals], names=names) + >>> table_0 = pa.Table.from_arrays([]) + >>> table_1 = pa.Table.from_arrays( + ... [n_legs, animals], names=names, metadata={"n_legs": "Number of legs per animal"} + ... ) + >>> table.equals(table) + True + >>> table.equals(table_0) + False + >>> table.equals(table_1) + True + >>> table.equals(table_1, check_metadata=True) + False + """ + def cast( + self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None + ) -> Self: + """ + Cast table values to another schema. + + Parameters + ---------- + target_schema : Schema + Schema to cast to, the names and order of fields must match. + safe : bool, default True + Check for overflows or other unsafe conversions. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... + + Define new schema and cast table values: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] + ... ) + >>> table.cast(target_schema=my_schema) + pyarrow.Table + n_legs: duration[s] + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: list[str] | None = None, + safe: bool = True, + ) -> Self: + """ + Convert pandas.DataFrame to an Arrow Table. 
+ + The column types in the resulting Arrow Table are inferred from the + dtypes of the pandas.Series in the DataFrame. In the case of non-object + Series, the NumPy dtype is translated to its Arrow equivalent. In the + case of `object`, we need to guess the datatype by looking at the + Python objects in this Series. + + Be aware that Series of the `object` dtype don't carry enough + information to always lead to a meaningful Arrow type. In the case that + we cannot infer a type, e.g. because the DataFrame is of length 0 or + the Series only contains None/nan objects, the type is set to + null. This behavior can be avoided by constructing an explicit schema + and passing it to this function. + + Parameters + ---------- + df : pandas.DataFrame + schema : pyarrow.Schema, optional + The expected schema of the Arrow Table. This can be used to + indicate the type of columns if we cannot infer it automatically. + If passed, the output will have exactly this schema. Columns + specified in the schema that are not found in the DataFrame columns + or its index will raise an error. Additional columns or index + levels in the DataFrame which are not specified in the schema will + be ignored. + preserve_index : bool, optional + Whether to store the index as an additional column in the resulting + ``Table``. The default of None will store the index as a column, + except for RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + nthreads : int, default None + If greater than 1, convert columns to Arrow in parallel using + indicated number of threads. By default, this follows + :func:`pyarrow.cpu_count` (may use up to system CPU count threads). + columns : list, optional + List of column to be converted. If None, use all columns. + safe : bool, default True + Check for overflows or other unsafe conversions. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.Table.from_pandas(df) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + @classmethod + def from_arrays( + cls, + arrays: Collection[ArrayOrChunkedArray[Any]], + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: + """ + Construct a Table from Arrow arrays. + + Parameters + ---------- + arrays : list of pyarrow.Array or pyarrow.ChunkedArray + Equal-length arrays that should form the table. + names : list of str, optional + Names for the table columns. If not passed, schema must be passed. + schema : Schema, default None + Schema for the created table. If not passed, names must be passed. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). 
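+
+ Chunked inputs are accepted as well; a brief sketch (in addition to the
+ examples below) showing that the existing chunking is preserved:
+
+ >>> import pyarrow as pa
+ >>> chunked = pa.chunked_array([[2, 4], [5, 100]])
+ >>> pa.Table.from_arrays([chunked], names=["n_legs"]).column("n_legs").num_chunks
+ 2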
+ + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from arrays: + + >>> pa.Table.from_arrays([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from arrays with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"animals": "Name of the animal species"}, + ... ) + >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + animals: 'Name of the animal species' + """ + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] + ) -> Self: + """ + Construct a Table from a StructArray. + + Each field in the StructArray will become a column in the resulting + ``Table``. + + Parameters + ---------- + struct_array : StructArray or ChunkedArray + Array to construct the table from. + + Returns + ------- + pyarrow.Table + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> pa.Table.from_struct_array(struct).to_pandas() + animals n_legs year + 0 Parrot 2 NaN + 1 None 4 2022.0 + """ + def to_struct_array( + self, max_chunksize: int | None = None + ) -> ChunkedArray[scalar.StructScalar]: + """ + Convert to a chunked array of struct type. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for ChunkedArray chunks. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + ChunkedArray + """ + @classmethod + def from_batches(cls, batches: Iterable[RecordBatch], schema: Schema | None = None) -> Self: + """ + Construct a Table from a sequence or iterator of Arrow RecordBatches. + + Parameters + ---------- + batches : sequence or iterator of RecordBatch + Sequence of RecordBatch to be converted, all schemas must be equal. + schema : Schema, default None + If not passed, will be inferred from the first RecordBatch. 
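+
+ Because an iterator is accepted, batches can also be consumed lazily; a
+ short sketch (in addition to the examples below):
+
+ >>> import pyarrow as pa
+ >>> batches = (pa.record_batch({"x": [i]}) for i in range(3))
+ >>> pa.Table.from_batches(batches).num_rows
+ 3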
+ + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + >>> batch = pa.record_batch([n_legs, animals], names=names) + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + + Construct a Table from a RecordBatch: + + >>> pa.Table.from_batches([batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a sequence of RecordBatches: + + >>> pa.Table.from_batches([batch, batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100],[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: + """ + Convert Table to a list of RecordBatch objects. + + Note that this method is zero-copy, it merely exposes the same data + under a different API. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for each RecordBatch chunk. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + list[RecordBatch] + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Convert a Table to a RecordBatch: + + >>> table.to_batches()[0].to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + + Convert a Table to a list of RecordBatches: + + >>> table.to_batches(max_chunksize=2)[0].to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + >>> table.to_batches(max_chunksize=2)[1].to_pandas() + n_legs animals + 0 5 Brittle stars + 1 100 Centipede + """ + def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: + """ + Convert the Table to a RecordBatchReader. + + Note that this method is zero-copy, it merely exposes the same data + under a different API. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for each RecordBatch chunk. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + RecordBatchReader + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Convert a Table to a RecordBatchReader: + + >>> table.to_reader() + + + >>> reader = table.to_reader() + >>> reader.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... + >>> reader.read_all() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + @property + def schema(self) -> Schema: + """ + Schema of the table and its columns. + + Returns + ------- + Schema + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... 
"animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' ... + """ + @property + def num_columns(self) -> int: + """ + Number of columns in this table. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.num_columns + 2 + """ + @property + def num_rows(self) -> int: + """ + Number of rows in this table. + + Due to the definition of a table, all columns have the same number of + rows. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.num_rows + 4 + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the table. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.nbytes + 72 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the table. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.get_total_buffer_size() + 76 + """ + def __sizeof__(self) -> int: ... + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: + """ + Add column to Table at position. + + A new table is returned with the column added, the original table + object is left unchanged. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array, list of Array, or values coercible to arrays + Column data. + + Returns + ------- + Table + New table with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + + Add column: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.add_column(0, "year", [year]) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2021,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Original table is left unchanged: + + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def remove_column(self, i: int) -> Self: + """ + Create new Table with the indicated column removed. + + Parameters + ---------- + i : int + Index of column to remove. + + Returns + ------- + Table + New table without the column. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.remove_column(1) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,4,5,100]] + """ + def set_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: + """ + Replace column in Table at position. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array, list of Array, or values coercible to arrays + Column data. + + Returns + ------- + Table + New table with the passed column set. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Replace a column: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.set_column(1, "year", [year]) + pyarrow.Table + n_legs: int64 + year: int64 + ---- + n_legs: [[2,4,5,100]] + year: [[2021,2022,2019,2021]] + """ + @overload + def rename_columns(self, names: list[str]) -> Self: ... + @overload + def rename_columns(self, names: dict[str, str]) -> Self: ... + def rename_columns(self, names): + """ + Create new table with columns renamed to provided names. + + Parameters + ---------- + names : list[str] or dict[str, str] + List of new column names or mapping of old column names to new column names. + + If a mapping of old to new column names is passed, then all columns which are + found to match a provided old column name will be renamed to the new column name. + If any column names are not found in the mapping, a KeyError will be raised. + + Raises + ------ + KeyError + If any of the column names passed in the names mapping do not exist. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + >>> new_names = ["n", "name"] + >>> table.rename_columns(new_names) + pyarrow.Table + n: int64 + name: string + ---- + n: [[2,4,5,100]] + name: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> new_names = {"n_legs": "n", "animals": "name"} + >>> table.rename_columns(new_names) + pyarrow.Table + n: int64 + name: string + ---- + n: [[2,4,5,100]] + name: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def drop(self, columns: str | list[str]) -> Self: + """ + Drop one or more columns and return a new table. + + Alias of Table.drop_columns, but kept for backwards compatibility. + + Parameters + ---------- + columns : str or list[str] + Field name(s) referencing existing column(s). + + Returns + ------- + Table + New table without the column(s). + """ + def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: + """ + Declare a grouping over the columns of the table. + + Resulting grouping can then be used to perform aggregations + with a subsequent ``aggregate()`` method. + + Parameters + ---------- + keys : str or list[str] + Name of the columns that should be used as the grouping key. + use_threads : bool, default True + Whether to use multithreading or not. When set to True (the + default), no stable ordering of the output is guaranteed. + + Returns + ------- + TableGroupBy + + See Also + -------- + TableGroupBy.aggregate + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.group_by("year").aggregate([("n_legs", "sum")]) + pyarrow.Table + year: int64 + n_legs_sum: int64 + ---- + year: [[2020,2022,2021,2019]] + n_legs_sum: [[2,6,104,5]] + """ + def join( + self, + right_table: Self, + keys: str | list[str], + right_keys: str | list[str] | None = None, + join_type: JoinType = "left outer", + left_suffix: str | None = None, + right_suffix: str | None = None, + coalesce_keys: bool = True, + use_threads: bool = True, + ) -> Self: + """ + Perform a join between this table and another one. + + Result of the join will be a new Table, where further + operations can be applied. + + Parameters + ---------- + right_table : Table + The table to join to the current one, acting as the right table + in the join operation. + keys : str or list[str] + The columns from current table that should be used as keys + of the join operation left side. + right_keys : str or list[str], default None + The columns from the right_table that should be used as keys + on the join operation right side. + When ``None`` use the same key names as the left table. + join_type : str, default "left outer" + The kind of join that should be performed, one of + ("left semi", "right semi", "left anti", "right anti", + "inner", "left outer", "right outer", "full outer") + left_suffix : str, default None + Which suffix to add to left column names. This prevents confusion + when the columns in left and right tables have colliding names. + right_suffix : str, default None + Which suffix to add to the right column names. This prevents confusion + when the columns in left and right tables have colliding names. + coalesce_keys : bool, default True + If the duplicated keys should be omitted from one of the sides + in the join result. 
+ use_threads : bool, default True + Whether to use multithreading or not. + + Returns + ------- + Table + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df1 = pd.DataFrame({"id": [1, 2, 3], "year": [2020, 2022, 2019]}) + >>> df2 = pd.DataFrame( + ... {"id": [3, 4], "n_legs": [5, 100], "animal": ["Brittle stars", "Centipede"]} + ... ) + >>> t1 = pa.Table.from_pandas(df1) + >>> t2 = pa.Table.from_pandas(df2) + + Left outer join: + + >>> t1.join(t2, "id").combine_chunks().sort_by("year") + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[3,1,2]] + year: [[2019,2020,2022]] + n_legs: [[5,null,null]] + animal: [["Brittle stars",null,null]] + + Full outer join: + + >>> t1.join(t2, "id", join_type="full outer").combine_chunks().sort_by("year") + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[3,1,2,4]] + year: [[2019,2020,2022,null]] + n_legs: [[5,null,null,100]] + animal: [["Brittle stars",null,null,"Centipede"]] + + Right outer join: + + >>> t1.join(t2, "id", join_type="right outer").combine_chunks().sort_by("year") + pyarrow.Table + year: int64 + id: int64 + n_legs: int64 + animal: string + ---- + year: [[2019,null]] + id: [[3,4]] + n_legs: [[5,100]] + animal: [["Brittle stars","Centipede"]] + + Right anti join + + >>> t1.join(t2, "id", join_type="right anti") + pyarrow.Table + id: int64 + n_legs: int64 + animal: string + ---- + id: [[4]] + n_legs: [[100]] + animal: [["Centipede"]] + """ + def join_asof( + self, + right_table: Self, + on: str, + by: str | list[str], + tolerance: int, + right_on: str | list[str] | None = None, + right_by: str | list[str] | None = None, + ) -> Self: + """ + Perform an asof join between this table and another one. + + This is similar to a left-join except that we match on nearest key rather + than equal keys. Both tables must be sorted by the key. This type of join + is most useful for time series data that are not perfectly aligned. + + Optionally match on equivalent keys with "by" before searching with "on". + + Result of the join will be a new Table, where further + operations can be applied. + + Parameters + ---------- + right_table : Table + The table to join to the current one, acting as the right table + in the join operation. + on : str + The column from current table that should be used as the "on" key + of the join operation left side. + + An inexact match is used on the "on" key, i.e. a row is considered a + match if and only if left_on - tolerance <= right_on <= left_on. + + The input dataset must be sorted by the "on" key. Must be a single + field of a common type. + + Currently, the "on" key must be an integer, date, or timestamp type. + by : str or list[str] + The columns from current table that should be used as the keys + of the join operation left side. The join operation is then done + only for the matches in these columns. + tolerance : int + The tolerance for inexact "on" key matching. A right row is considered + a match with the left row ``right.on - left.on <= tolerance``. The + ``tolerance`` may be: + + - negative, in which case a past-as-of-join occurs; + - or positive, in which case a future-as-of-join occurs; + - or zero, in which case an exact-as-of-join occurs. + + The tolerance is interpreted in the same units as the "on" key. + right_on : str or list[str], default None + The columns from the right_table that should be used as the on key + on the join operation right side. 
+ When ``None`` use the same key name as the left table. + right_by : str or list[str], default None + The columns from the right_table that should be used as keys + on the join operation right side. + When ``None`` use the same key names as the left table. + + Returns + ------- + Table + + Example + -------- + >>> import pyarrow as pa + >>> t1 = pa.table({"id": [1, 3, 2, 3, 3], "year": [2020, 2021, 2022, 2022, 2023]}) + >>> t2 = pa.table( + ... { + ... "id": [3, 4], + ... "year": [2020, 2021], + ... "n_legs": [5, 100], + ... "animal": ["Brittle stars", "Centipede"], + ... } + ... ) + + >>> t1.join_asof(t2, on="year", by="id", tolerance=-2) + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[1,3,2,3,3]] + year: [[2020,2021,2022,2022,2023]] + n_legs: [[null,5,null,5,null]] + animal: [[null,"Brittle stars",null,"Brittle stars",null]] + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the table as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. + + Returns + ------- + PyCapsule + """ + @property + def is_cpu(self) -> bool: + """ + Whether all ChunkedArrays are CPU-accessible. + """ + +def record_batch( + data: dict[str, list[Any] | Array[Any]] + | Collection[Array[Any]] + | pd.DataFrame + | SupportArrowArray + | SupportArrowDeviceArray, + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, +) -> RecordBatch: + """ + Create a pyarrow.RecordBatch from another Python data structure or sequence + of arrays. + + Parameters + ---------- + data : dict, list, pandas.DataFrame, Arrow-compatible table + A mapping of strings to Arrays or Python lists, a list of Arrays, + a pandas DataFame, or any tabular object implementing the + Arrow PyCapsule Protocol (has an ``__arrow_c_array__`` or + ``__arrow_c_device_array__`` method). + names : list, default None + Column names if list of arrays passed as data. Mutually exclusive with + 'schema' argument. + schema : Schema, default None + The expected schema of the RecordBatch. If not passed, will be inferred + from the data. Mutually exclusive with 'names' argument. + metadata : dict or Mapping, default None + Optional metadata for the schema (if schema not passed). 
+ + Returns + ------- + RecordBatch + + See Also + -------- + RecordBatch.from_arrays, RecordBatch.from_pandas, table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a RecordBatch from a python dictionary: + + >>> pa.record_batch({"n_legs": n_legs, "animals": animals}) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.record_batch({"n_legs": n_legs, "animals": animals}).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Creating a RecordBatch from a list of arrays with names: + + >>> pa.record_batch([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + + Creating a RecordBatch from a list of arrays with names and metadata: + + >>> my_metadata = {"n_legs": "How many legs does an animal have?"} + >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'How many legs does an animal have?' + + Creating a RecordBatch from a pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.record_batch(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + >>> pa.record_batch(df).to_pandas() + year month day n_legs animals + 0 2020 3 1 2 Flamingo + 1 2022 5 5 4 Horse + 2 2021 7 9 5 Brittle stars + 3 2022 9 13 100 Centipede + + Creating a RecordBatch from a pandas DataFrame with schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.record_batch(df, my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: ... + >>> pa.record_batch(df, my_schema).to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + """ + +@overload +def table( + data: dict[str, list[Any] | Array[Any]], + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + nthreads: int | None = None, +) -> Table: ... +@overload +def table( + data: Collection[ArrayOrChunkedArray[Any]] + | pd.DataFrame + | SupportArrowArray + | SupportArrowStream + | SupportArrowDeviceArray, + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + nthreads: int | None = None, +) -> Table: ... 
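+# A minimal usage sketch (not part of the stub itself) of how the two ``table``
+# overloads above are selected: a mapping of names to columns matches the first
+# overload, while a plain sequence of arrays matches the second and needs ``names=``.
+#
+#   import pyarrow as pa
+#   t1 = pa.table({"n_legs": [2, 4], "animals": ["Flamingo", "Horse"]})
+#   t2 = pa.table(
+#       [pa.array([2, 4]), pa.array(["Flamingo", "Horse"])],
+#       names=["n_legs", "animals"],
+#   )
+#   assert t1.schema == t2.schema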
+def table(*args, **kwargs): + """ + Create a pyarrow.Table from a Python data structure or sequence of arrays. + + Parameters + ---------- + data : dict, list, pandas.DataFrame, Arrow-compatible table + A mapping of strings to Arrays or Python lists, a list of arrays or + chunked arrays, a pandas DataFame, or any tabular object implementing + the Arrow PyCapsule Protocol (has an ``__arrow_c_array__``, + ``__arrow_c_device_array__`` or ``__arrow_c_stream__`` method). + names : list, default None + Column names if list of arrays passed as data. Mutually exclusive with + 'schema' argument. + schema : Schema, default None + The expected schema of the Arrow Table. If not passed, will be inferred + from the data. Mutually exclusive with 'names' argument. + If passed, the output will have exactly this schema (raising an error + when columns are not found in the data and ignoring additional data not + specified in the schema, when data is a dict or DataFrame). + metadata : dict or Mapping, default None + Optional metadata for the schema (if schema not passed). + nthreads : int, default None + For pandas.DataFrame inputs: if greater than 1, convert columns to + Arrow in parallel using indicated number of threads. By default, + this follows :func:`pyarrow.cpu_count` (may use up to system CPU count + threads). + + Returns + ------- + Table + + See Also + -------- + Table.from_arrays, Table.from_pandas, Table.from_pydict + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from a python dictionary: + + >>> pa.table({"n_legs": n_legs, "animals": animals}) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays: + + >>> pa.table([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.table([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.table(df) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from pandas DataFrame with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.table(df, my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: '{"index_columns": [], "column_indexes": [{"name": null, ... + + Construct a Table from chunked arrays: + + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] + ... 
) + >>> table = pa.table([n_legs, animals], names=names) + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4],[4,5,100]] + animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] + """ + +def concat_tables( + tables: Iterable[Table], + memory_pool: MemoryPool | None = None, + promote_options: Literal["none", "default", "permissive"] = "none", + **kwargs: Any, +) -> Table: + """ + Concatenate pyarrow.Table objects. + + If promote_options="none", a zero-copy concatenation will be performed. The schemas + of all the Tables must be the same (except the metadata), otherwise an + exception will be raised. The result Table will share the metadata with the + first table. + + If promote_options="default", any null type arrays will be casted to the type of other + arrays in the column of the same name. If a table is missing a particular + field, null values of the appropriate type will be generated to take the + place of the missing field. The new schema will share the metadata with the + first table. Each field in the new schema will share the metadata with the + first table which has the field defined. Note that type promotions may + involve additional allocations on the given ``memory_pool``. + + If promote_options="permissive", the behavior of default plus types will be promoted + to the common denominator that fits all the fields. + + Parameters + ---------- + tables : iterable of pyarrow.Table objects + Pyarrow tables to concatenate into a single Table. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + promote_options : str, default none + Accepts strings "none", "default" and "permissive". + **kwargs : dict, optional + + Examples + -------- + >>> import pyarrow as pa + >>> t1 = pa.table( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... ) + >>> t2 = pa.table([pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"]) + >>> pa.concat_tables([t1, t2]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100],[2,4]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Parrot","Dog"]] + + """ + +class TableGroupBy: + """ + A grouping of columns in a table on which to perform aggregations. + + Parameters + ---------- + table : pyarrow.Table + Input table to execute the aggregation on. + keys : str or list[str] + Name of the grouped columns. + use_threads : bool, default True + Whether to use multithreading or not. When set to True (the default), + no stable ordering of the output is guaranteed. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.table( + ... [ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], + ... names=["keys", "values"], + ... ) + + Grouping of columns: + + >>> pa.TableGroupBy(t, "keys") + + + Perform aggregations: + + >>> pa.TableGroupBy(t, "keys").aggregate([("values", "sum")]) + pyarrow.Table + keys: string + values_sum: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + """ + + keys: str | list[str] + def __init__(self, table: Table, keys: str | list[str], use_threads: bool = True): ... + def aggregate( + self, + aggregations: Iterable[ + tuple[ColumnSelector, Aggregation] + | tuple[ColumnSelector, Aggregation, AggregateOptions | None] + ], + ) -> Table: + """ + Perform an aggregation over the grouped columns of the table. 
+ + Parameters + ---------- + aggregations : list[tuple(str, str)] or \ +list[tuple(str, str, FunctionOptions)] + List of tuples, where each tuple is one aggregation specification + and consists of: aggregation column name followed + by function name and optionally aggregation function option. + Pass empty list to get a single row for each group. + The column name can be a string, an empty list or a list of + column names, for unary, nullary and n-ary aggregation functions + respectively. + + For the list of function names and respective aggregation + function options see :ref:`py-grouped-aggrs`. + + Returns + ------- + Table + Results of the aggregation functions. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.table([ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], names=["keys", "values"]) + + Sum the column "values" over the grouped column "keys": + + >>> t.group_by("keys").aggregate([("values", "sum")]) + pyarrow.Table + keys: string + values_sum: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + + Count the rows over the grouped column "keys": + + >>> t.group_by("keys").aggregate([([], "count_all")]) + pyarrow.Table + keys: string + count_all: int64 + ---- + keys: [["a","b","c"]] + count_all: [[2,2,1]] + + Do multiple aggregations: + + >>> t.group_by("keys").aggregate([ + ... ("values", "sum"), + ... ("keys", "count") + ... ]) + pyarrow.Table + keys: string + values_sum: int64 + keys_count: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + keys_count: [[2,2,1]] + + Count the number of non-null values for column "values" + over the grouped column "keys": + + >>> import pyarrow.compute as pc + >>> t.group_by(["keys"]).aggregate([ + ... ("values", "count", pc.CountOptions(mode="only_valid")) + ... ]) + pyarrow.Table + keys: string + values_count: int64 + ---- + keys: [["a","b","c"]] + values_count: [[2,2,1]] + + Get a single row for each group in column "keys": + + >>> t.group_by("keys").aggregate([]) + pyarrow.Table + keys: string + ---- + keys: [["a","b","c"]] + """ + def _table(self) -> Table: ... + @property + def _use_threads(self) -> bool: ... + +def concat_batches( + recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None +) -> RecordBatch: + """ + Concatenate pyarrow.RecordBatch objects. + + All recordbatches must share the same Schema, + the operation implies a copy of the data to merge + the arrays of the different RecordBatches. + + Parameters + ---------- + recordbatches : iterable of pyarrow.RecordBatch objects + Pyarrow record batches to concatenate into a single RecordBatch. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Examples + -------- + >>> import pyarrow as pa + >>> t1 = pa.record_batch( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... ) + >>> t2 = pa.record_batch( + ... [pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"] + ... 
) + >>> pa.concat_batches([t1, t2]) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100,2,4] + animals: ["Flamingo","Horse","Brittle stars","Centipede","Parrot","Dog"] + + """ + +__all__ = [ + "ChunkedArray", + "chunked_array", + "_Tabular", + "RecordBatch", + "table_to_blocks", + "Table", + "record_batch", + "table", + "concat_tables", + "TableGroupBy", + "concat_batches", +] diff --git a/python/pyarrow/__lib_pxi/tensor.pyi b/python/pyarrow/__lib_pxi/tensor.pyi new file mode 100644 index 00000000000..d849abd0f1f --- /dev/null +++ b/python/pyarrow/__lib_pxi/tensor.pyi @@ -0,0 +1,688 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +import numpy as np + +from pyarrow.lib import _Weakrefable +from scipy.sparse import coo_matrix, csr_matrix +from sparse import COO + +class Tensor(_Weakrefable): + """ + A n-dimensional array a.k.a Tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + + type: int32 + shape: (2, 3) + strides: (12, 4) + """ + + @classmethod + def from_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Create a Tensor from a numpy array. + + Parameters + ---------- + obj : numpy.ndarray + The source numpy array + dim_names : list, optional + Names of each dimension of the Tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + + type: int32 + shape: (2, 3) + strides: (12, 4) + """ + def to_numpy(self) -> np.ndarray: + """ + Convert arrow::Tensor to numpy.ndarray with zero copy + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.to_numpy() + array([[ 2, 2, 4], + [ 4, 5, 100]], dtype=int32) + """ + def equals(self, other: Tensor) -> bool: + """ + Return true if the tensors contains exactly equal data. + + Parameters + ---------- + other : Tensor + The other tensor to compare for equality. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> y = np.array([[2, 2, 4], [4, 5, 10]], np.int32) + >>> tensor2 = pa.Tensor.from_numpy(y, dim_names=["a", "b"]) + >>> tensor.equals(tensor) + True + >>> tensor.equals(tensor2) + False + """ + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.dim_name(0) + 'dim1' + >>> tensor.dim_name(1) + 'dim2' + """ + @property + def dim_names(self) -> list[str]: + """ + Names of this tensor dimensions. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.dim_names + ['dim1', 'dim2'] + """ + @property + def is_mutable(self) -> bool: + """ + Is this tensor mutable or immutable. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.is_mutable + True + """ + @property + def is_contiguous(self) -> bool: + """ + Is this tensor contiguous in memory. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.is_contiguous + True + """ + @property + def ndim(self) -> int: + """ + The dimension (n) of this tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.ndim + 2 + """ + @property + def size(self) -> str: + """ + The size of this tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.size + 6 + """ + @property + def shape(self) -> tuple[int, ...]: + """ + The shape of this tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.shape + (2, 3) + """ + @property + def strides(self) -> tuple[int, ...]: + """ + Strides of this tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.strides + (12, 4) + """ + +class SparseCOOTensor(_Weakrefable): + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCOOTensor + + Parameters + ---------- + obj : numpy.ndarray + Data used to populate the rows. + dim_names : list[str], optional + Names of the dimensions. + + Returns + ------- + pyarrow.SparseCOOTensor + """ + + @classmethod + def from_numpy( + cls, + data: np.ndarray, + coords: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: + """ + Create arrow::SparseCOOTensor from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the rows. + coords : numpy.ndarray + Coordinates of the data. + shape : tuple + Shape of the tensor. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: + """ + Convert scipy.sparse.coo_matrix to arrow::SparseCOOTensor + + Parameters + ---------- + obj : scipy.sparse.csr_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_pydata_sparse(cls, obj: COO, dim_names: list[str] | None = None) -> Self: + """ + Convert pydata/sparse.COO to arrow::SparseCOOTensor. 
+ + Parameters + ---------- + obj : pydata.sparse.COO + The sparse multidimensional array that should be converted. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCOOTensor. + + Parameters + ---------- + obj : Tensor + The tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCOOTensor to numpy.ndarrays with zero copy. + """ + def to_scipy(self) -> coo_matrix: + """ + Convert arrow::SparseCOOTensor to scipy.sparse.coo_matrix. + """ + def to_pydata_sparse(self) -> COO: + """ + Convert arrow::SparseCOOTensor to pydata/sparse.COO. + """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCOOTensor to arrow::Tensor. + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data. + + Parameters + ---------- + other : SparseCOOTensor + The other tensor to compare for equality. + """ + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + @property + def has_canonical_format(self) -> bool: ... + +class SparseCSRMatrix(_Weakrefable): + """ + A sparse CSR matrix. + """ + + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCSRMatrix + + Parameters + ---------- + obj : numpy.ndarray + The dense numpy array that should be converted. + dim_names : list, optional + The names of the dimensions. + + Returns + ------- + pyarrow.SparseCSRMatrix + """ + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: np.ndarray, + indices: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: + """ + Create arrow::SparseCSRMatrix from numpy.ndarrays. + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse matrix. + indptr : numpy.ndarray + Range of the rows, + The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + indices : numpy.ndarray + Column indices of the corresponding non-zero values. + shape : tuple + Shape of the matrix. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: + """ + Convert scipy.sparse.csr_matrix to arrow::SparseCSRMatrix. + + Parameters + ---------- + obj : scipy.sparse.csr_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCSRMatrix. + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCSRMatrix to numpy.ndarrays with zero copy. + """ + def to_scipy(self) -> csr_matrix: + """ + Convert arrow::SparseCSRMatrix to scipy.sparse.csr_matrix. 
+ """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCSRMatrix to arrow::Tensor. + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data. + + Parameters + ---------- + other : SparseCSRMatrix + The other tensor to compare for equality. + """ + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + +class SparseCSCMatrix(_Weakrefable): + """ + A sparse CSC matrix. + """ + + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : numpy.ndarray + Data used to populate the rows. + dim_names : list[str], optional + Names of the dimensions. + + Returns + ------- + pyarrow.SparseCSCMatrix + """ + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: np.ndarray, + indices: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: + """ + Create arrow::SparseCSCMatrix from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse matrix. + indptr : numpy.ndarray + Range of the rows, + The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + indices : numpy.ndarray + Column indices of the corresponding non-zero values. + shape : tuple + Shape of the matrix. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: + """ + Convert scipy.sparse.csc_matrix to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : scipy.sparse.csc_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCSCMatrix to numpy.ndarrays with zero copy + """ + def to_scipy(self) -> csr_matrix: + """ + Convert arrow::SparseCSCMatrix to scipy.sparse.csc_matrix + """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCSCMatrix to arrow::Tensor + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data + + Parameters + ---------- + other : SparseCSCMatrix + The other tensor to compare for equality. + """ + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + +class SparseCSFTensor(_Weakrefable): + """ + A sparse CSF tensor. 
+ + CSF is a generalization of compressed sparse row (CSR) index. + + CSF index recursively compresses each dimension of a tensor into a set + of prefix trees. Each path from a root to leaf forms one tensor + non-zero index. CSF is implemented with two arrays of buffers and one + arrays of integers. + """ + + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCSFTensor + + Parameters + ---------- + obj : numpy.ndarray + Data used to populate the rows. + dim_names : list[str], optional + Names of the dimensions. + + Returns + ------- + pyarrow.SparseCSFTensor + """ + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: np.ndarray, + indices: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: + """ + Create arrow::SparseCSFTensor from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse tensor. + indptr : numpy.ndarray + The sparsity structure. + Each two consecutive dimensions in a tensor correspond to + a buffer in indices. + A pair of consecutive values at `indptr[dim][i]` + `indptr[dim][i + 1]` signify a range of nodes in + `indices[dim + 1]` who are children of `indices[dim][i]` node. + indices : numpy.ndarray + Stores values of nodes. + Each tensor dimension corresponds to a buffer in indptr. + shape : tuple + Shape of the matrix. + axis_order : list, optional + the sequence in which dimensions were traversed to + produce the prefix tree. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCSFTensor + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCSFTensor to numpy.ndarrays with zero copy + """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCSFTensor to arrow::Tensor + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data + + Parameters + ---------- + other : SparseCSFTensor + The other tensor to compare for equality. + """ + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... 
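+# A short usage sketch (requires numpy; not part of the stub itself) tying the classes
+# above together: a dense Tensor converts to a SparseCOOTensor and back, and the
+# sparse form exposes its values and coordinates as numpy arrays.
+#
+#   import numpy as np
+#   import pyarrow as pa
+#   dense = np.array([[0, 0, 4], [5, 0, 0]], dtype=np.int64)
+#   tensor = pa.Tensor.from_numpy(dense, dim_names=["row", "col"])
+#   sparse = pa.SparseCOOTensor.from_tensor(tensor)
+#   data, coords = sparse.to_numpy()      # non-zero values and their (row, col) indices
+#   assert sparse.non_zero_length == 2
+#   dense_again = sparse.to_tensor()      # back to a dense pyarrow.Tensor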
+ +__all__ = [ + "Tensor", + "SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", +] diff --git a/python/pyarrow/__lib_pxi/types.pyi b/python/pyarrow/__lib_pxi/types.pyi new file mode 100644 index 00000000000..7fe6c36e332 --- /dev/null +++ b/python/pyarrow/__lib_pxi/types.pyi @@ -0,0 +1,4413 @@ +import datetime as dt +import sys + +from collections.abc import Mapping, Sequence +from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from typing import Any, Generic, Iterable, Iterator, Literal, overload + +import numpy as np +import pandas as pd + +from pyarrow._stubs_typing import SupportArrowSchema +from pyarrow.lib import ( + Array, + ChunkedArray, + ExtensionArray, + MemoryPool, + MonthDayNano, + Table, +) +from typing_extensions import TypeVar, deprecated + +from .io import Buffer +from .scalar import ExtensionScalar + +_AsPyType = TypeVar("_AsPyType") +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) + +class _Weakrefable: ... +class _Metadata(_Weakrefable): ... + +class DataType(_Weakrefable): + """ + Base class of all Arrow data types. + + Each data type is an *instance* of this class. + + Examples + -------- + Instance of int64 type: + + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + """ + def field(self, i: int) -> Field: + """ + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Field + """ + @property + def id(self) -> int: ... + @property + def bit_width(self) -> int: + """ + Bit width for fixed width type. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> pa.int64().bit_width + 64 + """ + @property + def byte_width(self) -> int: + """ + Byte width for fixed width type. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> pa.int64().byte_width + 8 + """ + @property + def num_fields(self) -> int: + """ + The number of child fields. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> pa.int64().num_fields + 0 + >>> pa.list_(pa.string()) + ListType(list) + >>> pa.list_(pa.string()).num_fields + 1 + >>> struct = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct.num_fields + 2 + """ + @property + def num_buffers(self) -> int: + """ + Number of data buffers required to construct Array type + excluding children. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64().num_buffers + 2 + >>> pa.string().num_buffers + 3 + """ + def __hash__(self) -> int: ... + def equals(self, other: DataType | str, *, check_metadata: bool = False) -> bool: + """ + Return true if type is equivalent to passed value. + + Parameters + ---------- + other : DataType or string convertible to DataType + check_metadata : bool + Whether nested Field metadata equality should be checked as well. + + Returns + ------- + is_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64().equals(pa.string()) + False + >>> pa.int64().equals(pa.int64()) + True + """ + def to_pandas_dtype(self) -> np.generic: + """ + Return the equivalent NumPy / Pandas dtype. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64().to_pandas_dtype() + + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowSchema struct, given its pointer. + + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. 
+ """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: + """ + Import DataType from a C ArrowSchema struct, given its pointer. + + This is a low-level function intended for expert users. + """ + def __arrow_c_schema__(self) -> Any: + """ + Export to a ArrowSchema PyCapsule + + Unlike _export_to_c, this will not leak memory if the capsule is not used. + """ + @classmethod + def _import_from_c_capsule(cls, schema) -> Self: + """ + Import a DataType from a ArrowSchema PyCapsule + + Parameters + ---------- + schema : PyCapsule + A valid PyCapsule with name 'arrow_schema' containing an + ArrowSchema pointer. + """ + +class _BasicDataType(DataType, Generic[_AsPyType]): ... +class NullType(_BasicDataType[None]): ... +class BoolType(_BasicDataType[bool]): ... +class UInt8Type(_BasicDataType[int]): ... +class Int8Type(_BasicDataType[int]): ... +class UInt16Type(_BasicDataType[int]): ... +class Int16Type(_BasicDataType[int]): ... +class Uint32Type(_BasicDataType[int]): ... +class Int32Type(_BasicDataType[int]): ... +class UInt64Type(_BasicDataType[int]): ... +class Int64Type(_BasicDataType[int]): ... +class Float16Type(_BasicDataType[float]): ... +class Float32Type(_BasicDataType[float]): ... +class Float64Type(_BasicDataType[float]): ... +class Date32Type(_BasicDataType[dt.date]): ... +class Date64Type(_BasicDataType[dt.date]): ... +class MonthDayNanoIntervalType(_BasicDataType[MonthDayNano]): ... +class StringType(_BasicDataType[str]): ... +class LargeStringType(_BasicDataType[str]): ... +class StringViewType(_BasicDataType[str]): ... +class BinaryType(_BasicDataType[bytes]): ... +class LargeBinaryType(_BasicDataType[bytes]): ... +class BinaryViewType(_BasicDataType[bytes]): ... + +_Unit = TypeVar("_Unit", bound=Literal["s", "ms", "us", "ns"], default=Literal["us"]) +_Tz = TypeVar("_Tz", str, None, default=None) + +class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): + """ + Concrete class for timestamp data types. + + Examples + -------- + >>> import pyarrow as pa + + Create an instance of timestamp type: + + >>> pa.timestamp("us") + TimestampType(timestamp[us]) + + Create an instance of timestamp type with timezone: + + >>> pa.timestamp("s", tz="UTC") + TimestampType(timestamp[s, tz=UTC]) + """ + @property + def unit(self) -> _Unit: + """ + The timestamp unit ('s', 'ms', 'us' or 'ns'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.timestamp("us") + >>> t.unit + 'us' + """ + @property + def tz(self) -> _Tz: + """ + The timestamp time zone, if any, or None. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.timestamp("s", tz="UTC") + >>> t.tz + 'UTC' + """ + +_Time32Unit = TypeVar("_Time32Unit", bound=Literal["s", "ms"]) + +class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]): + """ + Concrete class for time32 data types. + + Supported time unit resolutions are 's' [second] + and 'ms' [millisecond]. + + Examples + -------- + Create an instance of time32 type: + + >>> import pyarrow as pa + >>> pa.time32("ms") + Time32Type(time32[ms]) + """ + @property + def unit(self) -> _Time32Unit: + """ + The time unit ('s' or 'ms'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.time32("ms") + >>> t.unit + 'ms' + """ + +_Time64Unit = TypeVar("_Time64Unit", bound=Literal["us", "ns"]) + +class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]): + """ + Concrete class for time64 data types. + + Supported time unit resolutions are 'us' [microsecond] + and 'ns' [nanosecond]. 
+ + Examples + -------- + Create an instance of time64 type: + + >>> import pyarrow as pa + >>> pa.time64("us") + Time64Type(time64[us]) + """ + @property + def unit(self) -> _Time64Unit: + """ + The time unit ('us' or 'ns'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.time64("us") + >>> t.unit + 'us' + """ + +class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): + """ + Concrete class for duration data types. + + Examples + -------- + Create an instance of duration type: + + >>> import pyarrow as pa + >>> pa.duration("s") + DurationType(duration[s]) + """ + @property + def unit(self) -> _Unit: + """ + The duration unit ('s', 'ms', 'us' or 'ns'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.duration("s") + >>> t.unit + 's' + """ + +class FixedSizeBinaryType(_BasicDataType[Decimal]): + """ + Concrete class for fixed-size binary data types. + + Examples + -------- + Create an instance of fixed-size binary type: + + >>> import pyarrow as pa + >>> pa.binary(3) + FixedSizeBinaryType(fixed_size_binary[3]) + """ + +_Precision = TypeVar("_Precision", default=Any) +_Scale = TypeVar("_Scale", default=Any) + +class Decimal32Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal32 data types. + + Examples + -------- + Create an instance of decimal32 type: + + >>> import pyarrow as pa + >>> pa.decimal32(5, 2) + Decimal32Type(decimal32(5, 2)) + """ + @property + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal32(5, 2) + >>> t.precision + 5 + """ + @property + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal32(5, 2) + >>> t.scale + 2 + """ + +class Decimal64Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal64 data types. + + Examples + -------- + Create an instance of decimal64 type: + + >>> import pyarrow as pa + >>> pa.decimal64(5, 2) + Decimal64Type(decimal64(5, 2)) + """ + @property + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal64(5, 2) + >>> t.precision + 5 + """ + @property + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal64(5, 2) + >>> t.scale + 2 + """ + +class Decimal128Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal128 data types. + + Examples + -------- + Create an instance of decimal128 type: + + >>> import pyarrow as pa + >>> pa.decimal128(5, 2) + Decimal128Type(decimal128(5, 2)) + """ + @property + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal128(5, 2) + >>> t.precision + 5 + """ + @property + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal128(5, 2) + >>> t.scale + 2 + """ + +class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal256 data types. 
+ + Examples + -------- + Create an instance of decimal256 type: + + >>> import pyarrow as pa + >>> pa.decimal256(76, 38) + Decimal256Type(decimal256(76, 38)) + """ + @property + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal256(76, 38) + >>> t.precision + 76 + """ + @property + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal256(76, 38) + >>> t.scale + 38 + """ + +class ListType(DataType, Generic[_DataTypeT]): + """ + Concrete class for list data types. + + Examples + -------- + Create an instance of ListType: + + >>> import pyarrow as pa + >>> pa.list_(pa.string()) + ListType(list) + """ + @property + def value_field(self) -> Field[_DataTypeT]: + """ + The field for list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.string()).value_field + pyarrow.Field + """ + @property + def value_type(self) -> _DataTypeT: + """ + The data type of list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.string()).value_type + DataType(string) + """ + +class LargeListType(DataType, Generic[_DataTypeT]): + """ + Concrete class for large list data types + (like ListType, but with 64-bit offsets). + + Examples + -------- + Create an instance of LargeListType: + + >>> import pyarrow as pa + >>> pa.large_list(pa.string()) + LargeListType(large_list) + """ + @property + def value_field(self) -> Field[_DataTypeT]: ... + @property + def value_type(self) -> _DataTypeT: + """ + The data type of large list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list(pa.string()).value_type + DataType(string) + """ + +class ListViewType(DataType, Generic[_DataTypeT]): + """ + Concrete class for list view data types. + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + @property + def value_field(self) -> Field[_DataTypeT]: + """ + The field for list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_field + pyarrow.Field + """ + @property + def value_type(self) -> _DataTypeT: + """ + The data type of list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_type + DataType(string) + """ + +class LargeListViewType(DataType, Generic[_DataTypeT]): + """ + Concrete class for large list view data types + (like ListViewType, but with 64-bit offsets). + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()) + LargeListViewType(large_list_view) + """ + @property + def value_field(self) -> Field[_DataTypeT]: + """ + The field for large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_field + pyarrow.Field + """ + @property + def value_type(self) -> _DataTypeT: + """ + The data type of large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_type + DataType(string) + """ + +class FixedSizeListType(DataType, Generic[_DataTypeT, _Size]): + """ + Concrete class for fixed size list data types. 
+ + Examples + -------- + Create an instance of FixedSizeListType: + + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2) + FixedSizeListType(fixed_size_list[2]) + """ + @property + def value_field(self) -> Field[_DataTypeT]: + """ + The field for list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2).value_field + pyarrow.Field + """ + @property + def value_type(self) -> _DataTypeT: + """ + The data type of large list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2).value_type + DataType(int32) + """ + @property + def list_size(self) -> _Size: + """ + The size of the fixed size lists. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2).list_size + 2 + """ + +class DictionaryMemo(_Weakrefable): + """ + Tracking container for dictionary-encoded fields. + """ + +_IndexT = TypeVar( + "_IndexT", + UInt8Type, + Int8Type, + UInt16Type, + Int16Type, + Uint32Type, + Int32Type, + UInt64Type, + Int64Type, +) +_BasicValueT = TypeVar("_BasicValueT", bound=_BasicDataType) +_ValueT = TypeVar("_ValueT", bound=DataType) +_Ordered = TypeVar("_Ordered", Literal[True], Literal[False], default=Literal[False]) + +class DictionaryType(DataType, Generic[_IndexT, _BasicValueT, _Ordered]): + """ + Concrete class for dictionary data types. + + Examples + -------- + Create an instance of dictionary type: + + >>> import pyarrow as pa + >>> pa.dictionary(pa.int64(), pa.utf8()) + DictionaryType(dictionary) + """ + + @property + def ordered(self) -> _Ordered: + """ + Whether the dictionary is ordered, i.e. whether the ordering of values + in the dictionary is important. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.dictionary(pa.int64(), pa.utf8()).ordered + False + """ + @property + def index_type(self) -> _IndexT: + """ + The data type of dictionary indices (a signed integer type). + + Examples + -------- + >>> import pyarrow as pa + >>> pa.dictionary(pa.int16(), pa.utf8()).index_type + DataType(int16) + """ + @property + def value_type(self) -> _BasicValueT: + """ + The dictionary value type. + + The dictionary values are found in an instance of DictionaryArray. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.dictionary(pa.int16(), pa.utf8()).value_type + DataType(string) + """ + +_K = TypeVar("_K", bound=DataType) + +class MapType(DataType, Generic[_K, _ValueT, _Ordered]): + """ + Concrete class for map data types. + + Examples + -------- + Create an instance of MapType: + + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()) + MapType(map) + >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True) + MapType(map) + """ + + @property + def key_field(self) -> Field[_K]: + """ + The field for keys in the map entries. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).key_field + pyarrow.Field + """ + @property + def key_type(self) -> _K: + """ + The data type of keys in the map entries. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).key_type + DataType(string) + """ + @property + def item_field(self) -> Field[_ValueT]: + """ + The field for items in the map entries. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).item_field + pyarrow.Field + """ + @property + def item_type(self) -> _ValueT: + """ + The data type of items in the map entries. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).item_type + DataType(int32) + """ + @property + def keys_sorted(self) -> _Ordered: + """ + Should the entries be sorted according to keys. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True).keys_sorted + True + """ + +_Size = TypeVar("_Size", default=int) + +class StructType(DataType): + """ + Concrete class for struct data types. + + ``StructType`` supports direct indexing using ``[...]`` (implemented via + ``__getitem__``) to access its fields. + It will return the struct field with the given index or name. + + Examples + -------- + >>> import pyarrow as pa + + Accessing fields using direct indexing: + + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct_type[0] + pyarrow.Field + >>> struct_type["y"] + pyarrow.Field + + Accessing fields using ``field()``: + + >>> struct_type.field(1) + pyarrow.Field + >>> struct_type.field("x") + pyarrow.Field + + # Creating a schema from the struct type's fields: + >>> pa.schema(list(struct_type)) + x: int32 + y: string + """ + def get_field_index(self, name: str) -> int: + """ + Return index of the unique field with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + index : int + The index of the field with the given name; -1 if the + name isn't found or there are several fields with the given + name. + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + + Index of the field with a name 'y': + + >>> struct_type.get_field_index("y") + 1 + + Index of the field that does not exist: + + >>> struct_type.get_field_index("z") + -1 + """ + def field(self, i: int | str) -> Field: + """ + Select a field by its column name or numeric index. + + Parameters + ---------- + i : int or str + + Returns + ------- + pyarrow.Field + + Examples + -------- + + >>> import pyarrow as pa + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + + Select the second field: + + >>> struct_type.field(1) + pyarrow.Field + + Select the field named 'x': + + >>> struct_type.field("x") + pyarrow.Field + """ + def get_all_field_indices(self, name: str) -> list[int]: + """ + Return sorted list of indices for the fields with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + indices : List[int] + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct_type.get_all_field_indices("x") + [0] + """ + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[Field]: ... + __getitem__ = field # pyright: ignore[reportUnknownVariableType] + @property + def names(self) -> list[str]: + """ + Lists the field names. + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct([("a", pa.int64()), ("b", pa.float64()), ("c", pa.string())]) + >>> struct_type.names + ['a', 'b', 'c'] + """ + @property + def fields(self) -> list[Field]: + """ + Lists all fields within the StructType. + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct([("a", pa.int64()), ("b", pa.float64()), ("c", pa.string())]) + >>> struct_type.fields + [pyarrow.Field, pyarrow.Field, pyarrow.Field] + """ + +class UnionType(DataType): + """ + Base class for union data types. 
+ + Examples + -------- + Create an instance of a dense UnionType using ``pa.union``: + + >>> import pyarrow as pa + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_DENSE, + ... ), + ... ) + (DenseUnionType(dense_union),) + + Create an instance of a dense UnionType using ``pa.dense_union``: + + >>> pa.dense_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + DenseUnionType(dense_union) + + Create an instance of a sparse UnionType using ``pa.union``: + + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_SPARSE, + ... ), + ... ) + (SparseUnionType(sparse_union),) + + Create an instance of a sparse UnionType using ``pa.sparse_union``: + + >>> pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + SparseUnionType(sparse_union) + """ + @property + def mode(self) -> Literal["sparse", "dense"]: + """ + The mode of the union ("dense" or "sparse"). + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union.mode + 'sparse' + """ + @property + def type_codes(self) -> list[int]: + """ + The type code to indicate each data type in this union. + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union.type_codes + [0, 1] + """ + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[Field]: ... + def field(self, i: int) -> Field: + """ + Return a child field by its numeric index. + + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union[0] + pyarrow.Field + """ + __getitem__ = field # pyright: ignore[reportUnknownVariableType] + +class SparseUnionType(UnionType): + """ + Concrete class for sparse union types. + + Examples + -------- + Create an instance of a sparse UnionType using ``pa.union``: + + >>> import pyarrow as pa + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_SPARSE, + ... ), + ... ) + (SparseUnionType(sparse_union),) + + Create an instance of a sparse UnionType using ``pa.sparse_union``: + + >>> pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + SparseUnionType(sparse_union) + """ + @property + def mode(self) -> Literal["sparse"]: ... + +class DenseUnionType(UnionType): + """ + Concrete class for dense union types. + + Examples + -------- + Create an instance of a dense UnionType using ``pa.union``: + + >>> import pyarrow as pa + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_DENSE, + ... ), + ... ) + (DenseUnionType(dense_union),) + + Create an instance of a dense UnionType using ``pa.dense_union``: + + >>> pa.dense_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + DenseUnionType(dense_union) + """ + + @property + def mode(self) -> Literal["dense"]: ... + +_RunEndType = TypeVar("_RunEndType", Int16Type, Int32Type, Int64Type) + +class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]): + """ + Concrete class for run-end encoded types. + """ + @property + def run_end_type(self) -> _RunEndType: ... + @property + def value_type(self) -> _BasicValueT: ... 
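+
+# Hedged usage sketch for RunEndEncodedType (comments only, not part of the stub
+# API surface): it assumes the runtime exposes the ``pa.run_end_encoded`` factory;
+# the name ``ree_type`` below is purely illustrative.
+#
+#     >>> import pyarrow as pa
+#     >>> ree_type = pa.run_end_encoded(pa.int32(), pa.utf8())
+#     >>> ree_type.run_end_type
+#     DataType(int32)
+#     >>> ree_type.value_type
+#     DataType(string)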
+ +_StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) + +class BaseExtensionType(DataType): + """ + Concrete base class for extension types. + """ + def __arrow_ext_class__(self) -> type[ExtensionArray]: + """ + The associated array extension class + """ + def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: + """ + The associated scalar class + """ + @property + def extension_name(self) -> str: + """ + The extension type name. + """ + @property + def storage_type(self) -> DataType: + """ + The underlying storage type. + """ + def wrap_array(self, storage: _StorageT) -> _StorageT: ... + +class ExtensionType(BaseExtensionType): + """ + Concrete base class for Python-defined extension types. + + Parameters + ---------- + storage_type : DataType + The underlying storage type for the extension type. + extension_name : str + A unique name distinguishing this extension type. The name will be + used when deserializing IPC data. + + Examples + -------- + Define a RationalType extension type subclassing ExtensionType: + + >>> import pyarrow as pa + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... return RationalType(storage_type[0].type) + + Register the extension type: + + >>> pa.register_extension_type(RationalType(pa.int64())) + + Create an instance of RationalType extension type: + + >>> rational_type = RationalType(pa.int32()) + + Inspect the extension type: + + >>> rational_type.extension_name + 'my_package.rational' + >>> rational_type.storage_type + StructType(struct) + + Wrap an array as an extension array: + + >>> storage_array = pa.array( + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=rational_type.storage_type, + ... ) + >>> rational_array = rational_type.wrap_array(storage_array) + >>> rational_array + + -- is_valid: all not null + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 + [ + 17, + 13 + ] + + Or do the same with creating an ExtensionArray: + + >>> rational_array = pa.ExtensionArray.from_storage(rational_type, storage_array) + >>> rational_array + + -- is_valid: all not null + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 + [ + 17, + 13 + ] + + Unregister the extension type: + + >>> pa.unregister_extension_type("my_package.rational") + + Note that even though we registered the concrete type + ``RationalType(pa.int64())``, PyArrow will be able to deserialize + ``RationalType(integer_type)`` for any ``integer_type``, as the deserializer + will reference the name ``my_package.rational`` and the ``@classmethod`` + ``__arrow_ext_deserialize__``. + """ + + def __init__(self, storage_type: DataType, extension_name: str) -> None: ... + def __arrow_ext_serialize__(self) -> bytes: + """ + Serialized representation of metadata to reconstruct the type object. 
+ + This method should return a bytes object, and those serialized bytes + are stored in the custom metadata of the Field holding an extension + type in an IPC message. + The bytes are passed to ``__arrow_ext_deserialize`` and should hold + sufficient information to reconstruct the data type instance. + """ + @classmethod + def __arrow_ext_deserialize__(cls, storage_type: DataType, serialized: bytes) -> Self: + """ + Return an extension type instance from the storage type and serialized + metadata. + + This method should return an instance of the ExtensionType subclass + that matches the passed storage type and serialized metadata (the + return value of ``__arrow_ext_serialize__``). + """ + +class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): + """ + Concrete class for fixed shape tensor extension type. + + Examples + -------- + Create an instance of fixed shape tensor extension type: + + >>> import pyarrow as pa + >>> pa.fixed_shape_tensor(pa.int32(), [2, 2]) + FixedShapeTensorType(extension) + + Create an instance of fixed shape tensor extension type with + permutation: + + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + """ + @property + def value_type(self) -> _ValueT: + """ + Data type of an individual tensor. + """ + @property + def shape(self) -> list[int]: + """ + Shape of the tensors. + """ + @property + def dim_names(self) -> list[str] | None: + """ + Explicit names of the dimensions. + """ + @property + def permutation(self) -> list[int] | None: + """ + Indices of the dimensions ordering. + """ + +class Bool8Type(BaseExtensionType): + """ + Concrete class for bool8 extension type. + + Bool8 is an alternate representation for boolean + arrays using 8 bits instead of 1 bit per value. The underlying + storage type is int8. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> pa.bool8() + Bool8Type(extension) + """ + +class UuidType(BaseExtensionType): + """ + Concrete class for UUID extension type. + """ + +class JsonType(BaseExtensionType): + """ + Concrete class for JSON extension type. + + Examples + -------- + Define the extension type for JSON array + + >>> import pyarrow as pa + >>> json_type = pa.json_(pa.large_utf8()) + + Create an extension array + + >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] + >>> storage = pa.array(arr, pa.large_utf8()) + >>> pa.ExtensionArray.from_storage(json_type, storage) + + [ + null, + "{ "id":30, "values":["a", "b"] }" + ] + """ + +class OpaqueType(BaseExtensionType): + """ + Concrete class for opaque extension type. + + Opaque is a placeholder for a type from an external (often non-Arrow) + system that could not be interpreted. + + Examples + -------- + Create an instance of opaque extension type: + + >>> import pyarrow as pa + >>> pa.opaque(pa.int32(), "geometry", "postgis") + OpaqueType(extension) + """ + @property + def type_name(self) -> str: + """ + The name of the type in the external system. + """ + @property + def vendor_name(self) -> str: + """ + The name of the external system. + """ + +@deprecated( + "This class is deprecated and its deserialization is disabled by default. " + ":class:`ExtensionType` is recommended instead." +) +class PyExtensionType(ExtensionType): + """ + Concrete base class for Python-defined extension types based on pickle + for (de)serialization. + + .. warning:: + This class is deprecated and its deserialization is disabled by default. 
+ :class:`ExtensionType` is recommended instead. + + Parameters + ---------- + storage_type : DataType + The storage type for which the extension is built. + """ + def __init__(self, storage_type: DataType) -> None: ... + @classmethod + def set_auto_load(cls, value: bool) -> None: + """ + Enable or disable auto-loading of serialized PyExtensionType instances. + + Parameters + ---------- + value : bool + Whether to enable auto-loading. + """ + +class UnknownExtensionType(PyExtensionType): # type: ignore + """ + A concrete class for Python-defined extension types that refer to + an unknown Python implementation. + + Parameters + ---------- + storage_type : DataType + The storage type for which the extension is built. + serialized : bytes + The serialised output. + """ + def __init__(self, storage_type: DataType, serialized: bytes) -> None: ... + +def register_extension_type(ext_type: PyExtensionType) -> None: # type: ignore + """ + Register a Python extension type. + + Registration is based on the extension name (so different registered types + need unique extension names). Registration needs an extension type + instance, but then works for any instance of the same subclass regardless + of parametrization of the type. + + Parameters + ---------- + ext_type : BaseExtensionType instance + The ExtensionType subclass to register. + + Examples + -------- + Define a RationalType extension type subclassing ExtensionType: + + >>> import pyarrow as pa + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... return RationalType(storage_type[0].type) + + Register the extension type: + + >>> pa.register_extension_type(RationalType(pa.int64())) + + Unregister the extension type: + + >>> pa.unregister_extension_type("my_package.rational") + """ + +def unregister_extension_type(type_name: str) -> None: + """ + Unregister a Python extension type. + + Parameters + ---------- + type_name : str + The name of the ExtensionType subclass to unregister. + + Examples + -------- + Define a RationalType extension type subclassing ExtensionType: + + >>> import pyarrow as pa + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... 
return RationalType(storage_type[0].type) + + Register the extension type: + + >>> pa.register_extension_type(RationalType(pa.int64())) + + Unregister the extension type: + + >>> pa.unregister_extension_type("my_package.rational") + """ + +class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]): + """ + KeyValueMetadata + + Parameters + ---------- + __arg0__ : dict + A dict of the key-value metadata + **kwargs : optional + additional key-value metadata + """ + def __init__(self, __arg0__: Mapping[bytes, bytes] | None = None, **kwargs) -> None: ... + def equals(self, other: KeyValueMetadata) -> bool: ... + def __len__(self) -> int: ... + def __contains__(self, __key: object) -> bool: ... + def __getitem__(self, __key: Any) -> Any: ... + def __iter__(self) -> Iterator[bytes]: ... + def get_all(self, key: str) -> list[bytes]: ... + def to_dict(self) -> dict[bytes, bytes]: + """ + Convert KeyValueMetadata to dict. If a key occurs twice, the value for + the first one is returned + """ + +def ensure_metadata( + meta: Mapping[bytes | str, bytes | str] | KeyValueMetadata | None, allow_none: bool = False +) -> KeyValueMetadata | None: ... + +class Field(_Weakrefable, Generic[_DataTypeT]): + """ + A named field, with a data type, nullability, and optional metadata. + + Notes + ----- + Do not use this class's constructor directly; use pyarrow.field + + Examples + -------- + Create an instance of pyarrow.Field: + + >>> import pyarrow as pa + >>> pa.field("key", pa.int32()) + pyarrow.Field + >>> pa.field("key", pa.int32(), nullable=False) + pyarrow.Field + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field + pyarrow.Field + >>> field.metadata + {b'key': b'Something important'} + + Use the field to create a struct type: + + >>> pa.struct([field]) + StructType(struct) + """ + + def equals(self, other: Field, check_metadata: bool = False) -> bool: + """ + Test if this field is equal to the other + + Parameters + ---------- + other : pyarrow.Field + check_metadata : bool, default False + Whether Field metadata equality should be checked as well. + + Returns + ------- + is_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> f1 = pa.field("key", pa.int32()) + >>> f2 = pa.field("key", pa.int32(), nullable=False) + >>> f1.equals(f2) + False + >>> f1.equals(f1) + True + """ + def __hash__(self) -> int: ... + @property + def nullable(self) -> bool: + """ + The field nullability. + + Examples + -------- + >>> import pyarrow as pa + >>> f1 = pa.field("key", pa.int32()) + >>> f2 = pa.field("key", pa.int32(), nullable=False) + >>> f1.nullable + True + >>> f2.nullable + False + """ + @property + def name(self) -> str: + """ + The field name. + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field.name + 'key' + """ + @property + def metadata(self) -> dict[bytes, bytes] | None: + """ + The field metadata (if any is set). + + Returns + ------- + metadata : dict or None + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field.metadata + {b'key': b'Something important'} + """ + @property + def type(self) -> _DataTypeT: ... 
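+
+    # Hedged illustration for the ``type`` property above (comment only, not part
+    # of the stub API): the property returns the field's parametrized data type.
+    #
+    #     >>> import pyarrow as pa
+    #     >>> pa.field("key", pa.int32()).type
+    #     DataType(int32)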
+ def with_metadata(self, metadata: dict[bytes | str, bytes | str]) -> Self: + """ + Add metadata as dict of string keys and values to Field + + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + + Create new field by adding metadata to existing one: + + >>> field_new = field.with_metadata({"key": "Something important"}) + >>> field_new + pyarrow.Field + >>> field_new.metadata + {b'key': b'Something important'} + """ + def remove_metadata(self) -> Self: + """ + Create new field without metadata, if any + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field.metadata + {b'key': b'Something important'} + + Create new field by removing the metadata from the existing one: + + >>> field_new = field.remove_metadata() + >>> field_new.metadata + """ + def with_type(self, new_type: _DataTypeT) -> Field[_DataTypeT]: + """ + A copy of this field with the replaced type + + Parameters + ---------- + new_type : pyarrow.DataType + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field + pyarrow.Field + + Create new field by replacing type of an existing one: + + >>> field_new = field.with_type(pa.int64()) + >>> field_new + pyarrow.Field + """ + def with_name(self, name: str) -> Self: + """ + A copy of this field with the replaced name + + Parameters + ---------- + name : str + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field + pyarrow.Field + + Create new field by replacing the name of an existing one: + + >>> field_new = field.with_name("lock") + >>> field_new + pyarrow.Field + """ + def with_nullable(self, nullable: bool) -> Field[_DataTypeT]: + """ + A copy of this field with the replaced nullability + + Parameters + ---------- + nullable : bool + + Returns + ------- + field: pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field + pyarrow.Field + >>> field.nullable + True + + Create new field by replacing the nullability of an existing one: + + >>> field_new = field.with_nullable(False) + >>> field_new + pyarrow.Field + >>> field_new.nullable + False + """ + def flatten(self) -> list[Field]: + """ + Flatten this field. If a struct field, individual child fields + will be returned with their names prefixed by the parent's name. + + Returns + ------- + fields : List[pyarrow.Field] + + Examples + -------- + >>> import pyarrow as pa + >>> f1 = pa.field("bar", pa.float64(), nullable=False) + >>> f2 = pa.field("foo", pa.int32()).with_metadata({"key": "Something important"}) + >>> ff = pa.field("ff", pa.struct([f1, f2]), nullable=False) + + Flatten a struct field: + + >>> ff + pyarrow.Field not null> + >>> ff.flatten() + [pyarrow.Field, pyarrow.Field] + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowSchema struct, given its pointer. + + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. 
+ """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: + """ + Import Field from a C ArrowSchema struct, given its pointer. + + This is a low-level function intended for expert users. + """ + def __arrow_c_schema__(self) -> Any: + """ + Export to a ArrowSchema PyCapsule + + Unlike _export_to_c, this will not leak memory if the capsule is not used. + """ + @classmethod + def _import_from_c_capsule(cls, schema) -> Self: + """ + Import a Field from a ArrowSchema PyCapsule + + Parameters + ---------- + schema : PyCapsule + A valid PyCapsule with name 'arrow_schema' containing an + ArrowSchema pointer. + """ + +class Schema(_Weakrefable): + """ + A named collection of types a.k.a schema. A schema defines the + column names and types in a record batch or table data structure. + They also contain metadata about the columns. For example, schemas + converted from Pandas contain metadata about their original Pandas + types so they can be converted back to the same types. + + Warnings + -------- + Do not call this class's constructor directly. Instead use + :func:`pyarrow.schema` factory function which makes a new Arrow + Schema object. + + Examples + -------- + Create a new Arrow Schema object: + + >>> import pyarrow as pa + >>> pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]) + some_int: int32 + some_string: string + + Create Arrow Schema with metadata: + + >>> pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + + def __len__(self) -> int: ... + def __getitem__(self, key: str) -> Field: ... + _field = __getitem__ # pyright: ignore[reportUnknownVariableType] + def __iter__(self) -> Iterator[Field]: ... + def __hash__(self) -> int: ... + def __sizeof__(self) -> int: ... + @property + def pandas_metadata(self) -> dict: + """ + Return deserialized-from-JSON pandas metadata field (if it exists) + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> schema = pa.Table.from_pandas(df).schema + + Select pandas metadata field from Arrow Schema: + + >>> schema.pandas_metadata + {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, 'stop': 4, 'step': 1}], ... + """ + @property + def names(self) -> list[str]: + """ + The schema's field names. + + Returns + ------- + list of str + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Get the names of the schema's fields: + + >>> schema.names + ['n_legs', 'animals'] + """ + @property + def types(self) -> list[DataType]: + """ + The schema's field types. + + Returns + ------- + list of DataType + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Get the types of the schema's fields: + + >>> schema.types + [DataType(int64), DataType(string)] + """ + @property + def metadata(self) -> dict[bytes, bytes]: + """ + The schema's metadata (if any is set). + + Returns + ------- + metadata: dict or None + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... 
metadata={"n_legs": "Number of legs per animal"}, + ... ) + + Get the metadata of the schema's fields: + + >>> schema.metadata + {b'n_legs': b'Number of legs per animal'} + """ + def empty_table(self) -> Table: + """ + Provide an empty table according to the schema. + + Returns + ------- + table: pyarrow.Table + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Create an empty table with schema's fields: + + >>> schema.empty_table() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[]] + animals: [[]] + """ + def equals(self, other: Schema, check_metadata: bool = False) -> bool: + """ + Test if this schema is equal to the other + + Parameters + ---------- + other : pyarrow.Schema + check_metadata : bool, default False + Key/value metadata must be equal too + + Returns + ------- + is_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> schema1 = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> schema2 = pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]) + + Test two equal schemas: + + >>> schema1.equals(schema1) + True + + Test two unequal schemas: + + >>> schema1.equals(schema2) + False + """ + @classmethod + def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | None = None) -> Schema: + """ + Returns implied schema from dataframe + + Parameters + ---------- + df : pandas.DataFrame + preserve_index : bool, default True + Whether to store the index as an additional column (or columns, for + MultiIndex) in the resulting `Table`. + The default of None will store the index as a column, except for + RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + + Returns + ------- + pyarrow.Schema + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame({"int": [1, 2], "str": ["a", "b"]}) + + Create an Arrow Schema from the schema of a pandas dataframe: + + >>> pa.Schema.from_pandas(df) + int: int64 + str: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, ... + """ + def field(self, i: int | str | bytes) -> Field: + """ + Select a field by its column name or numeric index. + + Parameters + ---------- + i : int or string + + Returns + ------- + pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Select the second field: + + >>> schema.field(1) + pyarrow.Field + + Select the field of the column named 'n_legs': + + >>> schema.field("n_legs") + pyarrow.Field + """ + @deprecated("Use 'field' instead") + def field_by_name(self, name: str) -> Field: + """ + DEPRECATED + + Parameters + ---------- + name : str + + Returns + ------- + field: pyarrow.Field + """ + def get_field_index(self, name: str) -> int: + """ + Return index of the unique field with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + index : int + The index of the field with the given name; -1 if the + name isn't found or there are several fields with the given + name. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Get the index of the field named 'animals': + + >>> schema.get_field_index("animals") + 1 + + Index in case of several fields with the given name: + + >>> schema = pa.schema( + ... [ + ... pa.field("n_legs", pa.int64()), + ... pa.field("animals", pa.string()), + ... pa.field("animals", pa.bool_()), + ... ], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> schema.get_field_index("animals") + -1 + """ + def get_all_field_indices(self, name: str) -> list[int]: + """ + Return sorted list of indices for the fields with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + indices : List[int] + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema( + ... [ + ... pa.field("n_legs", pa.int64()), + ... pa.field("animals", pa.string()), + ... pa.field("animals", pa.bool_()), + ... ] + ... ) + + Get the indexes of the fields named 'animals': + + >>> schema.get_all_field_indices("animals") + [1, 2] + """ + def append(self, field: Field) -> Schema: + """ + Append a field at the end of the schema. + + In contrast to Python's ``list.append()`` it does return a new + object, leaving the original Schema unmodified. + + Parameters + ---------- + field : Field + + Returns + ------- + schema: Schema + New object with appended field. + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Append a field 'extra' at the end of the schema: + + >>> schema_new = schema.append(pa.field("extra", pa.bool_())) + >>> schema_new + n_legs: int64 + animals: string + extra: bool + + Original schema is unmodified: + + >>> schema + n_legs: int64 + animals: string + """ + def insert(self, i: int, field: Field) -> Schema: + """ + Add a field at position i to the schema. + + Parameters + ---------- + i : int + field : Field + + Returns + ------- + schema: Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Insert a new field on the second position: + + >>> schema.insert(1, pa.field("extra", pa.bool_())) + n_legs: int64 + extra: bool + animals: string + """ + def remove(self, i: int) -> Schema: + """ + Remove the field at index i from the schema. + + Parameters + ---------- + i : int + + Returns + ------- + schema: Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Remove the second field of the schema: + + >>> schema.remove(1) + n_legs: int64 + """ + def set(self, i: int, field: Field) -> Schema: + """ + Replace a field at position i in the schema. 
+ + Parameters + ---------- + i : int + field : Field + + Returns + ------- + schema: Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Replace the second field of the schema with a new field 'extra': + + >>> schema.set(1, pa.field("replaced", pa.bool_())) + n_legs: int64 + replaced: bool + """ + @deprecated("Use 'with_metadata' instead") + def add_metadata(self, metadata: dict) -> Schema: + """ + DEPRECATED + + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes + """ + def with_metadata(self, metadata: dict) -> Schema: + """ + Add metadata as dict of string keys and values to Schema + + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes + + Returns + ------- + schema : pyarrow.Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Add metadata to existing schema field: + + >>> schema.with_metadata({"n_legs": "Number of legs per animal"}) + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write Schema to Buffer as encapsulated IPC message + + Parameters + ---------- + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Write schema to Buffer: + + >>> schema.serialize() + + """ + def remove_metadata(self) -> Schema: + """ + Create new schema without metadata, if any + + Returns + ------- + schema : pyarrow.Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Create a new schema with removing the metadata from the original: + + >>> schema.remove_metadata() + n_legs: int64 + animals: string + """ + def to_string( + self, + truncate_metadata: bool = True, + show_field_metadata: bool = True, + show_schema_metadata: bool = True, + ) -> str: + """ + Return human-readable representation of Schema + + Parameters + ---------- + truncate_metadata : boolean, default True + Limit metadata key/value display to a single line of ~80 characters + or less + show_field_metadata : boolean, default True + Display Field-level KeyValueMetadata + show_schema_metadata : boolean, default True + Display Schema-level KeyValueMetadata + + Returns + ------- + str : the formatted output + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowSchema struct, given its pointer. + + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Schema: + """ + Import Schema from a C ArrowSchema struct, given its pointer. + + This is a low-level function intended for expert users. 
+ """ + def __arrow_c_schema__(self) -> Any: + """ + Export to a ArrowSchema PyCapsule + + Unlike _export_to_c, this will not leak memory if the capsule is not used. + """ + @staticmethod + def _import_from_c_capsule(schema: Any) -> Schema: + """ + Import a Schema from a ArrowSchema PyCapsule + + Parameters + ---------- + schema : PyCapsule + A valid PyCapsule with name 'arrow_schema' containing an + ArrowSchema pointer. + """ + +def unify_schemas( + schemas: list[Schema], *, promote_options: Literal["default", "permissive"] = "default" +) -> Schema: + """ + Unify schemas by merging fields by name. + + The resulting schema will contain the union of fields from all schemas. + Fields with the same name will be merged. Note that two fields with + different types will fail merging by default. + + - The unified field will inherit the metadata from the schema where + that field is first defined. + - The first N fields in the schema will be ordered the same as the + N fields in the first schema. + + The resulting schema will inherit its metadata from the first input + schema. + + Parameters + ---------- + schemas : list of Schema + Schemas to merge into a single one. + promote_options : str, default default + Accepts strings "default" and "permissive". + Default: null and only null can be unified with another type. + Permissive: types are promoted to the greater common denominator. + + Returns + ------- + Schema + + Raises + ------ + ArrowInvalid : + If any input schema contains fields with duplicate names. + If Fields of the same name are not mergeable. + """ + +@overload +def field(name: SupportArrowSchema) -> Field[Any]: ... +@overload +def field( + name: str, type: _DataTypeT, nullable: bool = ..., metadata: dict[Any, Any] | None = None +) -> Field[_DataTypeT]: ... +def field(*args, **kwargs): + """ + Create a pyarrow.Field instance. + + Parameters + ---------- + name : str or bytes + Name of the field. + Alternatively, you can also pass an object that implements the Arrow + PyCapsule Protocol for schemas (has an ``__arrow_c_schema__`` method). + type : pyarrow.DataType or str + Arrow datatype of the field or a string matching one. + nullable : bool, default True + Whether the field's values are nullable. + metadata : dict, default None + Optional field metadata, the keys and values must be coercible to + bytes. + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + Create an instance of pyarrow.Field: + + >>> import pyarrow as pa + >>> pa.field("key", pa.int32()) + pyarrow.Field + >>> pa.field("key", pa.int32(), nullable=False) + pyarrow.Field + + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field + pyarrow.Field + >>> field.metadata + {b'key': b'Something important'} + + Use the field to create a struct type: + + >>> pa.struct([field]) + StructType(struct) + + A str can also be passed for the type parameter: + + >>> pa.field("key", "int32") + pyarrow.Field + """ + +def null() -> NullType: + """ + Create instance of null type. + + Examples + -------- + Create an instance of a null type: + + >>> import pyarrow as pa + >>> pa.null() + DataType(null) + >>> print(pa.null()) + null + + Create a ``Field`` type with a null type and a name: + + >>> pa.field("null_field", pa.null()) + pyarrow.Field + """ + +def bool_() -> BoolType: + """ + Create instance of boolean type. 
+ + Examples + -------- + Create an instance of a boolean type: + + >>> import pyarrow as pa + >>> pa.bool_() + DataType(bool) + >>> print(pa.bool_()) + bool + + Create a ``Field`` type with a boolean type + and a name: + + >>> pa.field("bool_field", pa.bool_()) + pyarrow.Field + """ + +def uint8() -> UInt8Type: + """ + Create instance of unsigned int8 type. + + Examples + -------- + Create an instance of unsigned int8 type: + + >>> import pyarrow as pa + >>> pa.uint8() + DataType(uint8) + >>> print(pa.uint8()) + uint8 + + Create an array with unsigned int8 type: + + >>> pa.array([0, 1, 2], type=pa.uint8()) + + [ + 0, + 1, + 2 + ] + """ + +def int8() -> Int8Type: + """ + Create instance of signed int8 type. + + Examples + -------- + Create an instance of int8 type: + + >>> import pyarrow as pa + >>> pa.int8() + DataType(int8) + >>> print(pa.int8()) + int8 + + Create an array with int8 type: + + >>> pa.array([0, 1, 2], type=pa.int8()) + + [ + 0, + 1, + 2 + ] + """ + +def uint16() -> UInt16Type: + """ + Create instance of unsigned uint16 type. + + Examples + -------- + Create an instance of unsigned int16 type: + + >>> import pyarrow as pa + >>> pa.uint16() + DataType(uint16) + >>> print(pa.uint16()) + uint16 + + Create an array with unsigned int16 type: + + >>> pa.array([0, 1, 2], type=pa.uint16()) + + [ + 0, + 1, + 2 + ] + """ + +def int16() -> Int16Type: + """ + Create instance of signed int16 type. + + Examples + -------- + Create an instance of int16 type: + + >>> import pyarrow as pa + >>> pa.int16() + DataType(int16) + >>> print(pa.int16()) + int16 + + Create an array with int16 type: + + >>> pa.array([0, 1, 2], type=pa.int16()) + + [ + 0, + 1, + 2 + ] + """ + +def uint32() -> Uint32Type: + """ + Create instance of unsigned uint32 type. + + Examples + -------- + Create an instance of unsigned int32 type: + + >>> import pyarrow as pa + >>> pa.uint32() + DataType(uint32) + >>> print(pa.uint32()) + uint32 + + Create an array with unsigned int32 type: + + >>> pa.array([0, 1, 2], type=pa.uint32()) + + [ + 0, + 1, + 2 + ] + """ + +def int32() -> Int32Type: + """ + Create instance of signed int32 type. + + Examples + -------- + Create an instance of int32 type: + + >>> import pyarrow as pa + >>> pa.int32() + DataType(int32) + >>> print(pa.int32()) + int32 + + Create an array with int32 type: + + >>> pa.array([0, 1, 2], type=pa.int32()) + + [ + 0, + 1, + 2 + ] + """ + +def int64() -> Int64Type: + """ + Create instance of signed int64 type. + + Examples + -------- + Create an instance of int64 type: + + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> print(pa.int64()) + int64 + + Create an array with int64 type: + + >>> pa.array([0, 1, 2], type=pa.int64()) + + [ + 0, + 1, + 2 + ] + """ + +def uint64() -> UInt64Type: + """ + Create instance of unsigned uint64 type. 
+ + Examples + -------- + Create an instance of unsigned int64 type: + + >>> import pyarrow as pa + >>> pa.uint64() + DataType(uint64) + >>> print(pa.uint64()) + uint64 + + Create an array with unsigned uint64 type: + + >>> pa.array([0, 1, 2], type=pa.uint64()) + + [ + 0, + 1, + 2 + ] + """ + +def tzinfo_to_string(tz: dt.tzinfo) -> str: + """ + Converts a time zone object into a string indicating the name of a time + zone, one of: + * As used in the Olson time zone database (the "tz database" or + "tzdata"), such as "America/New_York" + * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + + Parameters + ---------- + tz : datetime.tzinfo + Time zone object + + Returns + ------- + name : str + Time zone name + """ + +def string_to_tzinfo(name: str) -> dt.tzinfo: + """ + Convert a time zone name into a time zone object. + + Supported input strings are: + * As used in the Olson time zone database (the "tz database" or + "tzdata"), such as "America/New_York" + * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + + Parameters + ---------- + name: str + Time zone name. + + Returns + ------- + tz : datetime.tzinfo + Time zone object + """ + +@overload +def timestamp(unit: _Unit) -> TimestampType[_Unit, _Tz]: ... +@overload +def timestamp(unit: _Unit, tz: _Tz) -> TimestampType[_Unit, _Tz]: ... +def timestamp(*args, **kwargs): + """ + Create instance of timestamp type with resolution and optional time zone. + + Parameters + ---------- + unit : str + one of 's' [second], 'ms' [millisecond], 'us' [microsecond], or 'ns' + [nanosecond] + tz : str, default None + Time zone name. None indicates time zone naive + + Examples + -------- + Create an instance of timestamp type: + + >>> import pyarrow as pa + >>> pa.timestamp("us") + TimestampType(timestamp[us]) + >>> pa.timestamp("s", tz="America/New_York") + TimestampType(timestamp[s, tz=America/New_York]) + >>> pa.timestamp("s", tz="+07:30") + TimestampType(timestamp[s, tz=+07:30]) + + Use timestamp type when creating a scalar object: + + >>> from datetime import datetime + >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp("s", tz="UTC")) + + >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp("us")) + + + Returns + ------- + timestamp_type : TimestampType + """ + +def time32(unit: _Time32Unit) -> Time32Type[_Time32Unit]: + """ + Create instance of 32-bit time (time of day) type with unit resolution. + + Parameters + ---------- + unit : str + one of 's' [second], or 'ms' [millisecond] + + Returns + ------- + type : pyarrow.Time32Type + + Examples + -------- + >>> import pyarrow as pa + >>> pa.time32("s") + Time32Type(time32[s]) + >>> pa.time32("ms") + Time32Type(time32[ms]) + """ + +def time64(unit: _Time64Unit) -> Time64Type[_Time64Unit]: + """ + Create instance of 64-bit time (time of day) type with unit resolution. + + Parameters + ---------- + unit : str + One of 'us' [microsecond], or 'ns' [nanosecond]. + + Returns + ------- + type : pyarrow.Time64Type + + Examples + -------- + >>> import pyarrow as pa + >>> pa.time64("us") + Time64Type(time64[us]) + >>> pa.time64("ns") + Time64Type(time64[ns]) + """ + +def duration(unit: _Unit) -> DurationType[_Unit]: + """ + Create instance of a duration type with unit resolution. + + Parameters + ---------- + unit : str + One of 's' [second], 'ms' [millisecond], 'us' [microsecond], or + 'ns' [nanosecond]. 
+ + Returns + ------- + type : pyarrow.DurationType + + Examples + -------- + Create an instance of duration type: + + >>> import pyarrow as pa + >>> pa.duration("us") + DurationType(duration[us]) + >>> pa.duration("s") + DurationType(duration[s]) + + Create an array with duration type: + + >>> pa.array([0, 1, 2], type=pa.duration("s")) + + [ + 0, + 1, + 2 + ] + """ + +def month_day_nano_interval() -> MonthDayNanoIntervalType: + """ + Create instance of an interval type representing months, days and + nanoseconds between two dates. + + Examples + -------- + Create an instance of an month_day_nano_interval type: + + >>> import pyarrow as pa + >>> pa.month_day_nano_interval() + DataType(month_day_nano_interval) + + Create a scalar with month_day_nano_interval type: + + >>> pa.scalar((1, 15, -30), type=pa.month_day_nano_interval()) + + """ + +def date32() -> Date32Type: + """ + Create instance of 32-bit date (days since UNIX epoch 1970-01-01). + + Examples + -------- + Create an instance of 32-bit date type: + + >>> import pyarrow as pa + >>> pa.date32() + DataType(date32[day]) + + Create a scalar with 32-bit date type: + + >>> from datetime import date + >>> pa.scalar(date(2012, 1, 1), type=pa.date32()) + + """ + +def date64() -> Date64Type: + """ + Create instance of 64-bit date (milliseconds since UNIX epoch 1970-01-01). + + Examples + -------- + Create an instance of 64-bit date type: + + >>> import pyarrow as pa + >>> pa.date64() + DataType(date64[ms]) + + Create a scalar with 64-bit date type: + + >>> from datetime import datetime + >>> pa.scalar(datetime(2012, 1, 1), type=pa.date64()) + + """ + +def float16() -> Float16Type: + """ + Create half-precision floating point type. + + Examples + -------- + Create an instance of float16 type: + + >>> import pyarrow as pa + >>> pa.float16() + DataType(halffloat) + >>> print(pa.float16()) + halffloat + + Create an array with float16 type: + + >>> arr = np.array([1.5, np.nan], dtype=np.float16) + >>> a = pa.array(arr, type=pa.float16()) + >>> a + + [ + 15872, + 32256 + ] + + Note that unlike other float types, if you convert this array + to a python list, the types of its elements will be ``np.float16`` + + >>> [type(val) for val in a.to_pylist()] + [, ] + """ + +def float32() -> Float32Type: + """ + Create single-precision floating point type. + + Examples + -------- + Create an instance of float32 type: + + >>> import pyarrow as pa + >>> pa.float32() + DataType(float) + >>> print(pa.float32()) + float + + Create an array with float32 type: + + >>> pa.array([0.0, 1.0, 2.0], type=pa.float32()) + + [ + 0, + 1, + 2 + ] + """ + +def float64() -> Float64Type: + """ + Create double-precision floating point type. + + Examples + -------- + Create an instance of float64 type: + + >>> import pyarrow as pa + >>> pa.float64() + DataType(double) + >>> print(pa.float64()) + double + + Create an array with float64 type: + + >>> pa.array([0.0, 1.0, 2.0], type=pa.float64()) + + [ + 0, + 1, + 2 + ] + """ + +@overload +def decimal32(precision: _Precision) -> Decimal32Type[_Precision, Literal[0]]: ... +@overload +def decimal32(precision: _Precision, scale: _Scale) -> Decimal32Type[_Precision, _Scale]: ... +def decimal32(*args, **kwargs): + """ + Create decimal type with precision and scale and 32-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. 
The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal32(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 32-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. + + ``decimal32(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 32-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 9 significant digits, consider + using ``decimal64``, ``decimal128``, or ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 9 + scale : int + + Returns + ------- + decimal_type : Decimal32Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal32(5, 2) + Decimal32Type(decimal32(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal("123.45") + >>> pa.array([a], pa.decimal32(5, 2)) + + [ + 123.45 + ] + """ + +@overload +def decimal64(precision: _Precision) -> Decimal64Type[_Precision, Literal[0]]: ... +@overload +def decimal64(precision: _Precision, scale: _Scale) -> Decimal64Type[_Precision, _Scale]: ... +def decimal64(*args, **kwargs): + """ + Create decimal type with precision and scale and 64-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal64(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 64-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. + + ``decimal64(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 64-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 18 significant digits, consider + using ``decimal128``, or ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 18 + scale : int + + Returns + ------- + decimal_type : Decimal64Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal64(5, 2) + Decimal64Type(decimal64(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal("123.45") + >>> pa.array([a], pa.decimal64(5, 2)) + + [ + 123.45 + ] + """ + +@overload +def decimal128(precision: _Precision) -> Decimal128Type[_Precision, Literal[0]]: ... +@overload +def decimal128(precision: _Precision, scale: _Scale) -> Decimal128Type[_Precision, _Scale]: ... +def decimal128(*args, **kwargs): + """ + Create decimal type with precision and scale and 128-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal128(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 128-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. 
+ + ``decimal128(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 128-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 38 significant digits, consider + using ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 38 + scale : int + + Returns + ------- + decimal_type : Decimal128Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal128(5, 2) + Decimal128Type(decimal128(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal("123.45") + >>> pa.array([a], pa.decimal128(5, 2)) + + [ + 123.45 + ] + """ + +@overload +def decimal256(precision: _Precision) -> Decimal256Type[_Precision, Literal[0]]: ... +@overload +def decimal256(precision: _Precision, scale: _Scale) -> Decimal256Type[_Precision, _Scale]: ... +def decimal256(*args, **kwargs): + """ + Create decimal type with precision and scale and 256-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + For most use cases, the maximum precision offered by ``decimal128`` + is sufficient, and it will result in a more compact and more efficient + encoding. ``decimal256`` is useful if you need a precision higher + than 38 significant digits. + + Parameters + ---------- + precision : int + Must be between 1 and 76 + scale : int + + Returns + ------- + decimal_type : Decimal256Type + """ + +def string() -> StringType: + """ + Create UTF8 variable-length string type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.string() + DataType(string) + + and use the string type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.string()) + + [ + "foo", + "bar", + "baz" + ] + """ + +utf8 = string +""" +Alias for string(). + +Examples +-------- +Create an instance of a string type: + +>>> import pyarrow as pa +>>> pa.utf8() +DataType(string) + +and use the string type to create an array: + +>>> pa.array(['foo', 'bar', 'baz'], type=pa.utf8()) + +[ + "foo", + "bar", + "baz" +] +""" + +@overload +def binary(length: Literal[-1] = ...) -> BinaryType: ... +@overload +def binary(length: int) -> FixedSizeBinaryType: ... +def binary(length): + """ + Create variable-length or fixed size binary type. + + Parameters + ---------- + length : int, optional, default -1 + If length == -1 then return a variable length binary type. If length is + greater than or equal to 0 then return a fixed size binary type of + width `length`. + + Examples + -------- + Create an instance of a variable-length binary type: + + >>> import pyarrow as pa + >>> pa.binary() + DataType(binary) + + and use the variable-length binary type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.binary()) + + [ + 666F6F, + 626172, + 62617A + ] + + Create an instance of a fixed-size binary type: + + >>> pa.binary(3) + FixedSizeBinaryType(fixed_size_binary[3]) + + and use the fixed-length binary type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.binary(3)) + + [ + 666F6F, + 626172, + 62617A + ] + """ + +def large_binary() -> LargeBinaryType: + """ + Create large variable-length binary type. + + This data type may not be supported by all Arrow implementations. 
Unless + you need to represent data larger than 2GB, you should prefer binary(). + + Examples + -------- + Create an instance of large variable-length binary type: + + >>> import pyarrow as pa + >>> pa.large_binary() + DataType(large_binary) + + and use the type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.large_binary()) + + [ + 666F6F, + 626172, + 62617A + ] + """ + +def large_string() -> LargeStringType: + """ + Create large UTF8 variable-length string type. + + This data type may not be supported by all Arrow implementations. Unless + you need to represent data larger than 2GB, you should prefer string(). + + Examples + -------- + Create an instance of large UTF8 variable-length binary type: + + >>> import pyarrow as pa + >>> pa.large_string() + DataType(large_string) + + and use the type to create an array: + + >>> pa.array(["foo", "bar"] * 50, type=pa.large_string()) + + [ + "foo", + "bar", + ... + "foo", + "bar" + ] + """ + +large_utf8 = large_string +""" +Alias for large_string(). + +Examples +-------- +Create an instance of large UTF8 variable-length binary type: + +>>> import pyarrow as pa +>>> pa.large_utf8() +DataType(large_string) + +and use the type to create an array: + +>>> pa.array(['foo', 'bar'] * 50, type=pa.large_utf8()) + +[ + "foo", + "bar", + ... + "foo", + "bar" +] +""" + +def binary_view() -> BinaryViewType: + """ + Create a variable-length binary view type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.binary_view() + DataType(binary_view) + """ + +def string_view() -> StringViewType: + """ + Create UTF8 variable-length string view type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.string_view() + DataType(string_view) + """ + +@overload +def list_( + value_type: _DataTypeT | Field[_DataTypeT], list_size: Literal[-1] = ... +) -> ListType[_DataTypeT]: ... +@overload +def list_( + value_type: _DataTypeT | Field[_DataTypeT], list_size: _Size +) -> FixedSizeListType[_DataTypeT, _Size]: ... +def list_(*args, **kwargs): + """ + Create ListType instance from child data type or field. + + Parameters + ---------- + value_type : DataType or Field + list_size : int, optional, default -1 + If length == -1 then return a variable length list type. If length is + greater than or equal to 0 then return a fixed size list type. + + Returns + ------- + list_type : DataType + + Examples + -------- + Create an instance of ListType: + + >>> import pyarrow as pa + >>> pa.list_(pa.string()) + ListType(list) + >>> pa.list_(pa.int32(), 2) + FixedSizeListType(fixed_size_list[2]) + + Use the ListType to create a scalar: + + >>> pa.scalar(["foo", None], type=pa.list_(pa.string(), 2)) + + + or an array: + + >>> pa.array([[1, 2], [3, 4]], pa.list_(pa.int32(), 2)) + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + """ + +def large_list(value_type: _DataTypeT | Field[_DataTypeT]) -> LargeListType[_DataTypeT]: + """ + Create LargeListType instance from child data type or field. + + This data type may not be supported by all Arrow implementations. + Unless you need to represent data larger than 2**31 elements, you should + prefer list_(). 
+
+    Parameters
+    ----------
+    value_type : DataType or Field
+
+    Returns
+    -------
+    list_type : DataType
+
+    Examples
+    --------
+    Create an instance of LargeListType:
+
+    >>> import pyarrow as pa
+    >>> pa.large_list(pa.int8())
+    LargeListType(large_list)
+
+    Use the LargeListType to create an array:
+
+    >>> pa.array([[-1, 3]] * 5, type=pa.large_list(pa.int8()))
+
+    [
+      [
+        -1,
+        3
+      ],
+      [
+        -1,
+        3
+      ],
+    ...
+    """
+
+def list_view(value_type: _DataTypeT | Field[_DataTypeT]) -> ListViewType[_DataTypeT]:
+    """
+    Create ListViewType instance from child data type or field.
+
+    This data type may not be supported by all Arrow implementations
+    because it is an alternative to the ListType.
+
+    Parameters
+    ----------
+    value_type : DataType or Field
+
+    Returns
+    -------
+    list_view_type : DataType
+
+    Examples
+    --------
+    Create an instance of ListViewType:
+
+    >>> import pyarrow as pa
+    >>> pa.list_view(pa.string())
+    ListViewType(list_view)
+    """
+
+def large_list_view(
+    value_type: _DataTypeT | Field[_DataTypeT],
+) -> LargeListViewType[_DataTypeT]:
+    """
+    Create LargeListViewType instance from child data type or field.
+
+    This data type may not be supported by all Arrow implementations
+    because it is an alternative to the ListType.
+
+    Parameters
+    ----------
+    value_type : DataType or Field
+
+    Returns
+    -------
+    list_view_type : DataType
+
+    Examples
+    --------
+    Create an instance of LargeListViewType:
+
+    >>> import pyarrow as pa
+    >>> pa.large_list_view(pa.int8())
+    LargeListViewType(large_list_view)
+    """
+
+@overload
+def map_(key_type: _K, item_type: _ValueT) -> MapType[_K, _ValueT, _Ordered]: ...
+@overload
+def map_(
+    key_type: _K, item_type: _ValueT, keys_sorted: _Ordered
+) -> MapType[_K, _ValueT, _Ordered]: ...
+def map_(*args, **kwargs):
+    """
+    Create MapType instance from key and item data types or fields.
+
+    Parameters
+    ----------
+    key_type : DataType or Field
+    item_type : DataType or Field
+    keys_sorted : bool
+
+    Returns
+    -------
+    map_type : DataType
+
+    Examples
+    --------
+    Create an instance of MapType:
+
+    >>> import pyarrow as pa
+    >>> pa.map_(pa.string(), pa.int32())
+    MapType(map)
+    >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True)
+    MapType(map)
+
+    Use MapType to create an array:
+
+    >>> data = [[{"key": "a", "value": 1}, {"key": "b", "value": 2}], [{"key": "c", "value": 3}]]
+    >>> pa.array(data, type=pa.map_(pa.string(), pa.int32(), keys_sorted=True))
+
+    [
+      keys:
+      [
+        "a",
+        "b"
+      ]
+      values:
+      [
+        1,
+        2
+      ],
+      keys:
+      [
+        "c"
+      ]
+      values:
+      [
+        3
+      ]
+    ]
+    """
+
+@overload
+def dictionary(
+    index_type: _IndexT, value_type: _BasicValueT
+) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ...
+@overload
+def dictionary(
+    index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered
+) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ...
+def dictionary(*args, **kwargs):
+    """
+    Dictionary (categorical, or simply encoded) type.
+
+    Parameters
+    ----------
+    index_type : DataType
+    value_type : DataType
+    ordered : bool
+
+    Returns
+    -------
+    type : DictionaryType
+
+    Examples
+    --------
+    Create an instance of dictionary type:
+
+    >>> import pyarrow as pa
+    >>> pa.dictionary(pa.int64(), pa.utf8())
+    DictionaryType(dictionary)
+
+    Use dictionary type to create an array:
+
+    >>> pa.array(["a", "b", None, "d"], pa.dictionary(pa.int64(), pa.utf8()))
+
+    ...
+ -- dictionary: + [ + "a", + "b", + "d" + ] + -- indices: + [ + 0, + 1, + null, + 2 + ] + """ + +def struct( + fields: Iterable[Field[Any] | tuple[str, Field[Any]] | tuple[str, DataType]] + | Mapping[str, Field[Any]], +) -> StructType: + """ + Create StructType instance from fields. + + A struct is a nested type parameterized by an ordered sequence of types + (which can all be distinct), called its fields. + + Parameters + ---------- + fields : iterable of Fields or tuples, or mapping of strings to DataTypes + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + + Examples + -------- + Create an instance of StructType from an iterable of tuples: + + >>> import pyarrow as pa + >>> fields = [ + ... ("f1", pa.int32()), + ... ("f2", pa.string()), + ... ] + >>> struct_type = pa.struct(fields) + >>> struct_type + StructType(struct) + + Retrieve a field from a StructType: + + >>> struct_type[0] + pyarrow.Field + >>> struct_type["f1"] + pyarrow.Field + + Create an instance of StructType from an iterable of Fields: + + >>> fields = [ + ... pa.field("f1", pa.int32()), + ... pa.field("f2", pa.string(), nullable=False), + ... ] + >>> pa.struct(fields) + StructType(struct) + + Returns + ------- + type : DataType + """ + +def sparse_union( + child_fields: list[Field[Any]], type_codes: list[int] | None = None +) -> SparseUnionType: + """ + Create SparseUnionType from child fields. + + A sparse union is a nested type where each logical value is taken from + a single child. A buffer of 8-bit type ids indicates which child + a given logical value is to be taken from. + + In a sparse union, each child array should have the same length as the + union array, regardless of the actual number of union values that + refer to it. + + Parameters + ---------- + child_fields : sequence of Field values + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + type_codes : list of integers, default None + + Returns + ------- + type : SparseUnionType + """ + +def dense_union( + child_fields: list[Field[Any]], type_codes: list[int] | None = None +) -> DenseUnionType: + """ + Create DenseUnionType from child fields. + + A dense union is a nested type where each logical value is taken from + a single child, at a specific offset. A buffer of 8-bit type ids + indicates which child a given logical value is to be taken from, + and a buffer of 32-bit offsets indicates at which physical position + in the given child array the logical value is to be taken from. + + Unlike a sparse union, a dense union allows encoding only the child array + values which are actually referred to by the union array. This is + counterbalanced by the additional footprint of the offsets buffer, and + the additional indirection cost when looking up values. + + Parameters + ---------- + child_fields : sequence of Field values + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + type_codes : list of integers, default None + + Returns + ------- + type : DenseUnionType + """ + +@overload +def union( + child_fields: list[Field[Any]], mode: Literal["sparse"], type_codes: list[int] | None = None +) -> SparseUnionType: ... +@overload +def union( + child_fields: list[Field[Any]], mode: Literal["dense"], type_codes: list[int] | None = None +) -> DenseUnionType: ... +def union(*args, **kwargs): + """ + Create UnionType from child fields. + + A union is a nested type where each logical value is taken from a + single child. 
A buffer of 8-bit type ids indicates which child + a given logical value is to be taken from. + + Unions come in two flavors: sparse and dense + (see also `pyarrow.sparse_union` and `pyarrow.dense_union`). + + Parameters + ---------- + child_fields : sequence of Field values + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + mode : str + Must be 'sparse' or 'dense' + type_codes : list of integers, default None + + Returns + ------- + type : UnionType + """ + +def run_end_encoded( + run_end_type: _RunEndType, value_type: _BasicValueT +) -> RunEndEncodedType[_RunEndType, _BasicValueT]: + """ + Create RunEndEncodedType from run-end and value types. + + Parameters + ---------- + run_end_type : pyarrow.DataType + The integer type of the run_ends array. Must be 'int16', 'int32', or 'int64'. + value_type : pyarrow.DataType + The type of the values array. + + Returns + ------- + type : RunEndEncodedType + """ + +def json_(storage_type: DataType = ...) -> JsonType: + """ + Create instance of JSON extension type. + + Parameters + ---------- + storage_type : DataType, default pyarrow.string() + The underlying data type. Can be on of the following types: + string, large_string, string_view. + + Returns + ------- + type : JsonType + + Examples + -------- + Create an instance of JSON extension type: + + >>> import pyarrow as pa + >>> pa.json_(pa.utf8()) + JsonType(extension) + + Use the JSON type to create an array: + + >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8())) + + [ + "{"a": 1}", + "{"b": 2}" + ] + """ + +def uuid() -> UuidType: + """ + Create UuidType instance. + + Returns + ------- + type : UuidType + """ + +def fixed_shape_tensor( + value_type: _ValueT, + shape: Sequence[int], + dim_names: Sequence[str] | None = None, + permutation: Sequence[int] | None = None, +) -> FixedShapeTensorType[_ValueT]: + """ + Create instance of fixed shape tensor extension type with shape and optional + names of tensor dimensions and indices of the desired logical + ordering of dimensions. + + Parameters + ---------- + value_type : DataType + Data type of individual tensor elements. + shape : tuple or list of integers + The physical shape of the contained tensors. + dim_names : tuple or list of strings, default None + Explicit names to tensor dimensions. + permutation : tuple or list integers, default None + Indices of the desired ordering of the original dimensions. + The indices contain a permutation of the values ``[0, 1, .., N-1]`` where + N is the number of dimensions. The permutation indicates which dimension + of the logical layout corresponds to which dimension of the physical tensor. + For more information on this parameter see + :ref:`fixed_shape_tensor_extension`. 
+ + Examples + -------- + Create an instance of fixed shape tensor extension type: + + >>> import pyarrow as pa + >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) + >>> tensor_type + FixedShapeTensorType(extension) + + Inspect the data type: + + >>> tensor_type.value_type + DataType(int32) + >>> tensor_type.shape + [2, 2] + + Create a table with fixed shape tensor extension array: + + >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] + >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) + >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage) + >>> pa.table([tensor], names=["tensor_array"]) + pyarrow.Table + tensor_array: extension + ---- + tensor_array: [[[1,2,3,4],[10,20,30,40],[100,200,300,400]]] + + Create an instance of fixed shape tensor extension type with names + of tensor dimensions: + + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), dim_names=["C", "H", "W"]) + >>> tensor_type.dim_names + ['C', 'H', 'W'] + + Create an instance of fixed shape tensor extension type with + permutation: + + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + + Returns + ------- + type : FixedShapeTensorType + """ + +def bool8() -> Bool8Type: + """ + Create instance of bool8 extension type. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> type = pa.bool8() + >>> type + Bool8Type(extension) + + Inspect the data type: + + >>> type.storage_type + DataType(int8) + + Create a table with a bool8 array: + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension + ---- + unknown_col: [[-1,0,1,2,null]] + + Returns + ------- + type : Bool8Type + """ + +def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueType: + """ + Create instance of opaque extension type. + + Parameters + ---------- + storage_type : DataType + The underlying data type. + type_name : str + The name of the type in the external system. + vendor_name : str + The name of the external system. + + Examples + -------- + Create an instance of an opaque extension type: + + >>> import pyarrow as pa + >>> type = pa.opaque(pa.binary(), "other", "jdbc") + >>> type + OpaqueType(extension) + + Inspect the data type: + + >>> type.storage_type + DataType(binary) + >>> type.type_name + 'other' + >>> type.vendor_name + 'jdbc' + + Create a table with an opaque array: + + >>> arr = [None, b"foobar"] + >>> storage = pa.array(arr, pa.binary()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension + ---- + unknown_col: [[null,666F6F626172]] + + Returns + ------- + type : OpaqueType + """ + +@overload +def type_for_alias(name: Literal["null"]) -> NullType: ... +@overload +def type_for_alias(name: Literal["bool", "boolean"]) -> BoolType: ... +@overload +def type_for_alias(name: Literal["i1", "int8"]) -> Int8Type: ... +@overload +def type_for_alias(name: Literal["i2", "int16"]) -> Int16Type: ... +@overload +def type_for_alias(name: Literal["i4", "int32"]) -> Int32Type: ... +@overload +def type_for_alias(name: Literal["i8", "int64"]) -> Int64Type: ... +@overload +def type_for_alias(name: Literal["u1", "uint8"]) -> UInt8Type: ... +@overload +def type_for_alias(name: Literal["u2", "uint16"]) -> UInt16Type: ... 
+@overload +def type_for_alias(name: Literal["u4", "uint32"]) -> Uint32Type: ... +@overload +def type_for_alias(name: Literal["u8", "uint64"]) -> UInt64Type: ... +@overload +def type_for_alias(name: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... +@overload +def type_for_alias(name: Literal["f4", "float", "float32"]) -> Float32Type: ... +@overload +def type_for_alias(name: Literal["f8", "double", "float64"]) -> Float64Type: ... +@overload +def type_for_alias(name: Literal["string", "str", "utf8"]) -> StringType: ... +@overload +def type_for_alias(name: Literal["binary"]) -> BinaryType: ... +@overload +def type_for_alias( + name: Literal["large_string", "large_str", "large_utf8"], +) -> LargeStringType: ... +@overload +def type_for_alias(name: Literal["large_binary"]) -> LargeBinaryType: ... +@overload +def type_for_alias(name: Literal["binary_view"]) -> BinaryViewType: ... +@overload +def type_for_alias(name: Literal["string_view"]) -> StringViewType: ... +@overload +def type_for_alias(name: Literal["date32", "date32[day]"]) -> Date32Type: ... +@overload +def type_for_alias(name: Literal["date64", "date64[ms]"]) -> Date64Type: ... +@overload +def type_for_alias(name: Literal["time32[s]"]) -> Time32Type[Literal["s"]]: ... +@overload +def type_for_alias(name: Literal["time32[ms]"]) -> Time32Type[Literal["ms"]]: ... +@overload +def type_for_alias(name: Literal["time64[us]"]) -> Time64Type[Literal["us"]]: ... +@overload +def type_for_alias(name: Literal["time64[ns]"]) -> Time64Type[Literal["ns"]]: ... +@overload +def type_for_alias(name: Literal["timestamp[s]"]) -> TimestampType[Literal["s"], Any]: ... +@overload +def type_for_alias(name: Literal["timestamp[ms]"]) -> TimestampType[Literal["ms"], Any]: ... +@overload +def type_for_alias(name: Literal["timestamp[us]"]) -> TimestampType[Literal["us"], Any]: ... +@overload +def type_for_alias(name: Literal["timestamp[ns]"]) -> TimestampType[Literal["ns"], Any]: ... +@overload +def type_for_alias(name: Literal["duration[s]"]) -> DurationType[Literal["s"]]: ... +@overload +def type_for_alias(name: Literal["duration[ms]"]) -> DurationType[Literal["ms"]]: ... +@overload +def type_for_alias(name: Literal["duration[us]"]) -> DurationType[Literal["us"]]: ... +@overload +def type_for_alias(name: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... +@overload +def type_for_alias(name: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... +def type_for_alias(name): + """ + Return DataType given a string alias if one exists. + + Parameters + ---------- + name : str + The alias of the DataType that should be retrieved. + + Returns + ------- + type : DataType + """ + +@overload +def ensure_type(ty: None, allow_none: Literal[True]) -> None: ... +@overload +def ensure_type(ty: _DataTypeT) -> _DataTypeT: ... +@overload +def ensure_type(ty: Literal["null"]) -> NullType: ... +@overload +def ensure_type(ty: Literal["bool", "boolean"]) -> BoolType: ... +@overload +def ensure_type(ty: Literal["i1", "int8"]) -> Int8Type: ... +@overload +def ensure_type(ty: Literal["i2", "int16"]) -> Int16Type: ... +@overload +def ensure_type(ty: Literal["i4", "int32"]) -> Int32Type: ... +@overload +def ensure_type(ty: Literal["i8", "int64"]) -> Int64Type: ... +@overload +def ensure_type(ty: Literal["u1", "uint8"]) -> UInt8Type: ... +@overload +def ensure_type(ty: Literal["u2", "uint16"]) -> UInt16Type: ... +@overload +def ensure_type(ty: Literal["u4", "uint32"]) -> Uint32Type: ... 
+@overload +def ensure_type(ty: Literal["u8", "uint64"]) -> UInt64Type: ... +@overload +def ensure_type(ty: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... +@overload +def ensure_type(ty: Literal["f4", "float", "float32"]) -> Float32Type: ... +@overload +def ensure_type(ty: Literal["f8", "double", "float64"]) -> Float64Type: ... +@overload +def ensure_type(ty: Literal["string", "str", "utf8"]) -> StringType: ... +@overload +def ensure_type(ty: Literal["binary"]) -> BinaryType: ... +@overload +def ensure_type( + ty: Literal["large_string", "large_str", "large_utf8"], +) -> LargeStringType: ... +@overload +def ensure_type(ty: Literal["large_binary"]) -> LargeBinaryType: ... +@overload +def ensure_type(ty: Literal["binary_view"]) -> BinaryViewType: ... +@overload +def ensure_type(ty: Literal["string_view"]) -> StringViewType: ... +@overload +def ensure_type(ty: Literal["date32", "date32[day]"]) -> Date32Type: ... +@overload +def ensure_type(ty: Literal["date64", "date64[ms]"]) -> Date64Type: ... +@overload +def ensure_type(ty: Literal["time32[s]"]) -> Time32Type[Literal["s"]]: ... +@overload +def ensure_type(ty: Literal["time32[ms]"]) -> Time32Type[Literal["ms"]]: ... +@overload +def ensure_type(ty: Literal["time64[us]"]) -> Time64Type[Literal["us"]]: ... +@overload +def ensure_type(ty: Literal["time64[ns]"]) -> Time64Type[Literal["ns"]]: ... +@overload +def ensure_type(ty: Literal["timestamp[s]"]) -> TimestampType[Literal["s"], Any]: ... +@overload +def ensure_type(ty: Literal["timestamp[ms]"]) -> TimestampType[Literal["ms"], Any]: ... +@overload +def ensure_type(ty: Literal["timestamp[us]"]) -> TimestampType[Literal["us"], Any]: ... +@overload +def ensure_type(ty: Literal["timestamp[ns]"]) -> TimestampType[Literal["ns"], Any]: ... +@overload +def ensure_type(ty: Literal["duration[s]"]) -> DurationType[Literal["s"]]: ... +@overload +def ensure_type(ty: Literal["duration[ms]"]) -> DurationType[Literal["ms"]]: ... +@overload +def ensure_type(ty: Literal["duration[us]"]) -> DurationType[Literal["us"]]: ... +@overload +def ensure_type(ty: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... +@overload +def ensure_type(ty: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... +def schema( + fields: Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType], + metadata: dict[bytes | str, bytes | str] | None = None, +) -> Schema: + """ + Construct pyarrow.Schema from collection of fields. + + Parameters + ---------- + fields : iterable of Fields or tuples, or mapping of strings to DataTypes + Can also pass an object that implements the Arrow PyCapsule Protocol + for schemas (has an ``__arrow_c_schema__`` method). + metadata : dict, default None + Keys and values must be coercible to bytes. + + Examples + -------- + Create a Schema from iterable of tuples: + + >>> import pyarrow as pa + >>> pa.schema( + ... [ + ... ("some_int", pa.int32()), + ... ("some_string", pa.string()), + ... pa.field("some_required_string", pa.string(), nullable=False), + ... ] + ... ) + some_int: int32 + some_string: string + some_required_string: string not null + + Create a Schema from iterable of Fields: + + >>> pa.schema([pa.field("some_int", pa.int32()), pa.field("some_string", pa.string())]) + some_int: int32 + some_string: string + + DataTypes can also be passed as strings. 
The following is equivalent to the + above example: + + >>> pa.schema([pa.field("some_int", "int32"), pa.field("some_string", "string")]) + some_int: int32 + some_string: string + + Or more concisely: + + >>> pa.schema([("some_int", "int32"), ("some_string", "string")]) + some_int: int32 + some_string: string + + Returns + ------- + schema : pyarrow.Schema + """ + +def from_numpy_dtype(dtype: np.dtype[Any]) -> DataType: + """ + Convert NumPy dtype to pyarrow.DataType. + + Parameters + ---------- + dtype : the numpy dtype to convert + + + Examples + -------- + Create a pyarrow DataType from NumPy dtype: + + >>> import pyarrow as pa + >>> import numpy as np + >>> pa.from_numpy_dtype(np.dtype("float16")) + DataType(halffloat) + >>> pa.from_numpy_dtype("U") + DataType(string) + >>> pa.from_numpy_dtype(bool) + DataType(bool) + >>> pa.from_numpy_dtype(np.str_) + DataType(string) + """ + +def is_boolean_value(obj: Any) -> bool: + """ + Check if the object is a boolean. + + Parameters + ---------- + obj : object + The object to check + """ + +def is_integer_value(obj: Any) -> bool: + """ + Check if the object is an integer. + + Parameters + ---------- + obj : object + The object to check + """ + +def is_float_value(obj: Any) -> bool: + """ + Check if the object is a float. + + Parameters + ---------- + obj : object + The object to check + """ + +__all__ = [ + "_Weakrefable", + "_Metadata", + "DataType", + "_BasicDataType", + "NullType", + "BoolType", + "UInt8Type", + "Int8Type", + "UInt16Type", + "Int16Type", + "Uint32Type", + "Int32Type", + "UInt64Type", + "Int64Type", + "Float16Type", + "Float32Type", + "Float64Type", + "Date32Type", + "Date64Type", + "MonthDayNanoIntervalType", + "StringType", + "LargeStringType", + "StringViewType", + "BinaryType", + "LargeBinaryType", + "BinaryViewType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", + "Decimal128Type", + "Decimal256Type", + "ListType", + "LargeListType", + "ListViewType", + "LargeListViewType", + "FixedSizeListType", + "DictionaryMemo", + "DictionaryType", + "MapType", + "StructType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "RunEndEncodedType", + "BaseExtensionType", + "ExtensionType", + "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", + "PyExtensionType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "KeyValueMetadata", + "ensure_metadata", + "Field", + "Schema", + "unify_schemas", + "field", + "null", + "bool_", + "uint8", + "int8", + "uint16", + "int16", + "uint32", + "int32", + "int64", + "uint64", + "tzinfo_to_string", + "string_to_tzinfo", + "timestamp", + "time32", + "time64", + "duration", + "month_day_nano_interval", + "date32", + "date64", + "float16", + "float32", + "float64", + "decimal32", + "decimal64", + "decimal128", + "decimal256", + "string", + "utf8", + "binary", + "large_binary", + "large_string", + "large_utf8", + "binary_view", + "string_view", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "dictionary", + "struct", + "sparse_union", + "dense_union", + "union", + "run_end_encoded", + "json_", + "uuid", + "fixed_shape_tensor", + "bool8", + "opaque", + "type_for_alias", + "ensure_type", + "schema", + "from_numpy_dtype", + "is_boolean_value", + "is_integer_value", + "is_float_value", +] diff --git a/python/pyarrow/_azurefs.pyi b/python/pyarrow/_azurefs.pyi new file mode 100644 index 00000000000..317943ce20f --- 
/dev/null
+++ b/python/pyarrow/_azurefs.pyi
@@ -0,0 +1,74 @@
+from typing import Literal
+
+from ._fs import FileSystem
+
+class AzureFileSystem(FileSystem):
+    """
+    Azure Blob Storage backed FileSystem implementation
+
+    This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a.
+    Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific
+    features will be used when they provide a performance advantage. Azurite emulator is
+    also supported. Note: `/` is the only supported delimiter.
+
+    The storage account is considered the root of the filesystem. When enabled, containers
+    will be created or deleted during relevant directory operations. Obviously, this also
+    requires authentication with the additional permissions.
+
+    By default ``DefaultAzureCredential``
+    is used for authentication. This means it will try several types of authentication
+    and go with the first one that works. If any authentication parameters are provided when
+    initialising the FileSystem, they will be used instead of the default credential.
+
+    Parameters
+    ----------
+    account_name : str
+        Azure Blob Storage account name. This is the globally unique identifier for the
+        storage account.
+    account_key : str, default None
+        Account key of the storage account. If sas_token and account_key are None the
+        default credential will be used. The parameters account_key and sas_token are
+        mutually exclusive.
+    blob_storage_authority : str, default None
+        hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful
+        for connecting to a local emulator, like Azurite.
+    dfs_storage_authority : str, default None
+        hostname[:port] of the Data Lake Gen 2 Service. Defaults to
+        `.dfs.core.windows.net`. Useful for connecting to a local emulator, like Azurite.
+    blob_storage_scheme : str, default None
+        Either `http` or `https`. Defaults to `https`. Useful for connecting to a local
+        emulator, like Azurite.
+    dfs_storage_scheme : str, default None
+        Either `http` or `https`. Defaults to `https`. Useful for connecting to a local
+        emulator, like Azurite.
+    sas_token : str, default None
+        SAS token for the storage account, used as an alternative to account_key. If sas_token
+        and account_key are None the default credential will be used. The parameters
+        account_key and sas_token are mutually exclusive.
+
+    Examples
+    --------
+    >>> from pyarrow import fs
+    >>> azure_fs = fs.AzureFileSystem(account_name="myaccount")
+    >>> azurite_fs = fs.AzureFileSystem(
+    ...     account_name="devstoreaccount1",
+    ...     account_key="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==",
+    ...     blob_storage_authority="127.0.0.1:10000",
+    ...     dfs_storage_authority="127.0.0.1:10000",
+    ...     blob_storage_scheme="http",
+    ...     dfs_storage_scheme="http",
+    ... )
+
+    For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`.
+    """
+
+    def __init__(
+        self,
+        account_name: str,
+        account_key: str | None = None,
+        blob_storage_authority: str | None = None,
+        dfs_storage_authority: str | None = None,
+        blob_storage_scheme: Literal["http", "https"] = "https",
+        dfs_storage_scheme: Literal["http", "https"] = "https",
+        sas_token: str | None = None,
+    ) -> None: ...
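A quick usage sketch for the stub above (editorial illustration, not part of the patch): the keyword arguments mirror the runtime constructor, so a call against a local Azurite emulator should type-check cleanly against `_azurefs.pyi`. The account name and key below are Azurite's published development defaults, the path "data" is a placeholder container, and a pyarrow build with Azure filesystem support is assumed.

from pyarrow import fs

# Connect to a local Azurite emulator; both endpoints are served over plain HTTP.
azurite = fs.AzureFileSystem(
    account_name="devstoreaccount1",
    account_key="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==",
    blob_storage_authority="127.0.0.1:10000",
    dfs_storage_authority="127.0.0.1:10000",
    blob_storage_scheme="http",
    dfs_storage_scheme="http",
)

# Methods come typed from the shared FileSystem stubs in _fs.pyi.
info = azurite.get_file_info("data")  # pyarrow.fs.FileInfo for the "data" container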
diff --git a/python/pyarrow/_compute.pyi b/python/pyarrow/_compute.pyi new file mode 100644 index 00000000000..3d61ae42787 --- /dev/null +++ b/python/pyarrow/_compute.pyi @@ -0,0 +1,1721 @@ +from typing import ( + Any, + Callable, + Iterable, + Literal, + Sequence, + TypeAlias, + TypedDict, + overload, +) + +from . import lib + +_Order: TypeAlias = Literal["ascending", "descending"] +_Placement: TypeAlias = Literal["at_start", "at_end"] + +class Kernel(lib._Weakrefable): + """ + A kernel object. + + Kernels handle the execution of a Function for a certain signature. + """ + +class Function(lib._Weakrefable): + """ + A compute function. + + A function implements a certain logical computation over a range of + possible input signatures. Each signature accepts a range of input + types and is implemented by a given Kernel. + + Functions can be of different kinds: + + * "scalar" functions apply an item-wise computation over all items + of their inputs. Each item in the output only depends on the values + of the inputs at the same position. Examples: addition, comparisons, + string predicates... + + * "vector" functions apply a collection-wise computation, such that + each item in the output may depend on the values of several items + in each input. Examples: dictionary encoding, sorting, extracting + unique values... + + * "scalar_aggregate" functions reduce the dimensionality of the inputs by + applying a reduction function. Examples: sum, min_max, mode... + + * "hash_aggregate" functions apply a reduction function to an input + subdivided by grouping criteria. They may not be directly called. + Examples: hash_sum, hash_min_max... + + * "meta" functions dispatch to other functions. + """ + @property + def arity(self) -> int: + """ + The function arity. + + If Ellipsis (i.e. `...`) is returned, the function takes a variable + number of arguments. + """ + @property + def kind( + self, + ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: + """ + The function kind. + """ + @property + def name(self) -> str: + """ + The function name. + """ + @property + def num_kernels(self) -> int: + """ + The number of kernels implementing this function. + """ + def call( + self, + args: Iterable, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, + ) -> Any: + """ + Call the function on the given arguments. + + Parameters + ---------- + args : iterable + The arguments to pass to the function. Accepted types depend + on the specific function. + options : FunctionOptions, optional + Options instance for executing this function. This should have + the right concrete options type. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + length : int, optional + Batch size for execution, for nullary (no argument) functions. If + not passed, will be inferred from passed data. + """ + +class FunctionOptions(lib._Weakrefable): + def serialize(self) -> lib.Buffer: ... + @classmethod + def deserialize(cls, buf: lib.Buffer) -> FunctionOptions: ... + +class FunctionRegistry(lib._Weakrefable): + def get_function(self, name: str) -> Function: + """ + Look up a function by name in the registry. + + Parameters + ---------- + name : str + The name of the function to lookup + """ + + def list_functions(self) -> list[str]: + """ + Return all function names in the registry. + """ + +class HashAggregateFunction(Function): ... +class HashAggregateKernel(Kernel): ... 
+class ScalarAggregateFunction(Function): ... +class ScalarAggregateKernel(Kernel): ... +class ScalarFunction(Function): ... +class ScalarKernel(Kernel): ... +class VectorFunction(Function): ... +class VectorKernel(Kernel): ... + +# ==================== _compute.pyx Option classes ==================== +class ArraySortOptions(FunctionOptions): + """ + Options for the `array_sort_indices` function. + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + null_placement : str, default "at_end" + Where nulls in the input should be sorted. + Accepted values are "at_start", "at_end". + """ + def __init__( + self, + order: _Order = "ascending", + null_placement: _Placement = "at_end", + ) -> None: ... + +class AssumeTimezoneOptions(FunctionOptions): + """ + Options for the `assume_timezone` function. + + Parameters + ---------- + timezone : str + Timezone to assume for the input. + ambiguous : str, default "raise" + How to handle timestamps that are ambiguous in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + nonexistent : str, default "raise" + How to handle timestamps that don't exist in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + """ + + def __init__( + self, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + ) -> None: ... + +class CastOptions(FunctionOptions): + """ + Options for the `cast` function. + + Parameters + ---------- + target_type : DataType, optional + The PyArrow type to cast to. + allow_int_overflow : bool, default False + Whether integer overflow is allowed when casting. + allow_time_truncate : bool, default False + Whether time precision truncation is allowed when casting. + allow_time_overflow : bool, default False + Whether date/time range overflow is allowed when casting. + allow_decimal_truncate : bool, default False + Whether decimal precision truncation is allowed when casting. + allow_float_truncate : bool, default False + Whether floating-point precision truncation is allowed when casting. + allow_invalid_utf8 : bool, default False + Whether producing invalid utf8 data is allowed when casting. + """ + + allow_int_overflow: bool + allow_time_truncate: bool + allow_time_overflow: bool + allow_decimal_truncate: bool + allow_float_truncate: bool + allow_invalid_utf8: bool + + def __init__( + self, + target_type: lib.DataType | None = None, + *, + allow_int_overflow: bool | None = None, + allow_time_truncate: bool | None = None, + allow_time_overflow: bool | None = None, + allow_decimal_truncate: bool | None = None, + allow_float_truncate: bool | None = None, + allow_invalid_utf8: bool | None = None, + ) -> None: ... + @staticmethod + def safe(target_type: lib.DataType | None = None) -> CastOptions: ... + @staticmethod + def unsafe(target_type: lib.DataType | None = None) -> CastOptions: ... + def is_safe(self) -> bool: ... + +class CountOptions(FunctionOptions): + """ + Options for the `count` function. + + Parameters + ---------- + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". + """ + def __init__(self, mode: Literal["only_valid", "only_null", "all"] = "only_valid") -> None: ... + +class CumulativeOptions(FunctionOptions): + """ + Options for `cumulative_*` functions. 
+ + - cumulative_sum + - cumulative_sum_checked + - cumulative_prod + - cumulative_prod_checked + - cumulative_max + - cumulative_min + + Parameters + ---------- + start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. + skip_nulls : bool, default False + When false, the first encountered null is propagated. + """ + def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... + +class CumulativeSumOptions(FunctionOptions): + """ + Options for `cumulative_sum` function. + + Parameters + ---------- + start : Scalar, default None + Starting value for sum computation + skip_nulls : bool, default False + When false, the first encountered null is propagated. + """ + def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... + +class DayOfWeekOptions(FunctionOptions): + """ + Options for the `day_of_week` function. + + Parameters + ---------- + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + """ + + def __init__(self, *, count_from_zero: bool = True, week_start: int = 1) -> None: ... + +class DictionaryEncodeOptions(FunctionOptions): + """ + Options for dictionary encoding. + + Parameters + ---------- + null_encoding : str, default "mask" + How to encode nulls in the input. + Accepted values are "mask" (null inputs emit a null in the indices + array), "encode" (null inputs emit a non-null index pointing to + a null value in the dictionary array). + """ + def __init__(self, null_encoding: Literal["mask", "encode"] = "mask") -> None: ... + +class RunEndEncodeOptions(FunctionOptions): + """ + Options for run-end encoding. + + Parameters + ---------- + run_end_type : DataType, default pyarrow.int32() + The data type of the run_ends array. + + Accepted values are pyarrow.{int16(), int32(), int64()}. + """ + # TODO: default is DataType(int32) + def __init__(self, run_end_type: lib.DataType = ...) -> None: ... + +class ElementWiseAggregateOptions(FunctionOptions): + """ + Options for element-wise aggregate functions. + + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + """ + def __init__(self, *, skip_nulls: bool = True) -> None: ... + +class ExtractRegexOptions(FunctionOptions): + """ + Options for the `extract_regex` function. + + Parameters + ---------- + pattern : str + Regular expression with named capture fields. + """ + def __init__(self, pattern: str) -> None: ... + +class ExtractRegexSpanOptions(FunctionOptions): + """ + Options for the `extract_regex_span` function. + + Parameters + ---------- + pattern : str + Regular expression with named capture fields. + """ + def __init__(self, pattern: str) -> None: ... + +class FilterOptions(FunctionOptions): + """ + Options for selecting with a boolean filter. + + Parameters + ---------- + null_selection_behavior : str, default "drop" + How to handle nulls in the selection filter. + Accepted values are "drop", "emit_null". + """ + + def __init__(self, null_selection_behavior: Literal["drop", "emit_null"] = "drop") -> None: ... + +class IndexOptions(FunctionOptions): + """ + Options for the `index` function. 
+ + Parameters + ---------- + value : Scalar + The value to search for. + """ + def __init__(self, value: lib.Scalar) -> None: ... + +class JoinOptions(FunctionOptions): + """ + Options for the `binary_join_element_wise` function. + + Parameters + ---------- + null_handling : str, default "emit_null" + How to handle null values in the inputs. + Accepted values are "emit_null", "skip", "replace". + null_replacement : str, default "" + Replacement string to emit for null inputs if `null_handling` + is "replace". + """ + @overload + def __init__(self, null_handling: Literal["emit_null", "skip"] = "emit_null") -> None: ... + @overload + def __init__(self, null_handling: Literal["replace"], null_replacement: str = "") -> None: ... + +class ListSliceOptions(FunctionOptions): + """ + Options for list array slicing. + + Parameters + ---------- + start : int + Index to start slicing inner list elements (inclusive). + stop : Optional[int], default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. (NotImplemented) + step : int, default 1 + Slice step. + return_fixed_size_list : Optional[bool], default None + Whether to return a FixedSizeListArray. If true _and_ stop is after + a list element's length, nulls will be appended to create the + requested slice size. The default of `None` will return the same + type which was passed in. + """ + def __init__( + self, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + ) -> None: ... + +class ListFlattenOptions(FunctionOptions): + """ + Options for `list_flatten` function + + Parameters + ---------- + recursive : bool, default False + When True, the list array is flattened recursively until an array + of non-list values is formed. + """ + def __init__(self, recursive: bool = False) -> None: ... + +class MakeStructOptions(FunctionOptions): + """ + Options for the `make_struct` function. + + Parameters + ---------- + field_names : sequence of str + Names of the struct fields to create. + field_nullability : sequence of bool, optional + Nullability information for each struct field. + If omitted, all fields are nullable. + field_metadata : sequence of KeyValueMetadata, optional + Metadata for each struct field. + """ + def __init__( + self, + field_names: Sequence[str] = (), + *, + field_nullability: Sequence[bool] | None = None, + field_metadata: Sequence[lib.KeyValueMetadata] | None = None, + ) -> None: ... + +class MapLookupOptions(FunctionOptions): + """ + Options for the `map_lookup` function. + + Parameters + ---------- + query_key : Scalar or Object can be converted to Scalar + The key to search for. + occurrence : str + The occurrence(s) to return from the Map + Accepted values are "first", "last", or "all". + """ + # TODO: query_key: Scalar or Object can be converted to Scalar + def __init__( + self, query_key: lib.Scalar, occurrence: Literal["first", "last", "all"] + ) -> None: ... + +class MatchSubstringOptions(FunctionOptions): + """ + Options for looking for a substring. + + Parameters + ---------- + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + """ + + def __init__(self, pattern: str, *, ignore_case: bool = False) -> None: ... + +class ModeOptions(FunctionOptions): + """ + Options for the `mode` function. + + Parameters + ---------- + n : int, default 1 + Number of distinct most-common values to return. 
+ skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__(self, n: int = 1, *, skip_nulls: bool = True, min_count: int = 0) -> None: ... + +class NullOptions(FunctionOptions): + """ + Options for the `is_null` function. + + Parameters + ---------- + nan_is_null : bool, default False + Whether floating-point NaN values are considered null. + """ + def __init__(self, *, nan_is_null: bool = False) -> None: ... + +class PadOptions(FunctionOptions): + """ + Options for padding strings. + + Parameters + ---------- + width : int + Desired string length. + padding : str, default " " + What to pad the string with. Should be one byte or codepoint. + lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). + """ + def __init__( + self, width: int, padding: str = " ", lean_left_on_odd_padding: bool = True + ) -> None: ... + +class PairwiseOptions(FunctionOptions): + """ + Options for `pairwise` functions. + + Parameters + ---------- + period : int, default 1 + Period for applying the period function. + """ + def __init__(self, period: int = 1) -> None: ... + +class PartitionNthOptions(FunctionOptions): + """ + Options for the `partition_nth_indices` function. + + Parameters + ---------- + pivot : int + Index into the equivalent sorted array of the pivot element. + null_placement : str, default "at_end" + Where nulls in the input should be partitioned. + Accepted values are "at_start", "at_end". + """ + def __init__(self, pivot: int, *, null_placement: _Placement = "at_end") -> None: ... + +class WinsorizeOptions(FunctionOptions): + """ + Options for the `winsorize` function. + + Parameters + ---------- + lower_limit : float, between 0 and 1 + The quantile below which all values are replaced with the quantile's value. + upper_limit : float, between 0 and 1 + The quantile above which all values are replaced with the quantile's value. + """ + def __init__(self, lower_limit: float, upper_limit: float) -> None: ... + +class QuantileOptions(FunctionOptions): + """ + Options for the `quantile` function. + + Parameters + ---------- + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to compute. All values must be in + [0, 1]. + interpolation : str, default "linear" + How to break ties between competing data points for a given quantile. + Accepted values are: + + - "linear": compute an interpolation + - "lower": always use the smallest of the two data points + - "higher": always use the largest of the two data points + - "nearest": select the data point that is closest to the quantile + - "midpoint": compute the (unweighted) mean of the two data points + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. 
+ """ + def __init__( + self, + q: float | Sequence[float], + *, + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + +class RandomOptions(FunctionOptions): + """ + Options for random generation. + + Parameters + ---------- + initializer : int or str + How to initialize the underlying random generator. + If an integer is given, it is used as a seed. + If "system" is given, the random generator is initialized with + a system-specific source of (hopefully true) randomness. + Other values are invalid. + """ + def __init__(self, *, initializer: int | Literal["system"] = "system") -> None: ... + +class RankOptions(FunctionOptions): + """ + Options for the `rank` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + tiebreaker : str, default "first" + Configure how ties between equal values are handled. + Accepted values are: + + - "min": Ties get the smallest possible rank in sorted order. + - "max": Ties get the largest possible rank in sorted order. + - "first": Ranks are assigned in order of when ties appear in the + input. This ensures the ranks are a stable permutation + of the input. + - "dense": The ranks span a dense [1, M] interval where M is the + number of distinct values in the input. + """ + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + ) -> None: ... + +class RankQuantileOptions(FunctionOptions): + """ + Options for the `rank_quantile` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + """ + + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + ) -> None: ... + +class PivotWiderOptions(FunctionOptions): + """ + Options for the `pivot_wider` function. + + Parameters + ---------- + key_names : sequence of str + The pivot key names expected in the pivot key column. + For each entry in `key_names`, a column with the same name is emitted + in the struct output. + unexpected_key_behavior : str, default "ignore" + The behavior when pivot keys not in `key_names` are encountered. + Accepted values are "ignore", "raise". + If "ignore", unexpected keys are silently ignored. + If "raise", unexpected keys raise a KeyError. 
+ """ + def __init__( + self, + key_names: Sequence[str], + *, + unexpected_key_behavior: Literal["ignore", "raise"] = "ignore", + ) -> None: ... + +class ReplaceSliceOptions(FunctionOptions): + """ + Options for replacing slices. + + Parameters + ---------- + start : int + Index to start slicing at (inclusive). + stop : int + Index to stop slicing at (exclusive). + replacement : str + What to replace the slice with. + """ + def __init__(self, start: int, stop: int, replacement: str) -> None: ... + +class ReplaceSubstringOptions(FunctionOptions): + """ + Options for replacing matched substrings. + + Parameters + ---------- + pattern : str + Substring pattern to look for inside input values. + replacement : str + What to replace the pattern with. + max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). + """ + def __init__( + self, pattern: str, replacement: str, *, max_replacements: int | None = None + ) -> None: ... + +_RoundMode: TypeAlias = Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", +] + +class RoundBinaryOptions(FunctionOptions): + """ + Options for rounding numbers when ndigits is provided by a second array + + Parameters + ---------- + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ + def __init__( + self, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + +class RoundOptions(FunctionOptions): + """ + Options for rounding numbers. + + Parameters + ---------- + ndigits : int, default 0 + Number of fractional digits to round to. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ + def __init__( + self, + ndigits: int = 0, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + +_DateTimeUint: TypeAlias = Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + +class RoundTemporalOptions(FunctionOptions): + """ + Options for rounding temporal values. + + Parameters + ---------- + multiple : int, default 1 + Number of units to round to. + unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. + calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. 
+ + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. + """ + def __init__( + self, + multiple: int = 1, + unit: _DateTimeUint = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + ) -> None: ... + +class RoundToMultipleOptions(FunctionOptions): + """ + Options for rounding numbers to a multiple. + + Parameters + ---------- + multiple : numeric scalar, default 1.0 + Multiple to round to. Should be a scalar of a type compatible + with the argument to be rounded. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ + def __init__(self, multiple: float = 1.0, round_mode: _RoundMode = "half_to_even") -> None: ... + +class ScalarAggregateOptions(FunctionOptions): + """ + Options for scalar aggregations. + + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__(self, *, skip_nulls: bool = True, min_count: int = 1) -> None: ... + +class SelectKOptions(FunctionOptions): + """ + Options for top/bottom k-selection. + + Parameters + ---------- + k : int + Number of leading values to select in sorted order + (i.e. the largest values if sort order is "descending", + the smallest otherwise). + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + """ + + def __init__(self, k: int, sort_keys: Sequence[tuple[str, _Order]]) -> None: ... + +class SetLookupOptions(FunctionOptions): + """ + Options for the `is_in` and `index_in` functions. + + Parameters + ---------- + value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. + """ + def __init__(self, value_set: lib.Array, *, skip_nulls: bool = True) -> None: ... + +class SliceOptions(FunctionOptions): + """ + Options for slicing. + + Parameters + ---------- + start : int + Index to start slicing at (inclusive). 
+ stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + """ + + def __init__(self, start: int, stop: int | None = None, step: int = 1) -> None: ... + +class SortOptions(FunctionOptions): + """ + Options for the `sort_indices` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + null_placement : str, default "at_end" + Where nulls in input should be sorted, only applying to + columns/fields mentioned in `sort_keys`. + Accepted values are "at_start", "at_end". + """ + def __init__( + self, sort_keys: Sequence[tuple[str, _Order]], *, null_placement: _Placement = "at_end" + ) -> None: ... + +class SplitOptions(FunctionOptions): + """ + Options for splitting on whitespace. + + Parameters + ---------- + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + """ + + def __init__(self, *, max_splits: int | None = None, reverse: bool = False) -> None: ... + +class SplitPatternOptions(FunctionOptions): + """ + Options for splitting on a string pattern. + + Parameters + ---------- + pattern : str + String pattern to split on. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + """ + def __init__( + self, pattern: str, *, max_splits: int | None = None, reverse: bool = False + ) -> None: ... + +class StrftimeOptions(FunctionOptions): + """ + Options for the `strftime` function. + + Parameters + ---------- + format : str, default "%Y-%m-%dT%H:%M:%S" + Pattern for formatting input values. + locale : str, default "C" + Locale to use for locale-specific format specifiers. + """ + def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S", locale: str = "C") -> None: ... + +class StrptimeOptions(FunctionOptions): + """ + Options for the `strptime` function. + + Parameters + ---------- + format : str + Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". + Note that the semantics of the format follow the C/C++ strptime, not the Python one. + There are differences in behavior, for example how the "%y" placeholder + handles years with less than four digits. + unit : str + Timestamp unit of the output. + Accepted values are "s", "ms", "us", "ns". + error_is_null : boolean, default False + Return null on parsing errors if true or raise if false. + """ + def __init__( + self, format: str, unit: Literal["s", "ms", "us", "ns"], error_is_null: bool = False + ) -> None: ... + +class StructFieldOptions(FunctionOptions): + """ + Options for the `struct_field` function. + + Parameters + ---------- + indices : List[str], List[bytes], List[int], Expression, bytes, str, or int + List of indices for chained field lookup, for example `[4, 1]` + will look up the second nested field in the fifth outer field. + """ + def __init__( + self, indices: list[str] | list[bytes] | list[int] | Expression | bytes | str | int + ) -> None: ... 
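As a usage sketch for the option classes above (editorial illustration, not part of the patch): each FunctionOptions subclass is a small typed constructor whose instance is handed to a compute kernel, either explicitly through pc.call_function or implicitly via the keyword arguments of the pyarrow.compute wrappers. A standard pyarrow build with the compute kernels available is assumed.

import pyarrow as pa
import pyarrow.compute as pc

# A struct array with two fields; StructFieldOptions selects nested fields by index.
arr = pa.array([{"x": 2, "y": "b"}, {"x": 1, "y": "a"}])
xs = pc.call_function("struct_field", [arr], pc.StructFieldOptions(indices=[0]))

# The sorting kernels consume ArraySortOptions/SortOptions the same way.
idx = pc.call_function("array_sort_indices", [xs], pc.ArraySortOptions(order="descending"))
print(xs.take(idx))  # values of field "x" in descending order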
+ +class TakeOptions(FunctionOptions): + """ + Options for the `take` and `array_take` functions. + + Parameters + ---------- + boundscheck : boolean, default True + Whether to check indices are within bounds. If False and an + index is out of bounds, behavior is undefined (the process + may crash). + """ + def __init__(self, boundscheck: bool = True) -> None: ... + +class TDigestOptions(FunctionOptions): + """ + Options for the `tdigest` function. + + Parameters + ---------- + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to approximate. All values must be + in [0, 1]. + delta : int, default 100 + Compression parameter for the T-digest algorithm. + buffer_size : int, default 500 + Buffer size for the T-digest algorithm. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__( + self, + q: float | Sequence[float] = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + +class TrimOptions(FunctionOptions): + """ + Options for trimming characters from strings. + + Parameters + ---------- + characters : str + Individual characters to be trimmed from the string. + """ + def __init__(self, characters: str) -> None: ... + +class Utf8NormalizeOptions(FunctionOptions): + """ + Options for the `utf8_normalize` function. + + Parameters + ---------- + form : str + Unicode normalization form. + Accepted values are "NFC", "NFKC", "NFD", NFKD". + """ + + def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ... + +class VarianceOptions(FunctionOptions): + """ + Options for the `variance` and `stddev` functions. + + Parameters + ---------- + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, min_count: int = 0) -> None: ... + +class SkewOptions(FunctionOptions): + """ + Options for the `skew` and `kurtosis` functions. + + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + biased : bool, default True + Whether the calculated value is biased. + If False, the value computed includes a correction factor to reduce bias. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__( + self, *, skip_nulls: bool = True, biased: bool = True, min_count: int = 0 + ) -> None: ... + +class WeekOptions(FunctionOptions): + """ + Options for the `week` function. + + Parameters + ---------- + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + count_from_zero : bool, default False + If True, dates at the start of a year that fall into the last week + of the previous year emit 0. + If False, they emit 52 or 53 (the week number of the last week + of the previous year). 
+ first_week_is_fully_in_year : bool, default False + If True, week number 0 is fully in January. + If False, a week that begins on December 29, 30 or 31 is considered + to be week number 0 of the following year. + """ + def __init__( + self, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + ) -> None: ... + +# ==================== _compute.pyx Functions ==================== + +def call_function( + name: str, + args: list, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, +) -> Any: + """ + Call a named function. + + The function is looked up in the global registry + (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to call. + args : list + The arguments to the function. + options : optional + options provided to the function. + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + length : int, optional + Batch size for execution, for nullary (no argument) functions. If not + passed, inferred from data. + """ + +def function_registry() -> FunctionRegistry: ... +def get_function(name: str) -> Function: + """ + Get a function by name. + + The function is looked up in the global registry + (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to lookup + """ + +def list_functions() -> list[str]: + """ + Return all function names in the global registry. + """ + +# ==================== _compute.pyx Udf ==================== + +def call_tabular_function( + function_name: str, args: Iterable | None = None, func_registry: FunctionRegistry | None = None +) -> lib.RecordBatchReader: + """ + Get a record batch iterator from a tabular function. + + Parameters + ---------- + function_name : str + Name of the function. + args : iterable + The arguments to pass to the function. Accepted types depend + on the specific function. Currently, only an empty args is supported. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + """ + +class _FunctionDoc(TypedDict): + summary: str + description: str + +def register_scalar_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: + """ + Register a user-defined scalar function. + + This API is EXPERIMENTAL. + + A scalar function is a function that executes elementwise + operations on arrays or scalars, i.e. a scalar function must + be computed row-by-row with no state where each output row + is computed only from its corresponding input row. + In other words, all argument arrays have the same length, + and the output array is of the same length as the arguments. + Scalar functions are the only functions allowed in query engine + expressions. + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return an Array or Scalar + matching the out_type. It must return a Scalar if + all arguments are scalar, else it must return an Array. + + To define a varargs function, pass a callable that takes + *args. The last in_type will be the type of all varargs + arguments. 
+    function_name : str
+        Name of the function. There should only be one function
+        registered with this name in the function registry.
+    function_doc : dict
+        A dictionary object with keys "summary" (str),
+        and "description" (str).
+    in_types : Dict[str, DataType]
+        A dictionary mapping function argument names to
+        their respective DataType.
+        The argument names will be used to generate
+        documentation for the function. The number of
+        arguments specified here determines the function
+        arity.
+    out_type : DataType
+        Output type of the function.
+    func_registry : FunctionRegistry
+        Optional function registry to use instead of the default global one.
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>>
+    >>> func_doc = {}
+    >>> func_doc["summary"] = "simple udf"
+    >>> func_doc["description"] = "add a constant to a scalar"
+    >>>
+    >>> def add_constant(ctx, array):
+    ...     return pc.add(array, 1, memory_pool=ctx.memory_pool)
+    >>>
+    >>> func_name = "py_add_func"
+    >>> in_types = {"array": pa.int64()}
+    >>> out_type = pa.int64()
+    >>> pc.register_scalar_function(add_constant, func_name, func_doc, in_types, out_type)
+    >>>
+    >>> func = pc.get_function(func_name)
+    >>> func.name
+    'py_add_func'
+    >>> answer = pc.call_function(func_name, [pa.array([20])])
+    >>> answer
+    <pyarrow.lib.Int64Array object at ...>
+    [
+    21
+    ]
+    """
+
+def register_tabular_function(
+    func: Callable,
+    function_name: str,
+    function_doc: _FunctionDoc,
+    in_types: dict[str, lib.DataType],
+    out_type: lib.DataType,
+    func_registry: FunctionRegistry | None = None,
+) -> None:
+    """
+    Register a user-defined tabular function.
+
+    This API is EXPERIMENTAL.
+
+    A tabular function is one accepting a context argument of type
+    UdfContext and returning a generator of struct arrays.
+    The in_types argument must be empty and the out_type argument
+    specifies a schema. Each struct array must have field types
+    corresponding to the schema.
+
+    Parameters
+    ----------
+    func : callable
+        A callable implementing the user-defined function.
+        The only argument is the context argument of type
+        UdfContext. It must return a callable that
+        returns on each invocation a StructArray matching
+        the out_type, where an empty array indicates end.
+    function_name : str
+        Name of the function. There should only be one function
+        registered with this name in the function registry.
+    function_doc : dict
+        A dictionary object with keys "summary" (str),
+        and "description" (str).
+    in_types : Dict[str, DataType]
+        Must be an empty dictionary (reserved for future use).
+    out_type : Union[Schema, DataType]
+        Schema of the function's output, or a corresponding flat struct type.
+    func_registry : FunctionRegistry
+        Optional function registry to use instead of the default global one.
+    """
+
+def register_aggregate_function(
+    func: Callable,
+    function_name: str,
+    function_doc: _FunctionDoc,
+    in_types: dict[str, lib.DataType],
+    out_type: lib.DataType,
+    func_registry: FunctionRegistry | None = None,
+) -> None:
+    """
+    Register a user-defined non-decomposable aggregate function.
+
+    This API is EXPERIMENTAL.
+
+    A non-decomposable aggregation function is a function that executes
+    aggregate operations on the whole data that it is aggregating.
+    In other words, a non-decomposable aggregate function cannot be
+    split into consume/merge/finalize steps.
+
+    This is often used with ordered or segmented aggregation where groups
+    can be emitted before accumulating all of the input data.
+
+    Note that currently the size of any input column cannot exceed 2 GB
+    for a single segment (all groups combined).
+
+    Parameters
+    ----------
+    func : callable
+        A callable implementing the user-defined function.
+        The first argument is the context argument of type
+        UdfContext.
+        Then, it must take arguments equal to the number of
+        in_types defined. It must return a Scalar matching the
+        out_type.
+        To define a varargs function, pass a callable that takes
+        *args. The in_types need to match the types of the inputs when
+        the function gets called.
+    function_name : str
+        Name of the function. This name must be unique, i.e.,
+        there should only be one function registered with
+        this name in the function registry.
+    function_doc : dict
+        A dictionary object with keys "summary" (str),
+        and "description" (str).
+    in_types : Dict[str, DataType]
+        A dictionary mapping function argument names to
+        their respective DataType.
+        The argument names will be used to generate
+        documentation for the function. The number of
+        arguments specified here determines the function
+        arity.
+    out_type : DataType
+        Output type of the function.
+    func_registry : FunctionRegistry
+        Optional function registry to use instead of the default global one.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>>
+    >>> func_doc = {}
+    >>> func_doc["summary"] = "simple median udf"
+    >>> func_doc["description"] = "compute median"
+    >>>
+    >>> def compute_median(ctx, array):
+    ...     return pa.scalar(np.median(array))
+    >>>
+    >>> func_name = "py_compute_median"
+    >>> in_types = {"array": pa.int64()}
+    >>> out_type = pa.float64()
+    >>> pc.register_aggregate_function(compute_median, func_name, func_doc, in_types, out_type)
+    >>>
+    >>> func = pc.get_function(func_name)
+    >>> func.name
+    'py_compute_median'
+    >>> answer = pc.call_function(func_name, [pa.array([20, 40])])
+    >>> answer
+    <pyarrow.DoubleScalar: 30.0>
+    >>> table = pa.table([pa.array([1, 1, 2, 2]), pa.array([10, 20, 30, 40])], names=["k", "v"])
+    >>> result = table.group_by("k").aggregate([("v", "py_compute_median")])
+    >>> result
+    pyarrow.Table
+    k: int64
+    v_py_compute_median: double
+    ----
+    k: [[1,2]]
+    v_py_compute_median: [[15,35]]
+    """
+
+def register_vector_function(
+    func: Callable,
+    function_name: str,
+    function_doc: _FunctionDoc,
+    in_types: dict[str, lib.DataType],
+    out_type: lib.DataType,
+    func_registry: FunctionRegistry | None = None,
+) -> None:
+    """
+    Register a user-defined vector function.
+
+    This API is EXPERIMENTAL.
+
+    A vector function is a function that executes vector
+    operations on arrays. A vector function is often used
+    when the computation doesn't fit other, more specific types of
+    functions (e.g., scalar and aggregate).
+
+    Parameters
+    ----------
+    func : callable
+        A callable implementing the user-defined function.
+        The first argument is the context argument of type
+        UdfContext.
+        Then, it must take arguments equal to the number of
+        in_types defined. It must return an Array or Scalar
+        matching the out_type. It must return a Scalar if
+        all arguments are scalar, else it must return an Array.
+
+        To define a varargs function, pass a callable that takes
+        *args. The last in_type will be the type of all varargs
+        arguments.
+    function_name : str
+        Name of the function. There should only be one function
+        registered with this name in the function registry.
+    function_doc : dict
+        A dictionary object with keys "summary" (str),
+        and "description" (str).
+    in_types : Dict[str, DataType]
+        A dictionary mapping function argument names to
+        their respective DataType.
+        The argument names will be used to generate
+        documentation for the function. The number of
+        arguments specified here determines the function
+        arity.
+    out_type : DataType
+        Output type of the function.
+    func_registry : FunctionRegistry
+        Optional function registry to use instead of the default global one.
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>>
+    >>> func_doc = {}
+    >>> func_doc["summary"] = "percent rank"
+    >>> func_doc["description"] = "compute percent rank"
+    >>>
+    >>> def list_flatten_udf(ctx, x):
+    ...     return pc.list_flatten(x)
+    >>>
+    >>> func_name = "list_flatten_udf"
+    >>> in_types = {"array": pa.list_(pa.int64())}
+    >>> out_type = pa.int64()
+    >>> pc.register_vector_function(list_flatten_udf, func_name, func_doc, in_types, out_type)
+    >>>
+    >>> answer = pc.call_function(func_name, [pa.array([[1, 2], [3, 4]])])
+    >>> answer
+    <pyarrow.lib.Int64Array object at ...>
+    [
+    1,
+    2,
+    3,
+    4
+    ]
+    """
+
+class UdfContext:
+    """
+    Per-invocation function context/state.
+
+    This object will always be the first argument to a user-defined
+    function. It should not be used outside of a call to the function.
+    """
+
+    @property
+    def batch_length(self) -> int:
+        """
+        The common length of all input arguments (int).
+
+        In the case that all arguments are scalars, this value
+        is used to pass the "actual length" of the arguments,
+        e.g. because the scalar values are encoding a column
+        with a constant value.
+        """
+    @property
+    def memory_pool(self) -> lib.MemoryPool:
+        """
+        A memory pool for allocations (:class:`MemoryPool`).
+
+        This is the memory pool supplied by the user when they invoked
+        the function and it should be used in any calls to arrow that the
+        UDF makes if that call accepts a memory_pool.
+        """
+
+# ==================== _compute.pyx Expression ====================
+class Expression(lib._Weakrefable):
+    """
+    A logical expression to be evaluated against some input.
+
+    To create an expression:
+
+    - Use the factory function ``pyarrow.compute.scalar()`` to create a
+      scalar (not necessary when combined, see example below).
+    - Use the factory function ``pyarrow.compute.field()`` to reference
+      a field (column in table).
+    - Compare fields and scalars with ``<``, ``<=``, ``==``, ``>=``, ``>``.
+    - Combine expressions using python operators ``&`` (logical and),
+      ``|`` (logical or) and ``~`` (logical not).
+      Note: python keywords ``and``, ``or`` and ``not`` cannot be used
+      to combine expressions.
+    - Create expression predicates using Expression methods such as
+      ``pyarrow.compute.Expression.isin()``.
+
+    Examples
+    --------
+
+    >>> import pyarrow.compute as pc
+    >>> (pc.field("a") < pc.scalar(3)) | (pc.field("b") > 7)
+    <pyarrow.compute.Expression ((a < 3) or (b > 7))>
+    >>> pc.field("a") != 3
+    <pyarrow.compute.Expression (a != 3)>
+    >>> pc.field("a").isin([1, 2, 3])
+    <pyarrow.compute.Expression is_in(a, {value_set=int64:[
+      1,
+      2,
+      3
+    ], null_matching_behavior=MATCH})>
+    """
+
+    @staticmethod
+    def from_substrait(buffer: bytes | lib.Buffer) -> Expression:
+        """
+        Deserialize an expression from Substrait
+
+        The serialized message must be an ExtendedExpression message that has
+        only a single expression. The name of the expression and the schema
+        the expression was bound to will be ignored. Use
+        pyarrow.substrait.deserialize_expressions if this information is needed
+        or if the message might contain multiple expressions.
+ + Parameters + ---------- + message : bytes or Buffer or a protobuf Message + The Substrait message to deserialize + + Returns + ------- + Expression + The deserialized expression + """ + def to_substrait(self, schema: lib.Schema, allow_arrow_extensions: bool = False) -> lib.Buffer: + """ + Serialize the expression using Substrait + + The expression will be serialized as an ExtendedExpression message that has a + single expression named "expression" + + Parameters + ---------- + schema : Schema + The input schema the expression will be bound to + allow_arrow_extensions : bool, default False + If False then only functions that are part of the core Substrait function + definitions will be allowed. Set this to True to allow pyarrow-specific functions + but the result may not be accepted by other compute libraries. + + Returns + ------- + Buffer + A buffer containing the serialized Protobuf plan. + """ + def __invert__(self) -> Expression: ... + def __and__(self, other) -> Expression: ... + def __or__(self, other) -> Expression: ... + def __add__(self, other) -> Expression: ... + def __mul__(self, other) -> Expression: ... + def __sub__(self, other) -> Expression: ... + def __eq__(self, value: object) -> Expression: ... # type: ignore[override] + def __ne__(self, value: object) -> Expression: ... # type: ignore[override] + def __gt__(self, value: object) -> Expression: ... # type: ignore[override] + def __lt__(self, value: object) -> Expression: ... # type: ignore[override] + def __ge__(self, value: object) -> Expression: ... # type: ignore[override] + def __le__(self, value: object) -> Expression: ... # type: ignore[override] + def __truediv__(self, other) -> Expression: ... + def is_valid(self) -> bool: + """ + Check whether the expression is not-null (valid). + + This creates a new expression equivalent to calling the + `is_valid` compute function on this expression. + + Returns + ------- + is_valid : Expression + """ + def is_null(self, nan_is_null: bool = False) -> Expression: + """ + Check whether the expression is null. + + This creates a new expression equivalent to calling the + `is_null` compute function on this expression. + + Parameters + ---------- + nan_is_null : boolean, default False + Whether floating-point NaNs are considered null. + + Returns + ------- + is_null : Expression + """ + def is_nan(self) -> Expression: + """ + Check whether the expression is NaN. + + This creates a new expression equivalent to calling the + `is_nan` compute function on this expression. + + Returns + ------- + is_nan : Expression + """ + def cast( + self, type: lib.DataType, safe: bool = True, options: CastOptions | None = None + ) -> Expression: + """ + Explicitly set or change the expression's data type. + + This creates a new expression equivalent to calling the + `cast` compute function on this expression. + + Parameters + ---------- + type : DataType, default None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + cast : Expression + """ + def isin(self, values: lib.Array | Iterable) -> Expression: + """ + Check whether the expression is contained in values. + + This creates a new expression equivalent to calling the + `is_in` compute function on this expression. + + Parameters + ---------- + values : Array or iterable + The values to check for. 
+ + Returns + ------- + isin : Expression + A new expression that, when evaluated, checks whether + this expression's value is contained in `values`. + """ + +# ==================== _compute.py ==================== diff --git a/python/pyarrow/_csv.pyi b/python/pyarrow/_csv.pyi new file mode 100644 index 00000000000..2f49f8c9a6c --- /dev/null +++ b/python/pyarrow/_csv.pyi @@ -0,0 +1,641 @@ +from dataclasses import dataclass, field +from typing import IO, Any, Callable, Literal + +from _typeshed import StrPath + +from . import lib + +@dataclass(kw_only=True) +class ReadOptions(lib._Weakrefable): + """ + Options for reading CSV files. + + Parameters + ---------- + use_threads : bool, optional (default True) + Whether to use multiple threads to accelerate reading + block_size : int, optional + How much bytes to process at a time from the input stream. + This will determine multi-threading granularity as well as + the size of individual record batches or table chunks. + Minimum valid value for block size is 1 + skip_rows : int, optional (default 0) + The number of rows to skip before the column names (if any) + and the CSV data. + skip_rows_after_names : int, optional (default 0) + The number of rows to skip after the column names. + This number can be larger than the number of rows in one + block, and empty rows are counted. + The order of application is as follows: + - `skip_rows` is applied (if non-zero); + - column names are read (unless `column_names` is set); + - `skip_rows_after_names` is applied (if non-zero). + column_names : list, optional + The column names of the target table. If empty, fall back on + `autogenerate_column_names`. + autogenerate_column_names : bool, optional (default False) + Whether to autogenerate column names if `column_names` is empty. + If true, column names will be of the form "f0", "f1"... + If false, column names will be read from the first CSV row + after `skip_rows`. + encoding : str, optional (default 'utf8') + The character encoding of the CSV data. Columns that cannot + decode using this encoding can still be read as Binary. 
+ + Examples + -------- + + Defining an example data: + + >>> import io + >>> s = "1,2,3\\nFlamingo,2,2022-03-01\\nHorse,4,2022-03-02\\nBrittle stars,5,2022-03-03\\nCentipede,100,2022-03-04" + >>> print(s) + 1,2,3 + Flamingo,2,2022-03-01 + Horse,4,2022-03-02 + Brittle stars,5,2022-03-03 + Centipede,100,2022-03-04 + + Ignore the first numbered row and substitute it with defined + or autogenerated column names: + + >>> from pyarrow import csv + >>> read_options = csv.ReadOptions(column_names=["animals", "n_legs", "entry"], skip_rows=1) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + + >>> read_options = csv.ReadOptions(autogenerate_column_names=True, skip_rows=1) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + f0: string + f1: int64 + f2: date32[day] + ---- + f0: [["Flamingo","Horse","Brittle stars","Centipede"]] + f1: [[2,4,5,100]] + f2: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + + Remove the first 2 rows of the data: + + >>> read_options = csv.ReadOptions(skip_rows_after_names=2) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + 1: string + 2: int64 + 3: date32[day] + ---- + 1: [["Brittle stars","Centipede"]] + 2: [[5,100]] + 3: [[2022-03-03,2022-03-04]] + """ + + use_threads: bool = field(default=True, kw_only=False) + block_size: int | None = None + skip_rows: int = 0 + skip_rows_after_names: int = 0 + column_names: list[str] | None = None + autogenerate_column_names: bool = False + encoding: str = "utf8" + + def validate(self) -> None: ... + +@dataclass(kw_only=True) +class ParseOptions(lib._Weakrefable): + """ + Options for parsing CSV files. + + Parameters + ---------- + delimiter : 1-character string, optional (default ',') + The character delimiting individual cells in the CSV data. + quote_char : 1-character string or False, optional (default '"') + The character used optionally for quoting CSV values + (False if quoting is not allowed). + double_quote : bool, optional (default True) + Whether two quotes in a quoted CSV value denote a single quote + in the data. + escape_char : 1-character string or False, optional (default False) + The character used optionally for escaping special characters + (False if escaping is not allowed). + newlines_in_values : bool, optional (default False) + Whether newline characters are allowed in CSV values. + Setting this to True reduces the performance of multi-threaded + CSV reading. + ignore_empty_lines : bool, optional (default True) + Whether empty lines are ignored in CSV input. + If False, an empty line is interpreted as containing a single empty + value (assuming a one-column CSV file). + invalid_row_handler : callable, optional (default None) + If not None, this object is called for each CSV row that fails + parsing (because of a mismatching number of columns). + It should accept a single InvalidRow argument and return either + "skip" or "error" depending on the desired outcome. + + Examples + -------- + + Defining an example file from bytes object: + + >>> import io + >>> s = ( + ... "animals;n_legs;entry\\n" + ... "Flamingo;2;2022-03-01\\n" + ... "# Comment here:\\n" + ... "Horse;4;2022-03-02\\n" + ... "Brittle stars;5;2022-03-03\\n" + ... "Centipede;100;2022-03-04" + ... 
) + >>> print(s) + animals;n_legs;entry + Flamingo;2;2022-03-01 + # Comment here: + Horse;4;2022-03-02 + Brittle stars;5;2022-03-03 + Centipede;100;2022-03-04 + >>> source = io.BytesIO(s.encode()) + + Read the data from a file skipping rows with comments + and defining the delimiter: + + >>> from pyarrow import csv + >>> def skip_comment(row): + ... if row.text.startswith("# "): + ... return "skip" + ... else: + ... return "error" + >>> parse_options = csv.ParseOptions(delimiter=";", invalid_row_handler=skip_comment) + >>> csv.read_csv(source, parse_options=parse_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + """ + + delimiter: str = field(default=",", kw_only=False) + quote_char: str | Literal[False] = '"' + double_quote: bool = True + escape_char: str | Literal[False] = False + newlines_in_values: bool = False + ignore_empty_lines: bool = True + invalid_row_handler: Callable[[InvalidRow], Literal["skip", "error"]] | None = None + + def validate(self) -> None: ... + +@dataclass(kw_only=True) +class ConvertOptions(lib._Weakrefable): + """ + Options for converting CSV data. + + Parameters + ---------- + check_utf8 : bool, optional (default True) + Whether to check UTF8 validity of string columns. + column_types : pyarrow.Schema or dict, optional + Explicitly map column names to column types. Passing this argument + disables type inference on the defined columns. + null_values : list, optional + A sequence of strings that denote nulls in the data + (defaults are appropriate in most cases). Note that by default, + string columns are not checked for null values. To enable + null checking for those, specify ``strings_can_be_null=True``. + true_values : list, optional + A sequence of strings that denote true booleans in the data + (defaults are appropriate in most cases). + false_values : list, optional + A sequence of strings that denote false booleans in the data + (defaults are appropriate in most cases). + decimal_point : 1-character string, optional (default '.') + The character used as decimal point in floating-point and decimal + data. + strings_can_be_null : bool, optional (default False) + Whether string / binary columns can have null values. + If true, then strings in null_values are considered null for + string columns. + If false, then all strings are valid string values. + quoted_strings_can_be_null : bool, optional (default True) + Whether quoted values can be null. + If true, then strings in "null_values" are also considered null + when they appear quoted in the CSV file. Otherwise, quoted values + are never considered null. + include_columns : list, optional + The names of columns to include in the Table. + If empty, the Table will include all columns from the CSV file. + If not empty, only these columns will be included, in this order. + include_missing_columns : bool, optional (default False) + If false, columns in `include_columns` but not in the CSV file will + error out. + If true, columns in `include_columns` but not in the CSV file will + produce a column of nulls (whose type is selected using + `column_types`, or null by default). + This option is ignored if `include_columns` is empty. + auto_dict_encode : bool, optional (default False) + Whether to try to automatically dict-encode string / binary data. 
+ If true, then when type inference detects a string or binary column, + it it dict-encoded up to `auto_dict_max_cardinality` distinct values + (per chunk), after which it switches to regular encoding. + This setting is ignored for non-inferred columns (those in + `column_types`). + auto_dict_max_cardinality : int, optional + The maximum dictionary cardinality for `auto_dict_encode`. + This value is per chunk. + timestamp_parsers : list, optional + A sequence of strptime()-compatible format strings, tried in order + when attempting to infer or convert timestamp values (the special + value ISO8601() can also be given). By default, a fast built-in + ISO-8601 parser is used. + + Examples + -------- + + Defining an example data: + + >>> import io + >>> s = ( + ... "animals,n_legs,entry,fast\\n" + ... "Flamingo,2,01/03/2022,Yes\\n" + ... "Horse,4,02/03/2022,Yes\\n" + ... "Brittle stars,5,03/03/2022,No\\n" + ... "Centipede,100,04/03/2022,No\\n" + ... ",6,05/03/2022," + ... ) + >>> print(s) + animals,n_legs,entry,fast + Flamingo,2,01/03/2022,Yes + Horse,4,02/03/2022,Yes + Brittle stars,5,03/03/2022,No + Centipede,100,04/03/2022,No + ,6,05/03/2022, + + Change the type of a column: + + >>> import pyarrow as pa + >>> from pyarrow import csv + >>> convert_options = csv.ConvertOptions(column_types={"n_legs": pa.float64()}) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: double + entry: string + fast: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + entry: [["01/03/2022","02/03/2022","03/03/2022","04/03/2022","05/03/2022"]] + fast: [["Yes","Yes","No","No",""]] + + Define a date parsing format to get a timestamp type column + (in case dates are not in ISO format and not converted by default): + + >>> convert_options = csv.ConvertOptions(timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"]) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: timestamp[s] + fast: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] + fast: [["Yes","Yes","No","No",""]] + + Specify a subset of columns to be read: + + >>> convert_options = csv.ConvertOptions(include_columns=["animals", "n_legs"]) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + + List additional column to be included as a null typed column: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals", "n_legs", "location"], include_missing_columns=True + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + location: null + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + location: [5 nulls] + + Define columns as dictionary type (by default only the + string/binary columns are dictionary encoded): + + >>> convert_options = csv.ConvertOptions( + ... timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"], auto_dict_encode=True + ... 
) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: dictionary + n_legs: int64 + entry: timestamp[s] + fast: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Horse","Brittle stars","Centipede",""] -- indices: + [0,1,2,3,4]] + n_legs: [[2,4,5,100,6]] + entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] + fast: [ -- dictionary: + ["Yes","No",""] -- indices: + [0,0,1,1,2]] + + Set upper limit for the number of categories. If the categories + is more than the limit, the conversion to dictionary will not + happen: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals"], auto_dict_encode=True, auto_dict_max_cardinality=2 + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + + Set empty strings to missing values: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals", "n_legs"], strings_can_be_null=True + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",null]] + n_legs: [[2,4,5,100,6]] + + Define values to be True and False when converting a column + into a bool type: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["fast"], false_values=["No"], true_values=["Yes"] + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + fast: bool + ---- + fast: [[true,true,false,false,null]] + """ + + check_utf8: bool = field(default=True, kw_only=False) + column_types: lib.Schema | dict | None = None + null_values: list[str] | None = None + true_values: list[str] | None = None + false_values: list[str] | None = None + decimal_point: str = "." + strings_can_be_null: bool = False + quoted_strings_can_be_null: bool = True + include_columns: list[str] | None = None + include_missing_columns: bool = False + auto_dict_encode: bool = False + auto_dict_max_cardinality: int | None = None + timestamp_parsers: list[str] | None = None + + def validate(self) -> None: ... + +@dataclass(kw_only=True) +class WriteOptions(lib._Weakrefable): + """ + Options for writing CSV files. + + Parameters + ---------- + include_header : bool, optional (default True) + Whether to write an initial header line with column names + batch_size : int, optional (default 1024) + How many rows to process together when converting and writing + CSV data + delimiter : 1-character string, optional (default ",") + The character delimiting individual cells in the CSV data. + quoting_style : str, optional (default "needed") + Whether to quote values, and if so, which quoting style to use. + The following values are accepted: + + - "needed" (default): only enclose values in quotes when needed. + - "all_valid": enclose all valid values in quotes; nulls are not quoted. + - "none": do not enclose any values in quotes; values containing + special characters (such as quotes, cell delimiters or line endings) + will raise an error. + """ + + include_header: bool = field(default=True, kw_only=False) + batch_size: int = 1024 + delimiter: str = "," + quoting_style: Literal["needed", "all_valid", "none"] = "needed" + + def validate(self) -> None: ... 
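Taken together, the four option classes cover a full round trip. A short sketch (not part of the stubs; the table contents are made up for illustration) writing with WriteOptions and reading back through the streaming reader:

```python
# Sketch: CSV round trip with the option classes documented above.
import io

import pyarrow as pa
from pyarrow import csv

table = pa.table({"animals": ["Flamingo", "Horse"], "n_legs": [2, 4]})

# Write with a non-default delimiter.
sink = io.BytesIO()
csv.write_csv(table, sink, write_options=csv.WriteOptions(delimiter=";"))

# Read it back incrementally; ParseOptions must agree on the delimiter and
# ConvertOptions may override the inferred column types.
reader = csv.open_csv(
    io.BytesIO(sink.getvalue()),
    parse_options=csv.ParseOptions(delimiter=";"),
    convert_options=csv.ConvertOptions(column_types={"n_legs": pa.int32()}),
)
print(reader.read_all())  # pyarrow.Table: animals string, n_legs int32
```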
+ +@dataclass +class InvalidRow(lib._Weakrefable): + """ + Description of an invalid row in a CSV file. + + Parameters + ---------- + expected_columns : int + The expected number of columns in the row. + actual_columns : int + The actual number of columns in the row. + number : int or None + The physical row number if known, otherwise None. + text : str + The contents of the row. + """ + + expected_columns: int + actual_columns: int + number: int | None + text: str + +class CSVWriter(lib._CRecordBatchWriter): + """ + Writer to create a CSV file. + + Parameters + ---------- + sink : str, path, pyarrow.OutputStream or file-like object + The location where to write the CSV data. + schema : pyarrow.Schema + The schema of the data to be written. + write_options : pyarrow.csv.WriteOptions + Options to configure writing the CSV data. + memory_pool : MemoryPool, optional + Pool for temporary allocations. + """ + + def __init__( + self, + # TODO: OutputStream + sink: StrPath | IO[Any], + schema: lib.Schema, + write_options: WriteOptions | None = None, + *, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + +class CSVStreamingReader(lib.RecordBatchReader): ... + +ISO8601: lib._Weakrefable + +def open_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> CSVStreamingReader: + """ + Open a streaming reader of CSV data. + + Reading using this function is always single-threaded. + + Parameters + ---------- + input_file : string, path or file-like object + The location of CSV data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. + read_options : pyarrow.csv.ReadOptions, optional + Options for the CSV reader (see pyarrow.csv.ReadOptions constructor + for defaults) + parse_options : pyarrow.csv.ParseOptions, optional + Options for the CSV parser + (see pyarrow.csv.ParseOptions constructor for defaults) + convert_options : pyarrow.csv.ConvertOptions, optional + Options for converting CSV data + (see pyarrow.csv.ConvertOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate RecordBatch memory from + + Returns + ------- + :class:`pyarrow.csv.CSVStreamingReader` + """ + +def read_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Table: + """ + Read a Table from a stream of CSV data. + + Parameters + ---------- + input_file : string, path or file-like object + The location of CSV data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. 
+ read_options : pyarrow.csv.ReadOptions, optional + Options for the CSV reader (see pyarrow.csv.ReadOptions constructor + for defaults) + parse_options : pyarrow.csv.ParseOptions, optional + Options for the CSV parser + (see pyarrow.csv.ParseOptions constructor for defaults) + convert_options : pyarrow.csv.ConvertOptions, optional + Options for converting CSV data + (see pyarrow.csv.ConvertOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate Table memory from + + Returns + ------- + :class:`pyarrow.Table` + Contents of the CSV file as a in-memory table. + + Examples + -------- + + Defining an example file from bytes object: + + >>> import io + >>> s = ( + ... "animals,n_legs,entry\\n" + ... "Flamingo,2,2022-03-01\\n" + ... "Horse,4,2022-03-02\\n" + ... "Brittle stars,5,2022-03-03\\n" + ... "Centipede,100,2022-03-04" + ... ) + >>> print(s) + animals,n_legs,entry + Flamingo,2,2022-03-01 + Horse,4,2022-03-02 + Brittle stars,5,2022-03-03 + Centipede,100,2022-03-04 + >>> source = io.BytesIO(s.encode()) + + Reading from the file + + >>> from pyarrow import csv + >>> csv.read_csv(source) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + """ + +def write_csv( + data: lib.RecordBatch | lib.Table, + output_file: StrPath | lib.NativeFile | IO[Any], + write_options: WriteOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> None: + """ + Write record batch or table to a CSV file. + + Parameters + ---------- + data : pyarrow.RecordBatch or pyarrow.Table + The data to write. + output_file : string, path, pyarrow.NativeFile, or file-like object + The location where to write the CSV data. + write_options : pyarrow.csv.WriteOptions + Options to configure writing the CSV data. + memory_pool : MemoryPool, optional + Pool for temporary allocations. + + Examples + -------- + + >>> import pyarrow as pa + >>> from pyarrow import csv + + >>> legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> entry_date = pa.array(["01/03/2022", "02/03/2022", "03/03/2022", "04/03/2022"]) + >>> table = pa.table([animals, legs, entry_date], names=["animals", "n_legs", "entry"]) + + >>> csv.write_csv(table, "animals.csv") + + >>> write_options = csv.WriteOptions(include_header=False) + >>> csv.write_csv(table, "animals.csv", write_options=write_options) + + >>> write_options = csv.WriteOptions(delimiter=";") + >>> csv.write_csv(table, "animals.csv", write_options=write_options) + """ diff --git a/python/pyarrow/_cuda.pyi b/python/pyarrow/_cuda.pyi new file mode 100644 index 00000000000..ad52b2f380f --- /dev/null +++ b/python/pyarrow/_cuda.pyi @@ -0,0 +1,556 @@ +from typing import Any + +import cuda # type: ignore[import-not-found] + +from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-not-found] + +from . import lib +from ._stubs_typing import ArrayLike + +class Context(lib._Weakrefable): + """ + CUDA driver context. + """ + + def __init__(self, device_number: int = 0, handle: int | None = None) -> None: + """ + Create a CUDA driver context for a particular device. + + If a CUDA context handle is passed, it is wrapped, otherwise + a default CUDA context for the given device is requested. 
+ + Parameters + ---------- + device_number : int (default 0) + Specify the GPU device for which the CUDA driver context is + requested. + handle : int, optional + Specify CUDA handle for a shared context that has been created + by another library. + """ + @staticmethod + def from_numba(context: _numba_driver.Context | None = None) -> Context: + """ + Create a Context instance from a Numba CUDA context. + + Parameters + ---------- + context : {numba.cuda.cudadrv.driver.Context, None} + A Numba CUDA context instance. + If None, the current Numba context is used. + + Returns + ------- + shared_context : pyarrow.cuda.Context + Context instance. + """ + def to_numba(self) -> _numba_driver.Context: + """ + Convert Context to a Numba CUDA context. + + Returns + ------- + context : numba.cuda.cudadrv.driver.Context + Numba CUDA context instance. + """ + @staticmethod + def get_num_devices() -> int: + """Return the number of GPU devices.""" + @property + def device_number(self) -> int: + """Return context device number.""" + @property + def handle(self) -> int: + """Return pointer to context handle.""" + def synchronize(self) -> None: + """Blocks until the device has completed all preceding requested + tasks. + """ + @property + def bytes_allocated(self) -> int: + """Return the number of allocated bytes.""" + def get_device_address(self, address: int) -> int: + """Return the device address that is reachable from kernels running in + the context + + Parameters + ---------- + address : int + Specify memory address value + + Returns + ------- + device_address : int + Device address accessible from device context + + Notes + ----- + The device address is defined as a memory address accessible + by device. While it is often a device memory address but it + can be also a host memory address, for instance, when the + memory is allocated as host memory (using cudaMallocHost or + cudaHostAlloc) or as managed memory (using cudaMallocManaged) + or the host memory is page-locked (using cudaHostRegister). + """ + def new_buffer(self, nbytes: int) -> CudaBuffer: + """Return new device buffer. + + Parameters + ---------- + nbytes : int + Specify the number of bytes to be allocated. + + Returns + ------- + buf : CudaBuffer + Allocated buffer. + """ + @property + def memory_manager(self) -> lib.MemoryManager: + """ + The default memory manager tied to this context's device. + + Returns + ------- + MemoryManager + """ + @property + def device(self) -> lib.Device: + """ + The device instance associated with this context. + + Returns + ------- + Device + """ + def foreign_buffer(self, address: int, size: int, base: Any | None = None) -> CudaBuffer: + """ + Create device buffer from address and size as a view. + + The caller is responsible for allocating and freeing the + memory. When `address==size==0` then a new zero-sized buffer + is returned. + + Parameters + ---------- + address : int + Specify the starting address of the buffer. The address can + refer to both device or host memory but it must be + accessible from device after mapping it with + `get_device_address` method. + size : int + Specify the size of device buffer in bytes. + base : {None, object} + Specify object that owns the referenced memory. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device reachable memory. 
+ + """ + def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: + """Open existing CUDA IPC memory handle + + Parameters + ---------- + ipc_handle : IpcMemHandle + Specify opaque pointer to CUipcMemHandle (driver API). + + Returns + ------- + buf : CudaBuffer + referencing device buffer + """ + def buffer_from_data( + self, + data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, + offset: int = 0, + size: int = -1, + ) -> CudaBuffer: + """Create device buffer and initialize with data. + + Parameters + ---------- + data : {CudaBuffer, HostBuffer, Buffer, array-like} + Specify data to be copied to device buffer. + offset : int + Specify the offset of input buffer for device data + buffering. Default: 0. + size : int + Specify the size of device buffer in bytes. Default: all + (starting from input offset) + + Returns + ------- + cbuf : CudaBuffer + Device buffer with copied data. + """ + def buffer_from_object(self, obj: Any) -> CudaBuffer: + """Create device buffer view of arbitrary object that references + device accessible memory. + + When the object contains a non-contiguous view of device + accessible memory then the returned device buffer will contain + contiguous view of the memory, that is, including the + intermediate data that is otherwise invisible to the input + object. + + Parameters + ---------- + obj : {object, Buffer, HostBuffer, CudaBuffer, ...} + Specify an object that holds (device or host) address that + can be accessed from device. This includes objects with + types defined in pyarrow.cuda as well as arbitrary objects + that implement the CUDA array interface as defined by numba. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device accessible memory. + + """ + +class IpcMemHandle(lib._Weakrefable): + """A serializable container for a CUDA IPC handle.""" + @staticmethod + def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: + """Create IpcMemHandle from opaque buffer (e.g. from another + process) + + Parameters + ---------- + opaque_handle : + a CUipcMemHandle as a const void* + + Returns + ------- + ipc_handle : IpcMemHandle + """ + def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: + """Write IpcMemHandle to a Buffer + + Parameters + ---------- + pool : {MemoryPool, None} + Specify a pool to allocate memory from + + Returns + ------- + buf : Buffer + The serialized buffer. + """ + +class CudaBuffer(lib.Buffer): + """An Arrow buffer with data located in a GPU device. + + To create a CudaBuffer instance, use Context.device_buffer(). + + The memory allocated in a CudaBuffer is freed when the buffer object + is deleted. + """ + + @staticmethod + def from_buffer(buf: lib.Buffer) -> CudaBuffer: + """Convert back generic buffer into CudaBuffer + + Parameters + ---------- + buf : Buffer + Specify buffer containing CudaBuffer + + Returns + ------- + dbuf : CudaBuffer + Resulting device buffer. + """ + @staticmethod + def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: + """Create a CudaBuffer view from numba MemoryPointer instance. + + Parameters + ---------- + mem : numba.cuda.cudadrv.driver.MemoryPointer + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of numba MemoryPointer. 
+ """ + def to_numba(self) -> _numba_driver.MemoryPointer: + """Return numba memory pointer of CudaBuffer instance.""" + def copy_to_host( + self, + position: int = 0, + nbytes: int = -1, + buf: lib.Buffer | None = None, + memory_pool: lib.MemoryPool | None = None, + resizable: bool = False, + ) -> lib.Buffer: + """Copy memory from GPU device to CPU host + + Caller is responsible for ensuring that all tasks affecting + the memory are finished. Use + + `.context.synchronize()` + + when needed. + + Parameters + ---------- + position : int + Specify the starting position of the source data in GPU + device buffer. Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + the position until host buffer is full). + buf : Buffer + Specify a pre-allocated output buffer in host. Default: None + (allocate new output buffer). + memory_pool : MemoryPool + resizable : bool + Specify extra arguments to allocate_buffer. Used only when + buf is None. + + Returns + ------- + buf : Buffer + Output buffer in host. + + """ + def copy_from_host( + self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 + ) -> int: + """Copy data from host to device. + + The device buffer must be pre-allocated. + + Parameters + ---------- + data : {Buffer, array-like} + Specify data in host. It can be array-like that is valid + argument to py_buffer + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. + """ + def copy_from_device(self, buf: CudaBuffer, position: int = 0, nbytes: int = -1) -> int: + """Copy data from device to device. + + Parameters + ---------- + buf : CudaBuffer + Specify source device buffer. + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. + + """ + def export_for_ipc(self) -> IpcMemHandle: + """ + Expose this device buffer as IPC memory which can be used in other + processes. + + After calling this function, this device memory will not be + freed when the CudaBuffer is destructed. + + Returns + ------- + ipc_handle : IpcMemHandle + The exported IPC handle + + """ + @property + def context(self) -> Context: + """Returns the CUDA driver context of this buffer.""" + def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: + """Return slice of device buffer + + Parameters + ---------- + offset : int, default 0 + Specify offset from the start of device buffer to slice + length : int, default None + Specify the length of slice (default is until end of device + buffer starting from offset). If the length is larger than + the data available, the returned slice will have a size of + the available data starting from the offset. + + Returns + ------- + sliced : CudaBuffer + Zero-copy slice of device buffer. + + """ + def to_pybytes(self) -> bytes: + """Return device buffer content as Python bytes.""" + +class HostBuffer(lib.Buffer): + """Device-accessible CPU memory created using cudaHostAlloc. + + To create a HostBuffer instance, use + + cuda.new_host_buffer() + """ + @property + def size(self) -> int: ... 
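A rough usage sketch for the buffer types above; it assumes a CUDA-capable GPU and a pyarrow build with CUDA enabled, so treat it as illustrative rather than something the stubs guarantee:

```python
# Sketch: host <-> device copies with Context and CudaBuffer.
# Requires a CUDA-capable GPU and a pyarrow build with CUDA support.
import numpy as np
from pyarrow import cuda

ctx = cuda.Context(0)              # driver context for device 0
host = np.arange(8, dtype=np.int64)

cbuf = ctx.buffer_from_data(host)  # allocate device memory and copy
assert cbuf.size == host.nbytes
assert cbuf.context.device_number == 0

back = cbuf.copy_to_host()         # returns a CPU pyarrow Buffer
print(np.frombuffer(back, dtype=np.int64))
```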
+ +class BufferReader(lib.NativeFile): + """File interface for zero-copy read from CUDA buffers. + + Note: Read methods return pointers to device memory. This means + you must be careful using this interface with any Arrow code which + may expect to be able to do anything other than pointer arithmetic + on the returned buffers. + """ + def __init__(self, obj: CudaBuffer) -> None: ... + def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: + """Return a slice view of the underlying device buffer. + + The slice will start at the current reader position and will + have specified size in bytes. + + Parameters + ---------- + nbytes : int, default None + Specify the number of bytes to read. Default: None (read all + remaining bytes). + + Returns + ------- + cbuf : CudaBuffer + New device buffer. + + """ + +class BufferWriter(lib.NativeFile): + """File interface for writing to CUDA buffers. + + By default writes are unbuffered. Use set_buffer_size to enable + buffering. + """ + def __init__(self, obj: CudaBuffer) -> None: ... + def writeat(self, position: int, data: ArrayLike) -> None: + """Write data to buffer starting from position. + + Parameters + ---------- + position : int + Specify device buffer position where the data will be + written. + data : array-like + Specify data, the data instance must implement buffer + protocol. + """ + @property + def buffer_size(self) -> int: + """Returns size of host (CPU) buffer, 0 for unbuffered""" + @buffer_size.setter + def buffer_size(self, buffer_size: int): + """Set CPU buffer size to limit calls to cudaMemcpy + + Parameters + ---------- + buffer_size : int + Specify the size of CPU buffer to allocate in bytes. + """ + @property + def num_bytes_buffered(self) -> int: + """Returns number of bytes buffered on host""" + +def new_host_buffer(size: int, device: int = 0) -> HostBuffer: + """Return buffer with CUDA-accessible memory on CPU host + + Parameters + ---------- + size : int + Specify the number of bytes to be allocated. + device : int + Specify GPU device number. + + Returns + ------- + dbuf : HostBuffer + Allocated host buffer + """ + +def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: + """Write record batch message to GPU device memory + + Parameters + ---------- + batch : RecordBatch + Record batch to write + ctx : Context + CUDA Context to allocate device memory from + + Returns + ------- + dbuf : CudaBuffer + device buffer which contains the record batch message + """ + +def read_message( + source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryManager | None = None +) -> lib.Message: + """Read Arrow IPC message located on GPU device + + Parameters + ---------- + source : {CudaBuffer, cuda.BufferReader} + Device buffer or reader of device buffer. + pool : MemoryPool (optional) + Pool to allocate CPU memory for the metadata + + Returns + ------- + message : Message + The deserialized message, body still on device + """ + +def read_record_batch( + buffer: lib.Buffer, + object: lib.Schema, + *, + dictionary_memo: lib.DictionaryMemo | None = None, + pool: lib.MemoryPool | None = None, +) -> lib.RecordBatch: + """Construct RecordBatch referencing IPC message located on CUDA device. + + While the metadata is copied to host memory for deserialization, + the record batch data remains on the device. 
+ + Parameters + ---------- + buffer : + Device buffer containing the complete IPC message + schema : Schema + The schema for the record batch + dictionary_memo : DictionaryMemo, optional + If message contains dictionaries, must pass a populated + DictionaryMemo + pool : MemoryPool (optional) + Pool to allocate metadata from + + Returns + ------- + batch : RecordBatch + Reconstructed record batch, with device pointers + + """ diff --git a/python/pyarrow/_dataset.pyi b/python/pyarrow/_dataset.pyi new file mode 100644 index 00000000000..114bf625983 --- /dev/null +++ b/python/pyarrow/_dataset.pyi @@ -0,0 +1,2301 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import ( + IO, + Any, + Callable, + Generic, + Iterator, + Literal, + NamedTuple, + TypeVar, + overload, +) + +from _typeshed import StrPath + +from . import _csv, _json, _parquet, lib +from ._fs import FileSelector, FileSystem, SupportedFileSystem +from ._stubs_typing import Indices, JoinType, Order +from .acero import ExecNodeOptions +from .compute import Expression +from .ipc import IpcWriteOptions, RecordBatchReader + +class Dataset(lib._Weakrefable): + """ + Collection of data fragments and potentially child datasets. + + Arrow Datasets allow you to query against data that has been split across + multiple files. This sharding of data may indicate partitioning, which + can accelerate queries that only touch some partitions (files). + """ + + @property + def partition_expression(self) -> Expression: + """ + An Expression which evaluates to true for all data viewed by this + Dataset. + """ + def replace_schema(self, schema: lib.Schema) -> None: + """ + Return a copy of this Dataset with a different schema. + + The copy will view the same Fragments. If the new schema is not + compatible with the original dataset's schema then an error will + be raised. + + Parameters + ---------- + schema : Schema + The new dataset schema. + """ + def get_fragments(self, filter: Expression | None = None): + """Returns an iterator over the fragments in this dataset. + + Parameters + ---------- + filter : Expression, default None + Return fragments matching the optional filter, either using the + partition_expression or internal information like Parquet's + statistics. + + Returns + ------- + fragments : iterator of Fragment + """ + def scanner( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Build a scan operation against the dataset. + + Data is not loaded immediately. Instead, this produces a Scanner, + which exposes further operations (e.g. loading all data as a + table, counting rows). + + See the :meth:`Scanner.from_dataset` method for further information. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. 
+ + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + scanner : Scanner + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "dataset_scanner.parquet") + + >>> import pyarrow.dataset as ds + >>> dataset = ds.dataset("dataset_scanner.parquet") + + Selecting a subset of the columns: + + >>> dataset.scanner(columns=["year", "n_legs"]).to_table() + pyarrow.Table + year: int64 + n_legs: int64 + ---- + year: [[2020,2022,2021,2022,2019,2021]] + n_legs: [[2,2,4,4,5,100]] + + Projecting selected columns using an expression: + + >>> dataset.scanner( + ... columns={ + ... "n_legs_uint": ds.field("n_legs").cast("uint8"), + ... } + ... 
).to_table() + pyarrow.Table + n_legs_uint: uint8 + ---- + n_legs_uint: [[2,2,4,4,5,100]] + + Filtering rows while scanning: + + >>> dataset.scanner(filter=ds.field("year") > 2020).to_table() + pyarrow.Table + year: int64 + n_legs: int64 + animal: string + ---- + year: [[2022,2021,2022,2021]] + n_legs: [[2,4,4,100]] + animal: [["Parrot","Dog","Horse","Centipede"]] + """ + def to_batches( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: + """ + Read the dataset as materialized record batches. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + record_batches : iterator of RecordBatch + """ + def to_table( + self, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Read the dataset to an Arrow table. + + Note that this method reads all the selected data from the dataset + into memory. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Select rows of data by index. 
+ + Parameters + ---------- + indices : Array or array-like + indices of rows to select in the dataset. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Load the first N rows of the dataset. + + Parameters + ---------- + num_rows : int + The number of rows to load. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. 
+ + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + def count_rows( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: + """ + Count rows matching the scanner filter. + + Parameters + ---------- + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. 
+        fragment_scan_options : FragmentScanOptions, default None
+            Options specific to a particular scan and fragment type, which
+            can change between different scans of the same dataset.
+        use_threads : bool, default True
+            If enabled, then maximum parallelism will be used determined by
+            the number of available CPU cores.
+        cache_metadata : bool, default True
+            If enabled, metadata may be cached when scanning to speed up
+            repeated scans.
+        memory_pool : MemoryPool, default None
+            For memory allocations, if required. If not specified, uses the
+            default pool.
+
+        Returns
+        -------
+        count : int
+        """
+    @property
+    def schema(self) -> lib.Schema:
+        """The common schema of the full Dataset"""
+    def filter(self, expression: Expression) -> Self:
+        """
+        Apply a row filter to the dataset.
+
+        Parameters
+        ----------
+        expression : Expression
+            The filter that should be applied to the dataset.
+
+        Returns
+        -------
+        Dataset
+        """
+    def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> InMemoryDataset:
+        """
+        Sort the Dataset by one or multiple columns.
+
+        Parameters
+        ----------
+        sorting : str or list[tuple(name, order)]
+            Name of the column to use to sort (ascending), or
+            a list of multiple sorting conditions where
+            each entry is a tuple with column name
+            and sorting order ("ascending" or "descending")
+        **kwargs : dict, optional
+            Additional sorting options.
+            As allowed by :class:`SortOptions`
+
+        Returns
+        -------
+        InMemoryDataset
+            A new dataset sorted according to the sort keys.
+        """
+    def join(
+        self,
+        right_dataset: Dataset,
+        keys: str | list[str],
+        right_keys: str | list[str] | None = None,
+        join_type: JoinType = "left outer",
+        left_suffix: str | None = None,
+        right_suffix: str | None = None,
+        coalesce_keys: bool = True,
+        use_threads: bool = True,
+    ) -> InMemoryDataset:
+        """
+        Perform a join between this dataset and another one.
+
+        Result of the join will be a new dataset, where further
+        operations can be applied.
+
+        Parameters
+        ----------
+        right_dataset : dataset
+            The dataset to join to the current one, acting as the right dataset
+            in the join operation.
+        keys : str or list[str]
+            The columns from current dataset that should be used as keys
+            of the join operation left side.
+        right_keys : str or list[str], default None
+            The columns from the right_dataset that should be used as keys
+            on the join operation right side.
+            When ``None`` use the same key names as the left dataset.
+        join_type : str, default "left outer"
+            The kind of join that should be performed, one of
+            ("left semi", "right semi", "left anti", "right anti",
+            "inner", "left outer", "right outer", "full outer")
+        left_suffix : str, default None
+            Which suffix to add to left column names. This prevents confusion
+            when the columns in left and right datasets have colliding names.
+        right_suffix : str, default None
+            Which suffix to add to the right column names. This prevents confusion
+            when the columns in left and right datasets have colliding names.
+        coalesce_keys : bool, default True
+            If the duplicated keys should be omitted from one of the sides
+            in the join result.
+        use_threads : bool, default True
+            Whether to use multithreading or not.
+
+        Returns
+        -------
+        InMemoryDataset
+        """
+    def join_asof(
+        self,
+        right_dataset: Dataset,
+        on: str,
+        by: str | list[str],
+        tolerance: int,
+        right_on: str | list[str] | None = None,
+        right_by: str | list[str] | None = None,
+    ) -> InMemoryDataset:
+        """
+        Perform an asof join between this dataset and another one.
+
+        This is similar to a left-join except that we match on nearest key rather
+        than equal keys. Both datasets must be sorted by the key. This type of join
+        is most useful for time series data that are not perfectly aligned.
+
+        Optionally match on equivalent keys with "by" before searching with "on".
+
+        Result of the join will be a new Dataset, where further
+        operations can be applied.
+
+        Parameters
+        ----------
+        right_dataset : dataset
+            The dataset to join to the current one, acting as the right dataset
+            in the join operation.
+        on : str
+            The column from current dataset that should be used as the "on" key
+            of the join operation left side.
+
+            An inexact match is used on the "on" key, i.e. a row is considered a
+            match if and only if left_on - tolerance <= right_on <= left_on.
+
+            The input table must be sorted by the "on" key. Must be a single
+            field of a common type.
+
+            Currently, the "on" key must be an integer, date, or timestamp type.
+        by : str or list[str]
+            The columns from current dataset that should be used as the keys
+            of the join operation left side. The join operation is then done
+            only for the matches in these columns.
+        tolerance : int
+            The tolerance for inexact "on" key matching. A right row is considered
+            a match with the left row if `right.on - left.on <= tolerance`. The
+            `tolerance` may be:
+
+            - negative, in which case a past-as-of-join occurs;
+            - or positive, in which case a future-as-of-join occurs;
+            - or zero, in which case an exact-as-of-join occurs.
+
+            The tolerance is interpreted in the same units as the "on" key.
+        right_on : str or list[str], default None
+            The columns from the right_dataset that should be used as the on key
+            on the join operation right side.
+            When ``None`` use the same key name as the left dataset.
+        right_by : str or list[str], default None
+            The columns from the right_dataset that should be used as by keys
+            on the join operation right side.
+            When ``None`` use the same key names as the left dataset.
+
+        Returns
+        -------
+        InMemoryDataset
+        """
+
+class InMemoryDataset(Dataset):
+    """
+    A Dataset wrapping in-memory data.
+
+    Parameters
+    ----------
+    source : RecordBatch, Table, list, tuple
+        The data for this dataset. Can be a RecordBatch, Table, list of
+        RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader.
+        If an iterable is provided, the schema must also be provided.
+    schema : Schema, optional
+        Only required if passing an iterable as the source.
+    """
+
+class UnionDataset(Dataset):
+    """
+    A Dataset wrapping child datasets.
+
+    Children's schemas must agree with the provided schema.
+
+    Parameters
+    ----------
+    schema : Schema
+        A known schema to conform to.
+    children : list of Dataset
+        One or more input children.
+    """
+
+    @property
+    def children(self) -> list[Dataset]: ...
+
+class FileSystemDataset(Dataset):
+    """
+    A Dataset of file fragments.
+
+    A FileSystemDataset is composed of one or more FileFragment.
+
+    Parameters
+    ----------
+    fragments : list[Fragment]
+        List of fragments to consume.
+    schema : Schema
+        The top-level schema of the Dataset.
+    format : FileFormat
+        File format of the fragments, currently only ParquetFileFormat,
+        IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported.
+    filesystem : FileSystem
+        FileSystem of the fragments.
+    root_partition : Expression, optional
+        The top-level partition of the Dataset.
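+
+    Examples
+    --------
+    A FileSystemDataset is normally obtained from the ``dataset()`` factory
+    function rather than constructed directly. A minimal sketch, assuming
+    Parquet files under a local ``data/`` directory:
+
+    >>> import pyarrow.dataset as ds  # doctest: +SKIP
+    >>> dataset = ds.dataset("data/", format="parquet")  # doctest: +SKIP
+    >>> dataset.files  # doctest: +SKIP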
+    """
+
+    def __init__(
+        self,
+        fragments: list[Fragment],
+        schema: lib.Schema,
+        format: FileFormat,
+        filesystem: SupportedFileSystem | None = None,
+        root_partition: Expression | None = None,
+    ) -> None: ...
+    @classmethod
+    def from_paths(
+        cls,
+        paths: list[str],
+        schema: lib.Schema | None = None,
+        format: FileFormat | None = None,
+        filesystem: SupportedFileSystem | None = None,
+        partitions: list[Expression] | None = None,
+        root_partition: Expression | None = None,
+    ) -> FileSystemDataset:
+        """
+        A Dataset created from a list of paths on a particular filesystem.
+
+        Parameters
+        ----------
+        paths : list of str
+            List of file paths to create the fragments from.
+        schema : Schema
+            The top-level schema of the Dataset.
+        format : FileFormat
+            File format to create fragments from, currently only
+            ParquetFileFormat, IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported.
+        filesystem : FileSystem
+            The filesystem which files are from.
+        partitions : list[Expression], optional
+            Attach additional partition information for the file paths.
+        root_partition : Expression, optional
+            The top-level partition of the Dataset.
+        """
+    @property
+    def filesystem(self) -> FileSystem: ...
+    @property
+    def partitioning(self) -> Partitioning | None:
+        """
+        The partitioning of the Dataset source, if discovered.
+
+        If the FileSystemDataset is created using the ``dataset()`` factory
+        function with a partitioning specified, this will return the
+        finalized Partitioning object from the dataset discovery. In all
+        other cases, this returns None.
+        """
+    @property
+    def files(self) -> list[str]:
+        """List of the files"""
+    @property
+    def format(self) -> FileFormat:
+        """The FileFormat of this source."""
+
+class FileWriteOptions(lib._Weakrefable):
+    @property
+    def format(self) -> FileFormat: ...
+
+class FileFormat(lib._Weakrefable):
+    def inspect(
+        self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None
+    ) -> lib.Schema:
+        """
+        Infer the schema of a file.
+
+        Parameters
+        ----------
+        file : file-like object, path-like or str
+            The file or file path to infer a schema from.
+        filesystem : Filesystem, optional
+            If `filesystem` is given, `file` must be a string and specifies
+            the path of the file to read from the filesystem.
+
+        Returns
+        -------
+        schema : Schema
+            The schema inferred from the file
+        """
+    def make_fragment(
+        self,
+        file: StrPath | IO,
+        filesystem: SupportedFileSystem | None = None,
+        partition_expression: Expression | None = None,
+        *,
+        file_size: int | None = None,
+    ) -> Fragment:
+        """
+        Make a FileFragment from a given file.
+
+        Parameters
+        ----------
+        file : file-like object, path-like or str
+            The file or file path to make a fragment from.
+        filesystem : Filesystem, optional
+            If `filesystem` is given, `file` must be a string and specifies
+            the path of the file to read from the filesystem.
+        partition_expression : Expression, optional
+            An expression that is guaranteed true for all rows in the fragment. Allows
+            fragment to be potentially skipped while scanning with a filter.
+        file_size : int, optional
+            The size of the file in bytes. Can improve performance with high-latency filesystems
+            when file size needs to be known before reading.
+
+        Returns
+        -------
+        fragment : Fragment
+            The file fragment
+        """
+    def make_write_options(self) -> FileWriteOptions: ...
+    @property
+    def default_extname(self) -> str: ...
+    @property
+    def default_fragment_scan_options(self) -> FragmentScanOptions: ...
+ @default_fragment_scan_options.setter + def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... + +class Fragment(lib._Weakrefable): + """Fragment of data from a Dataset.""" + @property + def physical_schema(self) -> lib.Schema: + """Return the physical schema of this Fragment. This schema can be + different from the dataset read schema.""" + @property + def partition_expression(self) -> Expression: + """An Expression which evaluates to true for all data viewed by this + Fragment. + """ + def scanner( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Build a scan operation against the fragment. + + Data is not loaded immediately. Instead, this produces a Scanner, + which exposes further operations (e.g. loading all data as a + table, counting rows). + + Parameters + ---------- + schema : Schema + Schema to use for scanning. This is used to unify a Fragment to + its Dataset's schema. If not specified this will use the + Fragment's physical schema which might differ for each Fragment. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. 
+ cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + scanner : Scanner + """ + def to_batches( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: + """ + Read the fragment as materialized record batches. + + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + record_batches : iterator of RecordBatch + """ + def to_table( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Convert this Fragment into a Table. + + Use this convenience utility with care. This will serially materialize + the Scan result in memory before creating the Table. + + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + table : Table + """ + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Select rows of data by index. + + Parameters + ---------- + indices : Array or array-like + The indices of row to select in the dataset. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + Table + """ + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Load the first N rows of the fragment. + + Parameters + ---------- + num_rows : int + The number of rows to load. 
+ columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + Table + """ + def count_rows( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: + """ + Count rows matching the scanner filter. + + Parameters + ---------- + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. 
Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + count : int + """ + +class FileFragment(Fragment): + """A Fragment representing a data file.""" + + def open(self) -> lib.NativeFile: + """ + Open a NativeFile of the buffer or file viewed by this fragment. + """ + @property + def path(self) -> str: + """ + The path of the data file viewed by this fragment, if it views a + file. If instead it views a buffer, this will be "". + """ + @property + def filesystem(self) -> FileSystem: + """ + The FileSystem containing the data file viewed by this fragment, if + it views a file. If instead it views a buffer, this will be None. + """ + @property + def buffer(self) -> lib.Buffer: + """ + The buffer viewed by this fragment, if it views a buffer. If + instead it views a file, this will be None. + """ + @property + def format(self) -> FileFormat: + """ + The format of the data file viewed by this fragment. + """ + +class FragmentScanOptions(lib._Weakrefable): + """Scan options specific to a particular fragment and scan operation.""" + + @property + def type_name(self) -> str: ... + +class IpcFileWriteOptions(FileWriteOptions): + @property + def write_options(self) -> IpcWriteOptions: ... + @write_options.setter + def write_options(self, write_options: IpcWriteOptions) -> None: ... + +class IpcFileFormat(FileFormat): + def equals(self, other: IpcFileFormat) -> bool: ... + def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ... + @property + def default_extname(self) -> str: ... + +class FeatherFileFormat(IpcFileFormat): ... + +class CsvFileFormat(FileFormat): + """ + FileFormat for CSV files. + + Parameters + ---------- + parse_options : pyarrow.csv.ParseOptions + Options regarding CSV parsing. + default_fragment_scan_options : CsvFragmentScanOptions + Default options for fragments scan. + convert_options : pyarrow.csv.ConvertOptions + Options regarding value conversion. + read_options : pyarrow.csv.ReadOptions + General read options. + """ + def __init__( + self, + parse_options: _csv.ParseOptions | None = None, + default_fragment_scan_options: CsvFragmentScanOptions | None = None, + convert_options: _csv.ConvertOptions | None = None, + read_options: _csv.ReadOptions | None = None, + ) -> None: ... + def make_write_options(self) -> _csv.WriteOptions: ... # type: ignore[override] + @property + def parse_options(self) -> _csv.ParseOptions: ... + @parse_options.setter + def parse_options(self, parse_options: _csv.ParseOptions) -> None: ... + def equals(self, other: CsvFileFormat) -> bool: ... + +class CsvFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for CSV fragments. + + Parameters + ---------- + convert_options : pyarrow.csv.ConvertOptions + Options regarding value conversion. 
+    read_options : pyarrow.csv.ReadOptions
+        General read options.
+    """
+
+    convert_options: _csv.ConvertOptions
+    read_options: _csv.ReadOptions
+
+    def __init__(
+        self, convert_options: _csv.ConvertOptions, read_options: _csv.ReadOptions
+    ) -> None: ...
+    def equals(self, other: CsvFragmentScanOptions) -> bool: ...
+
+class CsvFileWriteOptions(FileWriteOptions):
+    write_options: _csv.WriteOptions
+
+class JsonFileFormat(FileFormat):
+    """
+    FileFormat for JSON files.
+
+    Parameters
+    ----------
+    default_fragment_scan_options : JsonFragmentScanOptions
+        Default options for fragments scan.
+    parse_options : pyarrow.json.ParseOptions
+        Options regarding JSON parsing.
+    read_options : pyarrow.json.ReadOptions
+        General read options.
+    """
+    def __init__(
+        self,
+        default_fragment_scan_options: JsonFragmentScanOptions | None = None,
+        parse_options: _json.ParseOptions | None = None,
+        read_options: _json.ReadOptions | None = None,
+    ) -> None: ...
+    def equals(self, other: JsonFileFormat) -> bool: ...
+
+class JsonFragmentScanOptions(FragmentScanOptions):
+    """
+    Scan-specific options for JSON fragments.
+
+    Parameters
+    ----------
+    parse_options : pyarrow.json.ParseOptions
+        Options regarding JSON parsing.
+    read_options : pyarrow.json.ReadOptions
+        General read options.
+    """
+
+    parse_options: _json.ParseOptions
+    read_options: _json.ReadOptions
+    def __init__(
+        self, parse_options: _json.ParseOptions, read_options: _json.ReadOptions
+    ) -> None: ...
+    def equals(self, other: JsonFragmentScanOptions) -> bool: ...
+
+class Partitioning(lib._Weakrefable):
+    def parse(self, path: str) -> Expression:
+        """
+        Parse a path into a partition expression.
+
+        Parameters
+        ----------
+        path : str
+
+        Returns
+        -------
+        pyarrow.dataset.Expression
+        """
+    def format(self, expr: Expression) -> tuple[str, str]:
+        """
+        Convert a filter expression into a tuple of (directory, filename) using
+        the current partitioning scheme.
+
+        Parameters
+        ----------
+        expr : pyarrow.dataset.Expression
+
+        Returns
+        -------
+        tuple[str, str]
+
+        Examples
+        --------
+
+        Specify the Schema for paths like "/2009/June":
+
+        >>> import pyarrow as pa
+        >>> import pyarrow.dataset as ds
+        >>> import pyarrow.compute as pc
+        >>> part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())]))
+        >>> part.format((pc.field("year") == 1862) & (pc.field("month") == "Jan"))
+        ('1862/Jan', '')
+        """
+    @property
+    def schema(self) -> lib.Schema:
+        """The arrow Schema attached to the partitioning."""
+
+class PartitioningFactory(lib._Weakrefable):
+    @property
+    def type_name(self) -> str: ...
+
+class KeyValuePartitioning(Partitioning):
+    @property
+    def dictionaries(self) -> list[lib.Array | None]:
+        """
+        The unique values for each partition field, if available.
+
+        Those values are only available if the Partitioning object was
+        created through dataset discovery from a PartitioningFactory, or
+        if the dictionaries were manually specified in the constructor.
+        If no dictionary field is available, this returns an empty list.
+        """
+
+class DirectoryPartitioning(KeyValuePartitioning):
+    """
+    A Partitioning based on a specified Schema.
+
+    The DirectoryPartitioning expects one segment in the file path for each
+    field in the schema (all fields are required to be present).
+    For example given schema<year:int16, month:int8> the path "/2009/11" would
+    be parsed to ("year"_ == 2009 and "month"_ == 11).
+
+    Parameters
+    ----------
+    schema : Schema
+        The schema that describes the partitions present in the file path.
+    dictionaries : dict[str, Array]
+        If the type of any field of `schema` is a dictionary type, the
+        corresponding entry of `dictionaries` must be an array containing
+        every value which may be taken by the corresponding column or an
+        error will be raised in parsing.
+    segment_encoding : str, default "uri"
+        After splitting paths into segments, decode the segments. Valid
+        values are "uri" (URI-decode segments) and "none" (leave as-is).
+
+    Returns
+    -------
+    DirectoryPartitioning
+
+    Examples
+    --------
+    >>> from pyarrow.dataset import DirectoryPartitioning
+    >>> partitioning = DirectoryPartitioning(
+    ...     pa.schema([("year", pa.int16()), ("month", pa.int8())])
+    ... )
+    >>> print(partitioning.parse("/2009/11/"))
+    ((year == 2009) and (month == 11))
+    """
+
+    @staticmethod
+    def discover(
+        field_names: list[str] | None = None,
+        infer_dictionary: bool = False,
+        max_partition_dictionary_size: int = 0,
+        schema: lib.Schema | None = None,
+        segment_encoding: Literal["uri", "none"] = "uri",
+    ) -> PartitioningFactory:
+        """
+        Discover a DirectoryPartitioning.
+
+        Parameters
+        ----------
+        field_names : list of str
+            The names to associate with the values from the subdirectory names.
+            If schema is given, will be populated from the schema.
+        infer_dictionary : bool, default False
+            When inferring a schema for partition fields, yield dictionary
+            encoded types instead of plain types. This can be more efficient
+            when materializing virtual columns, and Expressions parsed by the
+            finished Partitioning will include dictionaries of all unique
+            inspected values for each field.
+        max_partition_dictionary_size : int, default 0
+            Synonymous with infer_dictionary for backwards compatibility with
+            1.0: setting this to -1 or None is equivalent to passing
+            infer_dictionary=True.
+        schema : Schema, default None
+            Use this schema instead of inferring a schema from partition
+            values. Partition values will be validated against this schema
+            before accumulation into the Partitioning's dictionary.
+        segment_encoding : str, default "uri"
+            After splitting paths into segments, decode the segments. Valid
+            values are "uri" (URI-decode segments) and "none" (leave as-is).
+
+        Returns
+        -------
+        PartitioningFactory
+            To be used in the FileSystemFactoryOptions.
+        """
+    def __init__(
+        self,
+        schema: lib.Schema,
+        dictionaries: dict[str, lib.Array] | None = None,
+        segment_encoding: Literal["uri", "none"] = "uri",
+    ) -> None: ...
+
+class HivePartitioning(KeyValuePartitioning):
+    """
+    A Partitioning for "/$key=$value/" nested directories as found in
+    Apache Hive.
+
+    Multi-level, directory based partitioning scheme originating from
+    Apache Hive with all data files stored in the leaf directories. Data is
+    partitioned by static values of a particular column in the schema.
+    Partition keys are represented in the form $key=$value in directory names.
+    Field order is ignored, as are missing or unrecognized field names.
+
+    For example, given schema<year:int16, month:int8, day:int8>, a possible
+    path would be "/year=2009/month=11/day=15".
+
+    Parameters
+    ----------
+    schema : Schema
+        The schema that describes the partitions present in the file path.
+    dictionaries : dict[str, Array]
+        If the type of any field of `schema` is a dictionary type, the
+        corresponding entry of `dictionaries` must be an array containing
+        every value which may be taken by the corresponding column or an
+        error will be raised in parsing.
+    null_fallback : str, default "__HIVE_DEFAULT_PARTITION__"
+        If any field is None then this fallback will be used as a label.
+    segment_encoding : str, default "uri"
+        After splitting paths into segments, decode the segments. Valid
+        values are "uri" (URI-decode segments) and "none" (leave as-is).
+
+    Returns
+    -------
+    HivePartitioning
+
+    Examples
+    --------
+    >>> from pyarrow.dataset import HivePartitioning
+    >>> partitioning = HivePartitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())]))
+    >>> print(partitioning.parse("/year=2009/month=11/"))
+    ((year == 2009) and (month == 11))
+
+    """
+    def __init__(
+        self,
+        schema: lib.Schema,
+        dictionaries: dict[str, lib.Array] | None = None,
+        null_fallback: str = "__HIVE_DEFAULT_PARTITION__",
+        segment_encoding: Literal["uri", "none"] = "uri",
+    ) -> None: ...
+    @staticmethod
+    def discover(
+        infer_dictionary: bool = False,
+        max_partition_dictionary_size: int = 0,
+        null_fallback: str = "__HIVE_DEFAULT_PARTITION__",
+        schema: lib.Schema | None = None,
+        segment_encoding: Literal["uri", "none"] = "uri",
+    ) -> PartitioningFactory:
+        """
+        Discover a HivePartitioning.
+
+        Parameters
+        ----------
+        infer_dictionary : bool, default False
+            When inferring a schema for partition fields, yield dictionary
+            encoded types instead of plain types. This can be more efficient when
+            materializing virtual columns, and Expressions parsed by the
+            finished Partitioning will include dictionaries of all unique
+            inspected values for each field.
+        max_partition_dictionary_size : int, default 0
+            Synonymous with infer_dictionary for backwards compatibility with
+            1.0: setting this to -1 or None is equivalent to passing
+            infer_dictionary=True.
+        null_fallback : str, default "__HIVE_DEFAULT_PARTITION__"
+            When inferring a schema for partition fields this value will be
+            replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__
+            for compatibility with Spark.
+        schema : Schema, default None
+            Use this schema instead of inferring a schema from partition
+            values. Partition values will be validated against this schema
+            before accumulation into the Partitioning's dictionary.
+        segment_encoding : str, default "uri"
+            After splitting paths into segments, decode the segments. Valid
+            values are "uri" (URI-decode segments) and "none" (leave as-is).
+
+        Returns
+        -------
+        PartitioningFactory
+            To be used in the FileSystemFactoryOptions.
+        """
+
+class FilenamePartitioning(KeyValuePartitioning):
+    """
+    A Partitioning based on a specified Schema.
+
+    The FilenamePartitioning expects one segment in the file name for each
+    field in the schema (all fields are required to be present) separated
+    by '_'. For example given schema<year:int16, month:int8> the name
+    ``"2009_11_"`` would be parsed to ("year" == 2009 and "month" == 11).
+
+    Parameters
+    ----------
+    schema : Schema
+        The schema that describes the partitions present in the file path.
+    dictionaries : dict[str, Array]
+        If the type of any field of `schema` is a dictionary type, the
+        corresponding entry of `dictionaries` must be an array containing
+        every value which may be taken by the corresponding column or an
+        error will be raised in parsing.
+    segment_encoding : str, default "uri"
+        After splitting paths into segments, decode the segments. Valid
+        values are "uri" (URI-decode segments) and "none" (leave as-is).
+
+    Returns
+    -------
+    FilenamePartitioning
+
+    Examples
+    --------
+    >>> from pyarrow.dataset import FilenamePartitioning
+    >>> partitioning = FilenamePartitioning(
+    ...
pa.schema([("year", pa.int16()), ("month", pa.int8())]) + ... ) + >>> print(partitioning.parse("2009_11_data.parquet")) + ((year == 2009) and (month == 11)) + """ + + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + @staticmethod + def discover( + field_names: list[str] | None = None, + infer_dictionary: bool = False, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: + """ + Discover a FilenamePartitioning. + + Parameters + ---------- + field_names : list of str + The names to associate with the values from the subdirectory names. + If schema is given, will be populated from the schema. + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain types. This can be more efficient + when materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. + """ + +class DatasetFactory(lib._Weakrefable): + """ + DatasetFactory is used to create a Dataset, inspect the Schema + of the fragments contained in it, and declare a partitioning. + """ + + root_partition: Expression + def finish(self, schema: lib.Schema | None = None) -> Dataset: + """ + Create a Dataset using the inspected schema or an explicit schema + (if given). + + Parameters + ---------- + schema : Schema, default None + The schema to conform the source to. If None, the inspected + schema is used. + + Returns + ------- + Dataset + """ + def inspect(self) -> lib.Schema: + """ + Inspect all data fragments and return a common Schema. + + Returns + ------- + Schema + """ + def inspect_schemas(self) -> list[lib.Schema]: ... + +class FileSystemFactoryOptions(lib._Weakrefable): + """ + Influences the discovery of filesystem paths. + + Parameters + ---------- + partition_base_dir : str, optional + For the purposes of applying the partitioning, paths will be + stripped of the partition_base_dir. Files not matching the + partition_base_dir prefix will be skipped for partitioning discovery. + The ignored files will still be part of the Dataset, but will not + have partition information. + partitioning : Partitioning/PartitioningFactory, optional + Apply the Partitioning to every discovered Fragment. See Partitioning or + PartitioningFactory documentation. + exclude_invalid_files : bool, optional (default True) + If True, invalid files will be excluded (file format specific check). + This will incur IO for each files in a serial and single threaded + fashion. Disabling this feature will skip the IO, but unsupported + files may be present in the Dataset (resulting in an error at scan + time). + selector_ignore_prefixes : list, optional + When discovering from a Selector (and not from an explicit file list), + ignore files and directories matching any of these prefixes. + By default this is ['.', '_']. 
+    """
+
+    partitioning: Partitioning
+    partitioning_factory: PartitioningFactory
+    partition_base_dir: str
+    exclude_invalid_files: bool
+    selector_ignore_prefixes: list[str]
+
+    def __init__(
+        self,
+        partition_base_dir: str | None = None,
+        partitioning: Partitioning | PartitioningFactory | None = None,
+        exclude_invalid_files: bool = True,
+        selector_ignore_prefixes: list[str] | None = None,
+    ) -> None: ...
+
+class FileSystemDatasetFactory(DatasetFactory):
+    """
+    Create a DatasetFactory from a list of paths with schema inspection.
+
+    Parameters
+    ----------
+    filesystem : pyarrow.fs.FileSystem
+        Filesystem to discover.
+    paths_or_selector : pyarrow.fs.FileSelector or list of path-likes
+        Either a Selector object or a list of path-like objects.
+    format : FileFormat
+        Currently only ParquetFileFormat and IpcFileFormat are supported.
+    options : FileSystemFactoryOptions, optional
+        Various flags influencing the discovery of filesystem paths.
+    """
+
+    def __init__(
+        self,
+        filesystem: SupportedFileSystem,
+        paths_or_selector: FileSelector,
+        format: FileFormat,
+        options: FileSystemFactoryOptions | None = None,
+    ) -> None: ...
+
+class UnionDatasetFactory(DatasetFactory):
+    """
+    Provides a way to inspect/discover a Dataset's expected schema before
+    materialization.
+
+    Parameters
+    ----------
+    factories : list of DatasetFactory
+    """
+    def __init__(self, factories: list[DatasetFactory]) -> None: ...
+
+_RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch)
+
+class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]):
+    """An iterator over a sequence of record batches."""
+    def __iter__(self) -> Self: ...
+    def __next__(self) -> _RecordBatchT: ...
+
+class TaggedRecordBatch(NamedTuple):
+    """
+    A combination of a record batch and the fragment it came from.
+
+    Parameters
+    ----------
+    record_batch : RecordBatch
+        The record batch.
+    fragment : Fragment
+        Fragment of the record batch.
+    """
+
+    record_batch: lib.RecordBatch
+    fragment: Fragment
+
+class TaggedRecordBatchIterator(lib._Weakrefable):
+    """An iterator over a sequence of record batches with fragments."""
+    def __iter__(self) -> Self: ...
+    def __next__(self) -> TaggedRecordBatch: ...
+
+class Scanner(lib._Weakrefable):
+    """A materialized scan operation with context and options bound.
+
+    A scanner is the class that glues the scan tasks, data fragments and data
+    sources together.
+    """
+    @staticmethod
+    def from_dataset(
+        dataset: Dataset,
+        *,
+        columns: list[str] | dict[str, Expression] | None = None,
+        filter: Expression | None = None,
+        batch_size: int = ...,
+        batch_readahead: int = 16,
+        fragment_readahead: int = 4,
+        fragment_scan_options: FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        cache_metadata: bool = True,
+        memory_pool: lib.MemoryPool | None = None,
+    ) -> Scanner:
+        """
+        Create Scanner from Dataset.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            Dataset to scan.
+        columns : list[str] or dict[str, Expression], default None
+            The columns to project. This can be a list of column names to
+            include (order and duplicates will be preserved), or a dictionary
+            with {new_column_name: expression} values for more advanced
+            projections.
+ + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + @staticmethod + def from_fragment( + fragment: Fragment, + *, + schema: lib.Schema | None = None, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Create Scanner from Fragment, + + Parameters + ---------- + fragment : Fragment + fragment to scan. + schema : Schema, optional + The schema of the fragment. + columns : list[str] or dict[str, Expression], default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). 
+ + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + @overload + @staticmethod + def from_batches( + source: Iterator[lib.RecordBatch], + *, + schema: lib.Schema, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + @overload + @staticmethod + def from_batches( + source: RecordBatchReader, + *, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + @staticmethod + def from_batches(*args, **kwargs): + """ + Create a Scanner from an iterator of batches. + + This creates a scanner which can be used only once. It is + intended to support writing a dataset (which takes a scanner) + from a source which can be read only once (e.g. a + RecordBatchReader or generator). + + Parameters + ---------- + source : Iterator or Arrow-compatible stream object + The iterator of Batches. This can be a pyarrow RecordBatchReader, + any object that implements the Arrow PyCapsule Protocol for + streams, or an actual Python iterator of RecordBatches. + schema : Schema + The schema of the batches (required when passing a Python + iterator). + columns : list[str] or dict[str, Expression], default None + The columns to project. 
This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + @property + def dataset_schema(self) -> lib.Schema: + """The schema with which batches will be read from fragments.""" + @property + def projected_schema(self) -> lib.Schema: + """ + The materialized schema of the data, accounting for projections. + + This is the schema of any data returned from the scanner. + """ + def to_batches(self) -> Iterator[lib.RecordBatch]: + """ + Consume a Scanner in record batches. + + Returns + ------- + record_batches : iterator of RecordBatch + """ + def scan_batches(self) -> TaggedRecordBatchIterator: + """ + Consume a Scanner in record batches with corresponding fragments. + + Returns + ------- + record_batches : iterator of TaggedRecordBatch + """ + def to_table(self) -> lib.Table: + """ + Convert a Scanner into a Table. + + Use this convenience utility with care. This will serially materialize + the Scan result in memory before creating the Table. + + Returns + ------- + Table + """ + def take(self, indices: Indices) -> lib.Table: + """ + Select rows of data by index. + + Will only consume as many batches of the underlying dataset as + needed. Otherwise, this is equivalent to + ``to_table().take(indices)``. 
+
+        Parameters
+        ----------
+        indices : Array or array-like
+            indices of rows to select in the dataset.
+
+        Returns
+        -------
+        Table
+        """
+    def head(self, num_rows: int) -> lib.Table:
+        """
+        Load the first N rows of the dataset.
+
+        Parameters
+        ----------
+        num_rows : int
+            The number of rows to load.
+
+        Returns
+        -------
+        Table
+        """
+    def count_rows(self) -> int:
+        """
+        Count rows matching the scanner filter.
+
+        Returns
+        -------
+        count : int
+        """
+    def to_reader(self) -> RecordBatchReader:
+        """Consume this scanner as a RecordBatchReader.
+
+        Returns
+        -------
+        RecordBatchReader
+        """
+
+def get_partition_keys(partition_expression: Expression) -> dict[str, Any]:
+    """
+    Extract partition keys (equality constraints between a field and a scalar)
+    from an expression as a dict mapping the field's name to its value.
+
+    NB: All expressions yielded by a HivePartitioning or DirectoryPartitioning
+    will be conjunctions of equality conditions and are accessible through this
+    function. Other subexpressions will be ignored.
+
+    Parameters
+    ----------
+    partition_expression : pyarrow.dataset.Expression
+
+    Returns
+    -------
+    dict
+
+    Examples
+    --------
+
+    For example, an expression of
+    ``((part == "A") and (year == 2016))``
+    is converted to {'part': 'A', 'year': 2016}
+    """
+
+class WrittenFile(lib._Weakrefable):
+    """
+    Metadata information about files written as
+    part of a dataset write operation.
+
+    Parameters
+    ----------
+    path : str
+        Path to the file.
+    metadata : pyarrow.parquet.FileMetaData, optional
+        For Parquet files, the Parquet file metadata.
+    size : int
+        The size of the file in bytes.
+    """
+    def __init__(self, path: str, metadata: _parquet.FileMetaData | None, size: int) -> None: ...
+
+def _filesystemdataset_write(
+    data: Scanner,
+    base_dir: StrPath,
+    basename_template: str,
+    filesystem: SupportedFileSystem,
+    partitioning: Partitioning,
+    file_options: FileWriteOptions,
+    max_partitions: int,
+    file_visitor: Callable[[str], None],
+    existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"],
+    max_open_files: int,
+    max_rows_per_file: int,
+    min_rows_per_group: int,
+    max_rows_per_group: int,
+    create_dir: bool,
+): ...
+
+class _ScanNodeOptions(ExecNodeOptions):
+    def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ...
+
+class ScanNodeOptions(_ScanNodeOptions):
+    """
+    A Source node which yields batches from a Dataset scan.
+
+    This is the option class for the "scan" node factory.
+
+    This node is capable of applying pushdown projections or filters
+    to the file readers which reduce the amount of data that needs to
+    be read (if supported by the file format). But note that this does not
+    construct associated filter or project nodes to perform the final
+    filtering or projection. Rather, you may supply the same filter
+    expression or projection to the scan node that you also supply
+    to the filter or project node.
+
+    Yielded batches will be augmented with fragment/batch indices when
+    implicit_ordering=True to enable stable ordering for simple ExecPlans.
+
+    Parameters
+    ----------
+    dataset : pyarrow.dataset.Dataset
+        The dataset which acts as the data source.
+    **kwargs : dict, optional
+        Scan options. See `Scanner.from_dataset` for possible arguments.
+    require_sequenced_output : bool, default False
+        Batches are yielded sequentially, as in a single-threaded scan.
+    implicit_ordering : bool, default False
+        Preserve implicit ordering of data.
+    """
+
+    def __init__(
+        self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs
+    ) -> None: ...
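To make the partitioning, discovery and Scanner stubs above easier to follow, here is a minimal usage sketch. It assumes a hive-partitioned Parquet directory at /data/events with year/month partition fields and a value column; that path, those column names, and the filter are illustrative only and are not part of the stubs.

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

# Hive-style layout, e.g. /data/events/year=2024/month=11/part-0.parquet
part = ds.partitioning(
    pa.schema([("year", pa.int16()), ("month", pa.int8())]), flavor="hive"
)
dataset = ds.dataset("/data/events", format="parquet", partitioning=part)

# Scanner.from_dataset: project a column plus a derived expression and push
# a filter down to the partition / Parquet-statistics level where possible.
scanner = ds.Scanner.from_dataset(
    dataset,
    columns={
        "value": ds.field("value"),
        "value_x2": pc.multiply(ds.field("value"), 2),
    },
    filter=(ds.field("year") == 2024) & (ds.field("month") >= 6),
)
table = scanner.to_table()      # materialize the matching rows
n = scanner.count_rows()        # dataset-backed scanners can be re-scanned

# get_partition_keys() recovers {'year': ..., 'month': ...} from the
# partition expression attached to each discovered fragment.
for fragment in dataset.get_fragments():
    print(ds.get_partition_keys(fragment.partition_expression))

Note that, per the from_batches docstring above, a scanner built from an iterator of batches can be consumed only once, unlike the dataset-backed scanner in this sketch.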
diff --git a/python/pyarrow/_dataset_orc.pyi b/python/pyarrow/_dataset_orc.pyi new file mode 100644 index 00000000000..9c4ac04198f --- /dev/null +++ b/python/pyarrow/_dataset_orc.pyi @@ -0,0 +1,6 @@ +from ._dataset import FileFormat + +class OrcFileFormat(FileFormat): + def equals(self, other: OrcFileFormat) -> bool: ... + @property + def default_extname(self): ... diff --git a/python/pyarrow/_dataset_parquet.pyi b/python/pyarrow/_dataset_parquet.pyi new file mode 100644 index 00000000000..cbcc17235f1 --- /dev/null +++ b/python/pyarrow/_dataset_parquet.pyi @@ -0,0 +1,314 @@ +from dataclasses import dataclass +from typing import IO, Any, Iterable, TypedDict + +from _typeshed import StrPath + +from ._compute import Expression +from ._dataset import ( + DatasetFactory, + FileFormat, + FileFragment, + FileWriteOptions, + Fragment, + FragmentScanOptions, + Partitioning, + PartitioningFactory, +) +from ._dataset_parquet_encryption import ParquetDecryptionConfig +from ._fs import SupportedFileSystem +from ._parquet import FileDecryptionProperties, FileMetaData +from .lib import CacheOptions, Schema, _Weakrefable + +parquet_encryption_enabled: bool + +class ParquetFileFormat(FileFormat): + """ + FileFormat for Parquet + + Parameters + ---------- + read_options : ParquetReadOptions + Read options for the file. + default_fragment_scan_options : ParquetFragmentScanOptions + Scan Options for the file. + **kwargs : dict + Additional options for read option or scan option + """ + def __init__( + self, + read_options: ParquetReadOptions | None = None, + default_fragment_scan_options: ParquetFragmentScanOptions | None = None, + **kwargs, + ) -> None: ... + @property + def read_options(self) -> ParquetReadOptions: ... + def make_write_options(self) -> ParquetFileWriteOptions: ... # type: ignore[override] + def equals(self, other: ParquetFileFormat) -> bool: ... + @property + def default_extname(self) -> str: ... + def make_fragment( + self, + file: StrPath | IO, + filesystem: SupportedFileSystem | None = None, + partition_expression: Expression | None = None, + row_groups: Iterable[int] | None = None, + *, + file_size: int | None = None, + ) -> Fragment: + """ + Make a FileFragment from a given file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to make a fragment from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + partition_expression : Expression, optional + An expression that is guaranteed true for all rows in the fragment. Allows + fragment to be potentially skipped while scanning with a filter. + row_groups : Iterable, optional + The indices of the row groups to include + file_size : int, optional + The size of the file in bytes. Can improve performance with high-latency filesystems + when file size needs to be known before reading. + + Returns + ------- + fragment : Fragment + The file fragment + """ + +class _NameStats(TypedDict): + min: Any + max: Any + +class RowGroupInfo: + """ + A wrapper class for RowGroup information + + Parameters + ---------- + id : integer + The group ID. + metadata : FileMetaData + The rowgroup metadata. + schema : Schema + Schema of the rows. + """ + + id: int + metadata: FileMetaData + schema: Schema + + def __init__(self, id: int, metadata: FileMetaData, schema: Schema) -> None: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... 
+    @property
+    def statistics(self) -> dict[str, _NameStats]: ...
+
+class ParquetFileFragment(FileFragment):
+    """A Fragment representing a parquet file."""
+
+    def ensure_complete_metadata(self) -> None: ...
+    @property
+    def row_groups(self) -> list[RowGroupInfo]: ...
+    @property
+    def metadata(self) -> FileMetaData: ...
+    @property
+    def num_row_groups(self) -> int:
+        """
+        Return the number of row groups viewed by this fragment (not the
+        number of row groups in the origin file).
+        """
+    def split_by_row_group(
+        self, filter: Expression | None = None, schema: Schema | None = None
+    ) -> list[Fragment]:
+        """
+        Split the fragment into multiple fragments.
+
+        Yield a Fragment wrapping each row group in this ParquetFileFragment.
+        Row groups will be excluded whose metadata contradicts the optional
+        filter.
+
+        Parameters
+        ----------
+        filter : Expression, default None
+            Only include the row groups which satisfy this predicate (using
+            the Parquet RowGroup statistics).
+        schema : Schema, default None
+            Schema to use when filtering row groups. Defaults to the
+            Fragment's physical schema
+
+        Returns
+        -------
+        A list of Fragments
+        """
+    def subset(
+        self,
+        filter: Expression | None = None,
+        schema: Schema | None = None,
+        row_group_ids: list[int] | None = None,
+    ) -> ParquetFileFragment:
+        """
+        Create a subset of the fragment (viewing a subset of the row groups).
+
+        Subset can be specified by either a filter predicate (with optional
+        schema) or by a list of row group IDs. Note that when using a filter,
+        the resulting fragment can be empty (viewing no row groups).
+
+        Parameters
+        ----------
+        filter : Expression, default None
+            Only include the row groups which satisfy this predicate (using
+            the Parquet RowGroup statistics).
+        schema : Schema, default None
+            Schema to use when filtering row groups. Defaults to the
+            Fragment's physical schema
+        row_group_ids : list of ints
+            The row group IDs to include in the subset. Can only be specified
+            if `filter` is None.
+
+        Returns
+        -------
+        ParquetFileFragment
+        """
+
+class ParquetReadOptions(_Weakrefable):
+    """
+    Parquet format specific options for reading.
+
+    Parameters
+    ----------
+    dictionary_columns : list of string, default None
+        Names of columns which should be dictionary encoded as
+        they are read
+    coerce_int96_timestamp_unit : str, default None
+        Cast timestamps that are stored in INT96 format to a particular
+        resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
+        and therefore INT96 timestamps will be inferred as timestamps
+        in nanoseconds
+    """
+    def __init__(
+        self,
+        dictionary_columns: list[str] | None = None,
+        coerce_int96_timestamp_unit: str | None = None,
+    ) -> None: ...
+    @property
+    def coerce_int96_timestamp_unit(self) -> str: ...
+    @coerce_int96_timestamp_unit.setter
+    def coerce_int96_timestamp_unit(self, unit: str) -> None: ...
+    def equals(self, other: ParquetReadOptions) -> bool: ...
+
+class ParquetFileWriteOptions(FileWriteOptions):
+    def update(self, **kwargs) -> None: ...
+    def _set_properties(self) -> None: ...
+    def _set_arrow_properties(self) -> None: ...
+    def _set_encryption_config(self) -> None: ...
+
+@dataclass(kw_only=True)
+class ParquetFragmentScanOptions(FragmentScanOptions):
+    """
+    Scan-specific options for Parquet fragments.
+
+    Parameters
+    ----------
+    use_buffered_stream : bool, default False
+        Read files through buffered input streams rather than loading entire
+        row groups at once. This may be enabled to reduce memory overhead.
+        Disabled by default.
+ buffer_size : int, default 8192 + Size of buffered stream, if enabled. Default is 8KB. + pre_buffer : bool, default True + If enabled, pre-buffer the raw Parquet data instead of issuing one + read per column chunk. This can improve performance on high-latency + filesystems (e.g. S3, GCS) by coalescing and issuing file reads in + parallel using a background I/O thread pool. + Set to False if you want to prioritize minimal memory usage + over maximum speed. + cache_options : pyarrow.CacheOptions, default None + Cache options used when pre_buffer is enabled. The default values should + be good for most use cases. You may want to adjust these for example if + you have exceptionally high latency to the file system. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None + If not None, use the provided ParquetDecryptionConfig to decrypt the + Parquet file. + decryption_properties : pyarrow.parquet.FileDecryptionProperties, default None + If not None, use the provided FileDecryptionProperties to decrypt encrypted + Parquet file. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. + """ + + use_buffered_stream: bool = False + buffer_size: int = 8192 + pre_buffer: bool = True + cache_options: CacheOptions | None = None + thrift_string_size_limit: int | None = None + thrift_container_size_limit: int | None = None + decryption_config: ParquetDecryptionConfig | None = None + decryption_properties: FileDecryptionProperties | None = None + page_checksum_verification: bool = False + + def equals(self, other: ParquetFragmentScanOptions) -> bool: ... + +@dataclass +class ParquetFactoryOptions(_Weakrefable): + """ + Influences the discovery of parquet dataset. + + Parameters + ---------- + partition_base_dir : str, optional + For the purposes of applying the partitioning, paths will be + stripped of the partition_base_dir. Files not matching the + partition_base_dir prefix will be skipped for partitioning discovery. + The ignored files will still be part of the Dataset, but will not + have partition information. + partitioning : Partitioning, PartitioningFactory, optional + The partitioning scheme applied to fragments, see ``Partitioning``. + validate_column_chunk_paths : bool, default False + Assert that all ColumnChunk paths are consistent. The parquet spec + allows for ColumnChunk data to be stored in multiple files, but + ParquetDatasetFactory supports only a single file with all ColumnChunk + data. If this flag is set construction of a ParquetDatasetFactory will + raise an error if ColumnChunk data is not resident in a single file. + """ + + partition_base_dir: str | None = None + partitioning: Partitioning | PartitioningFactory | None = None + validate_column_chunk_paths: bool = False + +class ParquetDatasetFactory(DatasetFactory): + """ + Create a ParquetDatasetFactory from a Parquet `_metadata` file. + + Parameters + ---------- + metadata_path : str + Path to the `_metadata` parquet metadata-only file generated with + `pyarrow.parquet.write_metadata`. 
+ filesystem : pyarrow.fs.FileSystem + Filesystem to read the metadata_path from, and subsequent parquet + files. + format : ParquetFileFormat + Parquet format options. + options : ParquetFactoryOptions, optional + Various flags influencing the discovery of filesystem paths. + """ + def __init__( + self, + metadata_path: str, + filesystem: SupportedFileSystem, + format: FileFormat, + options: ParquetFactoryOptions | None = None, + ) -> None: ... diff --git a/python/pyarrow/_dataset_parquet_encryption.pyi b/python/pyarrow/_dataset_parquet_encryption.pyi new file mode 100644 index 00000000000..7623275b865 --- /dev/null +++ b/python/pyarrow/_dataset_parquet_encryption.pyi @@ -0,0 +1,85 @@ +from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions +from ._parquet import FileDecryptionProperties +from ._parquet_encryption import CryptoFactory, EncryptionConfiguration, KmsConnectionConfig +from .lib import _Weakrefable + +class ParquetEncryptionConfig(_Weakrefable): + """ + Core configuration class encapsulating parameters for high-level encryption + within the Parquet framework. + + The ParquetEncryptionConfig class serves as a bridge for passing encryption-related + parameters to the appropriate components within the Parquet library. It maintains references + to objects that define the encryption strategy, Key Management Service (KMS) configuration, + and specific encryption configurations for Parquet data. + + Parameters + ---------- + crypto_factory : pyarrow.parquet.encryption.CryptoFactory + Shared pointer to a `CryptoFactory` object. The `CryptoFactory` is responsible for + creating cryptographic components, such as encryptors and decryptors. + kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig + Shared pointer to a `KmsConnectionConfig` object. This object holds the configuration + parameters necessary for connecting to a Key Management Service (KMS). + encryption_config : pyarrow.parquet.encryption.EncryptionConfiguration + Shared pointer to an `EncryptionConfiguration` object. This object defines specific + encryption settings for Parquet data, including the keys assigned to different columns. + + Raises + ------ + ValueError + Raised if `encryption_config` is None. + """ + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> None: ... + +class ParquetDecryptionConfig(_Weakrefable): + """ + Core configuration class encapsulating parameters for high-level decryption + within the Parquet framework. + + ParquetDecryptionConfig is designed to pass decryption-related parameters to + the appropriate decryption components within the Parquet library. It holds references to + objects that define the decryption strategy, Key Management Service (KMS) configuration, + and specific decryption configurations for reading encrypted Parquet data. + + Parameters + ---------- + crypto_factory : pyarrow.parquet.encryption.CryptoFactory + Shared pointer to a `CryptoFactory` object, pivotal in creating cryptographic + components for the decryption process. + kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig + Shared pointer to a `KmsConnectionConfig` object, containing parameters necessary + for connecting to a Key Management Service (KMS) during decryption. 
+ decryption_config : pyarrow.parquet.encryption.DecryptionConfiguration + Shared pointer to a `DecryptionConfiguration` object, specifying decryption settings + for reading encrypted Parquet data. + + Raises + ------ + ValueError + Raised if `decryption_config` is None. + """ + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> None: ... + +def set_encryption_config( + opts: ParquetFileWriteOptions, + config: ParquetEncryptionConfig, +) -> None: ... +def set_decryption_properties( + opts: ParquetFragmentScanOptions, + config: FileDecryptionProperties, +): ... +def set_decryption_config( + opts: ParquetFragmentScanOptions, + config: ParquetDecryptionConfig, +): ... diff --git a/python/pyarrow/_feather.pyi b/python/pyarrow/_feather.pyi new file mode 100644 index 00000000000..8bb914ba45d --- /dev/null +++ b/python/pyarrow/_feather.pyi @@ -0,0 +1,29 @@ +from typing import IO + +from _typeshed import StrPath + +from .lib import Buffer, NativeFile, Table, _Weakrefable + +class FeatherError(Exception): ... + +def write_feather( + table: Table, + dest: StrPath | IO | NativeFile, + compression: str | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: int = 2, +): ... + +class FeatherReader(_Weakrefable): + def __init__( + self, + source: StrPath | IO | NativeFile | Buffer, + use_memory_map: bool, + use_threads: bool, + ) -> None: ... + @property + def version(self) -> str: ... + def read(self) -> Table: ... + def read_indices(self, indices: list[int]) -> Table: ... + def read_names(self, names: list[str]) -> Table: ... diff --git a/python/pyarrow/_flight.pyi b/python/pyarrow/_flight.pyi new file mode 100644 index 00000000000..4450c42df49 --- /dev/null +++ b/python/pyarrow/_flight.pyi @@ -0,0 +1,1380 @@ +import asyncio +import enum +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Generator, Generic, Iterable, Iterator, NamedTuple, TypeVar + +from typing_extensions import deprecated + +from .ipc import _ReadPandasMixin +from .lib import ( + ArrowCancelled, + ArrowException, + ArrowInvalid, + Buffer, + IpcReadOptions, + IpcWriteOptions, + RecordBatch, + RecordBatchReader, + Schema, + Table, + TimestampScalar, + _CRecordBatchWriter, + _Weakrefable, +) + +_T = TypeVar("_T") + +class FlightCallOptions(_Weakrefable): + """RPC-layer options for a Flight call.""" + + def __init__( + self, + timeout: float | None = None, + write_options: IpcWriteOptions | None = None, + headers: list[tuple[str, str]] | None = None, + read_options: IpcReadOptions | None = None, + ) -> None: + """Create call options. + + Parameters + ---------- + timeout : float, None + A timeout for the call, in seconds. None means that the + timeout defaults to an implementation-specific value. + write_options : pyarrow.ipc.IpcWriteOptions, optional + IPC write options. The default options can be controlled + by environment variables (see pyarrow.ipc). + headers : List[Tuple[str, str]], optional + A list of arbitrary headers as key, value tuples + read_options : pyarrow.ipc.IpcReadOptions, optional + Serialization options for reading IPC format. + """ + +class CertKeyPair(NamedTuple): + """A TLS certificate and key for use in Flight.""" + + cert: str + key: str + +class FlightError(Exception): + """ + The base class for Flight-specific errors. 
+ + A server may raise this class or one of its subclasses to provide + a more detailed error to clients. + + Parameters + ---------- + message : str, optional + The error message. + extra_info : bytes, optional + Extra binary error details that were provided by the + server/will be sent to the client. + + Attributes + ---------- + extra_info : bytes + Extra binary error details that were provided by the + server/will be sent to the client. + """ + + extra_info: bytes + +class FlightInternalError(FlightError, ArrowException): + """An error internal to the Flight server occurred.""" + +class FlightTimedOutError(FlightError, ArrowException): + """The Flight RPC call timed out.""" + +class FlightCancelledError(FlightError, ArrowCancelled): + """The operation was cancelled.""" + +class FlightServerError(FlightError, ArrowException): + """A server error occurred.""" + +class FlightUnauthenticatedError(FlightError, ArrowException): + """The client is not authenticated.""" + +class FlightUnauthorizedError(FlightError, ArrowException): + """The client is not authorized to perform the given operation.""" + +class FlightUnavailableError(FlightError, ArrowException): + """The server is not reachable or available.""" + +class FlightWriteSizeExceededError(ArrowInvalid): + """A write operation exceeded the client-configured limit.""" + + limit: int + actual: int + +class Action(_Weakrefable): + """An action executable on a Flight service.""" + + def __init__(self, action_type: bytes | str, buf: Buffer | bytes) -> None: + """Create an action from a type and a buffer. + + Parameters + ---------- + action_type : bytes or str + buf : Buffer or bytes-like object + """ + @property + def type(self) -> str: + """The action type.""" + @property + def body(self) -> Buffer: + """The action body (arguments for the action).""" + def serialize(self) -> bytes: + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + @classmethod + def deserialize(cls, serialized: bytes) -> Self: + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + +class ActionType(NamedTuple): + """A type of action that is executable on a Flight service.""" + + type: str + description: str + + def make_action(self, buf: Buffer | bytes) -> Action: + """Create an Action with this type. + + Parameters + ---------- + buf : obj + An Arrow buffer or Python bytes or bytes-like object. + """ + +class Result(_Weakrefable): + """A result from executing an Action.""" + def __init__(self, buf: Buffer | bytes) -> None: + """Create a new result. + + Parameters + ---------- + buf : Buffer or bytes-like object + """ + @property + def body(self) -> Buffer: + """Get the Buffer containing the result.""" + def serialize(self) -> bytes: + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + @classmethod + def deserialize(cls, serialized: bytes) -> Self: + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. 
+ + """ + +class BasicAuth(_Weakrefable): + """A container for basic auth.""" + def __init__( + self, username: str | bytes | None = None, password: str | bytes | None = None + ) -> None: + """Create a new basic auth object. + + Parameters + ---------- + username : string + password : string + """ + @property + def username(self) -> bytes: ... + @property + def password(self) -> bytes: ... + def serialize(self) -> str: ... + @staticmethod + def deserialize(serialized: str | bytes) -> BasicAuth: ... + +class DescriptorType(enum.Enum): + """ + The type of a FlightDescriptor. + + Attributes + ---------- + + UNKNOWN + An unknown descriptor type. + + PATH + A Flight stream represented by a path. + + CMD + A Flight stream represented by an application-defined command. + + """ + + UNKNOWN = 0 + PATH = 1 + CMD = 2 + +class FlightMethod(enum.Enum): + """The implemented methods in Flight.""" + + INVALID = 0 + HANDSHAKE = 1 + LIST_FLIGHTS = 2 + GET_FLIGHT_INFO = 3 + GET_SCHEMA = 4 + DO_GET = 5 + DO_PUT = 6 + DO_ACTION = 7 + LIST_ACTIONS = 8 + DO_EXCHANGE = 9 + +class FlightDescriptor(_Weakrefable): + """A description of a data stream available from a Flight service.""" + @staticmethod + def for_path(*path: str | bytes) -> FlightDescriptor: + """Create a FlightDescriptor for a resource path.""" + + @staticmethod + def for_command(command: str | bytes) -> FlightDescriptor: + """Create a FlightDescriptor for an opaque command.""" + @property + def descriptor_type(self) -> DescriptorType: + """Get the type of this descriptor.""" + @property + def path(self) -> list[bytes] | None: + """Get the path for this descriptor.""" + @property + def command(self) -> bytes | None: + """Get the command for this descriptor.""" + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class Ticket(_Weakrefable): + """A ticket for requesting a Flight stream.""" + def __init__(self, ticket: str | bytes) -> None: ... + @property + def ticket(self) -> bytes: ... + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class Location(_Weakrefable): + """The location of a Flight service.""" + def __init__(self, uri: str | bytes) -> None: ... + @property + def uri(self) -> bytes: ... + def equals(self, other: Location) -> bool: ... + @staticmethod + def for_grpc_tcp(host: str | bytes, port: int) -> Location: + """Create a Location for a TCP-based gRPC service.""" + @staticmethod + def for_grpc_tls(host: str | bytes, port: int) -> Location: + """Create a Location for a TLS-based gRPC service.""" + @staticmethod + def for_grpc_unix(path: str | bytes) -> Location: + """Create a Location for a domain socket-based gRPC service.""" + +class FlightEndpoint(_Weakrefable): + """A Flight stream, along with the ticket and locations to access it.""" + def __init__( + self, + ticket: Ticket | str | bytes, + locations: list[str | Location], + expiration_time: TimestampScalar | None = ..., + app_metadata: bytes | str = ..., + ): + """Create a FlightEndpoint from a ticket and list of locations. + + Parameters + ---------- + ticket : Ticket or bytes + the ticket needed to access this flight + locations : list of string URIs + locations where this flight is available + expiration_time : TimestampScalar, default None + Expiration time of this stream. If present, clients may assume + they can retry DoGet requests. Otherwise, clients should avoid + retrying DoGet requests. 
+ app_metadata : bytes or str, default "" + Application-defined opaque metadata. + + Raises + ------ + ArrowException + If one of the location URIs is not a valid URI. + """ + @property + def ticket(self) -> Ticket: + """Get the ticket in this endpoint.""" + @property + def locations(self) -> list[Location]: + """Get locations where this flight is available.""" + def serialize(self) -> bytes: ... + @property + def expiration_time(self) -> TimestampScalar | None: + """Get the expiration time of this stream. + + If present, clients may assume they can retry DoGet requests. + Otherwise, clients should avoid retrying DoGet requests. + + """ + @property + def app_metadata(self) -> bytes | str: + """Get application-defined opaque metadata.""" + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class SchemaResult(_Weakrefable): + """The serialized schema returned from a GetSchema request.""" + def __init__(self, schema: Schema) -> None: + """Create a SchemaResult from a schema. + + Parameters + ---------- + schema: Schema + the schema of the data in this flight. + """ + @property + def schema(self) -> Schema: + """The schema of the data in this flight.""" + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class FlightInfo(_Weakrefable): + """A description of a Flight stream.""" + def __init__( + self, + schema: Schema, + descriptor: FlightDescriptor, + endpoints: list[FlightEndpoint], + total_records: int = ..., + total_bytes: int = ..., + ordered: bool = ..., + app_metadata: bytes | str = ..., + ) -> None: + """Create a FlightInfo object from a schema, descriptor, and endpoints. + + Parameters + ---------- + schema : Schema + the schema of the data in this flight. + descriptor : FlightDescriptor + the descriptor for this flight. + endpoints : list of FlightEndpoint + a list of endpoints where this flight is available. + total_records : int, default None + the total records in this flight, -1 or None if unknown. + total_bytes : int, default None + the total bytes in this flight, -1 or None if unknown. + ordered : boolean, default False + Whether endpoints are in the same order as the data. + app_metadata : bytes or str, default "" + Application-defined opaque metadata. + """ + @property + def schema(self) -> Schema: + """The schema of the data in this flight.""" + @property + def descriptor(self) -> FlightDescriptor: + """The descriptor of the data in this flight.""" + @property + def endpoints(self) -> list[FlightEndpoint]: + """The endpoints where this flight is available.""" + @property + def total_records(self) -> int: + """The total record count of this flight, or -1 if unknown.""" + @property + def total_bytes(self) -> int: + """The size in bytes of the data in this flight, or -1 if unknown.""" + @property + def ordered(self) -> bool: + """Whether endpoints are in the same order as the data.""" + @property + def app_metadata(self) -> bytes | str: + """ + Application-defined opaque metadata. + + There is no inherent or required relationship between this and the + app_metadata fields in the FlightEndpoints or resulting FlightData + messages. Since this metadata is application-defined, a given + application could define there to be a relationship, but there is + none required by the spec. + + """ + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... 
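The descriptor, ticket, endpoint and info types stubbed above are mostly consumed through FlightClient (stubbed further below). The sketch below shows that typical client-side flow; the server address grpc://localhost:8815 and the JSON command payload are illustrative assumptions only.

import pyarrow.flight as flight

client = flight.FlightClient("grpc://localhost:8815")  # assumed server location

# Describe the desired stream by an opaque, application-defined command.
descriptor = flight.FlightDescriptor.for_command(b'{"query": "recent_events"}')
info = client.get_flight_info(descriptor)   # FlightInfo
print(info.schema)                          # schema of the stream
print(info.total_records)                   # -1 when the server does not know

# A FlightInfo carries one FlightEndpoint per retrievable chunk of the
# stream; each endpoint holds the Ticket to pass to do_get().
tables = [client.do_get(endpoint.ticket).read_all() for endpoint in info.endpoints]

# serialize()/deserialize() round-trip the wire format, e.g. when relaying
# Flight metadata through a non-Flight channel such as REST.
restored = flight.FlightInfo.deserialize(info.serialize())

If an endpoint lists explicit locations, a client would normally connect to one of those before calling do_get; an empty list means the data can be fetched from the server that returned the FlightInfo.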
+ +class FlightStreamChunk(_Weakrefable): + """A RecordBatch with application metadata on the side.""" + @property + def data(self) -> RecordBatch | None: ... + @property + def app_metadata(self) -> Buffer | None: ... + def __iter__(self): ... + +class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): + """A reader for Flight streams.""" + + # Needs to be separate class so the "real" class can subclass the + # pure-Python mixin class + + def __iter__(self) -> Self: ... + def __next__(self) -> FlightStreamChunk: ... + @property + def schema(self) -> Schema: + """Get the schema for this reader.""" + def read_all(self) -> Table: + """Read the entire contents of the stream as a Table.""" + def read_chunk(self) -> FlightStreamChunk: + """Read the next FlightStreamChunk along with any metadata. + + Returns + ------- + chunk : FlightStreamChunk + The next FlightStreamChunk in the stream. + + Raises + ------ + StopIteration + when the stream is finished + """ + def to_reader(self) -> RecordBatchReader: + """Convert this reader into a regular RecordBatchReader. + + This may fail if the schema cannot be read from the remote end. + + Returns + ------- + RecordBatchReader + """ + +class MetadataRecordBatchReader(_MetadataRecordBatchReader): + """The base class for readers for Flight streams. + + See Also + -------- + FlightStreamReader + """ + +class FlightStreamReader(MetadataRecordBatchReader): + """A reader that can also be canceled.""" + def cancel(self) -> None: + """Cancel the read operation.""" + def read_all(self) -> Table: + """Read the entire contents of the stream as a Table.""" + +class MetadataRecordBatchWriter(_CRecordBatchWriter): + """A RecordBatchWriter that also allows writing application metadata. + + This class is a context manager; on exit, close() will be called. + """ + + def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: + """Prepare to write data to this stream with the given schema.""" + def write_metadata(self, buf: Buffer) -> None: + """Write Flight metadata by itself.""" + def write_batch(self, batch: RecordBatch) -> None: # type: ignore[override] + """ + Write RecordBatch to stream. + + Parameters + ---------- + batch : RecordBatch + """ + def write_table(self, table: Table, max_chunksize: int | None = None, **kwargs) -> None: + """ + Write Table to stream in (contiguous) RecordBatch objects. + + Parameters + ---------- + table : Table + max_chunksize : int, default None + Maximum number of rows for RecordBatch chunks. Individual chunks may + be smaller depending on the chunk layout of individual columns. + """ + def close(self) -> None: + """ + Close stream and write end-of-stream 0 marker. + """ + def write_with_metadata(self, batch: RecordBatch, buf: Buffer) -> None: + """Write a RecordBatch along with Flight metadata. + + Parameters + ---------- + batch : RecordBatch + The next RecordBatch in the stream. + buf : Buffer + Application-specific metadata for the batch as defined by + Flight. 
+ """ + +class FlightStreamWriter(MetadataRecordBatchWriter): + """A writer that also allows closing the write side of a stream.""" + def done_writing(self) -> None: + """Indicate that the client is done writing, but not done reading.""" + +class FlightMetadataReader(_Weakrefable): + """A reader for Flight metadata messages sent during a DoPut.""" + def read(self) -> Buffer | None: + """Read the next metadata message.""" + +class FlightMetadataWriter(_Weakrefable): + """A sender for Flight metadata messages during a DoPut.""" + def write(self, message: Buffer) -> None: + """Write the next metadata message. + + Parameters + ---------- + message : Buffer + """ + +class AsyncioCall(Generic[_T]): + """State for an async RPC using asyncio.""" + + _future: asyncio.Future[_T] + + def as_awaitable(self) -> asyncio.Future[_T]: ... + def wakeup(self, result_or_exception: BaseException | _T) -> None: ... + +class AsyncioFlightClient: + """ + A FlightClient with an asyncio-based async interface. + + This interface is EXPERIMENTAL. + """ + + def __init__(self, client: FlightClient) -> None: ... + async def get_flight_info( + self, + descriptor: FlightDescriptor, + *, + options: FlightCallOptions | None = None, + ): ... + +class FlightClient(_Weakrefable): + """A client to a Flight service. + + Connect to a Flight service on the given host and port. + + Parameters + ---------- + location : str, tuple or Location + Location to connect to. Either a gRPC URI like `grpc://localhost:port`, + a tuple of (host, port) pair, or a Location instance. + tls_root_certs : bytes or None + PEM-encoded + cert_chain: bytes or None + Client certificate if using mutual TLS + private_key: bytes or None + Client private key for cert_chain is using mutual TLS + override_hostname : str or None + Override the hostname checked by TLS. Insecure, use with caution. + middleware : list optional, default None + A list of ClientMiddlewareFactory instances. + write_size_limit_bytes : int optional, default None + A soft limit on the size of a data payload sent to the + server. Enabled if positive. If enabled, writing a record + batch that (when serialized) exceeds this limit will raise an + exception; the client can retry the write with a smaller + batch. + disable_server_verification : boolean optional, default False + A flag that indicates that, if the client is connecting + with TLS, that it skips server verification. If this is + enabled, all other TLS settings are overridden. + generic_options : list optional, default None + A list of generic (string, int or string) option tuples passed + to the underlying transport. Effect is implementation + dependent. + """ + def __init__( + self, + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: list[tuple[str, int | str]] | None = None, + ): ... + @property + def supports_async(self) -> bool: ... + def as_async(self) -> AsyncioFlightClient: ... + def wait_for_available(self, timeout: int = 5) -> None: + """Block until the server can be contacted. + + Parameters + ---------- + timeout : int, default 5 + The maximum seconds to wait. + """ + @deprecated( + "Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead." 
+ ) + @classmethod + def connect( + cls, + location: str | tuple[str, int] | Location, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + disable_server_verification: bool = False, + ) -> FlightClient: + """Connect to a Flight server. + + .. deprecated:: 0.15.0 + Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead. + """ + def authenticate( + self, auth_handler: ClientAuthHandler, options: FlightCallOptions | None = None + ) -> None: + """Authenticate to the server. + + Parameters + ---------- + auth_handler : ClientAuthHandler + The authentication mechanism to use. + options : FlightCallOptions + Options for this call. + """ + def authenticate_basic_token( + self, username: str, password: str, options: FlightCallOptions | None = None + ) -> tuple[str, str]: + """Authenticate to the server with HTTP basic authentication. + + Parameters + ---------- + username : string + Username to authenticate with + password : string + Password to authenticate with + options : FlightCallOptions + Options for this call + + Returns + ------- + tuple : Tuple[str, str] + A tuple representing the FlightCallOptions authorization + header entry of a bearer token. + """ + def list_actions(self, options: FlightCallOptions | None = None) -> list[Action]: + """List the actions available on a service.""" + def do_action( + self, action: Action, options: FlightCallOptions | None = None + ) -> Iterator[Result]: + """ + Execute an action on a service. + + Parameters + ---------- + action : str, tuple, or Action + Can be action type name (no body), type and body, or any Action + object + options : FlightCallOptions + RPC options + + Returns + ------- + results : iterator of Result values + """ + def list_flights( + self, criteria: str | None = None, options: FlightCallOptions | None = None + ) -> Generator[FlightInfo, None, None]: + """List the flights available on a service.""" + def get_flight_info( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> FlightInfo: + """Request information about an available flight.""" + def get_schema( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> Schema: + """Request schema for an available flight.""" + def do_get( + self, ticket: Ticket, options: FlightCallOptions | None = None + ) -> FlightStreamReader: + """Request the data for a flight. + + Returns + ------- + reader : FlightStreamReader + """ + def do_put( + self, + descriptor: FlightDescriptor, + schema: Schema, + options: FlightCallOptions | None = None, + ) -> tuple[FlightStreamWriter, FlightStreamReader]: + """Upload data to a flight. + + Returns + ------- + writer : FlightStreamWriter + reader : FlightMetadataReader + """ + def do_exchange( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> tuple[FlightStreamWriter, FlightStreamReader]: + """Start a bidirectional data exchange with a server. + + Parameters + ---------- + descriptor : FlightDescriptor + A descriptor for the flight. + options : FlightCallOptions + RPC options. + + Returns + ------- + writer : FlightStreamWriter + reader : FlightStreamReader + """ + def close(self) -> None: + """Close the client and disconnect.""" + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, traceback) -> None: ... + +class FlightDataStream(_Weakrefable): + """ + Abstract base class for Flight data streams. 
+ + See Also + -------- + RecordBatchStream + GeneratorStream + """ + +class RecordBatchStream(FlightDataStream): + """A Flight data stream backed by RecordBatches. + + The remainder of this DoGet request will be handled in C++, + without having to acquire the GIL. + + """ + def __init__( + self, data_source: RecordBatchReader | Table, options: IpcWriteOptions | None = None + ) -> None: + """Create a RecordBatchStream from a data source. + + Parameters + ---------- + data_source : RecordBatchReader or Table + The data to stream to the client. + options : pyarrow.ipc.IpcWriteOptions, optional + Optional IPC options to control how to write the data. + """ + +class GeneratorStream(FlightDataStream): + """A Flight data stream backed by a Python generator.""" + def __init__( + self, + schema: Schema, + generator: Iterable[FlightDataStream | Table | RecordBatch | RecordBatchReader], + options: IpcWriteOptions | None = None, + ) -> None: + """Create a GeneratorStream from a Python generator. + + Parameters + ---------- + schema : Schema + The schema for the data to be returned. + + generator : iterator or iterable + The generator should yield other FlightDataStream objects, + Tables, RecordBatches, or RecordBatchReaders. + + options : pyarrow.ipc.IpcWriteOptions, optional + """ + +class ServerCallContext(_Weakrefable): + """Per-call state/context.""" + def peer_identity(self) -> bytes: + """Get the identity of the authenticated peer. + + May be the empty string. + """ + def peer(self) -> str: + """Get the address of the peer.""" + # Set safe=True as gRPC on Windows sometimes gives garbage bytes + def is_cancelled(self) -> bool: + """Check if the current RPC call has been canceled by the client.""" + def add_header(self, key: str, value: str) -> None: + """Add a response header.""" + def add_trailer(self, key: str, value: str) -> None: + """Add a response trailer.""" + def get_middleware(self, key: str) -> ServerMiddleware | None: + """ + Get a middleware instance by key. + + Returns None if the middleware was not found. + """ + +class ServerAuthReader(_Weakrefable): + """A reader for messages from the client during an auth handshake.""" + def read(self) -> str: ... + +class ServerAuthSender(_Weakrefable): + """A writer for messages to the client during an auth handshake.""" + def write(self, message: str) -> None: ... + +class ClientAuthReader(_Weakrefable): + """A reader for messages from the server during an auth handshake.""" + def read(self) -> str: ... + +class ClientAuthSender(_Weakrefable): + """A writer for messages to the server during an auth handshake.""" + def write(self, message: str) -> None: ... + +class ServerAuthHandler(_Weakrefable): + """Authentication middleware for a server. + + To implement an authentication mechanism, subclass this class and + override its methods. + + """ + def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): + """Conduct the handshake with the client. + + May raise an error if the client cannot authenticate. + + Parameters + ---------- + outgoing : ServerAuthSender + A channel to send messages to the client. + incoming : ServerAuthReader + A channel to read messages from the client. + """ + def is_valid(self, token: str) -> bool: + """Validate a client token, returning their identity. + + May return an empty string (if the auth mechanism does not + name the peer) or raise an exception (if the token is + invalid). + + Parameters + ---------- + token : bytes + The authentication token from the client. 
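[Editor's sketch, not part of the patch] A minimal ServerAuthHandler subclass built only from the methods typed above; the fixed token value is an assumption, and real handlers should validate credentials properly.

import pyarrow.flight as flight

class StaticTokenAuthHandler(flight.ServerAuthHandler):
    """Accept every client and hand out one hypothetical token."""

    def authenticate(self, outgoing, incoming):
        # Read whatever the client sent, then reply with a static token.
        incoming.read()
        outgoing.write("example-token")

    def is_valid(self, token):
        # Return the peer identity for a known token; raise otherwise.
        if token != "example-token":
            raise flight.FlightUnauthenticatedError("invalid token")
        return ""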
+ + """ + +class ClientAuthHandler(_Weakrefable): + """Authentication plugin for a client.""" + def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): + """Conduct the handshake with the server. + + Parameters + ---------- + outgoing : ClientAuthSender + A channel to send messages to the server. + incoming : ClientAuthReader + A channel to read messages from the server. + """ + def get_token(self) -> str: + """Get the auth token for a call.""" + +class CallInfo(NamedTuple): + """Information about a particular RPC for Flight middleware.""" + + method: FlightMethod + +class ClientMiddlewareFactory(_Weakrefable): + """A factory for new middleware instances. + + All middleware methods will be called from the same thread as the + RPC method implementation. That is, thread-locals set in the + client are accessible from the middleware itself. + + """ + def start_call(self, info: CallInfo) -> ClientMiddleware | None: + """Called at the start of an RPC. + + This must be thread-safe and must not raise exceptions. + + Parameters + ---------- + info : CallInfo + Information about the call. + + Returns + ------- + instance : ClientMiddleware + An instance of ClientMiddleware (the instance to use for + the call), or None if this call is not intercepted. + + """ + +class ClientMiddleware(_Weakrefable): + """Client-side middleware for a call, instantiated per RPC. + + Methods here should be fast and must be infallible: they should + not raise exceptions or stall indefinitely. + + """ + + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: + """A callback before headers are sent. + + Returns + ------- + headers : dict + A dictionary of header values to add to the request, or + None if no headers are to be added. The dictionary should + have string keys and string or list-of-string values. + + Bytes values are allowed, but the underlying transport may + not support them or may restrict them. For gRPC, binary + values are only allowed on headers ending in "-bin". + + Header names must be lowercase ASCII. + + """ + + def received_headers(self, headers: dict[str, list[str] | list[bytes]]): + """A callback when headers are received. + + The default implementation does nothing. + + Parameters + ---------- + headers : dict + A dictionary of headers from the server. Keys are strings + and values are lists of strings (for text headers) or + bytes (for binary headers). + + """ + + def call_completed(self, exception: ArrowException): + """A callback when the call finishes. + + The default implementation does nothing. + + Parameters + ---------- + exception : ArrowException + If the call errored, this is the equivalent + exception. Will be None if the call succeeded. + + """ + +class ServerMiddlewareFactory(_Weakrefable): + """A factory for new middleware instances. + + All middleware methods will be called from the same thread as the + RPC method implementation. That is, thread-locals set in the + middleware are accessible from the method itself. + + """ + + def start_call( + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> ServerMiddleware | None: + """Called at the start of an RPC. + + This must be thread-safe. + + Parameters + ---------- + info : CallInfo + Information about the call. + headers : dict + A dictionary of headers from the client. Keys are strings + and values are lists of strings (for text headers) or + bytes (for binary headers). 
+ + Returns + ------- + instance : ServerMiddleware + An instance of ServerMiddleware (the instance to use for + the call), or None if this call is not intercepted. + + Raises + ------ + exception : pyarrow.ArrowException + If an exception is raised, the call will be rejected with + the given error. + + """ + +class TracingServerMiddlewareFactory(ServerMiddlewareFactory): + """A factory for tracing middleware instances. + + This enables OpenTelemetry support in Arrow (if Arrow was compiled + with OpenTelemetry support enabled). A new span will be started on + each RPC call. The TracingServerMiddleware instance can then be + retrieved within an RPC handler to get the propagated context, + which can be used to start a new span on the Python side. + + Because the Python/C++ OpenTelemetry libraries do not + interoperate, spans on the C++ side are not directly visible to + the Python side and vice versa. + + """ + +class ServerMiddleware(_Weakrefable): + """Server-side middleware for a call, instantiated per RPC. + + Methods here should be fast and must be infallible: they should + not raise exceptions or stall indefinitely. + + """ + + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: + """A callback before headers are sent. + + Returns + ------- + headers : dict + A dictionary of header values to add to the response, or + None if no headers are to be added. The dictionary should + have string keys and string or list-of-string values. + + Bytes values are allowed, but the underlying transport may + not support them or may restrict them. For gRPC, binary + values are only allowed on headers ending in "-bin". + + Header names must be lowercase ASCII. + + """ + def call_completed(self, exception: ArrowException): + """A callback when the call finishes. + + Parameters + ---------- + exception : pyarrow.ArrowException + If the call errored, this is the equivalent + exception. Will be None if the call succeeded. + + """ + +class TracingServerMiddleware(ServerMiddleware): + trace_context: dict + def __init__(self, trace_context: dict) -> None: ... + +class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): + """Wrapper to bundle server middleware into a single C++ one.""" + + def __init__(self, factories: dict[str, ServerMiddlewareFactory]) -> None: ... + def start_call( # type: ignore[override] + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> _ServerMiddlewareFactoryWrapper | None: ... + +class _ServerMiddlewareWrapper(ServerMiddleware): + def __init__(self, middleware: dict[str, ServerMiddleware]) -> None: ... + def send_headers(self) -> dict[str, dict[str, list[str] | list[bytes]]]: ... + def call_completed(self, exception: ArrowException) -> None: ... + +class _FlightServerFinalizer(_Weakrefable): + """ + A finalizer that shuts down the server on destruction. + + See ARROW-16597. If the server is still active at interpreter + exit, the process may segfault. + """ + + def finalize(self) -> None: ... + +class FlightServerBase(_Weakrefable): + """A Flight service definition. + + To start the server, create an instance of this class with an + appropriate location. The server will be running as soon as the + instance is created; it is not required to call :meth:`serve`. + + Override methods to define your Flight service. + + Parameters + ---------- + location : str, tuple or Location optional, default None + Location to serve on. Either a gRPC URI like `grpc://localhost:port`, + a tuple of (host, port) pair, or a Location instance. 
+ If None is passed then the server will be started on localhost with a + system provided random port. + auth_handler : ServerAuthHandler optional, default None + An authentication mechanism to use. May be None. + tls_certificates : list optional, default None + A list of (certificate, key) pairs. + verify_client : boolean optional, default False + If True, then enable mutual TLS: require the client to present + a client certificate, and validate the certificate. + root_certificates : bytes optional, default None + If enabling mutual TLS, this specifies the PEM-encoded root + certificate used to validate client certificates. + middleware : dict optional, default None + A dictionary of :class:`ServerMiddlewareFactory` instances. The + string keys can be used to retrieve the middleware instance within + RPC handlers (see :meth:`ServerCallContext.get_middleware`). + + """ + def __init__( + self, + location: str | tuple[str, int] | Location | None = None, + auth_handler: ServerAuthHandler | None = None, + tls_certificates: list[tuple[str, str]] | None = None, + verify_client: bool = False, + root_certificates: str | None = None, + middleware: dict[str, ServerMiddlewareFactory] | None = None, + ): ... + @property + def port(self) -> int: + """ + Get the port that this server is listening on. + + Returns a non-positive value if the operation is invalid + (e.g. init() was not called or server is listening on a domain + socket). + """ + def list_flights(self, context: ServerCallContext, criteria: str) -> Iterator[FlightInfo]: + """List flights available on this service. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + criteria : bytes + Filter criteria provided by the client. + + Returns + ------- + iterator of FlightInfo + + """ + def get_flight_info( + self, context: ServerCallContext, descriptor: FlightDescriptor + ) -> FlightInfo: + """Get information about a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + + Returns + ------- + FlightInfo + + """ + def get_schema(self, context: ServerCallContext, descriptor: FlightDescriptor) -> Schema: + """Get the schema of a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + + Returns + ------- + Schema + + """ + def do_put( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: FlightMetadataWriter, + ) -> None: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + reader : MetadataRecordBatchReader + A reader for data uploaded by the client. + writer : FlightMetadataWriter + A writer to send responses to the client. 
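[Editor's sketch, not part of the patch] A small service built on the base class typed above, overriding do_put and do_get; the in-memory dict, the location, and the "ticket equals descriptor path" convention are assumptions.

import pyarrow.flight as flight

class InMemoryFlightServer(flight.FlightServerBase):
    """Store uploaded tables in a dict keyed by descriptor path."""

    def __init__(self, location="grpc://0.0.0.0:8815"):
        super().__init__(location)
        self._tables = {}

    def do_put(self, context, descriptor, reader, writer):
        # Drain the client's upload into a Table and remember it.
        self._tables[descriptor.path[0]] = reader.read_all()

    def do_get(self, context, ticket):
        # The ticket body is assumed to match a previously uploaded path.
        table = self._tables[ticket.ticket]
        return flight.RecordBatchStream(table)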
+ + """ + def do_get(self, context: ServerCallContext, ticket: Ticket) -> FlightDataStream: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + ticket : Ticket + The ticket for the flight. + + Returns + ------- + FlightDataStream + A stream of data to send back to the client. + + """ + def do_exchange( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: MetadataRecordBatchWriter, + ) -> None: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + reader : MetadataRecordBatchReader + A reader for data uploaded by the client. + writer : MetadataRecordBatchWriter + A writer to send responses to the client. + + """ + def list_actions(self, context: ServerCallContext) -> Iterable[Action]: + """List custom actions available on this server. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + + Returns + ------- + iterator of ActionType or tuple + + """ + def do_action(self, context: ServerCallContext, action: Action) -> Iterable[bytes]: + """Execute a custom action. + + This method should return an iterator, or it should be a + generator. Applications should override this method to + implement their own behavior. The default method raises a + NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + action : Action + The action to execute. + + Returns + ------- + iterator of bytes + + """ + def serve(self) -> None: + """Block until the server shuts down. + + This method only returns if shutdown() is called or a signal is + received. + """ + def run(self) -> None: + """Block until the server shuts down. + + .. deprecated:: 0.15.0 + Use the ``FlightServer.serve`` method instead + """ + def shutdown(self) -> None: + """Shut down the server, blocking until current requests finish. + + Do not call this directly from the implementation of a Flight + method, as then the server will block forever waiting for that + request to finish. Instead, call this method from a background + thread. + + This method should only be called once. + """ + def wait(self) -> None: + """Block until server is terminated with shutdown.""" + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, traceback): ... + +def connect( + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: list[tuple[str, int | str]] | None = None, +) -> FlightClient: + """ + Connect to a Flight server. + + Parameters + ---------- + location : str, tuple, or Location + Location to connect to. 
Either a URI like "grpc://localhost:port", + a tuple of (host, port), or a Location instance. + tls_root_certs : bytes or None + PEM-encoded. + cert_chain: str or None + If provided, enables TLS mutual authentication. + private_key: str or None + If provided, enables TLS mutual authentication. + override_hostname : str or None + Override the hostname checked by TLS. Insecure, use with caution. + middleware : list or None + A list of ClientMiddlewareFactory instances to apply. + write_size_limit_bytes : int or None + A soft limit on the size of a data payload sent to the + server. Enabled if positive. If enabled, writing a record + batch that (when serialized) exceeds this limit will raise an + exception; the client can retry the write with a smaller + batch. + disable_server_verification : boolean or None + Disable verifying the server when using TLS. + Insecure, use with caution. + generic_options : list or None + A list of generic (string, int or string) options to pass to + the underlying transport. + + Returns + ------- + client : FlightClient + """ diff --git a/python/pyarrow/_fs.pyi b/python/pyarrow/_fs.pyi new file mode 100644 index 00000000000..7670ef5230d --- /dev/null +++ b/python/pyarrow/_fs.pyi @@ -0,0 +1,1005 @@ +import datetime as dt +import enum +import sys + +from abc import ABC, abstractmethod + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from typing import Union, overload + +from fsspec import AbstractFileSystem # type: ignore[import-untyped] + +from .lib import NativeFile, _Weakrefable + +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] + +class FileType(enum.IntFlag): + NotFound = enum.auto() + Unknown = enum.auto() + File = enum.auto() + Directory = enum.auto() + +class FileInfo(_Weakrefable): + """ + FileSystem entry info. + + Parameters + ---------- + path : str + The full path to the filesystem entry. + type : FileType + The type of the filesystem entry. + mtime : datetime or float, default None + If given, the modification time of the filesystem entry. + If a float is given, it is the number of seconds since the + Unix epoch. + mtime_ns : int, default None + If given, the modification time of the filesystem entry, + in nanoseconds since the Unix epoch. + `mtime` and `mtime_ns` are mutually exclusive. + size : int, default None + If given, the filesystem entry size in bytes. This should only + be given if `type` is `FileType.File`. + + Examples + -------- + Generate a file: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> path_fs = local_path + "/pyarrow-fs-example.dat" + >>> with local.open_output_stream(path_fs) as stream: + ... 
stream.write(b"data") + 4 + + Get FileInfo object using ``get_file_info()``: + + >>> file_info = local.get_file_info(path_fs) + >>> file_info + + + Inspect FileInfo attributes: + + >>> file_info.type + + + >>> file_info.is_file + True + + >>> file_info.path + '/.../pyarrow-fs-example.dat' + + >>> file_info.base_name + 'pyarrow-fs-example.dat' + + >>> file_info.size + 4 + + >>> file_info.extension + 'dat' + + >>> file_info.mtime # doctest: +SKIP + datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) + + >>> file_info.mtime_ns # doctest: +SKIP + 1656489370873922073 + """ + + def __init__( + self, + path: str, + type: FileType = FileType.Unknown, + *, + mtime: dt.datetime | float | None = None, + mtime_ns: int | None = None, + size: int | None = None, + ): ... + @property + def type(self) -> FileType: + """ + Type of the file. + + The returned enum values can be the following: + + - FileType.NotFound: target does not exist + - FileType.Unknown: target exists but its type is unknown (could be a + special file such as a Unix socket or character device, or + Windows NUL / CON / ...) + - FileType.File: target is a regular file + - FileType.Directory: target is a regular directory + + Returns + ------- + type : FileType + """ + @property + def is_file(self) -> bool: ... + @property + def path(self) -> str: + """ + The full file path in the filesystem. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.path + '/.../pyarrow-fs-example.dat' + """ + @property + def base_name(self) -> str: + """ + The file base name. + + Component after the last directory separator. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.base_name + 'pyarrow-fs-example.dat' + """ + @property + def size(self) -> int: + """ + The size in bytes, if available. + + Only regular files are guaranteed to have a size. + + Returns + ------- + size : int or None + """ + @property + def extension(self) -> str: + """ + The file extension. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.extension + 'dat' + """ + @property + def mtime(self) -> dt.datetime | None: + """ + The time of last modification, if available. + + Returns + ------- + mtime : datetime.datetime or None + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.mtime # doctest: +SKIP + datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) + """ + @property + def mtime_ns(self) -> int | None: + """ + The time of last modification, if available, expressed in nanoseconds + since the Unix epoch. + + Returns + ------- + mtime_ns : int or None + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.mtime_ns # doctest: +SKIP + 1656489370873922073 + """ + +class FileSelector(_Weakrefable): + """ + File and directory selector. + + It contains a set of options that describes how to search for files and + directories. + + Parameters + ---------- + base_dir : str + The directory in which to select files. Relative paths also work, use + '.' for the current directory and '..' for the parent. + allow_not_found : bool, default False + The behavior if `base_dir` doesn't exist in the filesystem. + If false, an error is returned. + If true, an empty selection is returned. + recursive : bool, default False + Whether to recurse into subdirectories. 
+ + Examples + -------- + List the contents of a directory and subdirectories: + + >>> selector_1 = fs.FileSelector(local_path, recursive=True) + >>> local.get_file_info(selector_1) # doctest: +SKIP + [, + , + ] + + List only the contents of the base directory: + + >>> selector_2 = fs.FileSelector(local_path) + >>> local.get_file_info(selector_2) # doctest: +SKIP + [, + ] + + Return empty selection if the directory doesn't exist: + + >>> selector_not_found = fs.FileSelector( + ... local_path + "/missing", recursive=True, allow_not_found=True + ... ) + >>> local.get_file_info(selector_not_found) + [] + """ + + base_dir: str + allow_not_found: bool + recursive: bool + def __init__(self, base_dir: str, allow_not_found: bool = False, recursive: bool = False): ... + +class FileSystem(_Weakrefable): + """ + Abstract file system API. + """ + + @classmethod + def from_uri(cls, uri: str) -> tuple[Self, str]: + """ + Create a new FileSystem from URI or Path. + + Recognized URI schemes are "file", "mock", "s3fs", "gs", "gcs", "hdfs" and "viewfs". + In addition, the argument can be a pathlib.Path object, or a string + describing an absolute local path. + + Parameters + ---------- + uri : string + URI-based path, for example: file:///some/local/path. + + Returns + ------- + tuple of (FileSystem, str path) + With (filesystem, path) tuple where path is the abstract path + inside the FileSystem instance. + + Examples + -------- + Create a new FileSystem subclass from a URI: + + >>> uri = "file:///{}/pyarrow-fs-example.dat".format(local_path) + >>> local_new, path_new = fs.FileSystem.from_uri(uri) + >>> local_new + >> path_new + '/.../pyarrow-fs-example.dat' + + Or from a s3 bucket: + + >>> fs.FileSystem.from_uri("s3://usgs-landsat/collection02/") + (, 'usgs-landsat/collection02') + """ + def equals(self, other: FileSystem) -> bool: + """ + Parameters + ---------- + other : pyarrow.fs.FileSystem + + Returns + ------- + bool + """ + @property + def type_name(self) -> str: + """ + The filesystem's type name. + """ + @overload + def get_file_info(self, paths_or_selector: str) -> FileInfo: ... + @overload + def get_file_info(self, paths_or_selector: FileSelector | list[str]) -> list[FileInfo]: ... + def get_file_info(self, paths_or_selector): + """ + Get info for the given files. + + Any symlink is automatically dereferenced, recursively. A non-existing + or unreachable file returns a FileStat object and has a FileType of + value NotFound. An exception indicates a truly exceptional condition + (low-level I/O error, etc.). + + Parameters + ---------- + paths_or_selector : FileSelector, path-like or list of path-likes + Either a selector object, a path-like object or a list of + path-like objects. The selector's base directory will not be + part of the results, even if it exists. If it doesn't exist, + use `allow_not_found`. + + Returns + ------- + FileInfo or list of FileInfo + Single FileInfo object is returned for a single path, otherwise + a list of FileInfo objects is returned. + + Examples + -------- + >>> local + + >>> local.get_file_info("/{}/pyarrow-fs-example.dat".format(local_path)) + + """ + def create_dir(self, path: str, *, recursive: bool = True) -> None: + """ + Create a directory and subdirectories. + + This function succeeds if the directory already exists. + + Parameters + ---------- + path : str + The path of the new directory. + recursive : bool, default True + Create nested directories as well. 
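[Editor's sketch, not part of the patch] Since this is a stubs patch, it may help to show how the get_file_info overloads above resolve for a type checker; the paths are hypothetical.

from pyarrow import fs

local = fs.LocalFileSystem()

# A single path resolves to one FileInfo...
info: fs.FileInfo = local.get_file_info("/tmp/example.dat")

# ...while a FileSelector (or a list of paths) resolves to list[FileInfo].
infos: list[fs.FileInfo] = local.get_file_info(
    fs.FileSelector("/tmp", recursive=True, allow_not_found=True)
)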
+ """ + def delete_dir(self, path: str) -> None: + """ + Delete a directory and its contents, recursively. + + Parameters + ---------- + path : str + The path of the directory to be deleted. + """ + def delete_dir_contents( + self, path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False + ) -> None: + """ + Delete a directory's contents, recursively. + + Like delete_dir, but doesn't delete the directory itself. + + Parameters + ---------- + path : str + The path of the directory to be deleted. + accept_root_dir : boolean, default False + Allow deleting the root directory's contents + (if path is empty or "/") + missing_dir_ok : boolean, default False + If False then an error is raised if path does + not exist + """ + def move(self, src: str, dest: str) -> None: + """ + Move / rename a file or directory. + + If the destination exists: + - if it is a non-empty directory, an error is returned + - otherwise, if it has the same type as the source, it is replaced + - otherwise, behavior is unspecified (implementation-dependent). + + Parameters + ---------- + src : str + The path of the file or the directory to be moved. + dest : str + The destination path where the file or directory is moved to. + + Examples + -------- + Create a new folder with a file: + + >>> local.create_dir("/tmp/other_dir") + >>> local.copy_file(path, "/tmp/move_example.dat") + + Move the file: + + >>> local.move("/tmp/move_example.dat", "/tmp/other_dir/move_example_2.dat") + + Inspect the file info: + + >>> local.get_file_info("/tmp/other_dir/move_example_2.dat") + + >>> local.get_file_info("/tmp/move_example.dat") + + + Delete the folder: + >>> local.delete_dir("/tmp/other_dir") + """ + def copy_file(self, src: str, dest: str) -> None: + """ + Copy a file. + + If the destination exists and is a directory, an error is returned. + Otherwise, it is replaced. + + Parameters + ---------- + src : str + The path of the file to be copied from. + dest : str + The destination path where the file is copied to. + + Examples + -------- + >>> local.copy_file(path, local_path + "/pyarrow-fs-example_copy.dat") + + Inspect the file info: + + >>> local.get_file_info(local_path + "/pyarrow-fs-example_copy.dat") + + >>> local.get_file_info(path) + + """ + def delete_file(self, path: str) -> None: + """ + Delete a file. + + Parameters + ---------- + path : str + The path of the file to be deleted. + """ + def open_input_file(self, path: str) -> NativeFile: + """ + Open an input file for random access reading. + + Parameters + ---------- + path : str + The source to open for reading. + + Returns + ------- + stream : NativeFile + + Examples + -------- + Print the data from the file with `open_input_file()`: + + >>> with local.open_input_file(path) as f: + ... print(f.readall()) + b'data' + """ + def open_input_stream( + self, path: str, compression: str | None = "detect", buffer_size: int | None = None + ) -> NativeFile: + """ + Open an input stream for sequential reading. + + Parameters + ---------- + path : str + The source to open for reading. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly decompression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary read buffer. 
+ + Returns + ------- + stream : NativeFile + + Examples + -------- + Print the data from the file with `open_input_stream()`: + + >>> with local.open_input_stream(path) as f: + ... print(f.readall()) + b'data' + """ + def open_output_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, str] | None = None, + ) -> NativeFile: + """ + Open an output stream for sequential writing. + + If the target already exists, existing data is truncated. + + Parameters + ---------- + path : str + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + metadata : dict optional, default None + If not None, a mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + Unsupported metadata keys will be ignored. + + Returns + ------- + stream : NativeFile + + Examples + -------- + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream(path) as stream: + ... stream.write(b"data") + 4 + """ + def open_append_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, str] | None = None, + ): + """ + Open an output stream for appending. + + If the target doesn't exist, a new empty file is created. + + .. note:: + Some filesystem implementations do not support efficient + appending to an existing file, in which case this method will + raise NotImplementedError. + Consider writing to multiple files (using e.g. the dataset layer) + instead. + + Parameters + ---------- + path : str + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + metadata : dict optional, default None + If not None, a mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + Unsupported metadata keys will be ignored. + + Returns + ------- + stream : NativeFile + + Examples + -------- + Append new data to a FileSystem subclass with nonempty file: + + >>> with local.open_append_stream(path) as f: + ... f.write(b"+newly added") + 12 + + Print out the content to the file: + + >>> with local.open_input_file(path) as f: + ... print(f.readall()) + b'data+newly added' + """ + def normalize_path(self, path: str) -> str: + """ + Normalize filesystem path. + + Parameters + ---------- + path : str + The path to normalize + + Returns + ------- + normalized_path : str + The normalized path + """ + +class LocalFileSystem(FileSystem): + """ + A FileSystem implementation accessing files on the local machine. 
+ + Details such as symlinks are abstracted away (symlinks are always followed, + except when deleting an entry). + + Parameters + ---------- + use_mmap : bool, default False + Whether open_input_stream and open_input_file should return + a mmap'ed file or a regular file. + + Examples + -------- + Create a FileSystem object with LocalFileSystem constructor: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> local + + + and write data on to the file: + + >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: + ... stream.write(b"data") + 4 + >>> with local.open_input_stream("/tmp/local_fs.dat") as stream: + ... print(stream.readall()) + b'data' + + Create a FileSystem object inferred from a URI of the saved file: + + >>> local_new, path = fs.LocalFileSystem().from_uri("/tmp/local_fs.dat") + >>> local_new + >> path + '/tmp/local_fs.dat' + + Check if FileSystems `local` and `local_new` are equal: + + >>> local.equals(local_new) + True + + Compare two different FileSystems: + + >>> local2 = fs.LocalFileSystem(use_mmap=True) + >>> local.equals(local2) + False + + Copy a file and print out the data: + + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/local_fs-copy.dat") + >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as stream: + ... print(stream.readall()) + b'data' + + Open an output stream for appending, add text and print the new data: + + >>> with local.open_append_stream("/tmp/local_fs-copy.dat") as f: + ... f.write(b"+newly added") + 12 + + >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as f: + ... print(f.readall()) + b'data+newly added' + + Create a directory, copy a file into it and then delete the whole directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder") + + >>> local.delete_dir("/tmp/new_folder") + >>> local.get_file_info("/tmp/new_folder") + + + Create a directory, copy a file into it and then delete + the content of the directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + >>> local.delete_dir_contents("/tmp/new_folder") + >>> local.get_file_info("/tmp/new_folder") + + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + + Create a directory, copy a file into it and then delete + the file from the directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.delete_file("/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + >>> local.get_file_info("/tmp/new_folder") + + + Move the file: + + >>> local.move("/tmp/local_fs-copy.dat", "/tmp/new_folder/local_fs-copy.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs-copy.dat") + + >>> local.get_file_info("/tmp/local_fs-copy.dat") + + + To finish delete the file left: + >>> local.delete_file("/tmp/local_fs.dat") + """ + + def __init__(self, *, use_mmap: bool = False) -> None: ... + +class SubTreeFileSystem(FileSystem): + """ + Delegates to another implementation after prepending a fixed base path. + + This is useful to expose a logical view of a subtree of a filesystem, + for example a directory in a LocalFileSystem. + + Note, that this makes no security guarantee. For example, symlinks may + allow to "escape" the subtree and access other parts of the underlying + filesystem. 
+ + Parameters + ---------- + base_path : str + The root of the subtree. + base_fs : FileSystem + FileSystem object the operations delegated to. + + Examples + -------- + Create a LocalFileSystem instance: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: + ... stream.write(b"data") + 4 + + Create a directory and a SubTreeFileSystem instance: + + >>> local.create_dir("/tmp/sub_tree") + >>> subtree = fs.SubTreeFileSystem("/tmp/sub_tree", local) + + Write data into the existing file: + + >>> with subtree.open_append_stream("sub_tree_fs.dat") as f: + ... f.write(b"+newly added") + 12 + + Print out the attributes: + + >>> subtree.base_fs + + >>> subtree.base_path + '/tmp/sub_tree/' + + Get info for the given directory or given file: + + >>> subtree.get_file_info("") + + >>> subtree.get_file_info("sub_tree_fs.dat") + + + Delete the file and directory: + + >>> subtree.delete_file("sub_tree_fs.dat") + >>> local.delete_dir("/tmp/sub_tree") + >>> local.delete_file("/tmp/local_fs.dat") + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ + def __init__(self, base_path: str, base_fs: FileSystem): ... + @property + def base_path(self) -> str: ... + @property + def base_fs(self) -> FileSystem: ... + +class _MockFileSystem(FileSystem): + def __init__(self, current_time: dt.datetime | None = None) -> None: ... + +class PyFileSystem(FileSystem): + """ + A FileSystem with behavior implemented in Python. + + Parameters + ---------- + handler : FileSystemHandler + The handler object implementing custom filesystem behavior. + + Examples + -------- + Create an fsspec-based filesystem object for GitHub: + + >>> from fsspec.implementations import github + >>> gfs = github.GithubFileSystem("apache", "arrow") # doctest: +SKIP + + Get a PyArrow FileSystem object: + + >>> from pyarrow.fs import PyFileSystem, FSSpecHandler + >>> pa_fs = PyFileSystem(FSSpecHandler(gfs)) # doctest: +SKIP + + Use :func:`~pyarrow.fs.FileSystem` functionality ``get_file_info()``: + + >>> pa_fs.get_file_info("README.md") # doctest: +SKIP + + """ + def __init__(self, handler: FileSystemHandler) -> None: ... + @property + def handler(self) -> FileSystemHandler: + """ + The filesystem's underlying handler. + + Returns + ------- + handler : FileSystemHandler + """ + +class FileSystemHandler(ABC): + """ + An abstract class exposing methods to implement PyFileSystem's behavior. + """ + @abstractmethod + def get_type_name(self) -> str: + """ + Implement PyFileSystem.type_name. + """ + @abstractmethod + def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: + """ + Implement PyFileSystem.get_file_info(paths). + + Parameters + ---------- + paths : list of str + paths for which we want to retrieve the info. + """ + @abstractmethod + def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: + """ + Implement PyFileSystem.get_file_info(selector). + + Parameters + ---------- + selector : FileSelector + selector for which we want to retrieve the info. + """ + + @abstractmethod + def create_dir(self, path: str, recursive: bool) -> None: + """ + Implement PyFileSystem.create_dir(...). + + Parameters + ---------- + path : str + path of the directory. + recursive : bool + if the parent directories should be created too. + """ + @abstractmethod + def delete_dir(self, path: str) -> None: + """ + Implement PyFileSystem.delete_dir(...). + + Parameters + ---------- + path : str + path of the directory. 
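[Editor's sketch, not part of the patch] A variation on the PyFileSystem example above, wrapping an fsspec in-memory filesystem instead of GitHub; assumes fsspec is installed, and the path is hypothetical.

import fsspec
from pyarrow.fs import FSSpecHandler, PyFileSystem

# Expose an fsspec filesystem through the Arrow FileSystem API.
pa_fs = PyFileSystem(FSSpecHandler(fsspec.filesystem("memory")))

with pa_fs.open_output_stream("data.bin") as out:
    out.write(b"data")

print(pa_fs.get_file_info("data.bin").size)  # 4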
+ """ + @abstractmethod + def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: + """ + Implement PyFileSystem.delete_dir_contents(...). + + Parameters + ---------- + path : str + path of the directory. + missing_dir_ok : bool + if False an error should be raised if path does not exist + """ + @abstractmethod + def delete_root_dir_contents(self) -> None: + """ + Implement PyFileSystem.delete_dir_contents("/", accept_root_dir=True). + """ + @abstractmethod + def delete_file(self, path: str) -> None: + """ + Implement PyFileSystem.delete_file(...). + + Parameters + ---------- + path : str + path of the file. + """ + @abstractmethod + def move(self, src: str, dest: str) -> None: + """ + Implement PyFileSystem.move(...). + + Parameters + ---------- + src : str + path of what should be moved. + dest : str + path of where it should be moved to. + """ + + @abstractmethod + def copy_file(self, src: str, dest: str) -> None: + """ + Implement PyFileSystem.copy_file(...). + + Parameters + ---------- + src : str + path of what should be copied. + dest : str + path of where it should be copied to. + """ + @abstractmethod + def open_input_stream(self, path: str) -> NativeFile: + """ + Implement PyFileSystem.open_input_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + """ + @abstractmethod + def open_input_file(self, path: str) -> NativeFile: + """ + Implement PyFileSystem.open_input_file(...). + + Parameters + ---------- + path : str + path of what should be opened. + """ + @abstractmethod + def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: + """ + Implement PyFileSystem.open_output_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + """ + + @abstractmethod + def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: + """ + Implement PyFileSystem.open_append_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + """ + @abstractmethod + def normalize_path(self, path: str) -> str: + """ + Implement PyFileSystem.normalize_path(...). + + Parameters + ---------- + path : str + path of what should be normalized. + """ diff --git a/python/pyarrow/_gcsfs.pyi b/python/pyarrow/_gcsfs.pyi new file mode 100644 index 00000000000..4fc7ea68e48 --- /dev/null +++ b/python/pyarrow/_gcsfs.pyi @@ -0,0 +1,83 @@ +import datetime as dt + +from ._fs import FileSystem +from .lib import KeyValueMetadata + +class GcsFileSystem(FileSystem): + """ + Google Cloud Storage (GCS) backed FileSystem implementation + + By default uses the process described in https://google.aip.dev/auth/4110 + to resolve credentials. If not running on Google Cloud Platform (GCP), + this generally requires the environment variable + GOOGLE_APPLICATION_CREDENTIALS to point to a JSON file + containing credentials. + + Note: GCS buckets are special and the operations available on them may be + limited or more expensive than expected compared to local file systems. + + Note: When pickling a GcsFileSystem that uses default credentials, resolution + credentials are not stored in the serialized data. 
Therefore, when unpickling + it is assumed that the necessary credentials are in place for the target + process. + + Parameters + ---------- + anonymous : boolean, default False + Whether to connect anonymously. + If true, will not attempt to look up credentials using standard GCP + configuration methods. + access_token : str, default None + GCP access token. If provided, temporary credentials will be fetched by + assuming this role; also, a `credential_token_expiration` must be + specified as well. + target_service_account : str, default None + An optional service account to try to impersonate when accessing GCS. This + requires the specified credential user or service account to have the necessary + permissions. + credential_token_expiration : datetime, default None + Expiration for credential generated with an access token. Must be specified + if `access_token` is specified. + default_bucket_location : str, default 'US' + GCP region to create buckets in. + scheme : str, default 'https' + GCS connection transport scheme. + endpoint_override : str, default None + Override endpoint with a connect string such as "localhost:9000" + default_metadata : mapping or pyarrow.KeyValueMetadata, default None + Default metadata for `open_output_stream`. This will be ignored if + non-empty metadata is passed to `open_output_stream`. + retry_time_limit : timedelta, default None + Set the maximum amount of time the GCS client will attempt to retry + transient errors. Subsecond granularity is ignored. + project_id : str, default None + The GCP project identifier to use for creating buckets. + If not set, the library uses the GOOGLE_CLOUD_PROJECT environment + variable. Most I/O operations do not need a project id, only applications + that create new buckets need a project id. + """ + + def __init__( + self, + *, + anonymous: bool = False, + access_token: str | None = None, + target_service_account: str | None = None, + credential_token_expiration: dt.datetime | None = None, + default_bucket_location: str = "US", + scheme: str = "https", + endpoint_override: str | None = None, + default_metadata: dict | KeyValueMetadata | None = None, + retry_time_limit: dt.timedelta | None = None, + project_id: str | None = None, + ): ... + @property + def default_bucket_location(self) -> str: + """ + The GCP location this filesystem will write to. + """ + @property + def project_id(self) -> str: + """ + The GCP project id this filesystem will use. + """ diff --git a/python/pyarrow/_hdfs.pyi b/python/pyarrow/_hdfs.pyi new file mode 100644 index 00000000000..200f669379b --- /dev/null +++ b/python/pyarrow/_hdfs.pyi @@ -0,0 +1,75 @@ +from _typeshed import StrPath + +from ._fs import FileSystem + +class HadoopFileSystem(FileSystem): + """ + HDFS backed FileSystem implementation + + Parameters + ---------- + host : str + HDFS host to connect to. Set to "default" for fs.defaultFS from + core-site.xml. + port : int, default 8020 + HDFS port to connect to. Set to 0 for default or logical (HA) nodes. + user : str, default None + Username when connecting to HDFS; None implies login user. + replication : int, default 3 + Number of copies each block will have. + buffer_size : int, default 0 + If 0, no buffering will happen otherwise the size of the temporary read + and write buffer. + default_block_size : int, default None + None means the default configuration for HDFS, a typical block size is + 128 MB. + kerb_ticket : string or path, default None + If not None, the path to the Kerberos ticket cache. 
+ extra_conf : dict, default None + Extra key/value pairs for configuration; will override any + hdfs-site.xml properties. + + Examples + -------- + >>> from pyarrow import fs + >>> hdfs = fs.HadoopFileSystem( + ... host, port, user=user, kerb_ticket=ticket_cache_path + ... ) # doctest: +SKIP + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ + def __init__( + self, + host: str, + port: int = 8020, + *, + user: str | None = None, + replication: int = 3, + buffer_size: int = 0, + default_block_size: int | None = None, + kerb_ticket: StrPath | None = None, + extra_conf: dict | None = None, + ): ... + @staticmethod + def from_uri(uri: str) -> HadoopFileSystem: # type: ignore[override] + """ + Instantiate HadoopFileSystem object from an URI string. + + The following two calls are equivalent + + * ``HadoopFileSystem.from_uri('hdfs://localhost:8020/?user=test\ +&replication=1')`` + * ``HadoopFileSystem('localhost', port=8020, user='test', \ +replication=1)`` + + Parameters + ---------- + uri : str + A string URI describing the connection to HDFS. + In order to change the user, replication, buffer_size or + default_block_size pass the values as query parts. + + Returns + ------- + HadoopFileSystem + """ diff --git a/python/pyarrow/_json.pyi b/python/pyarrow/_json.pyi new file mode 100644 index 00000000000..43d2ae83cd8 --- /dev/null +++ b/python/pyarrow/_json.pyi @@ -0,0 +1,169 @@ +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable + +class ReadOptions(_Weakrefable): + """ + Options for reading JSON files. + + Parameters + ---------- + use_threads : bool, optional (default True) + Whether to use multiple threads to accelerate reading + block_size : int, optional + How much bytes to process at a time from the input stream. + This will determine multi-threading granularity as well as + the size of individual chunks in the Table. + """ + + use_threads: bool + """ + Whether to use multiple threads to accelerate reading. + """ + block_size: int + """ + How much bytes to process at a time from the input stream. + + This will determine multi-threading granularity as well as the size of + individual chunks in the Table. + """ + def __init__(self, use_threads: bool | None = None, block_size: int | None = None): ... + def equals(self, other: ReadOptions) -> bool: + """ + Parameters + ---------- + other : pyarrow.json.ReadOptions + + Returns + ------- + bool + """ + +class ParseOptions(_Weakrefable): + """ + Options for parsing JSON files. + + Parameters + ---------- + explicit_schema : Schema, optional (default None) + Optional explicit schema (no type inference, ignores other fields). + newlines_in_values : bool, optional (default False) + Whether objects may be printed across multiple lines (for example + pretty printed). If false, input must end with an empty line. + unexpected_field_behavior : str, default "infer" + How JSON fields outside of explicit_schema (if given) are treated. + + Possible behaviors: + + - "ignore": unexpected JSON fields are ignored + - "error": error out on unexpected JSON fields + - "infer": unexpected JSON fields are type-inferred and included in + the output + """ + + explicit_schema: Schema + """ + Optional explicit schema (no type inference, ignores other fields) + """ + newlines_in_values: bool + """ + Whether newline characters are allowed in JSON values. + Setting this to True reduces the performance of multi-threaded + JSON reading. 
+ """ + unexpected_field_behavior: Literal["ignore", "error", "infer"] + """ + How JSON fields outside of explicit_schema (if given) are treated. + + Possible behaviors: + + - "ignore": unexpected JSON fields are ignored + - "error": error out on unexpected JSON fields + - "infer": unexpected JSON fields are type-inferred and included in + the output + + Set to "infer" by default. + """ + def __init__( + self, + explicit_schema: Schema | None = None, + newlines_in_values: bool | None = None, + unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer", + ): ... + def equals(self, other: ParseOptions) -> bool: + """ + Parameters + ---------- + other : pyarrow.json.ParseOptions + + Returns + ------- + bool + """ + +class JSONStreamingReader(RecordBatchReader): + """An object that reads record batches incrementally from a JSON file. + + Should not be instantiated directly by user code. + """ + +def read_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> Table: + """ + Read a Table from a stream of JSON data. + + Parameters + ---------- + input_file : str, path or file-like object + The location of JSON data. Currently only the line-delimited JSON + format is supported. + read_options : pyarrow.json.ReadOptions, optional + Options for the JSON reader (see ReadOptions constructor for defaults). + parse_options : pyarrow.json.ParseOptions, optional + Options for the JSON parser + (see ParseOptions constructor for defaults). + memory_pool : MemoryPool, optional + Pool to allocate Table memory from. + + Returns + ------- + :class:`pyarrow.Table` + Contents of the JSON file as a in-memory table. + """ + +def open_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> JSONStreamingReader: + """ + Open a streaming reader of JSON data. + + Reading using this function is always single-threaded. + + Parameters + ---------- + input_file : string, path or file-like object + The location of JSON data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. + read_options : pyarrow.json.ReadOptions, optional + Options for the JSON reader (see pyarrow.json.ReadOptions constructor + for defaults) + parse_options : pyarrow.json.ParseOptions, optional + Options for the JSON parser + (see pyarrow.json.ParseOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate RecordBatch memory from + + Returns + ------- + :class:`pyarrow.json.JSONStreamingReader` + """ diff --git a/python/pyarrow/_orc.pyi b/python/pyarrow/_orc.pyi new file mode 100644 index 00000000000..71bf0dde9ba --- /dev/null +++ b/python/pyarrow/_orc.pyi @@ -0,0 +1,56 @@ +from typing import IO, Literal + +from .lib import ( + Buffer, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) + +class ORCReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open(self, source: str | NativeFile | Buffer, use_memory_map: bool = True): ... + def metadata(self) -> KeyValueMetadata: ... + def schema(self) -> Schema: ... + def nrows(self) -> int: ... + def nstripes(self) -> int: ... + def file_version(self) -> str: ... + def software_version(self) -> str: ... 
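[Editor's sketch, not part of the patch] A short sketch of the public pyarrow.json wrappers around the options typed above; the sample data and schema are hypothetical.

import io
import pyarrow as pa
from pyarrow import json

data = io.BytesIO(b'{"id": 1, "name": "a"}\n{"id": 2, "name": "b"}\n')

# Pin the schema instead of inferring it, and error on unexpected fields.
opts = json.ParseOptions(
    explicit_schema=pa.schema([("id", pa.int64()), ("name", pa.string())]),
    unexpected_field_behavior="error",
)
table = json.read_json(data, parse_options=opts)
print(table.num_rows)  # 2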
+ def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + def compression_size(self) -> int: ... + def row_index_stride(self) -> int: ... + def writer(self) -> str: ... + def writer_version(self) -> str: ... + def nstripe_statistics(self) -> int: ... + def content_length(self) -> int: ... + def stripe_statistics_length(self) -> int: ... + def file_footer_length(self) -> int: ... + def file_postscript_length(self) -> int: ... + def file_length(self) -> int: ... + def serialized_file_tail(self) -> int: ... + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... + def read(self, columns: list[str] | None = None) -> Table: ... + +class ORCWriter(_Weakrefable): + def open( + self, + where: str | NativeFile | IO, + *, + file_version: str | None = None, + batch_size: int | None = None, + stripe_size: int | None = None, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] | None = None, + compression_block_size: int | None = None, + compression_strategy: Literal["COMPRESSION", "SPEED"] | None = None, + row_index_stride: int | None = None, + padding_tolerance: float | None = None, + dictionary_key_size_threshold: float | None = None, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float | None = None, + ) -> None: ... + def write(self, table: Table) -> None: ... + def close(self) -> None: ... diff --git a/python/pyarrow/_parquet.pyi b/python/pyarrow/_parquet.pyi new file mode 100644 index 00000000000..a9187df0428 --- /dev/null +++ b/python/pyarrow/_parquet.pyi @@ -0,0 +1,445 @@ +from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict + +from _typeshed import StrPath + +from ._stubs_typing import Order +from .lib import ( + Buffer, + ChunkedArray, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) + +_PhysicalType: TypeAlias = Literal[ + "BOOLEAN", + "INT32", + "INT64", + "INT96", + "FLOAT", + "DOUBLE", + "BYTE_ARRAY", + "FIXED_LEN_BYTE_ARRAY", + "UNKNOWN", +] +_LogicTypeName: TypeAlias = Literal[ + "UNDEFINED", + "STRING", + "MAP", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME", + "TIMESTAMP", + "INT", + "FLOAT16", + "JSON", + "BSON", + "UUID", + "NONE", + "UNKNOWN", +] +_ConvertedType: TypeAlias = Literal[ + "NONE", + "UTF8", + "MAP", + "MAP_KEY_VALUE", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME_MILLIS", + "TIME_MICROS", + "TIMESTAMP_MILLIS", + "TIMESTAMP_MICROS", + "UINT_8", + "UINT_16", + "UINT_32", + "UINT_64", + "INT_8", + "INT_16", + "INT_32", + "INT_64", + "JSON", + "BSON", + "INTERVAL", + "UNKNOWN", +] +_Encoding: TypeAlias = Literal[ + "PLAIN", + "PLAIN_DICTIONARY", + "RLE", + "BIT_PACKED", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "RLE_DICTIONARY", + "BYTE_STREAM_SPLIT", + "UNKNOWN", +] +_Compression: TypeAlias = Literal[ + "UNCOMPRESSED", + "SNAPPY", + "GZIP", + "LZO", + "BROTLI", + "LZ4", + "ZSTD", + "UNKNOWN", +] + +class _Statistics(TypedDict): + has_min_max: bool + min: Any | None + max: Any | None + null_count: int | None + distinct_count: int | None + num_values: int + physical_type: _PhysicalType + +class Statistics(_Weakrefable): + def to_dict(self) -> _Statistics: ... + def equals(self, other: Statistics) -> bool: ... + @property + def has_min_max(self) -> bool: ... + @property + def hash_null_count(self) -> bool: ... + @property + def has_distinct_count(self) -> bool: ... + @property + def min_raw(self) -> Any | None: ... 
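[Editor's sketch, not part of the patch] ORCReader and ORCWriter above appear to be the low-level classes behind the public pyarrow.orc helpers; a usage sketch of those wrappers, assuming pyarrow was built with ORC support and using a hypothetical path.

import pyarrow as pa
import pyarrow.orc as orc

table = pa.table({"x": [1, 2, 3]})
orc.write_table(table, "/tmp/example.orc", compression="ZSTD")

f = orc.ORCFile("/tmp/example.orc")
print(f.nrows, f.compression)   # 3 ZSTD
print(f.read(columns=["x"]))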
+ @property + def max_raw(self) -> Any | None: ... + @property + def min(self) -> Any | None: ... + @property + def max(self) -> Any | None: ... + @property + def null_count(self) -> int | None: ... + @property + def distinct_count(self) -> int | None: ... + @property + def num_values(self) -> int: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + @property + def converted_type(self) -> _ConvertedType | None: ... + +class ParquetLogicalType(_Weakrefable): + def to_json(self) -> str: ... + @property + def type(self) -> _LogicTypeName: ... + +class _ColumnChunkMetaData(TypedDict): + file_offset: int + file_path: str | None + physical_type: _PhysicalType + num_values: int + path_in_schema: str + is_stats_set: bool + statistics: Statistics | None + compression: _Compression + encodings: tuple[_Encoding, ...] + has_dictionary_page: bool + dictionary_page_offset: int | None + data_page_offset: int + total_compressed_size: int + total_uncompressed_size: int + +class ColumnChunkMetaData(_Weakrefable): + def to_dict(self) -> _ColumnChunkMetaData: ... + def equals(self, other: ColumnChunkMetaData) -> bool: ... + @property + def file_offset(self) -> int: ... + @property + def file_path(self) -> str | None: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def num_values(self) -> int: ... + @property + def path_in_schema(self) -> str: ... + @property + def is_stats_set(self) -> bool: ... + @property + def statistics(self) -> Statistics | None: ... + @property + def compression(self) -> _Compression: ... + @property + def encodings(self) -> tuple[_Encoding, ...]: ... + @property + def has_dictionary_page(self) -> bool: ... + @property + def dictionary_page_offset(self) -> int | None: ... + @property + def data_page_offset(self) -> int: ... + @property + def has_index_page(self) -> bool: ... + @property + def index_page_offset(self) -> int: ... + @property + def total_compressed_size(self) -> int: ... + @property + def total_uncompressed_size(self) -> int: ... + @property + def has_offset_index(self) -> bool: ... + @property + def has_column_index(self) -> bool: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... + +class _SortingColumn(TypedDict): + column_index: int + descending: bool + nulls_first: bool + +class SortingColumn: + def __init__( + self, column_index: int, descending: bool = False, nulls_first: bool = False + ) -> None: ... + @classmethod + def from_ordering( + cls, + schema: Schema, + sort_keys: Sequence[tuple[str, Order]], + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> tuple[SortingColumn, ...]: ... + @staticmethod + def to_ordering( + schema: Schema, sorting_columns: tuple[SortingColumn, ...] + ) -> tuple[Sequence[tuple[str, Order]], Literal["at_start", "at_end"]]: ... + def __hash__(self) -> int: ... + @property + def column_index(self) -> int: ... + @property + def descending(self) -> bool: ... + @property + def nulls_first(self) -> bool: ... + def to_dict(self) -> _SortingColumn: ... + +class _RowGroupMetaData(TypedDict): + num_columns: int + num_rows: int + total_byte_size: int + columns: list[ColumnChunkMetaData] + sorting_columns: list[SortingColumn] + +class RowGroupMetaData(_Weakrefable): + def __init__(self, parent: FileMetaData, index: int) -> None: ... + def equals(self, other: RowGroupMetaData) -> bool: ... + def column(self, i: int) -> ColumnChunkMetaData: ... + def to_dict(self) -> _RowGroupMetaData: ... 
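# Usage sketch: walking the Parquet metadata classes typed above through the
# public pyarrow.parquet API; "data.parquet" is an illustrative path.
import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.table({"x": [1, 2, None, 4]}), "data.parquet")

meta = pq.ParquetFile("data.parquet").metadata  # FileMetaData
rg = meta.row_group(0)                          # RowGroupMetaData
col = rg.column(0)                              # ColumnChunkMetaData
stats = col.statistics                          # Statistics | None
if stats is not None and stats.has_min_max:
    print(stats.min, stats.max, stats.null_count, stats.physical_type)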
+ @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... + @property + def sorting_columns(self) -> list[SortingColumn]: ... + +class _FileMetaData(TypedDict): + created_by: str + num_columns: int + num_rows: int + num_row_groups: int + format_version: str + serialized_size: int + +class FileMetaData(_Weakrefable): + def __hash__(self) -> int: ... + def to_dict(self) -> _FileMetaData: ... + def equals(self, other: FileMetaData) -> bool: ... + @property + def schema(self) -> ParquetSchema: ... + @property + def serialized_size(self) -> int: ... + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def num_row_groups(self) -> int: ... + @property + def format_version(self) -> str: ... + @property + def created_by(self) -> str: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... + def row_group(self, i: int) -> RowGroupMetaData: ... + def set_file_path(self, path: str) -> None: ... + def append_row_groups(self, other: FileMetaData) -> None: ... + def write_metadata_file(self, where: StrPath | Buffer | NativeFile | IO) -> None: ... + +class ParquetSchema(_Weakrefable): + def __init__(self, container: FileMetaData) -> None: ... + def __getitem__(self, i: int) -> ColumnChunkMetaData: ... + def __hash__(self) -> int: ... + def __len__(self) -> int: ... + @property + def names(self) -> list[str]: ... + def to_arrow_schema(self) -> Schema: ... + def equals(self, other: ParquetSchema) -> bool: ... + def column(self, i: int) -> ColumnSchema: ... + +class ColumnSchema(_Weakrefable): + def __init__(self, schema: ParquetSchema, index: int) -> None: ... + def equals(self, other: ColumnSchema) -> bool: ... + @property + def name(self) -> str: ... + @property + def path(self) -> str: ... + @property + def max_definition_level(self) -> int: ... + @property + def max_repetition_level(self) -> int: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + @property + def converted_type(self) -> _ConvertedType | None: ... + @property + def length(self) -> int | None: ... + @property + def precision(self) -> int | None: ... + @property + def scale(self) -> int | None: ... + +class ParquetReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open( + self, + source: StrPath | NativeFile | IO, + *, + use_memory_map: bool = False, + read_dictionary: Iterable[int] | Iterable[str] | None = None, + metadata: FileMetaData | None = None, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + ): ... + @property + def column_paths(self) -> list[str]: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def schema_arrow(self) -> Schema: ... + @property + def num_row_groups(self) -> int: ... + def set_use_threads(self, use_threads: bool) -> None: ... + def set_batch_size(self, batch_size: int) -> None: ... + def iter_batches( + self, + batch_size: int, + row_groups: list[int], + column_indices: list[int] | None = None, + use_threads: bool = True, + ) -> Iterator[RecordBatch]: ... 
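# Usage sketch: batch-wise reading through pyarrow.parquet.ParquetFile, the
# public wrapper over the ParquetReader stub above; the path and the column
# selection are illustrative assumptions.
import pyarrow.parquet as pq

pf = pq.ParquetFile("data.parquet")
print(pf.schema_arrow)                                   # Arrow Schema
for batch in pf.iter_batches(batch_size=1024, columns=["x"]):
    ...                                                  # pyarrow.RecordBatch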
+ def read_row_group( + self, i: int, column_indices: list[int] | None = None, use_threads: bool = True + ) -> Table: ... + def read_row_groups( + self, + row_groups: list[int], + column_indices: list[int] | None = None, + use_threads: bool = True, + ) -> Table: ... + def read_all( + self, column_indices: list[int] | None = None, use_threads: bool = True + ) -> Table: ... + def scan_contents(self, column_indices: list[int] | None = None, batch_size: int = 65536): ... + def column_name_idx(self, column_name: str) -> int: ... + def read_column(self, column_index: int) -> ChunkedArray: ... + def close(self) -> None: ... + @property + def closed(self) -> bool: ... + +class ParquetWriter(_Weakrefable): + def __init__( + self, + where: StrPath | NativeFile | IO, + schema: Schema, + use_dictionary: bool | list[str] | None = None, + compression: _Compression | dict[str, _Compression] | None = None, + version: str | None = None, + write_statistics: bool | list[str] | None = None, + memory_pool: MemoryPool | None = None, + use_deprecated_int96_timestamps: bool = False, + coerce_timestamps: Literal["ms", "us"] | None = None, + data_page_size: int | None = None, + allow_truncated_timestamps: bool = False, + compression_level: int | dict[str, int] | None = None, + use_byte_stream_split: bool | list[str] = False, + column_encoding: _Encoding | dict[str, _Encoding] | None = None, + writer_engine_version: str | None = None, + data_page_version: str | None = None, + use_compliant_nested_type: bool = True, + encryption_properties: FileDecryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: tuple[SortingColumn, ...] | None = None, + store_decimal_as_integer: bool = False, + ): ... + def close(self) -> None: ... + def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... + def add_key_value_metadata(self, key_value_metadata: KeyValueMetadata) -> None: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def use_dictionary(self) -> bool | list[str] | None: ... + @property + def use_deprecated_int96_timestamps(self) -> bool: ... + @property + def use_byte_stream_split(self) -> bool | list[str]: ... + @property + def column_encoding(self) -> _Encoding | dict[str, _Encoding] | None: ... + @property + def coerce_timestamps(self) -> Literal["ms", "us"] | None: ... + @property + def allow_truncated_timestamps(self) -> bool: ... + @property + def compression(self) -> _Compression | dict[str, _Compression] | None: ... + @property + def compression_level(self) -> int | dict[str, int] | None: ... + @property + def data_page_version(self) -> str | None: ... + @property + def use_compliant_nested_type(self) -> bool: ... + @property + def version(self) -> str | None: ... + @property + def write_statistics(self) -> bool | list[str] | None: ... + @property + def writer_engine_version(self) -> str: ... + @property + def row_group_size(self) -> int: ... + @property + def data_page_size(self) -> int: ... + @property + def encryption_properties(self) -> FileDecryptionProperties: ... + @property + def write_batch_size(self) -> int: ... + @property + def dictionary_pagesize_limit(self) -> int: ... + @property + def store_schema(self) -> bool: ... + @property + def store_decimal_as_integer(self) -> bool: ... + +class FileEncryptionProperties: ... +class FileDecryptionProperties: ... 
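# Usage sketch for the writer options typed above, through the public
# pyarrow.parquet.ParquetWriter; the file name and settings are illustrative
# assumptions.
import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([("ts", pa.timestamp("us")), ("v", pa.float64())])
with pq.ParquetWriter(
    "out.parquet",
    schema,
    compression="zstd",
    compression_level=3,
    write_statistics=True,
) as writer:
    writer.write_table(pa.table({"ts": [None, None], "v": [1.0, 2.0]}, schema=schema))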
diff --git a/python/pyarrow/_parquet_encryption.pyi b/python/pyarrow/_parquet_encryption.pyi new file mode 100644 index 00000000000..c707edb844a --- /dev/null +++ b/python/pyarrow/_parquet_encryption.pyi @@ -0,0 +1,67 @@ +import datetime as dt + +from typing import Callable + +from ._parquet import FileDecryptionProperties, FileEncryptionProperties +from .lib import _Weakrefable + +class EncryptionConfiguration(_Weakrefable): + footer_key: str + column_keys: dict[str, list[str]] + encryption_algorithm: str + plaintext_footer: bool + double_wrapping: bool + cache_lifetime: dt.timedelta + internal_key_material: bool + data_key_length_bits: int + + def __init__( + self, + footer_key: str, + *, + column_keys: dict[str, str | list[str]] | None = None, + encryption_algorithm: str | None = None, + plaintext_footer: bool | None = None, + double_wrapping: bool | None = None, + cache_lifetime: dt.timedelta | None = None, + internal_key_material: bool | None = None, + data_key_length_bits: int | None = None, + ) -> None: ... + +class DecryptionConfiguration(_Weakrefable): + cache_lifetime: dt.timedelta + def __init__(self, *, cache_lifetime: dt.timedelta | None = None): ... + +class KmsConnectionConfig(_Weakrefable): + kms_instance_id: str + kms_instance_url: str + key_access_token: str + custom_kms_conf: dict[str, str] + def __init__( + self, + *, + kms_instance_id: str | None = None, + kms_instance_url: str | None = None, + key_access_token: str | None = None, + custom_kms_conf: dict[str, str] | None = None, + ) -> None: ... + def refresh_key_access_token(self, value: str) -> None: ... + +class KmsClient(_Weakrefable): + def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> str: ... + def unwrap_key(self, wrapped_key: str, master_key_identifier: str) -> str: ... + +class CryptoFactory(_Weakrefable): + def __init__(self, kms_client_factory: Callable[[KmsConnectionConfig], KmsClient]): ... + def file_encryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> FileEncryptionProperties: ... + def file_decryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + decryption_config: DecryptionConfiguration | None = None, + ) -> FileDecryptionProperties: ... + def remove_cache_entries_for_token(self, access_token: str) -> None: ... + def remove_cache_entries_for_all_tokens(self) -> None: ... diff --git a/python/pyarrow/_s3fs.pyi b/python/pyarrow/_s3fs.pyi new file mode 100644 index 00000000000..fc13c498bd9 --- /dev/null +++ b/python/pyarrow/_s3fs.pyi @@ -0,0 +1,74 @@ +import enum + +from typing import Literal, NotRequired, Required, TypedDict + +from ._fs import FileSystem +from .lib import KeyValueMetadata + +class _ProxyOptions(TypedDict): + schema: Required[Literal["http", "https"]] + host: Required[str] + port: Required[int] + username: NotRequired[str] + password: NotRequired[str] + +class S3LogLevel(enum.IntEnum): + Off = enum.auto() + Fatal = enum.auto() + Error = enum.auto() + Warn = enum.auto() + Info = enum.auto() + Debug = enum.auto() + Trace = enum.auto() + +Off = S3LogLevel.Off +Fatal = S3LogLevel.Fatal +Error = S3LogLevel.Error +Warn = S3LogLevel.Warn +Info = S3LogLevel.Info +Debug = S3LogLevel.Debug +Trace = S3LogLevel.Trace + +def initialize_s3( + log_level: S3LogLevel = S3LogLevel.Fatal, num_event_loop_threads: int = 1 +) -> None: ... +def ensure_s3_initialized() -> None: ... +def finalize_s3() -> None: ... +def ensure_s3_finalized() -> None: ... 
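# Usage sketch of the KMS-client-factory pattern described by the
# CryptoFactory/KmsClient stubs above, assuming the public
# pyarrow.parquet.encryption module; the base64 "KMS" below is a toy stand-in
# for a real key service, and all key names are illustrative assumptions.
import base64
import pyarrow.parquet.encryption as pe

class ToyKmsClient(pe.KmsClient):
    def __init__(self, config: pe.KmsConnectionConfig):
        super().__init__()
        self._conf = config.custom_kms_conf  # unused by the toy wrap/unwrap

    def wrap_key(self, key_bytes, master_key_identifier):
        return base64.b64encode(key_bytes).decode()  # "wrap" = encode only

    def unwrap_key(self, wrapped_key, master_key_identifier):
        return base64.b64decode(wrapped_key)

factory = pe.CryptoFactory(lambda config: ToyKmsClient(config))
encryption_properties = factory.file_encryption_properties(
    pe.KmsConnectionConfig(custom_kms_conf={"footer_key": "0" * 16}),
    pe.EncryptionConfiguration(footer_key="footer_key",
                               column_keys={"col_key": ["v"]}),
)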
+def resolve_s3_region(bucket: str) -> str: ... + +class S3RetryStrategy: + max_attempts: int + def __init__(self, max_attempts=3) -> None: ... + +class AwsStandardS3RetryStrategy(S3RetryStrategy): ... +class AwsDefaultS3RetryStrategy(S3RetryStrategy): ... + +class S3FileSystem(FileSystem): + def __init__( + self, + *, + access_key: str | None = None, + secret_key: str | None = None, + session_token: str | None = None, + anonymous: bool = False, + region: str | None = None, + request_timeout: float | None = None, + connect_timeout: float | None = None, + scheme: Literal["http", "https"] = "https", + endpoint_override: str | None = None, + background_writes: bool = True, + default_metadata: dict | KeyValueMetadata | None = None, + role_arn: str | None = None, + session_name: str | None = None, + external_id: str | None = None, + load_frequency: int = 900, + proxy_options: _ProxyOptions | str | None = None, + allow_bucket_creation: bool = False, + allow_bucket_deletion: bool = False, + check_directory_existence_before_creation: bool = False, + retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3), + force_virtual_addressing: bool = False, + ): ... + @property + def region(self) -> str: ... diff --git a/python/pyarrow/_stubs_typing.pyi b/python/pyarrow/_stubs_typing.pyi new file mode 100644 index 00000000000..c259513f1ea --- /dev/null +++ b/python/pyarrow/_stubs_typing.pyi @@ -0,0 +1,80 @@ +import datetime as dt + +from collections.abc import Sequence +from decimal import Decimal +from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar + +import numpy as np + +from numpy.typing import NDArray + +from .compute import BooleanArray, IntegerArray + +ArrayLike: TypeAlias = Any +ScalarLike: TypeAlias = Any +Order: TypeAlias = Literal["ascending", "descending"] +JoinType: TypeAlias = Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", +] +Compression: TypeAlias = Literal[ + "gzip", "bz2", "brotli", "lz4", "lz4_frame", "lz4_raw", "zstd", "snappy" +] +NullEncoding: TypeAlias = Literal["mask", "encode"] +NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"] +Mask: TypeAlias = Sequence[bool | None] | NDArray[np.bool_] | BooleanArray +Indices: TypeAlias = Sequence[int] | NDArray[np.integer[Any]] | IntegerArray +PyScalar: TypeAlias = ( + bool | int | float | Decimal | str | bytes | dt.date | dt.datetime | dt.time | dt.timedelta +) + +_T = TypeVar("_T") +SingleOrList: TypeAlias = list[_T] | _T + +class SupportEq(Protocol): + def __eq__(self, other) -> bool: ... + +class SupportLt(Protocol): + def __lt__(self, other) -> bool: ... + +class SupportGt(Protocol): + def __gt__(self, other) -> bool: ... + +class SupportLe(Protocol): + def __le__(self, other) -> bool: ... + +class SupportGe(Protocol): + def __ge__(self, other) -> bool: ... + +FilterTuple: TypeAlias = ( + tuple[str, Literal["=", "==", "!="], SupportEq] + | tuple[str, Literal["<"], SupportLt] + | tuple[str, Literal[">"], SupportGt] + | tuple[str, Literal["<="], SupportLe] + | tuple[str, Literal[">="], SupportGe] + | tuple[str, Literal["in", "not in"], Collection] +) + +class Buffer(Protocol): + def __buffer__(self, flags: int, /) -> memoryview: ... + +class SupportPyBuffer(Protocol): + def __buffer__(self, flags: int, /) -> memoryview: ... + +class SupportArrowStream(Protocol): + def __arrow_c_stream__(self, requested_schema=None) -> Any: ... 
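# Usage sketch: constructing the S3FileSystem typed above via pyarrow.fs.
# Bucket, key and region are illustrative assumptions; anonymous access is used
# so no credentials are needed.
from pyarrow.fs import S3FileSystem

fs = S3FileSystem(
    region="us-east-1",
    anonymous=True,        # unsigned requests; public buckets only
    request_timeout=10,
    connect_timeout=5,
)
info = fs.get_file_info("some-public-bucket/some/key.parquet")
print(info.type, info.size)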
+
+class SupportArrowArray(Protocol):
+    def __arrow_c_array__(self, requested_schema=None) -> Any: ...
+
+class SupportArrowDeviceArray(Protocol):
+    def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ...
+
+class SupportArrowSchema(Protocol):
+    def __arrow_c_schema__(self) -> Any: ...
diff --git a/python/pyarrow/_substrait.pyi b/python/pyarrow/_substrait.pyi
new file mode 100644
index 00000000000..ff226e9521b
--- /dev/null
+++ b/python/pyarrow/_substrait.pyi
@@ -0,0 +1,39 @@
+from typing import Any, Callable
+
+from ._compute import Expression
+from .lib import Buffer, RecordBatchReader, Schema, Table, _Weakrefable
+
+def run_query(
+    plan: Buffer | int,
+    *,
+    table_provider: Callable[[list[str], Schema], Table] | None = None,
+    use_threads: bool = True,
+) -> RecordBatchReader: ...
+def _parse_json_plan(plan: bytes) -> Buffer: ...
+
+class SubstraitSchema:
+    schema: Schema
+    expression: Expression
+    def __init__(self, schema: Schema, expression: Expression) -> None: ...
+    def to_pysubstrait(self) -> Any: ...
+
+def serialize_schema(schema: Schema) -> SubstraitSchema: ...
+def deserialize_schema(buf: Buffer | bytes) -> Schema: ...
+def serialize_expressions(
+    exprs: list[Expression],
+    names: list[str],
+    schema: Schema,
+    *,
+    allow_arrow_extensions: bool = False,
+) -> Buffer: ...
+
+class BoundExpressions(_Weakrefable):
+    @property
+    def schema(self) -> Schema: ...
+    @property
+    def expressions(self) -> dict[str, Expression]: ...
+    @classmethod
+    def from_substrait(cls, message: Buffer | bytes) -> BoundExpressions: ...
+
+def deserialize_expressions(buf: Buffer | bytes) -> BoundExpressions: ...
+def get_supported_functions() -> list[str]: ...
diff --git a/python/pyarrow/acero.pyi b/python/pyarrow/acero.pyi
new file mode 100644
index 00000000000..8a520bdc24a
--- /dev/null
+++ b/python/pyarrow/acero.pyi
@@ -0,0 +1,85 @@
+import sys
+
+if sys.version_info >= (3, 11):
+    from typing import Self
+else:
+    from typing_extensions import Self
+if sys.version_info >= (3, 10):
+    from typing import TypeAlias
+else:
+    from typing_extensions import TypeAlias
+from typing import Literal
+
+from . import lib
+from .compute import Expression, FunctionOptions
+
+_StrOrExpr: TypeAlias = str | Expression
+
+class Declaration(lib._Weakrefable):
+    def __init__(
+        self,
+        factory_name: str,
+        options: ExecNodeOptions,
+        inputs: list[Declaration] | None = None,
+    ) -> None: ...
+    @classmethod
+    def from_sequence(cls, decls: list[Declaration]) -> Self: ...
+    def to_reader(self, use_threads: bool = True) -> lib.RecordBatchReader: ...
+    def to_table(self, use_threads: bool = True) -> lib.Table: ...
+
+class ExecNodeOptions(lib._Weakrefable): ...
+
+class TableSourceNodeOptions(ExecNodeOptions):
+    def __init__(self, table: lib.Table) -> None: ...
+
+class FilterNodeOptions(ExecNodeOptions):
+    def __init__(self, filter_expression: Expression) -> None: ...
+
+class ProjectNodeOptions(ExecNodeOptions):
+    def __init__(self, expressions: list[Expression], names: list[str] | None = None) -> None: ...
+
+class AggregateNodeOptions(ExecNodeOptions):
+    def __init__(
+        self,
+        aggregates: list[tuple[list[str], str, FunctionOptions, str]],
+        keys: list[_StrOrExpr] | None = None,
+    ) -> None: ...
+
+class OrderByNodeOptions(ExecNodeOptions):
+    def __init__(
+        self,
+        sort_keys: tuple[tuple[str, Literal["ascending", "descending"]], ...] = (),
+        *,
+        null_placement: Literal["at_start", "at_end"] = "at_end",
+    ) -> None: ...
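# Usage sketch: composing the Declaration and *NodeOptions classes typed above
# into a small Acero plan. The table, the filter expression and the "hash_sum"
# grouped aggregate are illustrative assumptions.
import pyarrow as pa
import pyarrow.acero as acero
import pyarrow.compute as pc

table = pa.table({"k": ["a", "a", "b"], "v": [1, 2, 3]})
plan = acero.Declaration.from_sequence([
    acero.Declaration("table_source", acero.TableSourceNodeOptions(table)),
    acero.Declaration("filter", acero.FilterNodeOptions(pc.field("v") > 1)),
    acero.Declaration(
        "aggregate",
        acero.AggregateNodeOptions([("v", "hash_sum", None, "v_sum")], keys=["k"]),
    ),
])
print(plan.to_table())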
+ +class HashJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + join_type: Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", + ], + left_keys: _StrOrExpr | list[_StrOrExpr], + right_keys: _StrOrExpr | list[_StrOrExpr], + left_output: list[_StrOrExpr] | None = None, + right_output: list[_StrOrExpr] | None = None, + output_suffix_for_left: str = "", + output_suffix_for_right: str = "", + ) -> None: ... + +class AsofJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + left_on: _StrOrExpr, + left_by: _StrOrExpr | list[_StrOrExpr], + right_on: _StrOrExpr, + right_by: _StrOrExpr | list[_StrOrExpr], + tolerance: int, + ) -> None: ... diff --git a/python/pyarrow/benchmark.pyi b/python/pyarrow/benchmark.pyi new file mode 100644 index 00000000000..048973301dc --- /dev/null +++ b/python/pyarrow/benchmark.pyi @@ -0,0 +1,3 @@ +from pyarrow.lib import benchmark_PandasObjectIsNull + +__all__ = ["benchmark_PandasObjectIsNull"] diff --git a/python/pyarrow/cffi.pyi b/python/pyarrow/cffi.pyi new file mode 100644 index 00000000000..2ae945c5974 --- /dev/null +++ b/python/pyarrow/cffi.pyi @@ -0,0 +1,4 @@ +import cffi + +c_source: str +ffi: cffi.FFI diff --git a/python/pyarrow/compute.pyi b/python/pyarrow/compute.pyi new file mode 100644 index 00000000000..8d8fc35b134 --- /dev/null +++ b/python/pyarrow/compute.pyi @@ -0,0 +1,7779 @@ +# ruff: noqa: I001 +from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence +from collections.abc import Callable + +# Option classes +from pyarrow._compute import ArraySortOptions as ArraySortOptions +from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions +from pyarrow._compute import CastOptions as CastOptions +from pyarrow._compute import CountOptions as CountOptions +from pyarrow._compute import CumulativeOptions as CumulativeOptions +from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions +from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions +from pyarrow._compute import DictionaryEncodeOptions as DictionaryEncodeOptions +from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions + +# Expressions +from pyarrow._compute import Expression as Expression +from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions +from pyarrow._compute import ExtractRegexSpanOptions as ExtractRegexSpanOptions +from pyarrow._compute import FilterOptions as FilterOptions +from pyarrow._compute import Function as Function +from pyarrow._compute import FunctionOptions as FunctionOptions +from pyarrow._compute import FunctionRegistry as FunctionRegistry +from pyarrow._compute import HashAggregateFunction as HashAggregateFunction +from pyarrow._compute import HashAggregateKernel as HashAggregateKernel +from pyarrow._compute import IndexOptions as IndexOptions +from pyarrow._compute import JoinOptions as JoinOptions +from pyarrow._compute import Kernel as Kernel +from pyarrow._compute import ListFlattenOptions as ListFlattenOptions +from pyarrow._compute import ListSliceOptions as ListSliceOptions +from pyarrow._compute import MakeStructOptions as MakeStructOptions +from pyarrow._compute import MapLookupOptions as MapLookupOptions +from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions +from pyarrow._compute import ModeOptions as ModeOptions +from pyarrow._compute import NullOptions as NullOptions +from pyarrow._compute import PadOptions as PadOptions +from 
pyarrow._compute import PairwiseOptions as PairwiseOptions +from pyarrow._compute import PartitionNthOptions as PartitionNthOptions +from pyarrow._compute import PivotWiderOptions as PivotWiderOptions +from pyarrow._compute import QuantileOptions as QuantileOptions +from pyarrow._compute import RandomOptions as RandomOptions +from pyarrow._compute import RankOptions as RankOptions +from pyarrow._compute import RankQuantileOptions as RankQuantileOptions +from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions +from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions +from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions +from pyarrow._compute import RoundOptions as RoundOptions +from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions +from pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions +from pyarrow._compute import RunEndEncodeOptions as RunEndEncodeOptions +from pyarrow._compute import ScalarAggregateFunction as ScalarAggregateFunction +from pyarrow._compute import ScalarAggregateKernel as ScalarAggregateKernel +from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions +from pyarrow._compute import ScalarFunction as ScalarFunction +from pyarrow._compute import ScalarKernel as ScalarKernel +from pyarrow._compute import SelectKOptions as SelectKOptions +from pyarrow._compute import SetLookupOptions as SetLookupOptions +from pyarrow._compute import SkewOptions as SkewOptions +from pyarrow._compute import SliceOptions as SliceOptions +from pyarrow._compute import SortOptions as SortOptions +from pyarrow._compute import SplitOptions as SplitOptions +from pyarrow._compute import SplitPatternOptions as SplitPatternOptions +from pyarrow._compute import StrftimeOptions as StrftimeOptions +from pyarrow._compute import StrptimeOptions as StrptimeOptions +from pyarrow._compute import StructFieldOptions as StructFieldOptions +from pyarrow._compute import TakeOptions as TakeOptions +from pyarrow._compute import TDigestOptions as TDigestOptions +from pyarrow._compute import TrimOptions as TrimOptions +from pyarrow._compute import UdfContext as UdfContext +from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions +from pyarrow._compute import VarianceOptions as VarianceOptions +from pyarrow._compute import VectorFunction as VectorFunction +from pyarrow._compute import VectorKernel as VectorKernel +from pyarrow._compute import WeekOptions as WeekOptions +from pyarrow._compute import WinsorizeOptions as WinsorizeOptions + +# Functions +from pyarrow._compute import call_function as call_function + +# Udf +from pyarrow._compute import call_tabular_function as call_tabular_function +from pyarrow._compute import function_registry as function_registry +from pyarrow._compute import get_function as get_function +from pyarrow._compute import list_functions as list_functions +from pyarrow._compute import register_aggregate_function as register_aggregate_function +from pyarrow._compute import register_scalar_function as register_scalar_function +from pyarrow._compute import register_tabular_function as register_tabular_function +from pyarrow._compute import register_vector_function as register_vector_function + +from pyarrow._compute import _Order, _Placement +from pyarrow._stubs_typing import ArrayLike, ScalarLike +from . import lib + +_P = ParamSpec("_P") +_R = TypeVar("_R") + +def field(*name_or_index: str | tuple[str, ...] 
| int) -> Expression: + """Reference a column of the dataset. + + Stores only the field's name. Type and other information is known only when + the expression is bound to a dataset having an explicit scheme. + + Nested references are allowed by passing multiple names or a tuple of + names. For example ``('foo', 'bar')`` references the field named "bar" + inside the field named "foo". + + Parameters + ---------- + *name_or_index : string, multiple strings, tuple or int + The name or index of the (possibly nested) field the expression + references to. + + Returns + ------- + field_expr : Expression + Reference to the given field + + Examples + -------- + >>> import pyarrow.compute as pc + >>> pc.field("a") + + >>> pc.field(1) + + >>> pc.field(("a", "b")) + >> pc.field("a", "b") + Expression: + """Expression representing a scalar value. + + Creates an Expression object representing a scalar value that can be used + in compute expressions and predicates. + + Parameters + ---------- + value : bool, int, float or string + Python value of the scalar. This function accepts any value that can be + converted to a ``pyarrow.Scalar`` using ``pa.scalar()``. + + Notes + ----- + This function differs from ``pyarrow.scalar()`` in the following way: + + * ``pyarrow.scalar()`` creates a ``pyarrow.Scalar`` object that represents + a single value in Arrow's memory model. + * ``pyarrow.compute.scalar()`` creates an ``Expression`` object representing + a scalar value that can be used in compute expressions, predicates, and + dataset filtering operations. + + Returns + ------- + scalar_expr : Expression + An Expression representing the scalar value + """ + +def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ... + +# ============= compute functions ============= +_DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType) +_Scalar_CoT = TypeVar("_Scalar_CoT", bound=lib.Scalar, covariant=True) +_ScalarT = TypeVar("_ScalarT", bound=lib.Scalar) +_ArrayT = TypeVar("_ArrayT", bound=lib.Array | lib.ChunkedArray) +_ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | lib.Scalar | lib.ChunkedArray) +ArrayOrChunkedArray: TypeAlias = lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT] +ScalarOrArray: TypeAlias = ArrayOrChunkedArray[_Scalar_CoT] | _Scalar_CoT + +SignedIntegerScalar: TypeAlias = ( + lib.Scalar[lib.Int8Type] + | lib.Scalar[lib.Int16Type] + | lib.Scalar[lib.Int32Type] + | lib.Scalar[lib.Int64Type] +) +UnsignedIntegerScalar: TypeAlias = ( + lib.Scalar[lib.UInt8Type] + | lib.Scalar[lib.UInt16Type] + | lib.Scalar[lib.Uint32Type] + | lib.Scalar[lib.UInt64Type] +) +IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar +FloatScalar: TypeAlias = ( + lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type] +) +DecimalScalar: TypeAlias = ( + lib.Scalar[lib.Decimal32Type] + | lib.Scalar[lib.Decimal64Type] + | lib.Scalar[lib.Decimal128Type] + | lib.Scalar[lib.Decimal256Type] +) +NonFloatNumericScalar: TypeAlias = IntegerScalar | DecimalScalar +NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar +BinaryScalar: TypeAlias = ( + lib.Scalar[lib.BinaryType] + | lib.Scalar[lib.LargeBinaryType] + | lib.Scalar[lib.FixedSizeBinaryType] +) +StringScalar: TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType] +StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar +_ListScalar: TypeAlias = lib.ListViewScalar[_DataTypeT] | lib.FixedSizeListScalar[_DataTypeT, Any] +_LargeListScalar: TypeAlias = lib.LargeListScalar[_DataTypeT] 
| lib.LargeListViewScalar[_DataTypeT] +ListScalar: TypeAlias = ( + lib.ListScalar[_DataTypeT] | _ListScalar[_DataTypeT] | _LargeListScalar[_DataTypeT] +) +TemporalScalar: TypeAlias = ( + lib.Date32Scalar + | lib.Date64Scalar + | lib.Time32Scalar[Any] + | lib.Time64Scalar[Any] + | lib.TimestampScalar[Any] + | lib.DurationScalar[Any] + | lib.MonthDayNanoIntervalScalar +) +NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar +NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar + +_NumericOrTemporalScalarT = TypeVar("_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) +NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] +_NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) +_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) +_NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) +NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] +_NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) +NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalScalarT] +_NumericOrTemporalArrayT = TypeVar("_NumericOrTemporalArrayT", bound=NumericOrTemporalArray) +BooleanArray: TypeAlias = ArrayOrChunkedArray[lib.BooleanScalar] +_BooleanArrayT = TypeVar("_BooleanArrayT", bound=BooleanArray) +IntegerArray: TypeAlias = ArrayOrChunkedArray[IntegerScalar] +_FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar) +FloatArray: TypeAlias = ArrayOrChunkedArray[FloatScalar] +_FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray) +_StringScalarT = TypeVar("_StringScalarT", bound=StringScalar) +StringArray: TypeAlias = ArrayOrChunkedArray[StringScalar] +_StringArrayT = TypeVar("_StringArrayT", bound=StringArray) +_BinaryScalarT = TypeVar("_BinaryScalarT", bound=BinaryScalar) +BinaryArray: TypeAlias = ArrayOrChunkedArray[BinaryScalar] +_BinaryArrayT = TypeVar("_BinaryArrayT", bound=BinaryArray) +_StringOrBinaryScalarT = TypeVar("_StringOrBinaryScalarT", bound=StringOrBinaryScalar) +StringOrBinaryArray: TypeAlias = StringArray | BinaryArray +_StringOrBinaryArrayT = TypeVar("_StringOrBinaryArrayT", bound=StringOrBinaryArray) +_TemporalScalarT = TypeVar("_TemporalScalarT", bound=TemporalScalar) +TemporalArray: TypeAlias = ArrayOrChunkedArray[TemporalScalar] +_TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray) +_ListArray: TypeAlias = ArrayOrChunkedArray[_ListScalar[_DataTypeT]] +_LargeListArray: TypeAlias = ArrayOrChunkedArray[_LargeListScalar[_DataTypeT]] +ListArray: TypeAlias = ArrayOrChunkedArray[ListScalar[_DataTypeT]] +# =============================== 1. Aggregation =============================== + +# ========================= 1.1 functions ========================= + +def all( + array: lib.BooleanScalar | BooleanArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: + """ + Test whether all elements in a boolean array evaluate to true. + + Null values are ignored by default. + If the `skip_nulls` option is set to false, then Kleene logic is used. + See "kleene_and" for more details on Kleene logic. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. 
+ min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +any = _clone_signature(all) +""" +Test whether any element in a boolean array evaluates to true. + +Null values are ignored by default. +If the `skip_nulls` option is set to false, then Kleene logic is used. +See "kleene_or" for more details on Kleene logic. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def approximate_median( + array: NumericScalar | NumericArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: + """ + Approximate median of a numeric array with T-Digest algorithm. + + Nulls and NaNs are ignored. + A null scalar is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def count( + array: lib.Array | lib.ChunkedArray, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: + """ + Count the number of null / non-null values. + + By default, only non-null values are counted. + This can be changed through CountOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". + options : pyarrow.compute.CountOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def count_distinct( + array: lib.Array | lib.ChunkedArray, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: + """ + Count the number of unique values. + + By default, only non-null values are counted. + This can be changed through CountOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. 
+ mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". + options : pyarrow.compute.CountOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def first( + array: lib.Array[_ScalarT] | lib.ChunkedArray[_ScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarT: + """ + Compute the first value in each group. + + Null values are ignored by default. + If skip_nulls = false, then this will return the first and last values + regardless if it is null + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def first_last( + array: lib.Array[Any] | lib.ChunkedArray[Any], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: + """ + Compute the first and last values of an array. + + Null values are ignored by default. + If skip_nulls = false, then this will return the first and last values + regardless if it is null + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def index( + data: lib.Array[Any] | lib.ChunkedArray[Any], + value, + start: int | None = None, + end: int | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: + """ + Find the index of the first occurrence of a given value. + + Parameters + ---------- + data : Array-like + value : Scalar-like object + The value to search for. + start : int, optional + end : int, optional + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + index : int + the index, or -1 if not found + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["Lorem", "ipsum", "dolor", "sit", "Lorem", "ipsum"]) + >>> pc.index(arr, "ipsum") + + >>> pc.index(arr, "ipsum", start=2) + + >>> pc.index(arr, "amet") + + """ + +last = _clone_signature(first) +""" +Compute the first and last values of an array. + +Null values are ignored by default. 
+If skip_nulls = false, then this will return the first and last values
+regardless if it is null
+
+Parameters
+----------
+array : Array-like
+    Argument to compute function.
+skip_nulls : bool, default True
+    Whether to skip (ignore) nulls in the input.
+    If False, any null in the input forces the output to null.
+min_count : int, default 1
+    Minimum number of non-null values in the input. If the number
+    of non-null values is below `min_count`, the output is null.
+options : pyarrow.compute.ScalarAggregateOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+max = _clone_signature(first)
+"""
+Compute the minimum or maximum values of a numeric array.
+
+Null values are ignored by default.
+This can be changed through ScalarAggregateOptions.
+
+Parameters
+----------
+array : Array-like
+    Argument to compute function.
+skip_nulls : bool, default True
+    Whether to skip (ignore) nulls in the input.
+    If False, any null in the input forces the output to null.
+min_count : int, default 1
+    Minimum number of non-null values in the input. If the number
+    of non-null values is below `min_count`, the output is null.
+options : pyarrow.compute.ScalarAggregateOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+min = _clone_signature(first)
+"""
+Compute the minimum or maximum values of a numeric array.
+
+Null values are ignored by default.
+This can be changed through ScalarAggregateOptions.
+
+Parameters
+----------
+array : Array-like
+    Argument to compute function.
+skip_nulls : bool, default True
+    Whether to skip (ignore) nulls in the input.
+    If False, any null in the input forces the output to null.
+min_count : int, default 1
+    Minimum number of non-null values in the input. If the number
+    of non-null values is below `min_count`, the output is null.
+options : pyarrow.compute.ScalarAggregateOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+min_max = _clone_signature(first_last)
+"""
+Compute the minimum and maximum values of a numeric array.
+
+Null values are ignored by default.
+This can be changed through ScalarAggregateOptions.
+
+Parameters
+----------
+array : Array-like
+    Argument to compute function.
+skip_nulls : bool, default True
+    Whether to skip (ignore) nulls in the input.
+    If False, any null in the input forces the output to null.
+min_count : int, default 1
+    Minimum number of non-null values in the input. If the number
+    of non-null values is below `min_count`, the output is null.
+options : pyarrow.compute.ScalarAggregateOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+""" + +@overload +def mean( + array: FloatScalar | FloatArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... +@overload +def mean( + array: lib.NumericArray[lib.Decimal128Scalar] + | lib.ChunkedArray[lib.Decimal128Scalar] + | lib.Decimal128Scalar, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Decimal128Scalar: ... +@overload +def mean( + array: lib.NumericArray[lib.Decimal256Scalar] + | lib.ChunkedArray[lib.Decimal256Scalar] + | lib.Decimal256Scalar, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Decimal256Scalar: ... +def mean(*args, **kwargs): + """ + Compute the mean of a numeric array. + + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. + The result is a double for integer and floating point arguments, + and a decimal with the same bit-width/precision/scale for decimal arguments. + For integers and floats, NaN is returned if min_count = 0 and + there are no values. For decimals, null is returned instead. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def mode( + array: NumericScalar | NumericArray, + /, + n: int = 1, + *, + skip_nulls: bool = True, + min_count: int = 0, + options: ModeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: + """ + Compute the modal (most common) values of a numeric array. + + Compute the n most common values and their respective occurrence counts. + The output has type `struct`, where T is the + input type. + The results are ordered by descending `count` first, and ascending `mode` + when breaking ties. + Nulls are ignored. If there are no non-null values in the array, + an empty array is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + n : int, default 1 + Number of distinct most-common values to return. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ModeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2]) + >>> modes = pc.mode(arr, 2) + >>> modes[0] + + >>> modes[1] + + """ + +def product( + array: _ScalarT | lib.NumericArray[_ScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarT: + """ + Compute the product of values in a numeric array. + + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def quantile( + array: NumericScalar | NumericArray, + /, + q: float = 0.5, + *, + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + options: QuantileOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: + """ + Compute an array of quantiles of a numeric array or chunked array. + + By default, 0.5 quantile (median) is returned. + If quantile lies between two data points, an interpolated value is + returned based on selected interpolation method. + Nulls and NaNs are ignored. + An array of nulls is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to compute. All values must be in + [0, 1]. + interpolation : str, default "linear" + How to break ties between competing data points for a given quantile. + Accepted values are: + + - "linear": compute an interpolation + - "lower": always use the smallest of the two data points + - "higher": always use the largest of the two data points + - "nearest": select the data point that is closest to the quantile + - "midpoint": compute the (unweighted) mean of the two data points + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.QuantileOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def stddev( + array: NumericScalar | NumericArray, + /, + *, + ddof: float = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: + """ + Calculate the standard deviation of a numeric array. + + The number of degrees of freedom can be controlled using VarianceOptions. 
+ By default (`ddof` = 0), the population standard deviation is calculated. + Nulls are ignored. If there are not enough non-null values in the array + to satisfy `ddof`, null is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.VarianceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def sum( + array: _NumericScalarT | NumericArray[_NumericScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: + """ + Compute the sum of a numeric array. + + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def tdigest( + array: NumericScalar | NumericArray, + /, + q: float = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + options: TDigestOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: + """ + Approximate quantiles of a numeric array with T-Digest algorithm. + + By default, 0.5 quantile (median) is returned. + Nulls and NaNs are ignored. + An array of nulls is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to approximate. All values must be + in [0, 1]. + delta : int, default 100 + Compression parameter for the T-digest algorithm. + buffer_size : int, default 500 + Buffer size for the T-digest algorithm. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.TDigestOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+ + """ + +def variance( + array: NumericScalar | NumericArray, + /, + *, + ddof: int = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: + """ + Calculate the variance of a numeric array. + + The number of degrees of freedom can be controlled using VarianceOptions. + By default (`ddof` = 0), the population variance is calculated. + Nulls are ignored. If there are not enough non-null values in the array + to satisfy `ddof`, null is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.VarianceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def top_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: + """ + Select the indices of the top-k ordered elements from array- or table-like + data. + + This is a specialization for :func:`select_k_unstable`. Output is not + guaranteed to be stable. + + Parameters + ---------- + values : Array, ChunkedArray, RecordBatch, or Table + Data to sort and get top indices from. + k : int + The number of `k` elements to keep. + sort_keys : List-like + Column key names to order by when input is table-like data. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array + Indices of the top-k ordered elements + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) + >>> pc.top_k_unstable(arr, k=3) + + [ + 5, + 4, + 2 + ] + """ + +def bottom_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: + """ + Select the indices of the bottom-k ordered elements from + array- or table-like data. + + This is a specialization for :func:`select_k_unstable`. Output is not + guaranteed to be stable. + + Parameters + ---------- + values : Array, ChunkedArray, RecordBatch, or Table + Data to sort and get bottom indices from. + k : int + The number of `k` elements to keep. + sort_keys : List-like + Column key names to order by when input is table-like data. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array of indices + Indices of the bottom-k ordered elements + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) + >>> pc.bottom_k_unstable(arr, k=3) + + [ + 0, + 1, + 2 + ] + """ + +# ========================= 2. 
Element-wise (“scalar”) functions ========================= + +# ========================= 2.1 Arithmetic ========================= +@overload +def abs( + x: _NumericOrDurationT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationT: ... +@overload +def abs( + x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationArrayT: ... +@overload +def abs(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def abs(*args, **kwargs): + """ + Calculate the absolute value of the argument element-wise. + + Results will wrap around on integer overflow. + Use function "abs_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +abs_checked = _clone_signature(abs) +""" +Calculate the absolute value of the argument element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "abs". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def add( + x: _NumericOrTemporalScalarT, + y: _NumericOrTemporalScalarT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT: ... +@overload +def add( + x: _NumericOrTemporalArrayT, + y: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def add( + x: Expression, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def add( + x: NumericOrTemporalScalar, + y: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def add( + x: _NumericOrTemporalArrayT, + y: NumericOrTemporalScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def add( + x: NumericOrTemporalScalar, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def add( + x: Expression, y: NumericOrTemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def add(*args, **kwargs): + """ + Add the arguments element-wise. + + Results will wrap around on integer overflow. + Use function "add_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +add_checked = _clone_signature(add) +""" +Add the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "add". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
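+
+Examples
+--------
+A minimal illustrative example (not part of the upstream docstring); int8
+values are chosen to force overflow, where "add" wraps around while
+"add_checked" raises an error:
+
+>>> import pyarrow as pa
+>>> import pyarrow.compute as pc
+>>> pc.add(pa.scalar(120, pa.int8()), pa.scalar(10, pa.int8())).as_py()
+-126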
+ +""" + +@overload +def divide( + dividend: _NumericOrTemporalScalarT, + divisor: _NumericOrTemporalScalarT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT: ... +@overload +def divide( + dividend: _NumericOrTemporalArrayT, + divisor: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def divide( + dividend: Expression, + divisor: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def divide( + dividend: NumericOrTemporalScalar, + divisor: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def divide( + dividend: _NumericOrTemporalArrayT, + divisor: NumericOrTemporalScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def divide( + dividend: NumericOrTemporalScalar, + divisor: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def divide( + dividend: Expression, + divisor: NumericOrTemporalScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def divide(*args, **kwargs): + """ + Divide the arguments element-wise. + + Integer division by zero returns an error. However, integer overflow + wraps around, and floating-point division by zero returns an infinite. + Use function "divide_checked" if you want to get an error + in all the aforementioned cases. + + Parameters + ---------- + dividend : Array-like or scalar-like + Argument to compute function. + divisor : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +divide_checked = _clone_signature(divide) +""" +Divide the arguments element-wise. + +An error is returned when trying to divide by zero, or when +integer overflow is encountered. + +Parameters +---------- +dividend : Array-like or scalar-like + Argument to compute function. +divisor : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def exp( + exponent: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _FloatArrayT: ... +@overload +def exp( + exponent: ArrayOrChunkedArray[NonFloatNumericScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... +@overload +def exp( + exponent: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _FloatScalarT: ... +@overload +def exp( + exponent: NonFloatNumericScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.DoubleScalar: ... +@overload +def exp(exponent: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def exp(*args, **kwargs): + """ + Compute Euler's number raised to the power of specified exponent, element-wise. + + If exponent is null the result will be null. + + Parameters + ---------- + exponent : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +multiply = _clone_signature(add) +""" +Multiply the arguments element-wise. + +Results will wrap around on integer overflow. +Use function "multiply_checked" if you want overflow +to return an error. 
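+
+For example (illustrative, not part of the upstream docstring; ``pa``/``pc``
+are the usual ``pyarrow`` and ``pyarrow.compute`` imports)::
+
+    pc.multiply(pa.array([2, 3, 4]), 10)   # -> [20, 30, 40]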
+ +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +multiply_checked = _clone_signature(add) +""" +Multiply the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "multiply". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def negate( + x: _NumericOrDurationT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationT: ... +@overload +def negate( + x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationArrayT: ... +@overload +def negate(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def negate(*args, **kwargs): + """ + Negate the argument element-wise. + + Results will wrap around on integer overflow. + Use function "negate_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +negate_checked = _clone_signature(negate) +""" +Negate the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "negate". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def power( + base: _NumericScalarT, + exponent: _NumericScalarT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def power( + base: _NumericArrayT, + exponent: _NumericArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def power( + base: Expression, + exponent: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def power( + base: _NumericArrayT, + exponent: NumericScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def power( + base: NumericScalar, + exponent: _NumericArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def power( + base: NumericScalar, + exponent: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def power( + base: Expression, + exponent: NumericScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def power(*args, **kwargs): + """ + Raise arguments to power element-wise. + + Integer to negative integer power returns an error. However, integer overflow + wraps around. If either base or exponent is null the result will be null. + + Parameters + ---------- + base : Array-like or scalar-like + Argument to compute function. + exponent : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
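+
+    Examples
+    --------
+    A minimal illustrative example (not part of the upstream docstring):
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> pc.power(pa.array([2, 3, 4]), 2).to_pylist()
+    [4, 9, 16]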
+ """ + +power_checked = _clone_signature(power) +""" +Raise arguments to power element-wise. + +An error is returned when integer to negative integer power is encountered, +or integer overflow is encountered. + +Parameters +---------- +base : Array-like or scalar-like + Argument to compute function. +exponent : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def sign( + x: NumericOrDurationArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + lib.NumericArray[lib.Int8Scalar] + | lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] +): ... +@overload +def sign( + x: NumericOrDurationScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar: ... +@overload +def sign(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def sign(*args, **kwargs): + """ + Get the signedness of the arguments element-wise. + + Output is any of (-1,1) for nonzero inputs and 0 for zero input. + NaN values return NaN. Integral values return signedness as Int8 and + floating-point values return it with the same type as the input values. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +@overload +def sqrt(x: NumericArray, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatArray: ... +@overload +def sqrt(x: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatScalar: ... +@overload +def sqrt(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def sqrt(*args, **kwargs): + """ + Takes the square root of arguments element-wise. + + A negative argument returns a NaN. For a variant that returns an + error, use function "sqrt_checked". + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +sqrt_checked = _clone_signature(sqrt) +""" +Takes the square root of arguments element-wise. + +A negative argument returns an error. For a variant that returns a +NaN, use function "sqrt". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +subtract = _clone_signature(add) +""" +Subtract the arguments element-wise. + +Results will wrap around on integer overflow. +Use function "subtract_checked" if you want overflow +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +subtract_checked = _clone_signature(add) +""" +Subtract the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "subtract". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.1 Bit-wise functions ========================= +@overload +def bit_wise_and( + x: _NumericScalarT, y: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT: ... +@overload +def bit_wise_and( + x: _NumericArrayT, + y: _NumericArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def bit_wise_and( + x: NumericScalar, y: _NumericArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericArrayT: ... +@overload +def bit_wise_and( + x: _NumericArrayT, y: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericArrayT: ... +@overload +def bit_wise_and( + x: Expression, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def bit_wise_and( + x: Expression, + y: NumericScalar | ArrayOrChunkedArray[NumericScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def bit_wise_and( + x: NumericScalar | ArrayOrChunkedArray[NumericScalar], + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def bit_wise_and(*args, **kwargs): + """ + Bit-wise AND the arguments element-wise. + + Null values return null. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def bit_wise_not( + x: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT: ... +@overload +def bit_wise_not( + x: _NumericArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericArrayT: ... +@overload +def bit_wise_not(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def bit_wise_not(*args, **kwargs): + """ + Bit-wise negate the arguments element-wise. + + Null values return null. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +bit_wise_or = _clone_signature(bit_wise_and) +""" +Bit-wise OR the arguments element-wise. + +Null values return null. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +bit_wise_xor = _clone_signature(bit_wise_and) +""" +Bit-wise XOR the arguments element-wise. + +Null values return null. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_left = _clone_signature(bit_wise_and) +""" +Left shift `x` by `y`. + +The shift operates as if on the two's complement representation of the number. +In other words, this is equivalent to multiplying `x` by 2 to the power `y`, +even if overflow occurs. +`x` is returned if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. 
+Use function "shift_left_checked" if you want an invalid shift amount +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_left_checked = _clone_signature(bit_wise_and) +""" +Left shift `x` by `y`. + +The shift operates as if on the two's complement representation of the number. +In other words, this is equivalent to multiplying `x` by 2 to the power `y`, +even if overflow occurs. +An error is raised if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +See "shift_left" for a variant that doesn't fail for an invalid shift amount. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_right = _clone_signature(bit_wise_and) +""" +Right shift `x` by `y`. + +This is equivalent to dividing `x` by 2 to the power `y`. +`x` is returned if `y` (the amount to shift by) is: (1) negative or +(2) greater than or equal to the precision of `x`. +Use function "shift_right_checked" if you want an invalid shift amount +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_right_checked = _clone_signature(bit_wise_and) +""" +Right shift `x` by `y`. + +This is equivalent to dividing `x` by 2 to the power `y`. +An error is raised if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +See "shift_right" for a variant that doesn't fail for an invalid shift amount + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.2 Rounding functions ========================= +@overload +def ceil(x: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatScalarT: ... +@overload +def ceil(x: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatArrayT: ... +@overload +def ceil(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def ceil(*args, **kwargs): + """ + Round up to the nearest integer. + + Compute the smallest integer value not less in magnitude than `x`. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +floor = _clone_signature(ceil) +""" +Round down to the nearest integer. + +Compute the largest integer value not greater in magnitude than `x`. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
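+
+Examples
+--------
+A minimal illustrative example (not part of the upstream docstring):
+
+>>> import pyarrow as pa
+>>> import pyarrow.compute as pc
+>>> pc.floor(pa.array([1.7, -1.7])).to_pylist()
+[1.0, -2.0]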
+""" + +@overload +def round( + x: _NumericScalarT, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def round( + x: _NumericArrayT, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def round( + x: Expression, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def round(*args, **kwargs): + """ + Round to a given precision. + + Options are used to control the number of digits and rounding mode. + Default behavior is to round to the nearest integer and + use half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + ndigits : int, default 0 + Number of fractional digits to round to. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def round_to_multiple( + x: _NumericScalarT, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def round_to_multiple( + x: _NumericArrayT, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def round_to_multiple( + x: Expression, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def round_to_multiple(*args, **kwargs): + """ + Round to a given multiple. + + Options are used to control the rounding multiple and rounding mode. 
+ Default behavior is to round to the nearest integer and + use half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + multiple : numeric scalar, default 1.0 + Multiple to round to. Should be a scalar of a type compatible + with the argument to be rounded. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundToMultipleOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def round_binary( + x: _NumericScalarT, + s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def round_binary( + x: _NumericScalarT, + s: Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[_NumericScalarT]: ... +@overload +def round_binary( + x: _NumericArrayT, + s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar | Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def round_binary( + x: Expression, + s: Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def round_binary(*args, **kwargs): + """ + Round to the given precision. + + Options are used to control the rounding mode. + Default behavior is to use the half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + s : Array-like or scalar-like + Argument to compute function. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundBinaryOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +trunc = _clone_signature(ceil) +""" +Compute the integral part. + +Compute the nearest integer not greater in magnitude than `x`. 
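+
+For example (illustrative, not part of the upstream docstring; ``pa``/``pc``
+are the usual ``pyarrow`` and ``pyarrow.compute`` imports)::
+
+    pc.trunc(pa.array([1.7, -1.7]))   # -> [1.0, -1.0]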
+ +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.3 Logarithmic functions ========================= +@overload +def ln( + x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... +@overload +def ln( + x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def ln(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def ln(*args, **kwargs): + """ + Compute natural logarithm. + + Non-positive values return -inf or NaN. Null values return null. + Use function "ln_checked" if you want non-positive values to raise an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ln_checked = _clone_signature(ln) +""" +Compute natural logarithm. + +Non-positive values raise an error. Null values return null. +Use function "ln" if you want non-positive values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log10 = _clone_signature(ln) +""" +Compute base 10 logarithm. + +Non-positive values return -inf or NaN. Null values return null. +Use function "log10_checked" if you want non-positive values +to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log10_checked = _clone_signature(ln) +""" +Compute base 10 logarithm. + +Non-positive values raise an error. Null values return null. +Use function "log10" if you want non-positive values +to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log1p = _clone_signature(ln) +""" +Compute natural log of (1+x). + +Values <= -1 return -inf or NaN. Null values return null. +This function may be more precise than log(1 + x) for x close to zero. +Use function "log1p_checked" if you want invalid values to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log1p_checked = _clone_signature(ln) +""" +Compute natural log of (1+x). + +Values <= -1 return -inf or NaN. Null values return null. +This function may be more precise than log(1 + x) for x close to zero. +Use function "log1p" if you want invalid values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log2 = _clone_signature(ln) +""" +Compute base 2 logarithm. + +Non-positive values return -inf or NaN. Null values return null. +Use function "log2_checked" if you want non-positive values +to raise an error. 
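+
+For example (illustrative, not part of the upstream docstring; ``pa``/``pc``
+are the usual ``pyarrow`` and ``pyarrow.compute`` imports)::
+
+    pc.log2(pa.array([1.0, 8.0]))   # -> [0.0, 3.0]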
+ +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log2_checked = _clone_signature(ln) +""" +Compute base 2 logarithm. + +Non-positive values raise an error. Null values return null. +Use function "log2" if you want non-positive values +to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def logb( + x: FloatScalar, b: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... +@overload +def logb( + x: FloatArray, b: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def logb( + x: FloatScalar, + b: FloatArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def logb( + x: FloatArray, + b: FloatScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def logb( + x: Expression | Any, b: Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression | Any: ... +def logb(*args, **kwargs): + """ + Compute base `b` logarithm. + + Values <= 0 return -inf or NaN. Null values return null. + Use function "logb_checked" if you want non-positive values to raise an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + b : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +logb_checked = _clone_signature(logb) +""" +Compute base `b` logarithm. + +Values <= 0 return -inf or NaN. Null values return null. +Use function "logb" if you want non-positive values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +b : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.4 Trigonometric functions ========================= +acos = _clone_signature(ln) +""" +Compute the inverse cosine. + +NaN is returned for invalid input values; +to raise an error instead, see "acos_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +acos_checked = _clone_signature(ln) +""" +Compute the inverse cosine. + +Invalid input values raise an error; +to return NaN instead, see "acos". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +asin = _clone_signature(ln) +""" +Compute the inverse sine. + +NaN is returned for invalid input values; +to raise an error instead, see "asin_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +asin_checked = _clone_signature(ln) +""" +Compute the inverse sine. + +Invalid input values raise an error; +to return NaN instead, see "asin". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +atan = _clone_signature(ln) +""" +Compute the inverse tangent of x. + +The return value is in the range [-pi/2, pi/2]; +for a full return range [-pi, pi], see "atan2". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cos = _clone_signature(ln) +""" +Compute the cosine. + +NaN is returned for invalid input values; +to raise an error instead, see "cos_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cos_checked = _clone_signature(ln) +""" +Compute the cosine. + +Infinite values raise an error; +to return NaN instead, see "cos". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +sin = _clone_signature(ln) +""" +Compute the sine. + +NaN is returned for invalid input values; +to raise an error instead, see "sin_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +sin_checked = _clone_signature(ln) +""" +Compute the sine. + +Invalid input values raise an error; +to return NaN instead, see "sin". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +tan = _clone_signature(ln) +""" +Compute the tangent. + +NaN is returned for invalid input values; +to raise an error instead, see "tan_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +tan_checked = _clone_signature(ln) +""" +Compute the tangent. + +Infinite values raise an error; +to return NaN instead, see "tan". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def atan2( + y: FloatScalar, x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... +@overload +def atan2( + y: FloatArray, x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def atan2( + y: FloatArray, + x: FloatScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... 
+@overload +def atan2( + y: FloatScalar, + x: FloatArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def atan2( + y: Expression, x: Any, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def atan2( + y: Any, x: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def atan2(*args, **kwargs): + """ + Compute the inverse tangent of y/x. + + The return value is in the range [-pi, pi]. + + Parameters + ---------- + y : Array-like or scalar-like + Argument to compute function. + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.5 Comparisons functions ========================= +@overload +def equal( + x: lib.Scalar, y: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def equal( + x: lib.Scalar, + y: lib.Array | lib.ChunkedArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def equal( + x: lib.Array | lib.ChunkedArray, + y: lib.Scalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def equal( + x: lib.Array | lib.ChunkedArray, + y: lib.Array | lib.ChunkedArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def equal( + x: Expression, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def equal( + x: lib.Scalar, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def equal( + x: Expression, + y: lib.Scalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def equal(*args, **kwargs): + """ + Compare values for equality (x == y). + + A null on either side emits a null comparison result. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +greater = _clone_signature(equal) +""" +Compare values for ordered inequality (x > y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +greater_equal = _clone_signature(equal) +""" +Compare values for ordered inequality (x >= y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +less = _clone_signature(equal) +""" +Compare values for ordered inequality (x < y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +less_equal = _clone_signature(equal) +""" +Compare values for ordered inequality (x <= y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +not_equal = _clone_signature(equal) +""" +Compare values for inequality (x != y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def max_element_wise( + *args: ScalarOrArray[_Scalar_CoT], + skip_nulls: bool = True, + options: ElementWiseAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _Scalar_CoT: ... +@overload +def max_element_wise( + *args: Expression, + skip_nulls: bool = True, + options: ElementWiseAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def max_element_wise(*args, **kwargs): + """ + Find the element-wise maximum value. + + Nulls are ignored (by default) or propagated. + NaN is preferred over null, but not over any valid value. + + Parameters + ---------- + *args : Array-like or scalar-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + options : pyarrow.compute.ElementWiseAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +min_element_wise = _clone_signature(max_element_wise) +""" +Find the element-wise minimum value. + +Nulls are ignored (by default) or propagated. +NaN is preferred over null, but not over any valid value. + +Parameters +---------- +*args : Array-like or scalar-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +options : pyarrow.compute.ElementWiseAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.6 Logical functions ========================= +@overload +def and_( + x: lib.BooleanScalar, y: lib.BooleanScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def and_( + x: BooleanArray, + y: BooleanArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def and_( + x: Expression, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def and_( + x: lib.BooleanScalar, + y: BooleanArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def and_( + x: BooleanArray, + y: lib.BooleanScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... 
+@overload +def and_( + x: lib.BooleanScalar, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def and_( + x: Expression, + y: lib.BooleanScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def and_( + x: ScalarOrArray[lib.BooleanScalar], + y: ScalarOrArray[lib.BooleanScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ScalarOrArray[lib.BooleanScalar]: ... +def and_(*args, **kwargs): + """ + Logical 'and' boolean values. + + When a null is encountered in either input, a null is output. + For a different null behavior, see function "and_kleene". + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +and_kleene = _clone_signature(and_) +""" +Logical 'and' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true and null = null +- null and true = null +- false and null = false +- null and false = false +- null and null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'and' false is always false. +For a different null behavior, see function "and". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +and_not = _clone_signature(and_) +""" +Logical 'and not' boolean values. + +When a null is encountered in either input, a null is output. +For a different null behavior, see function "and_not_kleene". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +and_not_kleene = _clone_signature(and_) +""" +Logical 'and not' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true and not null = null +- null and not false = null +- false and not null = false +- null and not true = false +- null and not null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'and not' true is always false, as is false +'and not' an unknown value. +For a different null behavior, see function "and_not". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +or_ = _clone_signature(and_) +""" +Logical 'or' boolean values. + +When a null is encountered in either input, a null is output. +For a different null behavior, see function "or_kleene". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +or_kleene = _clone_signature(and_) +""" +Logical 'or' boolean values (Kleene logic). 
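+
+For example (illustrative, not part of the upstream docstring; the null
+handling follows the rules listed below)::
+
+    pc.or_kleene(pa.array([False, None]), pa.array([None, True])).to_pylist()
+    # -> [None, True]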
+ +This function behaves as follows with nulls: + +- true or null = true +- null or true = true +- false or null = null +- null or false = null +- null or null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'or' true is always true. +For a different null behavior, see function "or". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +xor = _clone_signature(and_) +""" +Logical 'xor' boolean values. + +When a null is encountered in either input, a null is output. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def invert( + x: lib.BooleanScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def invert( + x: _BooleanArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _BooleanArrayT: ... +@overload +def invert( + x: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def invert(*args, **kwargs): + """ + Invert boolean values. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.10 String predicates ========================= +@overload +def ascii_is_alnum( + strings: StringScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def ascii_is_alnum( + strings: StringArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanArray: ... +@overload +def ascii_is_alnum( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def ascii_is_alnum(*args, **kwargs): + """ + Classify strings as ASCII alphanumeric. + + For each string in `strings`, emit true iff the string is non-empty + and consists only of alphanumeric ASCII characters. Null strings emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_is_alpha = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII alphabetic. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphabetic ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_decimal = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII decimal. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of decimal ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
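+
+Examples
+--------
+A minimal illustrative example (not part of the upstream docstring):
+
+>>> import pyarrow as pa
+>>> import pyarrow.compute as pc
+>>> pc.ascii_is_decimal(pa.array(["123", "1a", ""])).to_pylist()
+[True, False, False]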
+""" +ascii_is_lower = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII lowercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of lowercase ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_printable = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII printable. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of printable ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_space = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII whitespace. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of whitespace ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_upper = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII uppercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of uppercase ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_alnum = _clone_signature(ascii_is_alnum) +""" +Classify strings as alphanumeric. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphanumeric Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_alpha = _clone_signature(ascii_is_alnum) +""" +Classify strings as alphabetic. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphabetic Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_decimal = _clone_signature(ascii_is_alnum) +""" +Classify strings as decimal. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of decimal Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_digit = _clone_signature(ascii_is_alnum) +""" +Classify strings as digits. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of Unicode digits. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_lower = _clone_signature(ascii_is_alnum) +""" +Classify strings as lowercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of lowercase Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_numeric = _clone_signature(ascii_is_alnum) +""" +Classify strings as numeric. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of numeric Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_printable = _clone_signature(ascii_is_alnum) +""" +Classify strings as printable. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of printable Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_space = _clone_signature(ascii_is_alnum) +""" +Classify strings as whitespace. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of whitespace Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_upper = _clone_signature(ascii_is_alnum) +""" +Classify strings as uppercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of uppercase Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_title = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII titlecase. + +For each string in `strings`, emit true iff the string is title-cased, +i.e. it has at least one cased character, each uppercase character +follows an uncased character, and each lowercase character follows +an uppercase character. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_title = _clone_signature(ascii_is_alnum) +""" +Classify strings as titlecase. + +For each string in `strings`, emit true iff the string is title-cased, +i.e. it has at least one cased character, each uppercase character +follows an uncased character, and each lowercase character follows +an uppercase character. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +string_is_ascii = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII. 
+ +For each string in `strings`, emit true iff the string consists only +of ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.11 String transforms ========================= +@overload +def ascii_capitalize( + strings: _StringScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT: ... +@overload +def ascii_capitalize( + strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringArrayT: ... +@overload +def ascii_capitalize( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def ascii_capitalize(*args, **kwargs): + """ + Capitalize the first character of ASCII input. + + For each string in `strings`, return a capitalized version. + + This function assumes the input is fully ASCII. If it may contain + non-ASCII characters, use "utf8_capitalize" instead. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_lower = _clone_signature(ascii_capitalize) +""" +Transform ASCII input to lowercase. + +For each string in `strings`, return a lowercase version. + +This function assumes the input is fully ASCII. If it may contain +non-ASCII characters, use "utf8_lower" instead. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_reverse = _clone_signature(ascii_capitalize) +""" +Reverse ASCII input. + +For each ASCII string in `strings`, return a reversed version. + +This function assumes the input is fully ASCII. If it may contain +non-ASCII characters, use "utf8_reverse" instead. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_swapcase = _clone_signature(ascii_capitalize) +""" +Transform ASCII input by inverting casing. + +For each string in `strings`, return a string with opposite casing. + +This function assumes the input is fully ASCII. If it may contain +non-ASCII characters, use "utf8_swapcase" instead. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_title = _clone_signature(ascii_capitalize) +""" +Titlecase each word of ASCII input. + +For each string in `strings`, return a titlecased version. +Each word in the output will start with an uppercase character and its +remaining characters will be lowercase. + +This function assumes the input is fully ASCII. If it may contain +non-ASCII characters, use "utf8_title" instead. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_upper = _clone_signature(ascii_capitalize) +""" +Transform ASCII input to uppercase. + +For each string in `strings`, return an uppercase version. 
+
+This function assumes the input is fully ASCII. If it may contain
+non-ASCII characters, use "utf8_upper" instead.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+
+@overload
+def binary_length(
+    strings: lib.BinaryScalar | lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Int32Scalar: ...
+@overload
+def binary_length(
+    strings: lib.LargeBinaryScalar | lib.LargeStringScalar,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar: ...
+@overload
+def binary_length(
+    strings: lib.BinaryArray
+    | lib.StringArray
+    | lib.ChunkedArray[lib.BinaryScalar]
+    | lib.ChunkedArray[lib.StringScalar],
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int32Array: ...
+@overload
+def binary_length(
+    strings: lib.LargeBinaryArray
+    | lib.LargeStringArray
+    | lib.ChunkedArray[lib.LargeBinaryScalar]
+    | lib.ChunkedArray[lib.LargeStringScalar],
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Array: ...
+@overload
+def binary_length(
+    strings: Expression,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def binary_length(*args, **kwargs):
+    """
+    Compute string lengths.
+
+    For each string in `strings`, emit its length of bytes.
+    Null values emit null.
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+@overload
+def binary_repeat(
+    strings: _StringOrBinaryScalarT,
+    num_repeats: int,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _StringOrBinaryScalarT: ...
+@overload
+def binary_repeat(
+    strings: _StringOrBinaryScalarT,
+    num_repeats: list[int] | list[int | None],
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Array[_StringOrBinaryScalarT]: ...
+@overload
+def binary_repeat(
+    strings: _StringOrBinaryArrayT,
+    num_repeats: int | list[int] | list[int | None],
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _StringOrBinaryArrayT: ...
+@overload
+def binary_repeat(
+    strings: Expression,
+    num_repeats: int | list[int] | list[int | None],
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def binary_repeat(*args, **kwargs):
+    """
+    Repeat a binary string.
+
+    For each binary string in `strings`, return a replicated version.
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    num_repeats : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+@overload
+def binary_replace_slice(
+    strings: _StringOrBinaryScalarT,
+    /,
+    start: int,
+    stop: int,
+    replacement: str | bytes,
+    *,
+    options: ReplaceSliceOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _StringOrBinaryScalarT: ...
+@overload
+def binary_replace_slice(
+    strings: _StringOrBinaryArrayT,
+    /,
+    start: int,
+    stop: int,
+    replacement: str | bytes,
+    *,
+    options: ReplaceSliceOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _StringOrBinaryArrayT: ...
+@overload
+def binary_replace_slice(
+    strings: Expression,
+    /,
+    start: int,
+    stop: int,
+    replacement: str | bytes,
+    *,
+    options: ReplaceSliceOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def binary_replace_slice(*args, **kwargs):
+    """
+    Replace a slice of a binary string.
+
+    For each string in `strings`, replace a slice of the string defined by `start`
+    and `stop` indices with the given `replacement`. `start` is inclusive
+    and `stop` is exclusive, and both are measured in bytes.
+    Null values emit null.
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    start : int
+        Index to start slicing at (inclusive).
+    stop : int
+        Index to stop slicing at (exclusive).
+    replacement : str
+        What to replace the slice with.
+    options : pyarrow.compute.ReplaceSliceOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+@overload
+def binary_reverse(
+    strings: _BinaryScalarT, /, *, memory_pool: lib.MemoryPool | None = None
+) -> _BinaryScalarT: ...
+@overload
+def binary_reverse(
+    strings: _BinaryArrayT, /, *, memory_pool: lib.MemoryPool | None = None
+) -> _BinaryArrayT: ...
+@overload
+def binary_reverse(
+    strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None
+) -> Expression: ...
+def binary_reverse(*args, **kwargs):
+    """
+    Reverse binary input.
+
+    For each binary string in `strings`, return a reversed version.
+
+    This function reverses the binary data at a byte-level.
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+@overload
+def replace_substring(
+    strings: _StringScalarT,
+    /,
+    pattern: str | bytes,
+    replacement: str | bytes,
+    *,
+    max_replacements: int | None = None,
+    options: ReplaceSubstringOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _StringScalarT: ...
+@overload
+def replace_substring(
+    strings: _StringArrayT,
+    /,
+    pattern: str | bytes,
+    replacement: str | bytes,
+    *,
+    max_replacements: int | None = None,
+    options: ReplaceSubstringOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _StringArrayT: ...
+@overload
+def replace_substring(
+    strings: Expression,
+    /,
+    pattern: str | bytes,
+    replacement: str | bytes,
+    *,
+    max_replacements: int | None = None,
+    options: ReplaceSubstringOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def replace_substring(*args, **kwargs):
+    """
+    Replace matching non-overlapping substrings with replacement.
+
+    For each string in `strings`, replace non-overlapping substrings that match
+    the given literal `pattern` with the given `replacement`.
+    If `max_replacements` is given and not equal to -1, it limits the
+    maximum amount of replacements per input, counted from the left.
+    Null values emit null.
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    pattern : str
+        Substring pattern to look for inside input values.
+    replacement : str
+        What to replace the pattern with.
+    max_replacements : int or None, default None
+        The maximum number of strings to replace in each
+        input value (unlimited if None).
+    options : pyarrow.compute.ReplaceSubstringOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+replace_substring_regex = _clone_signature(replace_substring)
+"""
+Replace matching non-overlapping substrings with replacement.
+
+For each string in `strings`, replace non-overlapping substrings that match
+the given regular expression `pattern` with the given `replacement`.
+If `max_replacements` is given and not equal to -1, it limits the
+maximum amount of replacements per input, counted from the left.
+Null values emit null.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+pattern : str
+    Substring pattern to look for inside input values.
+replacement : str
+    What to replace the pattern with.
+max_replacements : int or None, default None
+    The maximum number of strings to replace in each
+    input value (unlimited if None).
+options : pyarrow.compute.ReplaceSubstringOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+
+@overload
+def utf8_capitalize(
+    strings: _StringScalarT, /, *, memory_pool: lib.MemoryPool | None = None
+) -> _StringScalarT: ...
+@overload
+def utf8_capitalize(
+    strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None
+) -> _StringArrayT: ...
+@overload
+def utf8_capitalize(
+    strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None
+) -> Expression: ...
+def utf8_capitalize(*args, **kwargs):
+    """
+    Capitalize the first character of input.
+
+    For each string in `strings`, return a capitalized version,
+    with the first character uppercased and the others lowercased.
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+@overload
+def utf8_length(
+    strings: lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Int32Scalar: ...
+@overload
+def utf8_length(
+    strings: lib.LargeStringScalar,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar: ...
+@overload
+def utf8_length(
+    strings: lib.StringArray | lib.ChunkedArray[lib.StringScalar],
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int32Array: ...
+@overload
+def utf8_length(
+    strings: lib.LargeStringArray | lib.ChunkedArray[lib.LargeStringScalar],
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Array: ...
+@overload
+def utf8_length(
+    strings: Expression,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def utf8_length(*args, **kwargs):
+    """
+    Compute UTF8 string lengths.
+
+    For each string in `strings`, emit its length in UTF8 characters.
+    Null values emit null.
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+utf8_lower = _clone_signature(utf8_capitalize)
+"""
+Transform input to lowercase.
+
+For each string in `strings`, return a lowercase version.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
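+
+Examples
+--------
+A minimal usage sketch (illustrative only); unlike ``ascii_lower`` this also
+lowercases non-ASCII characters:
+
+>>> import pyarrow as pa
+>>> import pyarrow.compute as pc
+>>> pc.utf8_lower(pa.array(["PyArrow", "ÉTÉ", None])).to_pylist()
+['pyarrow', 'été', None]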
+""" + +@overload +def utf8_replace_slice( + strings: _StringScalarT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def utf8_replace_slice( + strings: _StringArrayT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def utf8_replace_slice( + strings: Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def utf8_replace_slice(*args, **kwargs): + """ + Replace a slice of a string. + + For each string in `strings`, replace a slice of the string defined by `start` + and `stop` indices with the given `replacement`. `start` is inclusive + and `stop` is exclusive, and both are measured in UTF8 characters. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int + Index to stop slicing at (exclusive). + replacement : str + What to replace the slice with. + options : pyarrow.compute.ReplaceSliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +utf8_reverse = _clone_signature(utf8_capitalize) +""" +Reverse input. + +For each string in `strings`, return a reversed version. + +This function operates on Unicode codepoints, not grapheme +clusters. Hence, it will not correctly reverse grapheme clusters +composed of multiple codepoints. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_swapcase = _clone_signature(utf8_capitalize) +""" +Transform input lowercase characters to uppercase and uppercase characters to lowercase. + +For each string in `strings`, return an opposite case version. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_title = _clone_signature(utf8_capitalize) +""" +Titlecase each word of input. + +For each string in `strings`, return a titlecased version. +Each word in the output will start with an uppercase character and its +remaining characters will be lowercase. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_upper = _clone_signature(utf8_capitalize) +""" +Transform input to uppercase. + +For each string in `strings`, return an uppercase version. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+
+# ========================= 2.12 String padding =========================
+@overload
+def ascii_center(
+    strings: _StringScalarT,
+    /,
+    width: int,
+    padding: str = " ",
+    lean_left_on_odd_padding: bool = True,
+    *,
+    options: PadOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _StringScalarT: ...
+@overload
+def ascii_center(
+    strings: _StringArrayT,
+    /,
+    width: int,
+    padding: str = " ",
+    lean_left_on_odd_padding: bool = True,
+    *,
+    options: PadOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _StringArrayT: ...
+@overload
+def ascii_center(
+    strings: Expression,
+    /,
+    width: int,
+    padding: str = " ",
+    lean_left_on_odd_padding: bool = True,
+    *,
+    options: PadOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def ascii_center(*args, **kwargs):
+    """
+    Center strings by padding with a given character.
+
+    For each string in `strings`, emit a centered string by padding both sides
+    with the given ASCII character.
+    Null values emit null.
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    width : int
+        Desired string length.
+    padding : str, default " "
+        What to pad the string with. Should be one byte or codepoint.
+    lean_left_on_odd_padding : bool, default True
+        What to do if there is an odd number of padding characters (in case
+        of centered padding). Defaults to aligning on the left (i.e. adding
+        the extra padding character on the right).
+    options : pyarrow.compute.PadOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+ascii_lpad = _clone_signature(ascii_center)
+"""
+Right-align strings by padding with a given character.
+
+For each string in `strings`, emit a right-aligned string by prepending
+the given ASCII character.
+Null values emit null.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+width : int
+    Desired string length.
+padding : str, default " "
+    What to pad the string with. Should be one byte or codepoint.
+lean_left_on_odd_padding : bool, default True
+    What to do if there is an odd number of padding characters (in case
+    of centered padding). Defaults to aligning on the left (i.e. adding
+    the extra padding character on the right).
+options : pyarrow.compute.PadOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+ascii_rpad = _clone_signature(ascii_center)
+"""
+Left-align strings by padding with a given character.
+
+For each string in `strings`, emit a left-aligned string by appending
+the given ASCII character.
+Null values emit null.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+width : int
+    Desired string length.
+padding : str, default " "
+    What to pad the string with. Should be one byte or codepoint.
+lean_left_on_odd_padding : bool, default True
+    What to do if there is an odd number of padding characters (in case
+    of centered padding). Defaults to aligning on the left (i.e. adding
+    the extra padding character on the right).
+options : pyarrow.compute.PadOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_center = _clone_signature(ascii_center) +""" +Center strings by padding with a given character. + +For each string in `strings`, emit a centered string by padding both sides +with the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_lpad = _clone_signature(ascii_center) +""" +Right-align strings by padding with a given character. + +For each string in `strings`, emit a right-aligned string by prepending +the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_rpad = _clone_signature(ascii_center) +""" +Left-align strings by padding with a given character. + +For each string in `strings`, emit a left-aligned string by appending +the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.13 String trimming ========================= +@overload +def ascii_ltrim( + strings: _StringScalarT, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def ascii_ltrim( + strings: _StringArrayT, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def ascii_ltrim( + strings: Expression, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def ascii_ltrim(*args, **kwargs): + """ + Trim leading characters. 
+ + For each string in `strings`, remove any leading characters + from the `characters` option (as given in TrimOptions). + Null values emit null. + Both the `strings` and the `characters` are interpreted as + ASCII; to trim non-ASCII characters, use `utf8_ltrim`. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + characters : str + Individual characters to be trimmed from the string. + options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_rtrim = _clone_signature(ascii_ltrim) +""" +Trim trailing characters. + +For each string in `strings`, remove any trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. +Both the `strings` and the `characters` are interpreted as +ASCII; to trim non-ASCII characters, use `utf8_rtrim`. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_trim = _clone_signature(ascii_ltrim) +""" +Trim leading and trailing characters. + +For each string in `strings`, remove any leading or trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. +Both the `strings` and the `characters` are interpreted as +ASCII; to trim non-ASCII characters, use `utf8_trim`. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_ltrim = _clone_signature(ascii_ltrim) +""" +Trim leading characters. + +For each string in `strings`, remove any leading characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_rtrim = _clone_signature(ascii_ltrim) +""" +Trim trailing characters. + +For each string in `strings`, remove any trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_trim = _clone_signature(ascii_ltrim) +""" +Trim leading and trailing characters. + +For each string in `strings`, remove any leading or trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. 
+ +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def ascii_ltrim_whitespace( + strings: _StringScalarT, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def ascii_ltrim_whitespace( + strings: _StringArrayT, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def ascii_ltrim_whitespace( + strings: Expression, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def ascii_ltrim_whitespace(*args, **kwargs): + """ + Trim leading ASCII whitespace characters. + + For each string in `strings`, emit a string with leading ASCII whitespace + characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode + whitespace characters. Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim trailing ASCII whitespace characters. + +For each string in `strings`, emit a string with trailing ASCII whitespace +characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode +whitespace characters. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading and trailing ASCII whitespace characters. + +For each string in `strings`, emit a string with leading and trailing ASCII +whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode +whitespace characters. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_ltrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading whitespace characters. + +For each string in `strings`, emit a string with leading whitespace +characters removed, where whitespace characters are defined by the Unicode +standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim trailing whitespace characters. + +For each string in `strings`, emit a string with trailing whitespace +characters removed, where whitespace characters are defined by the Unicode +standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
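+
+Examples
+--------
+A minimal usage sketch (illustrative only); trailing Unicode whitespace is
+removed while leading whitespace is kept:
+
+>>> import pyarrow as pa
+>>> import pyarrow.compute as pc
+>>> pc.utf8_rtrim_whitespace(pa.array(["hi  ", " x ", None])).to_pylist()
+['hi', ' x', None]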
+"""
+utf8_trim_whitespace = _clone_signature(ascii_ltrim_whitespace)
+"""
+Trim leading and trailing whitespace characters.
+
+For each string in `strings`, emit a string with leading and trailing
+whitespace characters removed, where whitespace characters are defined
+by the Unicode standard. Null values emit null.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+
+# ========================= 2.14 String splitting =========================
+@overload
+def ascii_split_whitespace(
+    strings: _StringScalarT,
+    /,
+    *,
+    max_splits: int | None = None,
+    reverse: bool = False,
+    options: SplitOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.ListArray[_StringScalarT]: ...
+@overload
+def ascii_split_whitespace(
+    strings: lib.Array[lib.Scalar[_DataTypeT]],
+    /,
+    *,
+    max_splits: int | None = None,
+    reverse: bool = False,
+    options: SplitOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ...
+@overload
+def ascii_split_whitespace(
+    strings: Expression,
+    /,
+    *,
+    max_splits: int | None = None,
+    reverse: bool = False,
+    options: SplitOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def ascii_split_whitespace(*args, **kwargs):
+    """
+    Split string according to any ASCII whitespace.
+
+    Split each string according to any non-zero length sequence of ASCII
+    whitespace characters. The output for each string input is a list
+    of strings.
+
+    The maximum number of splits and direction of splitting
+    (forward, reverse) can optionally be defined in SplitOptions.
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    max_splits : int or None, default None
+        Maximum number of splits for each input value (unlimited if None).
+    reverse : bool, default False
+        Whether to start splitting from the end of each input value.
+        This only has an effect if `max_splits` is not None.
+    options : pyarrow.compute.SplitOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+@overload
+def split_pattern(
+    strings: _StringOrBinaryScalarT,
+    /,
+    pattern: str,
+    *,
+    max_splits: int | None = None,
+    reverse: bool = False,
+    options: SplitPatternOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.ListArray[_StringOrBinaryScalarT]: ...
+@overload
+def split_pattern(
+    strings: lib.Array[lib.Scalar[_DataTypeT]],
+    /,
+    pattern: str,
+    *,
+    max_splits: int | None = None,
+    reverse: bool = False,
+    options: SplitPatternOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ...
+@overload
+def split_pattern(
+    strings: Expression,
+    /,
+    pattern: str,
+    *,
+    max_splits: int | None = None,
+    reverse: bool = False,
+    options: SplitPatternOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def split_pattern(*args, **kwargs):
+    """
+    Split string according to separator.
+
+    Split each string according to the exact `pattern` defined in
+    SplitPatternOptions. The output for each string input is a list
+    of strings.
+
+    The maximum number of splits and direction of splitting
+    (forward, reverse) can optionally be defined in SplitPatternOptions.
+ + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + String pattern to split on. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + options : pyarrow.compute.SplitPatternOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +split_pattern_regex = _clone_signature(split_pattern) +""" +Split string according to regex pattern. + +Split each string according to the regex `pattern` defined in +SplitPatternOptions. The output for each string input is a list +of strings. + +The maximum number of splits and direction of splitting +(forward, reverse) can optionally be defined in SplitPatternOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + String pattern to split on. +max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). +reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. +options : pyarrow.compute.SplitPatternOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_split_whitespace = _clone_signature(ascii_split_whitespace) +""" +Split string according to any Unicode whitespace. + +Split each string according any non-zero length sequence of Unicode +whitespace characters. The output for each string input is a list +of strings. + +The maximum number of splits and direction of splitting +(forward, reverse) can optionally be defined in SplitOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). +reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. +options : pyarrow.compute.SplitOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.15 String component extraction ========================= +@overload +def extract_regex( + strings: StringOrBinaryScalar, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: ... +@overload +def extract_regex( + strings: StringOrBinaryArray, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: ... +@overload +def extract_regex( + strings: Expression, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def extract_regex(*args, **kwargs): + """ + Extract substrings captured by a regex pattern. + + For each string in `strings`, match the regular expression and, if + successful, emit a struct with field names and values coming from the + regular expression's named capture groups. 
If the input is null or the + regular expression fails matching, a null output value is emitted. + + Regular expression matching is done using the Google RE2 library. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Regular expression with named capture fields. + options : pyarrow.compute.ExtractRegexOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.16 String join ========================= +def binary_join( + strings, separator, /, *, memory_pool: lib.MemoryPool | None = None +) -> StringScalar | StringArray: + """ + Join a list of strings together with a separator. + + Concatenate the strings in `list`. The `separator` is inserted + between each given string. + Any null input and any null `list` element emits a null output. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + separator : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def binary_join_element_wise( + *strings: _StringOrBinaryScalarT, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT: ... +@overload +def binary_join_element_wise( + *strings: _StringOrBinaryArrayT, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryArrayT: ... +@overload +def binary_join_element_wise( + *strings: Expression, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def binary_join_element_wise(*args, **kwargs): + """ + Join string arguments together, with the last argument as separator. + + Concatenate the `strings` except for the last one. The last argument + in `strings` is inserted between each given string. + Any null separator element emits a null output. Null elements either + emit a null (the default), are skipped, or replaced with a given string. + + Parameters + ---------- + *strings : Array-like or scalar-like + Argument to compute function. + null_handling : str, default "emit_null" + How to handle null values in the inputs. + Accepted values are "emit_null", "skip", "replace". + null_replacement : str, default "" + Replacement string to emit for null inputs if `null_handling` + is "replace". + options : pyarrow.compute.JoinOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.17 String Slicing ========================= +@overload +def binary_slice( + strings: _BinaryScalarT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _BinaryScalarT: ... 
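+# A hedged usage sketch for binary_slice (illustrative only; not part of the
+# typed stub surface). Slicing is measured in bytes, so text with multi-byte
+# UTF-8 characters should go through utf8_slice_codeunits instead:
+#
+#   >>> import pyarrow as pa
+#   >>> import pyarrow.compute as pc
+#   >>> pc.binary_slice(pa.array([b"abcdef"]), start=1, stop=4).to_pylist()
+#   [b'bcd']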
+@overload +def binary_slice( + strings: _BinaryArrayT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _BinaryArrayT: ... +@overload +def binary_slice( + strings: Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def binary_slice(*args, **kwargs): + """ + Slice binary string. + + For each binary string in `strings`, emit the substring defined by + (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is + inclusive and `stop` is exclusive. All three values are measured in + bytes. + If `step` is negative, the string will be advanced in reversed order. + An error is raised if `step` is zero. + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + options : pyarrow.compute.SliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def utf8_slice_codeunits( + strings: _StringScalarT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def utf8_slice_codeunits( + strings: _StringArrayT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def utf8_slice_codeunits( + strings: Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def utf8_slice_codeunits(*args, **kwargs): + """ + Slice string. + + For each string in `strings`, emit the substring defined by + (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is + inclusive and `stop` is exclusive. All three values are measured in + UTF8 codeunits. + If `step` is negative, the string will be advanced in reversed order. + An error is raised if `step` is zero. + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + options : pyarrow.compute.SliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.18 Containment tests ========================= +@overload +def count_substring( + strings: lib.StringScalar | lib.BinaryScalar, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar: ... 
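+# A hedged usage sketch for count_substring (illustrative only): occurrences
+# of the literal pattern are counted per element, and nulls propagate.
+#
+#   >>> import pyarrow as pa
+#   >>> import pyarrow.compute as pc
+#   >>> pc.count_substring(pa.array(["banana", "pan", None]), "an").to_pylist()
+#   [2, 1, None]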
+@overload +def count_substring( + strings: lib.LargeStringScalar | lib.LargeBinaryScalar, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def count_substring( + strings: lib.StringArray + | lib.BinaryArray + | lib.ChunkedArray[lib.StringScalar] + | lib.ChunkedArray[lib.BinaryScalar], + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def count_substring( + strings: lib.LargeStringArray + | lib.LargeBinaryArray + | lib.ChunkedArray[lib.LargeStringScalar] + | lib.ChunkedArray[lib.LargeBinaryScalar], + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def count_substring( + strings: Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def count_substring(*args, **kwargs): + """ + Count occurrences of substring. + + For each string in `strings`, emit the number of occurrences of the given + literal pattern. + Null inputs emit null. The pattern must be given in MatchSubstringOptions. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +count_substring_regex = _clone_signature(count_substring) +""" +Count occurrences of substring. + +For each string in `strings`, emit the number of occurrences of the given +regular expression pattern. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def ends_with( + strings: StringScalar | BinaryScalar, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def ends_with( + strings: StringArray | BinaryArray, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def ends_with( + strings: Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def ends_with(*args, **kwargs): + """ + Check if strings end with a literal pattern. + + For each string in `strings`, emit true iff it ends with a given pattern. 
+ The pattern must be given in MatchSubstringOptions. + If ignore_case is set, only simple case folding is performed. + + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +find_substring = _clone_signature(count_substring) +""" +Find first occurrence of substring. + +For each string in `strings`, emit the index in bytes of the first occurrence +of the given literal pattern, or -1 if not found. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +find_substring_regex = _clone_signature(count_substring) +""" +Find location of first match of regex pattern. + +For each string in `strings`, emit the index in bytes of the first occurrence +of the given literal pattern, or -1 if not found. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def index_in( + values: lib.Scalar, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar: ... +@overload +def index_in( + values: lib.Array | lib.ChunkedArray, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def index_in( + values: Expression, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def index_in(*args, **kwargs): + """ + Return index of each element in a set of values. + + For each element in `values`, return its index in a given set of + values, or null if it is not found there. + The set of values to look for must be given in SetLookupOptions. + By default, nulls are matched against the value set, this can be + changed in SetLookupOptions. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. 
+ If True, nulls in the input always fail matching. + options : pyarrow.compute.SetLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def is_in( + values: lib.Scalar, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def is_in( + values: lib.Array | lib.ChunkedArray, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def is_in( + values: Expression, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def is_in(*args, **kwargs): + """ + Find each element in a set of values. + + For each element in `values`, return true if it is found in a given + set of values, false otherwise. + The set of values to look for must be given in SetLookupOptions. + By default, nulls are matched against the value set, this can be + changed in SetLookupOptions. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. + options : pyarrow.compute.SetLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +match_like = _clone_signature(ends_with) +""" +Match strings against SQL-style LIKE pattern. + +For each string in `strings`, emit true iff it matches a given pattern +at any position. '%' will match any number of characters, '_' will +match exactly one character, and any other character matches itself. +To match a literal '%', '_', or '\', precede the character with a backslash. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +match_substring = _clone_signature(ends_with) +""" +Match strings against literal pattern. + +For each string in `strings`, emit true iff it contains a given pattern. +Null inputs emit null. +The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +match_substring_regex = _clone_signature(ends_with) +""" +Match strings against regex pattern. + +For each string in `strings`, emit true iff it matches a given pattern +at any position. The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Null inputs emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +starts_with = _clone_signature(ends_with) +""" +Check if strings start with a literal pattern. + +For each string in `strings`, emit true iff it starts with a given pattern. +The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Null inputs emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.19 Categorizations ========================= +@overload +def is_finite( + values: NumericScalar | lib.NullScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def is_finite( + values: NumericArray | lib.NullArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanArray: ... +@overload +def is_finite( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def is_finite(*args, **kwargs): + """ + Return true if value is finite. + + For each input value, emit true iff the value is finite + (i.e. neither NaN, inf, nor -inf). + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +is_inf = _clone_signature(is_finite) +""" +Return true if infinity. + +For each input value, emit true iff the value is infinite (inf or -inf). + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +is_nan = _clone_signature(is_finite) +""" +Return true if NaN. + +For each input value, emit true iff the value is NaN. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def is_null( + values: lib.Scalar, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... 
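+# A hedged usage sketch for is_null (illustrative only): NaN is not treated as
+# null by default, but nan_is_null=True counts it as null as well.
+#
+#   >>> import pyarrow as pa
+#   >>> import pyarrow.compute as pc
+#   >>> arr = pa.array([1.0, None, float("nan")])
+#   >>> pc.is_null(arr).to_pylist()
+#   [False, True, False]
+#   >>> pc.is_null(arr, nan_is_null=True).to_pylist()
+#   [False, True, True]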
+@overload +def is_null( + values: lib.Array | lib.ChunkedArray, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def is_null( + values: Expression, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def is_null(*args, **kwargs): + """ + Return true if null (and optionally NaN). + + For each input value, emit true iff the value is null. + True may also be emitted for NaN values by setting the `nan_is_null` flag. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + nan_is_null : bool, default False + Whether floating-point NaN values are considered null. + options : pyarrow.compute.NullOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def is_valid( + values: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def is_valid( + values: lib.Array | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanArray: ... +@overload +def is_valid( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def is_valid(*args, **kwargs): + """ + Return true if non-null. + + For each input value, emit true iff the value is valid (i.e. non-null). + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +true_unless_null = _clone_signature(is_valid) +""" +Return true if non-null, else return null. + +For each input value, emit true iff the value +is valid (non-null), otherwise emit null. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.20 Selecting / multiplexing ========================= +def case_when(cond, /, *cases, memory_pool: lib.MemoryPool | None = None): + """ + Choose values based on multiple conditions. + + `cond` must be a struct of Boolean values. `cases` can be a mix + of scalar and array arguments (of any type, but all must be the + same type or castable to a common type), with either exactly one + datum per child of `cond`, or one more `cases` than children of + `cond` (in which case we have an "else" value). + + Each row of the output will be the corresponding value of the + first datum in `cases` for which the corresponding child of `cond` + is true, or otherwise the "else" value (if given), or null. + + Essentially, this implements a switch-case or if-else, if-else... statement. + + Parameters + ---------- + cond : Array-like or scalar-like + Argument to compute function. + *cases : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): + """ + Choose values from several arrays. + + For each row, the value of the first argument is used as a 0-based index + into the list of `values` arrays (i.e. index 0 selects the first of the + `values` arrays). 
The output value is the corresponding value of the + selected argument. + + If an index is null, the output will be null. + + Parameters + ---------- + indices : Array-like or scalar-like + Argument to compute function. + *values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def coalesce( + *values: _ScalarOrArrayT, memory_pool: lib.MemoryPool | None = None +) -> _ScalarOrArrayT: + """ + Select the first non-null value. + + Each row of the output will be the value from the first corresponding input + for which the value is not null. If all inputs are null in a row, the output + will be null. + + Parameters + ---------- + *values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +fill_null = coalesce +"""Replace each null element in values with a corresponding +element from fill_value. + +If fill_value is scalar-like, then every null element in values +will be replaced with fill_value. If fill_value is array-like, +then the i-th element in values will be replaced with the i-th +element in fill_value. + +The fill_value's type must be the same as that of values, or it +must be able to be implicitly casted to the array's type. + +This is an alias for :func:`coalesce`. + +Parameters +---------- +values : Array, ChunkedArray, or Scalar-like object + Each null element is replaced with the corresponding value + from fill_value. +fill_value : Array, ChunkedArray, or Scalar-like object + If not same type as values, will attempt to cast. + +Returns +------- +result : depends on inputs + Values with all null elements replaced + +Examples +-------- +>>> import pyarrow as pa +>>> arr = pa.array([1, 2, None, 3], type=pa.int8()) +>>> fill_value = pa.scalar(5, type=pa.int8()) +>>> arr.fill_null(fill_value) + +[ + 1, + 2, + 5, + 3 +] +>>> arr = pa.array([1, 2, None, 4, None]) +>>> arr.fill_null(pa.array([10, 20, 30, 40, 50])) + +[ + 1, + 2, + 30, + 4, + 50 +] +""" + +def if_else( + cond: ArrayLike | ScalarLike, + left: ArrayLike | ScalarLike, + right: ArrayLike | ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ArrayLike | ScalarLike: + """ + Choose values based on a condition. + + `cond` must be a Boolean scalar/ array. + `left` or `right` must be of the same type scalar/ array. + `null` values in `cond` will be promoted to the output. + + Parameters + ---------- + cond : Array-like or scalar-like + Argument to compute function. + left : Array-like or scalar-like + Argument to compute function. + right : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.21 Structural transforms ========================= + +@overload +def list_value_length( + lists: _ListArray[Any], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def list_value_length( + lists: _LargeListArray[Any], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def list_value_length( + lists: ListArray[Any], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array | lib.Int64Array: ... 
+@overload +def list_value_length( + lists: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def list_value_length(*args, **kwargs): + """ + Compute list lengths. + + `lists` must have a list-like type. + For each non-null value in `lists`, its length is emitted. + Null values emit a null in the output. + + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def make_struct( + *args: lib.Scalar, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: ... +@overload +def make_struct( + *args: lib.Array | lib.ChunkedArray, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: ... +@overload +def make_struct( + *args: Expression, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def make_struct(*args, **kwargs): + """ + Wrap Arrays into a StructArray. + + Names of the StructArray's fields are + specified through MakeStructOptions. + + Parameters + ---------- + *args : Array-like or scalar-like + Argument to compute function. + field_names : sequence of str + Names of the struct fields to create. + field_nullability : sequence of bool, optional + Nullability information for each struct field. + If omitted, all fields are nullable. + field_metadata : sequence of KeyValueMetadata, optional + Metadata for each struct field. + options : pyarrow.compute.MakeStructOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.22 Conversions ========================= +@overload +def ceil_temporal( + timestamps: _TemporalScalarT, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _TemporalScalarT: ... +@overload +def ceil_temporal( + timestamps: _TemporalArrayT, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _TemporalArrayT: ... 
+@overload +def ceil_temporal( + timestamps: Expression, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def ceil_temporal(*args, **kwargs): + """ + Round temporal values up to nearest multiple of specified time unit. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + multiple : int, default 1 + Number of units to round to. + unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. + calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. + options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +floor_temporal = _clone_signature(ceil_temporal) +""" +Round temporal values down to nearest multiple of specified time unit. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +timestamps : Array-like or scalar-like + Argument to compute function. +multiple : int, default 1 + Number of units to round to. +unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". 
+week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. +ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. +calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. +options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +round_temporal = _clone_signature(ceil_temporal) +""" +Round temporal values to the nearest multiple of specified time unit. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +timestamps : Array-like or scalar-like + Argument to compute function. +multiple : int, default 1 + Number of units to round to. +unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". +week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. +ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. +calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. 
Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. +options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def cast( + arr: lib.Scalar, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Scalar[_DataTypeT]: ... +@overload +def cast( + arr: lib.Array, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array[lib.Scalar[_DataTypeT]]: ... +@overload +def cast( + arr: lib.ChunkedArray, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... +def cast(*args, **kwargs): + """ + Cast array values to another data type. Can also be invoked as an array + instance method. + + Parameters + ---------- + arr : Array-like + target_type : DataType or str + Type to cast to + safe : bool, default True + Check for overflows or other unsafe conversions + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + + Examples + -------- + >>> from datetime import datetime + >>> import pyarrow as pa + >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)]) + >>> arr.type + TimestampType(timestamp[us]) + + You can use ``pyarrow.DataType`` objects to specify the target type: + + >>> cast(arr, pa.timestamp("ms")) + + [ + 2010-01-01 00:00:00.000, + 2015-01-01 00:00:00.000 + ] + + >>> cast(arr, pa.timestamp("ms")).type + TimestampType(timestamp[ms]) + + Alternatively, it is also supported to use the string aliases for these + types: + + >>> arr.cast("timestamp[ms]") + + [ + 2010-01-01 00:00:00.000, + 2015-01-01 00:00:00.000 + ] + >>> arr.cast("timestamp[ms]").type + TimestampType(timestamp[ms]) + + Returns + ------- + casted : Array + The cast result as a new Array + """ + +@overload +def strftime( + timestamps: TemporalScalar, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StringScalar: ... +@overload +def strftime( + timestamps: TemporalArray, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StringArray: ... +@overload +def strftime( + timestamps: Expression, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def strftime(*args, **kwargs): + """ + Format temporal values according to a format string. + + For each input value, emit a formatted string. + The time format string and locale can be set using StrftimeOptions. 
+ The output precision of the "%S" (seconds) format code depends on + the input time precision: it is an integer for timestamps with + second precision, a real number with the required number of fractional + digits for higher precisions. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database, or if the specified locale + does not exist on this system. + + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + format : str, default "%Y-%m-%dT%H:%M:%S" + Pattern for formatting input values. + locale : str, default "C" + Locale to use for locale-specific format specifiers. + options : pyarrow.compute.StrftimeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def strptime( + strings: StringScalar, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar: ... +@overload +def strptime( + strings: StringArray, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampArray: ... +@overload +def strptime( + strings: Expression, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def strptime(*args, **kwargs): + """ + Parse timestamps. + + For each string in `strings`, parse it as a timestamp. + The timestamp unit and the expected string pattern must be given + in StrptimeOptions. Null inputs emit null. If a non-null string + fails parsing, an error is returned by default. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + format : str + Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". + Note that the semantics of the format follow the C/C++ strptime, not the Python one. + There are differences in behavior, for example how the "%y" placeholder + handles years with less than four digits. + unit : str + Timestamp unit of the output. + Accepted values are "s", "ms", "us", "ns". + error_is_null : boolean, default False + Return null on parsing errors if true or raise if false. + options : pyarrow.compute.StrptimeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.23 Temporal component extraction ========================= +@overload +def day( + values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar: ... +@overload +def day( + values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Array: ... +@overload +def day(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def day(*args, **kwargs): + """ + Extract day number. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. 
+ memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def day_of_week( + values: TemporalScalar, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def day_of_week( + values: TemporalArray, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def day_of_week( + values: Expression, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def day_of_week(*args, **kwargs): + """ + Extract day of the week number. + + By default, the week starts on Monday represented by 0 and ends on Sunday + represented by 6. + `DayOfWeekOptions.week_start` can be used to set another starting day using + the ISO numbering convention (1=start week on Monday, 7=start week on Sunday). + Day numbers can start at 0 or 1 based on `DayOfWeekOptions.count_from_zero`. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + options : pyarrow.compute.DayOfWeekOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +day_of_year = _clone_signature(day) +""" +Extract day of year number. + +January 1st maps to day number 1, February 1st to 32, etc. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def hour( + values: lib.TimestampScalar[Any] | lib.Time32Scalar[Any] | lib.Time64Scalar[Any], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def hour( + values: lib.TimestampArray[Any] + | lib.Time32Array[Any] + | lib.Time64Array[Any] + | lib.ChunkedArray[lib.TimestampScalar[Any]] + | lib.ChunkedArray[lib.Time32Scalar[Any]] + | lib.ChunkedArray[lib.Time64Scalar[Any]], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def hour( + values: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def hour(*args, **kwargs): + """ + Extract hour value. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
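+
+    Examples
+    --------
+    A minimal illustrative doctest (example values are arbitrary):
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> from datetime import datetime
+    >>> arr = pa.array([datetime(2023, 3, 1, 14, 30), None])
+    >>> pc.hour(arr).to_pylist()
+    [14, None]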
+ """ + +@overload +def is_dst( + values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def is_dst( + values: lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def is_dst(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def is_dst(*args, **kwargs): + """ + Extracts if currently observing daylight savings. + + IsDaylightSavings returns true if a timestamp has a daylight saving + offset in the given timezone. + Null values emit null. + An error is returned if the values do not have a defined timezone. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def iso_week( + values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar: ... +@overload +def iso_week( + values: lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def iso_week( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def iso_week(*args, **kwargs): + """ + Extract ISO week of year number. + + First ISO week has the majority (4 or more) of its days in January. + ISO week starts on Monday. The week number starts with 1 and can run + up to 53. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +iso_year = _clone_signature(iso_week) +""" +Extract ISO year number. + +First week of an ISO year has the majority (4 or more) of its days in January. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def is_leap_year( + values: lib.TimestampScalar[Any] | lib.Date32Scalar | lib.Date64Scalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def is_leap_year( + values: lib.TimestampArray + | lib.Date32Array + | lib.Date64Array + | lib.ChunkedArray[lib.TimestampScalar] + | lib.ChunkedArray[lib.Date32Scalar] + | lib.ChunkedArray[lib.Date64Scalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def is_leap_year( + values: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def is_leap_year(*args, **kwargs): + """ + Extract if year is a leap year. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
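+
+    Examples
+    --------
+    A small usage sketch with date32 inputs (example values are arbitrary):
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> from datetime import date
+    >>> arr = pa.array([date(2020, 7, 1), date(2023, 7, 1)])
+    >>> pc.is_leap_year(arr).to_pylist()
+    [True, False]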
+ """ + +microsecond = _clone_signature(iso_week) +""" +Extract microsecond values. + +Microsecond returns number of microseconds since the last full millisecond. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +millisecond = _clone_signature(iso_week) +""" +Extract millisecond values. + +Millisecond returns number of milliseconds since the last full second. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +minute = _clone_signature(iso_week) +""" +Extract minute values. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +month = _clone_signature(day_of_week) +""" +Extract month number. + +Month is encoded as January=1, December=12. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +nanosecond = _clone_signature(hour) +""" +Extract nanosecond values. + +Nanosecond returns number of nanoseconds since the last full microsecond. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +quarter = _clone_signature(day_of_week) +""" +Extract quarter of year number. + +First quarter maps to 1 and forth quarter maps to 4. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +second = _clone_signature(hour) +""" +Extract second values. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +subsecond = _clone_signature(hour) +""" +Extract subsecond values. + +Subsecond returns the fraction of a second since the last full second. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. 
+ +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +us_week = _clone_signature(iso_week) +""" +Extract US week of year number. + +First US week has the majority (4 or more) of its days in January. +US week starts on Monday. The week number starts with 1 and can run +up to 53. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +us_year = _clone_signature(iso_week) +""" +Extract US epidemiological year number. + +First week of US epidemiological year has the majority (4 or more) of +it's days in January. Last week of US epidemiological year has the +year's last Wednesday in it. US epidemiological week starts on Sunday. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +year = _clone_signature(iso_week) +""" +Extract year number. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def week( + values: lib.TimestampScalar, + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def week( + values: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def week( + values: Expression, + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def week(*args, **kwargs): + """ + Extract week of year number. + + First week has the majority (4 or more) of its days in January. + Year can have 52 or 53 weeks. Week numbering can start with 0 or 1 using + DayOfWeekOptions.count_from_zero. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + count_from_zero : bool, default False + If True, dates at the start of a year that fall into the last week + of the previous year emit 0. + If False, they emit 52 or 53 (the week number of the last week + of the previous year). 
+    first_week_is_fully_in_year : bool, default False
+        If True, week number 0 is fully in January.
+        If False, a week that begins on December 29, 30 or 31 is considered
+        to be week number 0 of the following year.
+    options : pyarrow.compute.WeekOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+@overload
+def year_month_day(
+    values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.StructScalar: ...
+@overload
+def year_month_day(
+    values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.StructArray: ...
+@overload
+def year_month_day(
+    values: Expression, /, *, memory_pool: lib.MemoryPool | None = None
+) -> Expression: ...
+def year_month_day(*args, **kwargs):
+    """
+    Extract (year, month, day) struct.
+
+    Null values emit null.
+    An error is returned if the values have a defined timezone but it
+    cannot be found in the timezone database.
+
+    Parameters
+    ----------
+    values : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+# ========================= 2.24 Temporal difference =========================
+def day_time_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None):
+    """
+    Compute the number of days and milliseconds between two timestamps.
+
+    Returns the number of days and milliseconds from `start` to `end`.
+    That is, first the difference in days is computed as if both
+    timestamps were truncated to the day, then the difference between the
+    times of the two timestamps is computed as if both times were truncated
+    to the millisecond.
+    Null values return null.
+
+    Parameters
+    ----------
+    start : Array-like or scalar-like
+        Argument to compute function.
+    end : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def days_between(
+    start, end, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Int64Scalar | lib.Int64Array:
+    """
+    Compute the number of days between two timestamps.
+
+    Returns the number of day boundaries crossed from `start` to `end`.
+    That is, the difference is calculated as if the timestamps were
+    truncated to the day.
+    Null values emit null.
+
+    Parameters
+    ----------
+    start : Array-like or scalar-like
+        Argument to compute function.
+    end : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+hours_between = _clone_signature(days_between)
+"""
+Compute the number of hours between two timestamps.
+
+Returns the number of hour boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the hour.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+microseconds_between = _clone_signature(days_between)
+"""
+Compute the number of microseconds between two timestamps.
+
+Returns the number of microsecond boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the microsecond.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+milliseconds_between = _clone_signature(days_between)
+"""
+Compute the number of millisecond boundaries between two timestamps.
+
+Returns the number of millisecond boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the millisecond.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+minutes_between = _clone_signature(days_between)
+"""
+Compute the number of minute boundaries between two timestamps.
+
+Returns the number of minute boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the minute.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+
+def month_day_nano_interval_between(
+    start, end, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray:
+    """
+    Compute the number of months, days and nanoseconds between two timestamps.
+
+    Returns the number of months, days, and nanoseconds from `start` to `end`.
+    That is, first the difference in months is computed as if both timestamps
+    were truncated to the months, then the difference between the days
+    is computed, and finally the difference between the times of the two
+    timestamps is computed as if both times were truncated to the nanosecond.
+    Null values return null.
+
+    Parameters
+    ----------
+    start : Array-like or scalar-like
+        Argument to compute function.
+    end : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def month_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None):
+    """
+    Compute the number of months between two timestamps.
+
+    Returns the number of month boundaries crossed from `start` to `end`.
+    That is, the difference is calculated as if the timestamps were
+    truncated to the month.
+    Null values emit null.
+
+    Parameters
+    ----------
+    start : Array-like or scalar-like
+        Argument to compute function.
+    end : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+ """ + +nanoseconds_between = _clone_signature(days_between) +""" +Compute the number of nanoseconds between two timestamps. + +Returns the number of nanosecond boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the nanosecond. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +quarters_between = _clone_signature(days_between) +""" +Compute the number of quarters between two timestamps. + +Returns the number of quarter start boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the quarter. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +seconds_between = _clone_signature(days_between) +""" +Compute the number of seconds between two timestamps. + +Returns the number of second boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the second. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def weeks_between( + start, + end, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array: + """ + Compute the number of weeks between two timestamps. + + Returns the number of week boundaries crossed from `start` to `end`. + That is, the difference is calculated as if the timestamps were + truncated to the week. + Null values emit null. + + Parameters + ---------- + start : Array-like or scalar-like + Argument to compute function. + end : Array-like or scalar-like + Argument to compute function. + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + options : pyarrow.compute.DayOfWeekOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +years_between = _clone_signature(days_between) +""" +Compute the number of years between two timestamps. + +Returns the number of year boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the year. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
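+
+Examples
+--------
+A minimal illustrative doctest (example values are arbitrary):
+
+>>> import pyarrow as pa
+>>> import pyarrow.compute as pc
+>>> from datetime import datetime
+>>> start = pa.scalar(datetime(2020, 6, 1))
+>>> end = pa.scalar(datetime(2023, 1, 1))
+>>> pc.years_between(start, end).as_py()
+3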
+""" + +# ========================= 2.25 Timezone handling ========================= +@overload +def assume_timezone( + timestamps: lib.TimestampScalar, + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar: ... +@overload +def assume_timezone( + timestamps: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampArray: ... +@overload +def assume_timezone( + timestamps: Expression, + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def assume_timezone(*args, **kwargs): + """ + Convert naive timestamp to timezone-aware timestamp. + + Input timestamps are assumed to be relative to the timezone given in the + `timezone` option. They are converted to UTC-relative timestamps and + the output type has its timezone set to the value of the `timezone` + option. Null values emit null. + This function is meant to be used when an external system produces + "timezone-naive" timestamps which need to be converted to + "timezone-aware" timestamps. An error is returned if the timestamps + already have a defined timezone. + + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + timezone : str + Timezone to assume for the input. + ambiguous : str, default "raise" + How to handle timestamps that are ambiguous in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + nonexistent : str, default "raise" + How to handle timestamps that don't exist in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + options : pyarrow.compute.AssumeTimezoneOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def local_timestamp( + timestamps: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.TimestampScalar: ... +@overload +def local_timestamp( + timestamps: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampArray: ... +@overload +def local_timestamp( + timestamps: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def local_timestamp(*args, **kwargs): + """ + Convert timestamp to a timezone-naive local time timestamp. + + LocalTimestamp converts timezone-aware timestamp to local timestamp + of the given timestamp's timezone and removes timezone metadata. + Alternative name for this timestamp is also wall clock time. + If input is in UTC or without timezone, then unchanged input values + without timezone metadata are returned. + Null values emit null. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
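+
+    Examples
+    --------
+    A small usage sketch, assuming a fixed-offset timezone and an integer
+    epoch value as input (example values are arbitrary):
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> ts = pa.array([0], type=pa.timestamp("s", tz="+02:00"))
+    >>> pc.local_timestamp(ts).to_pylist()
+    [datetime.datetime(1970, 1, 1, 2, 0)]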
+ """ + +# ========================= 2.26 Random number generation ========================= +def random( + n: int, + *, + initializer: Literal["system"] | int = "system", + options: RandomOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: + """ + Generate numbers in the range [0, 1). + + Generated values are uniformly-distributed, double-precision + in range [0, 1). Algorithm and seed can be changed via RandomOptions. + + Parameters + ---------- + n : int + Number of values to generate, must be greater than or equal to 0 + initializer : int or str + How to initialize the underlying random generator. + If an integer is given, it is used as a seed. + If "system" is given, the random generator is initialized with + a system-specific source of (hopefully true) randomness. + Other values are invalid. + options : pyarrow.compute.RandomOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 3. Array-wise (“vector”) functions ========================= + +# ========================= 3.1 Cumulative Functions ========================= +@overload +def cumulative_sum( + values: _NumericArrayT, + /, + start: lib.Scalar | None = None, + *, + skip_nulls: bool = False, + options: CumulativeSumOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def cumulative_sum( + values: Expression, + /, + start: lib.Scalar | None = None, + *, + skip_nulls: bool = False, + options: CumulativeSumOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def cumulative_sum(*args, **kwargs): + """ + Compute the cumulative sum over a numeric input. + + `values` must be numeric. Return an array/chunked array which is the + cumulative sum computed over `values`. Results will wrap around on + integer overflow. Use function "cumulative_sum_checked" if you want + overflow to return an error. The default start is 0. + + Parameters + ---------- + values : Array-like + Argument to compute function. + start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. + skip_nulls : bool, default False + When false, the first encountered null is propagated. + options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +cumulative_sum_checked = _clone_signature(cumulative_sum) +""" +Compute the cumulative sum over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative sum computed over `values`. This function returns an error +on overflow. For a variant that doesn't fail on overflow, use +function "cumulative_sum". The default start is 0. + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
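+
+Examples
+--------
+A minimal illustrative doctest (example values are arbitrary):
+
+>>> import pyarrow as pa
+>>> import pyarrow.compute as pc
+>>> pc.cumulative_sum_checked(pa.array([1, 2, 3, 4])).to_pylist()
+[1, 3, 6, 10]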
+""" +cumulative_prod = _clone_signature(cumulative_sum) +""" +Compute the cumulative product over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative product computed over `values`. Results will wrap around on +integer overflow. Use function "cumulative_prod_checked" if you want +overflow to return an error. The default start is 1. + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cumulative_prod_checked = _clone_signature(cumulative_sum) +""" +Compute the cumulative product over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative product computed over `values`. This function returns an error +on overflow. For a variant that doesn't fail on overflow, use +function "cumulative_prod". The default start is 1. + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cumulative_max = _clone_signature(cumulative_sum) +""" +Compute the cumulative max over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative max computed over `values`. The default start is the minimum +value of input type (so that any other value will replace the +start as the new maximum). + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cumulative_min = _clone_signature(cumulative_sum) +""" +Compute the cumulative min over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative min computed over `values`. The default start is the maximum +value of input type (so that any other value will replace the +start as the new minimum). + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. 
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+cumulative_mean = _clone_signature(cumulative_sum)
+"""
+Compute the cumulative mean over a numeric input.
+
+`values` must be numeric. Return an array/chunked array which is the
+cumulative mean computed over `values`.
+
+Parameters
+----------
+values : Array-like
+    Argument to compute function.
+start : Scalar, default None
+    Starting value for the cumulative operation. If none is given,
+    a default value depending on the operation and input type is used.
+skip_nulls : bool, default False
+    When false, the first encountered null is propagated.
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+# ========================= 3.2 Associative transforms =========================
+
+@overload
+def dictionary_encode(
+    array: _ScalarOrArrayT,
+    /,
+    null_encoding: Literal["mask", "encode"] = "mask",
+    *,
+    options=None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarOrArrayT: ...
+@overload
+def dictionary_encode(
+    array: Expression,
+    /,
+    null_encoding: Literal["mask", "encode"] = "mask",
+    *,
+    options=None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+@overload
+def unique(array: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ...
+@overload
+def unique(array: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ...
+@overload
+def value_counts(
+    array: lib.Array | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.StructArray: ...
+@overload
+def value_counts(
+    array: Expression, /, *, memory_pool: lib.MemoryPool | None = None
+) -> Expression: ...
+
+# ========================= 3.3 Selections =========================
+@overload
+def array_filter(
+    array: _ArrayT,
+    selection_filter: list[bool] | list[bool | None] | BooleanArray,
+    /,
+    null_selection_behavior: Literal["drop", "emit_null"] = "drop",
+    *,
+    options: FilterOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _ArrayT: ...
+@overload
+def array_filter(
+    array: Expression,
+    selection_filter: list[bool] | list[bool | None] | BooleanArray,
+    /,
+    null_selection_behavior: Literal["drop", "emit_null"] = "drop",
+    *,
+    options: FilterOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+@overload
+def array_take(
+    array: _ArrayT,
+    indices: list[int]
+    | list[int | None]
+    | lib.Int16Array
+    | lib.Int32Array
+    | lib.Int64Array
+    | lib.ChunkedArray[lib.Int16Scalar]
+    | lib.ChunkedArray[lib.Int32Scalar]
+    | lib.ChunkedArray[lib.Int64Scalar],
+    /,
+    *,
+    boundscheck: bool = True,
+    options: TakeOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _ArrayT: ...
+@overload
+def array_take(
+    array: Expression,
+    indices: list[int]
+    | list[int | None]
+    | lib.Int16Array
+    | lib.Int32Array
+    | lib.Int64Array
+    | lib.ChunkedArray[lib.Int16Scalar]
+    | lib.ChunkedArray[lib.Int32Scalar]
+    | lib.ChunkedArray[lib.Int64Scalar],
+    /,
+    *,
+    boundscheck: bool = True,
+    options: TakeOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
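A minimal usage sketch (not part of the patch) for the vector kernels typed above, assuming the behavior described in their docstrings; the array contents and printed results are only illustrative.

```python
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, 2, 2, None, 5])

# Cumulative sum: with skip_nulls=False (the default) the first null
# is propagated to all following positions.
pc.cumulative_sum(arr)                # [1, 3, 5, null, null]

# Associative transforms.
pc.unique(arr)                        # [1, 2, null, 5]
pc.value_counts(arr)                  # StructArray of {values, counts}
pc.dictionary_encode(arr)             # DictionaryArray with integer indices

# Selections: boolean-mask filtering and integer-index take.
mask = pa.array([True, False, True, None, True])
pc.array_filter(arr, mask)            # [1, 2, 5] ("drop" null behavior)
pc.array_take(arr, pa.array([0, 3]))  # [1, null]
```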
+@overload +def drop_null(input: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ... +@overload +def drop_null( + input: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... + +filter = array_filter +take = array_take +""" +Select values (or records) from array- or table-like data given integer +selection indices. + +The result will be of the same type(s) as the input, with elements taken +from the input array (or record batch / table fields) at the given +indices. If an index is null then the corresponding value in the output +will be null. + +Parameters +---------- +data : Array, ChunkedArray, RecordBatch, or Table +indices : Array, ChunkedArray + Must be of integer type +boundscheck : boolean, default True + Whether to boundscheck the indices. If False and there is an out of + bounds index, will likely cause the process to crash. +memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + +Returns +------- +result : depends on inputs + Selected values for the given indices + +Examples +-------- +>>> import pyarrow as pa +>>> arr = pa.array(["a", "b", "c", None, "e", "f"]) +>>> indices = pa.array([0, None, 4, 3]) +>>> arr.take(indices) + +[ + "a", + null, + "e", + null +] +""" + +# ========================= 3.4 Containment tests ========================= +@overload +def indices_nonzero( + values: lib.BooleanArray + | lib.NullArray + | NumericArray + | lib.Decimal128Array + | lib.Decimal256Array, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +@overload +def indices_nonzero( + values: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def indices_nonzero(*args, **kwargs): + """ + Return the indices of the values in the array that are non-zero. + + For each input value, check if it's zero, false or null. Emit the index + of the value in the array if it's none of the those. + + Parameters + ---------- + values : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 3.5 Sorts and partitions ========================= +@overload +def array_sort_indices( + array: lib.Array | lib.ChunkedArray, + /, + order: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + options: ArraySortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +@overload +def array_sort_indices( + array: Expression, + /, + order: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + options: ArraySortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def array_sort_indices(*args, **kwargs): + """ + Return the indices that would sort an array. + + This function computes an array of indices that define a stable sort + of the input array. By default, Null values are considered greater + than any other value and are therefore sorted at the end of the array. + For floating-point types, NaNs are considered greater than any + other non-null value, but smaller than null values. + + The handling of nulls and NaNs can be changed in ArraySortOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". 
+ null_placement : str, default "at_end" + Where nulls in the input should be sorted. + Accepted values are "at_start", "at_end". + options : pyarrow.compute.ArraySortOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def partition_nth_indices( + array: lib.Array | lib.ChunkedArray, + /, + pivot: int, + *, + null_placement: _Placement = "at_end", + options: PartitionNthOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +@overload +def partition_nth_indices( + array: Expression, + /, + pivot: int, + *, + null_placement: _Placement = "at_end", + options: PartitionNthOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def partition_nth_indices(*args, **kwargs): + """ + Return the indices that would partition an array around a pivot. + + This functions computes an array of indices that define a non-stable + partial sort of the input array. + + The output is such that the `N`'th index points to the `N`'th element + of the input in sorted order, and all indices before the `N`'th point + to elements in the input less or equal to elements at or after the `N`'th. + + By default, null values are considered greater than any other value + and are therefore partitioned towards the end of the array. + For floating-point types, NaNs are considered greater than any + other non-null value, but smaller than null values. + + The pivot index `N` must be given in PartitionNthOptions. + The handling of nulls and NaNs can also be changed in PartitionNthOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + pivot : int + Index into the equivalent sorted array of the pivot element. + null_placement : str, default "at_end" + Where nulls in the input should be partitioned. + Accepted values are "at_start", "at_end". + options : pyarrow.compute.PartitionNthOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def rank( + input: lib.Array | lib.ChunkedArray, + /, + sort_keys: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + options: RankOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: + """ + Compute ordinal ranks of an array (1-based). + + This function computes a rank of the input array. + By default, null values are considered greater than any other value and + are therefore sorted at the end of the input. For floating-point types, + NaNs are considered greater than any other non-null value, but smaller + than null values. The default tiebreaker is to assign ranks in order of + when ties appear in the input. + + The handling of nulls, NaNs and tiebreakers can be changed in RankOptions. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. 
+ null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + tiebreaker : str, default "first" + Configure how ties between equal values are handled. + Accepted values are: + + - "min": Ties get the smallest possible rank in sorted order. + - "max": Ties get the largest possible rank in sorted order. + - "first": Ranks are assigned in order of when ties appear in the + input. This ensures the ranks are a stable permutation + of the input. + - "dense": The ranks span a dense [1, M] interval where M is the + number of distinct values in the input. + options : pyarrow.compute.RankOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def select_k_unstable( + input: lib.Array | lib.ChunkedArray, + /, + k: int, + sort_keys: list[tuple[str, _Order]], + *, + options: SelectKOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +@overload +def select_k_unstable( + input: Expression, + /, + k: int, + sort_keys: list[tuple[str, _Order]], + *, + options: SelectKOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def select_k_unstable(*args, **kwargs): + """ + Select the indices of the first `k` ordered elements from the input. + + This function selects an array of indices of the first `k` ordered elements + from the `input` array, record batch or table specified in the column keys + (`options.sort_keys`). Output is not guaranteed to be stable. + Null values are considered greater than any other value and are + therefore ordered at the end. For floating-point types, NaNs are considered + greater than any other non-null value, but smaller than null values. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + k : int + Number of leading values to select in sorted order + (i.e. the largest values if sort order is "descending", + the smallest otherwise). + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + options : pyarrow.compute.SelectKOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def sort_indices( + input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + /, + sort_keys: Sequence[tuple[str, _Order]] = (), + *, + null_placement: _Placement = "at_end", + options: SortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +@overload +def sort_indices( + input: Expression, + /, + sort_keys: Sequence[tuple[str, _Order]] = (), + *, + null_placement: _Placement = "at_end", + options: SortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def sort_indices(*args, **kwargs): + """ + Return the indices that would sort an array, record batch or table. + + This function computes an array of indices that define a stable sort + of the input array, record batch or table. By default, null values are + considered greater than any other value and are therefore sorted at the + end of the input. 
For floating-point types, NaNs are considered greater
+    than any other non-null value, but smaller than null values.
+
+    The handling of nulls and NaNs can be changed in SortOptions.
+
+    Parameters
+    ----------
+    input : Array-like or scalar-like
+        Argument to compute function.
+    sort_keys : sequence of (name, order) tuples
+        Names of field/column keys to sort the input on,
+        along with the order each field/column is sorted in.
+        Accepted values for `order` are "ascending", "descending".
+        The field name can be a string column name or expression.
+    null_placement : str, default "at_end"
+        Where nulls in input should be sorted, only applying to
+        columns/fields mentioned in `sort_keys`.
+        Accepted values are "at_start", "at_end".
+    options : pyarrow.compute.SortOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+# ========================= 3.6 Structural transforms =========================
+@overload
+def list_element(
+    lists: Expression, index: ScalarLike, /, *, memory_pool: lib.MemoryPool | None = None
+) -> Expression: ...
+@overload
+def list_element(
+    lists: lib.Array[ListScalar[_DataTypeT]],
+    index: ScalarLike,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Array[lib.Scalar[_DataTypeT]]: ...
+@overload
+def list_element(
+    lists: lib.ChunkedArray[ListScalar[_DataTypeT]],
+    index: ScalarLike,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ...
+@overload
+def list_element(
+    lists: ListScalar[_DataTypeT],
+    index: ScalarLike,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _DataTypeT: ...
+def list_element(*args, **kwargs):
+    """
+    Compute elements of nested list values using an index.
+
+    `lists` must have a list-like type.
+    For each value in each list of `lists`, the element at `index`
+    is emitted. Null values emit a null in the output.
+
+    Parameters
+    ----------
+    lists : Array-like or scalar-like
+        Argument to compute function.
+    index : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+@overload
+def list_flatten(
+    lists: Expression,
+    /,
+    recursive: bool = False,
+    *,
+    options: ListFlattenOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+@overload
+def list_flatten(
+    lists: ArrayOrChunkedArray[ListScalar[Any]],
+    /,
+    recursive: bool = False,
+    *,
+    options: ListFlattenOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.ListArray[Any]: ...
+def list_flatten(*args, **kwargs):
+    """
+    Flatten list values.
+
+    `lists` must have a list-like type (lists, list-views, and
+    fixed-size lists).
+    Return an array with the top list level flattened unless
+    `recursive` is set to true in ListFlattenOptions. When that
+    is the case, flattening happens recursively until a non-list
+    array is formed.
+
+    Null list values do not emit anything to the output.
+
+    Parameters
+    ----------
+    lists : Array-like
+        Argument to compute function.
+    recursive : bool, default False
+        When True, the list array is flattened recursively until an array
+        of non-list values is formed.
+    options : pyarrow.compute.ListFlattenOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+ """ + +@overload +def list_parent_indices( + lists: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def list_parent_indices( + lists: ArrayOrChunkedArray[Any], /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Array: ... +def list_parent_indices(*args, **kwargs): + """ + Compute parent indices of nested list values. + + `lists` must have a list-like or list-view type. + For each value in each list of `lists`, the top-level list index + is emitted. + + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def list_slice( + lists: Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + *, + options: ListSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def list_slice( + lists: ArrayOrChunkedArray[Any], + /, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + *, + options: ListSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[Any]: ... +def list_slice(*args, **kwargs): + """ + Compute slice of list-like array. + + `lists` must have a list-like type. + For each list element, compute a slice, returning a new list array. + A variable or fixed size list array is returned, depending on options. + + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing inner list elements (inclusive). + stop : Optional[int], default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. (NotImplemented) + step : int, default 1 + Slice step. + return_fixed_size_list : Optional[bool], default None + Whether to return a FixedSizeListArray. If true _and_ stop is after + a list element's length, nulls will be appended to create the + requested slice size. The default of `None` will return the same + type which was passed in. + options : pyarrow.compute.ListSliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def map_lookup( + container, + /, + query_key, + occurrence: str, + *, + options: MapLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): + """ + Find the items corresponding to a given key in a Map. + + For a given query key (passed via MapLookupOptions), extract + either the FIRST, LAST or ALL items from a Map that have + matching keys. + + Parameters + ---------- + container : Array-like or scalar-like + Argument to compute function. + query_key : Scalar or Object can be converted to Scalar + The key to search for. + occurrence : str + The occurrence(s) to return from the Map + Accepted values are "first", "last", or "all". + options : pyarrow.compute.MapLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def struct_field( + values, + /, + indices, + *, + options: StructFieldOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): + """ + Extract children of a struct or union by index. 
+ + Given a list of indices (passed via StructFieldOptions), extract + the child array or scalar with the given child index, recursively. + + For union inputs, nulls are emitted for union values that reference + a different child than specified. Also, the indices are always + in physical order, not logical type codes - for example, the first + child is always index 0. + + An empty list of indices returns the argument unchanged. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + indices : List[str], List[bytes], List[int], Expression, bytes, str, or int + List of indices for chained field lookup, for example `[4, 1]` + will look up the second nested field in the fifth outer field. + options : pyarrow.compute.StructFieldOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def fill_null_backward(values, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Carry non-null values backward to fill null slots. + + Given an array, propagate next valid observation backward to previous valid + or nothing if all next values are null. + + Parameters + ---------- + values : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def fill_null_forward(values, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Carry non-null values forward to fill null slots. + + Given an array, propagate last valid observation forward to next valid + or nothing if all previous values are null. + + Parameters + ---------- + values : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def replace_with_mask( + values, + mask: list[bool] | list[bool | None] | BooleanArray, + replacements, + /, + *, + memory_pool: lib.MemoryPool | None = None, +): + """ + Replace items selected with a mask. + + Given an array and a boolean mask (either scalar or of equal length), + along with replacement values (either scalar or array), + each element of the array for which the corresponding mask element is + true will be replaced by the next value from the replacements, + or with null if the mask is null. + Hence, for replacement arrays, len(replacements) == sum(mask == true). + + Parameters + ---------- + values : Array-like + Argument to compute function. + mask : Array-like + Argument to compute function. + replacements : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 3.7 Pairwise functions ========================= +@overload +def pairwise_diff( + input: _NumericOrTemporalArrayT, + /, + period: int = 1, + *, + options: PairwiseOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def pairwise_diff( + input: Expression, + /, + period: int = 1, + *, + options: PairwiseOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def pairwise_diff(*args, **kwargs): + """ + Compute first order difference of an array. 
+ + Computes the first order difference of an array, It internally calls + the scalar function "subtract" to compute + differences, so its + behavior and supported types are the same as + "subtract". The period can be specified in :struct:`PairwiseOptions`. + + Results will wrap around on integer overflow. Use function + "pairwise_diff_checked" if you want overflow to return an error. + + Parameters + ---------- + input : Array-like + Argument to compute function. + period : int, default 1 + Period for applying the period function. + options : pyarrow.compute.PairwiseOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +pairwise_diff_checked = _clone_signature(pairwise_diff) +""" +Compute first order difference of an array. + +Computes the first order difference of an array, It internally calls +the scalar function "subtract_checked" (or the checked variant) to compute +differences, so its behavior and supported types are the same as +"subtract_checked". The period can be specified in :struct:`PairwiseOptions`. + +This function returns an error on overflow. For a variant that doesn't +fail on overflow, use function "pairwise_diff". + +Parameters +---------- +input : Array-like + Argument to compute function. +period : int, default 1 + Period for applying the period function. +options : pyarrow.compute.PairwiseOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" diff --git a/python/pyarrow/csv.pyi b/python/pyarrow/csv.pyi new file mode 100644 index 00000000000..510229d7e72 --- /dev/null +++ b/python/pyarrow/csv.pyi @@ -0,0 +1,27 @@ +from pyarrow._csv import ( + ISO8601, + ConvertOptions, + CSVStreamingReader, + CSVWriter, + InvalidRow, + ParseOptions, + ReadOptions, + WriteOptions, + open_csv, + read_csv, + write_csv, +) + +__all__ = [ + "ISO8601", + "ConvertOptions", + "CSVStreamingReader", + "CSVWriter", + "InvalidRow", + "ParseOptions", + "ReadOptions", + "WriteOptions", + "open_csv", + "read_csv", + "write_csv", +] diff --git a/python/pyarrow/cuda.pyi b/python/pyarrow/cuda.pyi new file mode 100644 index 00000000000..e11baf7d4e7 --- /dev/null +++ b/python/pyarrow/cuda.pyi @@ -0,0 +1,25 @@ +from pyarrow._cuda import ( + BufferReader, + BufferWriter, + Context, + CudaBuffer, + HostBuffer, + IpcMemHandle, + new_host_buffer, + read_message, + read_record_batch, + serialize_record_batch, +) + +__all__ = [ + "BufferReader", + "BufferWriter", + "Context", + "CudaBuffer", + "HostBuffer", + "IpcMemHandle", + "new_host_buffer", + "read_message", + "read_record_batch", + "serialize_record_batch", +] diff --git a/python/pyarrow/dataset.pyi b/python/pyarrow/dataset.pyi new file mode 100644 index 00000000000..98f1a38aa85 --- /dev/null +++ b/python/pyarrow/dataset.pyi @@ -0,0 +1,229 @@ +from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload + +from _typeshed import StrPath +from pyarrow._dataset import ( + CsvFileFormat, + CsvFragmentScanOptions, + Dataset, + DatasetFactory, + DirectoryPartitioning, + FeatherFileFormat, + FileFormat, + FileFragment, + FilenamePartitioning, + FileSystemDataset, + FileSystemDatasetFactory, + FileSystemFactoryOptions, + FileWriteOptions, + Fragment, + FragmentScanOptions, + HivePartitioning, + InMemoryDataset, + IpcFileFormat, + IpcFileWriteOptions, + JsonFileFormat, + JsonFragmentScanOptions, + Partitioning, 
+ PartitioningFactory, + Scanner, + TaggedRecordBatch, + UnionDataset, + UnionDatasetFactory, + WrittenFile, + get_partition_keys, +) +from pyarrow._dataset_orc import OrcFileFormat +from pyarrow._dataset_parquet import ( + ParquetDatasetFactory, + ParquetFactoryOptions, + ParquetFileFormat, + ParquetFileFragment, + ParquetFileWriteOptions, + ParquetFragmentScanOptions, + ParquetReadOptions, + RowGroupInfo, +) +from pyarrow._dataset_parquet_encryption import ( + ParquetDecryptionConfig, + ParquetEncryptionConfig, +) +from pyarrow.compute import Expression, field, scalar +from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table + +from ._fs import SupportedFileSystem + +_orc_available: bool +_parquet_available: bool + +__all__ = [ + "CsvFileFormat", + "CsvFragmentScanOptions", + "Dataset", + "DatasetFactory", + "DirectoryPartitioning", + "FeatherFileFormat", + "FileFormat", + "FileFragment", + "FilenamePartitioning", + "FileSystemDataset", + "FileSystemDatasetFactory", + "FileSystemFactoryOptions", + "FileWriteOptions", + "Fragment", + "FragmentScanOptions", + "HivePartitioning", + "InMemoryDataset", + "IpcFileFormat", + "IpcFileWriteOptions", + "JsonFileFormat", + "JsonFragmentScanOptions", + "Partitioning", + "PartitioningFactory", + "Scanner", + "TaggedRecordBatch", + "UnionDataset", + "UnionDatasetFactory", + "WrittenFile", + "get_partition_keys", + # Orc + "OrcFileFormat", + # Parquet + "ParquetDatasetFactory", + "ParquetFactoryOptions", + "ParquetFileFormat", + "ParquetFileFragment", + "ParquetFileWriteOptions", + "ParquetFragmentScanOptions", + "ParquetReadOptions", + "RowGroupInfo", + # Parquet Encryption + "ParquetDecryptionConfig", + "ParquetEncryptionConfig", + # Compute + "Expression", + "field", + "scalar", + # Dataset + "partitioning", + "parquet_dataset", + "write_dataset", +] + +_DatasetFormat: TypeAlias = Literal["parquet", "ipc", "arrow", "feather", "csv"] + +@overload +def partitioning( + schema: Schema, +) -> Partitioning: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["filename"], + dictionaries: dict[str, Array] | None = None, +) -> Partitioning: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["filename"], + dictionaries: Literal["infer"], +) -> PartitioningFactory: ... +@overload +def partitioning( + field_names: list[str], + *, + flavor: Literal["filename"], +) -> PartitioningFactory: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["hive"], + dictionaries: Literal["infer"], +) -> PartitioningFactory: ... +@overload +def partitioning( + *, + flavor: Literal["hive"], +) -> PartitioningFactory: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["hive"], + dictionaries: dict[str, Array] | None = None, +) -> Partitioning: ... +def parquet_dataset( + metadata_path: StrPath, + schema: Schema | None = None, + filesystem: SupportedFileSystem | None = None, + format: ParquetFileFormat | None = None, + partitioning: Partitioning | PartitioningFactory | None = None, + partition_base_dir: str | None = None, +) -> FileSystemDataset: ... 
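A hedged sketch (not part of the diff) tying the `partitioning()` overloads above to the `dataset()` / `write_dataset()` entry points stubbed just below; the `events` path and column names are invented for illustration.

```python
import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table({"year": pa.array([2023, 2023, 2024], pa.int32()),
                  "value": [1.0, 2.0, 3.0]})

# A concrete HivePartitioning carries an explicit schema ...
hive = ds.partitioning(pa.schema([("year", pa.int32())]), flavor="hive")
ds.write_dataset(table, "events", format="parquet", partitioning=hive)

# ... while the factory form (PartitioningFactory) infers the partition
# schema from the directory names when reading back.
dataset = ds.dataset("events", format="parquet",
                     partitioning=ds.partitioning(flavor="hive"))
print(dataset.to_table().num_rows)  # 3
```

Passing `partitioning=["year"], partitioning_flavor="hive"` to `write_dataset()` is the shorthand form for the same layout.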
+@overload +def dataset( + source: StrPath | Sequence[StrPath], + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> FileSystemDataset: ... +@overload +def dataset( + source: list[Dataset], + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> UnionDataset: ... +@overload +def dataset( + source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> InMemoryDataset: ... +@overload +def dataset( + source: RecordBatch | Table, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> InMemoryDataset: ... +def write_dataset( + data: Dataset | Table | RecordBatch | RecordBatchReader | list[Table] | Iterable[RecordBatch], + base_dir: StrPath, + *, + basename_template: str | None = None, + format: FileFormat | _DatasetFormat | None = None, + partitioning: Partitioning | list[str] | None = None, + partitioning_flavor: str | None = None, + schema: Schema | None = None, + filesystem: SupportedFileSystem | None = None, + file_options: FileWriteOptions | None = None, + use_threads: bool = True, + max_partitions: int = 1024, + max_open_files: int = 1024, + max_rows_per_file: int = 0, + min_rows_per_group: int = 0, + max_rows_per_group: int = 1024 * 1024, + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"] = "error", + create_dir: bool = True, +): ... diff --git a/python/pyarrow/feather.pyi b/python/pyarrow/feather.pyi new file mode 100644 index 00000000000..9451ee15763 --- /dev/null +++ b/python/pyarrow/feather.pyi @@ -0,0 +1,50 @@ +from typing import IO, Literal + +import pandas as pd + +from _typeshed import StrPath +from pyarrow._feather import FeatherError +from pyarrow.lib import Table + +__all__ = [ + "FeatherError", + "FeatherDataset", + "check_chunked_overflow", + "write_feather", + "read_feather", + "read_table", +] + +class FeatherDataset: + path_or_paths: str | list[str] + validate_schema: bool + + def __init__(self, path_or_paths: str | list[str], validate_schema: bool = True) -> None: ... + def read_table(self, columns: list[str] | None = None) -> Table: ... + def validate_schemas(self, piece, table: Table) -> None: ... + def read_pandas( + self, columns: list[str] | None = None, use_threads: bool = True + ) -> pd.DataFrame: ... 
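Not part of the patch: a short sketch of the legacy `FeatherDataset` reader stubbed above, which concatenates several Feather files sharing one schema. `write_feather` (typed further below in this file) is used only to produce the inputs; the file names are arbitrary.

```python
import pyarrow as pa
from pyarrow import feather

feather.write_feather(pa.table({"x": [1, 2], "y": ["a", "b"]}), "part-0.feather")
feather.write_feather(pa.table({"x": [3, 4], "y": ["c", "d"]}), "part-1.feather")

dataset = feather.FeatherDataset(["part-0.feather", "part-1.feather"])
table = dataset.read_table(columns=["x"])  # schemas validated, then concatenated
print(table.num_rows)                      # 4
```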
+ +def check_chunked_overflow(name: str, col) -> None: ... +def write_feather( + df: pd.DataFrame | Table, + dest: StrPath | IO, + compression: Literal["zstd", "lz4", "uncompressed"] | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: Literal[1, 2] = 2, +) -> None: ... +def read_feather( + source: StrPath | IO, + columns: list[str] | None = None, + use_threads: bool = True, + memory_map: bool = False, + **kwargs, +) -> pd.DataFrame: ... +def read_table( + source: StrPath | IO, + columns: list[str] | None = None, + memory_map: bool = False, + use_threads: bool = True, +) -> Table: ... diff --git a/python/pyarrow/flight.pyi b/python/pyarrow/flight.pyi new file mode 100644 index 00000000000..9b806ccf305 --- /dev/null +++ b/python/pyarrow/flight.pyi @@ -0,0 +1,95 @@ +from pyarrow._flight import ( + Action, + ActionType, + BasicAuth, + CallInfo, + CertKeyPair, + ClientAuthHandler, + ClientMiddleware, + ClientMiddlewareFactory, + DescriptorType, + FlightCallOptions, + FlightCancelledError, + FlightClient, + FlightDataStream, + FlightDescriptor, + FlightEndpoint, + FlightError, + FlightInfo, + FlightInternalError, + FlightMetadataReader, + FlightMetadataWriter, + FlightMethod, + FlightServerBase, + FlightServerError, + FlightStreamChunk, + FlightStreamReader, + FlightStreamWriter, + FlightTimedOutError, + FlightUnauthenticatedError, + FlightUnauthorizedError, + FlightUnavailableError, + FlightWriteSizeExceededError, + GeneratorStream, + Location, + MetadataRecordBatchReader, + MetadataRecordBatchWriter, + RecordBatchStream, + Result, + SchemaResult, + ServerAuthHandler, + ServerCallContext, + ServerMiddleware, + ServerMiddlewareFactory, + Ticket, + TracingServerMiddlewareFactory, + connect, +) + +__all__ = [ + "Action", + "ActionType", + "BasicAuth", + "CallInfo", + "CertKeyPair", + "ClientAuthHandler", + "ClientMiddleware", + "ClientMiddlewareFactory", + "DescriptorType", + "FlightCallOptions", + "FlightCancelledError", + "FlightClient", + "FlightDataStream", + "FlightDescriptor", + "FlightEndpoint", + "FlightError", + "FlightInfo", + "FlightInternalError", + "FlightMetadataReader", + "FlightMetadataWriter", + "FlightMethod", + "FlightServerBase", + "FlightServerError", + "FlightStreamChunk", + "FlightStreamReader", + "FlightStreamWriter", + "FlightTimedOutError", + "FlightUnauthenticatedError", + "FlightUnauthorizedError", + "FlightUnavailableError", + "FlightWriteSizeExceededError", + "GeneratorStream", + "Location", + "MetadataRecordBatchReader", + "MetadataRecordBatchWriter", + "RecordBatchStream", + "Result", + "SchemaResult", + "ServerAuthHandler", + "ServerCallContext", + "ServerMiddleware", + "ServerMiddlewareFactory", + "Ticket", + "TracingServerMiddlewareFactory", + "connect", +] diff --git a/python/pyarrow/fs.pyi b/python/pyarrow/fs.pyi new file mode 100644 index 00000000000..6bf75616c13 --- /dev/null +++ b/python/pyarrow/fs.pyi @@ -0,0 +1,77 @@ +from pyarrow._fs import ( # noqa + FileSelector, + FileType, + FileInfo, + FileSystem, + LocalFileSystem, + SubTreeFileSystem, + _MockFileSystem, + FileSystemHandler, + PyFileSystem, + SupportedFileSystem, +) +from pyarrow._azurefs import AzureFileSystem +from pyarrow._hdfs import HadoopFileSystem +from pyarrow._gcsfs import GcsFileSystem +from pyarrow._s3fs import ( # noqa + AwsDefaultS3RetryStrategy, + AwsStandardS3RetryStrategy, + S3FileSystem, + S3LogLevel, + S3RetryStrategy, + ensure_s3_initialized, + finalize_s3, + ensure_s3_finalized, + initialize_s3, + resolve_s3_region, +) + +FileStats = 
FileInfo + +def copy_files( + source: str, + destination: str, + source_filesystem: SupportedFileSystem | None = None, + destination_filesystem: SupportedFileSystem | None = None, + *, + chunk_size: int = 1024 * 1024, + use_threads: bool = True, +) -> None: ... + +class FSSpecHandler(FileSystemHandler): # type: ignore[misc] + fs: SupportedFileSystem + def __init__(self, fs: SupportedFileSystem) -> None: ... + +__all__ = [ + # _fs + "FileSelector", + "FileType", + "FileInfo", + "FileSystem", + "LocalFileSystem", + "SubTreeFileSystem", + "_MockFileSystem", + "FileSystemHandler", + "PyFileSystem", + # _azurefs + "AzureFileSystem", + # _hdfs + "HadoopFileSystem", + # _gcsfs + "GcsFileSystem", + # _s3fs + "AwsDefaultS3RetryStrategy", + "AwsStandardS3RetryStrategy", + "S3FileSystem", + "S3LogLevel", + "S3RetryStrategy", + "ensure_s3_initialized", + "finalize_s3", + "ensure_s3_finalized", + "initialize_s3", + "resolve_s3_region", + # fs + "FileStats", + "copy_files", + "FSSpecHandler", +] diff --git a/python/pyarrow/gandiva.pyi b/python/pyarrow/gandiva.pyi new file mode 100644 index 00000000000..a344f885b29 --- /dev/null +++ b/python/pyarrow/gandiva.pyi @@ -0,0 +1,65 @@ +from typing import Iterable, Literal + +from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable + +class Node(_Weakrefable): + def return_type(self) -> DataType: ... + +class Expression(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + +class Condition(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + +class SelectionVector(_Weakrefable): + def to_array(self) -> Array: ... + +class Projector(_Weakrefable): + @property + def llvm_ir(self): ... + def evaluate( + self, batch: RecordBatch, selection: SelectionVector | None = None + ) -> list[Array]: ... + +class Filter(_Weakrefable): + @property + def llvm_ir(self): ... + def evaluate( + self, batch: RecordBatch, pool: MemoryPool, dtype: DataType | str = "int32" + ) -> SelectionVector: ... + +class TreeExprBuilder(_Weakrefable): + def make_literal(self, value: float | str | bytes | bool, dtype: DataType) -> Node: ... + def make_expression(self, root_node: Node, return_field: Field) -> Expression: ... + def make_function(self, name: str, children: list[Node], return_type: DataType) -> Node: ... + def make_field(self, field: Field) -> Node: ... + def make_if( + self, condition: Node, this_node: Node, else_node: Node, return_type: DataType + ) -> Node: ... + def make_and(self, children: list[Node]) -> Node: ... + def make_or(self, children: list[Node]) -> Node: ... + def make_in_expression(self, node: Node, values: Iterable, dtype: DataType) -> Node: ... + def make_condition(self, condition: Node) -> Condition: ... + +class Configuration(_Weakrefable): + def __init__(self, optimize: bool = True, dump_ir: bool = False) -> None: ... + +def make_projector( + schema: Schema, + children: list[Expression], + pool: MemoryPool, + selection_mode: Literal["NONE", "UINT16", "UINT32", "UINT64"] = "NONE", + configuration: Configuration | None = None, +) -> Projector: ... +def make_filter( + schema: Schema, condition: Condition, configuration: Configuration | None = None +) -> Filter: ... + +class FunctionSignature(_Weakrefable): + def return_type(self) -> DataType: ... + def param_types(self) -> list[DataType]: ... + def name(self) -> str: ... + +def get_registered_function_signatures() -> list[FunctionSignature]: ... 
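An illustrative sketch (not part of the patch) of the Gandiva expression-builder API stubbed above. It assumes a pyarrow build with the optional Gandiva component enabled, so treat it as a sketch rather than something every installation can run.

```python
import pyarrow as pa
import pyarrow.gandiva as gandiva

schema = pa.schema([("a", pa.int64()), ("b", pa.int64())])
builder = gandiva.TreeExprBuilder()

# Build the expression a + b and project it into a new field "a_plus_b".
add_node = builder.make_function(
    "add",
    [builder.make_field(schema.field("a")), builder.make_field(schema.field("b"))],
    pa.int64(),
)
expr = builder.make_expression(add_node, pa.field("a_plus_b", pa.int64()))
projector = gandiva.make_projector(schema, [expr], pa.default_memory_pool())

batch = pa.record_batch([pa.array([1, 2]), pa.array([10, 20])], schema=schema)
result, = projector.evaluate(batch)  # one output Array per expression
print(result)                        # [11, 22]
```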
diff --git a/python/pyarrow/interchange/__init__.pyi b/python/pyarrow/interchange/__init__.pyi new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pyarrow/interchange/buffer.pyi b/python/pyarrow/interchange/buffer.pyi new file mode 100644 index 00000000000..46673961a75 --- /dev/null +++ b/python/pyarrow/interchange/buffer.pyi @@ -0,0 +1,58 @@ +import enum + +from pyarrow.lib import Buffer + +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + +class _PyArrowBuffer: + """ + Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ + def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ... + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + def __dlpack__(self): + """ + Produce DLPack capsule (see array API standard). + + Raises: + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + """ + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. + """ diff --git a/python/pyarrow/interchange/column.pyi b/python/pyarrow/interchange/column.pyi new file mode 100644 index 00000000000..e6662867b6b --- /dev/null +++ b/python/pyarrow/interchange/column.pyi @@ -0,0 +1,252 @@ +import enum + +from typing import Any, Iterable, TypeAlias, TypedDict + +from pyarrow.lib import Array, ChunkedArray + +from .buffer import _PyArrowBuffer + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + +Dtype: TypeAlias = tuple[DtypeKind, int, str, str] + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. 
+ """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + +class ColumnBuffers(TypedDict): + data: tuple[_PyArrowBuffer, Dtype] + validity: tuple[_PyArrowBuffer, Dtype] | None + offsets: tuple[_PyArrowBuffer, Dtype] | None + +class CategoricalDescription(TypedDict): + is_ordered: bool + is_dictionary: bool + categories: _PyArrowColumn | None + +class Endianness(enum.Enum): + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" + +class _PyArrowColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + def __init__(self, column: Array | ChunkedArray, allow_copy: bool = True) -> None: ... + def size(self) -> int: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + @property + def offset(self) -> int: + """ + Offset of first element. + + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + @property + def dtype(self) -> tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. 
+ + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. + """ + @property + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding categorical + values. + + Raises TypeError if the dtype is not categorical + + Returns the dictionary with description on how to interpret the + data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices + is semantically meaningful. + - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of + indices to category values (e.g. an array of + cat1, cat2, ...). None if not a dictionary-style + categorical. + + TBD: are there any other in-memory representations that are needed? + """ + @property + def describe_null(self) -> tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. + None otherwise. + """ + @property + def null_count(self) -> int: + """ + Number of null elements, if known. + + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + @property + def metadata(self) -> dict[str, Any]: + """ + The metadata for the column. See `DataFrame.metadata` for more details. + """ + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. 
None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ diff --git a/python/pyarrow/interchange/dataframe.pyi b/python/pyarrow/interchange/dataframe.pyi new file mode 100644 index 00000000000..526a58926a9 --- /dev/null +++ b/python/pyarrow/interchange/dataframe.pyi @@ -0,0 +1,102 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Any, Iterable, Sequence + +from pyarrow.interchange.column import _PyArrowColumn +from pyarrow.lib import RecordBatch, Table + +class _PyArrowDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + + def __init__( + self, df: Table | RecordBatch, nan_as_null: bool = False, allow_copy: bool = True + ) -> None: ... + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: + """ + Construct a new exchange object, potentially changing the parameters. + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN``. + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this + protocol specifies contiguous buffers. + """ + @property + def metadata(self) -> dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ + def num_rows(self) -> int: + """ + Return the number of rows in the DataFrame, if available. + """ + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + def get_column(self, i: int) -> _PyArrowColumn: + """ + Return the column at the indicated position. + """ + def get_column_by_name(self, name: str) -> _PyArrowColumn: + """ + Return the column whose name is the indicated name. 
+ """ + def get_columns(self) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the columns. + """ + def select_columns(self, indices: Sequence[int]) -> Self: + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + def select_columns_by_name(self, names: Sequence[str]) -> Self: + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: + """ + Return an iterator yielding the chunks. + + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + + Note that the producer must ensure that all columns are chunked the + same way. + """ diff --git a/python/pyarrow/interchange/from_dataframe.pyi b/python/pyarrow/interchange/from_dataframe.pyi new file mode 100644 index 00000000000..b04b6268975 --- /dev/null +++ b/python/pyarrow/interchange/from_dataframe.pyi @@ -0,0 +1,244 @@ +from typing import Any, Protocol, TypeAlias + +from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table + +from .column import ( + ColumnBuffers, + ColumnNullType, + Dtype, + DtypeKind, +) + +class DataFrameObject(Protocol): + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> Any: ... + +ColumnObject: TypeAlias = Any + +def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: + """ + Build a ``pa.Table`` from any DataFrame supporting the interchange protocol. + + Parameters + ---------- + df : DataFrameObject + Object supporting the interchange protocol, i.e. `__dataframe__` + method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Table + + Examples + -------- + >>> import pyarrow + >>> from pyarrow.interchange import from_dataframe + + Convert a pandas dataframe to a pyarrow table: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_attendees": [100, 10, 1], + ... "country": ["Italy", "Spain", "Slovenia"], + ... } + ... ) + >>> df + n_attendees country + 0 100 Italy + 1 10 Spain + 2 1 Slovenia + >>> from_dataframe(df) + pyarrow.Table + n_attendees: int64 + country: large_string + ---- + n_attendees: [[100,10,1]] + country: [["Italy","Spain","Slovenia"]] + """ + +def protocol_df_chunk_to_pyarrow(df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: + """ + Convert interchange protocol chunk to ``pa.RecordBatch``. + + Parameters + ---------- + df : DataFrameObject + Object supporting the interchange protocol, i.e. `__dataframe__` + method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.RecordBatch + """ + +def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: + """ + Convert a column holding one of the primitive dtypes to a PyArrow array. + A primitive type is one of: int, uint, float, bool (1 bit). + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). 
+ + Returns + ------- + pa.Array + """ + +def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: + """ + Convert a column holding boolean dtype to a PyArrow array. + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + """ + +def categorical_column_to_dictionary( + col: ColumnObject, allow_copy: bool = True +) -> DictionaryArray: + """ + Convert a column holding categorical data to a pa.DictionaryArray. + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.DictionaryArray + """ + +def parse_datetime_format_str(format_str: str) -> tuple[str, str]: + """Parse datetime `format_str` to interpret the `data`.""" + +def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: + """Map column date type to pyarrow date type.""" + +def buffers_to_array( + buffers: ColumnBuffers, + data_type: tuple[DtypeKind, int, str, str], + length: int, + describe_null: ColumnNullType, + offset: int = 0, + allow_copy: bool = True, +) -> Array: + """ + Build a PyArrow array from the passed buffer. + + Parameters + ---------- + buffer : ColumnBuffers + Dictionary containing tuples of underlying buffers and + their associated dtype. + data_type : Tuple[DtypeKind, int, str, str], + Dtype description of the column as a tuple ``(kind, bit-width, format string, + endianness)``. + length : int + The number of values in the array. + describe_null: ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + + Notes + ----- + The returned array doesn't own the memory. The caller of this function + is responsible for keeping the memory owner object alive as long as + the returned PyArrow array is being used. + """ + +def validity_buffer_from_mask( + validity_buff: Buffer, + validity_dtype: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: + """ + Build a PyArrow buffer from the passed mask buffer. + + Parameters + ---------- + validity_buff : BufferObject + Tuple of underlying validity buffer and associated dtype. + validity_dtype : Dtype + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + describe_null : ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + length : int + The number of values in the array. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Buffer + """ + +def validity_buffer_nan_sentinel( + data_pa_buffer: Buffer, + data_type: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: + """ + Build a PyArrow buffer from NaN or sentinel values. 
+ + Parameters + ---------- + data_pa_buffer : pa.Buffer + PyArrow buffer for the column data. + data_type : Dtype + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + describe_null : ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + length : int + The number of values in the array. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Buffer + """ diff --git a/python/pyarrow/ipc.pyi b/python/pyarrow/ipc.pyi new file mode 100644 index 00000000000..c7f2af004d4 --- /dev/null +++ b/python/pyarrow/ipc.pyi @@ -0,0 +1,123 @@ +from io import IOBase + +import pandas as pd +import pyarrow.lib as lib + +from pyarrow.lib import ( + IpcReadOptions, + IpcWriteOptions, + Message, + MessageReader, + MetadataVersion, + ReadStats, + RecordBatchReader, + WriteStats, + _ReadPandasMixin, + get_record_batch_size, + get_tensor_size, + read_message, + read_record_batch, + read_schema, + read_tensor, + write_tensor, +) + +class RecordBatchStreamReader(lib._RecordBatchStreamReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + +class RecordBatchStreamWriter(lib._RecordBatchStreamWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + +class RecordBatchFileReader(lib._RecordBatchFileReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: IpcReadOptions | None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + +class RecordBatchFileWriter(lib._RecordBatchFileWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + +def new_stream( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, +) -> RecordBatchStreamWriter: ... +def open_stream( + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchStreamReader: ... +def new_file( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, +) -> RecordBatchFileWriter: ... +def open_file( + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchFileReader: ... +def serialize_pandas( + df: pd.DataFrame, *, nthreads: int | None = None, preserve_index: bool | None = None +) -> lib.Buffer: ... +def deserialize_pandas(buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ... 
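The stream and file reader/writer classes above mirror pyarrow's public IPC API. As a minimal round-trip sketch (the in-memory ``BufferOutputStream`` sink and the column name ``x`` are illustrative choices, not part of this patch):

.. code-block:: python

    import pyarrow as pa
    import pyarrow.ipc as ipc

    # Write one record batch to an in-memory IPC stream.
    batch = pa.record_batch([pa.array([1, 2, 3])], names=["x"])
    sink = pa.BufferOutputStream()
    with ipc.new_stream(sink, batch.schema) as writer:
        writer.write_batch(batch)

    # Read it back; open_stream accepts a Buffer, NativeFile or file-like object.
    with ipc.open_stream(sink.getvalue()) as reader:
        table = reader.read_all()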
+ +__all__ = [ + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "MetadataVersion", + "ReadStats", + "RecordBatchReader", + "WriteStats", + "_ReadPandasMixin", + "get_record_batch_size", + "get_tensor_size", + "read_message", + "read_record_batch", + "read_schema", + "read_tensor", + "write_tensor", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "new_stream", + "open_stream", + "new_file", + "open_file", + "serialize_pandas", + "deserialize_pandas", +] diff --git a/python/pyarrow/json.pyi b/python/pyarrow/json.pyi new file mode 100644 index 00000000000..db1d35e0b8b --- /dev/null +++ b/python/pyarrow/json.pyi @@ -0,0 +1,3 @@ +from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json + +__all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/pyarrow/lib.pyi b/python/pyarrow/lib.pyi new file mode 100644 index 00000000000..1698b55520b --- /dev/null +++ b/python/pyarrow/lib.pyi @@ -0,0 +1,106 @@ +# ruff: noqa: F403 +from typing import NamedTuple + +from .__lib_pxi.array import * +from .__lib_pxi.benchmark import * +from .__lib_pxi.builder import * +from .__lib_pxi.compat import * +from .__lib_pxi.config import * +from .__lib_pxi.device import * +from .__lib_pxi.error import * +from .__lib_pxi.io import * +from .__lib_pxi.ipc import * +from .__lib_pxi.memory import * +from .__lib_pxi.pandas_shim import * +from .__lib_pxi.scalar import * +from .__lib_pxi.table import * +from .__lib_pxi.tensor import * +from .__lib_pxi.types import * + +class MonthDayNano(NamedTuple): + months: int + days: int + nanoseconds: int + +def cpu_count() -> int: + """ + Return the number of threads to use in parallel operations. + + The number of threads is determined at startup by inspecting the + ``OMP_NUM_THREADS`` and ``OMP_THREAD_LIMIT`` environment variables. + If neither is present, it will default to the number of hardware threads + on the system. It can be modified at runtime by calling + :func:`set_cpu_count()`. + + See Also + -------- + set_cpu_count : Modify the size of this pool. + io_thread_count : The analogous function for the I/O thread pool. + """ + +def set_cpu_count(count: int) -> None: + """ + Set the number of threads to use in parallel operations. + + Parameters + ---------- + count : int + The number of concurrent threads that should be used. + + See Also + -------- + cpu_count : Get the size of this pool. + set_io_thread_count : The analogous function for the I/O thread pool. + """ + +def is_threading_enabled() -> bool: + """ + Returns True if threading is enabled in libarrow. + + If it isn't enabled, then python shouldn't create any + threads either, because we're probably on a system where + threading doesn't work (e.g. Emscripten).
+ """ + +Type_NA: int +Type_BOOL: int +Type_UINT8: int +Type_INT8: int +Type_UINT16: int +Type_INT16: int +Type_UINT32: int +Type_INT32: int +Type_UINT64: int +Type_INT64: int +Type_HALF_FLOAT: int +Type_FLOAT: int +Type_DOUBLE: int +Type_DECIMAL128: int +Type_DECIMAL256: int +Type_DATE32: int +Type_DATE64: int +Type_TIMESTAMP: int +Type_TIME32: int +Type_TIME64: int +Type_DURATION: int +Type_INTERVAL_MONTH_DAY_NANO: int +Type_BINARY: int +Type_STRING: int +Type_LARGE_BINARY: int +Type_LARGE_STRING: int +Type_FIXED_SIZE_BINARY: int +Type_BINARY_VIEW: int +Type_STRING_VIEW: int +Type_LIST: int +Type_LARGE_LIST: int +Type_LIST_VIEW: int +Type_LARGE_LIST_VIEW: int +Type_MAP: int +Type_FIXED_SIZE_LIST: int +Type_STRUCT: int +Type_SPARSE_UNION: int +Type_DENSE_UNION: int +Type_DICTIONARY: int +Type_RUN_END_ENCODED: int +UnionMode_SPARSE: int +UnionMode_DENSE: int diff --git a/python/pyarrow/orc.pyi b/python/pyarrow/orc.pyi new file mode 100644 index 00000000000..2eba8d40a11 --- /dev/null +++ b/python/pyarrow/orc.pyi @@ -0,0 +1,279 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Literal + +from _typeshed import StrPath + +from . import _orc +from ._fs import SupportedFileSystem +from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table + +class ORCFile: + """ + Reader interface for a single ORC file + + Parameters + ---------- + source : str or pyarrow.NativeFile + Readable source. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. + """ + + reader: _orc.ORCReader + def __init__(self, source: StrPath | NativeFile | IO) -> None: ... + @property + def metadata(self) -> KeyValueMetadata: + """The file metadata, as an arrow KeyValueMetadata""" + @property + def schema(self) -> Schema: + """The file schema, as an arrow schema""" + @property + def nrows(self) -> int: + """The number of rows in the file""" + @property + def nstripes(self) -> int: + """The number of stripes in the file""" + @property + def file_version(self) -> str: + """Format version of the ORC file, must be 0.11 or 0.12""" + @property + def software_version(self) -> str: + """Software instance and version that wrote this file""" + @property + def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: + """Compression codec of the file""" + @property + def compression_size(self) -> int: + """Number of bytes to buffer for the compression codec in the file""" + @property + def writer(self) -> str: + """Name of the writer that wrote this file. 
+ If the writer is unknown then its Writer ID + (a number) is returned""" + @property + def writer_version(self) -> str: + """Version of the writer""" + @property + def row_index_stride(self) -> int: + """Number of rows per an entry in the row index or 0 + if there is no row index""" + @property + def nstripe_statistics(self) -> int: + """Number of stripe statistics""" + @property + def content_length(self) -> int: + """Length of the data stripes in the file in bytes""" + @property + def stripe_statistics_length(self) -> int: + """The number of compressed bytes in the file stripe statistics""" + @property + def file_footer_length(self) -> int: + """The number of compressed bytes in the file footer""" + @property + def file_postscript_length(self) -> int: + """The number of bytes in the file postscript""" + @property + def file_length(self) -> int: + """The number of bytes in the file""" + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: + """Read a single stripe from the file. + + Parameters + ---------- + n : int + The stripe index + columns : list + If not None, only these columns will be read from the stripe. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e' + + Returns + ------- + pyarrow.RecordBatch + Content of the stripe as a RecordBatch. + """ + def read(self, columns: list[str] | None = None) -> Table: + """Read the whole file. + + Parameters + ---------- + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. Output always follows the + ordering of the file and not the `columns` list. + + Returns + ------- + pyarrow.Table + Content of the file as a Table. + """ + +class ORCWriter: + """ + Writer interface for a single ORC file + + Parameters + ---------- + where : str or pyarrow.io.NativeFile + Writable target. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream + or pyarrow.io.FixedSizeBufferWriter. + file_version : {"0.11", "0.12"}, default "0.12" + Determine which ORC file version to use. + `Hive 0.11 / ORC v0 `_ + is the older version + while `Hive 0.12 / ORC v1 `_ + is the newer one. + batch_size : int, default 1024 + Number of rows the ORC writer writes at a time. + stripe_size : int, default 64 * 1024 * 1024 + Size of each ORC stripe in bytes. + compression : string, default 'uncompressed' + The compression codec. + Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} + Note that LZ0 is currently not supported. + compression_block_size : int, default 64 * 1024 + Size of each compression block in bytes. + compression_strategy : string, default 'speed' + The compression strategy i.e. speed vs size reduction. + Valid values: {'SPEED', 'COMPRESSION'} + row_index_stride : int, default 10000 + The row index stride i.e. the number of rows per + an entry in the row index. + padding_tolerance : double, default 0.0 + The padding tolerance. + dictionary_key_size_threshold : double, default 0.0 + The dictionary key size threshold. 0 to disable dictionary encoding. + 1 to always enable dictionary encoding. + bloom_filter_columns : None, set-like or list-like, default None + Columns that use the bloom filter. + bloom_filter_fpp : double, default 0.05 + Upper limit of the false-positive rate of the bloom filter. 
+ """ + + writer: _orc.ORCWriter + is_open: bool + def __init__( + self, + where: StrPath | NativeFile | IO, + *, + file_version: str = "0.12", + batch_size: int = 1024, + stripe_size: int = 64 * 1024 * 1024, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression_block_size: int = 65536, + compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", + row_index_stride: int = 10000, + padding_tolerance: float = 0.0, + dictionary_key_size_threshold: float = 0.0, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float = 0.05, + ): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> None: ... + def write(self, table: Table) -> None: + """ + Write the table into an ORC file. The schema of the table must + be equal to the schema used when opening the ORC file. + + Parameters + ---------- + table : pyarrow.Table + The table to be written into the ORC file + """ + def close(self) -> None: + """ + Close the ORC file + """ + +def read_table( + source: StrPath | NativeFile | IO, + columns: list[str] | None = None, + filesystem: SupportedFileSystem | None = None, +) -> Table: + """ + Read a Table from an ORC file. + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name. For file-like objects, + only read a single file. Use pyarrow.BufferReader to read a file + contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. Output always follows the ordering of the file and + not the `columns` list. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + """ + +def write_table( + table: Table, + where: StrPath | NativeFile | IO, + *, + file_version: str = "0.12", + batch_size: int = 1024, + stripe_size: int = 64 * 1024 * 1024, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression_block_size: int = 65536, + compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", + row_index_stride: int = 10000, + padding_tolerance: float = 0.0, + dictionary_key_size_threshold: float = 0.0, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float = 0.05, +) -> None: + """ + Write a table into an ORC file. + + Parameters + ---------- + table : pyarrow.lib.Table + The table to be written into the ORC file + where : str or pyarrow.io.NativeFile + Writable target. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream + or pyarrow.io.FixedSizeBufferWriter. + file_version : {"0.11", "0.12"}, default "0.12" + Determine which ORC file version to use. + `Hive 0.11 / ORC v0 `_ + is the older version + while `Hive 0.12 / ORC v1 `_ + is the newer one. + batch_size : int, default 1024 + Number of rows the ORC writer writes at a time. + stripe_size : int, default 64 * 1024 * 1024 + Size of each ORC stripe in bytes. + compression : string, default 'uncompressed' + The compression codec. 
+ Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} + Note that LZ0 is currently not supported. + compression_block_size : int, default 64 * 1024 + Size of each compression block in bytes. + compression_strategy : string, default 'speed' + The compression strategy i.e. speed vs size reduction. + Valid values: {'SPEED', 'COMPRESSION'} + row_index_stride : int, default 10000 + The row index stride i.e. the number of rows per + an entry in the row index. + padding_tolerance : double, default 0.0 + The padding tolerance. + dictionary_key_size_threshold : double, default 0.0 + The dictionary key size threshold. 0 to disable dictionary encoding. + 1 to always enable dictionary encoding. + bloom_filter_columns : None, set-like or list-like, default None + Columns that use the bloom filter. + bloom_filter_fpp : double, default 0.05 + Upper limit of the false-positive rate of the bloom filter. + """ diff --git a/python/pyarrow/pandas_compat.pyi b/python/pyarrow/pandas_compat.pyi new file mode 100644 index 00000000000..efbd05ac2fe --- /dev/null +++ b/python/pyarrow/pandas_compat.pyi @@ -0,0 +1,54 @@ +from typing import Any, TypedDict, TypeVar + +import numpy as np +import pandas as pd + +from pandas import DatetimeTZDtype + +from .lib import Array, DataType, Schema, Table + +_T = TypeVar("_T") + +def get_logical_type_map() -> dict[int, str]: ... +def get_logical_type(arrow_type: DataType) -> str: ... +def get_numpy_logical_type_map() -> dict[type[np.generic], str]: ... +def get_logical_type_from_numpy(pandas_collection) -> str: ... +def get_extension_dtype_info(column) -> tuple[str, dict[str, Any]]: ... + +class _ColumnMetadata(TypedDict): + name: str + field_name: str + pandas_type: int + numpy_type: str + metadata: dict | None + +def get_column_metadata( + column: pd.Series | pd.Index, name: str, arrow_type: DataType, field_name: str +) -> _ColumnMetadata: ... +def construct_metadata( + columns_to_convert: list[pd.Series], + df: pd.DataFrame, + column_names: list[str], + index_levels: list[pd.Index], + index_descriptors: list[dict], + preserve_index: bool, + types: list[DataType], + column_field_names: list[str] = ..., +) -> dict[bytes, bytes]: ... +def dataframe_to_types( + df: pd.DataFrame, preserve_index: bool | None, columns: list[str] | None = None +) -> tuple[list[str], list[DataType], dict[bytes, bytes]]: ... +def dataframe_to_arrays( + df: pd.DataFrame, + schema: Schema, + preserve_index: bool | None, + nthreads: int = 1, + columns: list[str] | None = None, + safe: bool = True, +) -> tuple[Array, Schema, int]: ... +def get_datetimetz_type(values: _T, dtype, type_) -> tuple[_T, DataType]: ... +def make_datetimetz(unit: str, tz: str) -> DatetimeTZDtype: ... +def table_to_dataframe( + options, table: Table, categories=None, ignore_metadata: bool = False, types_mapper=None +) -> pd.DataFrame: ... +def make_tz_aware(series: pd.Series, tz: str) -> pd.Series: ... 
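A minimal write/read cycle against the ``pyarrow.orc`` interface stubbed above (a sketch only; the file name ``example.orc`` and the sample column are illustrative, not part of this patch):

.. code-block:: python

    import pyarrow as pa
    from pyarrow import orc

    table = pa.table({"n_legs": [2, 4, 100]})

    # write_table accepts the writer options documented above, e.g. compression="ZSTD".
    orc.write_table(table, "example.orc")

    # Whole-file read, optionally restricted to a subset of columns.
    roundtripped = orc.read_table("example.orc", columns=["n_legs"])

    # Stripe-level access goes through ORCFile; read_stripe returns a RecordBatch.
    f = orc.ORCFile("example.orc")
    first_stripe = f.read_stripe(0)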
diff --git a/python/pyarrow/parquet/__init__.pyi b/python/pyarrow/parquet/__init__.pyi new file mode 100644 index 00000000000..4ef88705809 --- /dev/null +++ b/python/pyarrow/parquet/__init__.pyi @@ -0,0 +1 @@ +from .core import * # noqa diff --git a/python/pyarrow/parquet/core.pyi b/python/pyarrow/parquet/core.pyi new file mode 100644 index 00000000000..56b2c8447d9 --- /dev/null +++ b/python/pyarrow/parquet/core.pyi @@ -0,0 +1,2061 @@ +import sys + +from pathlib import Path + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Callable, Iterator, Literal, Sequence + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from pyarrow import _parquet +from pyarrow._compute import Expression +from pyarrow._fs import FileSystem, SupportedFileSystem +from pyarrow._parquet import ( + ColumnChunkMetaData, + ColumnSchema, + FileDecryptionProperties, + FileEncryptionProperties, + FileMetaData, + ParquetLogicalType, + ParquetReader, + ParquetSchema, + RowGroupMetaData, + SortingColumn, + Statistics, +) +from pyarrow._stubs_typing import FilterTuple, SingleOrList +from pyarrow.dataset import ParquetFileFragment, Partitioning +from pyarrow.lib import NativeFile, RecordBatch, Schema, Table +from typing_extensions import deprecated + +__all__ = ( + "ColumnChunkMetaData", + "ColumnSchema", + "FileDecryptionProperties", + "FileEncryptionProperties", + "FileMetaData", + "ParquetDataset", + "ParquetFile", + "ParquetLogicalType", + "ParquetReader", + "ParquetSchema", + "ParquetWriter", + "RowGroupMetaData", + "SortingColumn", + "Statistics", + "read_metadata", + "read_pandas", + "read_schema", + "read_table", + "write_metadata", + "write_table", + "write_to_dataset", + "_filters_to_expression", + "filters_to_expression", +) + +def filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: + """ + Check if filters are well-formed and convert to an ``Expression``. + + Parameters + ---------- + filters : List[Tuple] or List[List[Tuple]] + + Notes + ----- + See internal ``pyarrow._DNF_filter_doc`` attribute for more details. + + Examples + -------- + + >>> filters_to_expression([("foo", "==", "bar")]) + + + Returns + ------- + pyarrow.compute.Expression + An Expression representing the filters + """ + +@deprecated("use filters_to_expression") +def _filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... + +_Compression: TypeAlias = Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"] + +class ParquetFile: + """ + Reader interface for a single Parquet file. + + Parameters + ---------- + source : str, pathlib.Path, pyarrow.NativeFile, or file-like object + Readable source. For passing bytes or buffer-like file containing a + Parquet file, use pyarrow.BufferReader. + metadata : FileMetaData, default None + Use existing metadata object, rather than reading from file. + common_metadata : FileMetaData, default None + Will be used in reads for pandas schema metadata if not found in the + main file's metadata, no other uses at the moment. + read_dictionary : list + List of column names to read directly as DictionaryArray. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. 
Otherwise IO calls are unbuffered. + pre_buffer : bool, default False + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3). If True, Arrow will use a + background I/O thread pool. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds. + decryption_properties : FileDecryptionProperties, default None + File decryption properties for Parquet Modular Encryption. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Examples + -------- + + Generate an example PyArrow Table and write it to Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + + Create a ``ParquetFile`` object from the Parquet file: + + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read the data: + + >>> parquet_file.read() + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] + + Create a ParquetFile object with "animal" column as DictionaryArray: + + >>> parquet_file = pq.ParquetFile("example.parquet", read_dictionary=["animal"]) + >>> parquet_file.read() + pyarrow.Table + n_legs: int64 + animal: dictionary + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [ -- dictionary: + ["Flamingo","Parrot",...,"Brittle stars","Centipede"] -- indices: + [0,1,2,3,4,5]] + """ + + reader: ParquetReader + common_metadata: FileMetaData + + def __init__( + self, + source: str | Path | NativeFile | IO, + *, + metadata: FileMetaData | None = None, + common_metadata: FileMetaData | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + filesystem: SupportedFileSystem | None = None, + page_checksum_verification: bool = False, + ): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> None: ... + @property + def metadata(self) -> FileMetaData: + """ + Return the Parquet metadata. 
+ """ + @property + def schema(self) -> ParquetSchema: + """ + Return the Parquet schema, unconverted to Arrow types + """ + @property + def schema_arrow(self) -> Schema: + """ + Return the inferred Arrow schema, converted from the whole Parquet + file's schema + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read the Arrow schema: + + >>> parquet_file.schema_arrow + n_legs: int64 + animal: string + """ + @property + def num_row_groups(self) -> int: + """ + Return the number of row groups of the Parquet file. + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.num_row_groups + 1 + """ + def close(self, force: bool = False) -> None: ... + @property + def closed(self) -> bool: ... + def read_row_group( + self, + i: int, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read a single row group from a Parquet file. + + Parameters + ---------- + i : int + Index of the individual row group that we want to read. + columns : list + If not None, only these columns will be read from the row group. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the row group as a table (of columns) + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.read_row_group(0) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] + """ + def read_row_groups( + self, + row_groups: list, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read a multiple row groups from a Parquet file. + + Parameters + ---------- + row_groups : list + Only these row groups will be read from the file. + columns : list + If not None, only these columns will be read from the row group. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the row groups as a table (of columns). 
+ + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.read_row_groups([0, 0]) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,...,2,4,4,5,100]] + animal: [["Flamingo","Parrot","Dog",...,"Brittle stars","Centipede"]] + """ + def iter_batches( + self, + batch_size: int = 65536, + row_groups: list | None = None, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Iterator[RecordBatch]: + """ + Read streaming batches from a Parquet file. + + Parameters + ---------- + batch_size : int, default 64K + Maximum number of records to yield per batch. Batches may be + smaller if there aren't enough rows in the file. + row_groups : list + Only these row groups will be read from the file. + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : boolean, default True + Perform multi-threaded column reads. + use_pandas_metadata : boolean, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Yields + ------ + pyarrow.RecordBatch + Contents of each batch as a record batch + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + >>> for i in parquet_file.iter_batches(): + ... print("RecordBatch") + ... print(i.to_pandas()) + RecordBatch + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + """ + def read( + self, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read a Table from Parquet format. + + Parameters + ---------- + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the file as a table (of columns). + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read a Table: + + >>> parquet_file.read(columns=["animal"]) + pyarrow.Table + animal: string + ---- + animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] + """ + def scan_contents(self, columns: list | None = None, batch_size: int = 65536) -> int: + """ + Read contents of file for the given columns and batch size. + + Notes + ----- + This function's primary purpose is benchmarking. + The scan is executed on a single thread. + + Parameters + ---------- + columns : list of integers, default None + Select columns to read, if None scan all columns. + batch_size : int, default 64K + Number of rows to read at a time internally. + + Returns + ------- + num_rows : int + Number of rows in file + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.scan_contents() + 6 + """ + +class ParquetWriter: + """ + Class for incrementally building a Parquet file for Arrow tables. + + Parameters + ---------- + where : path or file-like object + schema : pyarrow.Schema + version : {"1.0", "2.4", "2.6"}, default "2.6" + Determine which Parquet logical types are available for use, whether the + reduced set from the Parquet 1.x.x format or the expanded logical types + added in later format versions. + Files written with version='2.4' or '2.6' may not be readable in all + Parquet implementations, so version='1.0' is likely the choice that + maximizes file compatibility. + UINT32 and some logical types are only available with version '2.4'. + Nanosecond timestamps are only available with version '2.6'. + Other features such as compression algorithms or the new serialized + data page format must be enabled separately (see 'compression' and + 'data_page_version'). + use_dictionary : bool or list, default True + Specify if we should use dictionary encoding in general or only for + some columns. + When encoding the column, if the dictionary size is too large, the + column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type + doesn't support dictionary encoding. + compression : str or dict, default 'snappy' + Specify the compression codec, either on a general basis or per-column. + Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. + write_statistics : bool or list, default True + Specify if we should write statistics in general (default is True) or only + for some columns. + use_deprecated_int96_timestamps : bool, default None + Write timestamps to INT96 Parquet format. Defaults to False unless enabled + by flavor argument. This take priority over the coerce_timestamps option. + coerce_timestamps : str, default None + Cast timestamps to a particular resolution. If omitted, defaults are chosen + depending on `version`. For ``version='1.0'`` and ``version='2.4'``, + nanoseconds are cast to microseconds ('us'), while for + ``version='2.6'`` (the default), they are written natively without loss + of resolution. Seconds are always cast to milliseconds ('ms') by default, + as Parquet does not have any temporal type with seconds resolution. 
+ If the casting results in loss of data, it will raise an exception + unless ``allow_truncated_timestamps=True`` is given. + Valid values: {None, 'ms', 'us'} + allow_truncated_timestamps : bool, default False + Allow loss of data when coercing timestamps to a particular + resolution. E.g. if microsecond or nanosecond data is lost when coercing to + 'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True`` + will NOT result in the truncation exception being ignored unless + ``coerce_timestamps`` is not None. + data_page_size : int, default None + Set a target threshold for the approximate encoded size of data + pages within a column chunk (in bytes). If None, use the default data page + size of 1MByte. + flavor : {'spark'}, default None + Sanitize schema or set other compatibility options to work with + various target systems. + filesystem : FileSystem, default None + If nothing passed, will be inferred from `where` if path-like, else + `where` is already a file-like object so no filesystem is needed. + compression_level : int or dict, default None + Specify the compression level for a codec, either on a general basis or + per-column. If None is passed, arrow selects the compression level for + the compression codec in use. The compression level has a different + meaning for each codec, so you have to read the documentation of the + codec you are using. + An exception is thrown if the compression codec does not allow specifying + a compression level. + use_byte_stream_split : bool or list, default False + Specify if the byte_stream_split encoding should be used in general or + only for some columns. If both dictionary and byte_stream_stream are + enabled, then dictionary is preferred. + The byte_stream_split encoding is valid for integer, floating-point + and fixed-size binary data types (including decimals); it should be + combined with a compression codec so as to achieve size reduction. + column_encoding : string or dict, default None + Specify the encoding scheme on a per column basis. + Can only be used when ``use_dictionary`` is set to False, and + cannot be used in combination with ``use_byte_stream_split``. + Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', + 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. + Certain encodings are only compatible with certain data types. + Please refer to the encodings section of `Reading and writing Parquet + files `_. + data_page_version : {"1.0", "2.0"}, default "1.0" + The serialized Parquet data page format version to write, defaults to + 1.0. This does not impact the file schema logical types and Arrow to + Parquet type casting behavior; for that use the "version" option. + use_compliant_nested_type : bool, default True + Whether to write compliant Parquet nested type (lists) as defined + `here `_, defaults to ``True``. 
+ For ``use_compliant_nested_type=True``, this will write into a list + with 3-level structure where the middle level, named ``list``, + is a repeated group with a single field named ``element``:: + + group (LIST) { + repeated group list { + element; + } + } + + For ``use_compliant_nested_type=False``, this will also write into a list + with 3-level structure, where the name of the single field of the middle + level ``list`` is taken from the element name for nested columns in Arrow, + which defaults to ``item``:: + + group (LIST) { + repeated group list { + item; + } + } + encryption_properties : FileEncryptionProperties, default None + File encryption properties for Parquet Modular Encryption. + If None, no encryption will be done. + The encryption properties can be created using: + ``CryptoFactory.file_encryption_properties()``. + write_batch_size : int, default None + Number of values to write to a page at a time. If None, use the default of + 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages + are exceeding the ``data_page_size`` due to large column values, lowering + the batch size can help keep page sizes closer to the intended size. + dictionary_pagesize_limit : int, default None + Specify the dictionary page size limit per row group. If None, use the + default 1MB. + store_schema : bool, default True + By default, the Arrow schema is serialized and stored in the Parquet + file metadata (in the "ARROW:schema" key). When reading the file, + if this key is available, it will be used to more faithfully recreate + the original Arrow data. For example, for tz-aware timestamp columns + it will restore the timezone (Parquet only stores the UTC values without + timezone), or columns with duration type will be restored from the int64 + Parquet column. + write_page_index : bool, default False + Whether to write a page index in general for all columns. + Writing statistics to the page index disables the old method of writing + statistics to each data page header. The page index makes statistics-based + filtering more efficient than the page header, as it gathers all the + statistics for a Parquet file in a single place, avoiding scattered I/O. + Note that the page index is not yet used on the read size by PyArrow. + write_page_checksum : bool, default False + Whether to write page checksums in general for all columns. + Page checksums enable detection of data corruption, which might occur during + transmission or in the storage. + sorting_columns : Sequence of SortingColumn, default None + Specify the sort order of the data being written. The writer does not sort + the data nor does it verify that the data is sorted. The sort order is + written to the row group metadata, which can then be used by readers. + store_decimal_as_integer : bool, default False + Allow decimals with 1 <= precision <= 18 to be stored as integers. + In Parquet, DECIMAL can be stored in any of the following physical types: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: precision is limited by the array size. + Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. + - binary: precision is unlimited. The minimum number of bytes to store the + unscaled value is used. + + By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. + When enabled, the writer will use the following physical types to store decimals: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. 
+ - fixed_len_byte_array: for precision > 18. + + As a consequence, decimal columns stored in integer types are more compact. + writer_engine_version : unused + **options : dict + If options contains a key `metadata_collector` then the + corresponding value is assumed to be a list (or any object with + `.append` method) that will be filled with the file metadata instance + of the written file. + + Examples + -------- + Generate an example PyArrow Table and RecordBatch: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.record_batch( + ... [ + ... [2, 2, 4, 4, 5, 100], + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... ], + ... names=["n_legs", "animal"], + ... ) + + create a ParquetWriter object: + + >>> import pyarrow.parquet as pq + >>> writer = pq.ParquetWriter("example.parquet", table.schema) + + and write the Table into the Parquet file: + + >>> writer.write_table(table) + >>> writer.close() + + >>> pq.read_table("example.parquet").to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + create a ParquetWriter object for the RecordBatch: + + >>> writer2 = pq.ParquetWriter("example2.parquet", batch.schema) + + and write the RecordBatch into the Parquet file: + + >>> writer2.write_batch(batch) + >>> writer2.close() + + >>> pq.read_table("example2.parquet").to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + """ + + flavor: str + schema_changed: bool + schema: ParquetSchema + where: str | Path | IO + file_handler: NativeFile | None + writer: _parquet.ParquetWriter + is_open: bool + + def __init__( + self, + where: str | Path | IO | NativeFile, + schema: Schema, + filesystem: SupportedFileSystem | None = None, + flavor: str | None = None, + version: Literal["1.0", "2.4", "2.6"] = ..., + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool | list = False, + column_encoding: str | dict | None = None, + writer_engine_version=None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, + **options, + ) -> None: ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> Literal[False]: ... + def write( + self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None + ) -> None: + """ + Write RecordBatch or Table to the Parquet file. + + Parameters + ---------- + table_or_batch : {RecordBatch, Table} + row_group_size : int, default None + Maximum number of rows in each written row group. If None, + the row group size will be the minimum of the input + table or batch length and 1024 * 1024. + """ + def write_batch(self, batch: RecordBatch, row_group_size: int | None = None) -> None: + """ + Write RecordBatch to the Parquet file. 
+ + Parameters + ---------- + batch : RecordBatch + row_group_size : int, default None + Maximum number of rows in written row group. If None, the + row group size will be the minimum of the RecordBatch + size and 1024 * 1024. If set larger than 64Mi then 64Mi + will be used instead. + """ + def write_table(self, table: Table, row_group_size: int | None = None) -> None: + """ + Write Table to the Parquet file. + + Parameters + ---------- + table : Table + row_group_size : int, default None + Maximum number of rows in each written row group. If None, + the row group size will be the minimum of the Table size + and 1024 * 1024. If set larger than 64Mi then 64Mi will + be used instead. + + """ + def close(self) -> None: + """ + Close the connection to the Parquet file. + """ + def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: + """ + Add key-value metadata to the file. + This will overwrite any existing metadata with the same key. + + Parameters + ---------- + key_value_metadata : dict + Keys and values must be string-like / coercible to bytes. + """ + +class ParquetDataset: + """ + Encapsulates details of reading a complete Parquet dataset possibly + consisting of multiple files and partitions in subdirectories. + + Parameters + ---------- + path_or_paths : str or List[str] + A directory name, single file name, or list of file names. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + schema : pyarrow.parquet.Schema + Optionally provide the Schema for the Dataset, in which case it will + not be inferred from the source. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. + Within-file level filtering and different partitioning schemes are supported. + + Predicates are expressed using an ``Expression`` or using + the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. + DNF allows arbitrary boolean logical combinations of single column predicates. + The innermost tuples each describe a single column predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these filters + as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. + + Each tuple has format: (``key``, ``op``, ``value``) and compares the + ``key`` with the ``value``. + The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, + ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the + ``value`` must be a collection such as a ``list``, a ``set`` or a + ``tuple``. + + Examples: + + Using the ``Expression`` API: + + .. code-block:: python + + import pyarrow.compute as pc + pc.field('x') = 0 + pc.field('y').isin(['a', 'b', 'c']) + ~pc.field('y').isin({'a', 'b'}) + + Using the DNF format: + + .. 
code-block:: python + + ("x", "=", 0) + ("y", "in", ["a", "b", "c"]) + ("z", "not in", {"a", "b"}) + + + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + pre_buffer : bool, default True + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a + background I/O thread pool. If using a filesystem layer that itself + performs readahead (e.g. fsspec's S3FS), disable readahead for best + results. Set to False if you want to prioritize minimal memory usage + over maximum speed. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular resolution + (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 + timestamps will be inferred as timestamps in nanoseconds. + decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. + + Examples + -------- + Generate an example PyArrow Table and write it to a partitioned dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2", partition_cols=["year"]) + + create a ParquetDataset object from the dataset source: + + >>> dataset = pq.ParquetDataset("dataset_v2/") + + and read the data: + + >>> dataset.read().to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + + create a ParquetDataset object with filter: + + >>> dataset = pq.ParquetDataset("dataset_v2/", filters=[("n_legs", "=", 4)]) + >>> dataset.read().to_pandas() + n_legs animal year + 0 4 Dog 2021 + 1 4 Horse 2022 + """ + def __init__( + self, + path_or_paths: SingleOrList[str] + | SingleOrList[Path] + | SingleOrList[NativeFile] + | SingleOrList[IO], + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + *, + filters: Expression | FilterTuple | list[FilterTuple] | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning | None = "hive", + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + ): ... + def equals(self, other: ParquetDataset) -> bool: ... + @property + def schema(self) -> Schema: + """ + Schema of the Dataset. + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_schema", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_schema/") + + Read the schema: + + >>> dataset.schema + n_legs: int64 + animal: string + year: dictionary + """ + def read( + self, + columns: list[str] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read (multiple) Parquet files as a single pyarrow.Table. + + Parameters + ---------- + columns : List[str] + Names of columns to read from the dataset. The partition fields + are not automatically included. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.Table + Content of the file as a table (of columns). + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_read", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_read/") + + Read the dataset: + + >>> dataset.read(columns=["n_legs"]) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[5],[2],[4,100],[2,4]] + """ + def read_pandas(self, **kwargs) -> Table: + """ + Read dataset including pandas metadata, if any. 
Other arguments passed
+        through to :func:`read`, see docstring for further details.
+
+        Parameters
+        ----------
+        **kwargs : optional
+            Additional options for :func:`read`
+
+        Examples
+        --------
+        Generate an example parquet file:
+
+        >>> import pyarrow as pa
+        >>> import pandas as pd
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "year": [2020, 2022, 2021, 2022, 2019, 2021],
+        ...         "n_legs": [2, 2, 4, 4, 5, 100],
+        ...         "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"],
+        ...     }
+        ... )
+        >>> table = pa.Table.from_pandas(df)
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, "table_V2.parquet")
+        >>> dataset = pq.ParquetDataset("table_V2.parquet")
+
+        Read the dataset with pandas metadata:
+
+        >>> dataset.read_pandas(columns=["n_legs"])
+        pyarrow.Table
+        n_legs: int64
+        ----
+        n_legs: [[2,2,4,4,5,100]]
+
+        >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata
+        {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, ...}
+        """
+    @property
+    def fragments(self) -> list[ParquetFileFragment]:
+        """
+        A list of the Dataset source fragments or pieces with absolute
+        file paths.
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table(
+        ...     {
+        ...         "year": [2020, 2022, 2021, 2022, 2019, 2021],
+        ...         "n_legs": [2, 2, 4, 4, 5, 100],
+        ...         "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"],
+        ...     }
+        ... )
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path="dataset_v2_fragments", partition_cols=["year"])
+        >>> dataset = pq.ParquetDataset("dataset_v2_fragments/")
+
+        List the fragments:
+
+        >>> dataset.fragments
+        [<pyarrow.dataset.ParquetFileFragment path=dataset_v2_fragments/...
+        """
+    @property
+    def files(self) -> list[str]:
+        """
+        A list of absolute Parquet file paths in the Dataset source.
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table(
+        ...     {
+        ...         "year": [2020, 2022, 2021, 2022, 2019, 2021],
+        ...         "n_legs": [2, 2, 4, 4, 5, 100],
+        ...         "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"],
+        ...     }
+        ... )
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path="dataset_v2_files", partition_cols=["year"])
+        >>> dataset = pq.ParquetDataset("dataset_v2_files/")
+
+        List the files:
+
+        >>> dataset.files
+        ['dataset_v2_files/year=2019/...-0.parquet', ...
+        """
+    @property
+    def filesystem(self) -> FileSystem:
+        """
+        The filesystem type of the Dataset source.
+        """
+    @property
+    def partitioning(self) -> Partitioning:
+        """
+        The partitioning of the Dataset source, if discovered.
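+
+        For example (a brief sketch, not a verified doctest): for the
+        hive-partitioned ``dataset_v2`` dataset created in the class-level
+        example above, the discovered partitioning exposes the inferred
+        partition schema:
+
+        .. code-block:: python
+
+            import pyarrow.parquet as pq
+
+            dataset = pq.ParquetDataset("dataset_v2/")
+            dataset.partitioning.schema  # e.g. "year" inferred as int32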
+ """ + +def read_table( + source: SingleOrList[str] | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO], + *, + columns: list | None = None, + use_threads: bool = True, + schema: Schema | None = None, + use_pandas_metadata: bool = False, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning | None = "hive", + filesystem: SupportedFileSystem | None = None, + filters: Expression | FilterTuple | list[FilterTuple] | None = None, + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, +) -> Table: + """ + Read a Table from Parquet format + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + use_threads : bool, default True + Perform multi-threaded column reads. + schema : Schema, optional + Optionally provide the Schema for the parquet dataset, in which case it + will not be inferred from the source. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. 
+        Within-file level filtering and different partitioning schemes are supported.
+
+        Predicates are expressed using an ``Expression`` or using
+        the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``.
+        DNF allows arbitrary boolean logical combinations of single column predicates.
+        The innermost tuples each describe a single column predicate. The list of inner
+        predicates is interpreted as a conjunction (AND), forming a more selective,
+        multiple-column predicate. Finally, the outermost list combines these filters
+        as a disjunction (OR).
+
+        Predicates may also be passed as List[Tuple]. This form is interpreted
+        as a single conjunction. To express OR in predicates, one must
+        use the (preferred) List[List[Tuple]] notation.
+
+        Each tuple has format: (``key``, ``op``, ``value``) and compares the
+        ``key`` with the ``value``.
+        The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``,
+        ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the
+        ``value`` must be a collection such as a ``list``, a ``set`` or a
+        ``tuple``.
+
+        Examples:
+
+        Using the ``Expression`` API:
+
+        .. code-block:: python
+
+            import pyarrow.compute as pc
+            pc.field('x') == 0
+            pc.field('y').isin(['a', 'b', 'c'])
+            ~pc.field('y').isin({'a', 'b'})
+
+        Using the DNF format:
+
+        .. code-block:: python
+
+            ("x", "=", 0)
+            ("y", "in", ["a", "b", "c"])
+            ("z", "not in", {"a", "b"})
+
+
+    ignore_prefixes : list, optional
+        Files matching any of these prefixes will be ignored by the
+        discovery process.
+        This is matched to the basename of a path.
+        By default this is ['.', '_'].
+        Note that discovery happens only if a directory is passed as source.
+    pre_buffer : bool, default True
+        Coalesce and issue file reads in parallel to improve performance on
+        high-latency filesystems (e.g. S3). If True, Arrow will use a
+        background I/O thread pool. If using a filesystem layer that itself
+        performs readahead (e.g. fsspec's S3FS), disable readahead for best
+        results.
+    coerce_int96_timestamp_unit : str, default None
+        Cast timestamps that are stored in INT96 format to a particular
+        resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
+        and therefore INT96 timestamps will be inferred as timestamps
+        in nanoseconds.
+    decryption_properties : FileDecryptionProperties or None
+        File-level decryption properties.
+        The decryption properties can be created using
+        ``CryptoFactory.file_decryption_properties()``.
+    thrift_string_size_limit : int, default None
+        If not None, override the maximum total string size allocated
+        when decoding Thrift structures. The default limit should be
+        sufficient for most Parquet files.
+    thrift_container_size_limit : int, default None
+        If not None, override the maximum total size of containers allocated
+        when decoding Thrift structures. The default limit should be
+        sufficient for most Parquet files.
+    page_checksum_verification : bool, default False
+        If True, verify the checksum for each page read from the file.
+
+    Returns
+    -------
+    pyarrow.Table
+        Content of the file as a table (of columns)
+
+
+    Examples
+    --------
+
+    Generate an example PyArrow Table and write it to a partitioned dataset:
+
+    >>> import pyarrow as pa
+    >>> table = pa.table(
+    ...     {
+    ...         "year": [2020, 2022, 2021, 2022, 2019, 2021],
+    ...         "n_legs": [2, 2, 4, 4, 5, 100],
+    ...         "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"],
+    ...     }
+    ...
) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_name_2", partition_cols=["year"]) + + Read the data: + + >>> pq.read_table("dataset_name_2").to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + + + Read only a subset of columns: + + >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"]) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[5],[2],[4,100],[2,4]] + animal: [["Brittle stars"],["Flamingo"],["Dog","Centipede"],["Parrot","Horse"]] + + Read a subset of columns and read one column as DictionaryArray: + + >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"], read_dictionary=["animal"]) + pyarrow.Table + n_legs: int64 + animal: dictionary + ---- + n_legs: [[5],[2],[4,100],[2,4]] + animal: [ -- dictionary: + ["Brittle stars"] -- indices: + [0], -- dictionary: + ["Flamingo"] -- indices: + [0], -- dictionary: + ["Dog","Centipede"] -- indices: + [0,1], -- dictionary: + ["Parrot","Horse"] -- indices: + [0,1]] + + Read the table with filter: + + >>> pq.read_table( + ... "dataset_name_2", columns=["n_legs", "animal"], filters=[("n_legs", "<", 4)] + ... ).to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + + Read data from a single Parquet file: + + >>> pq.write_table(table, "example.parquet") + >>> pq.read_table("dataset_name_2").to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + """ + +def read_pandas( + source: str | Path | NativeFile | IO, columns: list | None = None, **kwargs +) -> Table: + """ + + Read a Table from Parquet format, also reading DataFrame + index values if known in the file metadata + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + use_threads : bool, default True + Perform multi-threaded column reads. + schema : Schema, optional + Optionally provide the Schema for the parquet dataset, in which case it + will not be inferred from the source. + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. 
The default of "hive"
+        assumes directory names with key=value pairs like "/year=2009/month=11".
+        In addition, a scheme like "/2009/11" is also supported, in which case
+        you need to specify the field names or a full schema. See the
+        ``pyarrow.dataset.partitioning()`` function for more details.
+    **kwargs
+        additional options for :func:`read_table`
+    filesystem : FileSystem, default None
+        If nothing passed, will be inferred based on path.
+        Path will try to be found in the local on-disk filesystem otherwise
+        it will be parsed as an URI to determine the filesystem.
+    filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None
+        Rows which do not match the filter predicate will be removed from scanned
+        data. Partition keys embedded in a nested directory structure will be
+        exploited to avoid loading files at all if they contain no matching rows.
+        Within-file level filtering and different partitioning schemes are supported.
+
+        Predicates are expressed using an ``Expression`` or using
+        the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``.
+        DNF allows arbitrary boolean logical combinations of single column predicates.
+        The innermost tuples each describe a single column predicate. The list of inner
+        predicates is interpreted as a conjunction (AND), forming a more selective,
+        multiple-column predicate. Finally, the outermost list combines these filters
+        as a disjunction (OR).
+
+        Predicates may also be passed as List[Tuple]. This form is interpreted
+        as a single conjunction. To express OR in predicates, one must
+        use the (preferred) List[List[Tuple]] notation.
+
+        Each tuple has format: (``key``, ``op``, ``value``) and compares the
+        ``key`` with the ``value``.
+        The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``,
+        ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the
+        ``value`` must be a collection such as a ``list``, a ``set`` or a
+        ``tuple``.
+
+        Examples:
+
+        Using the ``Expression`` API:
+
+        .. code-block:: python
+
+            import pyarrow.compute as pc
+            pc.field('x') == 0
+            pc.field('y').isin(['a', 'b', 'c'])
+            ~pc.field('y').isin({'a', 'b'})
+
+        Using the DNF format:
+
+        .. code-block:: python
+
+            ("x", "=", 0)
+            ("y", "in", ["a", "b", "c"])
+            ("z", "not in", {"a", "b"})
+
+
+    ignore_prefixes : list, optional
+        Files matching any of these prefixes will be ignored by the
+        discovery process.
+        This is matched to the basename of a path.
+        By default this is ['.', '_'].
+        Note that discovery happens only if a directory is passed as source.
+    pre_buffer : bool, default True
+        Coalesce and issue file reads in parallel to improve performance on
+        high-latency filesystems (e.g. S3). If True, Arrow will use a
+        background I/O thread pool. If using a filesystem layer that itself
+        performs readahead (e.g. fsspec's S3FS), disable readahead for best
+        results.
+    coerce_int96_timestamp_unit : str, default None
+        Cast timestamps that are stored in INT96 format to a particular
+        resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
+        and therefore INT96 timestamps will be inferred as timestamps
+        in nanoseconds.
+    decryption_properties : FileDecryptionProperties or None
+        File-level decryption properties.
+        The decryption properties can be created using
+        ``CryptoFactory.file_decryption_properties()``.
+    thrift_string_size_limit : int, default None
+        If not None, override the maximum total string size allocated
+        when decoding Thrift structures.
The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Returns + ------- + pyarrow.Table + Content of the file as a Table of Columns, including DataFrame + indexes as columns + """ + +def write_table( + table: Table, + where: str | Path | NativeFile | IO, + row_group_size: int | None = None, + version: Literal["1.0", "2.4", "2.6"] = "2.6", + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + coerce_timestamps: str | None = None, + allow_truncated_timestamps: bool = False, + data_page_size: int | None = None, + flavor: str | None = None, + filesystem: SupportedFileSystem | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool = False, + column_encoding: str | dict | None = None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, + **kwargs, +) -> None: + """ + + Write a Table to Parquet format. + + Parameters + ---------- + table : pyarrow.Table + where : string or pyarrow.NativeFile + row_group_size : int + Maximum number of rows in each written row group. If None, the + row group size will be the minimum of the Table size and + 1024 * 1024. + version : {"1.0", "2.4", "2.6"}, default "2.6" + Determine which Parquet logical types are available for use, whether the + reduced set from the Parquet 1.x.x format or the expanded logical types + added in later format versions. + Files written with version='2.4' or '2.6' may not be readable in all + Parquet implementations, so version='1.0' is likely the choice that + maximizes file compatibility. + UINT32 and some logical types are only available with version '2.4'. + Nanosecond timestamps are only available with version '2.6'. + Other features such as compression algorithms or the new serialized + data page format must be enabled separately (see 'compression' and + 'data_page_version'). + use_dictionary : bool or list, default True + Specify if we should use dictionary encoding in general or only for + some columns. + When encoding the column, if the dictionary size is too large, the + column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type + doesn't support dictionary encoding. + compression : str or dict, default 'snappy' + Specify the compression codec, either on a general basis or per-column. + Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. + write_statistics : bool or list, default True + Specify if we should write statistics in general (default is True) or only + for some columns. + use_deprecated_int96_timestamps : bool, default None + Write timestamps to INT96 Parquet format. Defaults to False unless enabled + by flavor argument. This take priority over the coerce_timestamps option. 
+    coerce_timestamps : str, default None
+        Cast timestamps to a particular resolution. If omitted, defaults are chosen
+        depending on `version`. For ``version='1.0'`` and ``version='2.4'``,
+        nanoseconds are cast to microseconds ('us'), while for
+        ``version='2.6'`` (the default), they are written natively without loss
+        of resolution. Seconds are always cast to milliseconds ('ms') by default,
+        as Parquet does not have any temporal type with seconds resolution.
+        If the casting results in loss of data, it will raise an exception
+        unless ``allow_truncated_timestamps=True`` is given.
+        Valid values: {None, 'ms', 'us'}
+    allow_truncated_timestamps : bool, default False
+        Allow loss of data when coercing timestamps to a particular
+        resolution. E.g. if microsecond or nanosecond data is lost when coercing to
+        'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True``
+        will NOT result in the truncation exception being ignored unless
+        ``coerce_timestamps`` is not None.
+    data_page_size : int, default None
+        Set a target threshold for the approximate encoded size of data
+        pages within a column chunk (in bytes). If None, use the default data page
+        size of 1MByte.
+    flavor : {'spark'}, default None
+        Sanitize schema or set other compatibility options to work with
+        various target systems.
+    filesystem : FileSystem, default None
+        If nothing passed, will be inferred from `where` if path-like, else
+        `where` is already a file-like object so no filesystem is needed.
+    compression_level : int or dict, default None
+        Specify the compression level for a codec, either on a general basis or
+        per-column. If None is passed, arrow selects the compression level for
+        the compression codec in use. The compression level has a different
+        meaning for each codec, so you have to read the documentation of the
+        codec you are using.
+        An exception is thrown if the compression codec does not allow specifying
+        a compression level.
+    use_byte_stream_split : bool or list, default False
+        Specify if the byte_stream_split encoding should be used in general or
+        only for some columns. If both dictionary and byte_stream_split are
+        enabled, then dictionary is preferred.
+        The byte_stream_split encoding is valid for integer, floating-point
+        and fixed-size binary data types (including decimals); it should be
+        combined with a compression codec so as to achieve size reduction.
+    column_encoding : string or dict, default None
+        Specify the encoding scheme on a per column basis.
+        Can only be used when ``use_dictionary`` is set to False, and
+        cannot be used in combination with ``use_byte_stream_split``.
+        Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT',
+        'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}.
+        Certain encodings are only compatible with certain data types.
+        Please refer to the encodings section of `Reading and writing Parquet
+        files <https://arrow.apache.org/docs/cpp/parquet.html#encodings>`_.
+    data_page_version : {"1.0", "2.0"}, default "1.0"
+        The serialized Parquet data page format version to write, defaults to
+        1.0. This does not impact the file schema logical types and Arrow to
+        Parquet type casting behavior; for that use the "version" option.
+    use_compliant_nested_type : bool, default True
+        Whether to write compliant Parquet nested type (lists) as defined
+        `here <https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types>`_, defaults to ``True``.
+ For ``use_compliant_nested_type=True``, this will write into a list + with 3-level structure where the middle level, named ``list``, + is a repeated group with a single field named ``element``:: + + group (LIST) { + repeated group list { + element; + } + } + + For ``use_compliant_nested_type=False``, this will also write into a list + with 3-level structure, where the name of the single field of the middle + level ``list`` is taken from the element name for nested columns in Arrow, + which defaults to ``item``:: + + group (LIST) { + repeated group list { + item; + } + } + encryption_properties : FileEncryptionProperties, default None + File encryption properties for Parquet Modular Encryption. + If None, no encryption will be done. + The encryption properties can be created using: + ``CryptoFactory.file_encryption_properties()``. + write_batch_size : int, default None + Number of values to write to a page at a time. If None, use the default of + 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages + are exceeding the ``data_page_size`` due to large column values, lowering + the batch size can help keep page sizes closer to the intended size. + dictionary_pagesize_limit : int, default None + Specify the dictionary page size limit per row group. If None, use the + default 1MB. + store_schema : bool, default True + By default, the Arrow schema is serialized and stored in the Parquet + file metadata (in the "ARROW:schema" key). When reading the file, + if this key is available, it will be used to more faithfully recreate + the original Arrow data. For example, for tz-aware timestamp columns + it will restore the timezone (Parquet only stores the UTC values without + timezone), or columns with duration type will be restored from the int64 + Parquet column. + write_page_index : bool, default False + Whether to write a page index in general for all columns. + Writing statistics to the page index disables the old method of writing + statistics to each data page header. The page index makes statistics-based + filtering more efficient than the page header, as it gathers all the + statistics for a Parquet file in a single place, avoiding scattered I/O. + Note that the page index is not yet used on the read size by PyArrow. + write_page_checksum : bool, default False + Whether to write page checksums in general for all columns. + Page checksums enable detection of data corruption, which might occur during + transmission or in the storage. + sorting_columns : Sequence of SortingColumn, default None + Specify the sort order of the data being written. The writer does not sort + the data nor does it verify that the data is sorted. The sort order is + written to the row group metadata, which can then be used by readers. + store_decimal_as_integer : bool, default False + Allow decimals with 1 <= precision <= 18 to be stored as integers. + In Parquet, DECIMAL can be stored in any of the following physical types: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: precision is limited by the array size. + Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. + - binary: precision is unlimited. The minimum number of bytes to store the + unscaled value is used. + + By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. + When enabled, the writer will use the following physical types to store decimals: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. 
+ - fixed_len_byte_array: for precision > 18. + + As a consequence, decimal columns stored in integer types are more compact. + + **kwargs : optional + Additional options for ParquetWriter + + Examples + -------- + Generate an example PyArrow Table: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + and write the Table into Parquet file: + + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + + Defining row group size for the Parquet file: + + >>> pq.write_table(table, "example.parquet", row_group_size=3) + + Defining row group compression (default is Snappy): + + >>> pq.write_table(table, "example.parquet", compression="none") + + Defining row group compression and encoding per-column: + + >>> pq.write_table( + ... table, + ... "example.parquet", + ... compression={"n_legs": "snappy", "animal": "gzip"}, + ... use_dictionary=["n_legs", "animal"], + ... ) + + Defining column encoding per-column: + + >>> pq.write_table( + ... table, "example.parquet", column_encoding={"animal": "PLAIN"}, use_dictionary=False + ... ) + """ + +def write_to_dataset( + table: Table, + root_path: str | Path, + partition_cols: list[str] | None = None, + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + partitioning: Partitioning | list[str] | None = None, + basename_template: str | None = None, + use_threads: bool | None = None, + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] + | None = None, + **kwargs, +) -> None: + """ + Wrapper around dataset.write_dataset for writing a Table to + Parquet format by partitions. + For each combination of partition columns and values, + a subdirectories are created in the following + manner: + + root_dir/ + group1=value1 + group2=value1 + .parquet + group2=value2 + .parquet + group1=valueN + group2=value1 + .parquet + group2=valueN + .parquet + + Parameters + ---------- + table : pyarrow.Table + root_path : str, pathlib.Path + The root directory of the dataset. + partition_cols : list, + Column names by which to partition the dataset. + Columns are partitioned in the order they are given. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + schema : Schema, optional + This Schema of the dataset. + partitioning : Partitioning or list[str], optional + The partitioning scheme specified with the + ``pyarrow.dataset.partitioning()`` function or a list of field names. + When providing a list of field names, you can use + ``partitioning_flavor`` to drive which partitioning type should be + used. + basename_template : str, optional + A template string used to generate basenames of written data files. + The token '{i}' will be replaced with an automatically incremented + integer. If not specified, it defaults to "guid-{i}.parquet". + use_threads : bool, default True + Write files in parallel. If enabled, then maximum parallelism will be + used determined by the number of available CPU cores. + file_visitor : function + If set, this function will be called with a WrittenFile instance + for each file created during the call. This object will have both + a path attribute and a metadata attribute. 
+ + The path attribute will be a string containing the path to + the created file. + + The metadata attribute will be the parquet metadata of the file. + This metadata will have the file path attribute set and can be used + to build a _metadata file. The metadata attribute will be None if + the format is not parquet. + + Example visitor which simple collects the filenames created:: + + visited_paths = [] + + def file_visitor(written_file): + visited_paths.append(written_file.path) + + existing_data_behavior : 'overwrite_or_ignore' | 'error' | 'delete_matching' + Controls how the dataset will handle data that already exists in + the destination. The default behaviour is 'overwrite_or_ignore'. + + 'overwrite_or_ignore' will ignore any existing data and will + overwrite files with the same name as an output file. Other + existing files will be ignored. This behavior, in combination + with a unique basename_template for each write, will allow for + an append workflow. + + 'error' will raise an error if any data exists in the destination. + + 'delete_matching' is useful when you are writing a partitioned + dataset. The first time each partition directory is encountered + the entire directory will be deleted. This allows you to overwrite + old partitions completely. + **kwargs : dict, + Used as additional kwargs for :func:`pyarrow.dataset.write_dataset` + function for matching kwargs, and remainder to + :func:`pyarrow.dataset.ParquetFileFormat.make_write_options`. + See the docstring of :func:`write_table` and + :func:`pyarrow.dataset.write_dataset` for the available options. + Using `metadata_collector` in kwargs allows one to collect the + file metadata instances of dataset pieces. The file paths in the + ColumnChunkMetaData will be set relative to `root_path`. + + Examples + -------- + Generate an example PyArrow Table: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + and write it to a partitioned dataset: + + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_name_3", partition_cols=["year"]) + >>> pq.ParquetDataset("dataset_name_3").files + ['dataset_name_3/year=2019/...-0.parquet', ... + + Write a single Parquet file into the root folder: + + >>> pq.write_to_dataset(table, root_path="dataset_name_4") + >>> pq.ParquetDataset("dataset_name_4/").files + ['dataset_name_4/...-0.parquet'] + """ + +def write_metadata( + schema: Schema, + where: str | NativeFile, + metadata_collector: list[FileMetaData] | None = None, + filesystem: SupportedFileSystem | None = None, + **kwargs, +) -> None: + """ + Write metadata-only Parquet file from schema. This can be used with + `write_to_dataset` to generate `_common_metadata` and `_metadata` sidecar + files. + + Parameters + ---------- + schema : pyarrow.Schema + where : string or pyarrow.NativeFile + metadata_collector : list + where to collect metadata information. + filesystem : FileSystem, default None + If nothing passed, will be inferred from `where` if path-like, else + `where` is already a file-like object so no filesystem is needed. + **kwargs : dict, + Additional kwargs for ParquetWriter class. See docstring for + `ParquetWriter` for more information. + + Examples + -------- + Generate example data: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... 
"animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Write a dataset and collect metadata information. + + >>> metadata_collector = [] + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, "dataset_metadata", metadata_collector=metadata_collector) + + Write the `_common_metadata` parquet file without row groups statistics. + + >>> pq.write_metadata(table.schema, "dataset_metadata/_common_metadata") + + Write the `_metadata` parquet file with row groups statistics. + + >>> pq.write_metadata( + ... table.schema, "dataset_metadata/_metadata", metadata_collector=metadata_collector + ... ) + """ + +def read_metadata( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | None = None, +) -> FileMetaData: + """ + Read FileMetaData from footer of a single Parquet file. + + Parameters + ---------- + where : str (file path) or file-like object + memory_map : bool, default False + Create memory map when the source is a file path. + decryption_properties : FileDecryptionProperties, default None + Decryption properties for reading encrypted Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + + Returns + ------- + metadata : FileMetaData + The metadata of the Parquet file + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) + >>> pq.write_table(table, "example.parquet") + + >>> pq.read_metadata("example.parquet") + + created_by: parquet-cpp-arrow version ... + num_columns: 2 + num_rows: 3 + num_row_groups: 1 + format_version: 2.6 + serialized_size: ... + """ + +def read_schema( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | None = None, +) -> Schema: + """ + Read effective Arrow schema from Parquet file metadata. + + Parameters + ---------- + where : str (file path) or file-like object + memory_map : bool, default False + Create memory map when the source is a file path. + decryption_properties : FileDecryptionProperties, default None + Decryption properties for reading encrypted Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. 
+ + Returns + ------- + schema : pyarrow.Schema + The schema of the Parquet file + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) + >>> pq.write_table(table, "example.parquet") + + >>> pq.read_schema("example.parquet") + n_legs: int64 + animal: string + """ diff --git a/python/pyarrow/parquet/encryption.pyi b/python/pyarrow/parquet/encryption.pyi new file mode 100644 index 00000000000..5a77dae7ef7 --- /dev/null +++ b/python/pyarrow/parquet/encryption.pyi @@ -0,0 +1,15 @@ +from pyarrow._parquet_encryption import ( + CryptoFactory, + DecryptionConfiguration, + EncryptionConfiguration, + KmsClient, + KmsConnectionConfig, +) + +__all__ = [ + "CryptoFactory", + "DecryptionConfiguration", + "EncryptionConfiguration", + "KmsClient", + "KmsConnectionConfig", +] diff --git a/python/pyarrow/substrait.pyi b/python/pyarrow/substrait.pyi new file mode 100644 index 00000000000..a56a8a5b40f --- /dev/null +++ b/python/pyarrow/substrait.pyi @@ -0,0 +1,21 @@ +from pyarrow._substrait import ( + BoundExpressions, + SubstraitSchema, + deserialize_expressions, + deserialize_schema, + get_supported_functions, + run_query, + serialize_expressions, + serialize_schema, +) + +__all__ = [ + "BoundExpressions", + "get_supported_functions", + "run_query", + "deserialize_expressions", + "serialize_expressions", + "deserialize_schema", + "serialize_schema", + "SubstraitSchema", +] diff --git a/python/pyarrow/types.pyi b/python/pyarrow/types.pyi new file mode 100644 index 00000000000..0cb4f6171d3 --- /dev/null +++ b/python/pyarrow/types.pyi @@ -0,0 +1,194 @@ +import sys + +from typing import Any + +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from pyarrow.lib import ( + BinaryType, + BinaryViewType, + BoolType, + DataType, + Date32Type, + Date64Type, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + DenseUnionType, + DictionaryType, + DurationType, + FixedSizeBinaryType, + FixedSizeListType, + Float16Type, + Float32Type, + Float64Type, + Int8Type, + Int16Type, + Int32Type, + Int64Type, + LargeBinaryType, + LargeListType, + LargeListViewType, + LargeStringType, + ListType, + ListViewType, + MapType, + MonthDayNanoIntervalType, + NullType, + RunEndEncodedType, + SparseUnionType, + StringType, + StringViewType, + StructType, + Time32Type, + Time64Type, + TimestampType, + UInt8Type, + UInt16Type, + Uint32Type, + UInt64Type, +) + +_SignedInteger: TypeAlias = Int8Type | Int16Type | Int32Type | Int64Type +_UnsignedInteger: TypeAlias = UInt8Type | UInt16Type | Uint32Type | UInt64Type +_Integer: TypeAlias = _SignedInteger | _UnsignedInteger +_Floating: TypeAlias = Float16Type | Float32Type | Float64Type +_Decimal: TypeAlias = ( + Decimal32Type[Any, Any] + | Decimal64Type[Any, Any] + | Decimal128Type[Any, Any] + | Decimal256Type[Any, Any] +) +_Date: TypeAlias = Date32Type | Date64Type +_Time: TypeAlias = Time32Type[Any] | Time64Type[Any] +_Interval: TypeAlias = MonthDayNanoIntervalType +_Temporal: TypeAlias = TimestampType[Any, Any] | DurationType[Any] | _Time | _Date | _Interval +_Union: TypeAlias = SparseUnionType | DenseUnionType +_Nested: TypeAlias = ( + ListType[Any] + | FixedSizeListType[Any, Any] + | LargeListType[Any] + | ListViewType[Any] + | LargeListViewType[Any] + | StructType + | MapType[Any, 
Any, Any] + | _Union +) + +def is_null(t: DataType) -> TypeIs[NullType]: ... +def is_boolean(t: DataType) -> TypeIs[BoolType]: ... +def is_integer(t: DataType) -> TypeIs[_Integer]: ... +def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: ... +def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: ... +def is_int8(t: DataType) -> TypeIs[Int8Type]: ... +def is_int16(t: DataType) -> TypeIs[Int16Type]: ... +def is_int32(t: DataType) -> TypeIs[Int32Type]: ... +def is_int64(t: DataType) -> TypeIs[Int64Type]: ... +def is_uint8(t: DataType) -> TypeIs[UInt8Type]: ... +def is_uint16(t: DataType) -> TypeIs[UInt16Type]: ... +def is_uint32(t: DataType) -> TypeIs[Uint32Type]: ... +def is_uint64(t: DataType) -> TypeIs[UInt64Type]: ... +def is_floating(t: DataType) -> TypeIs[_Floating]: ... +def is_float16(t: DataType) -> TypeIs[Float16Type]: ... +def is_float32(t: DataType) -> TypeIs[Float32Type]: ... +def is_float64(t: DataType) -> TypeIs[Float64Type]: ... +def is_list(t: DataType) -> TypeIs[ListType[Any]]: ... +def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: ... +def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: ... +def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: ... +def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: ... +def is_struct(t: DataType) -> TypeIs[StructType]: ... +def is_union(t: DataType) -> TypeIs[_Union]: ... +def is_nested(t: DataType) -> TypeIs[_Nested]: ... +def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: ... +def is_temporal(t: DataType) -> TypeIs[_Temporal]: ... +def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: ... +def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: ... +def is_time(t: DataType) -> TypeIs[_Time]: ... +def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: ... +def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: ... +def is_binary(t: DataType) -> TypeIs[BinaryType]: ... +def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: ... +def is_unicode(t: DataType) -> TypeIs[StringType]: ... +def is_string(t: DataType) -> TypeIs[StringType]: ... +def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: ... +def is_large_string(t: DataType) -> TypeIs[LargeStringType]: ... +def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: ... +def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: ... +def is_string_view(t: DataType) -> TypeIs[StringViewType]: ... +def is_date(t: DataType) -> TypeIs[_Date]: ... +def is_date32(t: DataType) -> TypeIs[Date32Type]: ... +def is_date64(t: DataType) -> TypeIs[Date64Type]: ... +def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: ... +def is_decimal(t: DataType) -> TypeIs[_Decimal]: ... +def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: ... +def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: ... +def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: ... +def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: ... +def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: ... +def is_interval(t: DataType) -> TypeIs[_Interval]: ... +def is_primitive(t: DataType) -> bool: ... 
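+
+# Illustrative usage (a sketch, not part of the stub surface): because the
+# predicates above are annotated with ``TypeIs``, a type checker narrows the
+# ``DataType`` argument after a successful check, e.g.::
+#
+#     import pyarrow as pa
+#     from pyarrow import types
+#
+#     def list_value_type(t: pa.DataType) -> pa.DataType:
+#         if types.is_list(t):
+#             return t.value_type  # ``t`` is narrowed to ``ListType[Any]`` here
+#         return t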
+ +__all__ = [ + "is_binary", + "is_binary_view", + "is_boolean", + "is_date", + "is_date32", + "is_date64", + "is_decimal", + "is_decimal128", + "is_decimal256", + "is_decimal32", + "is_decimal64", + "is_dictionary", + "is_duration", + "is_fixed_size_binary", + "is_fixed_size_list", + "is_float16", + "is_float32", + "is_float64", + "is_floating", + "is_int16", + "is_int32", + "is_int64", + "is_int8", + "is_integer", + "is_interval", + "is_large_binary", + "is_large_list", + "is_large_list_view", + "is_large_string", + "is_large_unicode", + "is_list", + "is_list_view", + "is_map", + "is_nested", + "is_null", + "is_primitive", + "is_run_end_encoded", + "is_signed_integer", + "is_string", + "is_string_view", + "is_struct", + "is_temporal", + "is_time", + "is_time32", + "is_time64", + "is_timestamp", + "is_uint16", + "is_uint32", + "is_uint64", + "is_uint8", + "is_unicode", + "is_union", + "is_unsigned_integer", +] diff --git a/python/pyarrow/util.pyi b/python/pyarrow/util.pyi new file mode 100644 index 00000000000..c2ecf7d6b61 --- /dev/null +++ b/python/pyarrow/util.pyi @@ -0,0 +1,27 @@ +from collections.abc import Callable +from os import PathLike +from typing import Any, Protocol, Sequence, TypeVar + +_F = TypeVar("_F", bound=Callable) +_N = TypeVar("_N") + +class _DocStringComponents(Protocol): + _docstring_components: list[str] + +def doc( + *docstrings: str | _DocStringComponents | Callable | None, **params: Any +) -> Callable[[_F], _F]: ... +def _is_iterable(obj) -> bool: ... +def _is_path_like(path) -> bool: ... +def _stringify_path(path: str | PathLike) -> str: ... +def product(seq: Sequence[_N]) -> _N: ... +def get_contiguous_span( + shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int +) -> tuple[int, int]: ... +def find_free_port() -> int: ... +def guid() -> str: ... +def _download_urllib(url, out_path) -> None: ... +def _download_requests(url, out_path) -> None: ... +def download_tzdata_on_windows() -> None: ... +def _deprecate_api(old_name, new_name, api, next_version, type=...): ... +def _deprecate_class(old_name, new_class, next_version, instancecheck=True): ... 
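+
+# Illustrative usage (a sketch, not part of the stub surface): ``doc`` is typed
+# to return the decorated callable unchanged, so the wrapped signature is
+# preserved for type checkers::
+#
+#     from pyarrow.util import doc
+#
+#     @doc("Add one to *x*.")
+#     def add_one(x: int) -> int:
+#         return x + 1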
From 881f4b9c2c6acaf6d1bd0bf5deb08a7f960b4add Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 19 Aug 2025 13:14:57 +0200 Subject: [PATCH 02/26] Fix test_compute.py --- python/pyarrow/tests/test_compute.py | 323 +++++++++++++++------------ python/pyproject.toml | 13 ++ 2 files changed, 199 insertions(+), 137 deletions(-) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 5441dd493d3..7820111b70f 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -28,31 +28,17 @@ import sys import textwrap -try: - import numpy as np -except ImportError: - np = None - -try: - import pandas as pd -except ImportError: - pd = None import pyarrow as pa import pyarrow.compute as pc -from pyarrow.lib import ArrowNotImplementedError - -try: - import pyarrow.substrait as pas -except ImportError: - pas = None +from pyarrow.lib import ArrowNotImplementedError, ArrowTypeError exported_functions = [ - func for (name, func) in sorted(pc.__dict__.items()) + func for (_, func) in sorted(pc.__dict__.items()) if hasattr(func, '__arrow_compute_function__')] exported_option_classes = [ - cls for (name, cls) in sorted(pc.__dict__.items()) + cls for (_, cls) in sorted(pc.__dict__.items()) if (isinstance(cls, type) and cls is not pc.FunctionOptions and issubclass(cls, pc.FunctionOptions))] @@ -217,7 +203,7 @@ def test_option_class_equality(request): and cls != pc.AssumeTimezoneOptions ): try: - options.append(cls()) + options.append(cls()) # type: ignore[reportArgumentType] except TypeError: pytest.fail(f"Options class is not tested: {cls}") @@ -276,6 +262,8 @@ def test_get_function_hash_aggregate(): @pytest.mark.numpy def test_call_function_with_memory_pool(): + import numpy as np + arr = pa.array(["foo", "bar", "baz"]) indices = np.array([2, 2, 1]) result1 = arr.take(indices) @@ -799,11 +787,11 @@ def test_min_max(): # Wrong options type options = pc.TakeOptions() with pytest.raises(TypeError): - s = pc.min_max(data, options=options) + s = pc.min_max(data, options=options) # type: ignore[reportCallIssue] # Missing argument with pytest.raises(TypeError, match="min_max takes 1 positional"): - s = pc.min_max() + s = pc.min_max() # type: ignore[reportCallIssue] def test_any(): @@ -854,11 +842,11 @@ def test_all(): def test_is_valid(): # An example generated function wrapper without options - data = [4, 5, None] + data = pa.array([4, 5, None]) assert pc.is_valid(data).to_pylist() == [True, True, False] with pytest.raises(TypeError): - pc.is_valid(data, options=None) + pc.is_valid(data, options=None) # type: ignore[no-matching-overload] def test_generated_docstrings(): @@ -1037,21 +1025,6 @@ def find_new_unicode_codepoints(): 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c, 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7f8, 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, } -# utf8proc does not store if a codepoint is numeric -numeric_info_missing = { - 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, - 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, - 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70, - 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341, - 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2, - 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a, - 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10, - 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e, - 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621, - 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973, - 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5, - 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, - 0x10fcb, } # 
utf8proc has no no digit/numeric information digit_info_missing = { 0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c, @@ -1070,6 +1043,7 @@ def find_new_unicode_codepoints(): 0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41, 0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63, 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68, } +# utf8proc does not store if a codepoint is numeric numeric_info_missing = { 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, @@ -1104,7 +1078,8 @@ def test_string_py_compat_boolean(function_name, variant): py_name = function_name.replace('_', '') ignore = codepoints_ignore.get(function_name, set()) | \ find_new_unicode_codepoints() - for i in range(128 if ascii else 0x11000): + for i in range(128 if ascii # type: ignore[reportUnnecessaryComparison] + else 0x11000): if i in range(0xD800, 0xE000): continue # bug? pyarrow doesn't allow utf16 surrogates # the issues we know of, we skip @@ -1170,6 +1145,8 @@ def test_utf8_zfill(): @pytest.mark.pandas def test_replace_slice(): + import numpy as np + offsets = range(-3, 4) arr = pa.array([None, '', 'a', 'ab', 'abc', 'abcd', 'abcde']) @@ -1246,6 +1223,7 @@ def test_binary_join(): expected = pa.array(['a1b', 'c2d'], type=pa.binary()) ar_list = pa.array([['a', 'b'], ['c', 'd']], type=pa.list_(pa.binary())) assert pc.binary_join(ar_list, separator_array).equals(expected) + assert expected.equals(pc.binary_join(ar_list, separator_array)) def test_binary_join_element_wise(): @@ -1309,7 +1287,8 @@ def test_take_indices_types(): for indices_type in ['uint8', 'int8', 'uint16', 'int16', 'uint32', 'int32', 'uint64', 'int64']: - indices = pa.array([0, 4, 2, None], type=indices_type) + indices = pa.array( + [0, 4, 2, None], type=indices_type) # type: ignore[reportArgumentType] result = arr.take(indices) result.validate() expected = pa.array([0, 4, 2, None]) @@ -1318,7 +1297,7 @@ def test_take_indices_types(): for indices_type in [pa.float32(), pa.float64()]: indices = pa.array([0, 4, 2], type=indices_type) with pytest.raises(NotImplementedError): - arr.take(indices) + arr.take(indices) # type: ignore[reportArgumentType] def test_take_on_chunked_array(): @@ -1486,6 +1465,8 @@ def test_filter(ty, values): @pytest.mark.numpy @pytest.mark.parametrize(('ty', 'values'), all_array_types) def test_filter_numpy_array_mask(ty, values): + import numpy as np + arr = pa.array(values, type=ty) # same test as test_filter with different array type mask = np.array([True, False, False, True, None]) @@ -1562,7 +1543,7 @@ def test_filter_errors(): # non-boolean dtype mask = pa.array([0, 1, 0, 1, 0]) with pytest.raises(NotImplementedError): - obj.filter(mask) + obj.filter(mask) # type: ignore[reportArgumentType] # wrong length mask = pa.array([True, False, True]) @@ -1573,7 +1554,7 @@ def test_filter_errors(): scalar = pa.scalar(True) for filt in [batch, table, scalar]: with pytest.raises(TypeError): - table.filter(filt) + table.filter(filt) # type: ignore[reportArgumentType] def test_filter_null_type(): @@ -1592,11 +1573,10 @@ def test_filter_null_type(): @pytest.mark.parametrize("typ", ["array", "chunked_array"]) def test_compare_array(typ): - if typ == "array": - def con(values): + def con(values): + if typ == "array": return pa.array(values) - else: - def con(values): + else: return pa.chunked_array([values]) arr1 = con([1, 2, 3, 4, None]) @@ -1623,11 +1603,10 @@ def con(values): @pytest.mark.parametrize("typ", ["array", "chunked_array"]) def test_compare_string_scalar(typ): - if typ == "array": - def 
con(values): + def con(values): + if typ == "array": return pa.array(values) - else: - def con(values): + else: return pa.chunked_array([values]) arr = con(['a', 'b', 'c', None]) @@ -1660,11 +1639,10 @@ def con(values): @pytest.mark.parametrize("typ", ["array", "chunked_array"]) def test_compare_scalar(typ): - if typ == "array": - def con(values): + def con(values): + if typ == "array": return pa.array(values) - else: - def con(values): + else: return pa.chunked_array([values]) arr = con([1, 2, 3, None]) @@ -1757,14 +1735,17 @@ def test_round_to_integer(ty): "half_to_odd": [3, 3, 4, 5, -3, -3, -4, None], } for round_mode, expected in rmode_and_expected.items(): - options = RoundOptions(round_mode=round_mode) - result = round(values, options=options) + options = RoundOptions( # type: ignore[reportPossiblyUnboundVariable] + round_mode=round_mode) # type: ignore[reportArgumentType] + result = round(values, options=options) # type: ignore[reportArgumentType] expected_array = pa.array(expected, type=pa.float64()) assert expected_array.equals(result) @pytest.mark.numpy def test_round(): + import numpy as np + values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None] ndigits_and_expected = { -2: [300, 0, 0, 0, -0, -0, -0, None], @@ -1784,6 +1765,8 @@ def test_round(): @pytest.mark.numpy def test_round_to_multiple(): + import numpy as np + values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None] multiple_and_expected = { 0.05: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3.05, None], @@ -1807,7 +1790,8 @@ def test_round_to_multiple(): for multiple in [object, 99999999999999999999999]: with pytest.raises(TypeError, match="is not a valid multiple type"): - pc.round_to_multiple(values, multiple=multiple) + pc.round_to_multiple( + values, multiple=multiple) # type: ignore[reportArgumentType] def test_round_binary(): @@ -1878,7 +1862,7 @@ def test_fill_null(): fill_value = pa.array([5], type=pa.int8()) with pytest.raises(pa.ArrowInvalid, match="Array arguments must all be the same length"): - arr.fill_null(fill_value) + arr.fill_null(fill_value) # type: ignore[reportArgumentType] arr = pa.array([None, None, None, None], type=pa.null()) fill_value = pa.scalar(None, type=pa.null()) @@ -2075,14 +2059,16 @@ def test_fsl_to_fsl_cast(value_type): # Different sized FSL cast_type = pa.list_(pa.field("element", value_type), 3) err_msg = 'Size of FixedSizeList is not the same.' 
- with pytest.raises(pa.lib.ArrowTypeError, match=err_msg): + with pytest.raises(ArrowTypeError, match=err_msg): fsl.cast(cast_type) DecimalTypeTraits = namedtuple('DecimalTypeTraits', + # type: ignore[reportUntypedNamedTuple] ('name', 'factory', 'max_precision')) FloatToDecimalCase = namedtuple('FloatToDecimalCase', + # type: ignore[reportUntypedNamedTuple] ('precision', 'scale', 'float_val')) decimal_type_traits = [DecimalTypeTraits('decimal32', pa.decimal32, 9), @@ -2095,6 +2081,8 @@ def largest_scaled_float_not_above(val, scale): """ Find the largest float f such as `f * 10**scale <= val` """ + import numpy as np + assert val >= 0 assert scale >= 0 float_val = float(val) / 10**scale @@ -2155,7 +2143,7 @@ def random_float_to_decimal_cast_cases(float_ty, max_precision): r = random.Random(42) for precision in range(1, max_precision, 6): for scale in range(0, precision, 4): - for i in range(20): + for _ in range(20): unscaled = r.randrange(0, 10**precision) float_val = scaled_float(unscaled, scale) assert float_val * 10**scale < 10**precision @@ -2212,6 +2200,8 @@ def test_cast_float_to_decimal_random(float_ty, decimal_traits): """ Test float-to-decimal conversion against exactly generated values. """ + import numpy as np + r = random.Random(43) np_float_ty = { pa.float32(): np.float32, @@ -2252,10 +2242,13 @@ def test_cast_float_to_decimal_random(float_ty, decimal_traits): float_exp = (-mantissa_bits + math.floor(math.log2(10**(precision - scale)))) assert float_exp_min <= float_exp <= float_exp_max - for i in range(5): + for _ in range(5): mantissa = r.randrange(0, 2**mantissa_bits) - float_val = np.ldexp(np_float_ty(mantissa), float_exp) - assert isinstance(float_val, np_float_ty) + float_val = np.ldexp( + np_float_ty(mantissa), float_exp + ) + assert isinstance( + float_val, np_float_ty) # type: ignore[reportArgumentType] # Make sure we compute the exact expected value and # round by half-to-even when converting to the expected precision. 
if float_exp >= 0: @@ -2301,6 +2294,8 @@ def test_strptime(): @pytest.mark.pandas @pytest.mark.timezone_data def test_strftime(): + import pandas as pd + times = ["2018-03-10 09:00", "2038-01-31 12:23", None] timezones = ["CET", "UTC", "Europe/Ljubljana"] @@ -2311,7 +2306,8 @@ def test_strftime(): formats.extend(["%c", "%x", "%X"]) for timezone in timezones: - ts = pd.to_datetime(times).tz_localize(timezone) + ts = pd.to_datetime(times # type: ignore[reportArgumentType] + ).tz_localize(timezone) for unit in ["s", "ms", "us", "ns"]: tsa = pa.array(ts, type=pa.timestamp(unit, timezone)) for fmt in formats: @@ -2358,7 +2354,7 @@ def test_strftime(): # Test timestamps without timezone fmt = "%Y-%m-%dT%H:%M:%S" - ts = pd.to_datetime(times) + ts = pd.to_datetime(times) # type: ignore[reportArgumentType] tsa = pa.array(ts, type=pa.timestamp("s")) result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt)) expected = pa.array(ts.strftime(fmt)).cast(result.type) @@ -2377,6 +2373,7 @@ def test_strftime(): def _check_datetime_components(timestamps, timezone=None): from pyarrow.vendored.version import Version + import pandas as pd ts = pd.to_datetime(timestamps).tz_localize( "UTC").tz_convert(timezone).to_series() @@ -2392,9 +2389,15 @@ def _check_datetime_components(timestamps, timezone=None): if Version(pd.__version__) < Version("1.1.0"): # https://github.com/pandas-dev/pandas/issues/33206 - iso_year = ts.map(lambda x: x.isocalendar()[0]).astype("int64") - iso_week = ts.map(lambda x: x.isocalendar()[1]).astype("int64") - iso_day = ts.map(lambda x: x.isocalendar()[2]).astype("int64") + iso_year = ts.map( + lambda x: x.isocalendar()[0] # type: ignore[reportUnknownLambdaType] + ).astype("int64") + iso_week = ts.map( + lambda x: x.isocalendar()[1] # type: ignore[reportUnknownLambdaType] + ).astype("int64") + iso_day = ts.map( + lambda x: x.isocalendar()[2] # type: ignore[reportUnknownLambdaType] + ).astype("int64") else: # Casting is required because pandas isocalendar returns int32 # while arrow isocalendar returns int64. 
@@ -2444,7 +2447,8 @@ def _check_datetime_components(timestamps, timezone=None): # datetime with utc returns None for dst() is_dst = [False] * len(ts) else: - is_dst = ts.apply(lambda x: x.dst().seconds > 0) + is_dst = ts.apply( + lambda x: x.dst().seconds > 0) # type: ignore[reportUnknownLambdaType] assert pc.is_dst(tsa).equals(pa.array(is_dst)) day_of_week_options = pc.DayOfWeekOptions( @@ -2505,6 +2509,9 @@ def test_iso_calendar_longer_array(unit): @pytest.mark.pandas @pytest.mark.timezone_data def test_assume_timezone(): + import numpy as np + import pandas as pd + ts_type = pa.timestamp("ns") timestamps = pd.to_datetime(["1970-01-01T00:00:59.123456789", "2000-02-29T23:23:23.999999999", @@ -2529,9 +2536,9 @@ def test_assume_timezone(): ambiguous_array = pa.array(ambiguous, type=ts_type) nonexistent_array = pa.array(nonexistent, type=ts_type) + ta = pa.array(timestamps, type=ts_type) for timezone in ["UTC", "America/Chicago", "Asia/Kolkata"]: options = pc.AssumeTimezoneOptions(timezone) - ta = pa.array(timestamps, type=ts_type) expected = timestamps.tz_localize(timezone) result = pc.assume_timezone(ta, options=options) assert result.equals(pa.array(expected)) @@ -2540,7 +2547,8 @@ def test_assume_timezone(): ta_zoned = pa.array(timestamps, type=pa.timestamp("ns", timezone)) with pytest.raises(pa.ArrowInvalid, match="already have a timezone:"): - pc.assume_timezone(ta_zoned, options=options) + pc.assume_timezone( + ta_zoned, options=options) # type: ignore[reportArgumentType] invalid_options = pc.AssumeTimezoneOptions("Europe/Brusselsss") with pytest.raises(ValueError, match="not found in timezone database"): @@ -2583,18 +2591,22 @@ def test_assume_timezone(): f"timezone '{timezone}'"): pc.assume_timezone(ambiguous_array, options=options_ambiguous_raise) - expected = ambiguous.tz_localize(timezone, ambiguous=[True, True, True]) + expected = ambiguous.tz_localize(timezone, ambiguous=np.array([True, True, True])) result = pc.assume_timezone( ambiguous_array, options=options_ambiguous_earliest) result.equals(pa.array(expected)) - expected = ambiguous.tz_localize(timezone, ambiguous=[False, False, False]) + expected = ambiguous.tz_localize( + timezone, ambiguous=np.array([False, False, False])) result = pc.assume_timezone( ambiguous_array, options=options_ambiguous_latest) result.equals(pa.array(expected)) def _check_temporal_rounding(ts, values, unit): + import numpy as np + import pandas as pd + unit_shorthand = { "nanosecond": "ns", "microsecond": "us", @@ -2638,7 +2650,7 @@ def _check_temporal_rounding(ts, values, unit): value, unit, calendar_based_origin=True) origin = ts.dt.floor(greater_unit[unit]) - if ta.type.tz is None: + if not hasattr(ta.type, "tz"): result = pc.ceil_temporal(ta, options=options).to_pandas() expected = (ts - origin).dt.ceil(frequency) + origin np.testing.assert_array_equal(result, expected) @@ -2669,16 +2681,20 @@ def _check_temporal_rounding(ts, values, unit): # to regular ceiled timestamp if it is equal to the original timestamp. # This does not work if timestamp is zoned since our logic will not # account for DST jumps. 
- if ta.type.tz is None: + if not hasattr(ta.type, "tz"): options = pc.RoundTemporalOptions( - value, unit, ceil_is_strictly_greater=True) + value, # type: ignore[reportPossiblyUnboundVariable] + ceil_is_strictly_greater=True, + unit=unit) # type: ignore[reportPossiblyUnboundVariable] result = pc.ceil_temporal(ta, options=options) - expected = ts.dt.ceil(frequency) + expected = ts.dt.ceil(frequency) # type: ignore[reportPossiblyUnboundVariable] expected = np.where( expected == ts, - expected + pd.Timedelta(value, unit_shorthand[unit]), - expected) + expected + pd.Timedelta( + value, # type: ignore[reportPossiblyUnboundVariable] + unit=unit_shorthand[unit]), expected # type: ignore[reportArgumentType] + ) np.testing.assert_array_equal(result, expected) # Check RoundTemporalOptions defaults @@ -2703,8 +2719,10 @@ def _check_temporal_rounding(ts, values, unit): "second", "minute", "hour", "day")) @pytest.mark.pandas def test_round_temporal(unit): + import pandas as pd + values = (1, 2, 3, 4, 5, 6, 7, 10, 15, 24, 60, 250, 500, 750) - timestamps = [ + timestamps = pd.Series([ "1923-07-07 08:52:35.203790336", "1931-03-17 10:45:00.641559040", "1932-06-16 01:16:42.911994368", @@ -2717,7 +2735,7 @@ def test_round_temporal(unit): "1982-01-21 18:43:44.517366784", "1992-01-01 00:00:00.100000000", "1999-12-04 05:55:34.794991104", - "2026-10-26 08:39:00.316686848"] + "2026-10-26 08:39:00.316686848"]) ts = pd.Series([pd.Timestamp(x, unit="ns") for x in timestamps]) _check_temporal_rounding(ts, values, unit) @@ -2739,7 +2757,7 @@ def test_count(): with pytest.raises(ValueError, match='"something else" is not a valid count mode'): - pc.count(arr, 'something else') + pc.count(arr, 'something else') # type: ignore[invalid-argument-type] def test_index(): @@ -2789,7 +2807,7 @@ def test_partition_nth(): with pytest.raises( ValueError, match="'partition_nth_indices' cannot be called without options"): - pc.partition_nth_indices(data) + pc.partition_nth_indices(data) # type: ignore[no-matching-overload] def test_partition_nth_null_placement(): @@ -2816,10 +2834,13 @@ def validate_select_k(select_k_indices, arr, order, stable_sort=False): arr = pa.array([1, 2, None, 0]) for k in [0, 2, 4]: - for order in ["descending", "ascending"]: - result = pc.select_k_unstable( - arr, k=k, sort_keys=[("dummy", order)]) - validate_select_k(result, arr, order) + result = pc.select_k_unstable( + arr, k=k, sort_keys=[("dummy", "ascending")]) + validate_select_k(result, arr, "ascending") + + result = pc.select_k_unstable( + arr, k=k, sort_keys=[("dummy", "descending")]) + validate_select_k(result, arr, "descending") result = pc.top_k_unstable(arr, k=k) validate_select_k(result, arr, "descending") @@ -2876,7 +2897,7 @@ def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False): with pytest.raises( ValueError, match="'select_k_unstable' cannot be called without options"): - pc.select_k_unstable(table) + pc.select_k_unstable(table) # type: ignore[no-matching-overload] with pytest.raises(ValueError, match="select_k_unstable requires a nonnegative `k`"): @@ -2885,14 +2906,19 @@ def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False): with pytest.raises(ValueError, match="select_k_unstable requires a " "non-empty `sort_keys`"): - pc.select_k_unstable(table, k=2, sort_keys=[]) + pc.select_k_unstable(table, sort_keys=[], + k=2 # type: ignore[reportPossiblyUnboundVariable] + ) with pytest.raises(ValueError, match="not a valid sort order"): - pc.select_k_unstable(table, k=k, sort_keys=[("a", 
"nonscending")]) + pc.select_k_unstable( + table, k=k, # type: ignore[reportPossiblyUnboundVariable] + sort_keys=[("a", "nonscending")]) # type: ignore[reportArgumentType] with pytest.raises(ValueError, match="Invalid sort key column: No match for.*unknown"): - pc.select_k_unstable(table, k=k, sort_keys=[("unknown", "ascending")]) + pc.select_k_unstable(table, k=k, # type: ignore[reportPossiblyUnboundVariable] + sort_keys=[("unknown", "ascending")]) def test_array_sort_indices(): @@ -2911,7 +2937,9 @@ def test_array_sort_indices(): assert result.to_pylist() == [2, 1, 0, 3] with pytest.raises(ValueError, match="not a valid sort order"): - pc.array_sort_indices(arr, order="nonscending") + pc.array_sort_indices(arr, + order="nonscending" # type: ignore[reportArgumentType] + ) def test_sort_indices_array(): @@ -2967,14 +2995,19 @@ def test_sort_indices_table(): assert result.to_pylist() == [2, 1, 0, 3] with pytest.raises(ValueError, match="Must specify one or more sort keys"): - pc.sort_indices(table) + pc.sort_indices(table) # type: ignore[reportArgumentType] with pytest.raises(ValueError, match="Invalid sort key column: No match for.*unknown"): - pc.sort_indices(table, sort_keys=[("unknown", "ascending")]) + pc.sort_indices( + table, + sort_keys=[("unknown", "ascending")] # type: ignore[reportArgumentType] + ) with pytest.raises(ValueError, match="not a valid sort order"): - pc.sort_indices(table, sort_keys=[("a", "nonscending")]) + pc.sort_indices( + table, sort_keys=[("a", "nonscending")] # type: ignore[reportArgumentType] + ) def test_is_in(): @@ -3052,9 +3085,9 @@ def test_quantile(): assert result.to_pylist() == [1.25, 1.5, 1.75] with pytest.raises(ValueError, match="Quantile must be between 0 and 1"): - pc.quantile(arr, q=1.1) + pc.quantile(arr, q=1.1) # type: ignore[invalid-argument-type] with pytest.raises(ValueError, match="not a valid quantile interpolation"): - pc.quantile(arr, interpolation='zzz') + pc.quantile(arr, interpolation='zzz') # type: ignore[invalid-argument-type] def test_tdigest(): @@ -3120,6 +3153,8 @@ def test_min_max_element_wise(): @pytest.mark.parametrize('start', (1.25, 10.5, -10.5)) @pytest.mark.parametrize('skip_nulls', (True, False)) def test_cumulative_sum(start, skip_nulls): + import numpy as np + # Exact tests (e.g., integral types) start_int = int(start) starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), @@ -3168,13 +3203,15 @@ def test_cumulative_sum(start, skip_nulls): for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - pc.cumulative_sum([1, 2, 3], start=strt) + pc.cumulative_sum([1, 2, 3], start=strt) # type: ignore[reportArgumentType] @pytest.mark.numpy @pytest.mark.parametrize('start', (1.25, 10.5, -10.5)) @pytest.mark.parametrize('skip_nulls', (True, False)) def test_cumulative_prod(start, skip_nulls): + import numpy as np + # Exact tests (e.g., integral types) start_int = int(start) starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), @@ -3223,13 +3260,17 @@ def test_cumulative_prod(start, skip_nulls): for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - pc.cumulative_prod([1, 2, 3], start=strt) + pc.cumulative_prod( + [1, 2, 3], start=strt # type: ignore[reportArgumentType] + ) @pytest.mark.numpy @pytest.mark.parametrize('start', (0.5, 3.5, 6.5)) @pytest.mark.parametrize('skip_nulls', (True, False)) def test_cumulative_max(start, skip_nulls): + import numpy as np + # Exact tests (e.g., integral types) start_int = int(start) starts = [None, start_int, 
pa.scalar(start_int, type=pa.int8()), @@ -3281,13 +3322,15 @@ def test_cumulative_max(start, skip_nulls): for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - pc.cumulative_max([1, 2, 3], start=strt) + pc.cumulative_max([1, 2, 3], start=strt) # type: ignore[reportArgumentType] @pytest.mark.numpy @pytest.mark.parametrize('start', (0.5, 3.5, 6.5)) @pytest.mark.parametrize('skip_nulls', (True, False)) def test_cumulative_min(start, skip_nulls): + import numpy as np + # Exact tests (e.g., integral types) start_int = int(start) starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), @@ -3335,11 +3378,12 @@ def test_cumulative_min(start, skip_nulls): expected_arrays[i], strt if strt is not None else 1e9, skip_nulls=False) np.testing.assert_array_almost_equal(result.to_numpy( + # type: ignore[reportAttributeAccessIssue] zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - pc.cumulative_max([1, 2, 3], start=strt) + pc.cumulative_max([1, 2, 3], start=strt) # type: ignore[reportArgumentType] def test_make_struct(): @@ -3431,12 +3475,12 @@ def test_list_element(): lists = pa.array([l1, l2], list_type) index = 1 - result = pa.compute.list_element(lists, index) + result = pc.list_element(lists, index) expected = pa.array([None, {'a': 0.52, 'b': 3}], element_type) assert result.equals(expected) index = 4 - result = pa.compute.list_element(lists, index) + result = pc.list_element(lists, index) expected = pa.array([{'a': 5.6, 'b': 6}, {'a': .6, 'b': 8}], element_type) assert result.equals(expected) @@ -3475,7 +3519,7 @@ def test_random(): pa.array([], type=pa.float64()) # System random initialization => outputs all distinct - arrays = [tuple(pc.random(100).to_pylist()) for i in range(10)] + arrays = [tuple(pc.random(100).to_pylist()) for _ in range(10)] assert len(set(arrays)) == len(arrays) arrays = [tuple(pc.random(100, initializer=i % 7).to_pylist()) @@ -3484,15 +3528,14 @@ def test_random(): # Arbitrary hashable objects can be given as initializer initializers = [object(), (4, 5, 6), "foo"] - initializers.extend(os.urandom(10) for i in range(10)) - arrays = [tuple(pc.random(100, initializer=i).to_pylist()) - for i in initializers] + initializers.extend(os.urandom(10) for _ in range(10)) + arrays = [tuple(pc.random(100, initializer=i).to_pylist()) for i in initializers] assert len(set(arrays)) == len(arrays) with pytest.raises(TypeError, match=r"initializer should be 'system', an integer, " r"or a hashable object; got \[\]"): - pc.random(100, initializer=[]) + pc.random(100, initializer=[]) # type: ignore[invalid-argument-type] @pytest.mark.parametrize( @@ -3542,7 +3585,7 @@ def test_rank_options(): match=r'"NonExisting" is not a valid tiebreaker'): pc.RankOptions(sort_keys="descending", null_placement="at_end", - tiebreaker="NonExisting") + tiebreaker="NonExisting") # type: ignore[invalid-argument-type] def test_rank_quantile_options(): @@ -3572,7 +3615,7 @@ def test_rank_quantile_options(): assert result.equals(expected_descending) with pytest.raises(ValueError, match="not a valid sort order"): - pc.rank_quantile(arr, sort_keys="XXX") + pc.rank_quantile(arr, sort_keys="XXX") # type: ignore[reportArgumentType] def test_rank_normal_options(): @@ -3600,6 +3643,8 @@ def test_rank_normal_options(): def create_sample_expressions(): + import numpy as np + # We need a schema for substrait conversion schema = pa.schema([pa.field("i64", pa.int64()), pa.field( "foo", 
pa.struct([pa.field("bar", pa.string())]))]) @@ -3614,7 +3659,7 @@ def create_sample_expressions(): e = pc.scalar(None) f = pc.scalar({'a': 1}) g = pc.scalar(pa.scalar(1)) - h = pc.scalar(np.int64(2)) + h = pc.scalar(np.int64(2)) # type: ignore[reportOptionalMemberAccess] j = pc.scalar(False) k = pc.scalar(0) @@ -3689,20 +3734,22 @@ def test_expression_serialization_arrow(pickle_module): def test_expression_serialization_substrait(): exprs = create_sample_expressions() - schema = exprs["schema"] + schema = pa.schema(exprs["schema"]) # type: ignore[reportAttributeAccessIssue] # Basic literals don't change on binding and so they will round # trip without any change - for expr in exprs["literals"]: - serialized = expr.to_substrait(schema) + for expr in exprs["literals"]: # type: ignore[reportAttributeAccessIssue] + serialized = \ + expr.to_substrait(schema) # type: ignore[reportAttributeAccessIssue] deserialized = pc.Expression.from_substrait(serialized) - assert expr.equals(deserialized) + assert expr.equals(deserialized) # type: ignore[reportAttributeAccessIssue] # Expressions are bound when they get serialized. Since bound # expressions are not equal to their unbound variants we cannot # compare the round tripped with the original - for expr in exprs["calls"]: - serialized = expr.to_substrait(schema) + for expr in exprs["calls"]: # type: ignore[reportAttributeAccessIssue] + serialized = \ + expr.to_substrait(schema) # type: ignore[reportAttributeAccessIssue] deserialized = pc.Expression.from_substrait(serialized) # We can't compare the expressions themselves because of the bound # unbound difference. But we can compare the string representation @@ -3712,7 +3759,8 @@ def test_expression_serialization_substrait(): assert deserialized.equals(deserialized_again) for expr, expr_norm in zip(exprs["refs"], exprs["numeric_refs"]): - serialized = expr.to_substrait(schema) + serialized = \ + expr.to_substrait(schema) # type: ignore[reportAttributeAccessIssue] deserialized = pc.Expression.from_substrait(serialized) assert str(deserialized) == str(expr_norm) serialized_again = deserialized.to_substrait(schema) @@ -3722,15 +3770,16 @@ def test_expression_serialization_substrait(): # For the special cases we get various wrinkles in serialization but we # should always get the same thing from round tripping twice for expr in exprs["special"]: - serialized = expr.to_substrait(schema) + serialized = \ + expr.to_substrait(schema) # type: ignore[reportAttributeAccessIssue] deserialized = pc.Expression.from_substrait(serialized) serialized_again = deserialized.to_substrait(schema) deserialized_again = pc.Expression.from_substrait(serialized_again) assert deserialized.equals(deserialized_again) # Special case, we lose the field names of struct literals - f = exprs["special"][0] - serialized = f.to_substrait(schema) + f = exprs["special"][0] # type: ignore[reportAttributeAccessIssue] + serialized = f.to_substrait(schema) # type: ignore[reportAttributeAccessIssue] deserialized = pc.Expression.from_substrait(serialized) assert deserialized.equals(pc.scalar({'': 1})) @@ -3758,10 +3807,10 @@ def test_expression_construction(): nested_field = pc.field(("nested", "field")) nested_field2 = pc.field("nested", "field") - zero | one == string - ~true == false + _ = zero | one == string + _ = ~true == false for typ in ("bool", pa.bool_()): - field.cast(typ) == true + _ = field.cast(typ) == true field.isin([1, 2]) nested_mixed_types.isin(["foo", "bar"]) @@ -3769,10 +3818,10 @@ def test_expression_construction(): 
nested_field2.isin(["foo", "bar"]) with pytest.raises(TypeError): - field.isin(1) + field.isin(1) # type: ignore[invalid-argument-type] with pytest.raises(pa.ArrowInvalid): - field != object() + _ = field != object() def test_expression_boolean_operators(): @@ -3781,16 +3830,16 @@ def test_expression_boolean_operators(): false = pc.scalar(False) with pytest.raises(ValueError, match="cannot be evaluated to python True"): - true and false + _ = true and false with pytest.raises(ValueError, match="cannot be evaluated to python True"): - true or false + _ = true or false with pytest.raises(ValueError, match="cannot be evaluated to python True"): bool(true) with pytest.raises(ValueError, match="cannot be evaluated to python True"): - not true + _ = not true def test_expression_call_function(): @@ -3812,14 +3861,14 @@ def test_expression_call_function(): # Invalid pc.scalar input gives original error message msg = "only other expressions allowed as arguments" with pytest.raises(TypeError, match=msg): - pc.add(field, object) + pc.add(field, object) # type: ignore[reportArgumentType] def test_cast_table_raises(): table = pa.table({'a': [1, 2]}) - with pytest.raises(pa.lib.ArrowTypeError): - pc.cast(table, pa.int64()) + with pytest.raises(ArrowTypeError): + pc.cast(table, pa.int64()) # type: ignore[reportArgumentType] @pytest.mark.parametrize("start,stop,expected", ( @@ -3966,31 +4015,31 @@ def test_run_end_encode(value_type, option): def test_pairwise_diff(): arr = pa.array([1, 2, 3, None, 4, 5]) expected = pa.array([None, 1, 1, None, None, 1]) - result = pa.compute.pairwise_diff(arr, period=1) + result = pc.pairwise_diff(arr, period=1) assert result.equals(expected) arr = pa.array([1, 2, 3, None, 4, 5]) expected = pa.array([None, None, 2, None, 1, None]) - result = pa.compute.pairwise_diff(arr, period=2) + result = pc.pairwise_diff(arr, period=2) assert result.equals(expected) # negative period arr = pa.array([1, 2, 3, None, 4, 5], type=pa.int8()) expected = pa.array([-1, -1, None, None, -1, None], type=pa.int8()) - result = pa.compute.pairwise_diff(arr, period=-1) + result = pc.pairwise_diff(arr, period=-1) assert result.equals(expected) # wrap around overflow arr = pa.array([1, 2, 3, None, 4, 5], type=pa.uint8()) expected = pa.array([255, 255, None, None, 255, None], type=pa.uint8()) - result = pa.compute.pairwise_diff(arr, period=-1) + result = pc.pairwise_diff(arr, period=-1) assert result.equals(expected) # fail on overflow arr = pa.array([1, 2, 3, None, 4, 5], type=pa.uint8()) with pytest.raises(pa.ArrowInvalid, match="overflow"): - pa.compute.pairwise_diff_checked(arr, period=-1) + pc.pairwise_diff_checked(arr, period=-1) def test_pivot_wider(): diff --git a/python/pyproject.toml b/python/pyproject.toml index fac3b25c554..598ddf7a75b 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -91,3 +91,16 @@ version_file = 'pyarrow/_generated_version.py' version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '22.0.0a0' + +[tool.pyright] +typeCheckingMode = "strict" +reportMissingImports = false +reportPrivateUsage = false +reportUnknownParameterType = false +reportMissingTypeArgument = false +reportMissingParameterType = false +reportMissingTypeStubs = false +reportUnknownVariableType = false +reportUnknownArgumentType = false +reportUnknownMemberType = false +include = ["pyarrow/tests/test_compute.py"] From 0fd9ee16d4c8a1ca9b064106e617f6d764c3690e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: 
Thu, 21 Aug 2025 19:39:56 +0200 Subject: [PATCH 03/26] Fix pyarrow-stubs --- python/pyarrow/__lib_pxi/array.pyi | 19 +- python/pyarrow/__lib_pxi/scalar.pyi | 28 +- python/pyarrow/__lib_pxi/table.pyi | 18 +- python/pyarrow/__lib_pxi/types.pyi | 8 +- python/pyarrow/_compute.pyi | 40 +- python/pyarrow/compute.pyi | 807 +++++++++++++++++++++++----- 6 files changed, 758 insertions(+), 162 deletions(-) diff --git a/python/pyarrow/__lib_pxi/array.pyi b/python/pyarrow/__lib_pxi/array.pyi index ec1cda30a88..9283f57b69f 100644 --- a/python/pyarrow/__lib_pxi/array.pyi +++ b/python/pyarrow/__lib_pxi/array.pyi @@ -14,6 +14,7 @@ from typing import ( Iterable, Iterator, Literal, + LiteralString, TypeVar, overload, ) @@ -49,6 +50,7 @@ from .types import ( DataType, Field, MapType, + ListType, _AsPyType, _BasicDataType, _BasicValueT, @@ -1944,7 +1946,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): or if each element should be on its own line. """ format = to_string - def equals(self, other: Self) -> bool: ... + def equals(self, other: Self | Iterable[Any]) -> bool: ... def __len__(self) -> int: ... def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: """ @@ -1972,7 +1974,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): Return BooleanArray indicating the non-null values. """ def fill_null( - self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType + self: Array[Scalar[_BasicDataType[_AsPyType]]] | Array[Scalar[_DataTypeT]], fill_value: Scalar[_DataTypeT] | _AsPyType | str | None ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: """ See :func:`pyarrow.compute.fill_null` for usage. @@ -2078,7 +2080,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): @overload def index( self: Array[Scalar[_BasicDataType[_AsPyType]]], - value: _AsPyType, + value: _AsPyType | None, start: int | None = None, end: int | None = None, *, @@ -2152,7 +2154,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): array : numpy.ndarray """ def to_pylist( - self: Array[Scalar[_BasicDataType[_AsPyType]]], + self: Array[Scalar[_BasicDataType[_AsPyType]]] | Array[Scalar[ListType[Any]]] | StructArray | DictionaryArray[Unknown, Unknown], *, map_as_pydicts: Literal["lossy", "strict"] | None = None, ) -> list[_AsPyType | None]: @@ -3209,7 +3211,7 @@ class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _S cls, values: Array[Scalar[_DataTypeT]], *, - type: None = None, + type: types.FixedSizeListType[_DataTypeT, Literal[int]] | None = None, mask: Mask | None = None, ) -> FixedSizeListArray[_DataTypeT, None]: ... 
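As a quick illustration (not part of the patch hunks) of the runtime calls the relaxed Array.fill_null and Array.to_pylist annotations above are meant to cover, plain Python values are accepted alongside pyarrow scalars:

    import pyarrow as pa

    arr = pa.array([1, None, 3], type=pa.int64())

    # fill_null takes either a plain Python value or a scalar of the same type
    assert arr.fill_null(0).to_pylist() == [1, 0, 3]
    assert arr.fill_null(pa.scalar(0, type=pa.int64())).to_pylist() == [1, 0, 3]

    # to_pylist round-trips nulls as None
    assert arr.to_pylist() == [1, None, 3]
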
@overload @@ -3661,7 +3663,7 @@ class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): @staticmethod def from_arrays( indices: Indices, - dictionary: Array | np.ndarray | pd.Series, + dictionary: Array | np.ndarray | pd.Series | list[Any], mask: np.ndarray | pd.Series | BooleanArray | None = None, ordered: bool = False, from_pandas: bool = False, @@ -3724,8 +3726,8 @@ class StructArray(Array[scalar.StructScalar]): """ @staticmethod def from_arrays( - arrays: Iterable[Array], - names: list[str] | None = None, + arrays: Iterable[Array] | list[list[Any]], + names: list[str] | list[LiteralString] | None = None, fields: list[Field] | None = None, mask=None, memory_pool: MemoryPool | None = None, @@ -4217,6 +4219,7 @@ __all__ = [ "repeat", "infer_type", "_PandasConvertible", + "_CastAs", "Array", "NullArray", "BooleanArray", diff --git a/python/pyarrow/__lib_pxi/scalar.pyi b/python/pyarrow/__lib_pxi/scalar.pyi index 81ab5012067..77368bb264b 100644 --- a/python/pyarrow/__lib_pxi/scalar.pyi +++ b/python/pyarrow/__lib_pxi/scalar.pyi @@ -297,6 +297,7 @@ class StructScalar(Scalar[types.StructType], collections.abc.Mapping[str, Scalar def __iter__(self) -> Iterator[str]: ... def __getitem__(self, __key: str) -> Scalar[Any]: ... # type: ignore[override] def _as_py_tuple(self) -> list[tuple[str, Any]]: ... + def tolist(self) -> list[Any]: ... class MapScalar(Scalar[types.MapType[types._K, types._ValueT]]): @property @@ -573,7 +574,7 @@ def scalar( @overload def scalar( value: Any, - type: types.BoolType, + type: types.BoolType | Literal["bool"], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -581,7 +582,7 @@ def scalar( @overload def scalar( value: Any, - type: types.UInt8Type, + type: types.UInt8Type | Literal["uint8"], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -589,7 +590,7 @@ def scalar( @overload def scalar( value: Any, - type: types.Int8Type, + type: types.Int8Type | Literal["int8"], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -597,7 +598,7 @@ def scalar( @overload def scalar( value: Any, - type: types.UInt16Type, + type: types.UInt16Type | Literal["uint16"], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -605,7 +606,7 @@ def scalar( @overload def scalar( value: Any, - type: types.Int16Type, + type: types.Int16Type | Literal["int16"], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -613,7 +614,7 @@ def scalar( @overload def scalar( value: Any, - type: types.Uint32Type, + type: types.Uint32Type | Literal["uint32"], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -621,7 +622,7 @@ def scalar( @overload def scalar( value: Any, - type: types.Int32Type, + type: types.Int32Type | Literal["int32"], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -629,7 +630,7 @@ def scalar( @overload def scalar( value: Any, - type: types.UInt64Type, + type: types.UInt64Type | Literal["uint64"], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -637,7 +638,7 @@ def scalar( @overload def scalar( value: Any, - type: types.Int64Type, + type: types.Int64Type | Literal["int64"], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -645,7 +646,7 @@ def scalar( @overload def scalar( value: Any, - type: types.Float16Type, + type: types.Float16Type | Literal["f16"], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, 
@@ -653,7 +654,7 @@ def scalar( @overload def scalar( value: Any, - type: types.Float32Type, + type: types.Float32Type | Literal["f32"], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -661,7 +662,7 @@ def scalar( @overload def scalar( value: Any, - type: types.Float64Type, + type: types.Float64Type | Literal["f64"], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -693,7 +694,7 @@ def scalar( @overload def scalar( value: Any, - type: types.StringType, + type: types.StringType | Literal["string"], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -1014,4 +1015,5 @@ __all__ = [ "JsonScalar", "OpaqueScalar", "scalar", + "NullableCollection", ] diff --git a/python/pyarrow/__lib_pxi/table.pyi b/python/pyarrow/__lib_pxi/table.pyi index ffba4262e8c..34960e2b903 100644 --- a/python/pyarrow/__lib_pxi/table.pyi +++ b/python/pyarrow/__lib_pxi/table.pyi @@ -53,11 +53,11 @@ from pyarrow.lib import Device, MemoryManager, MemoryPool, MonthDayNano, Schema from pyarrow.lib import Field as _Field from . import array, scalar, types -from .array import Array, NullableCollection, StructArray, _CastAs, _PandasConvertible +from .array import Array, StructArray, _CastAs, _PandasConvertible from .device import DeviceAllocationType from .io import Buffer from .ipc import RecordBatchReader -from .scalar import Int64Scalar, Scalar +from .scalar import Int64Scalar, Scalar, NullableCollection from .tensor import Tensor from .types import DataType, _AsPyType, _BasicDataType, _DataTypeT @@ -389,7 +389,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): ] ] """ - def fill_null(self, fill_value: Scalar[_DataTypeT]) -> Self: + def fill_null(self, fill_value: Scalar[_DataTypeT] | _AsPyType | str | None) -> Self: """ Replace each null element in values with fill_value. @@ -423,7 +423,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): ] ] """ - def equals(self, other: Self) -> bool: + def equals(self, other: Self | Array[Any] | Iterable[Any]) -> bool: """ Return whether the contents of two chunked arrays are equal. @@ -1522,6 +1522,11 @@ def chunked_array( type: None = None, ) -> ChunkedArray[scalar.ListScalar[Any]]: ... @overload +def chunked_array( + values: Iterable[NullableCollection[types.Decimal128Type[Any, Any]]], + type: types.Decimal128Type, +) -> ChunkedArray[types.Decimal128Type]: ... +@overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["null"] | types.NullType, @@ -5083,8 +5088,9 @@ class Table(_Tabular[ChunkedArray[Any]]): """ def record_batch( - data: dict[str, list[Any] | Array[Any]] - | Collection[Array[Any]] + data: list[ArrayOrChunkedArray[Any]] + | dict[str, list[Any] | Array[Any]] + | Iterable[Array[Any]] | pd.DataFrame | SupportArrowArray | SupportArrowDeviceArray, diff --git a/python/pyarrow/__lib_pxi/types.pyi b/python/pyarrow/__lib_pxi/types.pyi index 7fe6c36e332..5cac864c3cc 100644 --- a/python/pyarrow/__lib_pxi/types.pyi +++ b/python/pyarrow/__lib_pxi/types.pyi @@ -1532,7 +1532,7 @@ class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]): **kwargs : optional additional key-value metadata """ - def __init__(self, __arg0__: Mapping[bytes, bytes] | None = None, **kwargs) -> None: ... + def __init__(self, __arg0__: Mapping[bytes, bytes] | Mapping[str, str] | None = None, **kwargs) -> None: ... def equals(self, other: KeyValueMetadata) -> bool: ... def __len__(self) -> int: ... 
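For context (illustrative only, not part of the hunks): the Literal string spellings added to the scalar() overloads earlier in this patch mirror the type aliases pa.scalar() already accepts at runtime, for example:

    import pyarrow as pa

    # string aliases resolve to the same Arrow types as the constructors
    assert pa.scalar(1, type="int64").type == pa.int64()
    assert pa.scalar(True, type="bool").type == pa.bool_()
    assert pa.scalar("x", type="string").type == pa.string()
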
def __contains__(self, __key: object) -> bool: ... @@ -2771,9 +2771,9 @@ def string_to_tzinfo(name: str) -> dt.tzinfo: """ @overload -def timestamp(unit: _Unit) -> TimestampType[_Unit, _Tz]: ... +def timestamp(unit: _Unit | str) -> TimestampType[_Unit, _Tz]: ... @overload -def timestamp(unit: _Unit, tz: _Tz) -> TimestampType[_Unit, _Tz]: ... +def timestamp(unit: _Unit | str, tz: _Tz) -> TimestampType[_Unit, _Tz]: ... def timestamp(*args, **kwargs): """ Create instance of timestamp type with resolution and optional time zone. @@ -4290,7 +4290,9 @@ def is_float_value(obj: Any) -> bool: __all__ = [ "_Weakrefable", "_Metadata", + "_AsPyType", "DataType", + "_DataTypeT", "_BasicDataType", "NullType", "BoolType", diff --git a/python/pyarrow/_compute.pyi b/python/pyarrow/_compute.pyi index 3d61ae42787..61ccb233feb 100644 --- a/python/pyarrow/_compute.pyi +++ b/python/pyarrow/_compute.pyi @@ -10,6 +10,7 @@ from typing import ( ) from . import lib +from .compute import _NumericScalarT _Order: TypeAlias = Literal["ascending", "descending"] _Placement: TypeAlias = Literal["at_start", "at_end"] @@ -75,6 +76,11 @@ class Function(lib._Weakrefable): """ The number of kernels implementing this function. """ + @property + def kernels(self) -> list[ScalarKernel]: + """ + A list of all kernels implementing this function. + """ def call( self, args: Iterable, @@ -307,7 +313,7 @@ class RunEndEncodeOptions(FunctionOptions): Accepted values are pyarrow.{int16(), int32(), int64()}. """ # TODO: default is DataType(int32) - def __init__(self, run_end_type: lib.DataType = ...) -> None: ... + def __init__(self, run_end_type: lib.DataType | Literal["int16","int32","int64"] = Literal["int32"]) -> None: ... class ElementWiseAggregateOptions(FunctionOptions): """ @@ -589,7 +595,7 @@ class QuantileOptions(FunctionOptions): """ def __init__( self, - q: float | Sequence[float], + q: float | Sequence[float] = 0.5, *, interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", skip_nulls: bool = True, @@ -859,7 +865,7 @@ class RoundToMultipleOptions(FunctionOptions): "half_down", "half_up", "half_towards_zero", "half_towards_infinity", "half_to_even", "half_to_odd". """ - def __init__(self, multiple: float = 1.0, round_mode: _RoundMode = "half_to_even") -> None: ... + def __init__(self, multiple: int | float | _NumericScalarT = 1.0, round_mode: _RoundMode = "half_to_even") -> None: ... class ScalarAggregateOptions(FunctionOptions): """ @@ -1095,6 +1101,19 @@ class Utf8NormalizeOptions(FunctionOptions): def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ... +class ZeroFillOptions(FunctionOptions): + """ + Options for utf8_zero_fill. + + Parameters + ---------- + width : int + Desired string length. + padding : str, default "0" + Padding character. Should be one Unicode codepoint. + """ + def __init__(self, width: int, padding: str = '0') -> None: ... + class VarianceOptions(FunctionOptions): """ Options for the `variance` and `stddev` functions. 
@@ -1584,8 +1603,19 @@ class Expression(lib._Weakrefable): ], null_matching_behavior=MATCH})> """ + def equals(self, other: Expression | lib.Array | Iterable) -> bool: + """ + Parameters + ---------- + other : pyarrow.dataset.Expression + + Returns + ------- + bool + """ + @staticmethod - def from_substrait(buffer: bytes | lib.Buffer) -> Expression: + def from_substrait(message: bytes | lib.Buffer) -> Expression: """ Deserialize an expression from Substrait @@ -1678,7 +1708,7 @@ class Expression(lib._Weakrefable): is_nan : Expression """ def cast( - self, type: lib.DataType, safe: bool = True, options: CastOptions | None = None + self, type: lib.DataType | Literal["bool"], safe: bool = True, options: CastOptions | None = None ) -> Expression: """ Explicitly set or change the expression's data type. diff --git a/python/pyarrow/compute.pyi b/python/pyarrow/compute.pyi index 8d8fc35b134..cbbb9b0efcc 100644 --- a/python/pyarrow/compute.pyi +++ b/python/pyarrow/compute.pyi @@ -1,6 +1,23 @@ -# ruff: noqa: I001 -from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence, Hashable from collections.abc import Callable +from numpy.typing import NDArray # Option classes from pyarrow._compute import ArraySortOptions as ArraySortOptions @@ -68,6 +85,7 @@ from pyarrow._compute import TDigestOptions as TDigestOptions from pyarrow._compute import TrimOptions as TrimOptions from pyarrow._compute import UdfContext as UdfContext from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions +from pyarrow._compute import ZeroFillOptions as ZeroFillOptions from pyarrow._compute import VarianceOptions as VarianceOptions from pyarrow._compute import VectorFunction as VectorFunction from pyarrow._compute import VectorKernel as VectorKernel @@ -90,11 +108,12 @@ from pyarrow._compute import register_vector_function as register_vector_functio from pyarrow._compute import _Order, _Placement from pyarrow._stubs_typing import ArrayLike, ScalarLike from . import lib +from _stubs_typing import Indices _P = ParamSpec("_P") _R = TypeVar("_R") -def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: +def field(*name_or_index: str | bytes | tuple[str | int, ...] | int) -> Expression: """Reference a column of the dataset. Stores only the field's name. Type and other information is known only when @@ -128,7 +147,7 @@ def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: Expression: +def scalar(value: bool | int | float | NumericScalar | None | str | dict[bool | float | str, bool | float | str]) -> Expression: """Expression representing a scalar value. 
Creates an Expression object representing a scalar value that can be used @@ -166,6 +185,10 @@ _ArrayT = TypeVar("_ArrayT", bound=lib.Array | lib.ChunkedArray) _ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | lib.Scalar | lib.ChunkedArray) ArrayOrChunkedArray: TypeAlias = lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT] ScalarOrArray: TypeAlias = ArrayOrChunkedArray[_Scalar_CoT] | _Scalar_CoT +_ZonedTimestampArrayT: TypeAlias = ArrayOrChunkedArray[lib.Scalar[lib.TimestampType[Any, Any]]] +_ZonelessTimestampArrayT: TypeAlias = ArrayOrChunkedArray[lib.Scalar[lib.TimestampType[Any, None]]] +_ZonedTimestampScalarT: TypeAlias = lib.Scalar[lib.TimestampType[Any, Any]] +_ZonelessTimestampScalarT: TypeAlias = lib.Scalar[lib.TimestampType[Any, None]] SignedIntegerScalar: TypeAlias = ( lib.Scalar[lib.Int8Type] @@ -209,6 +232,7 @@ TemporalScalar: TypeAlias = ( | lib.Time32Scalar[Any] | lib.Time64Scalar[Any] | lib.TimestampScalar[Any] + | lib.TimestampScalar[Any, None] | lib.DurationScalar[Any] | lib.MonthDayNanoIntervalScalar ) @@ -216,9 +240,9 @@ NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar _NumericOrTemporalScalarT = TypeVar("_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) +_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] _NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) -_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) _NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] _NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) @@ -245,6 +269,9 @@ _TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray) _ListArray: TypeAlias = ArrayOrChunkedArray[_ListScalar[_DataTypeT]] _LargeListArray: TypeAlias = ArrayOrChunkedArray[_LargeListScalar[_DataTypeT]] ListArray: TypeAlias = ArrayOrChunkedArray[ListScalar[_DataTypeT]] +_DecimalScalarT = TypeVar("_DecimalScalarT", bound=DecimalScalar) +DecimalArray: TypeAlias = lib.Array[_DecimalScalarT] | lib.ChunkedArray[_DecimalScalarT] +_DecimalArrayT = TypeVar("_DecimalArrayT", bound=DecimalArray) # =============================== 1. Aggregation =============================== # ========================= 1.1 functions ========================= @@ -423,12 +450,12 @@ def first( """ def first_last( - array: lib.Array[Any] | lib.ChunkedArray[Any], + array: lib.Array[Any] | lib.ChunkedArray[Any] | Sequence[Any], /, *, skip_nulls: bool = True, min_count: int = 1, - options: ScalarAggregateOptions | None = None, + options: ScalarAggregateOptions | dict[str, Any] | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.StructScalar: """ @@ -742,7 +769,7 @@ def product( def quantile( array: NumericScalar | NumericArray, /, - q: float = 0.5, + q: float | list[float] = 0.5, *, interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", skip_nulls: bool = True, @@ -823,8 +850,64 @@ def stddev( If not passed, will allocate memory from the default memory pool. """ +def skew( + array: NumericArray | Sequence[int | None], + /, + *, + skip_nulls: bool = True, + biased: bool = True, + min_count: int = 0, + options: SkewOptions | None = None, +) -> NumericScalar: + """ + Calculate the skewness of a numeric array + Nulls are ignored by default. 
If there are not enough non-null values + in the array to satisfy `min_count`, null is returned. + The behavior of nulls and the `min_count` parameter can be changed. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + biased : bool, default True + Whether the calculated value is biased. + If False, the value computed includes a correction factor to reduce bias. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : SkewOptions, optional + Options for the `skew` and `kurtosis` functions. + """ + +kurtosis = _clone_signature(skew) +""" +Calculate the kurtosis of a numeric array +Nulls are ignored by default. If there are not enough non-null values +in the array to satisfy `min_count`, null is returned. +The behavior of nulls and the `min_count` parameter can be changed. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +biased : bool, default True + Whether the calculated value is biased. + If False, the value computed includes a correction factor to reduce bias. +min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : SkewOptions, optional + Options for the `skew` and `kurtosis` functions. +""" + def sum( - array: _NumericScalarT | NumericArray[_NumericScalarT], + array: _NumericScalarT | NumericArray[_NumericScalarT] | _DecimalArrayT, /, *, skip_nulls: bool = True, @@ -858,7 +941,7 @@ def sum( def tdigest( array: NumericScalar | NumericArray, /, - q: float = 0.5, + q: float | list[float] = 0.5, *, delta: int = 100, buffer_size: int = 500, @@ -899,7 +982,7 @@ def tdigest( """ def variance( - array: NumericScalar | NumericArray, + array: NumericScalar | NumericArray | list[int] | list[int | None], /, *, ddof: int = 0, @@ -1022,6 +1105,113 @@ def bottom_k_unstable( ] """ +def winsorize( + values: lib.Array | lib.ChunkedArray, + lower_limit: float | None = None, + upper_limit: float | None = None, + /, + *, + options: WinsorizeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: + """ + Apply a winsorization transform to the input array so as to reduce the influence of potential outliers. + NaNs and nulls in the input are ignored for the purpose of computing the lower and upper quantiles. + The quantile limits can be changed in WinsorizeOptions. + + Parameters + ---------- + values : Array, ChunkedArray, RecordBatch, or Table + Data to sort and get bottom indices from. + + lower_limit : float, between 0 and 1 + The quantile below which all values are replaced with the quantile's value. + For example, if lower_limit = 0.05, then all values in the lower 5% percentile will be replaced with the 5% percentile value. + + upper_limit : float, between 0 and 1 + The quantile above which all values are replaced with the quantile’s value. + For example, if upper_limit = 0.95, then all values in the upper 95% percentile will be replaced with the 95% percentile value. + + options : pyarrow.compute.WinsorizeOptions, optional + Alternative way of passing options. 
+ + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array of indices + Winsorized array + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6]) + >>> pc.winsorize(arr, 0.1, 0.8) + + [ + 8, + 4, + 8, + 8, + 5, + 3, + 7, + 2, + 2, + 6 + ] + """ + +def pivot_wider( + pivot_keys: lib.Array | lib.ChunkedArray | list[Any], + pivot_values: lib.Array | lib.ChunkedArray | list[Any], + /, + key_names: list[Any] | None = None, + *, + unexpected_key_behavior: str | None = None, + options: PivotWiderOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: + """ + Pivot values according to a pivot key column. + + Output is a struct with as many fields as PivotWiderOptions.key_names. + All output struct fields have the same type as pivot_values. Each pivot + key decides in which output field the corresponding pivot value is emitted. + If a pivot key doesn’t appear, null is emitted. If more than one non-null + value is encountered for a given pivot key, Invalid is raised. The pivot + key column can be string, binary or integer. The key_names will be cast + to the pivot key column type for matching. Behavior of unexpected pivot + keys is controlled by unexpected_key_behavior. + + Parameters + ---------- + pivot_keys : sequence + Array, ChunkedArray, list + pivot_values : sequence + Array, ChunkedArray, list + key_names : sequence of str + The pivot key names expected in the pivot key column. + For each entry in `key_names`, a column with the same name is emitted + in the struct output. + unexpected_key_behavior : str, default "ignore" + The behavior when pivot keys not in `key_names` are encountered. + Accepted values are "ignore", "raise". + If "ignore", unexpected keys are silently ignored. + If "raise", unexpected keys raise a KeyError. + options : pyarrow.compute.PivotWiderOptions, optional + Alternative way of passing options. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array of indices + Pivoted struct array + """ + # ========================= 2. Element-wise (“scalar”) functions ========================= # ========================= 2.1 Arithmetic ========================= @@ -1076,8 +1266,8 @@ def add( ) -> _NumericOrTemporalScalarT: ... @overload def add( - x: _NumericOrTemporalArrayT, - y: _NumericOrTemporalArrayT, + x: _NumericOrTemporalArrayT | NDArray[Any] | list[lib._AsPyType | None], + y: _NumericOrTemporalArrayT | NDArray[Any] | list[lib._AsPyType | None], /, *, memory_pool: lib.MemoryPool | None = None, @@ -1088,27 +1278,27 @@ def add( ) -> Expression: ... @overload def add( - x: NumericOrTemporalScalar, - y: _NumericOrTemporalArrayT, + x: NumericOrTemporalScalar | lib._AsPyType, + y: _NumericOrTemporalArrayT | NDArray[Any] | list[lib._AsPyType | None], /, *, memory_pool: lib.MemoryPool | None = None, ) -> _NumericOrTemporalArrayT: ... @overload def add( - x: _NumericOrTemporalArrayT, - y: NumericOrTemporalScalar, + x: _NumericOrTemporalArrayT | NDArray[Any] | list[lib._AsPyType | None], + y: NumericOrTemporalScalar | lib._AsPyType, /, *, memory_pool: lib.MemoryPool | None = None, ) -> _NumericOrTemporalArrayT: ... 
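An illustrative sketch (not part of the diff) of why the add() overloads above also admit plain Python sequences and scalars: pyarrow compute converts such inputs on the fly.

    import pyarrow as pa
    import pyarrow.compute as pc

    # a Python list and a Python int are converted to an array and a scalar
    assert pc.add([1, 2, 3], 1).to_pylist() == [2, 3, 4]

    # mixing a pyarrow array with a Python scalar works the same way
    assert pc.add(pa.array([1.5, 2.5]), 0.5).to_pylist() == [2.0, 3.0]
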
@overload def add( - x: NumericOrTemporalScalar, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None + x: NumericOrTemporalScalar | lib._AsPyType, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... @overload def add( - x: Expression, y: NumericOrTemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None + x: Expression, y: NumericOrTemporalScalar | lib._AsPyType, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... def add(*args, **kwargs): """ @@ -1772,7 +1962,7 @@ memory_pool : pyarrow.MemoryPool, optional @overload def round( - x: _NumericScalarT, + x: _NumericScalarT | int | float, /, ndigits: int = 0, round_mode: Literal[ @@ -1793,7 +1983,7 @@ def round( ) -> _NumericScalarT: ... @overload def round( - x: _NumericArrayT, + x: _NumericArrayT | Sequence[int | float | None], /, ndigits: int = 0, round_mode: Literal[ @@ -1860,9 +2050,9 @@ def round(*args, **kwargs): @overload def round_to_multiple( - x: _NumericScalarT, + x: int | float | _NumericScalarT, /, - multiple: int = 0, + multiple: int | float | _NumericScalarT = 0, round_mode: Literal[ "down", "up", @@ -1881,9 +2071,9 @@ def round_to_multiple( ) -> _NumericScalarT: ... @overload def round_to_multiple( - x: _NumericArrayT, + x: _NumericArrayT | Sequence[int | float | None], /, - multiple: int = 0, + multiple: int | float | _NumericScalarT = 0, round_mode: Literal[ "down", "up", @@ -1904,7 +2094,7 @@ def round_to_multiple( def round_to_multiple( x: Expression, /, - multiple: int = 0, + multiple: int | float | _NumericScalarT = 0, round_mode: Literal[ "down", "up", @@ -1949,7 +2139,7 @@ def round_to_multiple(*args, **kwargs): @overload def round_binary( - x: _NumericScalarT, + x: _NumericScalarT | float, s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar, /, round_mode: Literal[ @@ -1970,7 +2160,7 @@ def round_binary( ) -> _NumericScalarT: ... @overload def round_binary( - x: _NumericScalarT, + x: _NumericScalarT | float, s: Iterable, /, round_mode: Literal[ @@ -1991,7 +2181,7 @@ def round_binary( ) -> lib.NumericArray[_NumericScalarT]: ... @overload def round_binary( - x: _NumericArrayT, + x: _NumericArrayT | Sequence[float], s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar | Iterable, /, round_mode: Literal[ @@ -2298,6 +2488,18 @@ Compute the inverse sine. NaN is returned for invalid input values; to raise an error instead, see "asin_checked". +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +asinh = _clone_signature(ln) +""" +Compute the inverse hyperbolic sine. +NaN is returned for invalid input values. + Parameters ---------- x : Array-like or scalar-like @@ -2326,6 +2528,19 @@ Compute the inverse tangent of x. The return value is in the range [-pi/2, pi/2]; for a full return range [-pi, pi], see "atan2". +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +atanh = _clone_signature(ln) +""" +Compute the inverse hyperbolic tangent of x. +The return value is in the range [-1, 1]. +NaN is returned for invalid input values. + Parameters ---------- x : Array-like or scalar-like @@ -2340,6 +2555,30 @@ Compute the cosine. NaN is returned for invalid input values; to raise an error instead, see "cos_checked". 
+Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cosh = _clone_signature(ln) +""" +Compute the hyperbolic cosine. +NaN is returned for invalid input values. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +acosh = _clone_signature(ln) +""" +Compute the inverse hyperbolic cosine. +NaN is returned for invalid input values. + Parameters ---------- x : Array-like or scalar-like @@ -2382,6 +2621,18 @@ Compute the sine. Invalid input values raise an error; to return NaN instead, see "sin". +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +sinh = _clone_signature(ln) +""" +Compute the hyperbolic sine. +NaN is returned for invalid input values. + Parameters ---------- x : Array-like or scalar-like @@ -2410,6 +2661,18 @@ Compute the tangent. Infinite values raise an error; to return NaN instead, see "tan". +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +tanh = _clone_signature(ln) +""" +Compute the hyperbolic tangent. +NaN is returned for invalid input values. + Parameters ---------- x : Array-like or scalar-like @@ -2473,16 +2736,16 @@ def equal( ) -> lib.BooleanScalar: ... @overload def equal( - x: lib.Scalar, - y: lib.Array | lib.ChunkedArray, + x: lib.Scalar | lib._AsPyType, + y: lib.Array | lib.ChunkedArray | list[lib._AsPyType], /, *, memory_pool: lib.MemoryPool | None = None, ) -> lib.BooleanArray: ... @overload def equal( - x: lib.Array | lib.ChunkedArray, - y: lib.Scalar, + x: lib.Array | lib.ChunkedArray | list[lib._AsPyType], + y: lib.Scalar | lib._AsPyType, /, *, memory_pool: lib.MemoryPool | None = None, @@ -2613,11 +2876,11 @@ memory_pool : pyarrow.MemoryPool, optional @overload def max_element_wise( - *args: ScalarOrArray[_Scalar_CoT], + *args: ScalarOrArray[_Scalar_CoT] | NDArray[Any] | float, skip_nulls: bool = True, options: ElementWiseAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _Scalar_CoT: ... +) -> lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT]: ... @overload def max_element_wise( *args: Expression, @@ -3774,6 +4037,25 @@ memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory poo """ +def utf8_normalize( + strings: _StringArrayT, /, form: str, *, options: Utf8NormalizeOptions | None = None, memory_pool: lib.MemoryPool | None = None +) -> _StringArrayT: + """ + Utf8-normalize input + + For each string in `strings`, return the normal form. + The normalization form must be given in the Utf8NormalizeOptions. + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + form : str + Unicode normalization form. + Accepted values are "NFC", "NFKC", "NFD", NFKD". + """ + # ========================= 2.12 String padding ========================= @overload def ascii_center( @@ -3960,6 +4242,60 @@ memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory pool. 
""" +@overload +def utf8_zero_fill( + strings: _StringScalarT, + /, + width: int, + padding: str = '0', + *, + options: ZeroFillOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def utf8_zero_fill( + strings: _StringArrayT, + /, + width: int | None = None, + padding: str | None = '0', + *, + options: ZeroFillOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def utf8_zero_fill( + strings: Expression, + /, + width: int, + padding: str = '0', + *, + options: ZeroFillOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def utf8_zero_fill(*args, **kwargs): + """ + Left-pad strings to a given width, preserving leading sign characters + + For each string in `strings`, emit a string of length `width` by + prepending the given padding character (defaults to '0' if not specified). + If the string starts with '+' or '-', the sign is preserved and padding + occurs after the sign. Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + width : int + Desired string length. + padding : str, default "0" + Padding character. Should be one Unicode codepoint. + options : pyarrow.compute.ZeroFillOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ +utf8_zfill = _clone_signature(utf8_zero_fill) + # ========================= 2.13 String trimming ========================= @overload def ascii_ltrim( @@ -4448,38 +4784,74 @@ def extract_regex(*args, **kwargs): If not passed, will allocate memory from the default memory pool. """ -# ========================= 2.16 String join ========================= -def binary_join( - strings, separator, /, *, memory_pool: lib.MemoryPool | None = None -) -> StringScalar | StringArray: +def extract_regex_span( + strings: StringOrBinaryArray, + /, + pattern: str, + *, + options: ExtractRegexSpanOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: """ - Join a list of strings together with a separator. + Extract string spans captured by a regex pattern + + For each string in `strings`, match the regular expression and, if + successful, emit a struct with field names and values coming from the + regular expression's named capture groups. Each struct field value + will be a fixed_size_list(offset_type, 2) where offset_type is int32 + or int64, depending on the input string type. The two elements in + each fixed-size list are the index and the length of the substring + matched by the corresponding named capture group. + + If the input is null or the regular expression fails matching, + a null output value is emitted. - Concatenate the strings in `list`. The `separator` is inserted - between each given string. - Any null input and any null `list` element emits a null output. + Regular expression matching is done using the Google RE2 library. Parameters ---------- strings : Array-like or scalar-like Argument to compute function. - separator : Array-like or scalar-like - Argument to compute function. + pattern : str + Regular expression with named capture fields. + options : pyarrow.compute.ExtractRegexSpanOptions, optional + Alternative way of passing options. memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory pool. 
""" +# ========================= 2.16 String join ========================= +def binary_join( + strings: ArrayOrChunkedArray[lib.ListType[lib.BinaryType]], separator, /, *, memory_pool: lib.MemoryPool | None = None, +) -> StringArray | BinaryArray: ... +""" +Join a list of strings together with a separator. + +Concatenate the strings in `list`. The `separator` is inserted +between each given string. +Any null input and any null `list` element emits a null output. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +separator : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + @overload def binary_join_element_wise( - *strings: _StringOrBinaryScalarT, + *strings: _StringOrBinaryScalarT | str, null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", null_replacement: str = "", options: JoinOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryScalarT: ... +) -> _StringScalarT | _BinaryScalarT: ... @overload def binary_join_element_wise( - *strings: _StringOrBinaryArrayT, + *strings: _StringOrBinaryArrayT | Sequence[str | None], null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", null_replacement: str = "", options: JoinOptions | None = None, @@ -4646,55 +5018,30 @@ def utf8_slice_codeunits(*args, **kwargs): # ========================= 2.18 Containment tests ========================= @overload def count_substring( - strings: lib.StringScalar | lib.BinaryScalar, - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Scalar: ... -@overload -def count_substring( - strings: lib.LargeStringScalar | lib.LargeBinaryScalar, - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... -@overload -def count_substring( - strings: lib.StringArray - | lib.BinaryArray - | lib.ChunkedArray[lib.StringScalar] - | lib.ChunkedArray[lib.BinaryScalar], + strings: lib.Scalar[lib.StringType | lib.BinaryType | lib.LargeStringType | lib.LargeBinaryType], /, pattern: str, *, ignore_case: bool = False, options: MatchSubstringOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Array: ... +) -> lib.Int32Scalar | lib.Int64Scalar: ... @overload def count_substring( - strings: lib.LargeStringArray - | lib.LargeBinaryArray - | lib.ChunkedArray[lib.LargeStringScalar] - | lib.ChunkedArray[lib.LargeBinaryScalar], + strings: lib.Array[lib.Scalar[lib.StringType | lib.BinaryType | lib.LargeStringType | lib.LargeBinaryType]] + | lib.ChunkedArray[lib.Scalar[lib.StringType | lib.BinaryType | lib.LargeStringType | lib.LargeBinaryType]], /, pattern: str, *, ignore_case: bool = False, options: MatchSubstringOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... +) -> lib.Int32Array | lib.Int64Array: ... 
@overload def count_substring( strings: Expression, /, - pattern: str, + pattern: Any, *, ignore_case: bool = False, options: MatchSubstringOptions | None = None, @@ -5236,7 +5583,7 @@ def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): """ def coalesce( - *values: _ScalarOrArrayT, memory_pool: lib.MemoryPool | None = None + *values: _ScalarOrArrayT | Expression, memory_pool: lib.MemoryPool | None = None ) -> _ScalarOrArrayT: """ Select the first non-null value. @@ -5380,7 +5727,7 @@ def list_value_length(*args, **kwargs): @overload def make_struct( - *args: lib.Scalar, + *args: lib.Scalar | lib._AsPyType, field_names: list[str] | tuple[str, ...] = (), field_nullability: bool | None = None, field_metadata: list[lib.KeyValueMetadata] | None = None, @@ -5389,7 +5736,7 @@ def make_struct( ) -> lib.StructScalar: ... @overload def make_struct( - *args: lib.Array | lib.ChunkedArray, + *args: lib.Array | lib.ChunkedArray | list[lib._AsPyType], field_names: list[str] | tuple[str, ...] = (), field_nullability: bool | None = None, field_metadata: list[lib.KeyValueMetadata] | None = None, @@ -5430,6 +5777,59 @@ def make_struct(*args, **kwargs): """ # ========================= 2.22 Conversions ========================= + +def run_end_decode( + array: lib.Array, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: + """ + Decode run-end encoded array. + + Return a decoded version of a run-end encoded input array. + + Parameters + ---------- + array : Array-like + Argument to compute function. + + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + + +def run_end_encode( + array: lib.Array, + /, + run_end_type: lib.Type_INT16 | lib.Type_INT32 | lib.Type_INT64 = lib.Type_INT32, + *, + options: RunEndEncodeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: + """ + Run-end encode array. + + Return a run-end encoded version of the input array. + + Parameters + ---------- + + array : Array-like + Argument to compute function. + + run_end_type : DataType, default pyarrow.int32() + The data type of the run_ends array. + + Accepted values are pyarrow.{int16(), int32(), int64()}. + + options : pyarrow.compute.RunEndEncodeOptions, optional + Alternative way of passing options. + + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+ """ + @overload def ceil_temporal( timestamps: _TemporalScalarT, @@ -5666,7 +6066,7 @@ memory_pool : pyarrow.MemoryPool, optional @overload def cast( arr: lib.Scalar, - target_type: _DataTypeT, + target_type: _DataTypeT | None = None, safe: bool | None = None, options: CastOptions | None = None, memory_pool: lib.MemoryPool | None = None, @@ -5674,7 +6074,7 @@ def cast( @overload def cast( arr: lib.Array, - target_type: _DataTypeT, + target_type: _DataTypeT | str | None = None, safe: bool | None = None, options: CastOptions | None = None, memory_pool: lib.MemoryPool | None = None, @@ -5682,7 +6082,7 @@ def cast( @overload def cast( arr: lib.ChunkedArray, - target_type: _DataTypeT, + target_type: _DataTypeT | None = None, safe: bool | None = None, options: CastOptions | None = None, memory_pool: lib.MemoryPool | None = None, @@ -5744,7 +6144,7 @@ def cast(*args, **kwargs): @overload def strftime( - timestamps: TemporalScalar, + timestamps: _ZonedTimestampScalarT | _ZonelessTimestampScalarT, /, format: str = "%Y-%m-%dT%H:%M:%S", locale: str = "C", @@ -5754,7 +6154,7 @@ def strftime( ) -> lib.StringScalar: ... @overload def strftime( - timestamps: TemporalArray, + timestamps: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, /, format: str = "%Y-%m-%dT%H:%M:%S", locale: str = "C", @@ -5866,11 +6266,11 @@ def strptime(*args, **kwargs): # ========================= 2.23 Temporal component extraction ========================= @overload def day( - values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None + values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.Int64Scalar: ... @overload def day( - values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None + values: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.Int64Array: ... @overload def day(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... @@ -5892,7 +6292,7 @@ def day(*args, **kwargs): @overload def day_of_week( - values: TemporalScalar, + values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT, /, *, count_from_zero: bool = True, @@ -5902,7 +6302,7 @@ def day_of_week( ) -> lib.Int64Scalar: ... @overload def day_of_week( - values: TemporalArray, + values: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, /, *, count_from_zero: bool = True, @@ -5967,17 +6367,17 @@ memory_pool : pyarrow.MemoryPool, optional @overload def hour( - values: lib.TimestampScalar[Any] | lib.Time32Scalar[Any] | lib.Time64Scalar[Any], + values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT | lib.Time32Scalar[Any] | lib.Time64Scalar[Any], /, *, memory_pool: lib.MemoryPool | None = None, ) -> lib.Int64Scalar: ... @overload def hour( - values: lib.TimestampArray[Any] + values: _ZonedTimestampArrayT + | _ZonelessTimestampArrayT | lib.Time32Array[Any] | lib.Time64Array[Any] - | lib.ChunkedArray[lib.TimestampScalar[Any]] | lib.ChunkedArray[lib.Time32Scalar[Any]] | lib.ChunkedArray[lib.Time64Scalar[Any]], /, @@ -6009,11 +6409,11 @@ def hour(*args, **kwargs): @overload def is_dst( - values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None + values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.BooleanScalar: ... 
@overload def is_dst( - values: lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]], + values: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, /, *, memory_pool: lib.MemoryPool | None = None, @@ -6039,11 +6439,11 @@ def is_dst(*args, **kwargs): @overload def iso_week( - values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None + values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.Int64Scalar: ... @overload def iso_week( - values: lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]], + values: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, /, *, memory_pool: lib.MemoryPool | None = None, @@ -6088,16 +6488,48 @@ memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory pool. """ +@overload +def iso_calendar( + values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.StructScalar: ... +@overload +def iso_calendar( + values: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: ... +@overload +def iso_calendar( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def iso_calendar(*args, **kwargs): + """ + Extract (ISO year, ISO week, ISO day of week) struct. + + ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7. + Null values emit null. An error is returned if the values have a defined + timezone, but it cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def is_leap_year( - values: lib.TimestampScalar[Any] | lib.Date32Scalar | lib.Date64Scalar, + values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT | lib.Date32Scalar | lib.Date64Scalar, /, *, memory_pool: lib.MemoryPool | None = None, ) -> lib.BooleanScalar: ... @overload def is_leap_year( - values: lib.TimestampArray + values: _ZonedTimestampArrayT + | _ZonelessTimestampArrayT | lib.Date32Array | lib.Date64Array | lib.ChunkedArray[lib.TimestampScalar] @@ -6310,7 +6742,7 @@ memory_pool : pyarrow.MemoryPool, optional @overload def week( - values: lib.TimestampScalar, + values: lib.Scalar[lib.TimestampType[Any, Any]], /, *, week_starts_monday: bool = True, @@ -6321,7 +6753,7 @@ def week( ) -> lib.Int64Scalar: ... @overload def week( - values: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + values: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, /, *, week_starts_monday: bool = True, @@ -6677,7 +7109,7 @@ memory_pool : pyarrow.MemoryPool, optional # ========================= 2.25 Timezone handling ========================= @overload def assume_timezone( - timestamps: lib.TimestampScalar, + timestamps: _ZonelessTimestampScalarT, /, timezone: str, *, @@ -6685,10 +7117,10 @@ def assume_timezone( nonexistent: Literal["raise", "earliest", "latest"] = "raise", options: AssumeTimezoneOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.TimestampScalar: ... +) -> _ZonedTimestampScalarT: ... 
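A small sketch of the temporal component-extraction kernels documented above, using a timezone-naive timestamp array; the commented values are illustrative.

import datetime as dt
import pyarrow as pa
import pyarrow.compute as pc

ts = pa.array([dt.datetime(2024, 2, 29, 12, 0)])
leap = pc.is_leap_year(ts)    # [True]
week = pc.iso_week(ts)        # [9]
cal = pc.iso_calendar(ts)     # struct array with iso_year / iso_week / iso_day_of_week fields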
@overload def assume_timezone( - timestamps: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + timestamps: _ZonelessTimestampArrayT, /, timezone: str, *, @@ -6696,7 +7128,23 @@ def assume_timezone( nonexistent: Literal["raise", "earliest", "latest"] = "raise", options: AssumeTimezoneOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.TimestampArray: ... +) -> _ZonedTimestampArrayT: ... +@overload +def assume_timezone( + timestamps: _ZonelessTimestampScalarT, + /, + *, + options: AssumeTimezoneOptions, + memory_pool: lib.MemoryPool | None = None, +) -> _ZonedTimestampScalarT: ... +@overload +def assume_timezone( + timestamps: _ZonelessTimestampArrayT, + /, + *, + options: AssumeTimezoneOptions, + memory_pool: lib.MemoryPool | None = None, +) -> _ZonedTimestampArrayT: ... @overload def assume_timezone( timestamps: Expression, @@ -6741,15 +7189,15 @@ def assume_timezone(*args, **kwargs): @overload def local_timestamp( - timestamps: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.TimestampScalar: ... + timestamps: _ZonedTimestampScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _ZonelessTimestampScalarT: ... @overload def local_timestamp( - timestamps: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + timestamps: _ZonedTimestampArrayT, /, *, memory_pool: lib.MemoryPool | None = None, -) -> lib.TimestampArray: ... +) -> _ZonelessTimestampArrayT: ... @overload def local_timestamp( timestamps: Expression, /, *, memory_pool: lib.MemoryPool | None = None @@ -6777,7 +7225,7 @@ def local_timestamp(*args, **kwargs): def random( n: int, *, - initializer: Literal["system"] | int = "system", + initializer: Hashable = "system", options: RandomOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.DoubleArray: @@ -6810,7 +7258,7 @@ def random( def cumulative_sum( values: _NumericArrayT, /, - start: lib.Scalar | None = None, + start: lib.Scalar | int | None = None, *, skip_nulls: bool = False, options: CumulativeSumOptions | None = None, @@ -7009,6 +7457,22 @@ def dictionary_encode( memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... @overload +def dictionary_decode(array: _ScalarOrArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ScalarOrArrayT: ... +@overload +def dictionary_decode(array: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def dictionary_decode(*args, **kwargs): + """ + Decodes a DictionaryArray to an Array + + Return a plain-encoded version of the array input. + This function does nothing if the input is not a dictionary. + + Parameters + ---------- + array : Array-like + Argument to compute function. + """ +@overload def unique(array: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ... @overload def unique(array: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... 
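A minimal sketch of the dictionary-encoding and uniqueness kernels documented above; dictionary_decode assumes a pyarrow release that exposes that kernel, and the commented results are examples only.

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(["a", "b", "a", None, "b"])
encoded = pc.dictionary_encode(arr)     # DictionaryArray with dictionary ["a", "b"]
decoded = pc.dictionary_decode(encoded)
assert decoded.equals(arr)              # plain-encoded round trip

distinct = pc.unique(arr)               # ["a", "b", None]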
@@ -7045,14 +7509,7 @@ def array_filter( @overload def array_take( array: _ArrayT, - indices: list[int] - | list[int | None] - | lib.Int16Array - | lib.Int32Array - | lib.Int64Array - | lib.ChunkedArray[lib.Int16Scalar] - | lib.ChunkedArray[lib.Int32Scalar] - | lib.ChunkedArray[lib.Int64Scalar], + indices: Indices | list[int | None], /, *, boundscheck: bool = True, @@ -7210,7 +7667,7 @@ def array_sort_indices(*args, **kwargs): @overload def partition_nth_indices( - array: lib.Array | lib.ChunkedArray, + array: lib.Array | lib.ChunkedArray | Sequence[int | float | str | None], /, pivot: int, *, @@ -7315,12 +7772,92 @@ def rank( If not passed, will allocate memory from the default memory pool. """ +def rank_quantile( + input: lib.Array | lib.ChunkedArray, + /, + sort_keys: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + options: RankQuantileOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: + """ + Compute quantile ranks of an array (1-based). + + This function computes a quantile rank of the input array. + By default, null values are considered greater than any other value and + are therefore sorted at the end of the input. For floating-point types, + NaNs are considered greater than any other non-null value, but smaller + than null values. + + The results are real values strictly between 0 and 1. They are + computed as in https://en.wikipedia.org/wiki/Quantile_rank + but without multiplying by 100. + + The handling of nulls and NaNs can be changed in RankQuantileOptions. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + options : pyarrow.compute.RankQuantileOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + + +rank_normal = _clone_signature(rank_quantile) +""" +Compute normal (gaussian) ranks of an array (1-based). + +This function computes a normal (gaussian) rank of the input array. +By default, null values are considered greater than any other value and +are therefore sorted at the end of the input. For floating-point types, +NaNs are considered greater than any other non-null value, but smaller +than null values. +The results are finite real values. They are obtained as if first +calling the "rank_quantile" function and then applying the normal +percent-point function (PPF) to the resulting quantile values. + +The handling of nulls and NaNs can be changed in RankQuantileOptions. + +Parameters +---------- +input : Array-like or scalar-like + Argument to compute function. +sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. 
+ Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. +null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". +options : pyarrow.compute.RankQuantileOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + @overload def select_k_unstable( - input: lib.Array | lib.ChunkedArray, + input: lib.Array | lib.ChunkedArray | lib.Table, /, k: int, - sort_keys: list[tuple[str, _Order]], + sort_keys: list[tuple[str | Expression, _Order]] | None = None, *, options: SelectKOptions | None = None, memory_pool: lib.MemoryPool | None = None, @@ -7330,11 +7867,27 @@ def select_k_unstable( input: Expression, /, k: int, - sort_keys: list[tuple[str, _Order]], + sort_keys: list[tuple[str | Expression, _Order]] | None = None, *, options: SelectKOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +@overload +def select_k_unstable( + input: lib.Array | lib.ChunkedArray | lib.Table, + /, + options: SelectKOptions, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +@overload +def select_k_unstable( + input: Expression, + /, + options: SelectKOptions, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... def select_k_unstable(*args, **kwargs): """ Select the indices of the first `k` ordered elements from the input. @@ -7369,7 +7922,7 @@ def select_k_unstable(*args, **kwargs): def sort_indices( input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, /, - sort_keys: Sequence[tuple[str, _Order]] = (), + sort_keys: Sequence[tuple[str|Expression, _Order]] = (), *, null_placement: _Placement = "at_end", options: SortOptions | None = None, @@ -7379,7 +7932,7 @@ def sort_indices( def sort_indices( input: Expression, /, - sort_keys: Sequence[tuple[str, _Order]] = (), + sort_keys: Sequence[tuple[str|Expression, _Order]] = (), *, null_placement: _Placement = "at_end", options: SortOptions | None = None, @@ -7423,7 +7976,7 @@ def list_element( ) -> Expression: ... 
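A sketch of the ordering kernels documented above; rank_quantile and rank_normal are only present in recent pyarrow releases, and the commented index values are illustrative.

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([30, 10, None, 20])
order = pc.sort_indices(arr)        # [1, 3, 0, 2]; nulls sort last by default
quantiles = pc.rank_quantile(arr, sort_keys="ascending")  # real ranks strictly between 0 and 1

tbl = pa.table({"x": [30, 10, None, 20]})
top2 = pc.select_k_unstable(tbl, k=2, sort_keys=[("x", "descending")])  # indices of the 2 largest "x"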
@overload def list_element( - lists: lib.Array[ListScalar[_DataTypeT]], + lists: lib.Array[ListScalar[_DataTypeT]] | lib.Array[lib.Scalar[lib.ListType[lib.StructType]]], index: ScalarLike, /, *, From 965910662d4b5b25c551d23efbad83870b94ca4c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 21 Aug 2025 19:55:10 +0200 Subject: [PATCH 04/26] Add Apache 2.0 license --- python/pyarrow/__init__.pyi | 18 +++++++++++++++++- python/pyarrow/__lib_pxi/__init__.pyi | 16 ++++++++++++++++ python/pyarrow/__lib_pxi/array.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/benchmark.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/builder.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/compat.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/config.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/device.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/error.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/io.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/ipc.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/memory.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/pandas_shim.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/scalar.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/table.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/tensor.pyi | 17 +++++++++++++++++ python/pyarrow/__lib_pxi/types.pyi | 17 +++++++++++++++++ python/pyarrow/_azurefs.pyi | 17 +++++++++++++++++ python/pyarrow/_compute.pyi | 17 +++++++++++++++++ python/pyarrow/_csv.pyi | 17 +++++++++++++++++ python/pyarrow/_cuda.pyi | 17 +++++++++++++++++ python/pyarrow/_dataset.pyi | 17 +++++++++++++++++ python/pyarrow/_dataset_orc.pyi | 17 +++++++++++++++++ python/pyarrow/_dataset_parquet.pyi | 17 +++++++++++++++++ .../pyarrow/_dataset_parquet_encryption.pyi | 17 +++++++++++++++++ python/pyarrow/_feather.pyi | 17 +++++++++++++++++ python/pyarrow/_flight.pyi | 17 +++++++++++++++++ python/pyarrow/_fs.pyi | 17 +++++++++++++++++ python/pyarrow/_gcsfs.pyi | 17 +++++++++++++++++ python/pyarrow/_hdfs.pyi | 17 +++++++++++++++++ python/pyarrow/_json.pyi | 17 +++++++++++++++++ python/pyarrow/_orc.pyi | 17 +++++++++++++++++ python/pyarrow/_parquet.pyi | 17 +++++++++++++++++ python/pyarrow/_parquet_encryption.pyi | 17 +++++++++++++++++ python/pyarrow/_s3fs.pyi | 17 +++++++++++++++++ python/pyarrow/_stubs_typing.pyi | 17 +++++++++++++++++ python/pyarrow/_substrait.pyi | 17 +++++++++++++++++ python/pyarrow/acero.pyi | 17 +++++++++++++++++ python/pyarrow/benchmark.pyi | 17 +++++++++++++++++ python/pyarrow/cffi.pyi | 17 +++++++++++++++++ python/pyarrow/csv.pyi | 17 +++++++++++++++++ python/pyarrow/cuda.pyi | 17 +++++++++++++++++ python/pyarrow/dataset.pyi | 17 +++++++++++++++++ python/pyarrow/feather.pyi | 17 +++++++++++++++++ python/pyarrow/flight.pyi | 17 +++++++++++++++++ python/pyarrow/fs.pyi | 17 +++++++++++++++++ python/pyarrow/gandiva.pyi | 17 +++++++++++++++++ python/pyarrow/interchange/__init__.pyi | 16 ++++++++++++++++ python/pyarrow/interchange/buffer.pyi | 17 +++++++++++++++++ python/pyarrow/interchange/column.pyi | 17 +++++++++++++++++ python/pyarrow/interchange/dataframe.pyi | 17 +++++++++++++++++ python/pyarrow/interchange/from_dataframe.pyi | 17 +++++++++++++++++ python/pyarrow/ipc.pyi | 17 +++++++++++++++++ python/pyarrow/json.pyi | 17 +++++++++++++++++ python/pyarrow/lib.pyi | 17 +++++++++++++++++ python/pyarrow/orc.pyi | 17 +++++++++++++++++ python/pyarrow/pandas_compat.pyi | 17 +++++++++++++++++ python/pyarrow/parquet/__init__.pyi | 19 ++++++++++++++++++- python/pyarrow/parquet/core.pyi | 17 
+++++++++++++++++ python/pyarrow/parquet/encryption.pyi | 17 +++++++++++++++++ python/pyarrow/substrait.pyi | 17 +++++++++++++++++ python/pyarrow/types.pyi | 17 +++++++++++++++++ python/pyarrow/util.pyi | 17 +++++++++++++++++ 63 files changed, 1070 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/__init__.pyi b/python/pyarrow/__init__.pyi index 8a0d1e870c5..ed1cad1bf80 100644 --- a/python/pyarrow/__init__.pyi +++ b/python/pyarrow/__init__.pyi @@ -1,4 +1,20 @@ -# ruff: noqa: F401, I001, E402 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + __version__: str import pyarrow.lib as _lib diff --git a/python/pyarrow/__lib_pxi/__init__.pyi b/python/pyarrow/__lib_pxi/__init__.pyi index e69de29bb2d..13a83393a91 100644 --- a/python/pyarrow/__lib_pxi/__init__.pyi +++ b/python/pyarrow/__lib_pxi/__init__.pyi @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/pyarrow/__lib_pxi/array.pyi b/python/pyarrow/__lib_pxi/array.pyi index 9283f57b69f..c14cd1b8c44 100644 --- a/python/pyarrow/__lib_pxi/array.pyi +++ b/python/pyarrow/__lib_pxi/array.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ import datetime as dt import sys diff --git a/python/pyarrow/__lib_pxi/benchmark.pyi b/python/pyarrow/__lib_pxi/benchmark.pyi index 66981bf0f51..592561636af 100644 --- a/python/pyarrow/__lib_pxi/benchmark.pyi +++ b/python/pyarrow/__lib_pxi/benchmark.pyi @@ -1 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + def benchmark_PandasObjectIsNull(list) -> None: ... # noqa: N802 diff --git a/python/pyarrow/__lib_pxi/builder.pyi b/python/pyarrow/__lib_pxi/builder.pyi index 4a0e9ca4708..39372f8e512 100644 --- a/python/pyarrow/__lib_pxi/builder.pyi +++ b/python/pyarrow/__lib_pxi/builder.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import Iterable from pyarrow.lib import MemoryPool, _Weakrefable diff --git a/python/pyarrow/__lib_pxi/compat.pyi b/python/pyarrow/__lib_pxi/compat.pyi index ae667be453e..2ea013555c0 100644 --- a/python/pyarrow/__lib_pxi/compat.pyi +++ b/python/pyarrow/__lib_pxi/compat.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + def encode_file_path(path: str | bytes) -> bytes: ... def tobytes(o: str | bytes) -> bytes: ... def frombytes(o: bytes, *, safe: bool = False): ... 
diff --git a/python/pyarrow/__lib_pxi/config.pyi b/python/pyarrow/__lib_pxi/config.pyi index 166e10c9734..7c2eb8a9c98 100644 --- a/python/pyarrow/__lib_pxi/config.pyi +++ b/python/pyarrow/__lib_pxi/config.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import NamedTuple class VersionInfo(NamedTuple): diff --git a/python/pyarrow/__lib_pxi/device.pyi b/python/pyarrow/__lib_pxi/device.pyi index d1b9f39eedd..6c4f1fdeeea 100644 --- a/python/pyarrow/__lib_pxi/device.pyi +++ b/python/pyarrow/__lib_pxi/device.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import enum from pyarrow.lib import _Weakrefable diff --git a/python/pyarrow/__lib_pxi/error.pyi b/python/pyarrow/__lib_pxi/error.pyi index 981ed51e680..c1e1a04ee40 100644 --- a/python/pyarrow/__lib_pxi/error.pyi +++ b/python/pyarrow/__lib_pxi/error.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import sys if sys.version_info >= (3, 11): diff --git a/python/pyarrow/__lib_pxi/io.pyi b/python/pyarrow/__lib_pxi/io.pyi index d882fd79d57..dca26a52940 100644 --- a/python/pyarrow/__lib_pxi/io.pyi +++ b/python/pyarrow/__lib_pxi/io.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import sys from collections.abc import Callable diff --git a/python/pyarrow/__lib_pxi/ipc.pyi b/python/pyarrow/__lib_pxi/ipc.pyi index 3d72892061e..819326443cf 100644 --- a/python/pyarrow/__lib_pxi/ipc.pyi +++ b/python/pyarrow/__lib_pxi/ipc.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import enum import sys diff --git a/python/pyarrow/__lib_pxi/memory.pyi b/python/pyarrow/__lib_pxi/memory.pyi index 57a3bb4f1b3..e969e3738b8 100644 --- a/python/pyarrow/__lib_pxi/memory.pyi +++ b/python/pyarrow/__lib_pxi/memory.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from pyarrow.lib import _Weakrefable class MemoryPool(_Weakrefable): diff --git a/python/pyarrow/__lib_pxi/pandas_shim.pyi b/python/pyarrow/__lib_pxi/pandas_shim.pyi index 0e80fae4ebf..ae8460cc2b3 100644 --- a/python/pyarrow/__lib_pxi/pandas_shim.pyi +++ b/python/pyarrow/__lib_pxi/pandas_shim.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from types import ModuleType from typing import Any, Iterable, TypeGuard diff --git a/python/pyarrow/__lib_pxi/scalar.pyi b/python/pyarrow/__lib_pxi/scalar.pyi index 77368bb264b..c6819f7e863 100644 --- a/python/pyarrow/__lib_pxi/scalar.pyi +++ b/python/pyarrow/__lib_pxi/scalar.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import collections.abc import datetime as dt import sys diff --git a/python/pyarrow/__lib_pxi/table.pyi b/python/pyarrow/__lib_pxi/table.pyi index 34960e2b903..5ad66f9d06c 100644 --- a/python/pyarrow/__lib_pxi/table.pyi +++ b/python/pyarrow/__lib_pxi/table.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import datetime as dt import sys diff --git a/python/pyarrow/__lib_pxi/tensor.pyi b/python/pyarrow/__lib_pxi/tensor.pyi index d849abd0f1f..5ad950c84d0 100644 --- a/python/pyarrow/__lib_pxi/tensor.pyi +++ b/python/pyarrow/__lib_pxi/tensor.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + import sys if sys.version_info >= (3, 11): diff --git a/python/pyarrow/__lib_pxi/types.pyi b/python/pyarrow/__lib_pxi/types.pyi index 5cac864c3cc..aa965e3506c 100644 --- a/python/pyarrow/__lib_pxi/types.pyi +++ b/python/pyarrow/__lib_pxi/types.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import datetime as dt import sys diff --git a/python/pyarrow/_azurefs.pyi b/python/pyarrow/_azurefs.pyi index 317943ce20f..b9a83f01c56 100644 --- a/python/pyarrow/_azurefs.pyi +++ b/python/pyarrow/_azurefs.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import Literal from ._fs import FileSystem diff --git a/python/pyarrow/_compute.pyi b/python/pyarrow/_compute.pyi index 61ccb233feb..fa80304cf91 100644 --- a/python/pyarrow/_compute.pyi +++ b/python/pyarrow/_compute.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from typing import ( Any, Callable, diff --git a/python/pyarrow/_csv.pyi b/python/pyarrow/_csv.pyi index 2f49f8c9a6c..c490d6be93a 100644 --- a/python/pyarrow/_csv.pyi +++ b/python/pyarrow/_csv.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from dataclasses import dataclass, field from typing import IO, Any, Callable, Literal diff --git a/python/pyarrow/_cuda.pyi b/python/pyarrow/_cuda.pyi index ad52b2f380f..6bcd9868d7f 100644 --- a/python/pyarrow/_cuda.pyi +++ b/python/pyarrow/_cuda.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import Any import cuda # type: ignore[import-not-found] diff --git a/python/pyarrow/_dataset.pyi b/python/pyarrow/_dataset.pyi index 114bf625983..4980cb0420f 100644 --- a/python/pyarrow/_dataset.pyi +++ b/python/pyarrow/_dataset.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import sys if sys.version_info >= (3, 11): diff --git a/python/pyarrow/_dataset_orc.pyi b/python/pyarrow/_dataset_orc.pyi index 9c4ac04198f..d4e5784750f 100644 --- a/python/pyarrow/_dataset_orc.pyi +++ b/python/pyarrow/_dataset_orc.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from ._dataset import FileFormat class OrcFileFormat(FileFormat): diff --git a/python/pyarrow/_dataset_parquet.pyi b/python/pyarrow/_dataset_parquet.pyi index cbcc17235f1..007d3404a18 100644 --- a/python/pyarrow/_dataset_parquet.pyi +++ b/python/pyarrow/_dataset_parquet.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from dataclasses import dataclass from typing import IO, Any, Iterable, TypedDict diff --git a/python/pyarrow/_dataset_parquet_encryption.pyi b/python/pyarrow/_dataset_parquet_encryption.pyi index 7623275b865..be40c0b39b3 100644 --- a/python/pyarrow/_dataset_parquet_encryption.pyi +++ b/python/pyarrow/_dataset_parquet_encryption.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions from ._parquet import FileDecryptionProperties from ._parquet_encryption import CryptoFactory, EncryptionConfiguration, KmsConnectionConfig diff --git a/python/pyarrow/_feather.pyi b/python/pyarrow/_feather.pyi index 8bb914ba45d..373fe38cdce 100644 --- a/python/pyarrow/_feather.pyi +++ b/python/pyarrow/_feather.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import IO from _typeshed import StrPath diff --git a/python/pyarrow/_flight.pyi b/python/pyarrow/_flight.pyi index 4450c42df49..a79475a8796 100644 --- a/python/pyarrow/_flight.pyi +++ b/python/pyarrow/_flight.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import asyncio import enum import sys diff --git a/python/pyarrow/_fs.pyi b/python/pyarrow/_fs.pyi index 7670ef5230d..45d4d922ac2 100644 --- a/python/pyarrow/_fs.pyi +++ b/python/pyarrow/_fs.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import datetime as dt import enum import sys diff --git a/python/pyarrow/_gcsfs.pyi b/python/pyarrow/_gcsfs.pyi index 4fc7ea68e48..0ced106615a 100644 --- a/python/pyarrow/_gcsfs.pyi +++ b/python/pyarrow/_gcsfs.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import datetime as dt from ._fs import FileSystem diff --git a/python/pyarrow/_hdfs.pyi b/python/pyarrow/_hdfs.pyi index 200f669379b..ed367379171 100644 --- a/python/pyarrow/_hdfs.pyi +++ b/python/pyarrow/_hdfs.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from _typeshed import StrPath from ._fs import FileSystem diff --git a/python/pyarrow/_json.pyi b/python/pyarrow/_json.pyi index 43d2ae83cd8..f416b4b29c6 100644 --- a/python/pyarrow/_json.pyi +++ b/python/pyarrow/_json.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import IO, Any, Literal from _typeshed import StrPath diff --git a/python/pyarrow/_orc.pyi b/python/pyarrow/_orc.pyi index 71bf0dde9ba..7587cc121c3 100644 --- a/python/pyarrow/_orc.pyi +++ b/python/pyarrow/_orc.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from typing import IO, Literal from .lib import ( diff --git a/python/pyarrow/_parquet.pyi b/python/pyarrow/_parquet.pyi index a9187df0428..c75337cbf3b 100644 --- a/python/pyarrow/_parquet.pyi +++ b/python/pyarrow/_parquet.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict from _typeshed import StrPath diff --git a/python/pyarrow/_parquet_encryption.pyi b/python/pyarrow/_parquet_encryption.pyi index c707edb844a..e1228cbdb5a 100644 --- a/python/pyarrow/_parquet_encryption.pyi +++ b/python/pyarrow/_parquet_encryption.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import datetime as dt from typing import Callable diff --git a/python/pyarrow/_s3fs.pyi b/python/pyarrow/_s3fs.pyi index fc13c498bd9..e2f5f147096 100644 --- a/python/pyarrow/_s3fs.pyi +++ b/python/pyarrow/_s3fs.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ import enum from typing import Literal, NotRequired, Required, TypedDict diff --git a/python/pyarrow/_stubs_typing.pyi b/python/pyarrow/_stubs_typing.pyi index c259513f1ea..549dc4059c3 100644 --- a/python/pyarrow/_stubs_typing.pyi +++ b/python/pyarrow/_stubs_typing.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import datetime as dt from collections.abc import Sequence diff --git a/python/pyarrow/_substrait.pyi b/python/pyarrow/_substrait.pyi index ff226e9521b..ee78e9720fe 100644 --- a/python/pyarrow/_substrait.pyi +++ b/python/pyarrow/_substrait.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import Any, Callable from ._compute import Expression diff --git a/python/pyarrow/acero.pyi b/python/pyarrow/acero.pyi index 8a520bdc24a..2abb608b32c 100644 --- a/python/pyarrow/acero.pyi +++ b/python/pyarrow/acero.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import sys if sys.version_info >= (3, 11): diff --git a/python/pyarrow/benchmark.pyi b/python/pyarrow/benchmark.pyi index 048973301dc..3ea8f70bc34 100644 --- a/python/pyarrow/benchmark.pyi +++ b/python/pyarrow/benchmark.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from pyarrow.lib import benchmark_PandasObjectIsNull __all__ = ["benchmark_PandasObjectIsNull"] diff --git a/python/pyarrow/cffi.pyi b/python/pyarrow/cffi.pyi index 2ae945c5974..e4f077d7155 100644 --- a/python/pyarrow/cffi.pyi +++ b/python/pyarrow/cffi.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import cffi c_source: str diff --git a/python/pyarrow/csv.pyi b/python/pyarrow/csv.pyi index 510229d7e72..a7abd413aab 100644 --- a/python/pyarrow/csv.pyi +++ b/python/pyarrow/csv.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from pyarrow._csv import ( ISO8601, ConvertOptions, diff --git a/python/pyarrow/cuda.pyi b/python/pyarrow/cuda.pyi index e11baf7d4e7..0394965bb73 100644 --- a/python/pyarrow/cuda.pyi +++ b/python/pyarrow/cuda.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from pyarrow._cuda import ( BufferReader, BufferWriter, diff --git a/python/pyarrow/dataset.pyi b/python/pyarrow/dataset.pyi index 98f1a38aa85..6cb7fed43e6 100644 --- a/python/pyarrow/dataset.pyi +++ b/python/pyarrow/dataset.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload from _typeshed import StrPath diff --git a/python/pyarrow/feather.pyi b/python/pyarrow/feather.pyi index 9451ee15763..ce8d83dbcd9 100644 --- a/python/pyarrow/feather.pyi +++ b/python/pyarrow/feather.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import IO, Literal import pandas as pd diff --git a/python/pyarrow/flight.pyi b/python/pyarrow/flight.pyi index 9b806ccf305..dcc6ee2244b 100644 --- a/python/pyarrow/flight.pyi +++ b/python/pyarrow/flight.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + from pyarrow._flight import ( Action, ActionType, diff --git a/python/pyarrow/fs.pyi b/python/pyarrow/fs.pyi index 6bf75616c13..6c5a0af8d19 100644 --- a/python/pyarrow/fs.pyi +++ b/python/pyarrow/fs.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from pyarrow._fs import ( # noqa FileSelector, FileType, diff --git a/python/pyarrow/gandiva.pyi b/python/pyarrow/gandiva.pyi index a344f885b29..bc07e15c4a6 100644 --- a/python/pyarrow/gandiva.pyi +++ b/python/pyarrow/gandiva.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import Iterable, Literal from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable diff --git a/python/pyarrow/interchange/__init__.pyi b/python/pyarrow/interchange/__init__.pyi index e69de29bb2d..13a83393a91 100644 --- a/python/pyarrow/interchange/__init__.pyi +++ b/python/pyarrow/interchange/__init__.pyi @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
diff --git a/python/pyarrow/interchange/buffer.pyi b/python/pyarrow/interchange/buffer.pyi index 46673961a75..78d1dabb8b7 100644 --- a/python/pyarrow/interchange/buffer.pyi +++ b/python/pyarrow/interchange/buffer.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import enum from pyarrow.lib import Buffer diff --git a/python/pyarrow/interchange/column.pyi b/python/pyarrow/interchange/column.pyi index e6662867b6b..ce7e169bfb5 100644 --- a/python/pyarrow/interchange/column.pyi +++ b/python/pyarrow/interchange/column.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import enum from typing import Any, Iterable, TypeAlias, TypedDict diff --git a/python/pyarrow/interchange/dataframe.pyi b/python/pyarrow/interchange/dataframe.pyi index 526a58926a9..a7ea6aeac74 100644 --- a/python/pyarrow/interchange/dataframe.pyi +++ b/python/pyarrow/interchange/dataframe.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ import sys if sys.version_info >= (3, 11): diff --git a/python/pyarrow/interchange/from_dataframe.pyi b/python/pyarrow/interchange/from_dataframe.pyi index b04b6268975..aa6217b6181 100644 --- a/python/pyarrow/interchange/from_dataframe.pyi +++ b/python/pyarrow/interchange/from_dataframe.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import Any, Protocol, TypeAlias from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table diff --git a/python/pyarrow/ipc.pyi b/python/pyarrow/ipc.pyi index c7f2af004d4..985cf0678f9 100644 --- a/python/pyarrow/ipc.pyi +++ b/python/pyarrow/ipc.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from io import IOBase import pandas as pd diff --git a/python/pyarrow/json.pyi b/python/pyarrow/json.pyi index db1d35e0b8b..67768db42e4 100644 --- a/python/pyarrow/json.pyi +++ b/python/pyarrow/json.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json __all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/pyarrow/lib.pyi b/python/pyarrow/lib.pyi index 1698b55520b..3292c52b2c0 100644 --- a/python/pyarrow/lib.pyi +++ b/python/pyarrow/lib.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + # ruff: noqa: F403 from typing import NamedTuple diff --git a/python/pyarrow/orc.pyi b/python/pyarrow/orc.pyi index 2eba8d40a11..557f38a2b9e 100644 --- a/python/pyarrow/orc.pyi +++ b/python/pyarrow/orc.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import sys if sys.version_info >= (3, 11): diff --git a/python/pyarrow/pandas_compat.pyi b/python/pyarrow/pandas_compat.pyi index efbd05ac2fe..82fcb19ad97 100644 --- a/python/pyarrow/pandas_compat.pyi +++ b/python/pyarrow/pandas_compat.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from typing import Any, TypedDict, TypeVar import numpy as np diff --git a/python/pyarrow/parquet/__init__.pyi b/python/pyarrow/parquet/__init__.pyi index 4ef88705809..8d0b5374ea0 100644 --- a/python/pyarrow/parquet/__init__.pyi +++ b/python/pyarrow/parquet/__init__.pyi @@ -1 +1,18 @@ -from .core import * # noqa +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from .core import * # noqa diff --git a/python/pyarrow/parquet/core.pyi b/python/pyarrow/parquet/core.pyi index 56b2c8447d9..f5ac0510ffc 100644 --- a/python/pyarrow/parquet/core.pyi +++ b/python/pyarrow/parquet/core.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import sys from pathlib import Path diff --git a/python/pyarrow/parquet/encryption.pyi b/python/pyarrow/parquet/encryption.pyi index 5a77dae7ef7..fe9a454e593 100644 --- a/python/pyarrow/parquet/encryption.pyi +++ b/python/pyarrow/parquet/encryption.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from pyarrow._parquet_encryption import ( CryptoFactory, DecryptionConfiguration, diff --git a/python/pyarrow/substrait.pyi b/python/pyarrow/substrait.pyi index a56a8a5b40f..b78bbd8aebd 100644 --- a/python/pyarrow/substrait.pyi +++ b/python/pyarrow/substrait.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from pyarrow._substrait import ( BoundExpressions, SubstraitSchema, diff --git a/python/pyarrow/types.pyi b/python/pyarrow/types.pyi index 0cb4f6171d3..3ead6830421 100644 --- a/python/pyarrow/types.pyi +++ b/python/pyarrow/types.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import sys from typing import Any diff --git a/python/pyarrow/util.pyi b/python/pyarrow/util.pyi index c2ecf7d6b61..5c9687bb83f 100644 --- a/python/pyarrow/util.pyi +++ b/python/pyarrow/util.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from collections.abc import Callable from os import PathLike from typing import Any, Protocol, Sequence, TypeVar From dd34a49bdece15b643255ee3a172803df519d8cf Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 21 Aug 2025 21:29:07 +0200 Subject: [PATCH 05/26] Add a pyright check to CI --- .github/workflows/python.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 0d12accda4e..a4aa53e5cdc 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -138,6 +138,11 @@ jobs: continue-on-error: true run: archery docker push ${{ matrix.image }} + - name: Type check with pyright + run: |- + python -m pip install pyright + pushd python; python -m pyright + macos: name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} Python 3 runs-on: macos-${{ matrix.macos-version }} From e3172115d39558594b789940579e3fed3b10c37a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 2 Sep 2025 23:02:42 +0200 Subject: [PATCH 06/26] automate docstring updates for stubfiles --- dev/update_stub_docstrings.py | 118 ++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 dev/update_stub_docstrings.py diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py new file mode 100644 index 00000000000..72db8b0d000 --- /dev/null +++ b/dev/update_stub_docstrings.py @@ -0,0 +1,118 @@ +# Utility to extract docstrings from pyarrow and update +# docstrings in stubfiles. +# +# Usage +# ===== +# +# python ../dev/update_stub_docstrings.py -s ./pyarrow/compute.pyi + + +import os +from pathlib import Path +from textwrap import indent + +import click +import griffe +import libcst as cst + +docstrings_map = {} + + +def extract_docstrings(pckg, path=""): + if "filepath" in pckg and pckg["filepath"].endswith(".pyi"): + return + if "docstring" in pckg: + docstrings_map[path] = pckg["docstring"].value + + for name, pckg in pckg.get("members", {}).items(): + extract_docstrings(pckg, path=f"{path}.{name}") + + +def _is_docstring_node(node): + """Checks if a node is a docstring.""" + return ( + isinstance(node, cst.SimpleStatementLine) and + isinstance(node.body[0], cst.Expr) and + isinstance(node.body[0].value, cst.SimpleString) + ) + + +class ClonedSignatureDocstringTransformer(cst.CSTTransformer): + def __init__(self, docstrings_map, module_name): + self.docstrings_map = docstrings_map + self.module_name = module_name + self.name_of_function = None + + def leave_Assign(self, original_node, updated_node): + target = original_node.targets[0].target + value = original_node.value + + if isinstance(target, cst.Name) and isinstance(value, cst.Call) and \ + value.func.value == "_clone_signature": + self.name_of_function = f"{self.module_name}.{target.value}" + return updated_node + + def leave_SimpleStatementLine(self, original_node, updated_node): + if self.name_of_function: + if len(updated_node.body) > 0 and _is_docstring_node(updated_node): + comment_content = self.docstrings_map[self.name_of_function].strip() + self.name_of_function = None + + new_string_node = cst.SimpleString(value=f'"""\n{comment_content}\n"""') + new_expr_node = updated_node.body[0].with_changes(value=new_string_node) + new_body = [new_expr_node] + list(updated_node.body[1:]) + updated_node = updated_node.with_changes(body=new_body) + + return updated_node + + +class FunctionDocstringTransformer(cst.CSTTransformer): + def __init__(self, docstrings_map, module_name): + self.docstrings_map = docstrings_map + 
self.module_name = module_name + + def leave_FunctionDef(self, original_node, updated_node): + full_name = f"{self.module_name}.{original_node.name.value}" + + # Check if we have a docstring for this function + if full_name in self.docstrings_map: + # Check if the function already has a docstring + body_list = list(updated_node.body.body) + has_docstring = len(body_list) > 0 and _is_docstring_node(body_list[0]) + + if has_docstring: + # Replace existing docstring + docstring = indent(self.docstrings_map[full_name], " ").strip() + docstring_value = f'"""\n {docstring}\n """' + new_docstring_node = cst.SimpleStatementLine( + body=[cst.Expr(value=cst.SimpleString(value=docstring_value))] + ) + new_body = [new_docstring_node] + body_list[1:] + return updated_node.with_changes( + body=updated_node.body.with_changes(body=new_body) + ) + + return updated_node + +@click.command() +@click.option('--stub_file', '-s', type=click.Path(resolve_path=True)) +def update_stub_file(stub_file): + package = griffe.load("pyarrow", try_relative_path=False, force_inspection=True, resolve_aliases=True) + extract_docstrings(package.as_dict(), "pyarrow") + + with open(stub_file, 'r') as f: + tree = cst.parse_module(f.read()) + + cloned_signature_transformer = ClonedSignatureDocstringTransformer(docstrings_map, "pyarrow.compute") + function_docstring_transformer = FunctionDocstringTransformer(docstrings_map, "pyarrow.compute") + + modified_tree = tree.visit(function_docstring_transformer) + modified_tree = modified_tree.visit(cloned_signature_transformer) + + + # Write the modified code + with open(stub_file, "w") as f: + f.write(modified_tree.code) + +if __name__ == "__main__": + update_stub_file(obj={}) From 30d0ca02e0d3c4af21065e2b78db9ef6cea9afc4 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 12 Sep 2025 15:30:33 +0200 Subject: [PATCH 07/26] Remove some stubs --- python/pyarrow/__lib_pxi/benchmark.pyi | 18 - python/pyarrow/__lib_pxi/builder.pyi | 106 - python/pyarrow/__lib_pxi/compat.pyi | 22 - python/pyarrow/__lib_pxi/config.pyi | 58 - python/pyarrow/__lib_pxi/device.pyi | 105 - python/pyarrow/__lib_pxi/error.pyi | 70 - python/pyarrow/__lib_pxi/ipc.pyi | 722 -- python/pyarrow/__lib_pxi/pandas_shim.pyi | 68 - python/pyarrow/__lib_pxi/table.pyi | 5640 --------------- python/pyarrow/_azurefs.pyi | 91 - python/pyarrow/_compute.pyi | 1768 ----- python/pyarrow/_csv.pyi | 658 -- python/pyarrow/_cuda.pyi | 573 -- python/pyarrow/_dataset.pyi | 2318 ------ python/pyarrow/_dataset_orc.pyi | 23 - python/pyarrow/_dataset_parquet.pyi | 331 - python/pyarrow/_flight.pyi | 1397 ---- python/pyarrow/_fs.pyi | 1022 --- python/pyarrow/_hdfs.pyi | 92 - python/pyarrow/_json.pyi | 186 - python/pyarrow/_orc.pyi | 73 - python/pyarrow/_parquet.pyi | 462 -- python/pyarrow/_s3fs.pyi | 91 - python/pyarrow/_substrait.pyi | 56 - python/pyarrow/acero.pyi | 102 - python/pyarrow/benchmark.pyi | 20 - python/pyarrow/cffi.pyi | 21 - python/pyarrow/compute.pyi | 8332 ---------------------- python/pyarrow/cuda.py | 25 - python/pyarrow/dataset.pyi | 246 - python/pyarrow/feather.pyi | 67 - python/pyarrow/gandiva.pyi | 82 - python/pyarrow/ipc.pyi | 140 - python/pyarrow/json.pyi | 20 - python/pyarrow/orc.pyi | 296 - python/pyarrow/pandas_compat.pyi | 71 - python/pyarrow/substrait.pyi | 38 - python/pyarrow/util.pyi | 44 - 38 files changed, 25454 deletions(-) delete mode 100644 python/pyarrow/__lib_pxi/benchmark.pyi delete mode 100644 python/pyarrow/__lib_pxi/builder.pyi delete mode 100644 python/pyarrow/__lib_pxi/compat.pyi delete mode 100644 
python/pyarrow/__lib_pxi/config.pyi delete mode 100644 python/pyarrow/__lib_pxi/device.pyi delete mode 100644 python/pyarrow/__lib_pxi/error.pyi delete mode 100644 python/pyarrow/__lib_pxi/ipc.pyi delete mode 100644 python/pyarrow/__lib_pxi/pandas_shim.pyi delete mode 100644 python/pyarrow/__lib_pxi/table.pyi delete mode 100644 python/pyarrow/_azurefs.pyi delete mode 100644 python/pyarrow/_compute.pyi delete mode 100644 python/pyarrow/_csv.pyi delete mode 100644 python/pyarrow/_cuda.pyi delete mode 100644 python/pyarrow/_dataset.pyi delete mode 100644 python/pyarrow/_dataset_orc.pyi delete mode 100644 python/pyarrow/_dataset_parquet.pyi delete mode 100644 python/pyarrow/_flight.pyi delete mode 100644 python/pyarrow/_fs.pyi delete mode 100644 python/pyarrow/_hdfs.pyi delete mode 100644 python/pyarrow/_json.pyi delete mode 100644 python/pyarrow/_orc.pyi delete mode 100644 python/pyarrow/_parquet.pyi delete mode 100644 python/pyarrow/_s3fs.pyi delete mode 100644 python/pyarrow/_substrait.pyi delete mode 100644 python/pyarrow/acero.pyi delete mode 100644 python/pyarrow/benchmark.pyi delete mode 100644 python/pyarrow/cffi.pyi delete mode 100644 python/pyarrow/compute.pyi delete mode 100644 python/pyarrow/cuda.py delete mode 100644 python/pyarrow/dataset.pyi delete mode 100644 python/pyarrow/feather.pyi delete mode 100644 python/pyarrow/gandiva.pyi delete mode 100644 python/pyarrow/ipc.pyi delete mode 100644 python/pyarrow/json.pyi delete mode 100644 python/pyarrow/orc.pyi delete mode 100644 python/pyarrow/pandas_compat.pyi delete mode 100644 python/pyarrow/substrait.pyi delete mode 100644 python/pyarrow/util.pyi diff --git a/python/pyarrow/__lib_pxi/benchmark.pyi b/python/pyarrow/__lib_pxi/benchmark.pyi deleted file mode 100644 index 592561636af..00000000000 --- a/python/pyarrow/__lib_pxi/benchmark.pyi +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -def benchmark_PandasObjectIsNull(list) -> None: ... # noqa: N802 diff --git a/python/pyarrow/__lib_pxi/builder.pyi b/python/pyarrow/__lib_pxi/builder.pyi deleted file mode 100644 index 39372f8e512..00000000000 --- a/python/pyarrow/__lib_pxi/builder.pyi +++ /dev/null @@ -1,106 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import Iterable - -from pyarrow.lib import MemoryPool, _Weakrefable - -from .array import StringArray, StringViewArray - -class StringBuilder(_Weakrefable): - """ - Builder class for UTF8 strings. - - This class exposes facilities for incrementally adding string values and - building the null bitmap for a pyarrow.Array (type='string'). - """ - def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def append(self, value: str | bytes | None): - """ - Append a single value to the builder. - - The value can either be a string/bytes object or a null value - (np.nan or None). - - Parameters - ---------- - value : string/bytes or np.nan/None - The value to append to the string array builder. - """ - def append_values(self, values: Iterable[str | bytes | None]): - """ - Append all the values from an iterable. - - Parameters - ---------- - values : iterable of string/bytes or np.nan/None values - The values to append to the string array builder. - """ - def finish(self) -> StringArray: - """ - Return result of builder as an Array object; also resets the builder. - - Returns - ------- - array : pyarrow.Array - """ - @property - def null_count(self) -> int: ... - def __len__(self) -> int: ... - -class StringViewBuilder(_Weakrefable): - """ - Builder class for UTF8 string views. - - This class exposes facilities for incrementally adding string values and - building the null bitmap for a pyarrow.Array (type='string_view'). - """ - def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def append(self, value: str | bytes | None): - """ - Append a single value to the builder. - - The value can either be a string/bytes object or a null value - (np.nan or None). - - Parameters - ---------- - value : string/bytes or np.nan/None - The value to append to the string array builder. - """ - def append_values(self, values: Iterable[str | bytes | None]): - """ - Append all the values from an iterable. - - Parameters - ---------- - values : iterable of string/bytes or np.nan/None values - The values to append to the string array builder. - """ - def finish(self) -> StringViewArray: - """ - Return result of builder as an Array object; also resets the builder. - - Returns - ------- - array : pyarrow.Array - """ - @property - def null_count(self) -> int: ... - def __len__(self) -> int: ... - -__all__ = ["StringBuilder", "StringViewBuilder"] diff --git a/python/pyarrow/__lib_pxi/compat.pyi b/python/pyarrow/__lib_pxi/compat.pyi deleted file mode 100644 index 2ea013555c0..00000000000 --- a/python/pyarrow/__lib_pxi/compat.pyi +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -def encode_file_path(path: str | bytes) -> bytes: ... -def tobytes(o: str | bytes) -> bytes: ... -def frombytes(o: bytes, *, safe: bool = False): ... - -__all__ = ["encode_file_path", "tobytes", "frombytes"] diff --git a/python/pyarrow/__lib_pxi/config.pyi b/python/pyarrow/__lib_pxi/config.pyi deleted file mode 100644 index 7c2eb8a9c98..00000000000 --- a/python/pyarrow/__lib_pxi/config.pyi +++ /dev/null @@ -1,58 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import NamedTuple - -class VersionInfo(NamedTuple): - major: int - minor: int - patch: int - -class BuildInfo(NamedTuple): - version: str - version_info: VersionInfo - so_version: str - full_so_version: str - compiler_id: str - compiler_version: str - compiler_flags: str - git_id: str - git_description: str - package_kind: str - build_type: str - -class RuntimeInfo(NamedTuple): - simd_level: str - detected_simd_level: str - -cpp_build_info: BuildInfo -cpp_version: str -cpp_version_info: VersionInfo - -def runtime_info() -> RuntimeInfo: ... -def set_timezone_db_path(path: str) -> None: ... - -__all__ = [ - "VersionInfo", - "BuildInfo", - "RuntimeInfo", - "cpp_build_info", - "cpp_version", - "cpp_version_info", - "runtime_info", - "set_timezone_db_path", -] diff --git a/python/pyarrow/__lib_pxi/device.pyi b/python/pyarrow/__lib_pxi/device.pyi deleted file mode 100644 index 6c4f1fdeeea..00000000000 --- a/python/pyarrow/__lib_pxi/device.pyi +++ /dev/null @@ -1,105 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
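For reference, the builder stubs removed a few hunks above (`__lib_pxi/builder.pyi`) describe an incremental string-building API. A minimal usage sketch, assuming `StringBuilder` is importable from `pyarrow.lib` as that stub declares; the values and printed output are illustrative only:

from pyarrow.lib import StringBuilder

b = StringBuilder()
b.append("foo")                  # append a single value
b.append(None)                   # None (or np.nan) appends a null
b.append_values(["bar", "baz"])  # bulk-append from an iterable
print(len(b), b.null_count)      # 4 1
arr = b.finish()                 # returns a string Array and resets the builder
print(arr.to_pylist())           # ['foo', None, 'bar', 'baz']
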
- -import enum - -from pyarrow.lib import _Weakrefable - -class DeviceAllocationType(enum.Flag): - CPU = enum.auto() - CUDA = enum.auto() - CUDA_HOST = enum.auto() - OPENCL = enum.auto() - VULKAN = enum.auto() - METAL = enum.auto() - VPI = enum.auto() - ROCM = enum.auto() - ROCM_HOST = enum.auto() - EXT_DEV = enum.auto() - CUDA_MANAGED = enum.auto() - ONEAPI = enum.auto() - WEBGPU = enum.auto() - HEXAGON = enum.auto() - -class Device(_Weakrefable): - """ - Abstract interface for hardware devices - - This object represents a device with access to some memory spaces. - When handling a Buffer or raw memory address, it allows deciding in which - context the raw memory address should be interpreted - (e.g. CPU-accessible memory, or embedded memory on some particular GPU). - """ - - @property - def type_name(self) -> str: - """ - A shorthand for this device's type. - """ - @property - def device_id(self) -> int: - """ - A device ID to identify this device if there are multiple of this type. - - If there is no "device_id" equivalent (such as for the main CPU device on - non-numa systems) returns -1. - """ - @property - def is_cpu(self) -> bool: - """ - Whether this device is the main CPU device. - - This shorthand method is very useful when deciding whether a memory address - is CPU-accessible. - """ - @property - def device_type(self) -> DeviceAllocationType: - """ - Return the DeviceAllocationType of this device. - """ - -class MemoryManager(_Weakrefable): - """ - An object that provides memory management primitives. - - A MemoryManager is always tied to a particular Device instance. - It can also have additional parameters (such as a MemoryPool to - allocate CPU memory). - - """ - @property - def device(self) -> Device: - """ - The device this MemoryManager is tied to. - """ - @property - def is_cpu(self) -> bool: - """ - Whether this MemoryManager is tied to the main CPU device. - - This shorthand method is very useful when deciding whether a memory - address is CPU-accessible. - """ - -def default_cpu_memory_manager() -> MemoryManager: - """ - Return the default CPU MemoryManager instance. - - The returned singleton instance uses the default MemoryPool. - """ - -__all__ = ["DeviceAllocationType", "Device", "MemoryManager", "default_cpu_memory_manager"] diff --git a/python/pyarrow/__lib_pxi/error.pyi b/python/pyarrow/__lib_pxi/error.pyi deleted file mode 100644 index c1e1a04ee40..00000000000 --- a/python/pyarrow/__lib_pxi/error.pyi +++ /dev/null @@ -1,70 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import sys - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self - -class ArrowException(Exception): ... -class ArrowInvalid(ValueError, ArrowException): ... 
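The device stubs deleted just above type a small introspection surface for devices and memory managers. A minimal sketch of how it can be exercised, assuming `default_cpu_memory_manager` is re-exported at the package level as in recent PyArrow releases:

import pyarrow as pa

mm = pa.default_cpu_memory_manager()  # MemoryManager tied to the main CPU device
print(mm.is_cpu)                      # True
dev = mm.device
print(dev.is_cpu)                     # True
print(dev.device_id)                  # -1: the main CPU device has no per-device id
print(dev.type_name)                  # shorthand name for this device's type
print(dev.device_type)                # the DeviceAllocationType member for CPU
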
-class ArrowMemoryError(MemoryError, ArrowException): ... -class ArrowKeyError(KeyError, ArrowException): ... -class ArrowTypeError(TypeError, ArrowException): ... -class ArrowNotImplementedError(NotImplementedError, ArrowException): ... -class ArrowCapacityError(ArrowException): ... -class ArrowIndexError(IndexError, ArrowException): ... -class ArrowSerializationError(ArrowException): ... - -class ArrowCancelled(ArrowException): - signum: int | None - def __init__(self, message: str, signum: int | None = None) -> None: ... - -ArrowIOError = IOError - -class StopToken: ... - -def enable_signal_handlers(enable: bool) -> None: ... - -have_signal_refcycle: bool - -class SignalStopHandler: - def __enter__(self) -> Self: ... - def __exit__(self, exc_type, exc_value, exc_tb) -> None: ... - def __dealloc__(self) -> None: ... - @property - def stop_token(self) -> StopToken: ... - -__all__ = [ - "ArrowException", - "ArrowInvalid", - "ArrowMemoryError", - "ArrowKeyError", - "ArrowTypeError", - "ArrowNotImplementedError", - "ArrowCapacityError", - "ArrowIndexError", - "ArrowSerializationError", - "ArrowCancelled", - "ArrowIOError", - "StopToken", - "enable_signal_handlers", - "have_signal_refcycle", - "SignalStopHandler", -] diff --git a/python/pyarrow/__lib_pxi/ipc.pyi b/python/pyarrow/__lib_pxi/ipc.pyi deleted file mode 100644 index 819326443cf..00000000000 --- a/python/pyarrow/__lib_pxi/ipc.pyi +++ /dev/null @@ -1,722 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import enum -import sys - -from io import IOBase - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -from typing import Iterable, Iterator, Literal, Mapping, NamedTuple - -import pandas as pd - -from pyarrow._stubs_typing import SupportArrowStream, SupportPyBuffer -from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable - -from .io import Buffer, Codec, NativeFile -from .types import DictionaryMemo, KeyValueMetadata - -class MetadataVersion(enum.IntEnum): - V1 = enum.auto() - V2 = enum.auto() - V3 = enum.auto() - V4 = enum.auto() - V5 = enum.auto() - -class WriteStats(NamedTuple): - """IPC write statistics - - Parameters - ---------- - num_messages : int - Number of messages. - num_record_batches : int - Number of record batches. - num_dictionary_batches : int - Number of dictionary batches. - num_dictionary_deltas : int - Delta of dictionaries. - num_replaced_dictionaries : int - Number of replaced dictionaries. - """ - - num_messages: int - num_record_batches: int - num_dictionary_batches: int - num_dictionary_deltas: int - num_replaced_dictionaries: int - -class ReadStats(NamedTuple): - """IPC read statistics - - Parameters - ---------- - num_messages : int - Number of messages. 
- num_record_batches : int - Number of record batches. - num_dictionary_batches : int - Number of dictionary batches. - num_dictionary_deltas : int - Delta of dictionaries. - num_replaced_dictionaries : int - Number of replaced dictionaries. - """ - - num_messages: int - num_record_batches: int - num_dictionary_batches: int - num_dictionary_deltas: int - num_replaced_dictionaries: int - -class IpcReadOptions(_Weakrefable): - """ - Serialization options for reading IPC format. - - Parameters - ---------- - ensure_native_endian : bool, default True - Whether to convert incoming data to platform-native endianness. - use_threads : bool - Whether to use the global CPU thread pool to parallelize any - computational tasks like decompression - included_fields : list - If empty (the default), return all deserialized fields. - If non-empty, the values are the indices of fields to read on - the top-level schema - """ - - ensure_native_endian: bool - use_threads: bool - included_fields: list[int] - def __init__( - self, - *, - ensure_native_endian: bool = True, - use_threads: bool = True, - included_fields: list[int] | None = None, - ) -> None: ... - -class IpcWriteOptions(_Weakrefable): - """ - Serialization options for the IPC format. - - Parameters - ---------- - metadata_version : MetadataVersion, default MetadataVersion.V5 - The metadata version to write. V5 is the current and latest, - V4 is the pre-1.0 metadata version (with incompatible Union layout). - allow_64bit : bool, default False - If true, allow field lengths that don't fit in a signed 32-bit int. - use_legacy_format : bool, default False - Whether to use the pre-Arrow 0.15 IPC format. - compression : str, Codec, or None - compression codec to use for record batch buffers. - If None then batch buffers will be uncompressed. - Must be "lz4", "zstd" or None. - To specify a compression_level use `pyarrow.Codec` - use_threads : bool - Whether to use the global CPU thread pool to parallelize any - computational tasks like compression. - emit_dictionary_deltas : bool - Whether to emit dictionary deltas. Default is false for maximum - stream compatibility. - unify_dictionaries : bool - If true then calls to write_table will attempt to unify dictionaries - across all batches in the table. This can help avoid the need for - replacement dictionaries (which the file format does not support) - but requires computing the unified dictionary and then remapping - the indices arrays. - - This parameter is ignored when writing to the IPC stream format as - the IPC stream format can support replacement dictionaries. - """ - - metadata_version: MetadataVersion - allow_64bit: bool - use_legacy_format: bool - compression: Codec | Literal["lz4", "zstd"] | None - use_threads: bool - emit_dictionary_deltas: bool - unify_dictionaries: bool - def __init__( - self, - *, - metadata_version: MetadataVersion = MetadataVersion.V5, - allow_64bit: bool = False, - use_legacy_format: bool = False, - compression: Codec | Literal["lz4", "zstd"] | None = None, - use_threads: bool = True, - emit_dictionary_deltas: bool = False, - unify_dictionaries: bool = False, - ) -> None: ... - -class Message(_Weakrefable): - """ - Container for an Arrow IPC message with metadata and optional body - """ - - @property - def type(self) -> str: ... - @property - def metadata(self) -> Buffer: ... - @property - def metadata_version(self) -> MetadataVersion: ... - @property - def body(self) -> Buffer | None: ... - def equals(self, other: Message) -> bool: ... 
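# A minimal round-trip sketch showing how the IpcWriteOptions declared above are
# passed to a stream writer, assuming the usual pyarrow.ipc helpers and an Arrow
# build with zstd support.
import pyarrow as pa
import pyarrow.ipc as ipc

batch = pa.record_batch([pa.array([1, 2, 3])], names=["x"])
options = ipc.IpcWriteOptions(compression="zstd")   # compress record batch buffers

sink = pa.BufferOutputStream()
with ipc.new_stream(sink, batch.schema, options=options) as writer:
    writer.write_batch(batch)
    print(writer.stats.num_record_batches)          # WriteStats, as declared above

# The reader detects the compression itself; IpcReadOptions only controls
# endianness conversion, threading and column projection.
restored = ipc.open_stream(pa.BufferReader(sink.getvalue())).read_all()
assert restored.column("x").to_pylist() == [1, 2, 3]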
- def serialize_to( - self, sink: NativeFile, alignment: int = 8, memory_pool: MemoryPool | None = None - ): - """ - Write message to generic OutputStream - - Parameters - ---------- - sink : NativeFile - alignment : int, default 8 - Byte alignment for metadata and body - memory_pool : MemoryPool, default None - Uses default memory pool if not specified - """ - def serialize(self, alignment: int = 8, memory_pool: MemoryPool | None = None) -> Buffer: - """ - Write message as encapsulated IPC message - - Parameters - ---------- - alignment : int, default 8 - Byte alignment for metadata and body - memory_pool : MemoryPool, default None - Uses default memory pool if not specified - - Returns - ------- - serialized : Buffer - """ - -class MessageReader(_Weakrefable): - """ - Interface for reading Message objects from some source (like an - InputStream) - """ - @classmethod - def open_stream(cls, source: bytes | NativeFile | IOBase | SupportPyBuffer) -> Self: - """ - Open stream from source, if you want to use memory map use - MemoryMappedFile as source. - - Parameters - ---------- - source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object - A readable source, like an InputStream - """ - def __iter__(self) -> Self: ... - def read_next_message(self) -> Message: - """ - Read next Message from the stream. - - Raises - ------ - StopIteration - At end of stream - """ - __next__ = read_next_message - -# ---------------------------------------------------------------------- -# File and stream readers and writers - -class _CRecordBatchWriter(_Weakrefable): - """The base RecordBatchWriter wrapper. - - Provides common implementations of convenience methods. Should not - be instantiated directly by user code. - """ - def write(self, table_or_batch: Table | RecordBatch): - """ - Write RecordBatch or Table to stream. - - Parameters - ---------- - table_or_batch : {RecordBatch, Table} - """ - def write_batch( - self, - batch: RecordBatch, - custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None, - ): - """ - Write RecordBatch to stream. - - Parameters - ---------- - batch : RecordBatch - custom_metadata : mapping or KeyValueMetadata - Keys and values must be string-like / coercible to bytes - """ - def write_table(self, table: Table, max_chunksize: int | None = None) -> None: - """ - Write Table to stream in (contiguous) RecordBatch objects. - - Parameters - ---------- - table : Table - max_chunksize : int, default None - Maximum number of rows for RecordBatch chunks. Individual chunks may - be smaller depending on the chunk layout of individual columns. - """ - def close(self) -> None: - """ - Close stream and write end-of-stream 0 marker. - """ - def __enter__(self) -> Self: ... - def __exit__(self, exc_type, exc_val, exc_tb): ... - @property - def stats(self) -> WriteStats: - """ - Current IPC write statistics. - """ - -class _RecordBatchStreamWriter(_CRecordBatchWriter): - def __dealloc__(self) -> None: ... - def _open(self, sink, schema: Schema, options: IpcWriteOptions = IpcWriteOptions()): ... - -class _ReadPandasMixin: - def read_pandas(self, **options) -> pd.DataFrame: - """ - Read contents of stream to a pandas.DataFrame. - - Read all record batches as a pyarrow.Table then convert it to a - pandas.DataFrame using Table.to_pandas. - - Parameters - ---------- - **options - Arguments to forward to :meth:`Table.to_pandas`. - - Returns - ------- - df : pandas.DataFrame - """ - -class RecordBatchReader(_Weakrefable): - """Base class for reading stream of record batches. 
- - Record batch readers function as iterators of record batches that also - provide the schema (without the need to get any batches). - - Warnings - -------- - Do not call this class's constructor directly, use one of the - ``RecordBatchReader.from_*`` functions instead. - - Notes - ----- - To import and export using the Arrow C stream interface, use the - ``_import_from_c`` and ``_export_to_c`` methods. However, keep in mind this - interface is intended for expert users. - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([("x", pa.int64())]) - >>> def iter_record_batches(): - ... for i in range(2): - ... yield pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], schema=schema) - >>> reader = pa.RecordBatchReader.from_batches(schema, iter_record_batches()) - >>> print(reader.schema) - x: int64 - >>> for batch in reader: - ... print(batch) - pyarrow.RecordBatch - x: int64 - ---- - x: [1,2,3] - pyarrow.RecordBatch - x: int64 - ---- - x: [1,2,3] - """ - - def __iter__(self) -> Self: ... - def read_next_batch(self) -> RecordBatch: - """ - Read next RecordBatch from the stream. - - Raises - ------ - StopIteration: - At end of stream. - - Returns - ------- - RecordBatch - """ - __next__ = read_next_batch - @property - def schema(self) -> Schema: - """ - Shared schema of the record batches in the stream. - - Returns - ------- - Schema - """ - def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: - """ - Read next RecordBatch from the stream along with its custom metadata. - - Raises - ------ - StopIteration: - At end of stream. - - Returns - ------- - batch : RecordBatch - custom_metadata : KeyValueMetadata - """ - def iter_batches_with_custom_metadata( - self, - ) -> Iterator[RecordBatchWithMetadata]: - """ - Iterate over record batches from the stream along with their custom - metadata. - - Yields - ------ - RecordBatchWithMetadata - """ - def read_all(self) -> Table: - """ - Read all record batches as a pyarrow.Table. - - Returns - ------- - Table - """ - read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] - def close(self) -> None: - """ - Release any resources associated with the reader. - """ - def __enter__(self) -> Self: ... - def __exit__(self, exc_type, exc_val, exc_tb): ... - def cast(self, target_schema: Schema) -> Self: - """ - Wrap this reader with one that casts each batch lazily as it is pulled. - Currently only a safe cast to target_schema is implemented. - - Parameters - ---------- - target_schema : Schema - Schema to cast to, the names and order of fields must match. - - Returns - ------- - RecordBatchReader - """ - def _export_to_c(self, out_ptr: int) -> None: - """ - Export to a C ArrowArrayStream struct, given its pointer. - - Parameters - ---------- - out_ptr: int - The raw pointer to a C ArrowArrayStream struct. - - Be careful: if you don't pass the ArrowArrayStream struct to a - consumer, array memory will leak. This is a low-level function - intended for expert users. - """ - @classmethod - def _import_from_c(cls, in_ptr: int) -> Self: - """ - Import RecordBatchReader from a C ArrowArrayStream struct, - given its pointer. - - Parameters - ---------- - in_ptr: int - The raw pointer to a C ArrowArrayStream struct. - - This is a low-level function intended for expert users. - """ - def __arrow_c_stream__(self, requested_schema=None): - """ - Export to a C ArrowArrayStream PyCapsule. 
- - Parameters - ---------- - requested_schema : PyCapsule, default None - The schema to which the stream should be casted, passed as a - PyCapsule containing a C ArrowSchema representation of the - requested schema. - - Returns - ------- - PyCapsule - A capsule containing a C ArrowArrayStream struct. - """ - @classmethod - def _import_from_c_capsule(cls, stream) -> Self: - """ - Import RecordBatchReader from a C ArrowArrayStream PyCapsule. - - Parameters - ---------- - stream: PyCapsule - A capsule containing a C ArrowArrayStream PyCapsule. - - Returns - ------- - RecordBatchReader - """ - @classmethod - def from_stream(cls, data: SupportArrowStream, schema: Schema | None = None) -> Self: - """ - Create RecordBatchReader from a Arrow-compatible stream object. - - This accepts objects implementing the Arrow PyCapsule Protocol for - streams, i.e. objects that have a ``__arrow_c_stream__`` method. - - Parameters - ---------- - data : Arrow-compatible stream object - Any object that implements the Arrow PyCapsule Protocol for - streams. - schema : Schema, default None - The schema to which the stream should be casted, if supported - by the stream object. - - Returns - ------- - RecordBatchReader - """ - @classmethod - def from_batches(cls, schema: Schema, batches: Iterable[RecordBatch]) -> Self: - """ - Create RecordBatchReader from an iterable of batches. - - Parameters - ---------- - schema : Schema - The shared schema of the record batches - batches : Iterable[RecordBatch] - The batches that this reader will return. - - Returns - ------- - reader : RecordBatchReader - """ - -class _RecordBatchStreamReader(RecordBatchReader): - @property - def stats(self) -> ReadStats: - """ - Current IPC read statistics. - """ - -class _RecordBatchFileWriter(_RecordBatchStreamWriter): ... - -class RecordBatchWithMetadata(NamedTuple): - """RecordBatch with its custom metadata - - Parameters - ---------- - batch : RecordBatch - custom_metadata : KeyValueMetadata - """ - - batch: RecordBatch - custom_metadata: KeyValueMetadata - -class _RecordBatchFileReader(_Weakrefable): - @property - def num_record_batches(self) -> int: - """ - The number of record batches in the IPC file. - """ - def get_batch(self, i: int) -> RecordBatch: - """ - Read the record batch with the given index. - - Parameters - ---------- - i : int - The index of the record batch in the IPC file. - - Returns - ------- - batch : RecordBatch - """ - get_record_batch = get_batch - def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: - """ - Read the record batch with the given index along with - its custom metadata - - Parameters - ---------- - i : int - The index of the record batch in the IPC file. - - Returns - ------- - batch : RecordBatch - custom_metadata : KeyValueMetadata - """ - def read_all(self) -> Table: - """ - Read all record batches as a pyarrow.Table - """ - read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] - def __enter__(self) -> Self: ... - def __exit__(self, exc_type, exc_val, exc_tb): ... - @property - def schema(self) -> Schema: ... - @property - def stats(self) -> ReadStats: ... - -def get_tensor_size(tensor: Tensor) -> int: - """ - Return total size of serialized Tensor including metadata and padding. - - Parameters - ---------- - tensor : Tensor - The tensor for which we want to known the size. 
- """ - -def get_record_batch_size(batch: RecordBatch) -> int: - """ - Return total size of serialized RecordBatch including metadata and padding. - - Parameters - ---------- - batch : RecordBatch - The recordbatch for which we want to know the size. - """ - -def write_tensor(tensor: Tensor, dest: NativeFile) -> int: - """ - Write pyarrow.Tensor to pyarrow.NativeFile object its current position. - - Parameters - ---------- - tensor : pyarrow.Tensor - dest : pyarrow.NativeFile - - Returns - ------- - bytes_written : int - Total number of bytes written to the file - """ - -def read_tensor(source: NativeFile) -> Tensor: - """Read pyarrow.Tensor from pyarrow.NativeFile object from current - position. If the file source supports zero copy (e.g. a memory map), then - this operation does not allocate any memory. This function not assume that - the stream is aligned - - Parameters - ---------- - source : pyarrow.NativeFile - - Returns - ------- - tensor : Tensor - - """ - -def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: - """ - Read length-prefixed message from file or buffer-like object - - Parameters - ---------- - source : pyarrow.NativeFile, file-like object, or buffer-like object - - Returns - ------- - message : Message - """ - -def read_schema(obj: Buffer | Message, dictionary_memo: DictionaryMemo | None = None) -> Schema: - """ - Read Schema from message or buffer - - Parameters - ---------- - obj : buffer or Message - dictionary_memo : DictionaryMemo, optional - Needed to be able to reconstruct dictionary-encoded fields - with read_record_batch - - Returns - ------- - schema : Schema - """ - -def read_record_batch( - obj: Message | SupportPyBuffer, schema: Schema, dictionary_memo: DictionaryMemo | None = None -) -> RecordBatch: - """ - Read RecordBatch from message, given a known schema. If reading data from a - complete IPC stream, use ipc.open_stream instead - - Parameters - ---------- - obj : Message or Buffer-like - schema : Schema - dictionary_memo : DictionaryMemo, optional - If message contains dictionaries, must pass a populated - DictionaryMemo - - Returns - ------- - batch : RecordBatch - """ - -__all__ = [ - "MetadataVersion", - "WriteStats", - "ReadStats", - "IpcReadOptions", - "IpcWriteOptions", - "Message", - "MessageReader", - "_CRecordBatchWriter", - "_RecordBatchStreamWriter", - "_ReadPandasMixin", - "RecordBatchReader", - "_RecordBatchStreamReader", - "_RecordBatchFileWriter", - "RecordBatchWithMetadata", - "_RecordBatchFileReader", - "get_tensor_size", - "get_record_batch_size", - "write_tensor", - "read_tensor", - "read_message", - "read_schema", - "read_record_batch", -] diff --git a/python/pyarrow/__lib_pxi/pandas_shim.pyi b/python/pyarrow/__lib_pxi/pandas_shim.pyi deleted file mode 100644 index ae8460cc2b3..00000000000 --- a/python/pyarrow/__lib_pxi/pandas_shim.pyi +++ /dev/null @@ -1,68 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from types import ModuleType -from typing import Any, Iterable, TypeGuard - -import pandas as pd - -from numpy import dtype -from pandas.core.dtypes.base import ExtensionDtype - -class _PandasAPIShim: - has_sparse: bool - - def series(self, *args, **kwargs) -> pd.Series: ... - def data_frame(self, *args, **kwargs) -> pd.DataFrame: ... - @property - def have_pandas(self) -> bool: ... - @property - def compat(self) -> ModuleType: ... - @property - def pd(self) -> ModuleType: ... - def infer_dtype(self, obj: Iterable) -> str: ... - def pandas_dtype(self, dtype: str) -> dtype: ... - @property - def loose_version(self) -> Any: ... - @property - def version(self) -> str: ... - def is_v1(self) -> bool: ... - def is_ge_v21(self) -> bool: ... - def is_ge_v23(self) -> bool: ... - def is_ge_v3(self) -> bool: ... - @property - def categorical_type(self) -> type[pd.Categorical]: ... - @property - def datetimetz_type(self) -> type[pd.DatetimeTZDtype]: ... - @property - def extension_dtype(self) -> type[ExtensionDtype]: ... - def is_array_like( - self, obj: Any - ) -> TypeGuard[pd.Series | pd.Index | pd.Categorical | ExtensionDtype]: ... - def is_categorical(self, obj: Any) -> TypeGuard[pd.Categorical]: ... - def is_datetimetz(self, obj: Any) -> TypeGuard[pd.DatetimeTZDtype]: ... - def is_extension_array_dtype(self, obj: Any) -> TypeGuard[ExtensionDtype]: ... - def is_sparse(self, obj: Any) -> bool: ... - def is_data_frame(self, obj: Any) -> TypeGuard[pd.DataFrame]: ... - def is_series(self, obj: Any) -> TypeGuard[pd.Series]: ... - def is_index(self, obj: Any) -> TypeGuard[pd.Index]: ... - def get_values(self, obj: Any) -> bool: ... - def get_rangeindex_attribute(self, level, name): ... - -_pandas_api: _PandasAPIShim - -__all__ = ["_PandasAPIShim", "_pandas_api"] diff --git a/python/pyarrow/__lib_pxi/table.pyi b/python/pyarrow/__lib_pxi/table.pyi deleted file mode 100644 index 5ad66f9d06c..00000000000 --- a/python/pyarrow/__lib_pxi/table.pyi +++ /dev/null @@ -1,5640 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
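# A short sketch of the tensor IPC helpers declared in the ipc stub above
# (get_tensor_size / write_tensor / read_tensor), using in-memory buffers.
import numpy as np
import pyarrow as pa
import pyarrow.ipc as ipc

tensor = pa.Tensor.from_numpy(np.arange(6, dtype=np.int64).reshape(2, 3))

sink = pa.BufferOutputStream()
written = ipc.write_tensor(tensor, sink)       # bytes written at the current position
print(written, ipc.get_tensor_size(tensor))    # serialized size incl. metadata and padding

# Reading back is zero-copy when the source supports it (e.g. a memory map).
restored = ipc.read_tensor(pa.BufferReader(sink.getvalue()))
assert restored.equals(tensor)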
- -import datetime as dt -import sys - -from decimal import Decimal - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias -from typing import ( - Any, - Collection, - Generator, - Generic, - Iterable, - Iterator, - Literal, - Mapping, - Sequence, - TypeVar, - overload, -) - -import numpy as np -import pandas as pd - -from numpy.typing import NDArray -from pyarrow._compute import ( - CastOptions, - CountOptions, - FunctionOptions, - ScalarAggregateOptions, - TDigestOptions, - VarianceOptions, -) -from pyarrow._stubs_typing import ( - Indices, - Mask, - NullEncoding, - NullSelectionBehavior, - Order, - SupportArrowArray, - SupportArrowDeviceArray, - SupportArrowStream, -) -from pyarrow.compute import ArrayOrChunkedArray, Expression -from pyarrow.interchange.dataframe import _PyArrowDataFrame -from pyarrow.lib import Device, MemoryManager, MemoryPool, MonthDayNano, Schema -from pyarrow.lib import Field as _Field - -from . import array, scalar, types -from .array import Array, StructArray, _CastAs, _PandasConvertible -from .device import DeviceAllocationType -from .io import Buffer -from .ipc import RecordBatchReader -from .scalar import Int64Scalar, Scalar, NullableCollection -from .tensor import Tensor -from .types import DataType, _AsPyType, _BasicDataType, _DataTypeT - -Field: TypeAlias = _Field[DataType] -_ScalarT = TypeVar("_ScalarT", bound=Scalar) -_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) - -_Aggregation: TypeAlias = Literal[ - "all", - "any", - "approximate_median", - "count", - "count_all", - "count_distinct", - "distinct", - "first", - "first_last", - "last", - "list", - "max", - "mean", - "min", - "min_max", - "one", - "product", - "stddev", - "sum", - "tdigest", - "variance", -] -_AggregationPrefixed: TypeAlias = Literal[ - "hash_all", - "hash_any", - "hash_approximate_median", - "hash_count", - "hash_count_all", - "hash_count_distinct", - "hash_distinct", - "hash_first", - "hash_first_last", - "hash_last", - "hash_list", - "hash_max", - "hash_mean", - "hash_min", - "hash_min_max", - "hash_one", - "hash_product", - "hash_stddev", - "hash_sum", - "hash_tdigest", - "hash_variance", -] -Aggregation: TypeAlias = _Aggregation | _AggregationPrefixed -AggregateOptions: TypeAlias = ( - ScalarAggregateOptions | CountOptions | TDigestOptions | VarianceOptions | FunctionOptions -) - -UnarySelector: TypeAlias = str -NullarySelector: TypeAlias = tuple[()] -NarySelector: TypeAlias = list[str] | tuple[str, ...] -ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector - -class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): - """ - An array-like composed from a (possibly empty) collection of pyarrow.Arrays - - Warnings - -------- - Do not call this class's constructor directly. - - Examples - -------- - To construct a ChunkedArray object use :func:`pyarrow.chunked_array`: - - >>> import pyarrow as pa - >>> pa.chunked_array([], type=pa.int8()) - - [ - ... - ] - - >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> isinstance(pa.chunked_array([[2, 2, 4], [4, 5, 100]]), pa.ChunkedArray) - True - """ - - @property - def data(self) -> Self: ... - @property - def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: - """ - Return data type of a ChunkedArray. 
- - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.type - DataType(int64) - """ - def length(self) -> int: - """ - Return length of a ChunkedArray. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.length() - 6 - """ - __len__ = length - def to_string( - self, - *, - indent: int = 0, - window: int = 5, - container_window: int = 2, - skip_new_lines: bool = False, - ) -> str: - """ - Render a "pretty-printed" string representation of the ChunkedArray - - Parameters - ---------- - indent : int - How much to indent right the content of the array, - by default ``0``. - window : int - How many items to preview within each chunk at the begin and end - of the chunk when the chunk is bigger than the window. - The other elements will be ellipsed. - container_window : int - How many chunks to preview at the begin and end - of the array when the array is bigger than the window. - The other elements will be ellipsed. - This setting also applies to list columns. - skip_new_lines : bool - If the array should be rendered as a single line of text - or if each element should be on its own line. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.to_string(skip_new_lines=True) - '[[2,2,4],[4,5,100]]' - """ - format = to_string - def validate(self, *, full: bool = False) -> None: - """ - Perform validation checks. An exception is raised if validation fails. - - By default only cheap validation checks are run. Pass `full=True` - for thorough validation checks (potentially O(n)). - - Parameters - ---------- - full : bool, default False - If True, run expensive checks, otherwise cheap checks only. - - Raises - ------ - ArrowInvalid - """ - @property - def null_count(self) -> int: - """ - Number of null entries - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.null_count - 1 - """ - @property - def nbytes(self) -> int: - """ - Total number of bytes consumed by the elements of the chunked array. - - In other words, the sum of bytes from all buffer ranges referenced. - - Unlike `get_total_buffer_size` this method will account for array - offsets. - - If buffers are shared between arrays then the shared - portion will only be counted multiple times. - - The dictionary of dictionary arrays will always be counted in their - entirety even if the array only references a portion of the dictionary. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.nbytes - 49 - """ - def get_total_buffer_size(self) -> int: - """ - The sum of bytes in each buffer referenced by the chunked array. - - An array may only reference a portion of a buffer. - This method will overestimate in this case and return the - byte size of the entire buffer. - - If a buffer is referenced multiple times then it will - only be counted once. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.get_total_buffer_size() - 49 - """ - def __sizeof__(self) -> int: ... - @overload - def __getitem__(self, key: slice) -> Self: ... - @overload - def __getitem__(self, key: int) -> _Scalar_co: ... 
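# The __getitem__ overloads above distinguish integer indexing (a Scalar) from
# slicing (a ChunkedArray); a quick illustration of both:
import pyarrow as pa

arr = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
print(arr[1])       # Int64Scalar with value 2
print(arr[2:4])     # zero-copy ChunkedArray slice containing [4, 4]
print(arr[::2])     # a step other than 1 produces a copy rather than a view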
- def __getitem__(self, key): - """ - Slice or return value at given index - - Parameters - ---------- - key : integer or slice - Slices with step not equal to 1 (or None) will produce a copy - rather than a zero-copy view - - Returns - ------- - value : Scalar (index) or ChunkedArray (slice) - """ - def getitem(self, i: int) -> Scalar: ... - def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[scalar.BooleanScalar]: - """ - Return boolean array indicating the null values. - - Parameters - ---------- - nan_is_null : bool (optional, default False) - Whether floating-point NaN values should also be considered null. - - Returns - ------- - array : boolean Array or ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.is_null() - - [ - [ - false, - false, - false, - false, - true, - false - ] - ] - """ - def is_nan(self) -> ChunkedArray[scalar.BooleanScalar]: - """ - Return boolean array indicating the NaN values. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> arr = pa.chunked_array([[2, np.nan, 4], [4, None, 100]]) - >>> arr.is_nan() - - [ - [ - false, - true, - false, - false, - null, - false - ] - ] - """ - def is_valid(self) -> ChunkedArray[scalar.BooleanScalar]: - """ - Return boolean array indicating the non-null values. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.is_valid() - - [ - [ - true, - true, - true - ], - [ - true, - false, - true - ] - ] - """ - def fill_null(self, fill_value: Scalar[_DataTypeT] | _AsPyType | str | None) -> Self: - """ - Replace each null element in values with fill_value. - - See :func:`pyarrow.compute.fill_null` for full usage. - - Parameters - ---------- - fill_value : any - The replacement value for null entries. - - Returns - ------- - result : Array or ChunkedArray - A new array with nulls replaced by the given value. - - Examples - -------- - >>> import pyarrow as pa - >>> fill_value = pa.scalar(5, type=pa.int8()) - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.fill_null(fill_value) - - [ - [ - 2, - 2, - 4, - 4, - 5, - 100 - ] - ] - """ - def equals(self, other: Self | Array[Any] | Iterable[Any]) -> bool: - """ - Return whether the contents of two chunked arrays are equal. - - Parameters - ---------- - other : pyarrow.ChunkedArray - Chunked array to compare against. - - Returns - ------- - are_equal : bool - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> animals = pa.chunked_array( - ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) - ... ) - >>> n_legs.equals(n_legs) - True - >>> n_legs.equals(animals) - False - """ - def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: - """ - Return a NumPy copy of this array (experimental). - - Parameters - ---------- - zero_copy_only : bool, default False - Introduced for signature consistence with pyarrow.Array.to_numpy. - This must be False here since NumPy arrays' buffer must be contiguous. - - Returns - ------- - array : numpy.ndarray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.to_numpy() - array([ 2, 2, 4, 4, 5, 100]) - """ - def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... 
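# __array__ above lets NumPy consume a ChunkedArray directly; np.asarray behaves
# like to_numpy() for CPU data (always a copy, since chunks are not contiguous).
import numpy as np
import pyarrow as pa

arr = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
print(np.asarray(arr))                    # array([  2,   2,   4,   4,   5, 100])
print(np.asarray(arr, dtype="float64"))   # the dtype request is forwarded to __array__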
- @overload - def cast( - self, - target_type: None = None, - safe: bool | None = None, - options: CastOptions | None = None, - ) -> Self: ... - @overload - def cast( - self, target_type: _CastAs, safe: bool | None = None, options: CastOptions | None = None - ) -> ChunkedArray[Scalar[_CastAs]]: ... - def cast(self, *args, **kwargs): - """ - Cast array values to another data type - - See :func:`pyarrow.compute.cast` for usage. - - Parameters - ---------- - target_type : DataType, None - Type to cast array to. - safe : boolean, default True - Whether to check for conversion errors such as overflow. - options : CastOptions, default None - Additional checks pass by CastOptions - - Returns - ------- - cast : Array or ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.type - DataType(int64) - - Change the data type of an array: - - >>> n_legs_seconds = n_legs.cast(pa.duration("s")) - >>> n_legs_seconds.type - DurationType(duration[s]) - """ - def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: - """ - Compute dictionary-encoded representation of array. - - See :func:`pyarrow.compute.dictionary_encode` for full usage. - - Parameters - ---------- - null_encoding : str, default "mask" - How to handle null entries. - - Returns - ------- - encoded : ChunkedArray - A dictionary-encoded version of this array. - - Examples - -------- - >>> import pyarrow as pa - >>> animals = pa.chunked_array( - ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) - ... ) - >>> animals.dictionary_encode() - - [ - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog", - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 0, - 1, - 2 - ], - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog", - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 3, - 4, - 5 - ] - ] - """ - def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: - """ - Flatten this ChunkedArray. If it has a struct type, the column is - flattened into one array per struct field. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - result : list of ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> c_arr = pa.chunked_array(n_legs.value_counts()) - >>> c_arr - - [ - -- is_valid: all not null - -- child 0 type: int64 - [ - 2, - 4, - 5, - 100 - ] - -- child 1 type: int64 - [ - 2, - 2, - 1, - 1 - ] - ] - >>> c_arr.flatten() - [ - [ - [ - 2, - 4, - 5, - 100 - ] - ], - [ - [ - 2, - 2, - 1, - 1 - ] - ]] - >>> c_arr.type - StructType(struct) - >>> n_legs.type - DataType(int64) - """ - def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_Scalar_co]: - """ - Flatten this ChunkedArray into a single non-chunked array. 
- - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - result : Array - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.combine_chunks() - - [ - 2, - 2, - 4, - 4, - 5, - 100 - ] - """ - def unique(self) -> ChunkedArray[_Scalar_co]: - """ - Compute distinct elements in array - - Returns - ------- - pyarrow.Array - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.unique() - - [ - 2, - 4, - 5, - 100 - ] - """ - def value_counts(self) -> StructArray: - """ - Compute counts of unique elements in array. - - Returns - ------- - An array of structs - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.value_counts() - - -- is_valid: all not null - -- child 0 type: int64 - [ - 2, - 4, - 5, - 100 - ] - -- child 1 type: int64 - [ - 2, - 2, - 1, - 1 - ] - """ - def slice(self, offset: int = 0, length: int | None = None) -> Self: - """ - Compute zero-copy slice of this ChunkedArray - - Parameters - ---------- - offset : int, default 0 - Offset from start of array to slice - length : int, default None - Length of slice (default is until end of batch starting from - offset) - - Returns - ------- - sliced : ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.slice(2, 2) - - [ - [ - 4 - ], - [ - 4 - ] - ] - """ - def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop") -> Self: - """ - Select values from the chunked array. - - See :func:`pyarrow.compute.filter` for full usage. - - Parameters - ---------- - mask : Array or array-like - The boolean mask to filter the chunked array with. - null_selection_behavior : str, default "drop" - How nulls in the mask should be handled. - - Returns - ------- - filtered : Array or ChunkedArray - An array of the same type, with only the elements selected by - the boolean mask. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> mask = pa.array([True, False, None, True, False, True]) - >>> n_legs.filter(mask) - - [ - [ - 2 - ], - [ - 4, - 100 - ] - ] - >>> n_legs.filter(mask, null_selection_behavior="emit_null") - - [ - [ - 2, - null - ], - [ - 4, - 100 - ] - ] - """ - @overload - def index( - self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], - value: Scalar[_DataTypeT] | _AsPyType, - start: int | None = None, - end: int | None = None, - *, - memory_pool: MemoryPool | None = None, - ) -> Int64Scalar: ... - @overload - def index( - self, - value: Scalar[_DataTypeT], - start: int | None = None, - end: int | None = None, - *, - memory_pool: MemoryPool | None = None, - ) -> Int64Scalar: ... - def index(self, *args, **kwargs): - """ - Find the first index of a value. - - See :func:`pyarrow.compute.index` for full usage. - - Parameters - ---------- - value : Scalar or object - The value to look for in the array. 
- start : int, optional - The start index where to look for `value`. - end : int, optional - The end index where to look for `value`. - memory_pool : MemoryPool, optional - A memory pool for potential memory allocations. - - Returns - ------- - index : Int64Scalar - The index of the value in the array (-1 if not found). - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.index(4) - - >>> n_legs.index(4, start=3) - - """ - def take(self, indices: Indices) -> Self: - """ - Select values from the chunked array. - - See :func:`pyarrow.compute.take` for full usage. - - Parameters - ---------- - indices : Array or array-like - The indices in the array whose values will be returned. - - Returns - ------- - taken : Array or ChunkedArray - An array with the same datatype, containing the taken values. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.take([1, 4, 5]) - - [ - [ - 2, - 5, - 100 - ] - ] - """ - def drop_null(self) -> Self: - """ - Remove missing values from a chunked array. - See :func:`pyarrow.compute.drop_null` for full description. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - null - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.drop_null() - - [ - [ - 2, - 2 - ], - [ - 4, - 5, - 100 - ] - ] - """ - def sort(self, order: Order = "ascending", **kwargs) -> Self: - """ - Sort the ChunkedArray - - Parameters - ---------- - order : str, default "ascending" - Which order to sort values in. - Accepted values are "ascending", "descending". - **kwargs : dict, optional - Additional sorting options. - As allowed by :class:`SortOptions` - - Returns - ------- - result : ChunkedArray - """ - def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: - """ - Unify dictionaries across all chunks. - - This method returns an equivalent chunked array, but where all - chunks share the same dictionary values. Dictionary indices are - transposed accordingly. - - If there are no dictionaries in the chunked array, it is returned - unchanged. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - result : ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() - >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() - >>> c_arr = pa.chunked_array([arr_1, arr_2]) - >>> c_arr - - [ - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog" - ] - -- indices: - [ - 0, - 1, - 2 - ], - ... - -- dictionary: - [ - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 0, - 1, - 2 - ] - ] - >>> c_arr.unify_dictionaries() - - [ - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog", - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 0, - 1, - 2 - ], - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog", - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 3, - 4, - 5 - ] - ] - """ - @property - def num_chunks(self) -> int: - """ - Number of underlying chunks. 
- - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) - >>> n_legs.num_chunks - 2 - """ - def chunk(self, i: int) -> ChunkedArray[_Scalar_co]: - """ - Select a chunk by its index. - - Parameters - ---------- - i : int - - Returns - ------- - pyarrow.Array - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) - >>> n_legs.chunk(1) - - [ - 4, - 5, - 100 - ] - """ - @property - def chunks(self) -> list[Array[_Scalar_co]]: - """ - Convert to a list of single-chunked arrays. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - null - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.chunks - [ - [ - 2, - 2, - null - ], - [ - 4, - 5, - 100 - ]] - """ - @overload - def iterchunks( - self: ChunkedArray[scalar.NullScalar], - ) -> Generator[array.NullArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.BooleanScalar], - ) -> Generator[array.BooleanArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.UInt8Scalar], - ) -> Generator[array.UInt8Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Int8Scalar], - ) -> Generator[array.Int8Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.UInt16Scalar], - ) -> Generator[array.UInt16Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Int16Scalar], - ) -> Generator[array.Int16Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.UInt32Scalar], - ) -> Generator[array.UInt32Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Int32Scalar], - ) -> Generator[array.Int32Array, None, None]: - """ - Convert to an iterator of ChunkArrays. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> for i in n_legs.iterchunks(): - ... print(i.null_count) - 0 - 1 - - """ - @overload - def iterchunks( - self: ChunkedArray[scalar.UInt64Scalar], - ) -> Generator[array.UInt64Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Int64Scalar], - ) -> Generator[array.Int64Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.HalfFloatScalar], - ) -> Generator[array.HalfFloatArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.FloatScalar], - ) -> Generator[array.FloatArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.DoubleScalar], - ) -> Generator[array.DoubleArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Decimal32Scalar], - ) -> Generator[array.Decimal32Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Decimal64Scalar], - ) -> Generator[array.Decimal64Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Decimal128Scalar], - ) -> Generator[array.Decimal128Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Decimal256Scalar], - ) -> Generator[array.Decimal256Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Date32Scalar], - ) -> Generator[array.Date32Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Date64Scalar], - ) -> Generator[array.Date64Array, None, None]: ... 
- @overload - def iterchunks( - self: ChunkedArray[scalar.Time32Scalar[types._Time32Unit]], - ) -> Generator[array.Time32Array[types._Time32Unit], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Time64Scalar[types._Time64Unit]], - ) -> Generator[array.Time64Array[types._Time64Unit], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.DurationScalar[types._Unit]], - ) -> Generator[array.DurationArray[types._Unit], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.MonthDayNanoIntervalScalar], - ) -> Generator[array.MonthDayNanoIntervalArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.BinaryScalar], - ) -> Generator[array.BinaryArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.LargeBinaryScalar], - ) -> Generator[array.LargeBinaryArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.FixedSizeBinaryScalar], - ) -> Generator[array.FixedSizeBinaryArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.StringScalar], - ) -> Generator[array.StringArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.LargeStringScalar], - ) -> Generator[array.LargeStringArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.BinaryViewScalar], - ) -> Generator[array.BinaryViewArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.StringViewScalar], - ) -> Generator[array.StringViewArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.ListScalar[_DataTypeT]], - ) -> Generator[array.ListArray[scalar.ListScalar[_DataTypeT]], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.FixedSizeListScalar[_DataTypeT, types._Size]], - ) -> Generator[array.FixedSizeListArray[_DataTypeT, types._Size], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.LargeListScalar[_DataTypeT]], - ) -> Generator[array.LargeListArray[_DataTypeT], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.LargeListViewScalar[_DataTypeT]], - ) -> Generator[array.LargeListViewArray[_DataTypeT], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.StructScalar], - ) -> Generator[array.StructArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.MapScalar[array._MapKeyT, array._MapItemT]], - ) -> Generator[array.MapArray[array._MapKeyT, array._MapItemT], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.DictionaryScalar[types._IndexT, types._BasicValueT]], - ) -> Generator[array.DictionaryArray[types._IndexT, types._BasicValueT], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.RunEndEncodedScalar], - ) -> Generator[array.RunEndEncodedArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.UnionScalar], - ) -> Generator[array.UnionArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Bool8Scalar], - ) -> Generator[array.Bool8Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.UuidScalar], - ) -> Generator[array.UuidArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.JsonScalar], - ) -> Generator[array.JsonArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.OpaqueScalar], - ) -> Generator[array.OpaqueArray, None, None]: ... 
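# The typed overloads above mean iterchunks() yields the concrete Array subclass
# matching the chunked array's scalar type, e.g. StringArray for string data:
import pyarrow as pa

animals = pa.chunked_array([["Flamingo", "Parrot"], ["Dog"]])
for chunk in animals.iterchunks():
    print(type(chunk).__name__, len(chunk))   # StringArray 2 / StringArray 1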
- def iterchunks(self): - """ - Convert to an iterator of ChunkArrays. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> for i in n_legs.iterchunks(): - ... print(i.null_count) - 0 - 1 - - """ - def __iter__(self) -> Iterator[_Scalar_co]: ... - def to_pylist( - self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[_AsPyType | None]: - """ - Convert to a list of native Python objects. - - Parameters - ---------- - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - - If 'lossy', whenever duplicate keys are detected, a warning will be printed. - The last seen value of a duplicate key will be in the Python dictionary. - If 'strict', this instead results in an exception being raised when detected. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.to_pylist() - [2, 2, 4, 4, None, 100] - """ - def __arrow_c_stream__(self, requested_schema=None) -> Any: - """ - Export to a C ArrowArrayStream PyCapsule. - - Parameters - ---------- - requested_schema : PyCapsule, default None - The schema to which the stream should be casted, passed as a - PyCapsule containing a C ArrowSchema representation of the - requested schema. - - Returns - ------- - PyCapsule - A capsule containing a C ArrowArrayStream struct. - """ - @classmethod - def _import_from_c_capsule(cls, stream) -> Self: - """ - Import ChunkedArray from a C ArrowArrayStream PyCapsule. - - Parameters - ---------- - stream: PyCapsule - A capsule containing a C ArrowArrayStream PyCapsule. - - Returns - ------- - ChunkedArray - """ - @property - def is_cpu(self) -> bool: - """ - Whether all chunks in the ChunkedArray are CPU-accessible. - """ - -@overload -def chunked_array( - values: Iterable[NullableCollection[bool]], - type: None = None, -) -> ChunkedArray[scalar.BooleanScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[int]], - type: None = None, -) -> ChunkedArray[scalar.Int64Scalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[float]], - type: None = None, -) -> ChunkedArray[scalar.DoubleScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[Decimal]], - type: None = None, -) -> ChunkedArray[scalar.Decimal128Scalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[dict[str, Any]]], - type: None = None, -) -> ChunkedArray[scalar.StructScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[dt.datetime]], - type: None = None, -) -> ChunkedArray[scalar.TimestampScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[dt.date]], - type: None = None, -) -> ChunkedArray[scalar.Date32Scalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[dt.time]], - type: None = None, -) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[dt.timedelta]], - type: None = None, -) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... 
-@overload -def chunked_array( - values: Iterable[NullableCollection[MonthDayNano]], - type: None = None, -) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[str]], - type: None = None, -) -> ChunkedArray[scalar.StringScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[bytes]], - type: None = None, -) -> ChunkedArray[scalar.BinaryScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[list[Any]]], - type: None = None, -) -> ChunkedArray[scalar.ListScalar[Any]]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[types.Decimal128Type[Any, Any]]], - type: types.Decimal128Type, -) -> ChunkedArray[types.Decimal128Type]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["null"] | types.NullType, -) -> ChunkedArray[scalar.NullScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["bool", "boolean"] | types.BoolType, -) -> ChunkedArray[scalar.BooleanScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["i1", "int8"] | types.Int8Type, -) -> ChunkedArray[scalar.Int8Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["i2", "int16"] | types.Int16Type, -) -> ChunkedArray[scalar.Int16Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["i4", "int32"] | types.Int32Type, -) -> ChunkedArray[scalar.Int32Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["i8", "int64"] | types.Int64Type, -) -> ChunkedArray[scalar.Int64Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["u1", "uint8"] | types.UInt8Type, -) -> ChunkedArray[scalar.UInt8Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["u2", "uint16"] | types.UInt16Type, -) -> ChunkedArray[scalar.UInt16Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["u4", "uint32"] | types.Uint32Type, -) -> ChunkedArray[scalar.UInt32Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["u8", "uint64"] | types.UInt64Type, -) -> ChunkedArray[scalar.UInt64Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["f2", "halffloat", "float16"] | types.Float16Type, -) -> ChunkedArray[scalar.HalfFloatScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["f4", "float", "float32"] | types.Float32Type, -) -> ChunkedArray[scalar.FloatScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["f8", "double", "float64"] | types.Float64Type, -) -> ChunkedArray[scalar.DoubleScalar]: ... 
-@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["string", "str", "utf8"] | types.StringType, -) -> ChunkedArray[scalar.StringScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["binary"] | types.BinaryType, -) -> ChunkedArray[scalar.BinaryScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, -) -> ChunkedArray[scalar.LargeStringScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["large_binary"] | types.LargeBinaryType, -) -> ChunkedArray[scalar.LargeBinaryScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["binary_view"] | types.BinaryViewType, -) -> ChunkedArray[scalar.BinaryViewScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["string_view"] | types.StringViewType, -) -> ChunkedArray[scalar.StringViewScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["date32", "date32[day]"] | types.Date32Type, -) -> ChunkedArray[scalar.Date32Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["date64", "date64[ms]"] | types.Date64Type, -) -> ChunkedArray[scalar.Date64Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], -) -> ChunkedArray[scalar.Time32Scalar[Literal["s"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], -) -> ChunkedArray[scalar.Time32Scalar[Literal["ms"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], -) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], -) -> ChunkedArray[scalar.Time64Scalar[Literal["ns"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], -) -> ChunkedArray[scalar.TimestampScalar[Literal["s"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], -) -> ChunkedArray[scalar.TimestampScalar[Literal["ms"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], -) -> ChunkedArray[scalar.TimestampScalar[Literal["us"]]]: ... 
-@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["timestamp[ns]"] | types.TimestampType[Literal["ns"]], -) -> ChunkedArray[scalar.TimestampScalar[Literal["ns"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["duration[s]"] | types.DurationType[Literal["s"]], -) -> ChunkedArray[scalar.DurationScalar[Literal["s"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], -) -> ChunkedArray[scalar.DurationScalar[Literal["ms"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["duration[us]"] | types.DurationType[Literal["us"]], -) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], -) -> ChunkedArray[scalar.DurationScalar[Literal["ns"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any]] | SupportArrowStream | SupportArrowArray, - type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, -) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... -@overload -def chunked_array( - values: Iterable[Array[_ScalarT]], - type: None = None, -) -> ChunkedArray[_ScalarT]: ... -def chunked_array(value, type=None): - """ - Construct chunked array from list of array-like objects - - Parameters - ---------- - arrays : Array, list of Array, or array-like - Must all be the same data type. Can be empty only if type also passed. - Any Arrow-compatible array that implements the Arrow PyCapsule Protocol - (has an ``__arrow_c_array__`` or ``__arrow_c_stream__`` method) can be - passed as well. - type : DataType or string coercible to DataType - - Returns - ------- - ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> pa.chunked_array([], type=pa.int8()) - - [ - ... - ] - - >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - """ - -_ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any]) - -class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): - def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... - def __dataframe__( - self, nan_as_null: bool = False, allow_copy: bool = True - ) -> _PyArrowDataFrame: - """ - Return the dataframe interchange object implementing the interchange protocol. - - Parameters - ---------- - nan_as_null : bool, default False - Whether to tell the DataFrame to overwrite null values in the data - with ``NaN`` (or ``NaT``). - allow_copy : bool, default True - Whether to allow memory copying when exporting. If set to False - it would cause non-zero-copy exports to fail. - - Returns - ------- - DataFrame interchange object - The object which consuming library can use to ingress the dataframe. - - Notes - ----- - Details on the interchange protocol: - https://data-apis.org/dataframe-protocol/latest/index.html - `nan_as_null` currently has no effect; once support for nullable extension - dtypes is added, this value should be propagated to columns. - """ - @overload - def __getitem__(self, key: int | str) -> _ColumnT: ... - @overload - def __getitem__(self, key: slice) -> Self: ... 
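# Illustrative sketch of the __getitem__ overloads declared above: an int or str key
# selects a column, while a slice returns the same tabular type sliced by rows.
# The table contents are made up for demonstration.
import pyarrow as pa

t = pa.table({"n_legs": [2, 4, 5, 100],
              "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"]})
col = t["n_legs"]   # column by name -> ChunkedArray
same = t[0]         # column by index -> ChunkedArray
rows = t[1:3]       # slice -> Table containing rows 1 and 2
assert col.equals(same)
assert rows.num_rows == 2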
- def __getitem__(self, key): - """ - Slice or return column at given index or column name - - Parameters - ---------- - key : integer, str, or slice - Slices with step not equal to 1 (or None) will produce a copy - rather than a zero-copy view - - Returns - ------- - Array (from RecordBatch) or ChunkedArray (from Table) for column input. - RecordBatch or Table for slice input. - """ - def __len__(self) -> int: ... - def column(self, i: int | str) -> _ColumnT: - """ - Select single column from Table or RecordBatch. - - Parameters - ---------- - i : int or string - The index or name of the column to retrieve. - - Returns - ------- - column : Array (for RecordBatch) or ChunkedArray (for Table) - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Select a column by numeric index: - - >>> table.column(0) - - [ - [ - 2, - 4, - 5, - 100 - ] - ] - - Select a column by its name: - - >>> table.column("animals") - - [ - [ - "Flamingo", - "Horse", - "Brittle stars", - "Centipede" - ] - ] - """ - @property - def column_names(self) -> list[str]: - """ - Names of the Table or RecordBatch columns. - - Returns - ------- - list of str - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> table = pa.Table.from_arrays( - ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], - ... names=["n_legs", "animals"], - ... ) - >>> table.column_names - ['n_legs', 'animals'] - """ - @property - def columns(self) -> list[_ColumnT]: - """ - List of all columns in numerical order. - - Returns - ------- - columns : list of Array (for RecordBatch) or list of ChunkedArray (for Table) - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.columns - [ - [ - [ - null, - 4, - 5, - null - ] - ], - [ - [ - "Flamingo", - "Horse", - null, - "Centipede" - ] - ]] - """ - def drop_null(self) -> Self: - """ - Remove rows that contain missing values from a Table or RecordBatch. - - See :func:`pyarrow.compute.drop_null` for full usage. - - Returns - ------- - Table or RecordBatch - A tabular object with the same schema, with rows containing - no missing values. - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [None, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", None, "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.drop_null() - pyarrow.Table - year: double - n_legs: int64 - animals: string - ---- - year: [[2022,2021]] - n_legs: [[4,100]] - animals: [["Horse","Centipede"]] - """ - def field(self, i: int | str) -> Field: - """ - Select a schema field by its column name or numeric index. - - Parameters - ---------- - i : int or string - The index or name of the field to retrieve. - - Returns - ------- - Field - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... 
"n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.field(0) - pyarrow.Field - >>> table.field(1) - pyarrow.Field - """ - @classmethod - def from_pydict( - cls, - mapping: Mapping[str, ArrayOrChunkedArray[Any] | list[Any] | np.ndarray], - schema: Schema | None = None, - metadata: Mapping[str | bytes, str | bytes] | None = None, - ) -> Self: - """ - Construct a Table or RecordBatch from Arrow arrays or columns. - - Parameters - ---------- - mapping : dict or Mapping - A mapping of strings to Arrays or Python lists. - schema : Schema, default None - If not passed, will be inferred from the Mapping values. - metadata : dict or Mapping, default None - Optional metadata for the schema (if inferred). - - Returns - ------- - Table or RecordBatch - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> pydict = {"n_legs": n_legs, "animals": animals} - - Construct a Table from a dictionary of arrays: - - >>> pa.Table.from_pydict(pydict) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> pa.Table.from_pydict(pydict).schema - n_legs: int64 - animals: string - - Construct a Table from a dictionary of arrays with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from a dictionary of arrays with pyarrow schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.Table.from_pydict(pydict, schema=my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - """ - @classmethod - def from_pylist( - cls, - mapping: Sequence[Mapping[str, Any]], - schema: Schema | None = None, - metadata: Mapping[str | bytes, str | bytes] | None = None, - ) -> Self: - """ - Construct a Table or RecordBatch from list of rows / dictionaries. - - Parameters - ---------- - mapping : list of dicts of rows - A mapping of strings to row values. - schema : Schema, default None - If not passed, will be inferred from the first row of the - mapping values. - metadata : dict or Mapping, default None - Optional metadata for the schema (if inferred). - - Returns - ------- - Table or RecordBatch - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] - - Construct a Table from a list of rows: - - >>> pa.Table.from_pylist(pylist) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4]] - animals: [["Flamingo","Dog"]] - - Construct a Table from a list of rows with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.Table.from_pylist(pylist, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from a list of rows with pyarrow schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... 
metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.Table.from_pylist(pylist, schema=my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - """ - def itercolumns(self) -> Generator[_ColumnT, None, None]: - """ - Iterator over all columns in their numerical order. - - Yields - ------ - Array (for RecordBatch) or ChunkedArray (for Table) - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> for i in table.itercolumns(): - ... print(i.null_count) - 2 - 1 - """ - @property - def num_columns(self) -> int: ... - @property - def num_rows(self) -> int: ... - @property - def shape(self) -> tuple[int, int]: - """ - Dimensions of the table or record batch: (#rows, #columns). - - Returns - ------- - (int, int) - Number of rows and number of columns. - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table.shape - (4, 2) - """ - @property - def schema(self) -> Schema: ... - @property - def nbytes(self) -> int: ... - def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: - """ - Sort the Table or RecordBatch by one or multiple columns. - - Parameters - ---------- - sorting : str or list[tuple(name, order)] - Name of the column to use to sort (ascending), or - a list of multiple sorting conditions where - each entry is a tuple with column name - and sorting order ("ascending" or "descending") - **kwargs : dict, optional - Additional sorting options. - As allowed by :class:`SortOptions` - - Returns - ------- - Table or RecordBatch - A new tabular object sorted according to the sort keys. - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pandas as pd - >>> import pyarrow as pa - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.sort_by("animal") - pyarrow.Table - year: int64 - n_legs: int64 - animal: string - ---- - year: [[2019,2021,2021,2020,2022,2022]] - n_legs: [[5,100,4,2,4,2]] - animal: [["Brittle stars","Centipede","Dog","Flamingo","Horse","Parrot"]] - """ - def take(self, indices: Indices) -> Self: - """ - Select rows from a Table or RecordBatch. - - See :func:`pyarrow.compute.take` for full usage. - - Parameters - ---------- - indices : Array or array-like - The indices in the tabular object whose rows will be returned. - - Returns - ------- - Table or RecordBatch - A tabular object with the same schema, containing the taken rows. - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> table = pa.Table.from_pandas(df) - >>> table.take([1, 3]) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2022,2021]] - n_legs: [[4,100]] - animals: [["Horse","Centipede"]] - """ - def filter( - self, mask: Mask | Expression, null_selection_behavior: NullSelectionBehavior = "drop" - ) -> Self: - """ - Select rows from the table or record batch based on a boolean mask. - - The Table can be filtered based on a mask, which will be passed to - :func:`pyarrow.compute.filter` to perform the filtering, or it can - be filtered through a boolean :class:`.Expression` - - Parameters - ---------- - mask : Array or array-like or .Expression - The boolean mask or the :class:`.Expression` to filter the table with. - null_selection_behavior : str, default "drop" - How nulls in the mask should be handled, does nothing if - an :class:`.Expression` is used. - - Returns - ------- - filtered : Table or RecordBatch - A tabular object of the same schema, with only the rows selected - by applied filtering - - Examples - -------- - Using a Table (works similarly for RecordBatch): - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - Define an expression and select rows: - - >>> import pyarrow.compute as pc - >>> expr = pc.field("year") <= 2020 - >>> table.filter(expr) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2019]] - n_legs: [[2,5]] - animals: [["Flamingo","Brittle stars"]] - - Define a mask and select rows: - - >>> mask = [True, True, False, None] - >>> table.filter(mask) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022]] - n_legs: [[2,4]] - animals: [["Flamingo","Horse"]] - >>> table.filter(mask, null_selection_behavior="emit_null") - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022,null]] - n_legs: [[2,4,null]] - animals: [["Flamingo","Horse",null]] - """ - def to_pydict( - self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None - ) -> dict[str, list[Any]]: - """ - Convert the Table or RecordBatch to a dict or OrderedDict. - - Parameters - ---------- - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - - If 'lossy', whenever duplicate keys are detected, a warning will be printed. - The last seen value of a duplicate key will be in the Python dictionary. - If 'strict', this instead results in an exception being raised when detected. - - Returns - ------- - dict - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... 
) - >>> table = pa.Table.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> table.to_pydict() - {'n_legs': [2, 2, 4, 4, 5, 100], 'animals': ['Flamingo', 'Parrot', ..., 'Centipede']} - """ - def to_pylist( - self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None - ) -> list[dict[str, Any]]: - """ - Convert the Table or RecordBatch to a list of rows / dictionaries. - - Parameters - ---------- - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - - If 'lossy', whenever duplicate keys are detected, a warning will be printed. - The last seen value of a duplicate key will be in the Python dictionary. - If 'strict', this instead results in an exception being raised when detected. - - Returns - ------- - list - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> data = [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]] - >>> table = pa.table(data, names=["n_legs", "animals"]) - >>> table.to_pylist() - [{'n_legs': 2, 'animals': 'Flamingo'}, {'n_legs': 4, 'animals': 'Horse'}, ... - """ - def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: - """ - Return human-readable string representation of Table or RecordBatch. - - Parameters - ---------- - show_metadata : bool, default False - Display Field-level and Schema-level KeyValueMetadata. - preview_cols : int, default 0 - Display values of the columns for the first N columns. - - Returns - ------- - str - """ - def remove_column(self, i: int) -> Self: ... - def drop_columns(self, columns: str | list[str]) -> Self: - """ - Drop one or more columns and return a new Table or RecordBatch. - - Parameters - ---------- - columns : str or list[str] - Field name(s) referencing existing column(s). - - Raises - ------ - KeyError - If any of the passed column names do not exist. - - Returns - ------- - Table or RecordBatch - A tabular object without the column(s). - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Drop one column: - - >>> table.drop_columns("animals") - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[2,4,5,100]] - - Drop one or more columns: - - >>> table.drop_columns(["n_legs", "animals"]) - pyarrow.Table - ... - ---- - """ - def add_column( - self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] - ) -> Self: ... - def append_column( - self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] - ) -> Self: - """ - Append column at end of columns. - - Parameters - ---------- - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array or value coercible to array - Column data. - - Returns - ------- - Table or RecordBatch - New table or record batch with the passed column added. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... 
"animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Append column at the end: - - >>> year = [2021, 2022, 2019, 2021] - >>> table.append_column("year", [year]) - pyarrow.Table - n_legs: int64 - animals: string - year: int64 - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - year: [[2021,2022,2019,2021]] - """ - -class RecordBatch(_Tabular[Array]): - """ - Batch of rows of columns of equal length - - Warnings - -------- - Do not call this class's constructor directly, use one of the - ``RecordBatch.from_*`` functions instead. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Constructing a RecordBatch from arrays: - - >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - Constructing a RecordBatch from pandas DataFrame: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022], - ... "month": [3, 5, 7, 9], - ... "day": [1, 5, 9, 13], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.RecordBatch.from_pandas(df) - pyarrow.RecordBatch - year: int64 - month: int64 - day: int64 - n_legs: int64 - animals: string - ---- - year: [2020,2022,2021,2022] - month: [3,5,7,9] - day: [1,5,9,13] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - >>> pa.RecordBatch.from_pandas(df).to_pandas() - year month day n_legs animals - 0 2020 3 1 2 Flamingo - 1 2022 5 5 4 Horse - 2 2021 7 9 5 Brittle stars - 3 2022 9 13 100 Centipede - - Constructing a RecordBatch from pylist: - - >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] - >>> pa.RecordBatch.from_pylist(pylist).to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Dog - - You can also construct a RecordBatch using :func:`pyarrow.record_batch`: - - >>> pa.record_batch([n_legs, animals], names=names).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - >>> pa.record_batch(df) - pyarrow.RecordBatch - year: int64 - month: int64 - day: int64 - n_legs: int64 - animals: string - ---- - year: [2020,2022,2021,2022] - month: [3,5,7,9] - day: [1,5,9,13] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - """ - - def validate(self, *, full: bool = False) -> None: - """ - Perform validation checks. An exception is raised if validation fails. - - By default only cheap validation checks are run. Pass `full=True` - for thorough validation checks (potentially O(n)). - - Parameters - ---------- - full : bool, default False - If True, run expensive checks, otherwise cheap checks only. 
- - Raises - ------ - ArrowInvalid - """ - def replace_schema_metadata( - self, metadata: dict[str | bytes, str | bytes] | None = None - ) -> Self: - """ - Create shallow copy of record batch by replacing schema - key-value metadata with the indicated new metadata (which may be None, - which deletes any existing metadata - - Parameters - ---------- - metadata : dict, default None - - Returns - ------- - shallow_copy : RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - - Constructing a RecordBatch with schema and metadata: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64())], metadata={"n_legs": "Number of legs per animal"} - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs], schema=my_schema) - >>> batch.schema - n_legs: int64 - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Shallow copy of a RecordBatch with deleted schema metadata: - - >>> batch.replace_schema_metadata().schema - n_legs: int64 - """ - @property - def num_columns(self) -> int: - """ - Number of columns - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.num_columns - 2 - """ - - @property - def num_rows(self) -> int: - """ - Number of rows - - Due to the definition of a RecordBatch, all columns have the same - number of rows. - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.num_rows - 6 - """ - @property - def schema(self) -> Schema: - """ - Schema of the RecordBatch and its columns - - Returns - ------- - pyarrow.Schema - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.schema - n_legs: int64 - animals: string - """ - @property - def nbytes(self) -> int: - """ - Total number of bytes consumed by the elements of the record batch. - - In other words, the sum of bytes from all buffer ranges referenced. - - Unlike `get_total_buffer_size` this method will account for array - offsets. - - If buffers are shared between arrays then the shared - portion will only be counted multiple times. - - The dictionary of dictionary arrays will always be counted in their - entirety even if the array only references a portion of the dictionary. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.nbytes - 116 - """ - def get_total_buffer_size(self) -> int: - """ - The sum of bytes in each buffer referenced by the record batch - - An array may only reference a portion of a buffer. - This method will overestimate in this case and return the - byte size of the entire buffer. 
- - If a buffer is referenced multiple times then it will - only be counted once. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.get_total_buffer_size() - 120 - """ - - def __sizeof__(self) -> int: ... - def add_column( - self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list - ) -> Self: - """ - Add column to RecordBatch at position i. - - A new record batch is returned with the column added, the original record batch - object is left unchanged. - - Parameters - ---------- - i : int - Index to place the column at. - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array or value coercible to array - Column data. - - Returns - ------- - RecordBatch - New record batch with the passed column added. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - - Add column: - - >>> year = [2021, 2022, 2019, 2021] - >>> batch.add_column(0, "year", year) - pyarrow.RecordBatch - year: int64 - n_legs: int64 - animals: string - ---- - year: [2021,2022,2019,2021] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - - Original record batch is left unchanged: - - >>> batch - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - """ - def remove_column(self, i: int) -> Self: - """ - Create new RecordBatch with the indicated column removed. - - Parameters - ---------- - i : int - Index of column to remove. - - Returns - ------- - Table - New record batch without the column. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - >>> batch.remove_column(1) - pyarrow.RecordBatch - n_legs: int64 - ---- - n_legs: [2,4,5,100] - """ - def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: - """ - Replace column in RecordBatch at position. - - Parameters - ---------- - i : int - Index to place the column at. - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array or value coercible to array - Column data. - - Returns - ------- - RecordBatch - New record batch with the passed column set. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - - Replace a column: - - >>> year = [2021, 2022, 2019, 2021] - >>> batch.set_column(1, "year", year) - pyarrow.RecordBatch - n_legs: int64 - year: int64 - ---- - n_legs: [2,4,5,100] - year: [2021,2022,2019,2021] - """ - @overload - def rename_columns(self, names: list[str]) -> Self: ... - @overload - def rename_columns(self, names: dict[str, str]) -> Self: ... 
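# Illustrative sketch of the two rename_columns overloads above, assuming a pyarrow
# version that accepts the dict form (as these stubs declare). The batch contents
# are made up for demonstration.
import pyarrow as pa

batch = pa.record_batch({"n_legs": [2, 4], "animals": ["Flamingo", "Dog"]})
by_position = batch.rename_columns(["n", "name"])                       # list: positional
by_mapping = batch.rename_columns({"n_legs": "n", "animals": "name"})   # dict: by old name
assert by_position.column_names == ["n", "name"]
assert by_mapping.column_names == ["n", "name"]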
- def rename_columns(self, names): - """ - Create new record batch with columns renamed to provided names. - - Parameters - ---------- - names : list[str] or dict[str, str] - List of new column names or mapping of old column names to new column names. - - If a mapping of old to new column names is passed, then all columns which are - found to match a provided old column name will be renamed to the new column name. - If any column names are not found in the mapping, a KeyError will be raised. - - Raises - ------ - KeyError - If any of the column names passed in the names mapping do not exist. - - Returns - ------- - RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - >>> new_names = ["n", "name"] - >>> batch.rename_columns(new_names) - pyarrow.RecordBatch - n: int64 - name: string - ---- - n: [2,4,5,100] - name: ["Flamingo","Horse","Brittle stars","Centipede"] - >>> new_names = {"n_legs": "n", "animals": "name"} - >>> batch.rename_columns(new_names) - pyarrow.RecordBatch - n: int64 - name: string - ---- - n: [2,4,5,100] - name: ["Flamingo","Horse","Brittle stars","Centipede"] - """ - def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: - """ - Write RecordBatch to Buffer as encapsulated IPC message, which does not - include a Schema. - - To reconstruct a RecordBatch from the encapsulated IPC message Buffer - returned by this function, a Schema must be passed separately. See - Examples. - - Parameters - ---------- - memory_pool : MemoryPool, default None - Uses default memory pool if not specified - - Returns - ------- - serialized : Buffer - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> buf = batch.serialize() - >>> buf - - - Reconstruct RecordBatch from IPC message Buffer and original Schema - - >>> pa.ipc.read_record_batch(buf, batch.schema) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - """ - def slice(self, offset: int = 0, length: int | None = None) -> Self: - """ - Compute zero-copy slice of this RecordBatch - - Parameters - ---------- - offset : int, default 0 - Offset from start of record batch to slice - length : int, default None - Length of slice (default is until end of batch starting from - offset) - - Returns - ------- - sliced : RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... 
) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - >>> batch.slice(offset=3).to_pandas() - n_legs animals - 0 4 Horse - 1 5 Brittle stars - 2 100 Centipede - >>> batch.slice(length=2).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - >>> batch.slice(offset=3, length=1).to_pandas() - n_legs animals - 0 4 Horse - """ - def equals(self, other: Self, check_metadata: bool = False) -> bool: - """ - Check if contents of two record batches are equal. - - Parameters - ---------- - other : pyarrow.RecordBatch - RecordBatch to compare against. - check_metadata : bool, default False - Whether schema metadata equality should be checked as well. - - Returns - ------- - are_equal : bool - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch_0 = pa.record_batch([]) - >>> batch_1 = pa.RecordBatch.from_arrays( - ... [n_legs, animals], - ... names=["n_legs", "animals"], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> batch.equals(batch) - True - >>> batch.equals(batch_0) - False - >>> batch.equals(batch_1) - True - >>> batch.equals(batch_1, check_metadata=True) - False - """ - def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: - """ - Select columns of the RecordBatch. - - Returns a new RecordBatch with the specified columns, and metadata - preserved. - - Parameters - ---------- - columns : list-like - The column names or integer indices to select. - - Returns - ------- - RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"]) - - Select columns my indices: - - >>> batch.select([1]) - pyarrow.RecordBatch - animals: string - ---- - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - - Select columns by names: - - >>> batch.select(["n_legs"]) - pyarrow.RecordBatch - n_legs: int64 - ---- - n_legs: [2,2,4,4,5,100] - """ - def cast( - self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None - ) -> Self: - """ - Cast record batch values to another schema. - - Parameters - ---------- - target_schema : Schema - Schema to cast to, the names and order of fields must match. - safe : bool, default True - Check for overflows or other unsafe conversions. - options : CastOptions, default None - Additional checks pass by CastOptions - - Returns - ------- - RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - >>> batch.schema - n_legs: int64 - animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... - - Define new schema and cast batch values: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] - ... 
) - >>> batch.cast(target_schema=my_schema) - pyarrow.RecordBatch - n_legs: duration[s] - animals: string - ---- - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - """ - @classmethod - def from_arrays( - cls, - arrays: Collection[Array], - names: list[str] | None = None, - schema: Schema | None = None, - metadata: Mapping[str | bytes, str | bytes] | None = None, - ) -> Self: - """ - Construct a RecordBatch from multiple pyarrow.Arrays - - Parameters - ---------- - arrays : list of pyarrow.Array - One for each field in RecordBatch - names : list of str, optional - Names for the batch fields. If not passed, schema must be passed - schema : Schema, default None - Schema for the created batch. If not passed, names must be passed - metadata : dict or Mapping, default None - Optional metadata for the schema (if inferred). - - Returns - ------- - pyarrow.RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> names = ["n_legs", "animals"] - - Construct a RecordBatch from pyarrow Arrays using names: - - >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - Construct a RecordBatch from pyarrow Arrays using schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - """ - @classmethod - def from_pandas( - cls, - df: pd.DataFrame, - schema: Schema | None = None, - preserve_index: bool | None = None, - nthreads: int | None = None, - columns: list[str] | None = None, - ) -> Self: - """ - Convert pandas.DataFrame to an Arrow RecordBatch - - Parameters - ---------- - df : pandas.DataFrame - schema : pyarrow.Schema, optional - The expected schema of the RecordBatch. This can be used to - indicate the type of columns if we cannot infer it automatically. - If passed, the output will have exactly this schema. Columns - specified in the schema that are not found in the DataFrame columns - or its index will raise an error. Additional columns or index - levels in the DataFrame which are not specified in the schema will - be ignored. - preserve_index : bool, optional - Whether to store the index as an additional column in the resulting - ``RecordBatch``. The default of None will store the index as a - column, except for RangeIndex which is stored as metadata only. Use - ``preserve_index=True`` to force it to be stored as a column. - nthreads : int, default None - If greater than 1, convert columns to Arrow in parallel using - indicated number of threads. By default, this follows - :func:`pyarrow.cpu_count` (may use up to system CPU count threads). 
- columns : list, optional - List of column to be converted. If None, use all columns. - - Returns - ------- - pyarrow.RecordBatch - - - Examples - -------- - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022], - ... "month": [3, 5, 7, 9], - ... "day": [1, 5, 9, 13], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - Convert pandas DataFrame to RecordBatch: - - >>> import pyarrow as pa - >>> pa.RecordBatch.from_pandas(df) - pyarrow.RecordBatch - year: int64 - month: int64 - day: int64 - n_legs: int64 - animals: string - ---- - year: [2020,2022,2021,2022] - month: [3,5,7,9] - day: [1,5,9,13] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - - Convert pandas DataFrame to RecordBatch using schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.RecordBatch.from_pandas(df, schema=my_schema) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - - Convert pandas DataFrame to RecordBatch specifying columns: - - >>> pa.RecordBatch.from_pandas(df, columns=["n_legs"]) - pyarrow.RecordBatch - n_legs: int64 - ---- - n_legs: [2,4,5,100] - """ - @classmethod - def from_struct_array( - cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] - ) -> Self: - """ - Construct a RecordBatch from a StructArray. - - Each field in the StructArray will become a column in the resulting - ``RecordBatch``. - - Parameters - ---------- - struct_array : StructArray - Array to construct the record batch from. - - Returns - ------- - pyarrow.RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) - >>> pa.RecordBatch.from_struct_array(struct).to_pandas() - animals n_legs year - 0 Parrot 2 NaN - 1 None 4 2022.0 - """ - def to_struct_array(self) -> StructArray: - """ - Convert to a struct array. - """ - def to_tensor( - self, - null_to_nan: bool = False, - row_major: bool = True, - memory_pool: MemoryPool | None = None, - ) -> Tensor: - """ - Convert to a :class:`~pyarrow.Tensor`. - - RecordBatches that can be converted have fields of type signed or unsigned - integer or float, including all bit-widths. - - ``null_to_nan`` is ``False`` by default and this method will raise an error in case - any nulls are present. RecordBatches with nulls can be converted with ``null_to_nan`` - set to ``True``. In this case null values are converted to ``NaN`` and integer type - arrays are promoted to the appropriate float type. - - Parameters - ---------- - null_to_nan : bool, default False - Whether to write null values in the result as ``NaN``. - row_major : bool, default True - Whether resulting Tensor is row-major or column-major - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Examples - -------- - >>> import pyarrow as pa - >>> batch = pa.record_batch( - ... [ - ... pa.array([1, 2, 3, 4, None], type=pa.int32()), - ... pa.array([10, 20, 30, 40, None], type=pa.float32()), - ... ], - ... names=["a", "b"], - ... 
) - - >>> batch - pyarrow.RecordBatch - a: int32 - b: float - ---- - a: [1,2,3,4,null] - b: [10,20,30,40,null] - - Convert a RecordBatch to row-major Tensor with null values - written as ``NaN``s - - >>> batch.to_tensor(null_to_nan=True) - - type: double - shape: (5, 2) - strides: (16, 8) - >>> batch.to_tensor(null_to_nan=True).to_numpy() - array([[ 1., 10.], - [ 2., 20.], - [ 3., 30.], - [ 4., 40.], - [nan, nan]]) - - Convert a RecordBatch to column-major Tensor - - >>> batch.to_tensor(null_to_nan=True, row_major=False) - - type: double - shape: (5, 2) - strides: (8, 40) - >>> batch.to_tensor(null_to_nan=True, row_major=False).to_numpy() - array([[ 1., 10.], - [ 2., 20.], - [ 3., 30.], - [ 4., 40.], - [nan, nan]]) - """ - def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): - """ - Export to a C ArrowArray struct, given its pointer. - - If a C ArrowSchema struct pointer is also given, the record batch - schema is exported to it at the same time. - - Parameters - ---------- - out_ptr: int - The raw pointer to a C ArrowArray struct. - out_schema_ptr: int (optional) - The raw pointer to a C ArrowSchema struct. - - Be careful: if you don't pass the ArrowArray struct to a consumer, - array memory will leak. This is a low-level function intended for - expert users. - """ - @classmethod - def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: - """ - Import RecordBatch from a C ArrowArray struct, given its pointer - and the imported schema. - - Parameters - ---------- - in_ptr: int - The raw pointer to a C ArrowArray struct. - type: Schema or int - Either a Schema object, or the raw pointer to a C ArrowSchema - struct. - - This is a low-level function intended for expert users. - """ - def __arrow_c_array__(self, requested_schema=None): - """ - Get a pair of PyCapsules containing a C ArrowArray representation of the object. - - Parameters - ---------- - requested_schema : PyCapsule | None - A PyCapsule containing a C ArrowSchema representation of a requested - schema. PyArrow will attempt to cast the batch to this schema. - If None, the batch will be returned as-is, with a schema matching the - one returned by :meth:`__arrow_c_schema__()`. - - Returns - ------- - Tuple[PyCapsule, PyCapsule] - A pair of PyCapsules containing a C ArrowSchema and ArrowArray, - respectively. - """ - def __arrow_c_stream__(self, requested_schema=None): - """ - Export the batch as an Arrow C stream PyCapsule. - - Parameters - ---------- - requested_schema : PyCapsule, default None - The schema to which the stream should be casted, passed as a - PyCapsule containing a C ArrowSchema representation of the - requested schema. - Currently, this is not supported and will raise a - NotImplementedError if the schema doesn't match the current schema. - - Returns - ------- - PyCapsule - """ - @classmethod - def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: - """ - Import RecordBatch from a pair of PyCapsules containing a C ArrowSchema - and ArrowArray, respectively. - - Parameters - ---------- - schema_capsule : PyCapsule - A PyCapsule containing a C ArrowSchema representation of the schema. - array_capsule : PyCapsule - A PyCapsule containing a C ArrowArray representation of the array. - - Returns - ------- - pyarrow.RecordBatch - """ - def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: - """ - Export to a C ArrowDeviceArray struct, given its pointer. 
- - If a C ArrowSchema struct pointer is also given, the record batch - schema is exported to it at the same time. - - Parameters - ---------- - out_ptr: int - The raw pointer to a C ArrowDeviceArray struct. - out_schema_ptr: int (optional) - The raw pointer to a C ArrowSchema struct. - - Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, - array memory will leak. This is a low-level function intended for - expert users. - """ - @classmethod - def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: - """ - Import RecordBatch from a C ArrowDeviceArray struct, given its pointer - and the imported schema. - - Parameters - ---------- - in_ptr: int - The raw pointer to a C ArrowDeviceArray struct. - type: Schema or int - Either a Schema object, or the raw pointer to a C ArrowSchema - struct. - - This is a low-level function intended for expert users. - """ - def __arrow_c_device_array__(self, requested_schema=None, **kwargs): - """ - Get a pair of PyCapsules containing a C ArrowDeviceArray representation - of the object. - - Parameters - ---------- - requested_schema : PyCapsule | None - A PyCapsule containing a C ArrowSchema representation of a requested - schema. PyArrow will attempt to cast the batch to this data type. - If None, the batch will be returned as-is, with a type matching the - one returned by :meth:`__arrow_c_schema__()`. - kwargs - Currently no additional keyword arguments are supported, but - this method will accept any keyword with a value of ``None`` - for compatibility with future keywords. - - Returns - ------- - Tuple[PyCapsule, PyCapsule] - A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, - respectively. - """ - @classmethod - def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: - """ - Import RecordBatch from a pair of PyCapsules containing a - C ArrowSchema and ArrowDeviceArray, respectively. - - Parameters - ---------- - schema_capsule : PyCapsule - A PyCapsule containing a C ArrowSchema representation of the schema. - array_capsule : PyCapsule - A PyCapsule containing a C ArrowDeviceArray representation of the array. - - Returns - ------- - pyarrow.RecordBatch - """ - @property - def device_type(self) -> DeviceAllocationType: - """ - The device type where the arrays in the RecordBatch reside. - - Returns - ------- - DeviceAllocationType - """ - @property - def is_cpu(self) -> bool: - """ - Whether the RecordBatch's arrays are CPU-accessible. - """ - def copy_to(self, destination: MemoryManager | Device) -> Self: - """ - Copy the entire RecordBatch to destination device. - - This copies each column of the record batch to create - a new record batch where all underlying buffers for the columns have - been copied to the destination MemoryManager. - - Parameters - ---------- - destination : pyarrow.MemoryManager or pyarrow.Device - The destination device to copy the array to. - - Returns - ------- - RecordBatch - """ - -def table_to_blocks(options, table: Table, categories, extension_columns): ... - -JoinType: TypeAlias = Literal[ - "left semi", - "right semi", - "left anti", - "right anti", - "inner", - "left outer", - "right outer", - "full outer", -] - -class Table(_Tabular[ChunkedArray[Any]]): - """ - A collection of top-level named, equal length Arrow arrays. - - Warnings - -------- - Do not call this class's constructor directly, use one of the ``from_*`` - methods instead. 
- - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Construct a Table from arrays: - - >>> pa.Table.from_arrays([n_legs, animals], names=names) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from a RecordBatch: - - >>> batch = pa.record_batch([n_legs, animals], names=names) - >>> pa.Table.from_batches([batch]) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from pandas DataFrame: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.Table.from_pandas(df) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022,2019,2021]] - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from a dictionary of arrays: - - >>> pydict = {"n_legs": n_legs, "animals": animals} - >>> pa.Table.from_pydict(pydict) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> pa.Table.from_pydict(pydict).schema - n_legs: int64 - animals: string - - Construct a Table from a dictionary of arrays with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from a list of rows: - - >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"year": 2021, "animals": "Centipede"}] - >>> pa.Table.from_pylist(pylist) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,null]] - animals: [["Flamingo","Centipede"]] - - Construct a Table from a list of rows with pyarrow schema: - - >>> my_schema = pa.schema( - ... [ - ... pa.field("year", pa.int64()), - ... pa.field("n_legs", pa.int64()), - ... pa.field("animals", pa.string()), - ... ], - ... metadata={"year": "Year of entry"}, - ... ) - >>> pa.Table.from_pylist(pylist, schema=my_schema).schema - year: int64 - n_legs: int64 - animals: string - -- schema metadata -- - year: 'Year of entry' - - Construct a Table with :func:`pyarrow.table`: - - >>> pa.table([n_legs, animals], names=names) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - - def validate(self, *, full: bool = False) -> None: - """ - Perform validation checks. An exception is raised if validation fails. - - By default only cheap validation checks are run. Pass `full=True` - for thorough validation checks (potentially O(n)). - - Parameters - ---------- - full : bool, default False - If True, run expensive checks, otherwise cheap checks only. - - Raises - ------ - ArrowInvalid - """ - def slice(self, offset: int = 0, length: int | None = None) -> Self: - """ - Compute zero-copy slice of this Table. - - Parameters - ---------- - offset : int, default 0 - Offset from start of table to slice. 
- length : int, default None - Length of slice (default is until end of table starting from - offset). - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.slice(length=3) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022,2019]] - n_legs: [[2,4,5]] - animals: [["Flamingo","Horse","Brittle stars"]] - >>> table.slice(offset=2) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2019,2021]] - n_legs: [[5,100]] - animals: [["Brittle stars","Centipede"]] - >>> table.slice(offset=2, length=1) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2019]] - n_legs: [[5]] - animals: [["Brittle stars"]] - """ - def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: - """ - Select columns of the Table. - - Returns a new Table with the specified columns, and metadata - preserved. - - Parameters - ---------- - columns : list-like - The column names or integer indices to select. - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.select([0, 1]) - pyarrow.Table - year: int64 - n_legs: int64 - ---- - year: [[2020,2022,2019,2021]] - n_legs: [[2,4,5,100]] - >>> table.select(["year"]) - pyarrow.Table - year: int64 - ---- - year: [[2020,2022,2019,2021]] - """ - def replace_schema_metadata( - self, metadata: dict[str | bytes, str | bytes] | None = None - ) -> Self: - """ - Create shallow copy of table by replacing schema - key-value metadata with the indicated new metadata (which may be None), - which deletes any existing metadata. - - Parameters - ---------- - metadata : dict, default None - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Constructing a Table with pyarrow schema and metadata: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> table = pa.table(df, my_schema) - >>> table.schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - pandas: ... - - Create a shallow copy of a Table with deleted schema metadata: - - >>> table.replace_schema_metadata().schema - n_legs: int64 - animals: string - - Create a shallow copy of a Table with new schema metadata: - - >>> metadata = {"animals": "Which animal"} - >>> table.replace_schema_metadata(metadata=metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - animals: 'Which animal' - """ - def flatten(self, memory_pool: MemoryPool | None = None) -> Self: - """ - Flatten this Table. - - Each column with a struct type is flattened - into one column per struct field. 
Other columns are left unchanged. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) - >>> month = pa.array([4, 6]) - >>> table = pa.Table.from_arrays([struct, month], names=["a", "month"]) - >>> table - pyarrow.Table - a: struct - child 0, animals: string - child 1, n_legs: int64 - child 2, year: int64 - month: int64 - ---- - a: [ - -- is_valid: all not null - -- child 0 type: string - ["Parrot",null] - -- child 1 type: int64 - [2,4] - -- child 2 type: int64 - [null,2022]] - month: [[4,6]] - - Flatten the columns with struct field: - - >>> table.flatten() - pyarrow.Table - a.animals: string - a.n_legs: int64 - a.year: int64 - month: int64 - ---- - a.animals: [["Parrot",null]] - a.n_legs: [[2,4]] - a.year: [[null,2022]] - month: [[4,6]] - """ - def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: - """ - Make a new table by combining the chunks this table has. - - All the underlying chunks in the ChunkedArray of each column are - concatenated into zero or one chunk. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool. - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> animals = pa.chunked_array( - ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] - ... ) - >>> names = ["n_legs", "animals"] - >>> table = pa.table([n_legs, animals], names=names) - >>> table - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,2,4],[4,5,100]] - animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] - >>> table.combine_chunks() - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,2,4,4,5,100]] - animals: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] - """ - def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: - """ - Unify dictionaries across all chunks. - - This method returns an equivalent table, but where all chunks of - each column share the same dictionary values. Dictionary indices - are transposed accordingly. - - Columns without dictionaries are returned unchanged. 
- - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() - >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() - >>> c_arr = pa.chunked_array([arr_1, arr_2]) - >>> table = pa.table([c_arr], names=["animals"]) - >>> table - pyarrow.Table - animals: dictionary - ---- - animals: [ -- dictionary: - ["Flamingo","Parrot","Dog"] -- indices: - [0,1,2], -- dictionary: - ["Horse","Brittle stars","Centipede"] -- indices: - [0,1,2]] - - Unify dictionaries across both chunks: - - >>> table.unify_dictionaries() - pyarrow.Table - animals: dictionary - ---- - animals: [ -- dictionary: - ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: - [0,1,2], -- dictionary: - ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: - [3,4,5]] - """ - def equals(self, other: Self, check_metadata: bool = False) -> Self: - """ - Check if contents of two tables are equal. - - Parameters - ---------- - other : pyarrow.Table - Table to compare against. - check_metadata : bool, default False - Whether schema metadata equality should be checked as well. - - Returns - ------- - bool - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> names = ["n_legs", "animals"] - >>> table = pa.Table.from_arrays([n_legs, animals], names=names) - >>> table_0 = pa.Table.from_arrays([]) - >>> table_1 = pa.Table.from_arrays( - ... [n_legs, animals], names=names, metadata={"n_legs": "Number of legs per animal"} - ... ) - >>> table.equals(table) - True - >>> table.equals(table_0) - False - >>> table.equals(table_1) - True - >>> table.equals(table_1, check_metadata=True) - False - """ - def cast( - self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None - ) -> Self: - """ - Cast table values to another schema. - - Parameters - ---------- - target_schema : Schema - Schema to cast to, the names and order of fields must match. - safe : bool, default True - Check for overflows or other unsafe conversions. - options : CastOptions, default None - Additional checks pass by CastOptions - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.schema - n_legs: int64 - animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... - - Define new schema and cast table values: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] - ... ) - >>> table.cast(target_schema=my_schema) - pyarrow.Table - n_legs: duration[s] - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - @classmethod - def from_pandas( - cls, - df: pd.DataFrame, - schema: Schema | None = None, - preserve_index: bool | None = None, - nthreads: int | None = None, - columns: list[str] | None = None, - safe: bool = True, - ) -> Self: - """ - Convert pandas.DataFrame to an Arrow Table. 
- - The column types in the resulting Arrow Table are inferred from the - dtypes of the pandas.Series in the DataFrame. In the case of non-object - Series, the NumPy dtype is translated to its Arrow equivalent. In the - case of `object`, we need to guess the datatype by looking at the - Python objects in this Series. - - Be aware that Series of the `object` dtype don't carry enough - information to always lead to a meaningful Arrow type. In the case that - we cannot infer a type, e.g. because the DataFrame is of length 0 or - the Series only contains None/nan objects, the type is set to - null. This behavior can be avoided by constructing an explicit schema - and passing it to this function. - - Parameters - ---------- - df : pandas.DataFrame - schema : pyarrow.Schema, optional - The expected schema of the Arrow Table. This can be used to - indicate the type of columns if we cannot infer it automatically. - If passed, the output will have exactly this schema. Columns - specified in the schema that are not found in the DataFrame columns - or its index will raise an error. Additional columns or index - levels in the DataFrame which are not specified in the schema will - be ignored. - preserve_index : bool, optional - Whether to store the index as an additional column in the resulting - ``Table``. The default of None will store the index as a column, - except for RangeIndex which is stored as metadata only. Use - ``preserve_index=True`` to force it to be stored as a column. - nthreads : int, default None - If greater than 1, convert columns to Arrow in parallel using - indicated number of threads. By default, this follows - :func:`pyarrow.cpu_count` (may use up to system CPU count threads). - columns : list, optional - List of column to be converted. If None, use all columns. - safe : bool, default True - Check for overflows or other unsafe conversions. - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.Table.from_pandas(df) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - @classmethod - def from_arrays( - cls, - arrays: Collection[ArrayOrChunkedArray[Any]], - names: list[str] | None = None, - schema: Schema | None = None, - metadata: Mapping[str | bytes, str | bytes] | None = None, - ) -> Self: - """ - Construct a Table from Arrow arrays. - - Parameters - ---------- - arrays : list of pyarrow.Array or pyarrow.ChunkedArray - Equal-length arrays that should form the table. - names : list of str, optional - Names for the table columns. If not passed, schema must be passed. - schema : Schema, default None - Schema for the created table. If not passed, names must be passed. - metadata : dict or Mapping, default None - Optional metadata for the schema (if inferred). 
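For reference, a minimal sketch of the two call shapes `Table.from_arrays` accepts as typed above: either column names or an explicit schema (the `metadata` argument only applies when the schema is inferred from names). Column names and values here are purely illustrative.

import pyarrow as pa

n_legs = pa.array([2, 4])
animals = pa.array(["Flamingo", "Horse"])

# Shape 1: pass column names; the schema is inferred from the arrays.
t1 = pa.Table.from_arrays([n_legs, animals], names=["n_legs", "animals"])

# Shape 2: pass an explicit schema; names are then taken from the schema.
schema = pa.schema([("n_legs", pa.int64()), ("animals", pa.string())])
t2 = pa.Table.from_arrays([n_legs, animals], schema=schema)

assert t1.schema.names == t2.schema.names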
- - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Construct a Table from arrays: - - >>> pa.Table.from_arrays([n_legs, animals], names=names) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from arrays with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from arrays with pyarrow schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"animals": "Name of the animal species"}, - ... ) - >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - animals: 'Name of the animal species' - """ - @classmethod - def from_struct_array( - cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] - ) -> Self: - """ - Construct a Table from a StructArray. - - Each field in the StructArray will become a column in the resulting - ``Table``. - - Parameters - ---------- - struct_array : StructArray or ChunkedArray - Array to construct the table from. - - Returns - ------- - pyarrow.Table - - Examples - -------- - >>> import pyarrow as pa - >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) - >>> pa.Table.from_struct_array(struct).to_pandas() - animals n_legs year - 0 Parrot 2 NaN - 1 None 4 2022.0 - """ - def to_struct_array( - self, max_chunksize: int | None = None - ) -> ChunkedArray[scalar.StructScalar]: - """ - Convert to a chunked array of struct type. - - Parameters - ---------- - max_chunksize : int, default None - Maximum number of rows for ChunkedArray chunks. Individual chunks - may be smaller depending on the chunk layout of individual columns. - - Returns - ------- - ChunkedArray - """ - @classmethod - def from_batches(cls, batches: Iterable[RecordBatch], schema: Schema | None = None) -> Self: - """ - Construct a Table from a sequence or iterator of Arrow RecordBatches. - - Parameters - ---------- - batches : sequence or iterator of RecordBatch - Sequence of RecordBatch to be converted, all schemas must be equal. - schema : Schema, default None - If not passed, will be inferred from the first RecordBatch. 
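A small illustrative sketch (made-up column data) of round-tripping between RecordBatches and a Table with `from_batches`/`to_batches`; all input batches must share one schema.

import pyarrow as pa

batch = pa.record_batch({"n_legs": [2, 4], "animals": ["Flamingo", "Horse"]})
table = pa.Table.from_batches([batch, batch])  # 4 rows in 2 chunks

# to_batches is zero-copy: it re-exposes the same chunks as RecordBatch objects.
assert sum(b.num_rows for b in table.to_batches()) == table.num_rows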
- - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - >>> batch = pa.record_batch([n_legs, animals], names=names) - >>> batch.to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - 2 5 Brittle stars - 3 100 Centipede - - Construct a Table from a RecordBatch: - - >>> pa.Table.from_batches([batch]) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from a sequence of RecordBatches: - - >>> pa.Table.from_batches([batch, batch]) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100],[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Flamingo","Horse","Brittle stars","Centipede"]] - """ - def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: - """ - Convert Table to a list of RecordBatch objects. - - Note that this method is zero-copy, it merely exposes the same data - under a different API. - - Parameters - ---------- - max_chunksize : int, default None - Maximum number of rows for each RecordBatch chunk. Individual chunks - may be smaller depending on the chunk layout of individual columns. - - Returns - ------- - list[RecordBatch] - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Convert a Table to a RecordBatch: - - >>> table.to_batches()[0].to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - 2 5 Brittle stars - 3 100 Centipede - - Convert a Table to a list of RecordBatches: - - >>> table.to_batches(max_chunksize=2)[0].to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - >>> table.to_batches(max_chunksize=2)[1].to_pandas() - n_legs animals - 0 5 Brittle stars - 1 100 Centipede - """ - def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: - """ - Convert the Table to a RecordBatchReader. - - Note that this method is zero-copy, it merely exposes the same data - under a different API. - - Parameters - ---------- - max_chunksize : int, default None - Maximum number of rows for each RecordBatch chunk. Individual chunks - may be smaller depending on the chunk layout of individual columns. - - Returns - ------- - RecordBatchReader - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Convert a Table to a RecordBatchReader: - - >>> table.to_reader() - - - >>> reader = table.to_reader() - >>> reader.schema - n_legs: int64 - animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... - >>> reader.read_all() - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - @property - def schema(self) -> Schema: - """ - Schema of the table and its columns. - - Returns - ------- - Schema - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... 
"animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.schema - n_legs: int64 - animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' ... - """ - @property - def num_columns(self) -> int: - """ - Number of columns in this table. - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.num_columns - 2 - """ - @property - def num_rows(self) -> int: - """ - Number of rows in this table. - - Due to the definition of a table, all columns have the same number of - rows. - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.num_rows - 4 - """ - @property - def nbytes(self) -> int: - """ - Total number of bytes consumed by the elements of the table. - - In other words, the sum of bytes from all buffer ranges referenced. - - Unlike `get_total_buffer_size` this method will account for array - offsets. - - If buffers are shared between arrays then the shared - portion will only be counted multiple times. - - The dictionary of dictionary arrays will always be counted in their - entirety even if the array only references a portion of the dictionary. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.nbytes - 72 - """ - def get_total_buffer_size(self) -> int: - """ - The sum of bytes in each buffer referenced by the table. - - An array may only reference a portion of a buffer. - This method will overestimate in this case and return the - byte size of the entire buffer. - - If a buffer is referenced multiple times then it will - only be counted once. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.get_total_buffer_size() - 76 - """ - def __sizeof__(self) -> int: ... - def add_column( - self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] - ) -> Self: - """ - Add column to Table at position. - - A new table is returned with the column added, the original table - object is left unchanged. - - Parameters - ---------- - i : int - Index to place the column at. - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array, list of Array, or values coercible to arrays - Column data. - - Returns - ------- - Table - New table with the passed column added. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> table = pa.Table.from_pandas(df) - - Add column: - - >>> year = [2021, 2022, 2019, 2021] - >>> table.add_column(0, "year", [year]) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2021,2022,2019,2021]] - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Original table is left unchanged: - - >>> table - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - def remove_column(self, i: int) -> Self: - """ - Create new Table with the indicated column removed. - - Parameters - ---------- - i : int - Index of column to remove. - - Returns - ------- - Table - New table without the column. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.remove_column(1) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[2,4,5,100]] - """ - def set_column( - self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] - ) -> Self: - """ - Replace column in Table at position. - - Parameters - ---------- - i : int - Index to place the column at. - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array, list of Array, or values coercible to arrays - Column data. - - Returns - ------- - Table - New table with the passed column set. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Replace a column: - - >>> year = [2021, 2022, 2019, 2021] - >>> table.set_column(1, "year", [year]) - pyarrow.Table - n_legs: int64 - year: int64 - ---- - n_legs: [[2,4,5,100]] - year: [[2021,2022,2019,2021]] - """ - @overload - def rename_columns(self, names: list[str]) -> Self: ... - @overload - def rename_columns(self, names: dict[str, str]) -> Self: ... - def rename_columns(self, names): - """ - Create new table with columns renamed to provided names. - - Parameters - ---------- - names : list[str] or dict[str, str] - List of new column names or mapping of old column names to new column names. - - If a mapping of old to new column names is passed, then all columns which are - found to match a provided old column name will be renamed to the new column name. - If any column names are not found in the mapping, a KeyError will be raised. - - Raises - ------ - KeyError - If any of the column names passed in the names mapping do not exist. - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> table = pa.Table.from_pandas(df) - >>> new_names = ["n", "name"] - >>> table.rename_columns(new_names) - pyarrow.Table - n: int64 - name: string - ---- - n: [[2,4,5,100]] - name: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> new_names = {"n_legs": "n", "animals": "name"} - >>> table.rename_columns(new_names) - pyarrow.Table - n: int64 - name: string - ---- - n: [[2,4,5,100]] - name: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - def drop(self, columns: str | list[str]) -> Self: - """ - Drop one or more columns and return a new table. - - Alias of Table.drop_columns, but kept for backwards compatibility. - - Parameters - ---------- - columns : str or list[str] - Field name(s) referencing existing column(s). - - Returns - ------- - Table - New table without the column(s). - """ - def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: - """ - Declare a grouping over the columns of the table. - - Resulting grouping can then be used to perform aggregations - with a subsequent ``aggregate()`` method. - - Parameters - ---------- - keys : str or list[str] - Name of the columns that should be used as the grouping key. - use_threads : bool, default True - Whether to use multithreading or not. When set to True (the - default), no stable ordering of the output is guaranteed. - - Returns - ------- - TableGroupBy - - See Also - -------- - TableGroupBy.aggregate - - Examples - -------- - >>> import pandas as pd - >>> import pyarrow as pa - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.group_by("year").aggregate([("n_legs", "sum")]) - pyarrow.Table - year: int64 - n_legs_sum: int64 - ---- - year: [[2020,2022,2021,2019]] - n_legs_sum: [[2,6,104,5]] - """ - def join( - self, - right_table: Self, - keys: str | list[str], - right_keys: str | list[str] | None = None, - join_type: JoinType = "left outer", - left_suffix: str | None = None, - right_suffix: str | None = None, - coalesce_keys: bool = True, - use_threads: bool = True, - ) -> Self: - """ - Perform a join between this table and another one. - - Result of the join will be a new Table, where further - operations can be applied. - - Parameters - ---------- - right_table : Table - The table to join to the current one, acting as the right table - in the join operation. - keys : str or list[str] - The columns from current table that should be used as keys - of the join operation left side. - right_keys : str or list[str], default None - The columns from the right_table that should be used as keys - on the join operation right side. - When ``None`` use the same key names as the left table. - join_type : str, default "left outer" - The kind of join that should be performed, one of - ("left semi", "right semi", "left anti", "right anti", - "inner", "left outer", "right outer", "full outer") - left_suffix : str, default None - Which suffix to add to left column names. This prevents confusion - when the columns in left and right tables have colliding names. - right_suffix : str, default None - Which suffix to add to the right column names. This prevents confusion - when the columns in left and right tables have colliding names. - coalesce_keys : bool, default True - If the duplicated keys should be omitted from one of the sides - in the join result. 
- use_threads : bool, default True - Whether to use multithreading or not. - - Returns - ------- - Table - - Examples - -------- - >>> import pandas as pd - >>> import pyarrow as pa - >>> df1 = pd.DataFrame({"id": [1, 2, 3], "year": [2020, 2022, 2019]}) - >>> df2 = pd.DataFrame( - ... {"id": [3, 4], "n_legs": [5, 100], "animal": ["Brittle stars", "Centipede"]} - ... ) - >>> t1 = pa.Table.from_pandas(df1) - >>> t2 = pa.Table.from_pandas(df2) - - Left outer join: - - >>> t1.join(t2, "id").combine_chunks().sort_by("year") - pyarrow.Table - id: int64 - year: int64 - n_legs: int64 - animal: string - ---- - id: [[3,1,2]] - year: [[2019,2020,2022]] - n_legs: [[5,null,null]] - animal: [["Brittle stars",null,null]] - - Full outer join: - - >>> t1.join(t2, "id", join_type="full outer").combine_chunks().sort_by("year") - pyarrow.Table - id: int64 - year: int64 - n_legs: int64 - animal: string - ---- - id: [[3,1,2,4]] - year: [[2019,2020,2022,null]] - n_legs: [[5,null,null,100]] - animal: [["Brittle stars",null,null,"Centipede"]] - - Right outer join: - - >>> t1.join(t2, "id", join_type="right outer").combine_chunks().sort_by("year") - pyarrow.Table - year: int64 - id: int64 - n_legs: int64 - animal: string - ---- - year: [[2019,null]] - id: [[3,4]] - n_legs: [[5,100]] - animal: [["Brittle stars","Centipede"]] - - Right anti join - - >>> t1.join(t2, "id", join_type="right anti") - pyarrow.Table - id: int64 - n_legs: int64 - animal: string - ---- - id: [[4]] - n_legs: [[100]] - animal: [["Centipede"]] - """ - def join_asof( - self, - right_table: Self, - on: str, - by: str | list[str], - tolerance: int, - right_on: str | list[str] | None = None, - right_by: str | list[str] | None = None, - ) -> Self: - """ - Perform an asof join between this table and another one. - - This is similar to a left-join except that we match on nearest key rather - than equal keys. Both tables must be sorted by the key. This type of join - is most useful for time series data that are not perfectly aligned. - - Optionally match on equivalent keys with "by" before searching with "on". - - Result of the join will be a new Table, where further - operations can be applied. - - Parameters - ---------- - right_table : Table - The table to join to the current one, acting as the right table - in the join operation. - on : str - The column from current table that should be used as the "on" key - of the join operation left side. - - An inexact match is used on the "on" key, i.e. a row is considered a - match if and only if left_on - tolerance <= right_on <= left_on. - - The input dataset must be sorted by the "on" key. Must be a single - field of a common type. - - Currently, the "on" key must be an integer, date, or timestamp type. - by : str or list[str] - The columns from current table that should be used as the keys - of the join operation left side. The join operation is then done - only for the matches in these columns. - tolerance : int - The tolerance for inexact "on" key matching. A right row is considered - a match with the left row ``right.on - left.on <= tolerance``. The - ``tolerance`` may be: - - - negative, in which case a past-as-of-join occurs; - - or positive, in which case a future-as-of-join occurs; - - or zero, in which case an exact-as-of-join occurs. - - The tolerance is interpreted in the same units as the "on" key. - right_on : str or list[str], default None - The columns from the right_table that should be used as the on key - on the join operation right side. 
- When ``None`` use the same key name as the left table. - right_by : str or list[str], default None - The columns from the right_table that should be used as keys - on the join operation right side. - When ``None`` use the same key names as the left table. - - Returns - ------- - Table - - Example - -------- - >>> import pyarrow as pa - >>> t1 = pa.table({"id": [1, 3, 2, 3, 3], "year": [2020, 2021, 2022, 2022, 2023]}) - >>> t2 = pa.table( - ... { - ... "id": [3, 4], - ... "year": [2020, 2021], - ... "n_legs": [5, 100], - ... "animal": ["Brittle stars", "Centipede"], - ... } - ... ) - - >>> t1.join_asof(t2, on="year", by="id", tolerance=-2) - pyarrow.Table - id: int64 - year: int64 - n_legs: int64 - animal: string - ---- - id: [[1,3,2,3,3]] - year: [[2020,2021,2022,2022,2023]] - n_legs: [[null,5,null,5,null]] - animal: [[null,"Brittle stars",null,"Brittle stars",null]] - """ - def __arrow_c_stream__(self, requested_schema=None): - """ - Export the table as an Arrow C stream PyCapsule. - - Parameters - ---------- - requested_schema : PyCapsule, default None - The schema to which the stream should be casted, passed as a - PyCapsule containing a C ArrowSchema representation of the - requested schema. - Currently, this is not supported and will raise a - NotImplementedError if the schema doesn't match the current schema. - - Returns - ------- - PyCapsule - """ - @property - def is_cpu(self) -> bool: - """ - Whether all ChunkedArrays are CPU-accessible. - """ - -def record_batch( - data: list[ArrayOrChunkedArray[Any]] - | dict[str, list[Any] | Array[Any]] - | Iterable[Array[Any]] - | pd.DataFrame - | SupportArrowArray - | SupportArrowDeviceArray, - names: list[str] | None = None, - schema: Schema | None = None, - metadata: Mapping[str | bytes, str | bytes] | None = None, -) -> RecordBatch: - """ - Create a pyarrow.RecordBatch from another Python data structure or sequence - of arrays. - - Parameters - ---------- - data : dict, list, pandas.DataFrame, Arrow-compatible table - A mapping of strings to Arrays or Python lists, a list of Arrays, - a pandas DataFame, or any tabular object implementing the - Arrow PyCapsule Protocol (has an ``__arrow_c_array__`` or - ``__arrow_c_device_array__`` method). - names : list, default None - Column names if list of arrays passed as data. Mutually exclusive with - 'schema' argument. - schema : Schema, default None - The expected schema of the RecordBatch. If not passed, will be inferred - from the data. Mutually exclusive with 'names' argument. - metadata : dict or Mapping, default None - Optional metadata for the schema (if schema not passed). 
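A minimal sketch (illustrative column names and values) of the mutually exclusive `names`/`schema` arguments of `record_batch` described above:

import pyarrow as pa

n_legs = pa.array([2, 4, 5, 100])
animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"])

# Either infer the schema from explicit column names...
by_names = pa.record_batch([n_legs, animals], names=["n_legs", "animals"])

# ...or supply the schema directly (but not both at once).
by_schema = pa.record_batch(
    [n_legs, animals],
    schema=pa.schema([("n_legs", pa.int64()), ("animals", pa.string())]),
)
assert by_names.schema.names == by_schema.schema.names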
- - Returns - ------- - RecordBatch - - See Also - -------- - RecordBatch.from_arrays, RecordBatch.from_pandas, table - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Construct a RecordBatch from a python dictionary: - - >>> pa.record_batch({"n_legs": n_legs, "animals": animals}) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - >>> pa.record_batch({"n_legs": n_legs, "animals": animals}).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - Creating a RecordBatch from a list of arrays with names: - - >>> pa.record_batch([n_legs, animals], names=names) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - - Creating a RecordBatch from a list of arrays with names and metadata: - - >>> my_metadata = {"n_legs": "How many legs does an animal have?"} - >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'How many legs does an animal have?' - - Creating a RecordBatch from a pandas DataFrame: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022], - ... "month": [3, 5, 7, 9], - ... "day": [1, 5, 9, 13], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.record_batch(df) - pyarrow.RecordBatch - year: int64 - month: int64 - day: int64 - n_legs: int64 - animals: string - ---- - year: [2020,2022,2021,2022] - month: [3,5,7,9] - day: [1,5,9,13] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - - >>> pa.record_batch(df).to_pandas() - year month day n_legs animals - 0 2020 3 1 2 Flamingo - 1 2022 5 5 4 Horse - 2 2021 7 9 5 Brittle stars - 3 2022 9 13 100 Centipede - - Creating a RecordBatch from a pandas DataFrame with schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.record_batch(df, my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - pandas: ... - >>> pa.record_batch(df, my_schema).to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - 2 5 Brittle stars - 3 100 Centipede - """ - -@overload -def table( - data: dict[str, list[Any] | Array[Any]], - schema: Schema | None = None, - metadata: Mapping[str | bytes, str | bytes] | None = None, - nthreads: int | None = None, -) -> Table: ... -@overload -def table( - data: Collection[ArrayOrChunkedArray[Any]] - | pd.DataFrame - | SupportArrowArray - | SupportArrowStream - | SupportArrowDeviceArray, - names: list[str] | None = None, - schema: Schema | None = None, - metadata: Mapping[str | bytes, str | bytes] | None = None, - nthreads: int | None = None, -) -> Table: ... 
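A short sketch of how the two `table()` overloads above are selected in practice: a mapping of column names to values matches the first overload (no `names` argument), while a sequence of arrays takes explicit `names`. Values are illustrative only.

import pyarrow as pa

# First overload: mapping of column name -> values.
t1 = pa.table({"n_legs": [2, 4], "animals": ["Flamingo", "Horse"]})

# Second overload: sequence of arrays plus explicit column names.
t2 = pa.table(
    [pa.array([2, 4]), pa.array(["Flamingo", "Horse"])],
    names=["n_legs", "animals"],
)
assert t1.equals(t2)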
-def table(*args, **kwargs): - """ - Create a pyarrow.Table from a Python data structure or sequence of arrays. - - Parameters - ---------- - data : dict, list, pandas.DataFrame, Arrow-compatible table - A mapping of strings to Arrays or Python lists, a list of arrays or - chunked arrays, a pandas DataFame, or any tabular object implementing - the Arrow PyCapsule Protocol (has an ``__arrow_c_array__``, - ``__arrow_c_device_array__`` or ``__arrow_c_stream__`` method). - names : list, default None - Column names if list of arrays passed as data. Mutually exclusive with - 'schema' argument. - schema : Schema, default None - The expected schema of the Arrow Table. If not passed, will be inferred - from the data. Mutually exclusive with 'names' argument. - If passed, the output will have exactly this schema (raising an error - when columns are not found in the data and ignoring additional data not - specified in the schema, when data is a dict or DataFrame). - metadata : dict or Mapping, default None - Optional metadata for the schema (if schema not passed). - nthreads : int, default None - For pandas.DataFrame inputs: if greater than 1, convert columns to - Arrow in parallel using indicated number of threads. By default, - this follows :func:`pyarrow.cpu_count` (may use up to system CPU count - threads). - - Returns - ------- - Table - - See Also - -------- - Table.from_arrays, Table.from_pandas, Table.from_pydict - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Construct a Table from a python dictionary: - - >>> pa.table({"n_legs": n_legs, "animals": animals}) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from arrays: - - >>> pa.table([n_legs, animals], names=names) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from arrays with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.table([n_legs, animals], names=names, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from pandas DataFrame: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.table(df) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022,2019,2021]] - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from pandas DataFrame with pyarrow schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.table(df, my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - pandas: '{"index_columns": [], "column_indexes": [{"name": null, ... - - Construct a Table from chunked arrays: - - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> animals = pa.chunked_array( - ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] - ... 
) - >>> table = pa.table([n_legs, animals], names=names) - >>> table - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,2,4],[4,5,100]] - animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] - """ - -def concat_tables( - tables: Iterable[Table], - memory_pool: MemoryPool | None = None, - promote_options: Literal["none", "default", "permissive"] = "none", - **kwargs: Any, -) -> Table: - """ - Concatenate pyarrow.Table objects. - - If promote_options="none", a zero-copy concatenation will be performed. The schemas - of all the Tables must be the same (except the metadata), otherwise an - exception will be raised. The result Table will share the metadata with the - first table. - - If promote_options="default", any null type arrays will be casted to the type of other - arrays in the column of the same name. If a table is missing a particular - field, null values of the appropriate type will be generated to take the - place of the missing field. The new schema will share the metadata with the - first table. Each field in the new schema will share the metadata with the - first table which has the field defined. Note that type promotions may - involve additional allocations on the given ``memory_pool``. - - If promote_options="permissive", the behavior of default plus types will be promoted - to the common denominator that fits all the fields. - - Parameters - ---------- - tables : iterable of pyarrow.Table objects - Pyarrow tables to concatenate into a single Table. - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool. - promote_options : str, default none - Accepts strings "none", "default" and "permissive". - **kwargs : dict, optional - - Examples - -------- - >>> import pyarrow as pa - >>> t1 = pa.table( - ... [ - ... pa.array([2, 4, 5, 100]), - ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), - ... ], - ... names=["n_legs", "animals"], - ... ) - >>> t2 = pa.table([pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"]) - >>> pa.concat_tables([t1, t2]) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100],[2,4]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Parrot","Dog"]] - - """ - -class TableGroupBy: - """ - A grouping of columns in a table on which to perform aggregations. - - Parameters - ---------- - table : pyarrow.Table - Input table to execute the aggregation on. - keys : str or list[str] - Name of the grouped columns. - use_threads : bool, default True - Whether to use multithreading or not. When set to True (the default), - no stable ordering of the output is guaranteed. - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.table( - ... [ - ... pa.array(["a", "a", "b", "b", "c"]), - ... pa.array([1, 2, 3, 4, 5]), - ... ], - ... names=["keys", "values"], - ... ) - - Grouping of columns: - - >>> pa.TableGroupBy(t, "keys") - - - Perform aggregations: - - >>> pa.TableGroupBy(t, "keys").aggregate([("values", "sum")]) - pyarrow.Table - keys: string - values_sum: int64 - ---- - keys: [["a","b","c"]] - values_sum: [[3,7,5]] - """ - - keys: str | list[str] - def __init__(self, table: Table, keys: str | list[str], use_threads: bool = True): ... - def aggregate( - self, - aggregations: Iterable[ - tuple[ColumnSelector, Aggregation] - | tuple[ColumnSelector, Aggregation, AggregateOptions | None] - ], - ) -> Table: - """ - Perform an aggregation over the grouped columns of the table. 
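To illustrate the `promote_options` behaviour of `concat_tables` documented above, a minimal sketch with made-up tables whose schemas differ by one column:

import pyarrow as pa

t1 = pa.table({"n_legs": [2, 4]})
t2 = pa.table({"n_legs": [5, 100], "animals": ["Brittle stars", "Centipede"]})

# "none" would raise on the schema mismatch; "default" fills the column
# missing from t1 with nulls of the appropriate type.
combined = pa.concat_tables([t1, t2], promote_options="default")
assert combined.column("animals").null_count == 2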
- - Parameters - ---------- - aggregations : list[tuple(str, str)] or \ -list[tuple(str, str, FunctionOptions)] - List of tuples, where each tuple is one aggregation specification - and consists of: aggregation column name followed - by function name and optionally aggregation function option. - Pass empty list to get a single row for each group. - The column name can be a string, an empty list or a list of - column names, for unary, nullary and n-ary aggregation functions - respectively. - - For the list of function names and respective aggregation - function options see :ref:`py-grouped-aggrs`. - - Returns - ------- - Table - Results of the aggregation functions. - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.table([ - ... pa.array(["a", "a", "b", "b", "c"]), - ... pa.array([1, 2, 3, 4, 5]), - ... ], names=["keys", "values"]) - - Sum the column "values" over the grouped column "keys": - - >>> t.group_by("keys").aggregate([("values", "sum")]) - pyarrow.Table - keys: string - values_sum: int64 - ---- - keys: [["a","b","c"]] - values_sum: [[3,7,5]] - - Count the rows over the grouped column "keys": - - >>> t.group_by("keys").aggregate([([], "count_all")]) - pyarrow.Table - keys: string - count_all: int64 - ---- - keys: [["a","b","c"]] - count_all: [[2,2,1]] - - Do multiple aggregations: - - >>> t.group_by("keys").aggregate([ - ... ("values", "sum"), - ... ("keys", "count") - ... ]) - pyarrow.Table - keys: string - values_sum: int64 - keys_count: int64 - ---- - keys: [["a","b","c"]] - values_sum: [[3,7,5]] - keys_count: [[2,2,1]] - - Count the number of non-null values for column "values" - over the grouped column "keys": - - >>> import pyarrow.compute as pc - >>> t.group_by(["keys"]).aggregate([ - ... ("values", "count", pc.CountOptions(mode="only_valid")) - ... ]) - pyarrow.Table - keys: string - values_count: int64 - ---- - keys: [["a","b","c"]] - values_count: [[2,2,1]] - - Get a single row for each group in column "keys": - - >>> t.group_by("keys").aggregate([]) - pyarrow.Table - keys: string - ---- - keys: [["a","b","c"]] - """ - def _table(self) -> Table: ... - @property - def _use_threads(self) -> bool: ... - -def concat_batches( - recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None -) -> RecordBatch: - """ - Concatenate pyarrow.RecordBatch objects. - - All recordbatches must share the same Schema, - the operation implies a copy of the data to merge - the arrays of the different RecordBatches. - - Parameters - ---------- - recordbatches : iterable of pyarrow.RecordBatch objects - Pyarrow record batches to concatenate into a single RecordBatch. - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool. - - Examples - -------- - >>> import pyarrow as pa - >>> t1 = pa.record_batch( - ... [ - ... pa.array([2, 4, 5, 100]), - ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), - ... ], - ... names=["n_legs", "animals"], - ... ) - >>> t2 = pa.record_batch( - ... [pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"] - ... 
) - >>> pa.concat_batches([t1, t2]) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,4,5,100,2,4] - animals: ["Flamingo","Horse","Brittle stars","Centipede","Parrot","Dog"] - - """ - -__all__ = [ - "ChunkedArray", - "chunked_array", - "_Tabular", - "RecordBatch", - "table_to_blocks", - "Table", - "record_batch", - "table", - "concat_tables", - "TableGroupBy", - "concat_batches", -] diff --git a/python/pyarrow/_azurefs.pyi b/python/pyarrow/_azurefs.pyi deleted file mode 100644 index b9a83f01c56..00000000000 --- a/python/pyarrow/_azurefs.pyi +++ /dev/null @@ -1,91 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import Literal - -from ._fs import FileSystem - -class AzureFileSystem(FileSystem): - """ - Azure Blob Storage backed FileSystem implementation - - This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a. - Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific - features will be used when they provide a performance advantage. Azurite emulator is - also supported. Note: `/` is the only supported delimiter. - - The storage account is considered the root of the filesystem. When enabled, containers - will be created or deleted during relevant directory operations. Obviously, this also - requires authentication with the additional permissions. - - By default `DefaultAzureCredential `__ - is used for authentication. This means it will try several types of authentication - and go with the first one that works. If any authentication parameters are provided when - initialising the FileSystem, they will be used instead of the default credential. - - Parameters - ---------- - account_name : str - Azure Blob Storage account name. This is the globally unique identifier for the - storage account. - account_key : str, default None - Account key of the storage account. If sas_token and account_key are None the - default credential will be used. The parameters account_key and sas_token are - mutually exclusive. - blob_storage_authority : str, default None - hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful - for connecting to a local emulator, like Azurite. - dfs_storage_authority : str, default None - hostname[:port] of the Data Lake Gen 2 Service. Defaults to - `.dfs.core.windows.net`. Useful for connecting to a local emulator, like Azurite. - blob_storage_scheme : str, default None - Either `http` or `https`. Defaults to `https`. Useful for connecting to a local - emulator, like Azurite. - dfs_storage_scheme : str, default None - Either `http` or `https`. Defaults to `https`. Useful for connecting to a local - emulator, like Azurite. 
- sas_token : str, default None - SAS token for the storage account, used as an alternative to account_key. If sas_token - and account_key are None the default credential will be used. The parameters - account_key and sas_token are mutually exclusive. - - Examples - -------- - >>> from pyarrow import fs - >>> azure_fs = fs.AzureFileSystem(account_name="myaccount") - >>> azurite_fs = fs.AzureFileSystem( - ... account_name="devstoreaccount1", - ... account_key="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==", - ... blob_storage_authority="127.0.0.1:10000", - ... dfs_storage_authority="127.0.0.1:10000", - ... blob_storage_scheme="http", - ... dfs_storage_scheme="http", - ... ) - - For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. - """ - - def __init__( - self, - account_name: str, - account_key: str | None = None, - blob_storage_authority: str | None = None, - dfs_storage_authority: str | None = None, - blob_storage_schema: Literal["http", "https"] = "https", - dfs_storage_schema: Literal["http", "https"] = "https", - sas_token: str | None = None, - ) -> None: ... diff --git a/python/pyarrow/_compute.pyi b/python/pyarrow/_compute.pyi deleted file mode 100644 index fa80304cf91..00000000000 --- a/python/pyarrow/_compute.pyi +++ /dev/null @@ -1,1768 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import ( - Any, - Callable, - Iterable, - Literal, - Sequence, - TypeAlias, - TypedDict, - overload, -) - -from . import lib -from .compute import _NumericScalarT - -_Order: TypeAlias = Literal["ascending", "descending"] -_Placement: TypeAlias = Literal["at_start", "at_end"] - -class Kernel(lib._Weakrefable): - """ - A kernel object. - - Kernels handle the execution of a Function for a certain signature. - """ - -class Function(lib._Weakrefable): - """ - A compute function. - - A function implements a certain logical computation over a range of - possible input signatures. Each signature accepts a range of input - types and is implemented by a given Kernel. - - Functions can be of different kinds: - - * "scalar" functions apply an item-wise computation over all items - of their inputs. Each item in the output only depends on the values - of the inputs at the same position. Examples: addition, comparisons, - string predicates... - - * "vector" functions apply a collection-wise computation, such that - each item in the output may depend on the values of several items - in each input. Examples: dictionary encoding, sorting, extracting - unique values... - - * "scalar_aggregate" functions reduce the dimensionality of the inputs by - applying a reduction function. Examples: sum, min_max, mode... 
- - * "hash_aggregate" functions apply a reduction function to an input - subdivided by grouping criteria. They may not be directly called. - Examples: hash_sum, hash_min_max... - - * "meta" functions dispatch to other functions. - """ - @property - def arity(self) -> int: - """ - The function arity. - - If Ellipsis (i.e. `...`) is returned, the function takes a variable - number of arguments. - """ - @property - def kind( - self, - ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: - """ - The function kind. - """ - @property - def name(self) -> str: - """ - The function name. - """ - @property - def num_kernels(self) -> int: - """ - The number of kernels implementing this function. - """ - @property - def kernels(self) -> list[ScalarKernel]: - """ - A list of all kernels implementing this function. - """ - def call( - self, - args: Iterable, - options: FunctionOptions | None = None, - memory_pool: lib.MemoryPool | None = None, - length: int | None = None, - ) -> Any: - """ - Call the function on the given arguments. - - Parameters - ---------- - args : iterable - The arguments to pass to the function. Accepted types depend - on the specific function. - options : FunctionOptions, optional - Options instance for executing this function. This should have - the right concrete options type. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - length : int, optional - Batch size for execution, for nullary (no argument) functions. If - not passed, will be inferred from passed data. - """ - -class FunctionOptions(lib._Weakrefable): - def serialize(self) -> lib.Buffer: ... - @classmethod - def deserialize(cls, buf: lib.Buffer) -> FunctionOptions: ... - -class FunctionRegistry(lib._Weakrefable): - def get_function(self, name: str) -> Function: - """ - Look up a function by name in the registry. - - Parameters - ---------- - name : str - The name of the function to lookup - """ - - def list_functions(self) -> list[str]: - """ - Return all function names in the registry. - """ - -class HashAggregateFunction(Function): ... -class HashAggregateKernel(Kernel): ... -class ScalarAggregateFunction(Function): ... -class ScalarAggregateKernel(Kernel): ... -class ScalarFunction(Function): ... -class ScalarKernel(Kernel): ... -class VectorFunction(Function): ... -class VectorKernel(Kernel): ... - -# ==================== _compute.pyx Option classes ==================== -class ArraySortOptions(FunctionOptions): - """ - Options for the `array_sort_indices` function. - - Parameters - ---------- - order : str, default "ascending" - Which order to sort values in. - Accepted values are "ascending", "descending". - null_placement : str, default "at_end" - Where nulls in the input should be sorted. - Accepted values are "at_start", "at_end". - """ - def __init__( - self, - order: _Order = "ascending", - null_placement: _Placement = "at_end", - ) -> None: ... - -class AssumeTimezoneOptions(FunctionOptions): - """ - Options for the `assume_timezone` function. - - Parameters - ---------- - timezone : str - Timezone to assume for the input. - ambiguous : str, default "raise" - How to handle timestamps that are ambiguous in the assumed timezone. - Accepted values are "raise", "earliest", "latest". - nonexistent : str, default "raise" - How to handle timestamps that don't exist in the assumed timezone. - Accepted values are "raise", "earliest", "latest". 
- """ - - def __init__( - self, - timezone: str, - *, - ambiguous: Literal["raise", "earliest", "latest"] = "raise", - nonexistent: Literal["raise", "earliest", "latest"] = "raise", - ) -> None: ... - -class CastOptions(FunctionOptions): - """ - Options for the `cast` function. - - Parameters - ---------- - target_type : DataType, optional - The PyArrow type to cast to. - allow_int_overflow : bool, default False - Whether integer overflow is allowed when casting. - allow_time_truncate : bool, default False - Whether time precision truncation is allowed when casting. - allow_time_overflow : bool, default False - Whether date/time range overflow is allowed when casting. - allow_decimal_truncate : bool, default False - Whether decimal precision truncation is allowed when casting. - allow_float_truncate : bool, default False - Whether floating-point precision truncation is allowed when casting. - allow_invalid_utf8 : bool, default False - Whether producing invalid utf8 data is allowed when casting. - """ - - allow_int_overflow: bool - allow_time_truncate: bool - allow_time_overflow: bool - allow_decimal_truncate: bool - allow_float_truncate: bool - allow_invalid_utf8: bool - - def __init__( - self, - target_type: lib.DataType | None = None, - *, - allow_int_overflow: bool | None = None, - allow_time_truncate: bool | None = None, - allow_time_overflow: bool | None = None, - allow_decimal_truncate: bool | None = None, - allow_float_truncate: bool | None = None, - allow_invalid_utf8: bool | None = None, - ) -> None: ... - @staticmethod - def safe(target_type: lib.DataType | None = None) -> CastOptions: ... - @staticmethod - def unsafe(target_type: lib.DataType | None = None) -> CastOptions: ... - def is_safe(self) -> bool: ... - -class CountOptions(FunctionOptions): - """ - Options for the `count` function. - - Parameters - ---------- - mode : str, default "only_valid" - Which values to count in the input. - Accepted values are "only_valid", "only_null", "all". - """ - def __init__(self, mode: Literal["only_valid", "only_null", "all"] = "only_valid") -> None: ... - -class CumulativeOptions(FunctionOptions): - """ - Options for `cumulative_*` functions. - - - cumulative_sum - - cumulative_sum_checked - - cumulative_prod - - cumulative_prod_checked - - cumulative_max - - cumulative_min - - Parameters - ---------- - start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. - skip_nulls : bool, default False - When false, the first encountered null is propagated. - """ - def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... - -class CumulativeSumOptions(FunctionOptions): - """ - Options for `cumulative_sum` function. - - Parameters - ---------- - start : Scalar, default None - Starting value for sum computation - skip_nulls : bool, default False - When false, the first encountered null is propagated. - """ - def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... - -class DayOfWeekOptions(FunctionOptions): - """ - Options for the `day_of_week` function. - - Parameters - ---------- - count_from_zero : bool, default True - If True, number days from 0, otherwise from 1. - week_start : int, default 1 - Which day does the week start with (Monday=1, Sunday=7). - How this value is numbered is unaffected by `count_from_zero`. - """ - - def __init__(self, *, count_from_zero: bool = True, week_start: int = 1) -> None: ... 
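A brief sketch of how these option classes are used with pyarrow.compute, taking `DayOfWeekOptions` as an example (the dates are chosen for illustration); passing the equivalent keyword arguments to the compute wrapper builds the same options object.

import datetime
import pyarrow as pa
import pyarrow.compute as pc

days = pa.array([datetime.date(2024, 1, 1), datetime.date(2024, 1, 7)])  # Monday, Sunday

# Number days 1..7 with the week starting on Sunday: Monday -> 2, Sunday -> 1.
opts = pc.DayOfWeekOptions(count_from_zero=False, week_start=7)
result = pc.day_of_week(days, options=opts)

# Equivalent keyword form.
assert result.equals(pc.day_of_week(days, count_from_zero=False, week_start=7))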
-
-class DictionaryEncodeOptions(FunctionOptions):
-    """
-    Options for dictionary encoding.
-
-    Parameters
-    ----------
-    null_encoding : str, default "mask"
-        How to encode nulls in the input.
-        Accepted values are "mask" (null inputs emit a null in the indices
-        array), "encode" (null inputs emit a non-null index pointing to
-        a null value in the dictionary array).
-    """
-    def __init__(self, null_encoding: Literal["mask", "encode"] = "mask") -> None: ...
-
-class RunEndEncodeOptions(FunctionOptions):
-    """
-    Options for run-end encoding.
-
-    Parameters
-    ----------
-    run_end_type : DataType, default pyarrow.int32()
-        The data type of the run_ends array.
-
-        Accepted values are pyarrow.{int16(), int32(), int64()}.
-    """
-    # TODO: default is DataType(int32)
-    def __init__(self, run_end_type: lib.DataType | Literal["int16", "int32", "int64"] = ...) -> None: ...
-
-class ElementWiseAggregateOptions(FunctionOptions):
-    """
-    Options for element-wise aggregate functions.
-
-    Parameters
-    ----------
-    skip_nulls : bool, default True
-        Whether to skip (ignore) nulls in the input.
-        If False, any null in the input forces the output to null.
-    """
-    def __init__(self, *, skip_nulls: bool = True) -> None: ...
-
-class ExtractRegexOptions(FunctionOptions):
-    """
-    Options for the `extract_regex` function.
-
-    Parameters
-    ----------
-    pattern : str
-        Regular expression with named capture fields.
-    """
-    def __init__(self, pattern: str) -> None: ...
-
-class ExtractRegexSpanOptions(FunctionOptions):
-    """
-    Options for the `extract_regex_span` function.
-
-    Parameters
-    ----------
-    pattern : str
-        Regular expression with named capture fields.
-    """
-    def __init__(self, pattern: str) -> None: ...
-
-class FilterOptions(FunctionOptions):
-    """
-    Options for selecting with a boolean filter.
-
-    Parameters
-    ----------
-    null_selection_behavior : str, default "drop"
-        How to handle nulls in the selection filter.
-        Accepted values are "drop", "emit_null".
-    """
-
-    def __init__(self, null_selection_behavior: Literal["drop", "emit_null"] = "drop") -> None: ...
-
-class IndexOptions(FunctionOptions):
-    """
-    Options for the `index` function.
-
-    Parameters
-    ----------
-    value : Scalar
-        The value to search for.
-    """
-    def __init__(self, value: lib.Scalar) -> None: ...
-
-class JoinOptions(FunctionOptions):
-    """
-    Options for the `binary_join_element_wise` function.
-
-    Parameters
-    ----------
-    null_handling : str, default "emit_null"
-        How to handle null values in the inputs.
-        Accepted values are "emit_null", "skip", "replace".
-    null_replacement : str, default ""
-        Replacement string to emit for null inputs if `null_handling`
-        is "replace".
-    """
-    @overload
-    def __init__(self, null_handling: Literal["emit_null", "skip"] = "emit_null") -> None: ...
-    @overload
-    def __init__(self, null_handling: Literal["replace"], null_replacement: str = "") -> None: ...
-
-class ListSliceOptions(FunctionOptions):
-    """
-    Options for list array slicing.
-
-    Parameters
-    ----------
-    start : int
-        Index to start slicing inner list elements (inclusive).
-    stop : Optional[int], default None
-        If given, index to stop slicing at (exclusive).
-        If not given, slicing will stop at the end. (NotImplemented)
-    step : int, default 1
-        Slice step.
-    return_fixed_size_list : Optional[bool], default None
-        Whether to return a FixedSizeListArray. If true _and_ stop is after
-        a list element's length, nulls will be appended to create the
-        requested slice size.
The default of `None` will return the same - type which was passed in. - """ - def __init__( - self, - start: int, - stop: int | None = None, - step: int = 1, - return_fixed_size_list: bool | None = None, - ) -> None: ... - -class ListFlattenOptions(FunctionOptions): - """ - Options for `list_flatten` function - - Parameters - ---------- - recursive : bool, default False - When True, the list array is flattened recursively until an array - of non-list values is formed. - """ - def __init__(self, recursive: bool = False) -> None: ... - -class MakeStructOptions(FunctionOptions): - """ - Options for the `make_struct` function. - - Parameters - ---------- - field_names : sequence of str - Names of the struct fields to create. - field_nullability : sequence of bool, optional - Nullability information for each struct field. - If omitted, all fields are nullable. - field_metadata : sequence of KeyValueMetadata, optional - Metadata for each struct field. - """ - def __init__( - self, - field_names: Sequence[str] = (), - *, - field_nullability: Sequence[bool] | None = None, - field_metadata: Sequence[lib.KeyValueMetadata] | None = None, - ) -> None: ... - -class MapLookupOptions(FunctionOptions): - """ - Options for the `map_lookup` function. - - Parameters - ---------- - query_key : Scalar or Object can be converted to Scalar - The key to search for. - occurrence : str - The occurrence(s) to return from the Map - Accepted values are "first", "last", or "all". - """ - # TODO: query_key: Scalar or Object can be converted to Scalar - def __init__( - self, query_key: lib.Scalar, occurrence: Literal["first", "last", "all"] - ) -> None: ... - -class MatchSubstringOptions(FunctionOptions): - """ - Options for looking for a substring. - - Parameters - ---------- - pattern : str - Substring pattern to look for inside input values. - ignore_case : bool, default False - Whether to perform a case-insensitive match. - """ - - def __init__(self, pattern: str, *, ignore_case: bool = False) -> None: ... - -class ModeOptions(FunctionOptions): - """ - Options for the `mode` function. - - Parameters - ---------- - n : int, default 1 - Number of distinct most-common values to return. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - """ - def __init__(self, n: int = 1, *, skip_nulls: bool = True, min_count: int = 0) -> None: ... - -class NullOptions(FunctionOptions): - """ - Options for the `is_null` function. - - Parameters - ---------- - nan_is_null : bool, default False - Whether floating-point NaN values are considered null. - """ - def __init__(self, *, nan_is_null: bool = False) -> None: ... - -class PadOptions(FunctionOptions): - """ - Options for padding strings. - - Parameters - ---------- - width : int - Desired string length. - padding : str, default " " - What to pad the string with. Should be one byte or codepoint. - lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). - """ - def __init__( - self, width: int, padding: str = " ", lean_left_on_odd_padding: bool = True - ) -> None: ... - -class PairwiseOptions(FunctionOptions): - """ - Options for `pairwise` functions. 
- - Parameters - ---------- - period : int, default 1 - Period for applying the period function. - """ - def __init__(self, period: int = 1) -> None: ... - -class PartitionNthOptions(FunctionOptions): - """ - Options for the `partition_nth_indices` function. - - Parameters - ---------- - pivot : int - Index into the equivalent sorted array of the pivot element. - null_placement : str, default "at_end" - Where nulls in the input should be partitioned. - Accepted values are "at_start", "at_end". - """ - def __init__(self, pivot: int, *, null_placement: _Placement = "at_end") -> None: ... - -class WinsorizeOptions(FunctionOptions): - """ - Options for the `winsorize` function. - - Parameters - ---------- - lower_limit : float, between 0 and 1 - The quantile below which all values are replaced with the quantile's value. - upper_limit : float, between 0 and 1 - The quantile above which all values are replaced with the quantile's value. - """ - def __init__(self, lower_limit: float, upper_limit: float) -> None: ... - -class QuantileOptions(FunctionOptions): - """ - Options for the `quantile` function. - - Parameters - ---------- - q : double or sequence of double, default 0.5 - Probability levels of the quantiles to compute. All values must be in - [0, 1]. - interpolation : str, default "linear" - How to break ties between competing data points for a given quantile. - Accepted values are: - - - "linear": compute an interpolation - - "lower": always use the smallest of the two data points - - "higher": always use the largest of the two data points - - "nearest": select the data point that is closest to the quantile - - "midpoint": compute the (unweighted) mean of the two data points - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - """ - def __init__( - self, - q: float | Sequence[float] = 0.5, - *, - interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", - skip_nulls: bool = True, - min_count: int = 0, - ) -> None: ... - -class RandomOptions(FunctionOptions): - """ - Options for random generation. - - Parameters - ---------- - initializer : int or str - How to initialize the underlying random generator. - If an integer is given, it is used as a seed. - If "system" is given, the random generator is initialized with - a system-specific source of (hopefully true) randomness. - Other values are invalid. - """ - def __init__(self, *, initializer: int | Literal["system"] = "system") -> None: ... - -class RankOptions(FunctionOptions): - """ - Options for the `rank` function. - - Parameters - ---------- - sort_keys : sequence of (name, order) tuples or str, default "ascending" - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - Alternatively, one can simply pass "ascending" or "descending" as a string - if the input is array-like. - null_placement : str, default "at_end" - Where nulls in input should be sorted. - Accepted values are "at_start", "at_end". - tiebreaker : str, default "first" - Configure how ties between equal values are handled. - Accepted values are: - - - "min": Ties get the smallest possible rank in sorted order. 
- - "max": Ties get the largest possible rank in sorted order. - - "first": Ranks are assigned in order of when ties appear in the - input. This ensures the ranks are a stable permutation - of the input. - - "dense": The ranks span a dense [1, M] interval where M is the - number of distinct values in the input. - """ - def __init__( - self, - sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", - *, - null_placement: _Placement = "at_end", - tiebreaker: Literal["min", "max", "first", "dense"] = "first", - ) -> None: ... - -class RankQuantileOptions(FunctionOptions): - """ - Options for the `rank_quantile` function. - - Parameters - ---------- - sort_keys : sequence of (name, order) tuples or str, default "ascending" - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - Alternatively, one can simply pass "ascending" or "descending" as a string - if the input is array-like. - null_placement : str, default "at_end" - Where nulls in input should be sorted. - Accepted values are "at_start", "at_end". - """ - - def __init__( - self, - sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", - *, - null_placement: _Placement = "at_end", - ) -> None: ... - -class PivotWiderOptions(FunctionOptions): - """ - Options for the `pivot_wider` function. - - Parameters - ---------- - key_names : sequence of str - The pivot key names expected in the pivot key column. - For each entry in `key_names`, a column with the same name is emitted - in the struct output. - unexpected_key_behavior : str, default "ignore" - The behavior when pivot keys not in `key_names` are encountered. - Accepted values are "ignore", "raise". - If "ignore", unexpected keys are silently ignored. - If "raise", unexpected keys raise a KeyError. - """ - def __init__( - self, - key_names: Sequence[str], - *, - unexpected_key_behavior: Literal["ignore", "raise"] = "ignore", - ) -> None: ... - -class ReplaceSliceOptions(FunctionOptions): - """ - Options for replacing slices. - - Parameters - ---------- - start : int - Index to start slicing at (inclusive). - stop : int - Index to stop slicing at (exclusive). - replacement : str - What to replace the slice with. - """ - def __init__(self, start: int, stop: int, replacement: str) -> None: ... - -class ReplaceSubstringOptions(FunctionOptions): - """ - Options for replacing matched substrings. - - Parameters - ---------- - pattern : str - Substring pattern to look for inside input values. - replacement : str - What to replace the pattern with. - max_replacements : int or None, default None - The maximum number of strings to replace in each - input value (unlimited if None). - """ - def __init__( - self, pattern: str, replacement: str, *, max_replacements: int | None = None - ) -> None: ... - -_RoundMode: TypeAlias = Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", -] - -class RoundBinaryOptions(FunctionOptions): - """ - Options for rounding numbers when ndigits is provided by a second array - - Parameters - ---------- - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". 
- """ - def __init__( - self, - round_mode: _RoundMode = "half_to_even", - ) -> None: ... - -class RoundOptions(FunctionOptions): - """ - Options for rounding numbers. - - Parameters - ---------- - ndigits : int, default 0 - Number of fractional digits to round to. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - """ - def __init__( - self, - ndigits: int = 0, - round_mode: _RoundMode = "half_to_even", - ) -> None: ... - -_DateTimeUint: TypeAlias = Literal[ - "year", - "quarter", - "month", - "week", - "day", - "hour", - "minute", - "second", - "millisecond", - "microsecond", - "nanosecond", -] - -class RoundTemporalOptions(FunctionOptions): - """ - Options for rounding temporal values. - - Parameters - ---------- - multiple : int, default 1 - Number of units to round to. - unit : str, default "day" - The unit in which `multiple` is expressed. - Accepted values are "year", "quarter", "month", "week", "day", - "hour", "minute", "second", "millisecond", "microsecond", - "nanosecond". - week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. - ceil_is_strictly_greater : bool, default False - If True, ceil returns a rounded value that is strictly greater than the - input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would - yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 - if set to False. - This applies to the ceil_temporal function only. - calendar_based_origin : bool, default False - By default, the origin is 1970-01-01T00:00:00. By setting this to True, - rounding origin will be beginning of one less precise calendar unit. - E.g.: rounding to hours will use beginning of day as origin. - - By default time is rounded to a multiple of units since - 1970-01-01T00:00:00. By setting calendar_based_origin to true, - time will be rounded to number of units since the last greater - calendar unit. - For example: rounding to multiple of days since the beginning of the - month or to hours since the beginning of the day. - Exceptions: week and quarter are not used as greater units, - therefore days will be rounded to the beginning of the month not - week. Greater unit of week is a year. - Note that ceiling and rounding might change sorting order of an array - near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to - 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to - YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will - ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the - order of an already ordered array. - """ - def __init__( - self, - multiple: int = 1, - unit: _DateTimeUint = "day", - *, - week_starts_monday: bool = True, - ceil_is_strictly_greater: bool = False, - calendar_based_origin: bool = False, - ) -> None: ... - -class RoundToMultipleOptions(FunctionOptions): - """ - Options for rounding numbers to a multiple. - - Parameters - ---------- - multiple : numeric scalar, default 1.0 - Multiple to round to. Should be a scalar of a type compatible - with the argument to be rounded. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". 
-    """
-    def __init__(self, multiple: int | float | _NumericScalarT = 1.0, round_mode: _RoundMode = "half_to_even") -> None: ...
-
-class ScalarAggregateOptions(FunctionOptions):
-    """
-    Options for scalar aggregations.
-
-    Parameters
-    ----------
-    skip_nulls : bool, default True
-        Whether to skip (ignore) nulls in the input.
-        If False, any null in the input forces the output to null.
-    min_count : int, default 1
-        Minimum number of non-null values in the input. If the number
-        of non-null values is below `min_count`, the output is null.
-    """
-    def __init__(self, *, skip_nulls: bool = True, min_count: int = 1) -> None: ...
-
-class SelectKOptions(FunctionOptions):
-    """
-    Options for top/bottom k-selection.
-
-    Parameters
-    ----------
-    k : int
-        Number of leading values to select in sorted order
-        (i.e. the largest values if sort order is "descending",
-        the smallest otherwise).
-    sort_keys : sequence of (name, order) tuples
-        Names of field/column keys to sort the input on,
-        along with the order each field/column is sorted in.
-        Accepted values for `order` are "ascending", "descending".
-        The field name can be a string column name or expression.
-    """
-
-    def __init__(self, k: int, sort_keys: Sequence[tuple[str, _Order]]) -> None: ...
-
-class SetLookupOptions(FunctionOptions):
-    """
-    Options for the `is_in` and `index_in` functions.
-
-    Parameters
-    ----------
-    value_set : Array
-        Set of values to look for in the input.
-    skip_nulls : bool, default False
-        If False, nulls in the input are matched in the value_set just
-        like regular values.
-        If True, nulls in the input always fail matching.
-    """
-    def __init__(self, value_set: lib.Array, *, skip_nulls: bool = False) -> None: ...
-
-class SliceOptions(FunctionOptions):
-    """
-    Options for slicing.
-
-    Parameters
-    ----------
-    start : int
-        Index to start slicing at (inclusive).
-    stop : int or None, default None
-        If given, index to stop slicing at (exclusive).
-        If not given, slicing will stop at the end.
-    step : int, default 1
-        Slice step.
-    """
-
-    def __init__(self, start: int, stop: int | None = None, step: int = 1) -> None: ...
-
-class SortOptions(FunctionOptions):
-    """
-    Options for the `sort_indices` function.
-
-    Parameters
-    ----------
-    sort_keys : sequence of (name, order) tuples
-        Names of field/column keys to sort the input on,
-        along with the order each field/column is sorted in.
-        Accepted values for `order` are "ascending", "descending".
-        The field name can be a string column name or expression.
-    null_placement : str, default "at_end"
-        Where nulls in input should be sorted, only applying to
-        columns/fields mentioned in `sort_keys`.
-        Accepted values are "at_start", "at_end".
-    """
-    def __init__(
-        self, sort_keys: Sequence[tuple[str, _Order]], *, null_placement: _Placement = "at_end"
-    ) -> None: ...
-
-class SplitOptions(FunctionOptions):
-    """
-    Options for splitting on whitespace.
-
-    Parameters
-    ----------
-    max_splits : int or None, default None
-        Maximum number of splits for each input value (unlimited if None).
-    reverse : bool, default False
-        Whether to start splitting from the end of each input value.
-        This only has an effect if `max_splits` is not None.
-    """
-
-    def __init__(self, *, max_splits: int | None = None, reverse: bool = False) -> None: ...
-
-class SplitPatternOptions(FunctionOptions):
-    """
-    Options for splitting on a string pattern.
-
-    Parameters
-    ----------
-    pattern : str
-        String pattern to split on.
- max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). - reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. - """ - def __init__( - self, pattern: str, *, max_splits: int | None = None, reverse: bool = False - ) -> None: ... - -class StrftimeOptions(FunctionOptions): - """ - Options for the `strftime` function. - - Parameters - ---------- - format : str, default "%Y-%m-%dT%H:%M:%S" - Pattern for formatting input values. - locale : str, default "C" - Locale to use for locale-specific format specifiers. - """ - def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S", locale: str = "C") -> None: ... - -class StrptimeOptions(FunctionOptions): - """ - Options for the `strptime` function. - - Parameters - ---------- - format : str - Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". - Note that the semantics of the format follow the C/C++ strptime, not the Python one. - There are differences in behavior, for example how the "%y" placeholder - handles years with less than four digits. - unit : str - Timestamp unit of the output. - Accepted values are "s", "ms", "us", "ns". - error_is_null : boolean, default False - Return null on parsing errors if true or raise if false. - """ - def __init__( - self, format: str, unit: Literal["s", "ms", "us", "ns"], error_is_null: bool = False - ) -> None: ... - -class StructFieldOptions(FunctionOptions): - """ - Options for the `struct_field` function. - - Parameters - ---------- - indices : List[str], List[bytes], List[int], Expression, bytes, str, or int - List of indices for chained field lookup, for example `[4, 1]` - will look up the second nested field in the fifth outer field. - """ - def __init__( - self, indices: list[str] | list[bytes] | list[int] | Expression | bytes | str | int - ) -> None: ... - -class TakeOptions(FunctionOptions): - """ - Options for the `take` and `array_take` functions. - - Parameters - ---------- - boundscheck : boolean, default True - Whether to check indices are within bounds. If False and an - index is out of bounds, behavior is undefined (the process - may crash). - """ - def __init__(self, boundscheck: bool = True) -> None: ... - -class TDigestOptions(FunctionOptions): - """ - Options for the `tdigest` function. - - Parameters - ---------- - q : double or sequence of double, default 0.5 - Probability levels of the quantiles to approximate. All values must be - in [0, 1]. - delta : int, default 100 - Compression parameter for the T-digest algorithm. - buffer_size : int, default 500 - Buffer size for the T-digest algorithm. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - """ - def __init__( - self, - q: float | Sequence[float] = 0.5, - *, - delta: int = 100, - buffer_size: int = 500, - skip_nulls: bool = True, - min_count: int = 0, - ) -> None: ... - -class TrimOptions(FunctionOptions): - """ - Options for trimming characters from strings. - - Parameters - ---------- - characters : str - Individual characters to be trimmed from the string. - """ - def __init__(self, characters: str) -> None: ... - -class Utf8NormalizeOptions(FunctionOptions): - """ - Options for the `utf8_normalize` function. 
- - Parameters - ---------- - form : str - Unicode normalization form. - Accepted values are "NFC", "NFKC", "NFD", NFKD". - """ - - def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ... - -class ZeroFillOptions(FunctionOptions): - """ - Options for utf8_zero_fill. - - Parameters - ---------- - width : int - Desired string length. - padding : str, default "0" - Padding character. Should be one Unicode codepoint. - """ - def __init__(self, width: int, padding: str = '0') -> None: ... - -class VarianceOptions(FunctionOptions): - """ - Options for the `variance` and `stddev` functions. - - Parameters - ---------- - ddof : int, default 0 - Number of degrees of freedom. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - """ - def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, min_count: int = 0) -> None: ... - -class SkewOptions(FunctionOptions): - """ - Options for the `skew` and `kurtosis` functions. - - Parameters - ---------- - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - biased : bool, default True - Whether the calculated value is biased. - If False, the value computed includes a correction factor to reduce bias. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - """ - def __init__( - self, *, skip_nulls: bool = True, biased: bool = True, min_count: int = 0 - ) -> None: ... - -class WeekOptions(FunctionOptions): - """ - Options for the `week` function. - - Parameters - ---------- - week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. - count_from_zero : bool, default False - If True, dates at the start of a year that fall into the last week - of the previous year emit 0. - If False, they emit 52 or 53 (the week number of the last week - of the previous year). - first_week_is_fully_in_year : bool, default False - If True, week number 0 is fully in January. - If False, a week that begins on December 29, 30 or 31 is considered - to be week number 0 of the following year. - """ - def __init__( - self, - *, - week_starts_monday: bool = True, - count_from_zero: bool = False, - first_week_is_fully_in_year: bool = False, - ) -> None: ... - -# ==================== _compute.pyx Functions ==================== - -def call_function( - name: str, - args: list, - options: FunctionOptions | None = None, - memory_pool: lib.MemoryPool | None = None, - length: int | None = None, -) -> Any: - """ - Call a named function. - - The function is looked up in the global registry - (as returned by `function_registry()`). - - Parameters - ---------- - name : str - The name of the function to call. - args : list - The arguments to the function. - options : optional - options provided to the function. - memory_pool : MemoryPool, optional - memory pool to use for allocations during function execution. - length : int, optional - Batch size for execution, for nullary (no argument) functions. If not - passed, inferred from data. - """ - -def function_registry() -> FunctionRegistry: ... -def get_function(name: str) -> Function: - """ - Get a function by name. 
- - The function is looked up in the global registry - (as returned by `function_registry()`). - - Parameters - ---------- - name : str - The name of the function to lookup - """ - -def list_functions() -> list[str]: - """ - Return all function names in the global registry. - """ - -# ==================== _compute.pyx Udf ==================== - -def call_tabular_function( - function_name: str, args: Iterable | None = None, func_registry: FunctionRegistry | None = None -) -> lib.RecordBatchReader: - """ - Get a record batch iterator from a tabular function. - - Parameters - ---------- - function_name : str - Name of the function. - args : iterable - The arguments to pass to the function. Accepted types depend - on the specific function. Currently, only an empty args is supported. - func_registry : FunctionRegistry - Optional function registry to use instead of the default global one. - """ - -class _FunctionDoc(TypedDict): - summary: str - description: str - -def register_scalar_function( - func: Callable, - function_name: str, - function_doc: _FunctionDoc, - in_types: dict[str, lib.DataType], - out_type: lib.DataType, - func_registry: FunctionRegistry | None = None, -) -> None: - """ - Register a user-defined scalar function. - - This API is EXPERIMENTAL. - - A scalar function is a function that executes elementwise - operations on arrays or scalars, i.e. a scalar function must - be computed row-by-row with no state where each output row - is computed only from its corresponding input row. - In other words, all argument arrays have the same length, - and the output array is of the same length as the arguments. - Scalar functions are the only functions allowed in query engine - expressions. - - Parameters - ---------- - func : callable - A callable implementing the user-defined function. - The first argument is the context argument of type - UdfContext. - Then, it must take arguments equal to the number of - in_types defined. It must return an Array or Scalar - matching the out_type. It must return a Scalar if - all arguments are scalar, else it must return an Array. - - To define a varargs function, pass a callable that takes - *args. The last in_type will be the type of all varargs - arguments. - function_name : str - Name of the function. There should only be one function - registered with this name in the function registry. - function_doc : dict - A dictionary object with keys "summary" (str), - and "description" (str). - in_types : Dict[str, DataType] - A dictionary mapping function argument names to - their respective DataType. - The argument names will be used to generate - documentation for the function. The number of - arguments specified here determines the function - arity. - out_type : DataType - Output type of the function. - func_registry : FunctionRegistry - Optional function registry to use instead of the default global one. - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> - >>> func_doc = {} - >>> func_doc["summary"] = "simple udf" - >>> func_doc["description"] = "add a constant to a scalar" - >>> - >>> def add_constant(ctx, array): - ... 
return pc.add(array, 1, memory_pool=ctx.memory_pool) - >>> - >>> func_name = "py_add_func" - >>> in_types = {"array": pa.int64()} - >>> out_type = pa.int64() - >>> pc.register_scalar_function(add_constant, func_name, func_doc, in_types, out_type) - >>> - >>> func = pc.get_function(func_name) - >>> func.name - 'py_add_func' - >>> answer = pc.call_function(func_name, [pa.array([20])]) - >>> answer - - [ - 21 - ] - """ - -def register_tabular_function( - func: Callable, - function_name: str, - function_doc: _FunctionDoc, - in_types: dict[str, lib.DataType], - out_type: lib.DataType, - func_registry: FunctionRegistry | None = None, -) -> None: - """ - Register a user-defined tabular function. - - This API is EXPERIMENTAL. - - A tabular function is one accepting a context argument of type - UdfContext and returning a generator of struct arrays. - The in_types argument must be empty and the out_type argument - specifies a schema. Each struct array must have field types - corresponding to the schema. - - Parameters - ---------- - func : callable - A callable implementing the user-defined function. - The only argument is the context argument of type - UdfContext. It must return a callable that - returns on each invocation a StructArray matching - the out_type, where an empty array indicates end. - function_name : str - Name of the function. There should only be one function - registered with this name in the function registry. - function_doc : dict - A dictionary object with keys "summary" (str), - and "description" (str). - in_types : Dict[str, DataType] - Must be an empty dictionary (reserved for future use). - out_type : Union[Schema, DataType] - Schema of the function's output, or a corresponding flat struct type. - func_registry : FunctionRegistry - Optional function registry to use instead of the default global one. - """ - -def register_aggregate_function( - func: Callable, - function_name: str, - function_doc: _FunctionDoc, - in_types: dict[str, lib.DataType], - out_type: lib.DataType, - func_registry: FunctionRegistry | None = None, -) -> None: - """ - Register a user-defined non-decomposable aggregate function. - - This API is EXPERIMENTAL. - - A non-decomposable aggregation function is a function that executes - aggregate operations on the whole data that it is aggregating. - In other words, non-decomposable aggregate function cannot be - split into consume/merge/finalize steps. - - This is often used with ordered or segmented aggregation where groups - can be emit before accumulating all of the input data. - - Note that currently the size of any input column cannot exceed 2 GB - for a single segment (all groups combined). - - Parameters - ---------- - func : callable - A callable implementing the user-defined function. - The first argument is the context argument of type - UdfContext. - Then, it must take arguments equal to the number of - in_types defined. It must return a Scalar matching the - out_type. - To define a varargs function, pass a callable that takes - *args. The in_type needs to match in type of inputs when - the function gets called. - function_name : str - Name of the function. This name must be unique, i.e., - there should only be one function registered with - this name in the function registry. - function_doc : dict - A dictionary object with keys "summary" (str), - and "description" (str). - in_types : Dict[str, DataType] - A dictionary mapping function argument names to - their respective DataType. 
- The argument names will be used to generate - documentation for the function. The number of - arguments specified here determines the function - arity. - out_type : DataType - Output type of the function. - func_registry : FunctionRegistry - Optional function registry to use instead of the default global one. - - Examples - -------- - >>> import numpy as np - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> - >>> func_doc = {} - >>> func_doc["summary"] = "simple median udf" - >>> func_doc["description"] = "compute median" - >>> - >>> def compute_median(ctx, array): - ... return pa.scalar(np.median(array)) - >>> - >>> func_name = "py_compute_median" - >>> in_types = {"array": pa.int64()} - >>> out_type = pa.float64() - >>> pc.register_aggregate_function(compute_median, func_name, func_doc, in_types, out_type) - >>> - >>> func = pc.get_function(func_name) - >>> func.name - 'py_compute_median' - >>> answer = pc.call_function(func_name, [pa.array([20, 40])]) - >>> answer - - >>> table = pa.table([pa.array([1, 1, 2, 2]), pa.array([10, 20, 30, 40])], names=["k", "v"]) - >>> result = table.group_by("k").aggregate([("v", "py_compute_median")]) - >>> result - pyarrow.Table - k: int64 - v_py_compute_median: double - ---- - k: [[1,2]] - v_py_compute_median: [[15,35]] - """ - -def register_vector_function( - func: Callable, - function_name: str, - function_doc: _FunctionDoc, - in_types: dict[str, lib.DataType], - out_type: lib.DataType, - func_registry: FunctionRegistry | None = None, -) -> None: - """ - Register a user-defined vector function. - - This API is EXPERIMENTAL. - - A vector function is a function that executes vector - operations on arrays. Vector function is often used - when compute doesn't fit other more specific types of - functions (e.g., scalar and aggregate). - - Parameters - ---------- - func : callable - A callable implementing the user-defined function. - The first argument is the context argument of type - UdfContext. - Then, it must take arguments equal to the number of - in_types defined. It must return an Array or Scalar - matching the out_type. It must return a Scalar if - all arguments are scalar, else it must return an Array. - - To define a varargs function, pass a callable that takes - *args. The last in_type will be the type of all varargs - arguments. - function_name : str - Name of the function. There should only be one function - registered with this name in the function registry. - function_doc : dict - A dictionary object with keys "summary" (str), - and "description" (str). - in_types : Dict[str, DataType] - A dictionary mapping function argument names to - their respective DataType. - The argument names will be used to generate - documentation for the function. The number of - arguments specified here determines the function - arity. - out_type : DataType - Output type of the function. - func_registry : FunctionRegistry - Optional function registry to use instead of the default global one. - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> - >>> func_doc = {} - >>> func_doc["summary"] = "percent rank" - >>> func_doc["description"] = "compute percent rank" - >>> - >>> def list_flatten_udf(ctx, x): - ... 
return pc.list_flatten(x)
-    >>>
-    >>> func_name = "list_flatten_udf"
-    >>> in_types = {"array": pa.list_(pa.int64())}
-    >>> out_type = pa.int64()
-    >>> pc.register_vector_function(list_flatten_udf, func_name, func_doc, in_types, out_type)
-    >>>
-    >>> answer = pc.call_function(func_name, [pa.array([[1, 2], [3, 4]])])
-    >>> answer
-    <pyarrow.lib.Int64Array object at ...>
-    [
-      1,
-      2,
-      3,
-      4
-    ]
-    """
-
-class UdfContext:
-    """
-    Per-invocation function context/state.
-
-    This object will always be the first argument to a user-defined
-    function. It should not be used outside of a call to the function.
-    """
-
-    @property
-    def batch_length(self) -> int:
-        """
-        The common length of all input arguments (int).
-
-        In the case that all arguments are scalars, this value
-        is used to pass the "actual length" of the arguments,
-        e.g. because the scalar values are encoding a column
-        with a constant value.
-        """
-    @property
-    def memory_pool(self) -> lib.MemoryPool:
-        """
-        A memory pool for allocations (:class:`MemoryPool`).
-
-        This is the memory pool supplied by the user when they invoked
-        the function and it should be used in any calls to arrow that the
-        UDF makes if that call accepts a memory_pool.
-        """
-
-# ==================== _compute.pyx Expression ====================
-class Expression(lib._Weakrefable):
-    """
-    A logical expression to be evaluated against some input.
-
-    To create an expression:
-
-    - Use the factory function ``pyarrow.compute.scalar()`` to create a
-      scalar (not necessary when combined, see example below).
-    - Use the factory function ``pyarrow.compute.field()`` to reference
-      a field (column in table).
-    - Compare fields and scalars with ``<``, ``<=``, ``==``, ``>=``, ``>``.
-    - Combine expressions using python operators ``&`` (logical and),
-      ``|`` (logical or) and ``~`` (logical not).
-      Note: python keywords ``and``, ``or`` and ``not`` cannot be used
-      to combine expressions.
-    - Create expression predicates using Expression methods such as
-      ``pyarrow.compute.Expression.isin()``.
-
-    Examples
-    --------
-
-    >>> import pyarrow.compute as pc
-    >>> (pc.field("a") < pc.scalar(3)) | (pc.field("b") > 7)
-    <pyarrow.compute.Expression ((a < 3) or (b > 7))>
-    >>> pc.field("a") != 3
-    <pyarrow.compute.Expression (a != 3)>
-    >>> pc.field("a").isin([1, 2, 3])
-    <pyarrow.compute.Expression is_in(a, {value_set=int64:[
-      1,
-      2,
-      3
-    ], null_matching_behavior=MATCH})>
-    """
-
-    def equals(self, other: Expression | lib.Array | Iterable) -> bool:
-        """
-        Parameters
-        ----------
-        other : pyarrow.dataset.Expression
-
-        Returns
-        -------
-        bool
-        """
-
-    @staticmethod
-    def from_substrait(message: bytes | lib.Buffer) -> Expression:
-        """
-        Deserialize an expression from Substrait
-
-        The serialized message must be an ExtendedExpression message that has
-        only a single expression. The name of the expression and the schema
-        the expression was bound to will be ignored. Use
-        pyarrow.substrait.deserialize_expressions if this information is needed
-        or if the message might contain multiple expressions.
-
-        Parameters
-        ----------
-        message : bytes or Buffer or a protobuf Message
-            The Substrait message to deserialize
-
-        Returns
-        -------
-        Expression
-            The deserialized expression
-        """
-    def to_substrait(self, schema: lib.Schema, allow_arrow_extensions: bool = False) -> lib.Buffer:
-        """
-        Serialize the expression using Substrait
-
-        The expression will be serialized as an ExtendedExpression message that has a
-        single expression named "expression"
-
-        Parameters
-        ----------
-        schema : Schema
-            The input schema the expression will be bound to
-        allow_arrow_extensions : bool, default False
-            If False then only functions that are part of the core Substrait function
-            definitions will be allowed.
Set this to True to allow pyarrow-specific functions - but the result may not be accepted by other compute libraries. - - Returns - ------- - Buffer - A buffer containing the serialized Protobuf plan. - """ - def __invert__(self) -> Expression: ... - def __and__(self, other) -> Expression: ... - def __or__(self, other) -> Expression: ... - def __add__(self, other) -> Expression: ... - def __mul__(self, other) -> Expression: ... - def __sub__(self, other) -> Expression: ... - def __eq__(self, value: object) -> Expression: ... # type: ignore[override] - def __ne__(self, value: object) -> Expression: ... # type: ignore[override] - def __gt__(self, value: object) -> Expression: ... # type: ignore[override] - def __lt__(self, value: object) -> Expression: ... # type: ignore[override] - def __ge__(self, value: object) -> Expression: ... # type: ignore[override] - def __le__(self, value: object) -> Expression: ... # type: ignore[override] - def __truediv__(self, other) -> Expression: ... - def is_valid(self) -> bool: - """ - Check whether the expression is not-null (valid). - - This creates a new expression equivalent to calling the - `is_valid` compute function on this expression. - - Returns - ------- - is_valid : Expression - """ - def is_null(self, nan_is_null: bool = False) -> Expression: - """ - Check whether the expression is null. - - This creates a new expression equivalent to calling the - `is_null` compute function on this expression. - - Parameters - ---------- - nan_is_null : boolean, default False - Whether floating-point NaNs are considered null. - - Returns - ------- - is_null : Expression - """ - def is_nan(self) -> Expression: - """ - Check whether the expression is NaN. - - This creates a new expression equivalent to calling the - `is_nan` compute function on this expression. - - Returns - ------- - is_nan : Expression - """ - def cast( - self, type: lib.DataType | Literal["bool"], safe: bool = True, options: CastOptions | None = None - ) -> Expression: - """ - Explicitly set or change the expression's data type. - - This creates a new expression equivalent to calling the - `cast` compute function on this expression. - - Parameters - ---------- - type : DataType, default None - Type to cast array to. - safe : boolean, default True - Whether to check for conversion errors such as overflow. - options : CastOptions, default None - Additional checks pass by CastOptions - - Returns - ------- - cast : Expression - """ - def isin(self, values: lib.Array | Iterable) -> Expression: - """ - Check whether the expression is contained in values. - - This creates a new expression equivalent to calling the - `is_in` compute function on this expression. - - Parameters - ---------- - values : Array or iterable - The values to check for. - - Returns - ------- - isin : Expression - A new expression that, when evaluated, checks whether - this expression's value is contained in `values`. - """ - -# ==================== _compute.py ==================== diff --git a/python/pyarrow/_csv.pyi b/python/pyarrow/_csv.pyi deleted file mode 100644 index c490d6be93a..00000000000 --- a/python/pyarrow/_csv.pyi +++ /dev/null @@ -1,658 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from dataclasses import dataclass, field -from typing import IO, Any, Callable, Literal - -from _typeshed import StrPath - -from . import lib - -@dataclass(kw_only=True) -class ReadOptions(lib._Weakrefable): - """ - Options for reading CSV files. - - Parameters - ---------- - use_threads : bool, optional (default True) - Whether to use multiple threads to accelerate reading - block_size : int, optional - How much bytes to process at a time from the input stream. - This will determine multi-threading granularity as well as - the size of individual record batches or table chunks. - Minimum valid value for block size is 1 - skip_rows : int, optional (default 0) - The number of rows to skip before the column names (if any) - and the CSV data. - skip_rows_after_names : int, optional (default 0) - The number of rows to skip after the column names. - This number can be larger than the number of rows in one - block, and empty rows are counted. - The order of application is as follows: - - `skip_rows` is applied (if non-zero); - - column names are read (unless `column_names` is set); - - `skip_rows_after_names` is applied (if non-zero). - column_names : list, optional - The column names of the target table. If empty, fall back on - `autogenerate_column_names`. - autogenerate_column_names : bool, optional (default False) - Whether to autogenerate column names if `column_names` is empty. - If true, column names will be of the form "f0", "f1"... - If false, column names will be read from the first CSV row - after `skip_rows`. - encoding : str, optional (default 'utf8') - The character encoding of the CSV data. Columns that cannot - decode using this encoding can still be read as Binary. 
- - Examples - -------- - - Defining an example data: - - >>> import io - >>> s = "1,2,3\\nFlamingo,2,2022-03-01\\nHorse,4,2022-03-02\\nBrittle stars,5,2022-03-03\\nCentipede,100,2022-03-04" - >>> print(s) - 1,2,3 - Flamingo,2,2022-03-01 - Horse,4,2022-03-02 - Brittle stars,5,2022-03-03 - Centipede,100,2022-03-04 - - Ignore the first numbered row and substitute it with defined - or autogenerated column names: - - >>> from pyarrow import csv - >>> read_options = csv.ReadOptions(column_names=["animals", "n_legs", "entry"], skip_rows=1) - >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) - pyarrow.Table - animals: string - n_legs: int64 - entry: date32[day] - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - n_legs: [[2,4,5,100]] - entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] - - >>> read_options = csv.ReadOptions(autogenerate_column_names=True, skip_rows=1) - >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) - pyarrow.Table - f0: string - f1: int64 - f2: date32[day] - ---- - f0: [["Flamingo","Horse","Brittle stars","Centipede"]] - f1: [[2,4,5,100]] - f2: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] - - Remove the first 2 rows of the data: - - >>> read_options = csv.ReadOptions(skip_rows_after_names=2) - >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) - pyarrow.Table - 1: string - 2: int64 - 3: date32[day] - ---- - 1: [["Brittle stars","Centipede"]] - 2: [[5,100]] - 3: [[2022-03-03,2022-03-04]] - """ - - use_threads: bool = field(default=True, kw_only=False) - block_size: int | None = None - skip_rows: int = 0 - skip_rows_after_names: int = 0 - column_names: list[str] | None = None - autogenerate_column_names: bool = False - encoding: str = "utf8" - - def validate(self) -> None: ... - -@dataclass(kw_only=True) -class ParseOptions(lib._Weakrefable): - """ - Options for parsing CSV files. - - Parameters - ---------- - delimiter : 1-character string, optional (default ',') - The character delimiting individual cells in the CSV data. - quote_char : 1-character string or False, optional (default '"') - The character used optionally for quoting CSV values - (False if quoting is not allowed). - double_quote : bool, optional (default True) - Whether two quotes in a quoted CSV value denote a single quote - in the data. - escape_char : 1-character string or False, optional (default False) - The character used optionally for escaping special characters - (False if escaping is not allowed). - newlines_in_values : bool, optional (default False) - Whether newline characters are allowed in CSV values. - Setting this to True reduces the performance of multi-threaded - CSV reading. - ignore_empty_lines : bool, optional (default True) - Whether empty lines are ignored in CSV input. - If False, an empty line is interpreted as containing a single empty - value (assuming a one-column CSV file). - invalid_row_handler : callable, optional (default None) - If not None, this object is called for each CSV row that fails - parsing (because of a mismatching number of columns). - It should accept a single InvalidRow argument and return either - "skip" or "error" depending on the desired outcome. - - Examples - -------- - - Defining an example file from bytes object: - - >>> import io - >>> s = ( - ... "animals;n_legs;entry\\n" - ... "Flamingo;2;2022-03-01\\n" - ... "# Comment here:\\n" - ... "Horse;4;2022-03-02\\n" - ... "Brittle stars;5;2022-03-03\\n" - ... "Centipede;100;2022-03-04" - ... 
) - >>> print(s) - animals;n_legs;entry - Flamingo;2;2022-03-01 - # Comment here: - Horse;4;2022-03-02 - Brittle stars;5;2022-03-03 - Centipede;100;2022-03-04 - >>> source = io.BytesIO(s.encode()) - - Read the data from a file skipping rows with comments - and defining the delimiter: - - >>> from pyarrow import csv - >>> def skip_comment(row): - ... if row.text.startswith("# "): - ... return "skip" - ... else: - ... return "error" - >>> parse_options = csv.ParseOptions(delimiter=";", invalid_row_handler=skip_comment) - >>> csv.read_csv(source, parse_options=parse_options) - pyarrow.Table - animals: string - n_legs: int64 - entry: date32[day] - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - n_legs: [[2,4,5,100]] - entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] - """ - - delimiter: str = field(default=",", kw_only=False) - quote_char: str | Literal[False] = '"' - double_quote: bool = True - escape_char: str | Literal[False] = False - newlines_in_values: bool = False - ignore_empty_lines: bool = True - invalid_row_handler: Callable[[InvalidRow], Literal["skip", "error"]] | None = None - - def validate(self) -> None: ... - -@dataclass(kw_only=True) -class ConvertOptions(lib._Weakrefable): - """ - Options for converting CSV data. - - Parameters - ---------- - check_utf8 : bool, optional (default True) - Whether to check UTF8 validity of string columns. - column_types : pyarrow.Schema or dict, optional - Explicitly map column names to column types. Passing this argument - disables type inference on the defined columns. - null_values : list, optional - A sequence of strings that denote nulls in the data - (defaults are appropriate in most cases). Note that by default, - string columns are not checked for null values. To enable - null checking for those, specify ``strings_can_be_null=True``. - true_values : list, optional - A sequence of strings that denote true booleans in the data - (defaults are appropriate in most cases). - false_values : list, optional - A sequence of strings that denote false booleans in the data - (defaults are appropriate in most cases). - decimal_point : 1-character string, optional (default '.') - The character used as decimal point in floating-point and decimal - data. - strings_can_be_null : bool, optional (default False) - Whether string / binary columns can have null values. - If true, then strings in null_values are considered null for - string columns. - If false, then all strings are valid string values. - quoted_strings_can_be_null : bool, optional (default True) - Whether quoted values can be null. - If true, then strings in "null_values" are also considered null - when they appear quoted in the CSV file. Otherwise, quoted values - are never considered null. - include_columns : list, optional - The names of columns to include in the Table. - If empty, the Table will include all columns from the CSV file. - If not empty, only these columns will be included, in this order. - include_missing_columns : bool, optional (default False) - If false, columns in `include_columns` but not in the CSV file will - error out. - If true, columns in `include_columns` but not in the CSV file will - produce a column of nulls (whose type is selected using - `column_types`, or null by default). - This option is ignored if `include_columns` is empty. - auto_dict_encode : bool, optional (default False) - Whether to try to automatically dict-encode string / binary data. 
- If true, then when type inference detects a string or binary column, - it it dict-encoded up to `auto_dict_max_cardinality` distinct values - (per chunk), after which it switches to regular encoding. - This setting is ignored for non-inferred columns (those in - `column_types`). - auto_dict_max_cardinality : int, optional - The maximum dictionary cardinality for `auto_dict_encode`. - This value is per chunk. - timestamp_parsers : list, optional - A sequence of strptime()-compatible format strings, tried in order - when attempting to infer or convert timestamp values (the special - value ISO8601() can also be given). By default, a fast built-in - ISO-8601 parser is used. - - Examples - -------- - - Defining an example data: - - >>> import io - >>> s = ( - ... "animals,n_legs,entry,fast\\n" - ... "Flamingo,2,01/03/2022,Yes\\n" - ... "Horse,4,02/03/2022,Yes\\n" - ... "Brittle stars,5,03/03/2022,No\\n" - ... "Centipede,100,04/03/2022,No\\n" - ... ",6,05/03/2022," - ... ) - >>> print(s) - animals,n_legs,entry,fast - Flamingo,2,01/03/2022,Yes - Horse,4,02/03/2022,Yes - Brittle stars,5,03/03/2022,No - Centipede,100,04/03/2022,No - ,6,05/03/2022, - - Change the type of a column: - - >>> import pyarrow as pa - >>> from pyarrow import csv - >>> convert_options = csv.ConvertOptions(column_types={"n_legs": pa.float64()}) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - n_legs: double - entry: string - fast: string - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] - n_legs: [[2,4,5,100,6]] - entry: [["01/03/2022","02/03/2022","03/03/2022","04/03/2022","05/03/2022"]] - fast: [["Yes","Yes","No","No",""]] - - Define a date parsing format to get a timestamp type column - (in case dates are not in ISO format and not converted by default): - - >>> convert_options = csv.ConvertOptions(timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"]) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - n_legs: int64 - entry: timestamp[s] - fast: string - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] - n_legs: [[2,4,5,100,6]] - entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] - fast: [["Yes","Yes","No","No",""]] - - Specify a subset of columns to be read: - - >>> convert_options = csv.ConvertOptions(include_columns=["animals", "n_legs"]) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - n_legs: int64 - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] - n_legs: [[2,4,5,100,6]] - - List additional column to be included as a null typed column: - - >>> convert_options = csv.ConvertOptions( - ... include_columns=["animals", "n_legs", "location"], include_missing_columns=True - ... ) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - n_legs: int64 - location: null - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] - n_legs: [[2,4,5,100,6]] - location: [5 nulls] - - Define columns as dictionary type (by default only the - string/binary columns are dictionary encoded): - - >>> convert_options = csv.ConvertOptions( - ... timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"], auto_dict_encode=True - ... 
) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: dictionary - n_legs: int64 - entry: timestamp[s] - fast: dictionary - ---- - animals: [ -- dictionary: - ["Flamingo","Horse","Brittle stars","Centipede",""] -- indices: - [0,1,2,3,4]] - n_legs: [[2,4,5,100,6]] - entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] - fast: [ -- dictionary: - ["Yes","No",""] -- indices: - [0,0,1,1,2]] - - Set upper limit for the number of categories. If the categories - is more than the limit, the conversion to dictionary will not - happen: - - >>> convert_options = csv.ConvertOptions( - ... include_columns=["animals"], auto_dict_encode=True, auto_dict_max_cardinality=2 - ... ) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] - - Set empty strings to missing values: - - >>> convert_options = csv.ConvertOptions( - ... include_columns=["animals", "n_legs"], strings_can_be_null=True - ... ) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - n_legs: int64 - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",null]] - n_legs: [[2,4,5,100,6]] - - Define values to be True and False when converting a column - into a bool type: - - >>> convert_options = csv.ConvertOptions( - ... include_columns=["fast"], false_values=["No"], true_values=["Yes"] - ... ) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - fast: bool - ---- - fast: [[true,true,false,false,null]] - """ - - check_utf8: bool = field(default=True, kw_only=False) - column_types: lib.Schema | dict | None = None - null_values: list[str] | None = None - true_values: list[str] | None = None - false_values: list[str] | None = None - decimal_point: str = "." - strings_can_be_null: bool = False - quoted_strings_can_be_null: bool = True - include_columns: list[str] | None = None - include_missing_columns: bool = False - auto_dict_encode: bool = False - auto_dict_max_cardinality: int | None = None - timestamp_parsers: list[str] | None = None - - def validate(self) -> None: ... - -@dataclass(kw_only=True) -class WriteOptions(lib._Weakrefable): - """ - Options for writing CSV files. - - Parameters - ---------- - include_header : bool, optional (default True) - Whether to write an initial header line with column names - batch_size : int, optional (default 1024) - How many rows to process together when converting and writing - CSV data - delimiter : 1-character string, optional (default ",") - The character delimiting individual cells in the CSV data. - quoting_style : str, optional (default "needed") - Whether to quote values, and if so, which quoting style to use. - The following values are accepted: - - - "needed" (default): only enclose values in quotes when needed. - - "all_valid": enclose all valid values in quotes; nulls are not quoted. - - "none": do not enclose any values in quotes; values containing - special characters (such as quotes, cell delimiters or line endings) - will raise an error. - """ - - include_header: bool = field(default=True, kw_only=False) - batch_size: int = 1024 - delimiter: str = "," - quoting_style: Literal["needed", "all_valid", "none"] = "needed" - - def validate(self) -> None: ... 
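To make the intent of these option classes a little more concrete, here is a minimal sketch of combining ConvertOptions on the read side with WriteOptions on the write side (the in-memory CSV bytes and the output file name animals_out.csv are invented for the example):

    import io
    import pyarrow.csv as csv

    # Treat "N/A" as null (also in string columns) and keep only two columns.
    convert_options = csv.ConvertOptions(
        null_values=["N/A"],
        strings_can_be_null=True,
        include_columns=["animals", "n_legs"],
    )
    table = csv.read_csv(
        io.BytesIO(b"animals,n_legs\nFlamingo,2\nN/A,4\n"),
        convert_options=convert_options,
    )

    # Write the result back without a header, quoting every non-null value.
    write_options = csv.WriteOptions(include_header=False, quoting_style="all_valid")
    csv.write_csv(table, "animals_out.csv", write_options=write_options)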
- -@dataclass -class InvalidRow(lib._Weakrefable): - """ - Description of an invalid row in a CSV file. - - Parameters - ---------- - expected_columns : int - The expected number of columns in the row. - actual_columns : int - The actual number of columns in the row. - number : int or None - The physical row number if known, otherwise None. - text : str - The contents of the row. - """ - - expected_columns: int - actual_columns: int - number: int | None - text: str - -class CSVWriter(lib._CRecordBatchWriter): - """ - Writer to create a CSV file. - - Parameters - ---------- - sink : str, path, pyarrow.OutputStream or file-like object - The location where to write the CSV data. - schema : pyarrow.Schema - The schema of the data to be written. - write_options : pyarrow.csv.WriteOptions - Options to configure writing the CSV data. - memory_pool : MemoryPool, optional - Pool for temporary allocations. - """ - - def __init__( - self, - # TODO: OutputStream - sink: StrPath | IO[Any], - schema: lib.Schema, - write_options: WriteOptions | None = None, - *, - memory_pool: lib.MemoryPool | None = None, - ) -> None: ... - -class CSVStreamingReader(lib.RecordBatchReader): ... - -ISO8601: lib._Weakrefable - -def open_csv( - input_file: StrPath | IO[Any], - read_options: ReadOptions | None = None, - parse_options: ParseOptions | None = None, - convert_options: ConvertOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> CSVStreamingReader: - """ - Open a streaming reader of CSV data. - - Reading using this function is always single-threaded. - - Parameters - ---------- - input_file : string, path or file-like object - The location of CSV data. If a string or path, and if it ends - with a recognized compressed file extension (e.g. ".gz" or ".bz2"), - the data is automatically decompressed when reading. - read_options : pyarrow.csv.ReadOptions, optional - Options for the CSV reader (see pyarrow.csv.ReadOptions constructor - for defaults) - parse_options : pyarrow.csv.ParseOptions, optional - Options for the CSV parser - (see pyarrow.csv.ParseOptions constructor for defaults) - convert_options : pyarrow.csv.ConvertOptions, optional - Options for converting CSV data - (see pyarrow.csv.ConvertOptions constructor for defaults) - memory_pool : MemoryPool, optional - Pool to allocate RecordBatch memory from - - Returns - ------- - :class:`pyarrow.csv.CSVStreamingReader` - """ - -def read_csv( - input_file: StrPath | IO[Any], - read_options: ReadOptions | None = None, - parse_options: ParseOptions | None = None, - convert_options: ConvertOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Table: - """ - Read a Table from a stream of CSV data. - - Parameters - ---------- - input_file : string, path or file-like object - The location of CSV data. If a string or path, and if it ends - with a recognized compressed file extension (e.g. ".gz" or ".bz2"), - the data is automatically decompressed when reading. 
- read_options : pyarrow.csv.ReadOptions, optional - Options for the CSV reader (see pyarrow.csv.ReadOptions constructor - for defaults) - parse_options : pyarrow.csv.ParseOptions, optional - Options for the CSV parser - (see pyarrow.csv.ParseOptions constructor for defaults) - convert_options : pyarrow.csv.ConvertOptions, optional - Options for converting CSV data - (see pyarrow.csv.ConvertOptions constructor for defaults) - memory_pool : MemoryPool, optional - Pool to allocate Table memory from - - Returns - ------- - :class:`pyarrow.Table` - Contents of the CSV file as a in-memory table. - - Examples - -------- - - Defining an example file from bytes object: - - >>> import io - >>> s = ( - ... "animals,n_legs,entry\\n" - ... "Flamingo,2,2022-03-01\\n" - ... "Horse,4,2022-03-02\\n" - ... "Brittle stars,5,2022-03-03\\n" - ... "Centipede,100,2022-03-04" - ... ) - >>> print(s) - animals,n_legs,entry - Flamingo,2,2022-03-01 - Horse,4,2022-03-02 - Brittle stars,5,2022-03-03 - Centipede,100,2022-03-04 - >>> source = io.BytesIO(s.encode()) - - Reading from the file - - >>> from pyarrow import csv - >>> csv.read_csv(source) - pyarrow.Table - animals: string - n_legs: int64 - entry: date32[day] - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - n_legs: [[2,4,5,100]] - entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] - """ - -def write_csv( - data: lib.RecordBatch | lib.Table, - output_file: StrPath | lib.NativeFile | IO[Any], - write_options: WriteOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> None: - """ - Write record batch or table to a CSV file. - - Parameters - ---------- - data : pyarrow.RecordBatch or pyarrow.Table - The data to write. - output_file : string, path, pyarrow.NativeFile, or file-like object - The location where to write the CSV data. - write_options : pyarrow.csv.WriteOptions - Options to configure writing the CSV data. - memory_pool : MemoryPool, optional - Pool for temporary allocations. - - Examples - -------- - - >>> import pyarrow as pa - >>> from pyarrow import csv - - >>> legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> entry_date = pa.array(["01/03/2022", "02/03/2022", "03/03/2022", "04/03/2022"]) - >>> table = pa.table([animals, legs, entry_date], names=["animals", "n_legs", "entry"]) - - >>> csv.write_csv(table, "animals.csv") - - >>> write_options = csv.WriteOptions(include_header=False) - >>> csv.write_csv(table, "animals.csv", write_options=write_options) - - >>> write_options = csv.WriteOptions(delimiter=";") - >>> csv.write_csv(table, "animals.csv", write_options=write_options) - """ diff --git a/python/pyarrow/_cuda.pyi b/python/pyarrow/_cuda.pyi deleted file mode 100644 index 6bcd9868d7f..00000000000 --- a/python/pyarrow/_cuda.pyi +++ /dev/null @@ -1,573 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import Any - -import cuda # type: ignore[import-not-found] - -from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-not-found] - -from . import lib -from ._stubs_typing import ArrayLike - -class Context(lib._Weakrefable): - """ - CUDA driver context. - """ - - def __init__(self, device_number: int = 0, handle: int | None = None) -> None: - """ - Create a CUDA driver context for a particular device. - - If a CUDA context handle is passed, it is wrapped, otherwise - a default CUDA context for the given device is requested. - - Parameters - ---------- - device_number : int (default 0) - Specify the GPU device for which the CUDA driver context is - requested. - handle : int, optional - Specify CUDA handle for a shared context that has been created - by another library. - """ - @staticmethod - def from_numba(context: _numba_driver.Context | None = None) -> Context: - """ - Create a Context instance from a Numba CUDA context. - - Parameters - ---------- - context : {numba.cuda.cudadrv.driver.Context, None} - A Numba CUDA context instance. - If None, the current Numba context is used. - - Returns - ------- - shared_context : pyarrow.cuda.Context - Context instance. - """ - def to_numba(self) -> _numba_driver.Context: - """ - Convert Context to a Numba CUDA context. - - Returns - ------- - context : numba.cuda.cudadrv.driver.Context - Numba CUDA context instance. - """ - @staticmethod - def get_num_devices() -> int: - """Return the number of GPU devices.""" - @property - def device_number(self) -> int: - """Return context device number.""" - @property - def handle(self) -> int: - """Return pointer to context handle.""" - def synchronize(self) -> None: - """Blocks until the device has completed all preceding requested - tasks. - """ - @property - def bytes_allocated(self) -> int: - """Return the number of allocated bytes.""" - def get_device_address(self, address: int) -> int: - """Return the device address that is reachable from kernels running in - the context - - Parameters - ---------- - address : int - Specify memory address value - - Returns - ------- - device_address : int - Device address accessible from device context - - Notes - ----- - The device address is defined as a memory address accessible - by device. While it is often a device memory address but it - can be also a host memory address, for instance, when the - memory is allocated as host memory (using cudaMallocHost or - cudaHostAlloc) or as managed memory (using cudaMallocManaged) - or the host memory is page-locked (using cudaHostRegister). - """ - def new_buffer(self, nbytes: int) -> CudaBuffer: - """Return new device buffer. - - Parameters - ---------- - nbytes : int - Specify the number of bytes to be allocated. - - Returns - ------- - buf : CudaBuffer - Allocated buffer. - """ - @property - def memory_manager(self) -> lib.MemoryManager: - """ - The default memory manager tied to this context's device. - - Returns - ------- - MemoryManager - """ - @property - def device(self) -> lib.Device: - """ - The device instance associated with this context. - - Returns - ------- - Device - """ - def foreign_buffer(self, address: int, size: int, base: Any | None = None) -> CudaBuffer: - """ - Create device buffer from address and size as a view. - - The caller is responsible for allocating and freeing the - memory. When `address==size==0` then a new zero-sized buffer - is returned. 
- - Parameters - ---------- - address : int - Specify the starting address of the buffer. The address can - refer to both device or host memory but it must be - accessible from device after mapping it with - `get_device_address` method. - size : int - Specify the size of device buffer in bytes. - base : {None, object} - Specify object that owns the referenced memory. - - Returns - ------- - cbuf : CudaBuffer - Device buffer as a view of device reachable memory. - - """ - def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: - """Open existing CUDA IPC memory handle - - Parameters - ---------- - ipc_handle : IpcMemHandle - Specify opaque pointer to CUipcMemHandle (driver API). - - Returns - ------- - buf : CudaBuffer - referencing device buffer - """ - def buffer_from_data( - self, - data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, - offset: int = 0, - size: int = -1, - ) -> CudaBuffer: - """Create device buffer and initialize with data. - - Parameters - ---------- - data : {CudaBuffer, HostBuffer, Buffer, array-like} - Specify data to be copied to device buffer. - offset : int - Specify the offset of input buffer for device data - buffering. Default: 0. - size : int - Specify the size of device buffer in bytes. Default: all - (starting from input offset) - - Returns - ------- - cbuf : CudaBuffer - Device buffer with copied data. - """ - def buffer_from_object(self, obj: Any) -> CudaBuffer: - """Create device buffer view of arbitrary object that references - device accessible memory. - - When the object contains a non-contiguous view of device - accessible memory then the returned device buffer will contain - contiguous view of the memory, that is, including the - intermediate data that is otherwise invisible to the input - object. - - Parameters - ---------- - obj : {object, Buffer, HostBuffer, CudaBuffer, ...} - Specify an object that holds (device or host) address that - can be accessed from device. This includes objects with - types defined in pyarrow.cuda as well as arbitrary objects - that implement the CUDA array interface as defined by numba. - - Returns - ------- - cbuf : CudaBuffer - Device buffer as a view of device accessible memory. - - """ - -class IpcMemHandle(lib._Weakrefable): - """A serializable container for a CUDA IPC handle.""" - @staticmethod - def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: - """Create IpcMemHandle from opaque buffer (e.g. from another - process) - - Parameters - ---------- - opaque_handle : - a CUipcMemHandle as a const void* - - Returns - ------- - ipc_handle : IpcMemHandle - """ - def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: - """Write IpcMemHandle to a Buffer - - Parameters - ---------- - pool : {MemoryPool, None} - Specify a pool to allocate memory from - - Returns - ------- - buf : Buffer - The serialized buffer. - """ - -class CudaBuffer(lib.Buffer): - """An Arrow buffer with data located in a GPU device. - - To create a CudaBuffer instance, use Context.device_buffer(). - - The memory allocated in a CudaBuffer is freed when the buffer object - is deleted. - """ - - @staticmethod - def from_buffer(buf: lib.Buffer) -> CudaBuffer: - """Convert back generic buffer into CudaBuffer - - Parameters - ---------- - buf : Buffer - Specify buffer containing CudaBuffer - - Returns - ------- - dbuf : CudaBuffer - Resulting device buffer. - """ - @staticmethod - def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: - """Create a CudaBuffer view from numba MemoryPointer instance. 
- - Parameters - ---------- - mem : numba.cuda.cudadrv.driver.MemoryPointer - - Returns - ------- - cbuf : CudaBuffer - Device buffer as a view of numba MemoryPointer. - """ - def to_numba(self) -> _numba_driver.MemoryPointer: - """Return numba memory pointer of CudaBuffer instance.""" - def copy_to_host( - self, - position: int = 0, - nbytes: int = -1, - buf: lib.Buffer | None = None, - memory_pool: lib.MemoryPool | None = None, - resizable: bool = False, - ) -> lib.Buffer: - """Copy memory from GPU device to CPU host - - Caller is responsible for ensuring that all tasks affecting - the memory are finished. Use - - `.context.synchronize()` - - when needed. - - Parameters - ---------- - position : int - Specify the starting position of the source data in GPU - device buffer. Default: 0. - nbytes : int - Specify the number of bytes to copy. Default: -1 (all from - the position until host buffer is full). - buf : Buffer - Specify a pre-allocated output buffer in host. Default: None - (allocate new output buffer). - memory_pool : MemoryPool - resizable : bool - Specify extra arguments to allocate_buffer. Used only when - buf is None. - - Returns - ------- - buf : Buffer - Output buffer in host. - - """ - def copy_from_host( - self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 - ) -> int: - """Copy data from host to device. - - The device buffer must be pre-allocated. - - Parameters - ---------- - data : {Buffer, array-like} - Specify data in host. It can be array-like that is valid - argument to py_buffer - position : int - Specify the starting position of the copy in device buffer. - Default: 0. - nbytes : int - Specify the number of bytes to copy. Default: -1 (all from - source until device buffer, starting from position, is full) - - Returns - ------- - nbytes : int - Number of bytes copied. - """ - def copy_from_device(self, buf: CudaBuffer, position: int = 0, nbytes: int = -1) -> int: - """Copy data from device to device. - - Parameters - ---------- - buf : CudaBuffer - Specify source device buffer. - position : int - Specify the starting position of the copy in device buffer. - Default: 0. - nbytes : int - Specify the number of bytes to copy. Default: -1 (all from - source until device buffer, starting from position, is full) - - Returns - ------- - nbytes : int - Number of bytes copied. - - """ - def export_for_ipc(self) -> IpcMemHandle: - """ - Expose this device buffer as IPC memory which can be used in other - processes. - - After calling this function, this device memory will not be - freed when the CudaBuffer is destructed. - - Returns - ------- - ipc_handle : IpcMemHandle - The exported IPC handle - - """ - @property - def context(self) -> Context: - """Returns the CUDA driver context of this buffer.""" - def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: - """Return slice of device buffer - - Parameters - ---------- - offset : int, default 0 - Specify offset from the start of device buffer to slice - length : int, default None - Specify the length of slice (default is until end of device - buffer starting from offset). If the length is larger than - the data available, the returned slice will have a size of - the available data starting from the offset. - - Returns - ------- - sliced : CudaBuffer - Zero-copy slice of device buffer. - - """ - def to_pybytes(self) -> bytes: - """Return device buffer content as Python bytes.""" - -class HostBuffer(lib.Buffer): - """Device-accessible CPU memory created using cudaHostAlloc. 
- - To create a HostBuffer instance, use - - cuda.new_host_buffer() - """ - @property - def size(self) -> int: ... - -class BufferReader(lib.NativeFile): - """File interface for zero-copy read from CUDA buffers. - - Note: Read methods return pointers to device memory. This means - you must be careful using this interface with any Arrow code which - may expect to be able to do anything other than pointer arithmetic - on the returned buffers. - """ - def __init__(self, obj: CudaBuffer) -> None: ... - def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: - """Return a slice view of the underlying device buffer. - - The slice will start at the current reader position and will - have specified size in bytes. - - Parameters - ---------- - nbytes : int, default None - Specify the number of bytes to read. Default: None (read all - remaining bytes). - - Returns - ------- - cbuf : CudaBuffer - New device buffer. - - """ - -class BufferWriter(lib.NativeFile): - """File interface for writing to CUDA buffers. - - By default writes are unbuffered. Use set_buffer_size to enable - buffering. - """ - def __init__(self, obj: CudaBuffer) -> None: ... - def writeat(self, position: int, data: ArrayLike) -> None: - """Write data to buffer starting from position. - - Parameters - ---------- - position : int - Specify device buffer position where the data will be - written. - data : array-like - Specify data, the data instance must implement buffer - protocol. - """ - @property - def buffer_size(self) -> int: - """Returns size of host (CPU) buffer, 0 for unbuffered""" - @buffer_size.setter - def buffer_size(self, buffer_size: int): - """Set CPU buffer size to limit calls to cudaMemcpy - - Parameters - ---------- - buffer_size : int - Specify the size of CPU buffer to allocate in bytes. - """ - @property - def num_bytes_buffered(self) -> int: - """Returns number of bytes buffered on host""" - -def new_host_buffer(size: int, device: int = 0) -> HostBuffer: - """Return buffer with CUDA-accessible memory on CPU host - - Parameters - ---------- - size : int - Specify the number of bytes to be allocated. - device : int - Specify GPU device number. - - Returns - ------- - dbuf : HostBuffer - Allocated host buffer - """ - -def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: - """Write record batch message to GPU device memory - - Parameters - ---------- - batch : RecordBatch - Record batch to write - ctx : Context - CUDA Context to allocate device memory from - - Returns - ------- - dbuf : CudaBuffer - device buffer which contains the record batch message - """ - -def read_message( - source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryManager | None = None -) -> lib.Message: - """Read Arrow IPC message located on GPU device - - Parameters - ---------- - source : {CudaBuffer, cuda.BufferReader} - Device buffer or reader of device buffer. - pool : MemoryPool (optional) - Pool to allocate CPU memory for the metadata - - Returns - ------- - message : Message - The deserialized message, body still on device - """ - -def read_record_batch( - buffer: lib.Buffer, - object: lib.Schema, - *, - dictionary_memo: lib.DictionaryMemo | None = None, - pool: lib.MemoryPool | None = None, -) -> lib.RecordBatch: - """Construct RecordBatch referencing IPC message located on CUDA device. - - While the metadata is copied to host memory for deserialization, - the record batch data remains on the device. 
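As a rough sketch of how serialize_record_batch, read_message and read_record_batch fit together (this assumes a CUDA-enabled Arrow build and at least one GPU; it is illustrative only):

    import pyarrow as pa
    from pyarrow import cuda

    ctx = cuda.Context(0)  # default CUDA context on device 0
    batch = pa.record_batch([pa.array([1, 2, 3])], names=["x"])

    # Write the batch as an IPC message into device memory ...
    dbuf = cuda.serialize_record_batch(batch, ctx)

    # ... inspect the IPC message (its body stays on the device) ...
    msg = cuda.read_message(cuda.BufferReader(dbuf))

    # ... and reconstruct a RecordBatch whose buffers still point to device memory.
    roundtripped = cuda.read_record_batch(dbuf, batch.schema)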
- - Parameters - ---------- - buffer : - Device buffer containing the complete IPC message - schema : Schema - The schema for the record batch - dictionary_memo : DictionaryMemo, optional - If message contains dictionaries, must pass a populated - DictionaryMemo - pool : MemoryPool (optional) - Pool to allocate metadata from - - Returns - ------- - batch : RecordBatch - Reconstructed record batch, with device pointers - - """ diff --git a/python/pyarrow/_dataset.pyi b/python/pyarrow/_dataset.pyi deleted file mode 100644 index 4980cb0420f..00000000000 --- a/python/pyarrow/_dataset.pyi +++ /dev/null @@ -1,2318 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import sys - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -from typing import ( - IO, - Any, - Callable, - Generic, - Iterator, - Literal, - NamedTuple, - TypeVar, - overload, -) - -from _typeshed import StrPath - -from . import _csv, _json, _parquet, lib -from ._fs import FileSelector, FileSystem, SupportedFileSystem -from ._stubs_typing import Indices, JoinType, Order -from .acero import ExecNodeOptions -from .compute import Expression -from .ipc import IpcWriteOptions, RecordBatchReader - -class Dataset(lib._Weakrefable): - """ - Collection of data fragments and potentially child datasets. - - Arrow Datasets allow you to query against data that has been split across - multiple files. This sharding of data may indicate partitioning, which - can accelerate queries that only touch some partitions (files). - """ - - @property - def partition_expression(self) -> Expression: - """ - An Expression which evaluates to true for all data viewed by this - Dataset. - """ - def replace_schema(self, schema: lib.Schema) -> None: - """ - Return a copy of this Dataset with a different schema. - - The copy will view the same Fragments. If the new schema is not - compatible with the original dataset's schema then an error will - be raised. - - Parameters - ---------- - schema : Schema - The new dataset schema. - """ - def get_fragments(self, filter: Expression | None = None): - """Returns an iterator over the fragments in this dataset. - - Parameters - ---------- - filter : Expression, default None - Return fragments matching the optional filter, either using the - partition_expression or internal information like Parquet's - statistics. 
- - Returns - ------- - fragments : iterator of Fragment - """ - def scanner( - self, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: - """ - Build a scan operation against the dataset. - - Data is not loaded immediately. Instead, this produces a Scanner, - which exposes further operations (e.g. loading all data as a - table, counting rows). - - See the :meth:`Scanner.from_dataset` method for further information. - - Parameters - ---------- - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - scanner : Scanner - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "dataset_scanner.parquet") - - >>> import pyarrow.dataset as ds - >>> dataset = ds.dataset("dataset_scanner.parquet") - - Selecting a subset of the columns: - - >>> dataset.scanner(columns=["year", "n_legs"]).to_table() - pyarrow.Table - year: int64 - n_legs: int64 - ---- - year: [[2020,2022,2021,2022,2019,2021]] - n_legs: [[2,2,4,4,5,100]] - - Projecting selected columns using an expression: - - >>> dataset.scanner( - ... columns={ - ... "n_legs_uint": ds.field("n_legs").cast("uint8"), - ... } - ... ).to_table() - pyarrow.Table - n_legs_uint: uint8 - ---- - n_legs_uint: [[2,2,4,4,5,100]] - - Filtering rows while scanning: - - >>> dataset.scanner(filter=ds.field("year") > 2020).to_table() - pyarrow.Table - year: int64 - n_legs: int64 - animal: string - ---- - year: [[2022,2021,2022,2021]] - n_legs: [[2,4,4,100]] - animal: [["Parrot","Dog","Horse","Centipede"]] - """ - def to_batches( - self, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Iterator[lib.RecordBatch]: - """ - Read the dataset as materialized record batches. - - Parameters - ---------- - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. 
- use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - record_batches : iterator of RecordBatch - """ - def to_table( - self, - columns: list[str] | dict[str, Expression] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Read the dataset to an Arrow table. - - Note that this method reads all the selected data from the dataset - into memory. - - Parameters - ---------- - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. 
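For contrast with to_table(), which materializes everything at once, a small sketch of the streaming path via to_batches() (the in-memory table is invented for the example):

    import pyarrow as pa
    import pyarrow.dataset as ds

    dataset = ds.dataset(pa.table({"n_legs": [2, 4, 5, 100]}))

    # to_batches() yields RecordBatches lazily instead of building one big Table.
    total_rows = 0
    for batch in dataset.to_batches(columns=["n_legs"], batch_size=2):
        total_rows += batch.num_rows
    print(total_rows)  # 4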
- - Returns - ------- - table : Table - """ - def take( - self, - indices: Indices, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Select rows of data by index. - - Parameters - ---------- - indices : Array or array-like - indices of rows to select in the dataset. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - table : Table - """ - def head( - self, - num_rows: int, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Load the first N rows of the dataset. - - Parameters - ---------- - num_rows : int - The number of rows to load. 
- columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - table : Table - """ - def count_rows( - self, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> int: - """ - Count rows matching the scanner filter. - - Parameters - ---------- - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. 
Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - count : int - """ - @property - def schema(self) -> lib.Schema: - """The common schema of the full Dataset""" - def filter(self, expression: Expression) -> Self: - """ - Apply a row filter to the dataset. - - Parameters - ---------- - expression : Expression - The filter that should be applied to the dataset. - - Returns - ------- - Dataset - """ - def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> InMemoryDataset: - """ - Sort the Dataset by one or multiple columns. - - Parameters - ---------- - sorting : str or list[tuple(name, order)] - Name of the column to use to sort (ascending), or - a list of multiple sorting conditions where - each entry is a tuple with column name - and sorting order ("ascending" or "descending") - **kwargs : dict, optional - Additional sorting options. - As allowed by :class:`SortOptions` - - Returns - ------- - InMemoryDataset - A new dataset sorted according to the sort keys. - """ - def join( - self, - right_dataset: Dataset, - keys: str | list[str], - right_keys: str | list[str] | None = None, - join_type: JoinType = "left outer", - left_suffix: str | None = None, - right_suffix: str | None = None, - coalesce_keys: bool = True, - use_threads: bool = True, - ) -> InMemoryDataset: - """ - Perform a join between this dataset and another one. - - Result of the join will be a new dataset, where further - operations can be applied. - - Parameters - ---------- - right_dataset : dataset - The dataset to join to the current one, acting as the right dataset - in the join operation. - keys : str or list[str] - The columns from current dataset that should be used as keys - of the join operation left side. - right_keys : str or list[str], default None - The columns from the right_dataset that should be used as keys - on the join operation right side. - When ``None`` use the same key names as the left dataset. - join_type : str, default "left outer" - The kind of join that should be performed, one of - ("left semi", "right semi", "left anti", "right anti", - "inner", "left outer", "right outer", "full outer") - left_suffix : str, default None - Which suffix to add to right column names. This prevents confusion - when the columns in left and right datasets have colliding names. - right_suffix : str, default None - Which suffix to add to the left column names. This prevents confusion - when the columns in left and right datasets have colliding names. - coalesce_keys : bool, default True - If the duplicated keys should be omitted from one of the sides - in the join result. - use_threads : bool, default True - Whenever to use multithreading or not. 
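A minimal sketch of Dataset.join (the column names and values are invented; the joined result is an InMemoryDataset that can be scanned or converted like any other dataset):

    import pyarrow as pa
    import pyarrow.dataset as ds

    left = ds.dataset(pa.table({"id": [1, 2, 3], "year": [2019, 2020, 2021]}))
    right = ds.dataset(pa.table({"id": [2, 3, 4], "n_legs": [4, 5, 100]}))

    # Inner join on "id"; rows without a match on either side are dropped.
    joined = left.join(right, keys="id", join_type="inner")
    print(joined.to_table().sort_by("id"))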
- - Returns - ------- - InMemoryDataset - """ - def join_asof( - self, - right_dataset: Dataset, - on: str, - by: str | list[str], - tolerance: int, - right_on: str | list[str] | None = None, - right_by: str | list[str] | None = None, - ) -> InMemoryDataset: - """ - Perform an asof join between this dataset and another one. - - This is similar to a left-join except that we match on nearest key rather - than equal keys. Both datasets must be sorted by the key. This type of join - is most useful for time series data that are not perfectly aligned. - - Optionally match on equivalent keys with "by" before searching with "on". - - Result of the join will be a new Dataset, where further - operations can be applied. - - Parameters - ---------- - right_dataset : dataset - The dataset to join to the current one, acting as the right dataset - in the join operation. - on : str - The column from current dataset that should be used as the "on" key - of the join operation left side. - - An inexact match is used on the "on" key, i.e. a row is considered a - match if and only if left_on - tolerance <= right_on <= left_on. - - The input table must be sorted by the "on" key. Must be a single - field of a common type. - - Currently, the "on" key must be an integer, date, or timestamp type. - by : str or list[str] - The columns from current dataset that should be used as the keys - of the join operation left side. The join operation is then done - only for the matches in these columns. - tolerance : int - The tolerance for inexact "on" key matching. A right row is considered - a match with the left row `right.on - left.on <= tolerance`. The - `tolerance` may be: - - - negative, in which case a past-as-of-join occurs; - - or positive, in which case a future-as-of-join occurs; - - or zero, in which case an exact-as-of-join occurs. - - The tolerance is interpreted in the same units as the "on" key. - right_on : str or list[str], default None - The columns from the right_dataset that should be used as the on key - on the join operation right side. - When ``None`` use the same key name as the left dataset. - right_by : str or list[str], default None - The columns from the right_dataset that should be used as by keys - on the join operation right side. - When ``None`` use the same key names as the left dataset. - - Returns - ------- - InMemoryDataset - """ - -class InMemoryDataset(Dataset): - """ - A Dataset wrapping in-memory data. - - Parameters - ---------- - source : RecordBatch, Table, list, tuple - The data for this dataset. Can be a RecordBatch, Table, list of - RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader - If an iterable is provided, the schema must also be provided. - schema : Schema, optional - Only required if passing an iterable as the source - """ - -class UnionDataset(Dataset): - """ - A Dataset wrapping child datasets. - - Children's schemas must agree with the provided schema. - - Parameters - ---------- - schema : Schema - A known schema to conform to. - children : list of Dataset - One or more input children - """ - - @property - def children(self) -> list[Dataset]: ... - -class FileSystemDataset(Dataset): - """ - A Dataset of file fragments. - - A FileSystemDataset is composed of one or more FileFragment. - - Parameters - ---------- - fragments : list[Fragments] - List of fragments to consume. - schema : Schema - The top-level schema of the Dataset. 
- format : FileFormat - File format of the fragments, currently only ParquetFileFormat, - IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. - filesystem : FileSystem - FileSystem of the fragments. - root_partition : Expression, optional - The top-level partition of the DataDataset. - """ - - def __init__( - self, - fragments: list[Fragment], - schema: lib.Schema, - format: FileFormat, - filesystem: SupportedFileSystem | None = None, - root_partition: Expression | None = None, - ) -> None: ... - @classmethod - def from_paths( - cls, - paths: list[str], - schema: lib.Schema | None = None, - format: FileFormat | None = None, - filesystem: SupportedFileSystem | None = None, - partitions: list[Expression] | None = None, - root_partition: Expression | None = None, - ) -> FileSystemDataset: - """ - A Dataset created from a list of paths on a particular filesystem. - - Parameters - ---------- - paths : list of str - List of file paths to create the fragments from. - schema : Schema - The top-level schema of the DataDataset. - format : FileFormat - File format to create fragments from, currently only - ParquetFileFormat, IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. - filesystem : FileSystem - The filesystem which files are from. - partitions : list[Expression], optional - Attach additional partition information for the file paths. - root_partition : Expression, optional - The top-level partition of the DataDataset. - """ - @property - def filesystem(self) -> FileSystem: ... - @property - def partitioning(self) -> Partitioning | None: - """ - The partitioning of the Dataset source, if discovered. - - If the FileSystemDataset is created using the ``dataset()`` factory - function with a partitioning specified, this will return the - finalized Partitioning object from the dataset discovery. In all - other cases, this returns None. - """ - @property - def files(self) -> list[str]: - """List of the files""" - @property - def format(self) -> FileFormat: - """The FileFormat of this source.""" - -class FileWriteOptions(lib._Weakrefable): - @property - def format(self) -> FileFormat: ... - -class FileFormat(lib._Weakrefable): - def inspect( - self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None - ) -> lib.Schema: - """ - Infer the schema of a file. - - Parameters - ---------- - file : file-like object, path-like or str - The file or file path to infer a schema from. - filesystem : Filesystem, optional - If `filesystem` is given, `file` must be a string and specifies - the path of the file to read from the filesystem. - - Returns - ------- - schema : Schema - The schema inferred from the file - """ - def make_fragment( - self, - file: StrPath | IO, - filesystem: SupportedFileSystem | None = None, - partition_expression: Expression | None = None, - *, - file_size: int | None = None, - ) -> Fragment: - """ - Make a FileFragment from a given file. - - Parameters - ---------- - file : file-like object, path-like or str - The file or file path to make a fragment from. - filesystem : Filesystem, optional - If `filesystem` is given, `file` must be a string and specifies - the path of the file to read from the filesystem. - partition_expression : Expression, optional - An expression that is guaranteed true for all rows in the fragment. Allows - fragment to be potentially skipped while scanning with a filter. - file_size : int, optional - The size of the file in bytes. 
Can improve performance with high-latency filesystems - when file size needs to be known before reading. - - Returns - ------- - fragment : Fragment - The file fragment - """ - def make_write_options(self) -> FileWriteOptions: ... - @property - def default_extname(self) -> str: ... - @property - def default_fragment_scan_options(self) -> FragmentScanOptions: ... - @default_fragment_scan_options.setter - def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... - -class Fragment(lib._Weakrefable): - """Fragment of data from a Dataset.""" - @property - def physical_schema(self) -> lib.Schema: - """Return the physical schema of this Fragment. This schema can be - different from the dataset read schema.""" - @property - def partition_expression(self) -> Expression: - """An Expression which evaluates to true for all data viewed by this - Fragment. - """ - def scanner( - self, - schema: lib.Schema | None = None, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: - """ - Build a scan operation against the fragment. - - Data is not loaded immediately. Instead, this produces a Scanner, - which exposes further operations (e.g. loading all data as a - table, counting rows). - - Parameters - ---------- - schema : Schema - Schema to use for scanning. This is used to unify a Fragment to - its Dataset's schema. If not specified this will use the - Fragment's physical schema which might differ for each Fragment. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. 
Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - scanner : Scanner - """ - def to_batches( - self, - schema: lib.Schema | None = None, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Iterator[lib.RecordBatch]: - """ - Read the fragment as materialized record batches. - - Parameters - ---------- - schema : Schema, optional - Concrete schema to use for scanning. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. 
- cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - record_batches : iterator of RecordBatch - """ - def to_table( - self, - schema: lib.Schema | None = None, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Convert this Fragment into a Table. - - Use this convenience utility with care. This will serially materialize - the Scan result in memory before creating the Table. - - Parameters - ---------- - schema : Schema, optional - Concrete schema to use for scanning. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. 
- - Returns - ------- - table : Table - """ - def take( - self, - indices: Indices, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Select rows of data by index. - - Parameters - ---------- - indices : Array or array-like - The indices of row to select in the dataset. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - Table - """ - def head( - self, - num_rows: int, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Load the first N rows of the fragment. - - Parameters - ---------- - num_rows : int - The number of rows to load. 
- columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - Table - """ - def count_rows( - self, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> int: - """ - Count rows matching the scanner filter. - - Parameters - ---------- - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. 
Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - count : int - """ - -class FileFragment(Fragment): - """A Fragment representing a data file.""" - - def open(self) -> lib.NativeFile: - """ - Open a NativeFile of the buffer or file viewed by this fragment. - """ - @property - def path(self) -> str: - """ - The path of the data file viewed by this fragment, if it views a - file. If instead it views a buffer, this will be "". - """ - @property - def filesystem(self) -> FileSystem: - """ - The FileSystem containing the data file viewed by this fragment, if - it views a file. If instead it views a buffer, this will be None. - """ - @property - def buffer(self) -> lib.Buffer: - """ - The buffer viewed by this fragment, if it views a buffer. If - instead it views a file, this will be None. - """ - @property - def format(self) -> FileFormat: - """ - The format of the data file viewed by this fragment. - """ - -class FragmentScanOptions(lib._Weakrefable): - """Scan options specific to a particular fragment and scan operation.""" - - @property - def type_name(self) -> str: ... - -class IpcFileWriteOptions(FileWriteOptions): - @property - def write_options(self) -> IpcWriteOptions: ... - @write_options.setter - def write_options(self, write_options: IpcWriteOptions) -> None: ... - -class IpcFileFormat(FileFormat): - def equals(self, other: IpcFileFormat) -> bool: ... - def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ... - @property - def default_extname(self) -> str: ... - -class FeatherFileFormat(IpcFileFormat): ... - -class CsvFileFormat(FileFormat): - """ - FileFormat for CSV files. - - Parameters - ---------- - parse_options : pyarrow.csv.ParseOptions - Options regarding CSV parsing. - default_fragment_scan_options : CsvFragmentScanOptions - Default options for fragments scan. - convert_options : pyarrow.csv.ConvertOptions - Options regarding value conversion. - read_options : pyarrow.csv.ReadOptions - General read options. - """ - def __init__( - self, - parse_options: _csv.ParseOptions | None = None, - default_fragment_scan_options: CsvFragmentScanOptions | None = None, - convert_options: _csv.ConvertOptions | None = None, - read_options: _csv.ReadOptions | None = None, - ) -> None: ... - def make_write_options(self) -> _csv.WriteOptions: ... # type: ignore[override] - @property - def parse_options(self) -> _csv.ParseOptions: ... - @parse_options.setter - def parse_options(self, parse_options: _csv.ParseOptions) -> None: ... - def equals(self, other: CsvFileFormat) -> bool: ... - -class CsvFragmentScanOptions(FragmentScanOptions): - """ - Scan-specific options for CSV fragments. - - Parameters - ---------- - convert_options : pyarrow.csv.ConvertOptions - Options regarding value conversion. 
- read_options : pyarrow.csv.ReadOptions - General read options. - """ - - convert_options: _csv.ConvertOptions - read_options: _csv.ReadOptions - - def __init__( - self, convert_options: _csv.ConvertOptions, read_options: _csv.ReadOptions - ) -> None: ... - def equals(self, other: CsvFragmentScanOptions) -> bool: ... - -class CsvFileWriteOptions(FileWriteOptions): - write_options: _csv.WriteOptions - -class JsonFileFormat(FileFormat): - """ - FileFormat for JSON files. - - Parameters - ---------- - default_fragment_scan_options : JsonFragmentScanOptions - Default options for fragments scan. - parse_options : pyarrow.json.ParseOptions - Options regarding json parsing. - read_options : pyarrow.json.ReadOptions - General read options. - """ - def __init__( - self, - default_fragment_scan_options: JsonFragmentScanOptions | None = None, - parse_options: _json.ParseOptions | None = None, - read_options: _json.ReadOptions | None = None, - ) -> None: ... - def equals(self, other: JsonFileFormat) -> bool: ... - -class JsonFragmentScanOptions(FragmentScanOptions): - """ - Scan-specific options for JSON fragments. - - Parameters - ---------- - parse_options : pyarrow.json.ParseOptions - Options regarding JSON parsing. - read_options : pyarrow.json.ReadOptions - General read options. - """ - - parse_options: _json.ParseOptions - read_options: _json.ReadOptions - def __init__( - self, parse_options: _json.ParseOptions, read_options: _json.ReadOptions - ) -> None: ... - def equals(self, other: JsonFragmentScanOptions) -> bool: ... - -class Partitioning(lib._Weakrefable): - def parse(self, path: str) -> Expression: - """ - Parse a path into a partition expression. - - Parameters - ---------- - path : str - - Returns - ------- - pyarrow.dataset.Expression - """ - def format(self, expr: Expression) -> tuple[str, str]: - """ - Convert a filter expression into a tuple of (directory, filename) using - the current partitioning scheme - - Parameters - ---------- - expr : pyarrow.dataset.Expression - - Returns - ------- - tuple[str, str] - - Examples - -------- - - Specify the Schema for paths like "/2009/June": - - >>> import pyarrow as pa - >>> import pyarrow.dataset as ds - >>> import pyarrow.compute as pc - >>> part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())])) - >>> part.format((pc.field("year") == 1862) & (pc.field("month") == "Jan")) - ('1862/Jan', '') - """ - @property - def schema(self) -> lib.Schema: - """The arrow Schema attached to the partitioning.""" - -class PartitioningFactory(lib._Weakrefable): - @property - def type_name(self) -> str: ... - -class KeyValuePartitioning(Partitioning): - @property - def dictionaries(self) -> list[lib.Array | None]: - """ - The unique values for each partition field, if available. - - Those values are only available if the Partitioning object was - created through dataset discovery from a PartitioningFactory, or - if the dictionaries were manually specified in the constructor. - If no dictionary field is available, this returns an empty list. - """ - -class DirectoryPartitioning(KeyValuePartitioning): - """ - A Partitioning based on a specified Schema. - - The DirectoryPartitioning expects one segment in the file path for each - field in the schema (all fields are required to be present). - For example given schema the path "/2009/11" would - be parsed to ("year"_ == 2009 and "month"_ == 11). - - Parameters - ---------- - schema : Schema - The schema that describes the partitions present in the file path. 
- dictionaries : dict[str, Array] - If the type of any field of `schema` is a dictionary type, the - corresponding entry of `dictionaries` must be an array containing - every value which may be taken by the corresponding column or an - error will be raised in parsing. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - DirectoryPartitioning - - Examples - -------- - >>> from pyarrow.dataset import DirectoryPartitioning - >>> partitioning = DirectoryPartitioning( - ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) - ... ) - >>> print(partitioning.parse("/2009/11/")) - ((year == 2009) and (month == 11)) - """ - - @staticmethod - def discover( - field_names: list[str] | None = None, - infer_dictionary: bool = False, - max_partition_dictionary_size: int = 0, - schema: lib.Schema | None = None, - segment_encoding: Literal["uri", "none"] = "uri", - ) -> PartitioningFactory: - """ - Discover a DirectoryPartitioning. - - Parameters - ---------- - field_names : list of str - The names to associate with the values from the subdirectory names. - If schema is given, will be populated from the schema. - infer_dictionary : bool, default False - When inferring a schema for partition fields, yield dictionary - encoded types instead of plain types. This can be more efficient - when materializing virtual columns, and Expressions parsed by the - finished Partitioning will include dictionaries of all unique - inspected values for each field. - max_partition_dictionary_size : int, default 0 - Synonymous with infer_dictionary for backwards compatibility with - 1.0: setting this to -1 or None is equivalent to passing - infer_dictionary=True. - schema : Schema, default None - Use this schema instead of inferring a schema from partition - values. Partition values will be validated against this schema - before accumulation into the Partitioning's dictionary. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - PartitioningFactory - To be used in the FileSystemFactoryOptions. - """ - def __init__( - self, - schema: lib.Schema, - dictionaries: dict[str, lib.Array] | None = None, - segment_encoding: Literal["uri", "none"] = "uri", - ) -> None: ... - -class HivePartitioning(KeyValuePartitioning): - """ - A Partitioning for "/$key=$value/" nested directories as found in - Apache Hive. - - Multi-level, directory based partitioning scheme originating from - Apache Hive with all data files stored in the leaf directories. Data is - partitioned by static values of a particular column in the schema. - Partition keys are represented in the form $key=$value in directory names. - Field order is ignored, as are missing or unrecognized field names. - - For example, given schema, a possible - path would be "/year=2009/month=11/day=15". - - Parameters - ---------- - schema : Schema - The schema that describes the partitions present in the file path. - dictionaries : dict[str, Array] - If the type of any field of `schema` is a dictionary type, the - corresponding entry of `dictionaries` must be an array containing - every value which may be taken by the corresponding column or an - error will be raised in parsing. 
- null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" - If any field is None then this fallback will be used as a label - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - HivePartitioning - - Examples - -------- - >>> from pyarrow.dataset import HivePartitioning - >>> partitioning = HivePartitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())])) - >>> print(partitioning.parse("/year=2009/month=11/")) - ((year == 2009) and (month == 11)) - - """ - def __init__( - self, - schema: lib.Schema, - dictionaries: dict[str, lib.Array] | None = None, - null_fallback: str = "__HIVE_DEFAULT_PARTITION__", - segment_encoding: Literal["uri", "none"] = "uri", - ) -> None: ... - @staticmethod - def discover( - infer_dictionary: bool = False, - max_partition_dictionary_size: int = 0, - null_fallback="__HIVE_DEFAULT_PARTITION__", - schema: lib.Schema | None = None, - segment_encoding: Literal["uri", "none"] = "uri", - ) -> PartitioningFactory: - """ - Discover a HivePartitioning. - - Parameters - ---------- - infer_dictionary : bool, default False - When inferring a schema for partition fields, yield dictionary - encoded types instead of plain. This can be more efficient when - materializing virtual columns, and Expressions parsed by the - finished Partitioning will include dictionaries of all unique - inspected values for each field. - max_partition_dictionary_size : int, default 0 - Synonymous with infer_dictionary for backwards compatibility with - 1.0: setting this to -1 or None is equivalent to passing - infer_dictionary=True. - null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" - When inferring a schema for partition fields this value will be - replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__ - for compatibility with Spark - schema : Schema, default None - Use this schema instead of inferring a schema from partition - values. Partition values will be validated against this schema - before accumulation into the Partitioning's dictionary. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - PartitioningFactory - To be used in the FileSystemFactoryOptions. - """ - -class FilenamePartitioning(KeyValuePartitioning): - """ - A Partitioning based on a specified Schema. - - The FilenamePartitioning expects one segment in the file name for each - field in the schema (all fields are required to be present) separated - by '_'. For example given schema the name - ``"2009_11_"`` would be parsed to ("year" == 2009 and "month" == 11). - - Parameters - ---------- - schema : Schema - The schema that describes the partitions present in the file path. - dictionaries : dict[str, Array] - If the type of any field of `schema` is a dictionary type, the - corresponding entry of `dictionaries` must be an array containing - every value which may be taken by the corresponding column or an - error will be raised in parsing. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - FilenamePartitioning - - Examples - -------- - >>> from pyarrow.dataset import FilenamePartitioning - >>> partitioning = FilenamePartitioning( - ... 
pa.schema([("year", pa.int16()), ("month", pa.int8())]) - ... ) - >>> print(partitioning.parse("2009_11_data.parquet")) - ((year == 2009) and (month == 11)) - """ - - def __init__( - self, - schema: lib.Schema, - dictionaries: dict[str, lib.Array] | None = None, - segment_encoding: Literal["uri", "none"] = "uri", - ) -> None: ... - @staticmethod - def discover( - field_names: list[str] | None = None, - infer_dictionary: bool = False, - schema: lib.Schema | None = None, - segment_encoding: Literal["uri", "none"] = "uri", - ) -> PartitioningFactory: - """ - Discover a FilenamePartitioning. - - Parameters - ---------- - field_names : list of str - The names to associate with the values from the subdirectory names. - If schema is given, will be populated from the schema. - infer_dictionary : bool, default False - When inferring a schema for partition fields, yield dictionary - encoded types instead of plain types. This can be more efficient - when materializing virtual columns, and Expressions parsed by the - finished Partitioning will include dictionaries of all unique - inspected values for each field. - schema : Schema, default None - Use this schema instead of inferring a schema from partition - values. Partition values will be validated against this schema - before accumulation into the Partitioning's dictionary. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - PartitioningFactory - To be used in the FileSystemFactoryOptions. - """ - -class DatasetFactory(lib._Weakrefable): - """ - DatasetFactory is used to create a Dataset, inspect the Schema - of the fragments contained in it, and declare a partitioning. - """ - - root_partition: Expression - def finish(self, schema: lib.Schema | None = None) -> Dataset: - """ - Create a Dataset using the inspected schema or an explicit schema - (if given). - - Parameters - ---------- - schema : Schema, default None - The schema to conform the source to. If None, the inspected - schema is used. - - Returns - ------- - Dataset - """ - def inspect(self) -> lib.Schema: - """ - Inspect all data fragments and return a common Schema. - - Returns - ------- - Schema - """ - def inspect_schemas(self) -> list[lib.Schema]: ... - -class FileSystemFactoryOptions(lib._Weakrefable): - """ - Influences the discovery of filesystem paths. - - Parameters - ---------- - partition_base_dir : str, optional - For the purposes of applying the partitioning, paths will be - stripped of the partition_base_dir. Files not matching the - partition_base_dir prefix will be skipped for partitioning discovery. - The ignored files will still be part of the Dataset, but will not - have partition information. - partitioning : Partitioning/PartitioningFactory, optional - Apply the Partitioning to every discovered Fragment. See Partitioning or - PartitioningFactory documentation. - exclude_invalid_files : bool, optional (default True) - If True, invalid files will be excluded (file format specific check). - This will incur IO for each files in a serial and single threaded - fashion. Disabling this feature will skip the IO, but unsupported - files may be present in the Dataset (resulting in an error at scan - time). - selector_ignore_prefixes : list, optional - When discovering from a Selector (and not from an explicit file list), - ignore files and directories matching any of these prefixes. - By default this is ['.', '_']. 
- """ - - partitioning: Partitioning - partitioning_factory: PartitioningFactory - partition_base_dir: str - exclude_invalid_files: bool - selector_ignore_prefixes: list[str] - - def __init__( - self, - artition_base_dir: str | None = None, - partitioning: Partitioning | PartitioningFactory | None = None, - exclude_invalid_files: bool = True, - selector_ignore_prefixes: list[str] | None = None, - ) -> None: ... - -class FileSystemDatasetFactory(DatasetFactory): - """ - Create a DatasetFactory from a list of paths with schema inspection. - - Parameters - ---------- - filesystem : pyarrow.fs.FileSystem - Filesystem to discover. - paths_or_selector : pyarrow.fs.FileSelector or list of path-likes - Either a Selector object or a list of path-like objects. - format : FileFormat - Currently only ParquetFileFormat and IpcFileFormat are supported. - options : FileSystemFactoryOptions, optional - Various flags influencing the discovery of filesystem paths. - """ - - def __init__( - self, - filesystem: SupportedFileSystem, - paths_or_selector: FileSelector, - format: FileFormat, - options: FileSystemFactoryOptions | None = None, - ) -> None: ... - -class UnionDatasetFactory(DatasetFactory): - """ - Provides a way to inspect/discover a Dataset's expected schema before - materialization. - - Parameters - ---------- - factories : list of DatasetFactory - """ - def __init__(self, factories: list[DatasetFactory]) -> None: ... - -_RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch) - -class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]): - """An iterator over a sequence of record batches.""" - def __iter__(self) -> Self: ... - def __next__(self) -> _RecordBatchT: ... - -class TaggedRecordBatch(NamedTuple): - """ - A combination of a record batch and the fragment it came from. - - Parameters - ---------- - record_batch : RecordBatch - The record batch. - fragment : Fragment - Fragment of the record batch. - """ - - record_batch: lib.RecordBatch - fragment: Fragment - -class TaggedRecordBatchIterator(lib._Weakrefable): - """An iterator over a sequence of record batches with fragments.""" - def __iter__(self) -> Self: ... - def __next__(self) -> TaggedRecordBatch: ... - -class Scanner(lib._Weakrefable): - """A materialized scan operation with context and options bound. - - A scanner is the class that glues the scan tasks, data fragments and data - sources together. - """ - @staticmethod - def from_dataset( - dataset: Dataset, - *, - columns: list[str] | dict[str, Expression] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: - """ - Create Scanner from Dataset, - - Parameters - ---------- - dataset : Dataset - Dataset to scan. - columns : list[str] or dict[str, Expression], default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. 
- - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - """ - @staticmethod - def from_fragment( - fragment: Fragment, - *, - schema: lib.Schema | None = None, - columns: list[str] | dict[str, Expression] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: - """ - Create Scanner from Fragment, - - Parameters - ---------- - fragment : Fragment - fragment to scan. - schema : Schema, optional - The schema of the fragment. - columns : list[str] or dict[str, Expression], default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). 
- - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - """ - @overload - @staticmethod - def from_batches( - source: Iterator[lib.RecordBatch], - *, - schema: lib.Schema, - columns: list[str] | dict[str, Expression] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: ... - @overload - @staticmethod - def from_batches( - source: RecordBatchReader, - *, - columns: list[str] | dict[str, Expression] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: ... - @staticmethod - def from_batches(*args, **kwargs): - """ - Create a Scanner from an iterator of batches. - - This creates a scanner which can be used only once. It is - intended to support writing a dataset (which takes a scanner) - from a source which can be read only once (e.g. a - RecordBatchReader or generator). - - Parameters - ---------- - source : Iterator or Arrow-compatible stream object - The iterator of Batches. This can be a pyarrow RecordBatchReader, - any object that implements the Arrow PyCapsule Protocol for - streams, or an actual Python iterator of RecordBatches. - schema : Schema - The schema of the batches (required when passing a Python - iterator). - columns : list[str] or dict[str, Expression], default None - The columns to project. 
This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - """ - @property - def dataset_schema(self) -> lib.Schema: - """The schema with which batches will be read from fragments.""" - @property - def projected_schema(self) -> lib.Schema: - """ - The materialized schema of the data, accounting for projections. - - This is the schema of any data returned from the scanner. - """ - def to_batches(self) -> Iterator[lib.RecordBatch]: - """ - Consume a Scanner in record batches. - - Returns - ------- - record_batches : iterator of RecordBatch - """ - def scan_batches(self) -> TaggedRecordBatchIterator: - """ - Consume a Scanner in record batches with corresponding fragments. - - Returns - ------- - record_batches : iterator of TaggedRecordBatch - """ - def to_table(self) -> lib.Table: - """ - Convert a Scanner into a Table. - - Use this convenience utility with care. This will serially materialize - the Scan result in memory before creating the Table. - - Returns - ------- - Table - """ - def take(self, indices: Indices) -> lib.Table: - """ - Select rows of data by index. - - Will only consume as many batches of the underlying dataset as - needed. Otherwise, this is equivalent to - ``to_table().take(indices)``. 
- - Parameters - ---------- - indices : Array or array-like - indices of rows to select in the dataset. - - Returns - ------- - Table - """ - def head(self, num_rows: int) -> lib.Table: - """ - Load the first N rows of the dataset. - - Parameters - ---------- - num_rows : int - The number of rows to load. - - Returns - ------- - Table - """ - def count_rows(self) -> int: - """ - Count rows matching the scanner filter. - - Returns - ------- - count : int - """ - def to_reader(self) -> RecordBatchReader: - """Consume this scanner as a RecordBatchReader. - - Returns - ------- - RecordBatchReader - """ - -def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: - """ - Extract partition keys (equality constraints between a field and a scalar) - from an expression as a dict mapping the field's name to its value. - - NB: All expressions yielded by a HivePartitioning or DirectoryPartitioning - will be conjunctions of equality conditions and are accessible through this - function. Other subexpressions will be ignored. - - Parameters - ---------- - partition_expression : pyarrow.dataset.Expression - - Returns - ------- - dict - - Examples - -------- - - For example, an expression of - - is converted to {'part': 'A', 'year': 2016} - """ - -class WrittenFile(lib._Weakrefable): - """ - Metadata information about files written as - part of a dataset write operation - - Parameters - ---------- - path : str - Path to the file. - metadata : pyarrow.parquet.FileMetaData, optional - For Parquet files, the Parquet file metadata. - size : int - The size of the file in bytes. - """ - def __init__(self, path: str, metadata: _parquet.FileMetaData | None, size: int) -> None: ... - -def _filesystemdataset_write( - data: Scanner, - base_dir: StrPath, - basename_template: str, - filesystem: SupportedFileSystem, - partitioning: Partitioning, - file_options: FileWriteOptions, - max_partitions: int, - file_visitor: Callable[[str], None], - existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"], - max_open_files: int, - max_rows_per_file: int, - min_rows_per_group: int, - max_rows_per_group: int, - create_dir: bool, -): ... - -class _ScanNodeOptions(ExecNodeOptions): - def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ... - -class ScanNodeOptions(_ScanNodeOptions): - """ - A Source node which yields batches from a Dataset scan. - - This is the option class for the "scan" node factory. - - This node is capable of applying pushdown projections or filters - to the file readers which reduce the amount of data that needs to - be read (if supported by the file format). But note that this does not - construct associated filter or project nodes to perform the final - filtering or projection. Rather, you may supply the same filter - expression or projection to the scan node that you also supply - to the filter or project node. - - Yielded batches will be augmented with fragment/batch indices when - implicit_ordering=True to enable stable ordering for simple ExecPlans. - - Parameters - ---------- - dataset : pyarrow.dataset.Dataset - The table which acts as the data source. - **kwargs : dict, optional - Scan options. See `Scanner.from_dataset` for possible arguments. - require_sequenced_output : bool, default False - Batches are yielded sequentially, like single-threaded - implicit_ordering : bool, default False - Preserve implicit ordering of data. - """ - - def __init__( - self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs - ) -> None: ... 
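The _dataset stubs above cover pyarrow.dataset's scanning API (Dataset, Fragment, Partitioning, Scanner, get_partition_keys). As a rough orientation for readers of the stubs, a minimal usage sketch follows; the "data" directory and the column names are illustrative assumptions, not anything defined by this patch.

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

# Write a tiny hive-partitioned dataset so the sketch is self-contained
# ("data", "year", "month" and "value" are made-up names).
table = pa.table({"year": [2009, 2009, 2010], "month": [11, 12, 1], "value": [1.0, 2.0, 3.0]})
ds.write_dataset(table, "data", format="parquet",
                 partitioning=["year"], partitioning_flavor="hive")

# Rediscover the dataset; the HivePartitioning is reconstructed from the paths.
dataset = ds.dataset("data", format="parquet", partitioning="hive")

# Build a Scanner with a projection and a filter that can be pushed down
# to the partition expressions / Parquet statistics of each fragment.
scanner = dataset.scanner(columns=["month", "value"],
                          filter=pc.field("year") == 2009)
print(scanner.to_table())

# Per-fragment access, matching the Fragment and partitioning stubs above.
for fragment in dataset.get_fragments():
    print(fragment.path, ds.get_partition_keys(fragment.partition_expression))

# A one-shot Scanner from a stream source (Scanner.from_batches); it can be
# consumed only once, e.g. to feed a dataset write.
reader = pa.RecordBatchReader.from_batches(table.schema, table.to_batches())
one_shot = ds.Scanner.from_batches(reader)

Here ds.write_dataset only exists to make the sketch runnable end to end; in practice the dataset would already live on a local or remote filesystem.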
diff --git a/python/pyarrow/_dataset_orc.pyi b/python/pyarrow/_dataset_orc.pyi deleted file mode 100644 index d4e5784750f..00000000000 --- a/python/pyarrow/_dataset_orc.pyi +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from ._dataset import FileFormat - -class OrcFileFormat(FileFormat): - def equals(self, other: OrcFileFormat) -> bool: ... - @property - def default_extname(self): ... diff --git a/python/pyarrow/_dataset_parquet.pyi b/python/pyarrow/_dataset_parquet.pyi deleted file mode 100644 index 007d3404a18..00000000000 --- a/python/pyarrow/_dataset_parquet.pyi +++ /dev/null @@ -1,331 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from dataclasses import dataclass -from typing import IO, Any, Iterable, TypedDict - -from _typeshed import StrPath - -from ._compute import Expression -from ._dataset import ( - DatasetFactory, - FileFormat, - FileFragment, - FileWriteOptions, - Fragment, - FragmentScanOptions, - Partitioning, - PartitioningFactory, -) -from ._dataset_parquet_encryption import ParquetDecryptionConfig -from ._fs import SupportedFileSystem -from ._parquet import FileDecryptionProperties, FileMetaData -from .lib import CacheOptions, Schema, _Weakrefable - -parquet_encryption_enabled: bool - -class ParquetFileFormat(FileFormat): - """ - FileFormat for Parquet - - Parameters - ---------- - read_options : ParquetReadOptions - Read options for the file. - default_fragment_scan_options : ParquetFragmentScanOptions - Scan Options for the file. - **kwargs : dict - Additional options for read option or scan option - """ - def __init__( - self, - read_options: ParquetReadOptions | None = None, - default_fragment_scan_options: ParquetFragmentScanOptions | None = None, - **kwargs, - ) -> None: ... - @property - def read_options(self) -> ParquetReadOptions: ... - def make_write_options(self) -> ParquetFileWriteOptions: ... # type: ignore[override] - def equals(self, other: ParquetFileFormat) -> bool: ... - @property - def default_extname(self) -> str: ... 
- def make_fragment( - self, - file: StrPath | IO, - filesystem: SupportedFileSystem | None = None, - partition_expression: Expression | None = None, - row_groups: Iterable[int] | None = None, - *, - file_size: int | None = None, - ) -> Fragment: - """ - Make a FileFragment from a given file. - - Parameters - ---------- - file : file-like object, path-like or str - The file or file path to make a fragment from. - filesystem : Filesystem, optional - If `filesystem` is given, `file` must be a string and specifies - the path of the file to read from the filesystem. - partition_expression : Expression, optional - An expression that is guaranteed true for all rows in the fragment. Allows - fragment to be potentially skipped while scanning with a filter. - row_groups : Iterable, optional - The indices of the row groups to include - file_size : int, optional - The size of the file in bytes. Can improve performance with high-latency filesystems - when file size needs to be known before reading. - - Returns - ------- - fragment : Fragment - The file fragment - """ - -class _NameStats(TypedDict): - min: Any - max: Any - -class RowGroupInfo: - """ - A wrapper class for RowGroup information - - Parameters - ---------- - id : integer - The group ID. - metadata : FileMetaData - The rowgroup metadata. - schema : Schema - Schema of the rows. - """ - - id: int - metadata: FileMetaData - schema: Schema - - def __init__(self, id: int, metadata: FileMetaData, schema: Schema) -> None: ... - @property - def num_rows(self) -> int: ... - @property - def total_byte_size(self) -> int: ... - @property - def statistics(self) -> dict[str, _NameStats]: ... - -class ParquetFileFragment(FileFragment): - """A Fragment representing a parquet file.""" - - def ensure_complete_metadata(self) -> None: ... - @property - def row_groups(self) -> list[RowGroupInfo]: ... - @property - def metadata(self) -> FileMetaData: ... - @property - def num_row_groups(self) -> int: - """ - Return the number of row groups viewed by this fragment (not the - number of row groups in the origin file). - """ - def split_by_row_group( - self, filter: Expression | None = None, schema: Schema | None = None - ) -> list[Fragment]: - """ - Split the fragment into multiple fragments. - - Yield a Fragment wrapping each row group in this ParquetFileFragment. - Row groups will be excluded whose metadata contradicts the optional - filter. - - Parameters - ---------- - filter : Expression, default None - Only include the row groups which satisfy this predicate (using - the Parquet RowGroup statistics). - schema : Schema, default None - Schema to use when filtering row groups. Defaults to the - Fragment's physical schema - - Returns - ------- - A list of Fragments - """ - def subset( - self, - filter: Expression | None = None, - schema: Schema | None = None, - row_group_ids: list[int] | None = None, - ) -> ParquetFileFormat: - """ - Create a subset of the fragment (viewing a subset of the row groups). - - Subset can be specified by either a filter predicate (with optional - schema) or by a list of row group IDs. Note that when using a filter, - the resulting fragment can be empty (viewing no row groups). - - Parameters - ---------- - filter : Expression, default None - Only include the row groups which satisfy this predicate (using - the Parquet RowGroup statistics). - schema : Schema, default None - Schema to use when filtering row groups. Defaults to the - Fragment's physical schema - row_group_ids : list of ints - The row group IDs to include in the subset. 
Can only be specified - if `filter` is None. - - Returns - ------- - ParquetFileFragment - """ - -class ParquetReadOptions(_Weakrefable): - """ - Parquet format specific options for reading. - - Parameters - ---------- - dictionary_columns : list of string, default None - Names of columns which should be dictionary encoded as - they are read - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular - resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' - and therefore INT96 timestamps will be inferred as timestamps - in nanoseconds - """ - def __init__( - self, dictionary_columns: list[str] | None, coerce_int96_timestamp_unit: str | None = None - ) -> None: ... - @property - def coerce_int96_timestamp_unit(self) -> str: ... - @coerce_int96_timestamp_unit.setter - def coerce_int96_timestamp_unit(self, unit: str) -> None: ... - def equals(self, other: ParquetReadOptions) -> bool: ... - -class ParquetFileWriteOptions(FileWriteOptions): - def update(self, **kwargs) -> None: ... - def _set_properties(self) -> None: ... - def _set_arrow_properties(self) -> None: ... - def _set_encryption_config(self) -> None: ... - -@dataclass(kw_only=True) -class ParquetFragmentScanOptions(FragmentScanOptions): - """ - Scan-specific options for Parquet fragments. - - Parameters - ---------- - use_buffered_stream : bool, default False - Read files through buffered input streams rather than loading entire - row groups at once. This may be enabled to reduce memory overhead. - Disabled by default. - buffer_size : int, default 8192 - Size of buffered stream, if enabled. Default is 8KB. - pre_buffer : bool, default True - If enabled, pre-buffer the raw Parquet data instead of issuing one - read per column chunk. This can improve performance on high-latency - filesystems (e.g. S3, GCS) by coalescing and issuing file reads in - parallel using a background I/O thread pool. - Set to False if you want to prioritize minimal memory usage - over maximum speed. - cache_options : pyarrow.CacheOptions, default None - Cache options used when pre_buffer is enabled. The default values should - be good for most use cases. You may want to adjust these for example if - you have exceptionally high latency to the file system. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None - If not None, use the provided ParquetDecryptionConfig to decrypt the - Parquet file. - decryption_properties : pyarrow.parquet.FileDecryptionProperties, default None - If not None, use the provided FileDecryptionProperties to decrypt encrypted - Parquet file. - page_checksum_verification : bool, default False - If True, verify the page checksum for each page read from the file. 
- """ - - use_buffered_stream: bool = False - buffer_size: int = 8192 - pre_buffer: bool = True - cache_options: CacheOptions | None = None - thrift_string_size_limit: int | None = None - thrift_container_size_limit: int | None = None - decryption_config: ParquetDecryptionConfig | None = None - decryption_properties: FileDecryptionProperties | None = None - page_checksum_verification: bool = False - - def equals(self, other: ParquetFragmentScanOptions) -> bool: ... - -@dataclass -class ParquetFactoryOptions(_Weakrefable): - """ - Influences the discovery of parquet dataset. - - Parameters - ---------- - partition_base_dir : str, optional - For the purposes of applying the partitioning, paths will be - stripped of the partition_base_dir. Files not matching the - partition_base_dir prefix will be skipped for partitioning discovery. - The ignored files will still be part of the Dataset, but will not - have partition information. - partitioning : Partitioning, PartitioningFactory, optional - The partitioning scheme applied to fragments, see ``Partitioning``. - validate_column_chunk_paths : bool, default False - Assert that all ColumnChunk paths are consistent. The parquet spec - allows for ColumnChunk data to be stored in multiple files, but - ParquetDatasetFactory supports only a single file with all ColumnChunk - data. If this flag is set construction of a ParquetDatasetFactory will - raise an error if ColumnChunk data is not resident in a single file. - """ - - partition_base_dir: str | None = None - partitioning: Partitioning | PartitioningFactory | None = None - validate_column_chunk_paths: bool = False - -class ParquetDatasetFactory(DatasetFactory): - """ - Create a ParquetDatasetFactory from a Parquet `_metadata` file. - - Parameters - ---------- - metadata_path : str - Path to the `_metadata` parquet metadata-only file generated with - `pyarrow.parquet.write_metadata`. - filesystem : pyarrow.fs.FileSystem - Filesystem to read the metadata_path from, and subsequent parquet - files. - format : ParquetFileFormat - Parquet format options. - options : ParquetFactoryOptions, optional - Various flags influencing the discovery of filesystem paths. - """ - def __init__( - self, - metadata_path: str, - filesystem: SupportedFileSystem, - format: FileFormat, - options: ParquetFactoryOptions | None = None, - ) -> None: ... diff --git a/python/pyarrow/_flight.pyi b/python/pyarrow/_flight.pyi deleted file mode 100644 index a79475a8796..00000000000 --- a/python/pyarrow/_flight.pyi +++ /dev/null @@ -1,1397 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import asyncio -import enum -import sys - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -from typing import Generator, Generic, Iterable, Iterator, NamedTuple, TypeVar - -from typing_extensions import deprecated - -from .ipc import _ReadPandasMixin -from .lib import ( - ArrowCancelled, - ArrowException, - ArrowInvalid, - Buffer, - IpcReadOptions, - IpcWriteOptions, - RecordBatch, - RecordBatchReader, - Schema, - Table, - TimestampScalar, - _CRecordBatchWriter, - _Weakrefable, -) - -_T = TypeVar("_T") - -class FlightCallOptions(_Weakrefable): - """RPC-layer options for a Flight call.""" - - def __init__( - self, - timeout: float | None = None, - write_options: IpcWriteOptions | None = None, - headers: list[tuple[str, str]] | None = None, - read_options: IpcReadOptions | None = None, - ) -> None: - """Create call options. - - Parameters - ---------- - timeout : float, None - A timeout for the call, in seconds. None means that the - timeout defaults to an implementation-specific value. - write_options : pyarrow.ipc.IpcWriteOptions, optional - IPC write options. The default options can be controlled - by environment variables (see pyarrow.ipc). - headers : List[Tuple[str, str]], optional - A list of arbitrary headers as key, value tuples - read_options : pyarrow.ipc.IpcReadOptions, optional - Serialization options for reading IPC format. - """ - -class CertKeyPair(NamedTuple): - """A TLS certificate and key for use in Flight.""" - - cert: str - key: str - -class FlightError(Exception): - """ - The base class for Flight-specific errors. - - A server may raise this class or one of its subclasses to provide - a more detailed error to clients. - - Parameters - ---------- - message : str, optional - The error message. - extra_info : bytes, optional - Extra binary error details that were provided by the - server/will be sent to the client. - - Attributes - ---------- - extra_info : bytes - Extra binary error details that were provided by the - server/will be sent to the client. - """ - - extra_info: bytes - -class FlightInternalError(FlightError, ArrowException): - """An error internal to the Flight server occurred.""" - -class FlightTimedOutError(FlightError, ArrowException): - """The Flight RPC call timed out.""" - -class FlightCancelledError(FlightError, ArrowCancelled): - """The operation was cancelled.""" - -class FlightServerError(FlightError, ArrowException): - """A server error occurred.""" - -class FlightUnauthenticatedError(FlightError, ArrowException): - """The client is not authenticated.""" - -class FlightUnauthorizedError(FlightError, ArrowException): - """The client is not authorized to perform the given operation.""" - -class FlightUnavailableError(FlightError, ArrowException): - """The server is not reachable or available.""" - -class FlightWriteSizeExceededError(ArrowInvalid): - """A write operation exceeded the client-configured limit.""" - - limit: int - actual: int - -class Action(_Weakrefable): - """An action executable on a Flight service.""" - - def __init__(self, action_type: bytes | str, buf: Buffer | bytes) -> None: - """Create an action from a type and a buffer. - - Parameters - ---------- - action_type : bytes or str - buf : Buffer or bytes-like object - """ - @property - def type(self) -> str: - """The action type.""" - @property - def body(self) -> Buffer: - """The action body (arguments for the action).""" - def serialize(self) -> bytes: - """Get the wire-format representation of this type. 
- - Useful when interoperating with non-Flight systems (e.g. REST - services) that may want to return Flight types. - - """ - @classmethod - def deserialize(cls, serialized: bytes) -> Self: - """Parse the wire-format representation of this type. - - Useful when interoperating with non-Flight systems (e.g. REST - services) that may want to return Flight types. - - """ - -class ActionType(NamedTuple): - """A type of action that is executable on a Flight service.""" - - type: str - description: str - - def make_action(self, buf: Buffer | bytes) -> Action: - """Create an Action with this type. - - Parameters - ---------- - buf : obj - An Arrow buffer or Python bytes or bytes-like object. - """ - -class Result(_Weakrefable): - """A result from executing an Action.""" - def __init__(self, buf: Buffer | bytes) -> None: - """Create a new result. - - Parameters - ---------- - buf : Buffer or bytes-like object - """ - @property - def body(self) -> Buffer: - """Get the Buffer containing the result.""" - def serialize(self) -> bytes: - """Get the wire-format representation of this type. - - Useful when interoperating with non-Flight systems (e.g. REST - services) that may want to return Flight types. - - """ - @classmethod - def deserialize(cls, serialized: bytes) -> Self: - """Parse the wire-format representation of this type. - - Useful when interoperating with non-Flight systems (e.g. REST - services) that may want to return Flight types. - - """ - -class BasicAuth(_Weakrefable): - """A container for basic auth.""" - def __init__( - self, username: str | bytes | None = None, password: str | bytes | None = None - ) -> None: - """Create a new basic auth object. - - Parameters - ---------- - username : string - password : string - """ - @property - def username(self) -> bytes: ... - @property - def password(self) -> bytes: ... - def serialize(self) -> str: ... - @staticmethod - def deserialize(serialized: str | bytes) -> BasicAuth: ... - -class DescriptorType(enum.Enum): - """ - The type of a FlightDescriptor. - - Attributes - ---------- - - UNKNOWN - An unknown descriptor type. - - PATH - A Flight stream represented by a path. - - CMD - A Flight stream represented by an application-defined command. - - """ - - UNKNOWN = 0 - PATH = 1 - CMD = 2 - -class FlightMethod(enum.Enum): - """The implemented methods in Flight.""" - - INVALID = 0 - HANDSHAKE = 1 - LIST_FLIGHTS = 2 - GET_FLIGHT_INFO = 3 - GET_SCHEMA = 4 - DO_GET = 5 - DO_PUT = 6 - DO_ACTION = 7 - LIST_ACTIONS = 8 - DO_EXCHANGE = 9 - -class FlightDescriptor(_Weakrefable): - """A description of a data stream available from a Flight service.""" - @staticmethod - def for_path(*path: str | bytes) -> FlightDescriptor: - """Create a FlightDescriptor for a resource path.""" - - @staticmethod - def for_command(command: str | bytes) -> FlightDescriptor: - """Create a FlightDescriptor for an opaque command.""" - @property - def descriptor_type(self) -> DescriptorType: - """Get the type of this descriptor.""" - @property - def path(self) -> list[bytes] | None: - """Get the path for this descriptor.""" - @property - def command(self) -> bytes | None: - """Get the command for this descriptor.""" - def serialize(self) -> bytes: ... - @classmethod - def deserialize(cls, serialized: bytes) -> Self: ... - -class Ticket(_Weakrefable): - """A ticket for requesting a Flight stream.""" - def __init__(self, ticket: str | bytes) -> None: ... - @property - def ticket(self) -> bytes: ... - def serialize(self) -> bytes: ... 
- @classmethod - def deserialize(cls, serialized: bytes) -> Self: ... - -class Location(_Weakrefable): - """The location of a Flight service.""" - def __init__(self, uri: str | bytes) -> None: ... - @property - def uri(self) -> bytes: ... - def equals(self, other: Location) -> bool: ... - @staticmethod - def for_grpc_tcp(host: str | bytes, port: int) -> Location: - """Create a Location for a TCP-based gRPC service.""" - @staticmethod - def for_grpc_tls(host: str | bytes, port: int) -> Location: - """Create a Location for a TLS-based gRPC service.""" - @staticmethod - def for_grpc_unix(path: str | bytes) -> Location: - """Create a Location for a domain socket-based gRPC service.""" - -class FlightEndpoint(_Weakrefable): - """A Flight stream, along with the ticket and locations to access it.""" - def __init__( - self, - ticket: Ticket | str | bytes, - locations: list[str | Location], - expiration_time: TimestampScalar | None = ..., - app_metadata: bytes | str = ..., - ): - """Create a FlightEndpoint from a ticket and list of locations. - - Parameters - ---------- - ticket : Ticket or bytes - the ticket needed to access this flight - locations : list of string URIs - locations where this flight is available - expiration_time : TimestampScalar, default None - Expiration time of this stream. If present, clients may assume - they can retry DoGet requests. Otherwise, clients should avoid - retrying DoGet requests. - app_metadata : bytes or str, default "" - Application-defined opaque metadata. - - Raises - ------ - ArrowException - If one of the location URIs is not a valid URI. - """ - @property - def ticket(self) -> Ticket: - """Get the ticket in this endpoint.""" - @property - def locations(self) -> list[Location]: - """Get locations where this flight is available.""" - def serialize(self) -> bytes: ... - @property - def expiration_time(self) -> TimestampScalar | None: - """Get the expiration time of this stream. - - If present, clients may assume they can retry DoGet requests. - Otherwise, clients should avoid retrying DoGet requests. - - """ - @property - def app_metadata(self) -> bytes | str: - """Get application-defined opaque metadata.""" - @classmethod - def deserialize(cls, serialized: bytes) -> Self: ... - -class SchemaResult(_Weakrefable): - """The serialized schema returned from a GetSchema request.""" - def __init__(self, schema: Schema) -> None: - """Create a SchemaResult from a schema. - - Parameters - ---------- - schema: Schema - the schema of the data in this flight. - """ - @property - def schema(self) -> Schema: - """The schema of the data in this flight.""" - def serialize(self) -> bytes: ... - @classmethod - def deserialize(cls, serialized: bytes) -> Self: ... - -class FlightInfo(_Weakrefable): - """A description of a Flight stream.""" - def __init__( - self, - schema: Schema, - descriptor: FlightDescriptor, - endpoints: list[FlightEndpoint], - total_records: int = ..., - total_bytes: int = ..., - ordered: bool = ..., - app_metadata: bytes | str = ..., - ) -> None: - """Create a FlightInfo object from a schema, descriptor, and endpoints. - - Parameters - ---------- - schema : Schema - the schema of the data in this flight. - descriptor : FlightDescriptor - the descriptor for this flight. - endpoints : list of FlightEndpoint - a list of endpoints where this flight is available. - total_records : int, default None - the total records in this flight, -1 or None if unknown. - total_bytes : int, default None - the total bytes in this flight, -1 or None if unknown. 
- ordered : boolean, default False - Whether endpoints are in the same order as the data. - app_metadata : bytes or str, default "" - Application-defined opaque metadata. - """ - @property - def schema(self) -> Schema: - """The schema of the data in this flight.""" - @property - def descriptor(self) -> FlightDescriptor: - """The descriptor of the data in this flight.""" - @property - def endpoints(self) -> list[FlightEndpoint]: - """The endpoints where this flight is available.""" - @property - def total_records(self) -> int: - """The total record count of this flight, or -1 if unknown.""" - @property - def total_bytes(self) -> int: - """The size in bytes of the data in this flight, or -1 if unknown.""" - @property - def ordered(self) -> bool: - """Whether endpoints are in the same order as the data.""" - @property - def app_metadata(self) -> bytes | str: - """ - Application-defined opaque metadata. - - There is no inherent or required relationship between this and the - app_metadata fields in the FlightEndpoints or resulting FlightData - messages. Since this metadata is application-defined, a given - application could define there to be a relationship, but there is - none required by the spec. - - """ - def serialize(self) -> bytes: ... - @classmethod - def deserialize(cls, serialized: bytes) -> Self: ... - -class FlightStreamChunk(_Weakrefable): - """A RecordBatch with application metadata on the side.""" - @property - def data(self) -> RecordBatch | None: ... - @property - def app_metadata(self) -> Buffer | None: ... - def __iter__(self): ... - -class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): - """A reader for Flight streams.""" - - # Needs to be separate class so the "real" class can subclass the - # pure-Python mixin class - - def __iter__(self) -> Self: ... - def __next__(self) -> FlightStreamChunk: ... - @property - def schema(self) -> Schema: - """Get the schema for this reader.""" - def read_all(self) -> Table: - """Read the entire contents of the stream as a Table.""" - def read_chunk(self) -> FlightStreamChunk: - """Read the next FlightStreamChunk along with any metadata. - - Returns - ------- - chunk : FlightStreamChunk - The next FlightStreamChunk in the stream. - - Raises - ------ - StopIteration - when the stream is finished - """ - def to_reader(self) -> RecordBatchReader: - """Convert this reader into a regular RecordBatchReader. - - This may fail if the schema cannot be read from the remote end. - - Returns - ------- - RecordBatchReader - """ - -class MetadataRecordBatchReader(_MetadataRecordBatchReader): - """The base class for readers for Flight streams. - - See Also - -------- - FlightStreamReader - """ - -class FlightStreamReader(MetadataRecordBatchReader): - """A reader that can also be canceled.""" - def cancel(self) -> None: - """Cancel the read operation.""" - def read_all(self) -> Table: - """Read the entire contents of the stream as a Table.""" - -class MetadataRecordBatchWriter(_CRecordBatchWriter): - """A RecordBatchWriter that also allows writing application metadata. - - This class is a context manager; on exit, close() will be called. - """ - - def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: - """Prepare to write data to this stream with the given schema.""" - def write_metadata(self, buf: Buffer) -> None: - """Write Flight metadata by itself.""" - def write_batch(self, batch: RecordBatch) -> None: # type: ignore[override] - """ - Write RecordBatch to stream. 
- - Parameters - ---------- - batch : RecordBatch - """ - def write_table(self, table: Table, max_chunksize: int | None = None, **kwargs) -> None: - """ - Write Table to stream in (contiguous) RecordBatch objects. - - Parameters - ---------- - table : Table - max_chunksize : int, default None - Maximum number of rows for RecordBatch chunks. Individual chunks may - be smaller depending on the chunk layout of individual columns. - """ - def close(self) -> None: - """ - Close stream and write end-of-stream 0 marker. - """ - def write_with_metadata(self, batch: RecordBatch, buf: Buffer) -> None: - """Write a RecordBatch along with Flight metadata. - - Parameters - ---------- - batch : RecordBatch - The next RecordBatch in the stream. - buf : Buffer - Application-specific metadata for the batch as defined by - Flight. - """ - -class FlightStreamWriter(MetadataRecordBatchWriter): - """A writer that also allows closing the write side of a stream.""" - def done_writing(self) -> None: - """Indicate that the client is done writing, but not done reading.""" - -class FlightMetadataReader(_Weakrefable): - """A reader for Flight metadata messages sent during a DoPut.""" - def read(self) -> Buffer | None: - """Read the next metadata message.""" - -class FlightMetadataWriter(_Weakrefable): - """A sender for Flight metadata messages during a DoPut.""" - def write(self, message: Buffer) -> None: - """Write the next metadata message. - - Parameters - ---------- - message : Buffer - """ - -class AsyncioCall(Generic[_T]): - """State for an async RPC using asyncio.""" - - _future: asyncio.Future[_T] - - def as_awaitable(self) -> asyncio.Future[_T]: ... - def wakeup(self, result_or_exception: BaseException | _T) -> None: ... - -class AsyncioFlightClient: - """ - A FlightClient with an asyncio-based async interface. - - This interface is EXPERIMENTAL. - """ - - def __init__(self, client: FlightClient) -> None: ... - async def get_flight_info( - self, - descriptor: FlightDescriptor, - *, - options: FlightCallOptions | None = None, - ): ... - -class FlightClient(_Weakrefable): - """A client to a Flight service. - - Connect to a Flight service on the given host and port. - - Parameters - ---------- - location : str, tuple or Location - Location to connect to. Either a gRPC URI like `grpc://localhost:port`, - a tuple of (host, port) pair, or a Location instance. - tls_root_certs : bytes or None - PEM-encoded - cert_chain: bytes or None - Client certificate if using mutual TLS - private_key: bytes or None - Client private key for cert_chain is using mutual TLS - override_hostname : str or None - Override the hostname checked by TLS. Insecure, use with caution. - middleware : list optional, default None - A list of ClientMiddlewareFactory instances. - write_size_limit_bytes : int optional, default None - A soft limit on the size of a data payload sent to the - server. Enabled if positive. If enabled, writing a record - batch that (when serialized) exceeds this limit will raise an - exception; the client can retry the write with a smaller - batch. - disable_server_verification : boolean optional, default False - A flag that indicates that, if the client is connecting - with TLS, that it skips server verification. If this is - enabled, all other TLS settings are overridden. - generic_options : list optional, default None - A list of generic (string, int or string) option tuples passed - to the underlying transport. Effect is implementation - dependent. 
- """ - def __init__( - self, - location: str | tuple[str, int] | Location, - *, - tls_root_certs: str | None = None, - cert_chain: str | None = None, - private_key: str | None = None, - override_hostname: str | None = None, - middleware: list[ClientMiddlewareFactory] | None = None, - write_size_limit_bytes: int | None = None, - disable_server_verification: bool = False, - generic_options: list[tuple[str, int | str]] | None = None, - ): ... - @property - def supports_async(self) -> bool: ... - def as_async(self) -> AsyncioFlightClient: ... - def wait_for_available(self, timeout: int = 5) -> None: - """Block until the server can be contacted. - - Parameters - ---------- - timeout : int, default 5 - The maximum seconds to wait. - """ - @deprecated( - "Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead." - ) - @classmethod - def connect( - cls, - location: str | tuple[str, int] | Location, - tls_root_certs: str | None = None, - cert_chain: str | None = None, - private_key: str | None = None, - override_hostname: str | None = None, - disable_server_verification: bool = False, - ) -> FlightClient: - """Connect to a Flight server. - - .. deprecated:: 0.15.0 - Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead. - """ - def authenticate( - self, auth_handler: ClientAuthHandler, options: FlightCallOptions | None = None - ) -> None: - """Authenticate to the server. - - Parameters - ---------- - auth_handler : ClientAuthHandler - The authentication mechanism to use. - options : FlightCallOptions - Options for this call. - """ - def authenticate_basic_token( - self, username: str, password: str, options: FlightCallOptions | None = None - ) -> tuple[str, str]: - """Authenticate to the server with HTTP basic authentication. - - Parameters - ---------- - username : string - Username to authenticate with - password : string - Password to authenticate with - options : FlightCallOptions - Options for this call - - Returns - ------- - tuple : Tuple[str, str] - A tuple representing the FlightCallOptions authorization - header entry of a bearer token. - """ - def list_actions(self, options: FlightCallOptions | None = None) -> list[Action]: - """List the actions available on a service.""" - def do_action( - self, action: Action, options: FlightCallOptions | None = None - ) -> Iterator[Result]: - """ - Execute an action on a service. - - Parameters - ---------- - action : str, tuple, or Action - Can be action type name (no body), type and body, or any Action - object - options : FlightCallOptions - RPC options - - Returns - ------- - results : iterator of Result values - """ - def list_flights( - self, criteria: str | None = None, options: FlightCallOptions | None = None - ) -> Generator[FlightInfo, None, None]: - """List the flights available on a service.""" - def get_flight_info( - self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None - ) -> FlightInfo: - """Request information about an available flight.""" - def get_schema( - self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None - ) -> Schema: - """Request schema for an available flight.""" - def do_get( - self, ticket: Ticket, options: FlightCallOptions | None = None - ) -> FlightStreamReader: - """Request the data for a flight. 
- - Returns - ------- - reader : FlightStreamReader - """ - def do_put( - self, - descriptor: FlightDescriptor, - schema: Schema, - options: FlightCallOptions | None = None, - ) -> tuple[FlightStreamWriter, FlightStreamReader]: - """Upload data to a flight. - - Returns - ------- - writer : FlightStreamWriter - reader : FlightMetadataReader - """ - def do_exchange( - self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None - ) -> tuple[FlightStreamWriter, FlightStreamReader]: - """Start a bidirectional data exchange with a server. - - Parameters - ---------- - descriptor : FlightDescriptor - A descriptor for the flight. - options : FlightCallOptions - RPC options. - - Returns - ------- - writer : FlightStreamWriter - reader : FlightStreamReader - """ - def close(self) -> None: - """Close the client and disconnect.""" - def __enter__(self) -> Self: ... - def __exit__(self, exc_type, exc_value, traceback) -> None: ... - -class FlightDataStream(_Weakrefable): - """ - Abstract base class for Flight data streams. - - See Also - -------- - RecordBatchStream - GeneratorStream - """ - -class RecordBatchStream(FlightDataStream): - """A Flight data stream backed by RecordBatches. - - The remainder of this DoGet request will be handled in C++, - without having to acquire the GIL. - - """ - def __init__( - self, data_source: RecordBatchReader | Table, options: IpcWriteOptions | None = None - ) -> None: - """Create a RecordBatchStream from a data source. - - Parameters - ---------- - data_source : RecordBatchReader or Table - The data to stream to the client. - options : pyarrow.ipc.IpcWriteOptions, optional - Optional IPC options to control how to write the data. - """ - -class GeneratorStream(FlightDataStream): - """A Flight data stream backed by a Python generator.""" - def __init__( - self, - schema: Schema, - generator: Iterable[FlightDataStream | Table | RecordBatch | RecordBatchReader], - options: IpcWriteOptions | None = None, - ) -> None: - """Create a GeneratorStream from a Python generator. - - Parameters - ---------- - schema : Schema - The schema for the data to be returned. - - generator : iterator or iterable - The generator should yield other FlightDataStream objects, - Tables, RecordBatches, or RecordBatchReaders. - - options : pyarrow.ipc.IpcWriteOptions, optional - """ - -class ServerCallContext(_Weakrefable): - """Per-call state/context.""" - def peer_identity(self) -> bytes: - """Get the identity of the authenticated peer. - - May be the empty string. - """ - def peer(self) -> str: - """Get the address of the peer.""" - # Set safe=True as gRPC on Windows sometimes gives garbage bytes - def is_cancelled(self) -> bool: - """Check if the current RPC call has been canceled by the client.""" - def add_header(self, key: str, value: str) -> None: - """Add a response header.""" - def add_trailer(self, key: str, value: str) -> None: - """Add a response trailer.""" - def get_middleware(self, key: str) -> ServerMiddleware | None: - """ - Get a middleware instance by key. - - Returns None if the middleware was not found. - """ - -class ServerAuthReader(_Weakrefable): - """A reader for messages from the client during an auth handshake.""" - def read(self) -> str: ... - -class ServerAuthSender(_Weakrefable): - """A writer for messages to the client during an auth handshake.""" - def write(self, message: str) -> None: ... - -class ClientAuthReader(_Weakrefable): - """A reader for messages from the server during an auth handshake.""" - def read(self) -> str: ... 
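The client methods documented above (get_flight_info, do_get, do_put) are typically combined as in the following sketch. The gRPC address, descriptors, and table contents are placeholder assumptions, not part of this patch.

import pyarrow as pa
import pyarrow.flight as flight

# Assumes a Flight server is already listening at this placeholder address.
client = flight.connect("grpc://localhost:8815")

# Discover a flight and download every endpoint.
info = client.get_flight_info(flight.FlightDescriptor.for_command(b"example"))
for endpoint in info.endpoints:
    reader = client.do_get(endpoint.ticket)
    table = reader.read_all()

# Upload a table; per the do_put docstring above, the second element of the
# returned pair is the metadata reader.
upload = pa.table({"x": [1, 2, 3]})
writer, metadata_reader = client.do_put(
    flight.FlightDescriptor.for_path("uploaded.arrow"), upload.schema
)
writer.write_table(upload)
writer.close()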
- -class ClientAuthSender(_Weakrefable): - """A writer for messages to the server during an auth handshake.""" - def write(self, message: str) -> None: ... - -class ServerAuthHandler(_Weakrefable): - """Authentication middleware for a server. - - To implement an authentication mechanism, subclass this class and - override its methods. - - """ - def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): - """Conduct the handshake with the client. - - May raise an error if the client cannot authenticate. - - Parameters - ---------- - outgoing : ServerAuthSender - A channel to send messages to the client. - incoming : ServerAuthReader - A channel to read messages from the client. - """ - def is_valid(self, token: str) -> bool: - """Validate a client token, returning their identity. - - May return an empty string (if the auth mechanism does not - name the peer) or raise an exception (if the token is - invalid). - - Parameters - ---------- - token : bytes - The authentication token from the client. - - """ - -class ClientAuthHandler(_Weakrefable): - """Authentication plugin for a client.""" - def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): - """Conduct the handshake with the server. - - Parameters - ---------- - outgoing : ClientAuthSender - A channel to send messages to the server. - incoming : ClientAuthReader - A channel to read messages from the server. - """ - def get_token(self) -> str: - """Get the auth token for a call.""" - -class CallInfo(NamedTuple): - """Information about a particular RPC for Flight middleware.""" - - method: FlightMethod - -class ClientMiddlewareFactory(_Weakrefable): - """A factory for new middleware instances. - - All middleware methods will be called from the same thread as the - RPC method implementation. That is, thread-locals set in the - client are accessible from the middleware itself. - - """ - def start_call(self, info: CallInfo) -> ClientMiddleware | None: - """Called at the start of an RPC. - - This must be thread-safe and must not raise exceptions. - - Parameters - ---------- - info : CallInfo - Information about the call. - - Returns - ------- - instance : ClientMiddleware - An instance of ClientMiddleware (the instance to use for - the call), or None if this call is not intercepted. - - """ - -class ClientMiddleware(_Weakrefable): - """Client-side middleware for a call, instantiated per RPC. - - Methods here should be fast and must be infallible: they should - not raise exceptions or stall indefinitely. - - """ - - def sending_headers(self) -> dict[str, list[str] | list[bytes]]: - """A callback before headers are sent. - - Returns - ------- - headers : dict - A dictionary of header values to add to the request, or - None if no headers are to be added. The dictionary should - have string keys and string or list-of-string values. - - Bytes values are allowed, but the underlying transport may - not support them or may restrict them. For gRPC, binary - values are only allowed on headers ending in "-bin". - - Header names must be lowercase ASCII. - - """ - - def received_headers(self, headers: dict[str, list[str] | list[bytes]]): - """A callback when headers are received. - - The default implementation does nothing. - - Parameters - ---------- - headers : dict - A dictionary of headers from the server. Keys are strings - and values are lists of strings (for text headers) or - bytes (for binary headers). 
- - """ - - def call_completed(self, exception: ArrowException): - """A callback when the call finishes. - - The default implementation does nothing. - - Parameters - ---------- - exception : ArrowException - If the call errored, this is the equivalent - exception. Will be None if the call succeeded. - - """ - -class ServerMiddlewareFactory(_Weakrefable): - """A factory for new middleware instances. - - All middleware methods will be called from the same thread as the - RPC method implementation. That is, thread-locals set in the - middleware are accessible from the method itself. - - """ - - def start_call( - self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] - ) -> ServerMiddleware | None: - """Called at the start of an RPC. - - This must be thread-safe. - - Parameters - ---------- - info : CallInfo - Information about the call. - headers : dict - A dictionary of headers from the client. Keys are strings - and values are lists of strings (for text headers) or - bytes (for binary headers). - - Returns - ------- - instance : ServerMiddleware - An instance of ServerMiddleware (the instance to use for - the call), or None if this call is not intercepted. - - Raises - ------ - exception : pyarrow.ArrowException - If an exception is raised, the call will be rejected with - the given error. - - """ - -class TracingServerMiddlewareFactory(ServerMiddlewareFactory): - """A factory for tracing middleware instances. - - This enables OpenTelemetry support in Arrow (if Arrow was compiled - with OpenTelemetry support enabled). A new span will be started on - each RPC call. The TracingServerMiddleware instance can then be - retrieved within an RPC handler to get the propagated context, - which can be used to start a new span on the Python side. - - Because the Python/C++ OpenTelemetry libraries do not - interoperate, spans on the C++ side are not directly visible to - the Python side and vice versa. - - """ - -class ServerMiddleware(_Weakrefable): - """Server-side middleware for a call, instantiated per RPC. - - Methods here should be fast and must be infallible: they should - not raise exceptions or stall indefinitely. - - """ - - def sending_headers(self) -> dict[str, list[str] | list[bytes]]: - """A callback before headers are sent. - - Returns - ------- - headers : dict - A dictionary of header values to add to the response, or - None if no headers are to be added. The dictionary should - have string keys and string or list-of-string values. - - Bytes values are allowed, but the underlying transport may - not support them or may restrict them. For gRPC, binary - values are only allowed on headers ending in "-bin". - - Header names must be lowercase ASCII. - - """ - def call_completed(self, exception: ArrowException): - """A callback when the call finishes. - - Parameters - ---------- - exception : pyarrow.ArrowException - If the call errored, this is the equivalent - exception. Will be None if the call succeeded. - - """ - -class TracingServerMiddleware(ServerMiddleware): - trace_context: dict - def __init__(self, trace_context: dict) -> None: ... - -class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): - """Wrapper to bundle server middleware into a single C++ one.""" - - def __init__(self, factories: dict[str, ServerMiddlewareFactory]) -> None: ... - def start_call( # type: ignore[override] - self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] - ) -> _ServerMiddlewareFactoryWrapper | None: ... 
- -class _ServerMiddlewareWrapper(ServerMiddleware): - def __init__(self, middleware: dict[str, ServerMiddleware]) -> None: ... - def send_headers(self) -> dict[str, dict[str, list[str] | list[bytes]]]: ... - def call_completed(self, exception: ArrowException) -> None: ... - -class _FlightServerFinalizer(_Weakrefable): - """ - A finalizer that shuts down the server on destruction. - - See ARROW-16597. If the server is still active at interpreter - exit, the process may segfault. - """ - - def finalize(self) -> None: ... - -class FlightServerBase(_Weakrefable): - """A Flight service definition. - - To start the server, create an instance of this class with an - appropriate location. The server will be running as soon as the - instance is created; it is not required to call :meth:`serve`. - - Override methods to define your Flight service. - - Parameters - ---------- - location : str, tuple or Location optional, default None - Location to serve on. Either a gRPC URI like `grpc://localhost:port`, - a tuple of (host, port) pair, or a Location instance. - If None is passed then the server will be started on localhost with a - system provided random port. - auth_handler : ServerAuthHandler optional, default None - An authentication mechanism to use. May be None. - tls_certificates : list optional, default None - A list of (certificate, key) pairs. - verify_client : boolean optional, default False - If True, then enable mutual TLS: require the client to present - a client certificate, and validate the certificate. - root_certificates : bytes optional, default None - If enabling mutual TLS, this specifies the PEM-encoded root - certificate used to validate client certificates. - middleware : dict optional, default None - A dictionary of :class:`ServerMiddlewareFactory` instances. The - string keys can be used to retrieve the middleware instance within - RPC handlers (see :meth:`ServerCallContext.get_middleware`). - - """ - def __init__( - self, - location: str | tuple[str, int] | Location | None = None, - auth_handler: ServerAuthHandler | None = None, - tls_certificates: list[tuple[str, str]] | None = None, - verify_client: bool = False, - root_certificates: str | None = None, - middleware: dict[str, ServerMiddlewareFactory] | None = None, - ): ... - @property - def port(self) -> int: - """ - Get the port that this server is listening on. - - Returns a non-positive value if the operation is invalid - (e.g. init() was not called or server is listening on a domain - socket). - """ - def list_flights(self, context: ServerCallContext, criteria: str) -> Iterator[FlightInfo]: - """List flights available on this service. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - criteria : bytes - Filter criteria provided by the client. - - Returns - ------- - iterator of FlightInfo - - """ - def get_flight_info( - self, context: ServerCallContext, descriptor: FlightDescriptor - ) -> FlightInfo: - """Get information about a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - descriptor : FlightDescriptor - The descriptor for the flight provided by the client. 
- - Returns - ------- - FlightInfo - - """ - def get_schema(self, context: ServerCallContext, descriptor: FlightDescriptor) -> Schema: - """Get the schema of a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - descriptor : FlightDescriptor - The descriptor for the flight provided by the client. - - Returns - ------- - Schema - - """ - def do_put( - self, - context: ServerCallContext, - descriptor: FlightDescriptor, - reader: MetadataRecordBatchReader, - writer: FlightMetadataWriter, - ) -> None: - """Write data to a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - descriptor : FlightDescriptor - The descriptor for the flight provided by the client. - reader : MetadataRecordBatchReader - A reader for data uploaded by the client. - writer : FlightMetadataWriter - A writer to send responses to the client. - - """ - def do_get(self, context: ServerCallContext, ticket: Ticket) -> FlightDataStream: - """Write data to a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - ticket : Ticket - The ticket for the flight. - - Returns - ------- - FlightDataStream - A stream of data to send back to the client. - - """ - def do_exchange( - self, - context: ServerCallContext, - descriptor: FlightDescriptor, - reader: MetadataRecordBatchReader, - writer: MetadataRecordBatchWriter, - ) -> None: - """Write data to a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - descriptor : FlightDescriptor - The descriptor for the flight provided by the client. - reader : MetadataRecordBatchReader - A reader for data uploaded by the client. - writer : MetadataRecordBatchWriter - A writer to send responses to the client. - - """ - def list_actions(self, context: ServerCallContext) -> Iterable[Action]: - """List custom actions available on this server. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - - Returns - ------- - iterator of ActionType or tuple - - """ - def do_action(self, context: ServerCallContext, action: Action) -> Iterable[bytes]: - """Execute a custom action. - - This method should return an iterator, or it should be a - generator. Applications should override this method to - implement their own behavior. The default method raises a - NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - action : Action - The action to execute. - - Returns - ------- - iterator of bytes - - """ - def serve(self) -> None: - """Block until the server shuts down. - - This method only returns if shutdown() is called or a signal is - received. - """ - def run(self) -> None: - """Block until the server shuts down. - - .. 
deprecated:: 0.15.0 - Use the ``FlightServer.serve`` method instead - """ - def shutdown(self) -> None: - """Shut down the server, blocking until current requests finish. - - Do not call this directly from the implementation of a Flight - method, as then the server will block forever waiting for that - request to finish. Instead, call this method from a background - thread. - - This method should only be called once. - """ - def wait(self) -> None: - """Block until server is terminated with shutdown.""" - def __enter__(self) -> Self: ... - def __exit__(self, exc_type, exc_value, traceback): ... - -def connect( - location: str | tuple[str, int] | Location, - *, - tls_root_certs: str | None = None, - cert_chain: str | None = None, - private_key: str | None = None, - override_hostname: str | None = None, - middleware: list[ClientMiddlewareFactory] | None = None, - write_size_limit_bytes: int | None = None, - disable_server_verification: bool = False, - generic_options: list[tuple[str, int | str]] | None = None, -) -> FlightClient: - """ - Connect to a Flight server. - - Parameters - ---------- - location : str, tuple, or Location - Location to connect to. Either a URI like "grpc://localhost:port", - a tuple of (host, port), or a Location instance. - tls_root_certs : bytes or None - PEM-encoded. - cert_chain: str or None - If provided, enables TLS mutual authentication. - private_key: str or None - If provided, enables TLS mutual authentication. - override_hostname : str or None - Override the hostname checked by TLS. Insecure, use with caution. - middleware : list or None - A list of ClientMiddlewareFactory instances to apply. - write_size_limit_bytes : int or None - A soft limit on the size of a data payload sent to the - server. Enabled if positive. If enabled, writing a record - batch that (when serialized) exceeds this limit will raise an - exception; the client can retry the write with a smaller - batch. - disable_server_verification : boolean or None - Disable verifying the server when using TLS. - Insecure, use with caution. - generic_options : list or None - A list of generic (string, int or string) options to pass to - the underlying transport. - - Returns - ------- - client : FlightClient - """ diff --git a/python/pyarrow/_fs.pyi b/python/pyarrow/_fs.pyi deleted file mode 100644 index 45d4d922ac2..00000000000 --- a/python/pyarrow/_fs.pyi +++ /dev/null @@ -1,1022 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
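Before the filesystem stubs that follow, here is a minimal server-side sketch of the FlightServerBase API documented above: a subclass that answers do_get with a RecordBatchStream and blocks in serve(). The bind address and the served table are placeholders.

import pyarrow as pa
import pyarrow.flight as flight

class EchoServer(flight.FlightServerBase):
    def __init__(self, location="grpc://0.0.0.0:8815"):
        super().__init__(location)
        self._table = pa.table({"x": [1, 2, 3]})

    def do_get(self, context, ticket):
        # Stream the whole table back, regardless of the ticket contents.
        return flight.RecordBatchStream(self._table)

if __name__ == "__main__":
    server = EchoServer()
    server.serve()  # blocks until shutdown() is called from another thread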
- -import datetime as dt -import enum -import sys - -from abc import ABC, abstractmethod - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias - -from typing import Union, overload - -from fsspec import AbstractFileSystem # type: ignore[import-untyped] - -from .lib import NativeFile, _Weakrefable - -SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] - -class FileType(enum.IntFlag): - NotFound = enum.auto() - Unknown = enum.auto() - File = enum.auto() - Directory = enum.auto() - -class FileInfo(_Weakrefable): - """ - FileSystem entry info. - - Parameters - ---------- - path : str - The full path to the filesystem entry. - type : FileType - The type of the filesystem entry. - mtime : datetime or float, default None - If given, the modification time of the filesystem entry. - If a float is given, it is the number of seconds since the - Unix epoch. - mtime_ns : int, default None - If given, the modification time of the filesystem entry, - in nanoseconds since the Unix epoch. - `mtime` and `mtime_ns` are mutually exclusive. - size : int, default None - If given, the filesystem entry size in bytes. This should only - be given if `type` is `FileType.File`. - - Examples - -------- - Generate a file: - - >>> from pyarrow import fs - >>> local = fs.LocalFileSystem() - >>> path_fs = local_path + "/pyarrow-fs-example.dat" - >>> with local.open_output_stream(path_fs) as stream: - ... stream.write(b"data") - 4 - - Get FileInfo object using ``get_file_info()``: - - >>> file_info = local.get_file_info(path_fs) - >>> file_info - - - Inspect FileInfo attributes: - - >>> file_info.type - - - >>> file_info.is_file - True - - >>> file_info.path - '/.../pyarrow-fs-example.dat' - - >>> file_info.base_name - 'pyarrow-fs-example.dat' - - >>> file_info.size - 4 - - >>> file_info.extension - 'dat' - - >>> file_info.mtime # doctest: +SKIP - datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) - - >>> file_info.mtime_ns # doctest: +SKIP - 1656489370873922073 - """ - - def __init__( - self, - path: str, - type: FileType = FileType.Unknown, - *, - mtime: dt.datetime | float | None = None, - mtime_ns: int | None = None, - size: int | None = None, - ): ... - @property - def type(self) -> FileType: - """ - Type of the file. - - The returned enum values can be the following: - - - FileType.NotFound: target does not exist - - FileType.Unknown: target exists but its type is unknown (could be a - special file such as a Unix socket or character device, or - Windows NUL / CON / ...) - - FileType.File: target is a regular file - - FileType.Directory: target is a regular directory - - Returns - ------- - type : FileType - """ - @property - def is_file(self) -> bool: ... - @property - def path(self) -> str: - """ - The full file path in the filesystem. - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.path - '/.../pyarrow-fs-example.dat' - """ - @property - def base_name(self) -> str: - """ - The file base name. - - Component after the last directory separator. - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.base_name - 'pyarrow-fs-example.dat' - """ - @property - def size(self) -> int: - """ - The size in bytes, if available. - - Only regular files are guaranteed to have a size. 
- - Returns - ------- - size : int or None - """ - @property - def extension(self) -> str: - """ - The file extension. - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.extension - 'dat' - """ - @property - def mtime(self) -> dt.datetime | None: - """ - The time of last modification, if available. - - Returns - ------- - mtime : datetime.datetime or None - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.mtime # doctest: +SKIP - datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) - """ - @property - def mtime_ns(self) -> int | None: - """ - The time of last modification, if available, expressed in nanoseconds - since the Unix epoch. - - Returns - ------- - mtime_ns : int or None - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.mtime_ns # doctest: +SKIP - 1656489370873922073 - """ - -class FileSelector(_Weakrefable): - """ - File and directory selector. - - It contains a set of options that describes how to search for files and - directories. - - Parameters - ---------- - base_dir : str - The directory in which to select files. Relative paths also work, use - '.' for the current directory and '..' for the parent. - allow_not_found : bool, default False - The behavior if `base_dir` doesn't exist in the filesystem. - If false, an error is returned. - If true, an empty selection is returned. - recursive : bool, default False - Whether to recurse into subdirectories. - - Examples - -------- - List the contents of a directory and subdirectories: - - >>> selector_1 = fs.FileSelector(local_path, recursive=True) - >>> local.get_file_info(selector_1) # doctest: +SKIP - [, - , - ] - - List only the contents of the base directory: - - >>> selector_2 = fs.FileSelector(local_path) - >>> local.get_file_info(selector_2) # doctest: +SKIP - [, - ] - - Return empty selection if the directory doesn't exist: - - >>> selector_not_found = fs.FileSelector( - ... local_path + "/missing", recursive=True, allow_not_found=True - ... ) - >>> local.get_file_info(selector_not_found) - [] - """ - - base_dir: str - allow_not_found: bool - recursive: bool - def __init__(self, base_dir: str, allow_not_found: bool = False, recursive: bool = False): ... - -class FileSystem(_Weakrefable): - """ - Abstract file system API. - """ - - @classmethod - def from_uri(cls, uri: str) -> tuple[Self, str]: - """ - Create a new FileSystem from URI or Path. - - Recognized URI schemes are "file", "mock", "s3fs", "gs", "gcs", "hdfs" and "viewfs". - In addition, the argument can be a pathlib.Path object, or a string - describing an absolute local path. - - Parameters - ---------- - uri : string - URI-based path, for example: file:///some/local/path. - - Returns - ------- - tuple of (FileSystem, str path) - With (filesystem, path) tuple where path is the abstract path - inside the FileSystem instance. - - Examples - -------- - Create a new FileSystem subclass from a URI: - - >>> uri = "file:///{}/pyarrow-fs-example.dat".format(local_path) - >>> local_new, path_new = fs.FileSystem.from_uri(uri) - >>> local_new - >> path_new - '/.../pyarrow-fs-example.dat' - - Or from a s3 bucket: - - >>> fs.FileSystem.from_uri("s3://usgs-landsat/collection02/") - (, 'usgs-landsat/collection02') - """ - def equals(self, other: FileSystem) -> bool: - """ - Parameters - ---------- - other : pyarrow.fs.FileSystem - - Returns - ------- - bool - """ - @property - def type_name(self) -> str: - """ - The filesystem's type name. 
- """ - @overload - def get_file_info(self, paths_or_selector: str) -> FileInfo: ... - @overload - def get_file_info(self, paths_or_selector: FileSelector | list[str]) -> list[FileInfo]: ... - def get_file_info(self, paths_or_selector): - """ - Get info for the given files. - - Any symlink is automatically dereferenced, recursively. A non-existing - or unreachable file returns a FileStat object and has a FileType of - value NotFound. An exception indicates a truly exceptional condition - (low-level I/O error, etc.). - - Parameters - ---------- - paths_or_selector : FileSelector, path-like or list of path-likes - Either a selector object, a path-like object or a list of - path-like objects. The selector's base directory will not be - part of the results, even if it exists. If it doesn't exist, - use `allow_not_found`. - - Returns - ------- - FileInfo or list of FileInfo - Single FileInfo object is returned for a single path, otherwise - a list of FileInfo objects is returned. - - Examples - -------- - >>> local - - >>> local.get_file_info("/{}/pyarrow-fs-example.dat".format(local_path)) - - """ - def create_dir(self, path: str, *, recursive: bool = True) -> None: - """ - Create a directory and subdirectories. - - This function succeeds if the directory already exists. - - Parameters - ---------- - path : str - The path of the new directory. - recursive : bool, default True - Create nested directories as well. - """ - def delete_dir(self, path: str) -> None: - """ - Delete a directory and its contents, recursively. - - Parameters - ---------- - path : str - The path of the directory to be deleted. - """ - def delete_dir_contents( - self, path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False - ) -> None: - """ - Delete a directory's contents, recursively. - - Like delete_dir, but doesn't delete the directory itself. - - Parameters - ---------- - path : str - The path of the directory to be deleted. - accept_root_dir : boolean, default False - Allow deleting the root directory's contents - (if path is empty or "/") - missing_dir_ok : boolean, default False - If False then an error is raised if path does - not exist - """ - def move(self, src: str, dest: str) -> None: - """ - Move / rename a file or directory. - - If the destination exists: - - if it is a non-empty directory, an error is returned - - otherwise, if it has the same type as the source, it is replaced - - otherwise, behavior is unspecified (implementation-dependent). - - Parameters - ---------- - src : str - The path of the file or the directory to be moved. - dest : str - The destination path where the file or directory is moved to. - - Examples - -------- - Create a new folder with a file: - - >>> local.create_dir("/tmp/other_dir") - >>> local.copy_file(path, "/tmp/move_example.dat") - - Move the file: - - >>> local.move("/tmp/move_example.dat", "/tmp/other_dir/move_example_2.dat") - - Inspect the file info: - - >>> local.get_file_info("/tmp/other_dir/move_example_2.dat") - - >>> local.get_file_info("/tmp/move_example.dat") - - - Delete the folder: - >>> local.delete_dir("/tmp/other_dir") - """ - def copy_file(self, src: str, dest: str) -> None: - """ - Copy a file. - - If the destination exists and is a directory, an error is returned. - Otherwise, it is replaced. - - Parameters - ---------- - src : str - The path of the file to be copied from. - dest : str - The destination path where the file is copied to. 
- - Examples - -------- - >>> local.copy_file(path, local_path + "/pyarrow-fs-example_copy.dat") - - Inspect the file info: - - >>> local.get_file_info(local_path + "/pyarrow-fs-example_copy.dat") - - >>> local.get_file_info(path) - - """ - def delete_file(self, path: str) -> None: - """ - Delete a file. - - Parameters - ---------- - path : str - The path of the file to be deleted. - """ - def open_input_file(self, path: str) -> NativeFile: - """ - Open an input file for random access reading. - - Parameters - ---------- - path : str - The source to open for reading. - - Returns - ------- - stream : NativeFile - - Examples - -------- - Print the data from the file with `open_input_file()`: - - >>> with local.open_input_file(path) as f: - ... print(f.readall()) - b'data' - """ - def open_input_stream( - self, path: str, compression: str | None = "detect", buffer_size: int | None = None - ) -> NativeFile: - """ - Open an input stream for sequential reading. - - Parameters - ---------- - path : str - The source to open for reading. - compression : str optional, default 'detect' - The compression algorithm to use for on-the-fly decompression. - If "detect" and source is a file path, then compression will be - chosen based on the file extension. - If None, no compression will be applied. Otherwise, a well-known - algorithm name must be supplied (e.g. "gzip"). - buffer_size : int optional, default None - If None or 0, no buffering will happen. Otherwise the size of the - temporary read buffer. - - Returns - ------- - stream : NativeFile - - Examples - -------- - Print the data from the file with `open_input_stream()`: - - >>> with local.open_input_stream(path) as f: - ... print(f.readall()) - b'data' - """ - def open_output_stream( - self, - path: str, - compression: str | None = "detect", - buffer_size: int | None = None, - metadata: dict[str, str] | None = None, - ) -> NativeFile: - """ - Open an output stream for sequential writing. - - If the target already exists, existing data is truncated. - - Parameters - ---------- - path : str - The source to open for writing. - compression : str optional, default 'detect' - The compression algorithm to use for on-the-fly compression. - If "detect" and source is a file path, then compression will be - chosen based on the file extension. - If None, no compression will be applied. Otherwise, a well-known - algorithm name must be supplied (e.g. "gzip"). - buffer_size : int optional, default None - If None or 0, no buffering will happen. Otherwise the size of the - temporary write buffer. - metadata : dict optional, default None - If not None, a mapping of string keys to string values. - Some filesystems support storing metadata along the file - (such as "Content-Type"). - Unsupported metadata keys will be ignored. - - Returns - ------- - stream : NativeFile - - Examples - -------- - >>> local = fs.LocalFileSystem() - >>> with local.open_output_stream(path) as stream: - ... stream.write(b"data") - 4 - """ - def open_append_stream( - self, - path: str, - compression: str | None = "detect", - buffer_size: int | None = None, - metadata: dict[str, str] | None = None, - ): - """ - Open an output stream for appending. - - If the target doesn't exist, a new empty file is created. - - .. note:: - Some filesystem implementations do not support efficient - appending to an existing file, in which case this method will - raise NotImplementedError. - Consider writing to multiple files (using e.g. the dataset layer) - instead. 
- - Parameters - ---------- - path : str - The source to open for writing. - compression : str optional, default 'detect' - The compression algorithm to use for on-the-fly compression. - If "detect" and source is a file path, then compression will be - chosen based on the file extension. - If None, no compression will be applied. Otherwise, a well-known - algorithm name must be supplied (e.g. "gzip"). - buffer_size : int optional, default None - If None or 0, no buffering will happen. Otherwise the size of the - temporary write buffer. - metadata : dict optional, default None - If not None, a mapping of string keys to string values. - Some filesystems support storing metadata along the file - (such as "Content-Type"). - Unsupported metadata keys will be ignored. - - Returns - ------- - stream : NativeFile - - Examples - -------- - Append new data to a FileSystem subclass with nonempty file: - - >>> with local.open_append_stream(path) as f: - ... f.write(b"+newly added") - 12 - - Print out the content to the file: - - >>> with local.open_input_file(path) as f: - ... print(f.readall()) - b'data+newly added' - """ - def normalize_path(self, path: str) -> str: - """ - Normalize filesystem path. - - Parameters - ---------- - path : str - The path to normalize - - Returns - ------- - normalized_path : str - The normalized path - """ - -class LocalFileSystem(FileSystem): - """ - A FileSystem implementation accessing files on the local machine. - - Details such as symlinks are abstracted away (symlinks are always followed, - except when deleting an entry). - - Parameters - ---------- - use_mmap : bool, default False - Whether open_input_stream and open_input_file should return - a mmap'ed file or a regular file. - - Examples - -------- - Create a FileSystem object with LocalFileSystem constructor: - - >>> from pyarrow import fs - >>> local = fs.LocalFileSystem() - >>> local - - - and write data on to the file: - - >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: - ... stream.write(b"data") - 4 - >>> with local.open_input_stream("/tmp/local_fs.dat") as stream: - ... print(stream.readall()) - b'data' - - Create a FileSystem object inferred from a URI of the saved file: - - >>> local_new, path = fs.LocalFileSystem().from_uri("/tmp/local_fs.dat") - >>> local_new - >> path - '/tmp/local_fs.dat' - - Check if FileSystems `local` and `local_new` are equal: - - >>> local.equals(local_new) - True - - Compare two different FileSystems: - - >>> local2 = fs.LocalFileSystem(use_mmap=True) - >>> local.equals(local2) - False - - Copy a file and print out the data: - - >>> local.copy_file("/tmp/local_fs.dat", "/tmp/local_fs-copy.dat") - >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as stream: - ... print(stream.readall()) - b'data' - - Open an output stream for appending, add text and print the new data: - - >>> with local.open_append_stream("/tmp/local_fs-copy.dat") as f: - ... f.write(b"+newly added") - 12 - - >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as f: - ... 
print(f.readall()) - b'data+newly added' - - Create a directory, copy a file into it and then delete the whole directory: - - >>> local.create_dir("/tmp/new_folder") - >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") - >>> local.get_file_info("/tmp/new_folder") - - >>> local.delete_dir("/tmp/new_folder") - >>> local.get_file_info("/tmp/new_folder") - - - Create a directory, copy a file into it and then delete - the content of the directory: - - >>> local.create_dir("/tmp/new_folder") - >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") - >>> local.get_file_info("/tmp/new_folder/local_fs.dat") - - >>> local.delete_dir_contents("/tmp/new_folder") - >>> local.get_file_info("/tmp/new_folder") - - >>> local.get_file_info("/tmp/new_folder/local_fs.dat") - - - Create a directory, copy a file into it and then delete - the file from the directory: - - >>> local.create_dir("/tmp/new_folder") - >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") - >>> local.delete_file("/tmp/new_folder/local_fs.dat") - >>> local.get_file_info("/tmp/new_folder/local_fs.dat") - - >>> local.get_file_info("/tmp/new_folder") - - - Move the file: - - >>> local.move("/tmp/local_fs-copy.dat", "/tmp/new_folder/local_fs-copy.dat") - >>> local.get_file_info("/tmp/new_folder/local_fs-copy.dat") - - >>> local.get_file_info("/tmp/local_fs-copy.dat") - - - To finish delete the file left: - >>> local.delete_file("/tmp/local_fs.dat") - """ - - def __init__(self, *, use_mmap: bool = False) -> None: ... - -class SubTreeFileSystem(FileSystem): - """ - Delegates to another implementation after prepending a fixed base path. - - This is useful to expose a logical view of a subtree of a filesystem, - for example a directory in a LocalFileSystem. - - Note, that this makes no security guarantee. For example, symlinks may - allow to "escape" the subtree and access other parts of the underlying - filesystem. - - Parameters - ---------- - base_path : str - The root of the subtree. - base_fs : FileSystem - FileSystem object the operations delegated to. - - Examples - -------- - Create a LocalFileSystem instance: - - >>> from pyarrow import fs - >>> local = fs.LocalFileSystem() - >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: - ... stream.write(b"data") - 4 - - Create a directory and a SubTreeFileSystem instance: - - >>> local.create_dir("/tmp/sub_tree") - >>> subtree = fs.SubTreeFileSystem("/tmp/sub_tree", local) - - Write data into the existing file: - - >>> with subtree.open_append_stream("sub_tree_fs.dat") as f: - ... f.write(b"+newly added") - 12 - - Print out the attributes: - - >>> subtree.base_fs - - >>> subtree.base_path - '/tmp/sub_tree/' - - Get info for the given directory or given file: - - >>> subtree.get_file_info("") - - >>> subtree.get_file_info("sub_tree_fs.dat") - - - Delete the file and directory: - - >>> subtree.delete_file("sub_tree_fs.dat") - >>> local.delete_dir("/tmp/sub_tree") - >>> local.delete_file("/tmp/local_fs.dat") - - For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. - """ - def __init__(self, base_path: str, base_fs: FileSystem): ... - @property - def base_path(self) -> str: ... - @property - def base_fs(self) -> FileSystem: ... - -class _MockFileSystem(FileSystem): - def __init__(self, current_time: dt.datetime | None = None) -> None: ... - -class PyFileSystem(FileSystem): - """ - A FileSystem with behavior implemented in Python. 
- - Parameters - ---------- - handler : FileSystemHandler - The handler object implementing custom filesystem behavior. - - Examples - -------- - Create an fsspec-based filesystem object for GitHub: - - >>> from fsspec.implementations import github - >>> gfs = github.GithubFileSystem("apache", "arrow") # doctest: +SKIP - - Get a PyArrow FileSystem object: - - >>> from pyarrow.fs import PyFileSystem, FSSpecHandler - >>> pa_fs = PyFileSystem(FSSpecHandler(gfs)) # doctest: +SKIP - - Use :func:`~pyarrow.fs.FileSystem` functionality ``get_file_info()``: - - >>> pa_fs.get_file_info("README.md") # doctest: +SKIP - - """ - def __init__(self, handler: FileSystemHandler) -> None: ... - @property - def handler(self) -> FileSystemHandler: - """ - The filesystem's underlying handler. - - Returns - ------- - handler : FileSystemHandler - """ - -class FileSystemHandler(ABC): - """ - An abstract class exposing methods to implement PyFileSystem's behavior. - """ - @abstractmethod - def get_type_name(self) -> str: - """ - Implement PyFileSystem.type_name. - """ - @abstractmethod - def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: - """ - Implement PyFileSystem.get_file_info(paths). - - Parameters - ---------- - paths : list of str - paths for which we want to retrieve the info. - """ - @abstractmethod - def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: - """ - Implement PyFileSystem.get_file_info(selector). - - Parameters - ---------- - selector : FileSelector - selector for which we want to retrieve the info. - """ - - @abstractmethod - def create_dir(self, path: str, recursive: bool) -> None: - """ - Implement PyFileSystem.create_dir(...). - - Parameters - ---------- - path : str - path of the directory. - recursive : bool - if the parent directories should be created too. - """ - @abstractmethod - def delete_dir(self, path: str) -> None: - """ - Implement PyFileSystem.delete_dir(...). - - Parameters - ---------- - path : str - path of the directory. - """ - @abstractmethod - def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: - """ - Implement PyFileSystem.delete_dir_contents(...). - - Parameters - ---------- - path : str - path of the directory. - missing_dir_ok : bool - if False an error should be raised if path does not exist - """ - @abstractmethod - def delete_root_dir_contents(self) -> None: - """ - Implement PyFileSystem.delete_dir_contents("/", accept_root_dir=True). - """ - @abstractmethod - def delete_file(self, path: str) -> None: - """ - Implement PyFileSystem.delete_file(...). - - Parameters - ---------- - path : str - path of the file. - """ - @abstractmethod - def move(self, src: str, dest: str) -> None: - """ - Implement PyFileSystem.move(...). - - Parameters - ---------- - src : str - path of what should be moved. - dest : str - path of where it should be moved to. - """ - - @abstractmethod - def copy_file(self, src: str, dest: str) -> None: - """ - Implement PyFileSystem.copy_file(...). - - Parameters - ---------- - src : str - path of what should be copied. - dest : str - path of where it should be copied to. - """ - @abstractmethod - def open_input_stream(self, path: str) -> NativeFile: - """ - Implement PyFileSystem.open_input_stream(...). - - Parameters - ---------- - path : str - path of what should be opened. - """ - @abstractmethod - def open_input_file(self, path: str) -> NativeFile: - """ - Implement PyFileSystem.open_input_file(...). 
- - Parameters - ---------- - path : str - path of what should be opened. - """ - @abstractmethod - def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: - """ - Implement PyFileSystem.open_output_stream(...). - - Parameters - ---------- - path : str - path of what should be opened. - metadata : mapping - Mapping of string keys to string values. - Some filesystems support storing metadata along the file - (such as "Content-Type"). - """ - - @abstractmethod - def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: - """ - Implement PyFileSystem.open_append_stream(...). - - Parameters - ---------- - path : str - path of what should be opened. - metadata : mapping - Mapping of string keys to string values. - Some filesystems support storing metadata along the file - (such as "Content-Type"). - """ - @abstractmethod - def normalize_path(self, path: str) -> str: - """ - Implement PyFileSystem.normalize_path(...). - - Parameters - ---------- - path : str - path of what should be normalized. - """ diff --git a/python/pyarrow/_hdfs.pyi b/python/pyarrow/_hdfs.pyi deleted file mode 100644 index ed367379171..00000000000 --- a/python/pyarrow/_hdfs.pyi +++ /dev/null @@ -1,92 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from _typeshed import StrPath - -from ._fs import FileSystem - -class HadoopFileSystem(FileSystem): - """ - HDFS backed FileSystem implementation - - Parameters - ---------- - host : str - HDFS host to connect to. Set to "default" for fs.defaultFS from - core-site.xml. - port : int, default 8020 - HDFS port to connect to. Set to 0 for default or logical (HA) nodes. - user : str, default None - Username when connecting to HDFS; None implies login user. - replication : int, default 3 - Number of copies each block will have. - buffer_size : int, default 0 - If 0, no buffering will happen otherwise the size of the temporary read - and write buffer. - default_block_size : int, default None - None means the default configuration for HDFS, a typical block size is - 128 MB. - kerb_ticket : string or path, default None - If not None, the path to the Kerberos ticket cache. - extra_conf : dict, default None - Extra key/value pairs for configuration; will override any - hdfs-site.xml properties. - - Examples - -------- - >>> from pyarrow import fs - >>> hdfs = fs.HadoopFileSystem( - ... host, port, user=user, kerb_ticket=ticket_cache_path - ... ) # doctest: +SKIP - - For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. 
- """ - def __init__( - self, - host: str, - port: int = 8020, - *, - user: str | None = None, - replication: int = 3, - buffer_size: int = 0, - default_block_size: int | None = None, - kerb_ticket: StrPath | None = None, - extra_conf: dict | None = None, - ): ... - @staticmethod - def from_uri(uri: str) -> HadoopFileSystem: # type: ignore[override] - """ - Instantiate HadoopFileSystem object from an URI string. - - The following two calls are equivalent - - * ``HadoopFileSystem.from_uri('hdfs://localhost:8020/?user=test\ -&replication=1')`` - * ``HadoopFileSystem('localhost', port=8020, user='test', \ -replication=1)`` - - Parameters - ---------- - uri : str - A string URI describing the connection to HDFS. - In order to change the user, replication, buffer_size or - default_block_size pass the values as query parts. - - Returns - ------- - HadoopFileSystem - """ diff --git a/python/pyarrow/_json.pyi b/python/pyarrow/_json.pyi deleted file mode 100644 index f416b4b29c6..00000000000 --- a/python/pyarrow/_json.pyi +++ /dev/null @@ -1,186 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import IO, Any, Literal - -from _typeshed import StrPath - -from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable - -class ReadOptions(_Weakrefable): - """ - Options for reading JSON files. - - Parameters - ---------- - use_threads : bool, optional (default True) - Whether to use multiple threads to accelerate reading - block_size : int, optional - How much bytes to process at a time from the input stream. - This will determine multi-threading granularity as well as - the size of individual chunks in the Table. - """ - - use_threads: bool - """ - Whether to use multiple threads to accelerate reading. - """ - block_size: int - """ - How much bytes to process at a time from the input stream. - - This will determine multi-threading granularity as well as the size of - individual chunks in the Table. - """ - def __init__(self, use_threads: bool | None = None, block_size: int | None = None): ... - def equals(self, other: ReadOptions) -> bool: - """ - Parameters - ---------- - other : pyarrow.json.ReadOptions - - Returns - ------- - bool - """ - -class ParseOptions(_Weakrefable): - """ - Options for parsing JSON files. - - Parameters - ---------- - explicit_schema : Schema, optional (default None) - Optional explicit schema (no type inference, ignores other fields). - newlines_in_values : bool, optional (default False) - Whether objects may be printed across multiple lines (for example - pretty printed). If false, input must end with an empty line. - unexpected_field_behavior : str, default "infer" - How JSON fields outside of explicit_schema (if given) are treated. 
- - Possible behaviors: - - - "ignore": unexpected JSON fields are ignored - - "error": error out on unexpected JSON fields - - "infer": unexpected JSON fields are type-inferred and included in - the output - """ - - explicit_schema: Schema - """ - Optional explicit schema (no type inference, ignores other fields) - """ - newlines_in_values: bool - """ - Whether newline characters are allowed in JSON values. - Setting this to True reduces the performance of multi-threaded - JSON reading. - """ - unexpected_field_behavior: Literal["ignore", "error", "infer"] - """ - How JSON fields outside of explicit_schema (if given) are treated. - - Possible behaviors: - - - "ignore": unexpected JSON fields are ignored - - "error": error out on unexpected JSON fields - - "infer": unexpected JSON fields are type-inferred and included in - the output - - Set to "infer" by default. - """ - def __init__( - self, - explicit_schema: Schema | None = None, - newlines_in_values: bool | None = None, - unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer", - ): ... - def equals(self, other: ParseOptions) -> bool: - """ - Parameters - ---------- - other : pyarrow.json.ParseOptions - - Returns - ------- - bool - """ - -class JSONStreamingReader(RecordBatchReader): - """An object that reads record batches incrementally from a JSON file. - - Should not be instantiated directly by user code. - """ - -def read_json( - input_file: StrPath | IO[Any], - read_options: ReadOptions | None = None, - parse_options: ParseOptions | None = None, - memory_pool: MemoryPool | None = None, -) -> Table: - """ - Read a Table from a stream of JSON data. - - Parameters - ---------- - input_file : str, path or file-like object - The location of JSON data. Currently only the line-delimited JSON - format is supported. - read_options : pyarrow.json.ReadOptions, optional - Options for the JSON reader (see ReadOptions constructor for defaults). - parse_options : pyarrow.json.ParseOptions, optional - Options for the JSON parser - (see ParseOptions constructor for defaults). - memory_pool : MemoryPool, optional - Pool to allocate Table memory from. - - Returns - ------- - :class:`pyarrow.Table` - Contents of the JSON file as a in-memory table. - """ - -def open_json( - input_file: StrPath | IO[Any], - read_options: ReadOptions | None = None, - parse_options: ParseOptions | None = None, - memory_pool: MemoryPool | None = None, -) -> JSONStreamingReader: - """ - Open a streaming reader of JSON data. - - Reading using this function is always single-threaded. - - Parameters - ---------- - input_file : string, path or file-like object - The location of JSON data. If a string or path, and if it ends - with a recognized compressed file extension (e.g. ".gz" or ".bz2"), - the data is automatically decompressed when reading. - read_options : pyarrow.json.ReadOptions, optional - Options for the JSON reader (see pyarrow.json.ReadOptions constructor - for defaults) - parse_options : pyarrow.json.ParseOptions, optional - Options for the JSON parser - (see pyarrow.json.ParseOptions constructor for defaults) - memory_pool : MemoryPool, optional - Pool to allocate RecordBatch memory from - - Returns - ------- - :class:`pyarrow.json.JSONStreamingReader` - """ diff --git a/python/pyarrow/_orc.pyi b/python/pyarrow/_orc.pyi deleted file mode 100644 index 7587cc121c3..00000000000 --- a/python/pyarrow/_orc.pyi +++ /dev/null @@ -1,73 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import IO, Literal - -from .lib import ( - Buffer, - KeyValueMetadata, - MemoryPool, - NativeFile, - RecordBatch, - Schema, - Table, - _Weakrefable, -) - -class ORCReader(_Weakrefable): - def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def open(self, source: str | NativeFile | Buffer, use_memory_map: bool = True): ... - def metadata(self) -> KeyValueMetadata: ... - def schema(self) -> Schema: ... - def nrows(self) -> int: ... - def nstripes(self) -> int: ... - def file_version(self) -> str: ... - def software_version(self) -> str: ... - def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... - def compression_size(self) -> int: ... - def row_index_stride(self) -> int: ... - def writer(self) -> str: ... - def writer_version(self) -> str: ... - def nstripe_statistics(self) -> int: ... - def content_length(self) -> int: ... - def stripe_statistics_length(self) -> int: ... - def file_footer_length(self) -> int: ... - def file_postscript_length(self) -> int: ... - def file_length(self) -> int: ... - def serialized_file_tail(self) -> int: ... - def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... - def read(self, columns: list[str] | None = None) -> Table: ... - -class ORCWriter(_Weakrefable): - def open( - self, - where: str | NativeFile | IO, - *, - file_version: str | None = None, - batch_size: int | None = None, - stripe_size: int | None = None, - compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] | None = None, - compression_block_size: int | None = None, - compression_strategy: Literal["COMPRESSION", "SPEED"] | None = None, - row_index_stride: int | None = None, - padding_tolerance: float | None = None, - dictionary_key_size_threshold: float | None = None, - bloom_filter_columns: list[int] | None = None, - bloom_filter_fpp: float | None = None, - ) -> None: ... - def write(self, table: Table) -> None: ... - def close(self) -> None: ... diff --git a/python/pyarrow/_parquet.pyi b/python/pyarrow/_parquet.pyi deleted file mode 100644 index c75337cbf3b..00000000000 --- a/python/pyarrow/_parquet.pyi +++ /dev/null @@ -1,462 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict - -from _typeshed import StrPath - -from ._stubs_typing import Order -from .lib import ( - Buffer, - ChunkedArray, - KeyValueMetadata, - MemoryPool, - NativeFile, - RecordBatch, - Schema, - Table, - _Weakrefable, -) - -_PhysicalType: TypeAlias = Literal[ - "BOOLEAN", - "INT32", - "INT64", - "INT96", - "FLOAT", - "DOUBLE", - "BYTE_ARRAY", - "FIXED_LEN_BYTE_ARRAY", - "UNKNOWN", -] -_LogicTypeName: TypeAlias = Literal[ - "UNDEFINED", - "STRING", - "MAP", - "LIST", - "ENUM", - "DECIMAL", - "DATE", - "TIME", - "TIMESTAMP", - "INT", - "FLOAT16", - "JSON", - "BSON", - "UUID", - "NONE", - "UNKNOWN", -] -_ConvertedType: TypeAlias = Literal[ - "NONE", - "UTF8", - "MAP", - "MAP_KEY_VALUE", - "LIST", - "ENUM", - "DECIMAL", - "DATE", - "TIME_MILLIS", - "TIME_MICROS", - "TIMESTAMP_MILLIS", - "TIMESTAMP_MICROS", - "UINT_8", - "UINT_16", - "UINT_32", - "UINT_64", - "INT_8", - "INT_16", - "INT_32", - "INT_64", - "JSON", - "BSON", - "INTERVAL", - "UNKNOWN", -] -_Encoding: TypeAlias = Literal[ - "PLAIN", - "PLAIN_DICTIONARY", - "RLE", - "BIT_PACKED", - "DELTA_BINARY_PACKED", - "DELTA_LENGTH_BYTE_ARRAY", - "DELTA_BYTE_ARRAY", - "RLE_DICTIONARY", - "BYTE_STREAM_SPLIT", - "UNKNOWN", -] -_Compression: TypeAlias = Literal[ - "UNCOMPRESSED", - "SNAPPY", - "GZIP", - "LZO", - "BROTLI", - "LZ4", - "ZSTD", - "UNKNOWN", -] - -class _Statistics(TypedDict): - has_min_max: bool - min: Any | None - max: Any | None - null_count: int | None - distinct_count: int | None - num_values: int - physical_type: _PhysicalType - -class Statistics(_Weakrefable): - def to_dict(self) -> _Statistics: ... - def equals(self, other: Statistics) -> bool: ... - @property - def has_min_max(self) -> bool: ... - @property - def hash_null_count(self) -> bool: ... - @property - def has_distinct_count(self) -> bool: ... - @property - def min_raw(self) -> Any | None: ... - @property - def max_raw(self) -> Any | None: ... - @property - def min(self) -> Any | None: ... - @property - def max(self) -> Any | None: ... - @property - def null_count(self) -> int | None: ... - @property - def distinct_count(self) -> int | None: ... - @property - def num_values(self) -> int: ... - @property - def physical_type(self) -> _PhysicalType: ... - @property - def logical_type(self) -> ParquetLogicalType: ... - @property - def converted_type(self) -> _ConvertedType | None: ... - -class ParquetLogicalType(_Weakrefable): - def to_json(self) -> str: ... - @property - def type(self) -> _LogicTypeName: ... - -class _ColumnChunkMetaData(TypedDict): - file_offset: int - file_path: str | None - physical_type: _PhysicalType - num_values: int - path_in_schema: str - is_stats_set: bool - statistics: Statistics | None - compression: _Compression - encodings: tuple[_Encoding, ...] - has_dictionary_page: bool - dictionary_page_offset: int | None - data_page_offset: int - total_compressed_size: int - total_uncompressed_size: int - -class ColumnChunkMetaData(_Weakrefable): - def to_dict(self) -> _ColumnChunkMetaData: ... - def equals(self, other: ColumnChunkMetaData) -> bool: ... 
- @property - def file_offset(self) -> int: ... - @property - def file_path(self) -> str | None: ... - @property - def physical_type(self) -> _PhysicalType: ... - @property - def num_values(self) -> int: ... - @property - def path_in_schema(self) -> str: ... - @property - def is_stats_set(self) -> bool: ... - @property - def statistics(self) -> Statistics | None: ... - @property - def compression(self) -> _Compression: ... - @property - def encodings(self) -> tuple[_Encoding, ...]: ... - @property - def has_dictionary_page(self) -> bool: ... - @property - def dictionary_page_offset(self) -> int | None: ... - @property - def data_page_offset(self) -> int: ... - @property - def has_index_page(self) -> bool: ... - @property - def index_page_offset(self) -> int: ... - @property - def total_compressed_size(self) -> int: ... - @property - def total_uncompressed_size(self) -> int: ... - @property - def has_offset_index(self) -> bool: ... - @property - def has_column_index(self) -> bool: ... - @property - def metadata(self) -> dict[bytes, bytes] | None: ... - -class _SortingColumn(TypedDict): - column_index: int - descending: bool - nulls_first: bool - -class SortingColumn: - def __init__( - self, column_index: int, descending: bool = False, nulls_first: bool = False - ) -> None: ... - @classmethod - def from_ordering( - cls, - schema: Schema, - sort_keys: Sequence[tuple[str, Order]], - null_placement: Literal["at_start", "at_end"] = "at_end", - ) -> tuple[SortingColumn, ...]: ... - @staticmethod - def to_ordering( - schema: Schema, sorting_columns: tuple[SortingColumn, ...] - ) -> tuple[Sequence[tuple[str, Order]], Literal["at_start", "at_end"]]: ... - def __hash__(self) -> int: ... - @property - def column_index(self) -> int: ... - @property - def descending(self) -> bool: ... - @property - def nulls_first(self) -> bool: ... - def to_dict(self) -> _SortingColumn: ... - -class _RowGroupMetaData(TypedDict): - num_columns: int - num_rows: int - total_byte_size: int - columns: list[ColumnChunkMetaData] - sorting_columns: list[SortingColumn] - -class RowGroupMetaData(_Weakrefable): - def __init__(self, parent: FileMetaData, index: int) -> None: ... - def equals(self, other: RowGroupMetaData) -> bool: ... - def column(self, i: int) -> ColumnChunkMetaData: ... - def to_dict(self) -> _RowGroupMetaData: ... - @property - def num_columns(self) -> int: ... - @property - def num_rows(self) -> int: ... - @property - def total_byte_size(self) -> int: ... - @property - def sorting_columns(self) -> list[SortingColumn]: ... - -class _FileMetaData(TypedDict): - created_by: str - num_columns: int - num_rows: int - num_row_groups: int - format_version: str - serialized_size: int - -class FileMetaData(_Weakrefable): - def __hash__(self) -> int: ... - def to_dict(self) -> _FileMetaData: ... - def equals(self, other: FileMetaData) -> bool: ... - @property - def schema(self) -> ParquetSchema: ... - @property - def serialized_size(self) -> int: ... - @property - def num_columns(self) -> int: ... - @property - def num_rows(self) -> int: ... - @property - def num_row_groups(self) -> int: ... - @property - def format_version(self) -> str: ... - @property - def created_by(self) -> str: ... - @property - def metadata(self) -> dict[bytes, bytes] | None: ... - def row_group(self, i: int) -> RowGroupMetaData: ... - def set_file_path(self, path: str) -> None: ... - def append_row_groups(self, other: FileMetaData) -> None: ... - def write_metadata_file(self, where: StrPath | Buffer | NativeFile | IO) -> None: ... 
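As a quick cross-check of the FileMetaData / RowGroupMetaData / ColumnChunkMetaData / Statistics signatures above, a minimal sketch of the usual call chain; the temporary path and sample column values are illustrative only, not part of the patch:

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"x": [1, 2, 3, None], "y": ["a", "b", "c", "d"]})
pq.write_table(table, "/tmp/stats_example.parquet")      # illustrative path

meta = pq.read_metadata("/tmp/stats_example.parquet")    # -> FileMetaData
rg = meta.row_group(0)                                   # -> RowGroupMetaData
col = rg.column(0)                                       # -> ColumnChunkMetaData
stats = col.statistics                                   # -> Statistics | None
if stats is not None and stats.has_min_max:
    print(stats.min, stats.max, stats.null_count, stats.physical_type)
print(col.compression, col.encodings)                    # -> _Compression, tuple[_Encoding, ...]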
- -class ParquetSchema(_Weakrefable): - def __init__(self, container: FileMetaData) -> None: ... - def __getitem__(self, i: int) -> ColumnChunkMetaData: ... - def __hash__(self) -> int: ... - def __len__(self) -> int: ... - @property - def names(self) -> list[str]: ... - def to_arrow_schema(self) -> Schema: ... - def equals(self, other: ParquetSchema) -> bool: ... - def column(self, i: int) -> ColumnSchema: ... - -class ColumnSchema(_Weakrefable): - def __init__(self, schema: ParquetSchema, index: int) -> None: ... - def equals(self, other: ColumnSchema) -> bool: ... - @property - def name(self) -> str: ... - @property - def path(self) -> str: ... - @property - def max_definition_level(self) -> int: ... - @property - def max_repetition_level(self) -> int: ... - @property - def physical_type(self) -> _PhysicalType: ... - @property - def logical_type(self) -> ParquetLogicalType: ... - @property - def converted_type(self) -> _ConvertedType | None: ... - @property - def length(self) -> int | None: ... - @property - def precision(self) -> int | None: ... - @property - def scale(self) -> int | None: ... - -class ParquetReader(_Weakrefable): - def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def open( - self, - source: StrPath | NativeFile | IO, - *, - use_memory_map: bool = False, - read_dictionary: Iterable[int] | Iterable[str] | None = None, - metadata: FileMetaData | None = None, - buffer_size: int = 0, - pre_buffer: bool = False, - coerce_int96_timestamp_unit: str | None = None, - decryption_properties: FileDecryptionProperties | None = None, - thrift_string_size_limit: int | None = None, - thrift_container_size_limit: int | None = None, - page_checksum_verification: bool = False, - ): ... - @property - def column_paths(self) -> list[str]: ... - @property - def metadata(self) -> FileMetaData: ... - @property - def schema_arrow(self) -> Schema: ... - @property - def num_row_groups(self) -> int: ... - def set_use_threads(self, use_threads: bool) -> None: ... - def set_batch_size(self, batch_size: int) -> None: ... - def iter_batches( - self, - batch_size: int, - row_groups: list[int], - column_indices: list[int] | None = None, - use_threads: bool = True, - ) -> Iterator[RecordBatch]: ... - def read_row_group( - self, i: int, column_indices: list[int] | None = None, use_threads: bool = True - ) -> Table: ... - def read_row_groups( - self, - row_groups: list[int], - column_indices: list[int] | None = None, - use_threads: bool = True, - ) -> Table: ... - def read_all( - self, column_indices: list[int] | None = None, use_threads: bool = True - ) -> Table: ... - def scan_contents(self, column_indices: list[int] | None = None, batch_size: int = 65536): ... - def column_name_idx(self, column_name: str) -> int: ... - def read_column(self, column_index: int) -> ChunkedArray: ... - def close(self) -> None: ... - @property - def closed(self) -> bool: ... 
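The low-level ParquetReader typed above is normally reached through the public pyarrow.parquet.ParquetFile wrapper; a minimal sketch (path and batch size are illustrative, not part of the patch):

import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.table({"x": list(range(10))}), "/tmp/reader_example.parquet")

pf = pq.ParquetFile("/tmp/reader_example.parquet")
print(pf.schema_arrow)                        # ParquetReader.schema_arrow -> pyarrow.Schema
print(pf.metadata.num_row_groups)             # FileMetaData exposed by the reader
for batch in pf.iter_batches(batch_size=4):   # ParquetReader.iter_batches -> RecordBatch iterator
    print(batch.num_rows)
first_group = pf.read_row_group(0)            # ParquetReader.read_row_group -> pyarrow.Table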
- -class ParquetWriter(_Weakrefable): - def __init__( - self, - where: StrPath | NativeFile | IO, - schema: Schema, - use_dictionary: bool | list[str] | None = None, - compression: _Compression | dict[str, _Compression] | None = None, - version: str | None = None, - write_statistics: bool | list[str] | None = None, - memory_pool: MemoryPool | None = None, - use_deprecated_int96_timestamps: bool = False, - coerce_timestamps: Literal["ms", "us"] | None = None, - data_page_size: int | None = None, - allow_truncated_timestamps: bool = False, - compression_level: int | dict[str, int] | None = None, - use_byte_stream_split: bool | list[str] = False, - column_encoding: _Encoding | dict[str, _Encoding] | None = None, - writer_engine_version: str | None = None, - data_page_version: str | None = None, - use_compliant_nested_type: bool = True, - encryption_properties: FileDecryptionProperties | None = None, - write_batch_size: int | None = None, - dictionary_pagesize_limit: int | None = None, - store_schema: bool = True, - write_page_index: bool = False, - write_page_checksum: bool = False, - sorting_columns: tuple[SortingColumn, ...] | None = None, - store_decimal_as_integer: bool = False, - ): ... - def close(self) -> None: ... - def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... - def add_key_value_metadata(self, key_value_metadata: KeyValueMetadata) -> None: ... - @property - def metadata(self) -> FileMetaData: ... - @property - def use_dictionary(self) -> bool | list[str] | None: ... - @property - def use_deprecated_int96_timestamps(self) -> bool: ... - @property - def use_byte_stream_split(self) -> bool | list[str]: ... - @property - def column_encoding(self) -> _Encoding | dict[str, _Encoding] | None: ... - @property - def coerce_timestamps(self) -> Literal["ms", "us"] | None: ... - @property - def allow_truncated_timestamps(self) -> bool: ... - @property - def compression(self) -> _Compression | dict[str, _Compression] | None: ... - @property - def compression_level(self) -> int | dict[str, int] | None: ... - @property - def data_page_version(self) -> str | None: ... - @property - def use_compliant_nested_type(self) -> bool: ... - @property - def version(self) -> str | None: ... - @property - def write_statistics(self) -> bool | list[str] | None: ... - @property - def writer_engine_version(self) -> str: ... - @property - def row_group_size(self) -> int: ... - @property - def data_page_size(self) -> int: ... - @property - def encryption_properties(self) -> FileDecryptionProperties: ... - @property - def write_batch_size(self) -> int: ... - @property - def dictionary_pagesize_limit(self) -> int: ... - @property - def store_schema(self) -> bool: ... - @property - def store_decimal_as_integer(self) -> bool: ... - -class FileEncryptionProperties: ... -class FileDecryptionProperties: ... diff --git a/python/pyarrow/_s3fs.pyi b/python/pyarrow/_s3fs.pyi deleted file mode 100644 index e2f5f147096..00000000000 --- a/python/pyarrow/_s3fs.pyi +++ /dev/null @@ -1,91 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import enum - -from typing import Literal, NotRequired, Required, TypedDict - -from ._fs import FileSystem -from .lib import KeyValueMetadata - -class _ProxyOptions(TypedDict): - schema: Required[Literal["http", "https"]] - host: Required[str] - port: Required[int] - username: NotRequired[str] - password: NotRequired[str] - -class S3LogLevel(enum.IntEnum): - Off = enum.auto() - Fatal = enum.auto() - Error = enum.auto() - Warn = enum.auto() - Info = enum.auto() - Debug = enum.auto() - Trace = enum.auto() - -Off = S3LogLevel.Off -Fatal = S3LogLevel.Fatal -Error = S3LogLevel.Error -Warn = S3LogLevel.Warn -Info = S3LogLevel.Info -Debug = S3LogLevel.Debug -Trace = S3LogLevel.Trace - -def initialize_s3( - log_level: S3LogLevel = S3LogLevel.Fatal, num_event_loop_threads: int = 1 -) -> None: ... -def ensure_s3_initialized() -> None: ... -def finalize_s3() -> None: ... -def ensure_s3_finalized() -> None: ... -def resolve_s3_region(bucket: str) -> str: ... - -class S3RetryStrategy: - max_attempts: int - def __init__(self, max_attempts=3) -> None: ... - -class AwsStandardS3RetryStrategy(S3RetryStrategy): ... -class AwsDefaultS3RetryStrategy(S3RetryStrategy): ... - -class S3FileSystem(FileSystem): - def __init__( - self, - *, - access_key: str | None = None, - secret_key: str | None = None, - session_token: str | None = None, - anonymous: bool = False, - region: str | None = None, - request_timeout: float | None = None, - connect_timeout: float | None = None, - scheme: Literal["http", "https"] = "https", - endpoint_override: str | None = None, - background_writes: bool = True, - default_metadata: dict | KeyValueMetadata | None = None, - role_arn: str | None = None, - session_name: str | None = None, - external_id: str | None = None, - load_frequency: int = 900, - proxy_options: _ProxyOptions | str | None = None, - allow_bucket_creation: bool = False, - allow_bucket_deletion: bool = False, - check_directory_existence_before_creation: bool = False, - retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3), - force_virtual_addressing: bool = False, - ): ... - @property - def region(self) -> str: ... diff --git a/python/pyarrow/_substrait.pyi b/python/pyarrow/_substrait.pyi deleted file mode 100644 index ee78e9720fe..00000000000 --- a/python/pyarrow/_substrait.pyi +++ /dev/null @@ -1,56 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
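Stepping back to the S3FileSystem signature above, a minimal sketch of how its keyword-only options are typically passed; the bucket, prefix and region names are placeholders, not part of the patch:

from pyarrow import fs

s3 = fs.S3FileSystem(region="us-east-1", anonymous=True,
                     request_timeout=10, connect_timeout=5)
selector = fs.FileSelector("some-bucket/some-prefix", recursive=True)
# Listing needs network access to a real bucket:
# infos = s3.get_file_info(selector)
# resolve_s3_region (also typed in this file) looks up a bucket's region over the network:
# region = fs.resolve_s3_region("some-bucket")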
- -from typing import Any, Callable - -from ._compute import Expression -from .lib import Buffer, RecordBatchReader, Schema, Table, _Weakrefable - -def run_query( - plan: Buffer | int, - *, - table_provider: Callable[[list[str], Schema], Table] | None = None, - use_threads: bool = True, -) -> RecordBatchReader: ... -def _parse_json_plan(plan: bytes) -> Buffer: ... - -class SubstraitSchema: - schema: Schema - expression: Expression - def __init__(self, schema: Schema, expression: Expression) -> None: ... - def to_pysubstrait(self) -> Any: ... - -def serialize_schema(schema: Schema) -> SubstraitSchema: ... -def deserialize_schema(buf: Buffer | bytes) -> Schema: ... -def serialize_expressions( - exprs: list[Expression], - names: list[str], - schema: Schema, - *, - allow_arrow_extensions: bool = False, -) -> Buffer: ... - -class BoundExpressions(_Weakrefable): - @property - def schema(self) -> Schema: ... - @property - def expressions(self) -> dict[str, Expression]: ... - @classmethod - def from_substrait(cls, message: Buffer | bytes) -> BoundExpressions: ... - -def deserialize_expressions(buf: Buffer | bytes) -> BoundExpressions: ... -def get_supported_functions() -> list[str]: ... diff --git a/python/pyarrow/acero.pyi b/python/pyarrow/acero.pyi deleted file mode 100644 index 2abb608b32c..00000000000 --- a/python/pyarrow/acero.pyi +++ /dev/null @@ -1,102 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import sys - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias -from typing import Literal - -from . import lib -from .compute import Expression, FunctionOptions - -_StrOrExpr: TypeAlias = str | Expression - -class Declaration(lib._Weakrefable): - def __init__( - self, - factory_name: str, - options: ExecNodeOptions, - inputs: list[Declaration] | None = None, - ) -> None: ... - @classmethod - def from_sequence(cls, decls: list[Declaration]) -> Self: ... - def to_reader(self, use_threads: bool = True) -> lib.RecordBatchReader: ... - def to_table(self, use_threads: bool = True) -> lib.Table: ... - -class ExecNodeOptions(lib._Weakrefable): ... - -class TableSourceNodeOptions(ExecNodeOptions): - def __init__(self, table: lib.Table) -> None: ... - -class FilterNodeOptions(ExecNodeOptions): - def __init__(self, filter_expression: Expression) -> None: ... - -class ProjectNodeOptions(ExecNodeOptions): - def __init__(self, expressions: list[Expression], names: list[str] | None = None) -> None: ... - -class AggregateNodeOptions(ExecNodeOptions): - def __init__( - self, - aggregates: list[tuple[list[str], str, FunctionOptions, str]], - keys: list[_StrOrExpr] | None = None, - ) -> None: ... 
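For the Declaration and node-options classes above, a minimal sketch of a linear Acero plan; the sample table and the hash_sum aggregation are illustrative, not part of the patch:

import pyarrow as pa
import pyarrow.compute as pc
from pyarrow import acero

table = pa.table({"k": ["a", "a", "b"], "v": [1, 2, 3]})
plan = acero.Declaration.from_sequence([
    acero.Declaration("table_source", acero.TableSourceNodeOptions(table)),
    acero.Declaration("filter", acero.FilterNodeOptions(pc.field("v") > 0)),
    acero.Declaration("aggregate",
                      acero.AggregateNodeOptions([("v", "hash_sum", None, "v_sum")],
                                                 keys=["k"])),
])
result = plan.to_table()   # Declaration.to_table(use_threads=True) -> pyarrow.Table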
- -class OrderByNodeOptions(ExecNodeOptions): - def __init__( - self, - sort_keys: tuple[tuple[str, Literal["ascending", "descending"]], ...] = (), - *, - null_placement: Literal["at_start", "at_end"] = "at_end", - ) -> None: ... - -class HashJoinNodeOptions(ExecNodeOptions): - def __init__( - self, - join_type: Literal[ - "left semi", - "right semi", - "left anti", - "right anti", - "inner", - "left outer", - "right outer", - "full outer", - ], - left_keys: _StrOrExpr | list[_StrOrExpr], - right_keys: _StrOrExpr | list[_StrOrExpr], - left_output: list[_StrOrExpr] | None = None, - right_output: list[_StrOrExpr] | None = None, - output_suffix_for_left: str = "", - output_suffix_for_right: str = "", - ) -> None: ... - -class AsofJoinNodeOptions(ExecNodeOptions): - def __init__( - self, - left_on: _StrOrExpr, - left_by: _StrOrExpr | list[_StrOrExpr], - right_on: _StrOrExpr, - right_by: _StrOrExpr | list[_StrOrExpr], - tolerance: int, - ) -> None: ... diff --git a/python/pyarrow/benchmark.pyi b/python/pyarrow/benchmark.pyi deleted file mode 100644 index 3ea8f70bc34..00000000000 --- a/python/pyarrow/benchmark.pyi +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from pyarrow.lib import benchmark_PandasObjectIsNull - -__all__ = ["benchmark_PandasObjectIsNull"] diff --git a/python/pyarrow/cffi.pyi b/python/pyarrow/cffi.pyi deleted file mode 100644 index e4f077d7155..00000000000 --- a/python/pyarrow/cffi.pyi +++ /dev/null @@ -1,21 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import cffi - -c_source: str -ffi: cffi.FFI diff --git a/python/pyarrow/compute.pyi b/python/pyarrow/compute.pyi deleted file mode 100644 index cbbb9b0efcc..00000000000 --- a/python/pyarrow/compute.pyi +++ /dev/null @@ -1,8332 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence, Hashable -from collections.abc import Callable -from numpy.typing import NDArray - -# Option classes -from pyarrow._compute import ArraySortOptions as ArraySortOptions -from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions -from pyarrow._compute import CastOptions as CastOptions -from pyarrow._compute import CountOptions as CountOptions -from pyarrow._compute import CumulativeOptions as CumulativeOptions -from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions -from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions -from pyarrow._compute import DictionaryEncodeOptions as DictionaryEncodeOptions -from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions - -# Expressions -from pyarrow._compute import Expression as Expression -from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions -from pyarrow._compute import ExtractRegexSpanOptions as ExtractRegexSpanOptions -from pyarrow._compute import FilterOptions as FilterOptions -from pyarrow._compute import Function as Function -from pyarrow._compute import FunctionOptions as FunctionOptions -from pyarrow._compute import FunctionRegistry as FunctionRegistry -from pyarrow._compute import HashAggregateFunction as HashAggregateFunction -from pyarrow._compute import HashAggregateKernel as HashAggregateKernel -from pyarrow._compute import IndexOptions as IndexOptions -from pyarrow._compute import JoinOptions as JoinOptions -from pyarrow._compute import Kernel as Kernel -from pyarrow._compute import ListFlattenOptions as ListFlattenOptions -from pyarrow._compute import ListSliceOptions as ListSliceOptions -from pyarrow._compute import MakeStructOptions as MakeStructOptions -from pyarrow._compute import MapLookupOptions as MapLookupOptions -from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions -from pyarrow._compute import ModeOptions as ModeOptions -from pyarrow._compute import NullOptions as NullOptions -from pyarrow._compute import PadOptions as PadOptions -from pyarrow._compute import PairwiseOptions as PairwiseOptions -from pyarrow._compute import PartitionNthOptions as PartitionNthOptions -from pyarrow._compute import PivotWiderOptions as PivotWiderOptions -from pyarrow._compute import QuantileOptions as QuantileOptions -from pyarrow._compute import RandomOptions as RandomOptions -from pyarrow._compute import RankOptions as RankOptions -from pyarrow._compute import RankQuantileOptions as RankQuantileOptions -from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions -from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions -from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions -from pyarrow._compute import RoundOptions as RoundOptions -from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions -from 
pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions -from pyarrow._compute import RunEndEncodeOptions as RunEndEncodeOptions -from pyarrow._compute import ScalarAggregateFunction as ScalarAggregateFunction -from pyarrow._compute import ScalarAggregateKernel as ScalarAggregateKernel -from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions -from pyarrow._compute import ScalarFunction as ScalarFunction -from pyarrow._compute import ScalarKernel as ScalarKernel -from pyarrow._compute import SelectKOptions as SelectKOptions -from pyarrow._compute import SetLookupOptions as SetLookupOptions -from pyarrow._compute import SkewOptions as SkewOptions -from pyarrow._compute import SliceOptions as SliceOptions -from pyarrow._compute import SortOptions as SortOptions -from pyarrow._compute import SplitOptions as SplitOptions -from pyarrow._compute import SplitPatternOptions as SplitPatternOptions -from pyarrow._compute import StrftimeOptions as StrftimeOptions -from pyarrow._compute import StrptimeOptions as StrptimeOptions -from pyarrow._compute import StructFieldOptions as StructFieldOptions -from pyarrow._compute import TakeOptions as TakeOptions -from pyarrow._compute import TDigestOptions as TDigestOptions -from pyarrow._compute import TrimOptions as TrimOptions -from pyarrow._compute import UdfContext as UdfContext -from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions -from pyarrow._compute import ZeroFillOptions as ZeroFillOptions -from pyarrow._compute import VarianceOptions as VarianceOptions -from pyarrow._compute import VectorFunction as VectorFunction -from pyarrow._compute import VectorKernel as VectorKernel -from pyarrow._compute import WeekOptions as WeekOptions -from pyarrow._compute import WinsorizeOptions as WinsorizeOptions - -# Functions -from pyarrow._compute import call_function as call_function - -# Udf -from pyarrow._compute import call_tabular_function as call_tabular_function -from pyarrow._compute import function_registry as function_registry -from pyarrow._compute import get_function as get_function -from pyarrow._compute import list_functions as list_functions -from pyarrow._compute import register_aggregate_function as register_aggregate_function -from pyarrow._compute import register_scalar_function as register_scalar_function -from pyarrow._compute import register_tabular_function as register_tabular_function -from pyarrow._compute import register_vector_function as register_vector_function - -from pyarrow._compute import _Order, _Placement -from pyarrow._stubs_typing import ArrayLike, ScalarLike -from . import lib -from _stubs_typing import Indices - -_P = ParamSpec("_P") -_R = TypeVar("_R") - -def field(*name_or_index: str | bytes | tuple[str | int, ...] | int) -> Expression: - """Reference a column of the dataset. - - Stores only the field's name. Type and other information is known only when - the expression is bound to a dataset having an explicit scheme. - - Nested references are allowed by passing multiple names or a tuple of - names. For example ``('foo', 'bar')`` references the field named "bar" - inside the field named "foo". - - Parameters - ---------- - *name_or_index : string, multiple strings, tuple or int - The name or index of the (possibly nested) field the expression - references to. 
- - Returns - ------- - field_expr : Expression - Reference to the given field - - Examples - -------- - >>> import pyarrow.compute as pc - >>> pc.field("a") - - >>> pc.field(1) - - >>> pc.field(("a", "b")) - >> pc.field("a", "b") - Expression: - """Expression representing a scalar value. - - Creates an Expression object representing a scalar value that can be used - in compute expressions and predicates. - - Parameters - ---------- - value : bool, int, float or string - Python value of the scalar. This function accepts any value that can be - converted to a ``pyarrow.Scalar`` using ``pa.scalar()``. - - Notes - ----- - This function differs from ``pyarrow.scalar()`` in the following way: - - * ``pyarrow.scalar()`` creates a ``pyarrow.Scalar`` object that represents - a single value in Arrow's memory model. - * ``pyarrow.compute.scalar()`` creates an ``Expression`` object representing - a scalar value that can be used in compute expressions, predicates, and - dataset filtering operations. - - Returns - ------- - scalar_expr : Expression - An Expression representing the scalar value - """ - -def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ... - -# ============= compute functions ============= -_DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType) -_Scalar_CoT = TypeVar("_Scalar_CoT", bound=lib.Scalar, covariant=True) -_ScalarT = TypeVar("_ScalarT", bound=lib.Scalar) -_ArrayT = TypeVar("_ArrayT", bound=lib.Array | lib.ChunkedArray) -_ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | lib.Scalar | lib.ChunkedArray) -ArrayOrChunkedArray: TypeAlias = lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT] -ScalarOrArray: TypeAlias = ArrayOrChunkedArray[_Scalar_CoT] | _Scalar_CoT -_ZonedTimestampArrayT: TypeAlias = ArrayOrChunkedArray[lib.Scalar[lib.TimestampType[Any, Any]]] -_ZonelessTimestampArrayT: TypeAlias = ArrayOrChunkedArray[lib.Scalar[lib.TimestampType[Any, None]]] -_ZonedTimestampScalarT: TypeAlias = lib.Scalar[lib.TimestampType[Any, Any]] -_ZonelessTimestampScalarT: TypeAlias = lib.Scalar[lib.TimestampType[Any, None]] - -SignedIntegerScalar: TypeAlias = ( - lib.Scalar[lib.Int8Type] - | lib.Scalar[lib.Int16Type] - | lib.Scalar[lib.Int32Type] - | lib.Scalar[lib.Int64Type] -) -UnsignedIntegerScalar: TypeAlias = ( - lib.Scalar[lib.UInt8Type] - | lib.Scalar[lib.UInt16Type] - | lib.Scalar[lib.Uint32Type] - | lib.Scalar[lib.UInt64Type] -) -IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar -FloatScalar: TypeAlias = ( - lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type] -) -DecimalScalar: TypeAlias = ( - lib.Scalar[lib.Decimal32Type] - | lib.Scalar[lib.Decimal64Type] - | lib.Scalar[lib.Decimal128Type] - | lib.Scalar[lib.Decimal256Type] -) -NonFloatNumericScalar: TypeAlias = IntegerScalar | DecimalScalar -NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar -BinaryScalar: TypeAlias = ( - lib.Scalar[lib.BinaryType] - | lib.Scalar[lib.LargeBinaryType] - | lib.Scalar[lib.FixedSizeBinaryType] -) -StringScalar: TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType] -StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar -_ListScalar: TypeAlias = lib.ListViewScalar[_DataTypeT] | lib.FixedSizeListScalar[_DataTypeT, Any] -_LargeListScalar: TypeAlias = lib.LargeListScalar[_DataTypeT] | lib.LargeListViewScalar[_DataTypeT] -ListScalar: TypeAlias = ( - lib.ListScalar[_DataTypeT] | _ListScalar[_DataTypeT] | _LargeListScalar[_DataTypeT] -) -TemporalScalar: TypeAlias = ( - 
lib.Date32Scalar - | lib.Date64Scalar - | lib.Time32Scalar[Any] - | lib.Time64Scalar[Any] - | lib.TimestampScalar[Any] - | lib.TimestampScalar[Any, None] - | lib.DurationScalar[Any] - | lib.MonthDayNanoIntervalScalar -) -NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar -NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar - -_NumericOrTemporalScalarT = TypeVar("_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) -_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) -NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] -_NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) -_NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) -NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] -_NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) -NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalScalarT] -_NumericOrTemporalArrayT = TypeVar("_NumericOrTemporalArrayT", bound=NumericOrTemporalArray) -BooleanArray: TypeAlias = ArrayOrChunkedArray[lib.BooleanScalar] -_BooleanArrayT = TypeVar("_BooleanArrayT", bound=BooleanArray) -IntegerArray: TypeAlias = ArrayOrChunkedArray[IntegerScalar] -_FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar) -FloatArray: TypeAlias = ArrayOrChunkedArray[FloatScalar] -_FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray) -_StringScalarT = TypeVar("_StringScalarT", bound=StringScalar) -StringArray: TypeAlias = ArrayOrChunkedArray[StringScalar] -_StringArrayT = TypeVar("_StringArrayT", bound=StringArray) -_BinaryScalarT = TypeVar("_BinaryScalarT", bound=BinaryScalar) -BinaryArray: TypeAlias = ArrayOrChunkedArray[BinaryScalar] -_BinaryArrayT = TypeVar("_BinaryArrayT", bound=BinaryArray) -_StringOrBinaryScalarT = TypeVar("_StringOrBinaryScalarT", bound=StringOrBinaryScalar) -StringOrBinaryArray: TypeAlias = StringArray | BinaryArray -_StringOrBinaryArrayT = TypeVar("_StringOrBinaryArrayT", bound=StringOrBinaryArray) -_TemporalScalarT = TypeVar("_TemporalScalarT", bound=TemporalScalar) -TemporalArray: TypeAlias = ArrayOrChunkedArray[TemporalScalar] -_TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray) -_ListArray: TypeAlias = ArrayOrChunkedArray[_ListScalar[_DataTypeT]] -_LargeListArray: TypeAlias = ArrayOrChunkedArray[_LargeListScalar[_DataTypeT]] -ListArray: TypeAlias = ArrayOrChunkedArray[ListScalar[_DataTypeT]] -_DecimalScalarT = TypeVar("_DecimalScalarT", bound=DecimalScalar) -DecimalArray: TypeAlias = lib.Array[_DecimalScalarT] | lib.ChunkedArray[_DecimalScalarT] -_DecimalArrayT = TypeVar("_DecimalArrayT", bound=DecimalArray) -# =============================== 1. Aggregation =============================== - -# ========================= 1.1 functions ========================= - -def all( - array: lib.BooleanScalar | BooleanArray, - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar: - """ - Test whether all elements in a boolean array evaluate to true. - - Null values are ignored by default. - If the `skip_nulls` option is set to false, then Kleene logic is used. - See "kleene_and" for more details on Kleene logic. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. 
- min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -any = _clone_signature(all) -""" -Test whether any element in a boolean array evaluates to true. - -Null values are ignored by default. -If the `skip_nulls` option is set to false, then Kleene logic is used. -See "kleene_or" for more details on Kleene logic. - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -def approximate_median( - array: NumericScalar | NumericArray, - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: - """ - Approximate median of a numeric array with T-Digest algorithm. - - Nulls and NaNs are ignored. - A null scalar is returned if there is no valid data point. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def count( - array: lib.Array | lib.ChunkedArray, - /, - mode: Literal["only_valid", "only_null", "all"] = "only_valid", - *, - options: CountOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: - """ - Count the number of null / non-null values. - - By default, only non-null values are counted. - This can be changed through CountOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - mode : str, default "only_valid" - Which values to count in the input. - Accepted values are "only_valid", "only_null", "all". - options : pyarrow.compute.CountOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def count_distinct( - array: lib.Array | lib.ChunkedArray, - /, - mode: Literal["only_valid", "only_null", "all"] = "only_valid", - *, - options: CountOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: - """ - Count the number of unique values. - - By default, only non-null values are counted. - This can be changed through CountOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. 
- mode : str, default "only_valid" - Which values to count in the input. - Accepted values are "only_valid", "only_null", "all". - options : pyarrow.compute.CountOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def first( - array: lib.Array[_ScalarT] | lib.ChunkedArray[_ScalarT], - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _ScalarT: - """ - Compute the first value in each group. - - Null values are ignored by default. - If skip_nulls = false, then this will return the first and last values - regardless if it is null - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def first_last( - array: lib.Array[Any] | lib.ChunkedArray[Any] | Sequence[Any], - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | dict[str, Any] | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructScalar: - """ - Compute the first and last values of an array. - - Null values are ignored by default. - If skip_nulls = false, then this will return the first and last values - regardless if it is null - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def index( - data: lib.Array[Any] | lib.ChunkedArray[Any], - value, - start: int | None = None, - end: int | None = None, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: - """ - Find the index of the first occurrence of a given value. - - Parameters - ---------- - data : Array-like - value : Scalar-like object - The value to search for. - start : int, optional - end : int, optional - memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - Returns - ------- - index : int - the index, or -1 if not found - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> arr = pa.array(["Lorem", "ipsum", "dolor", "sit", "Lorem", "ipsum"]) - >>> pc.index(arr, "ipsum") - - >>> pc.index(arr, "ipsum", start=2) - - >>> pc.index(arr, "amet") - - """ - -last = _clone_signature(first) -""" -Compute the first and last values of an array. - -Null values are ignored by default. 
-If skip_nulls = false, then this will return the first and last values -regardless if it is null - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -max = _clone_signature(first) -""" -Compute the minimum or maximum values of a numeric array. - -Null values are ignored by default. -This can be changed through ScalarAggregateOptions. - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -min = _clone_signature(first) -""" -Compute the minimum or maximum values of a numeric array. - -Null values are ignored by default. -This can be changed through ScalarAggregateOptions. - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -min_max = _clone_signature(first_last) -""" -Compute the minimum and maximum values of a numeric array. - -Null values are ignored by default. -This can be changed through ScalarAggregateOptions. - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool.
-""" - -@overload -def mean( - array: FloatScalar | FloatArray, - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: ... -@overload -def mean( - array: lib.NumericArray[lib.Decimal128Scalar] - | lib.ChunkedArray[lib.Decimal128Scalar] - | lib.Decimal128Scalar, - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Decimal128Scalar: ... -@overload -def mean( - array: lib.NumericArray[lib.Decimal256Scalar] - | lib.ChunkedArray[lib.Decimal256Scalar] - | lib.Decimal256Scalar, - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Decimal256Scalar: ... -def mean(*args, **kwargs): - """ - Compute the mean of a numeric array. - - Null values are ignored by default. Minimum count of non-null - values can be set and null is returned if too few are present. - This can be changed through ScalarAggregateOptions. - The result is a double for integer and floating point arguments, - and a decimal with the same bit-width/precision/scale for decimal arguments. - For integers and floats, NaN is returned if min_count = 0 and - there are no values. For decimals, null is returned instead. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def mode( - array: NumericScalar | NumericArray, - /, - n: int = 1, - *, - skip_nulls: bool = True, - min_count: int = 0, - options: ModeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructArray: - """ - Compute the modal (most common) values of a numeric array. - - Compute the n most common values and their respective occurrence counts. - The output has type `struct`, where T is the - input type. - The results are ordered by descending `count` first, and ascending `mode` - when breaking ties. - Nulls are ignored. If there are no non-null values in the array, - an empty array is returned. - - Parameters - ---------- - array : Array-like - Argument to compute function. - n : int, default 1 - Number of distinct most-common values to return. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ModeOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
- - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2]) - >>> modes = pc.mode(arr, 2) - >>> modes[0] - - >>> modes[1] - - """ - -def product( - array: _ScalarT | lib.NumericArray[_ScalarT], - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _ScalarT: - """ - Compute the product of values in a numeric array. - - Null values are ignored by default. Minimum count of non-null - values can be set and null is returned if too few are present. - This can be changed through ScalarAggregateOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def quantile( - array: NumericScalar | NumericArray, - /, - q: float | list[float] = 0.5, - *, - interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", - skip_nulls: bool = True, - min_count: int = 0, - options: QuantileOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: - """ - Compute an array of quantiles of a numeric array or chunked array. - - By default, 0.5 quantile (median) is returned. - If quantile lies between two data points, an interpolated value is - returned based on selected interpolation method. - Nulls and NaNs are ignored. - An array of nulls is returned if there is no valid data point. - - Parameters - ---------- - array : Array-like - Argument to compute function. - q : double or sequence of double, default 0.5 - Probability levels of the quantiles to compute. All values must be in - [0, 1]. - interpolation : str, default "linear" - How to break ties between competing data points for a given quantile. - Accepted values are: - - - "linear": compute an interpolation - - "lower": always use the smallest of the two data points - - "higher": always use the largest of the two data points - - "nearest": select the data point that is closest to the quantile - - "midpoint": compute the (unweighted) mean of the two data points - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.QuantileOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def stddev( - array: NumericScalar | NumericArray, - /, - *, - ddof: float = 0, - skip_nulls: bool = True, - min_count: int = 0, - options: VarianceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: - """ - Calculate the standard deviation of a numeric array. - - The number of degrees of freedom can be controlled using VarianceOptions. 
- By default (`ddof` = 0), the population standard deviation is calculated. - Nulls are ignored. If there are not enough non-null values in the array - to satisfy `ddof`, null is returned. - - Parameters - ---------- - array : Array-like - Argument to compute function. - ddof : int, default 0 - Number of degrees of freedom. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.VarianceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def skew( - array: NumericArray | Sequence[int | None], - /, - *, - skip_nulls: bool = True, - biased: bool = True, - min_count: int = 0, - options: SkewOptions | None = None, -) -> NumericScalar: - """ - Calculate the skewness of a numeric array - Nulls are ignored by default. If there are not enough non-null values - in the array to satisfy `min_count`, null is returned. - The behavior of nulls and the `min_count` parameter can be changed. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - biased : bool, default True - Whether the calculated value is biased. - If False, the value computed includes a correction factor to reduce bias. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : SkewOptions, optional - Options for the `skew` and `kurtosis` functions. - """ - -kurtosis = _clone_signature(skew) -""" -Calculate the kurtosis of a numeric array -Nulls are ignored by default. If there are not enough non-null values -in the array to satisfy `min_count`, null is returned. -The behavior of nulls and the `min_count` parameter can be changed. - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -biased : bool, default True - Whether the calculated value is biased. - If False, the value computed includes a correction factor to reduce bias. -min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : SkewOptions, optional - Options for the `skew` and `kurtosis` functions. -""" - -def sum( - array: _NumericScalarT | NumericArray[_NumericScalarT] | _DecimalArrayT, - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT: - """ - Compute the sum of a numeric array. - - Null values are ignored by default. Minimum count of non-null - values can be set and null is returned if too few are present. - This can be changed through ScalarAggregateOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. 
- min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def tdigest( - array: NumericScalar | NumericArray, - /, - q: float | list[float] = 0.5, - *, - delta: int = 100, - buffer_size: int = 500, - skip_nulls: bool = True, - min_count: int = 0, - options: TDigestOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: - """ - Approximate quantiles of a numeric array with T-Digest algorithm. - - By default, 0.5 quantile (median) is returned. - Nulls and NaNs are ignored. - An array of nulls is returned if there is no valid data point. - - Parameters - ---------- - array : Array-like - Argument to compute function. - q : double or sequence of double, default 0.5 - Probability levels of the quantiles to approximate. All values must be - in [0, 1]. - delta : int, default 100 - Compression parameter for the T-digest algorithm. - buffer_size : int, default 500 - Buffer size for the T-digest algorithm. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.TDigestOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - """ - -def variance( - array: NumericScalar | NumericArray | list[int] | list[int | None], - /, - *, - ddof: int = 0, - skip_nulls: bool = True, - min_count: int = 0, - options: VarianceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: - """ - Calculate the variance of a numeric array. - - The number of degrees of freedom can be controlled using VarianceOptions. - By default (`ddof` = 0), the population variance is calculated. - Nulls are ignored. If there are not enough non-null values in the array - to satisfy `ddof`, null is returned. - - Parameters - ---------- - array : Array-like - Argument to compute function. - ddof : int, default 0 - Number of degrees of freedom. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.VarianceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def top_k_unstable( - values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, - k: int, - sort_keys: list | None = None, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: - """ - Select the indices of the top-k ordered elements from array- or table-like - data. - - This is a specialization for :func:`select_k_unstable`. Output is not - guaranteed to be stable. - - Parameters - ---------- - values : Array, ChunkedArray, RecordBatch, or Table - Data to sort and get top indices from. 
- k : int - The number of `k` elements to keep. - sort_keys : List-like - Column key names to order by when input is table-like data. - memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - Returns - ------- - result : Array - Indices of the top-k ordered elements - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) - >>> pc.top_k_unstable(arr, k=3) - - [ - 5, - 4, - 2 - ] - """ - -def bottom_k_unstable( - values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, - k: int, - sort_keys: list | None = None, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: - """ - Select the indices of the bottom-k ordered elements from - array- or table-like data. - - This is a specialization for :func:`select_k_unstable`. Output is not - guaranteed to be stable. - - Parameters - ---------- - values : Array, ChunkedArray, RecordBatch, or Table - Data to sort and get bottom indices from. - k : int - The number of `k` elements to keep. - sort_keys : List-like - Column key names to order by when input is table-like data. - memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - Returns - ------- - result : Array of indices - Indices of the bottom-k ordered elements - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) - >>> pc.bottom_k_unstable(arr, k=3) - - [ - 0, - 1, - 2 - ] - """ - -def winsorize( - values: lib.Array | lib.ChunkedArray, - lower_limit: float | None = None, - upper_limit: float | None = None, - /, - *, - options: WinsorizeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: - """ - Apply a winsorization transform to the input array so as to reduce the influence of potential outliers. - NaNs and nulls in the input are ignored for the purpose of computing the lower and upper quantiles. - The quantile limits can be changed in WinsorizeOptions. - - Parameters - ---------- - values : Array, ChunkedArray, RecordBatch, or Table - Data to sort and get bottom indices from. - - lower_limit : float, between 0 and 1 - The quantile below which all values are replaced with the quantile's value. - For example, if lower_limit = 0.05, then all values in the lower 5% percentile will be replaced with the 5% percentile value. - - upper_limit : float, between 0 and 1 - The quantile above which all values are replaced with the quantile’s value. - For example, if upper_limit = 0.95, then all values in the upper 95% percentile will be replaced with the 95% percentile value. - - options : pyarrow.compute.WinsorizeOptions, optional - Alternative way of passing options. - - memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
- - Returns - ------- - result : Array of indices - Winsorized array - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> arr = pa.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6]) - >>> pc.winsorize(arr, 0.1, 0.8) - - [ - 8, - 4, - 8, - 8, - 5, - 3, - 7, - 2, - 2, - 6 - ] - """ - -def pivot_wider( - pivot_keys: lib.Array | lib.ChunkedArray | list[Any], - pivot_values: lib.Array | lib.ChunkedArray | list[Any], - /, - key_names: list[Any] | None = None, - *, - unexpected_key_behavior: str | None = None, - options: PivotWiderOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructScalar: - """ - Pivot values according to a pivot key column. - - Output is a struct with as many fields as PivotWiderOptions.key_names. - All output struct fields have the same type as pivot_values. Each pivot - key decides in which output field the corresponding pivot value is emitted. - If a pivot key doesn’t appear, null is emitted. If more than one non-null - value is encountered for a given pivot key, Invalid is raised. The pivot - key column can be string, binary or integer. The key_names will be cast - to the pivot key column type for matching. Behavior of unexpected pivot - keys is controlled by unexpected_key_behavior. - - Parameters - ---------- - pivot_keys : sequence - Array, ChunkedArray, list - pivot_values : sequence - Array, ChunkedArray, list - key_names : sequence of str - The pivot key names expected in the pivot key column. - For each entry in `key_names`, a column with the same name is emitted - in the struct output. - unexpected_key_behavior : str, default "ignore" - The behavior when pivot keys not in `key_names` are encountered. - Accepted values are "ignore", "raise". - If "ignore", unexpected keys are silently ignored. - If "raise", unexpected keys raise a KeyError. - options : pyarrow.compute.PivotWiderOptions, optional - Alternative way of passing options. - memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - Returns - ------- - result : Array of indices - Pivoted struct array - """ - -# ========================= 2. Element-wise (“scalar”) functions ========================= - -# ========================= 2.1 Arithmetic ========================= -@overload -def abs( - x: _NumericOrDurationT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericOrDurationT: ... -@overload -def abs( - x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericOrDurationArrayT: ... -@overload -def abs(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def abs(*args, **kwargs): - """ - Calculate the absolute value of the argument element-wise. - - Results will wrap around on integer overflow. - Use function "abs_checked" if you want overflow - to return an error. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -abs_checked = _clone_signature(abs) -""" -Calculate the absolute value of the argument element-wise. - -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "abs". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" - -@overload -def add( - x: _NumericOrTemporalScalarT, - y: _NumericOrTemporalScalarT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalScalarT: ... -@overload -def add( - x: _NumericOrTemporalArrayT | NDArray[Any] | list[lib._AsPyType | None], - y: _NumericOrTemporalArrayT | NDArray[Any] | list[lib._AsPyType | None], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def add( - x: Expression, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -@overload -def add( - x: NumericOrTemporalScalar | lib._AsPyType, - y: _NumericOrTemporalArrayT | NDArray[Any] | list[lib._AsPyType | None], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def add( - x: _NumericOrTemporalArrayT | NDArray[Any] | list[lib._AsPyType | None], - y: NumericOrTemporalScalar | lib._AsPyType, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def add( - x: NumericOrTemporalScalar | lib._AsPyType, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -@overload -def add( - x: Expression, y: NumericOrTemporalScalar | lib._AsPyType, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def add(*args, **kwargs): - """ - Add the arguments element-wise. - - Results will wrap around on integer overflow. - Use function "add_checked" if you want overflow - to return an error. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - y : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -add_checked = _clone_signature(add) -""" -Add the arguments element-wise. - -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "add". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - -""" - -@overload -def divide( - dividend: _NumericOrTemporalScalarT, - divisor: _NumericOrTemporalScalarT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalScalarT: ... -@overload -def divide( - dividend: _NumericOrTemporalArrayT, - divisor: _NumericOrTemporalArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def divide( - dividend: Expression, - divisor: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def divide( - dividend: NumericOrTemporalScalar, - divisor: _NumericOrTemporalArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def divide( - dividend: _NumericOrTemporalArrayT, - divisor: NumericOrTemporalScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def divide( - dividend: NumericOrTemporalScalar, - divisor: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def divide( - dividend: Expression, - divisor: NumericOrTemporalScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... 
-def divide(*args, **kwargs): - """ - Divide the arguments element-wise. - - Integer division by zero returns an error. However, integer overflow - wraps around, and floating-point division by zero returns an infinite. - Use function "divide_checked" if you want to get an error - in all the aforementioned cases. - - Parameters - ---------- - dividend : Array-like or scalar-like - Argument to compute function. - divisor : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - """ - -divide_checked = _clone_signature(divide) -""" -Divide the arguments element-wise. - -An error is returned when trying to divide by zero, or when -integer overflow is encountered. - -Parameters ----------- -dividend : Array-like or scalar-like - Argument to compute function. -divisor : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def exp( - exponent: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _FloatArrayT: ... -@overload -def exp( - exponent: ArrayOrChunkedArray[NonFloatNumericScalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: ... -@overload -def exp( - exponent: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _FloatScalarT: ... -@overload -def exp( - exponent: NonFloatNumericScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.DoubleScalar: ... -@overload -def exp(exponent: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def exp(*args, **kwargs): - """ - Compute Euler's number raised to the power of specified exponent, element-wise. - - If exponent is null the result will be null. - - Parameters - ---------- - exponent : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -multiply = _clone_signature(add) -""" -Multiply the arguments element-wise. - -Results will wrap around on integer overflow. -Use function "multiply_checked" if you want overflow -to return an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -multiply_checked = _clone_signature(add) -""" -Multiply the arguments element-wise. - -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "multiply". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def negate( - x: _NumericOrDurationT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericOrDurationT: ... -@overload -def negate( - x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericOrDurationArrayT: ... -@overload -def negate(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def negate(*args, **kwargs): - """ - Negate the argument element-wise. - - Results will wrap around on integer overflow. 
- Use function "negate_checked" if you want overflow - to return an error. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -negate_checked = _clone_signature(negate) -""" -Negate the arguments element-wise. - -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "negate". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def power( - base: _NumericScalarT, - exponent: _NumericScalarT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT: ... -@overload -def power( - base: _NumericArrayT, - exponent: _NumericArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def power( - base: Expression, - exponent: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def power( - base: _NumericArrayT, - exponent: NumericScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def power( - base: NumericScalar, - exponent: _NumericArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def power( - base: NumericScalar, - exponent: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def power( - base: Expression, - exponent: NumericScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def power(*args, **kwargs): - """ - Raise arguments to power element-wise. - - Integer to negative integer power returns an error. However, integer overflow - wraps around. If either base or exponent is null the result will be null. - - Parameters - ---------- - base : Array-like or scalar-like - Argument to compute function. - exponent : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -power_checked = _clone_signature(power) -""" -Raise arguments to power element-wise. - -An error is returned when integer to negative integer power is encountered, -or integer overflow is encountered. - -Parameters ----------- -base : Array-like or scalar-like - Argument to compute function. -exponent : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def sign( - x: NumericOrDurationArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> ( - lib.NumericArray[lib.Int8Scalar] - | lib.NumericArray[lib.FloatScalar] - | lib.NumericArray[lib.DoubleScalar] -): ... -@overload -def sign( - x: NumericOrDurationScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar: ... -@overload -def sign(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def sign(*args, **kwargs): - """ - Get the signedness of the arguments element-wise. - - Output is any of (-1,1) for nonzero inputs and 0 for zero input. - NaN values return NaN. 
Integral values return signedness as Int8 and - floating-point values return it with the same type as the input values. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - """ - -@overload -def sqrt(x: NumericArray, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatArray: ... -@overload -def sqrt(x: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatScalar: ... -@overload -def sqrt(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def sqrt(*args, **kwargs): - """ - Takes the square root of arguments element-wise. - - A negative argument returns a NaN. For a variant that returns an - error, use function "sqrt_checked". - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - """ - -sqrt_checked = _clone_signature(sqrt) -""" -Takes the square root of arguments element-wise. - -A negative argument returns an error. For a variant that returns a -NaN, use function "sqrt". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -subtract = _clone_signature(add) -""" -Subtract the arguments element-wise. - -Results will wrap around on integer overflow. -Use function "subtract_checked" if you want overflow -to return an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -subtract_checked = _clone_signature(add) -""" -Subtract the arguments element-wise. - -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "subtract". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.1 Bit-wise functions ========================= -@overload -def bit_wise_and( - x: _NumericScalarT, y: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericScalarT: ... -@overload -def bit_wise_and( - x: _NumericArrayT, - y: _NumericArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def bit_wise_and( - x: NumericScalar, y: _NumericArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericArrayT: ... -@overload -def bit_wise_and( - x: _NumericArrayT, y: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericArrayT: ... -@overload -def bit_wise_and( - x: Expression, - y: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def bit_wise_and( - x: Expression, - y: NumericScalar | ArrayOrChunkedArray[NumericScalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... 
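# NOTE (editor's illustration, not part of the stubs): assumed example data showing
# the array/array and array/scalar overloads of "bit_wise_and" typed above.
import pyarrow as pa
import pyarrow.compute as pc

x = pa.array([0b1100, 0b1010], type=pa.int32())
y = pa.array([0b0110, 0b0011], type=pa.int32())
print(pc.bit_wise_and(x, y))                            # -> [4, 2]
print(pc.bit_wise_and(x, pa.scalar(6, type=pa.int32())))  # scalar is broadcast -> [4, 2]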
-@overload -def bit_wise_and( - x: NumericScalar | ArrayOrChunkedArray[NumericScalar], - y: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def bit_wise_and(*args, **kwargs): - """ - Bit-wise AND the arguments element-wise. - - Null values return null. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - y : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def bit_wise_not( - x: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericScalarT: ... -@overload -def bit_wise_not( - x: _NumericArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericArrayT: ... -@overload -def bit_wise_not(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def bit_wise_not(*args, **kwargs): - """ - Bit-wise negate the arguments element-wise. - - Null values return null. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -bit_wise_or = _clone_signature(bit_wise_and) -""" -Bit-wise OR the arguments element-wise. - -Null values return null. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -bit_wise_xor = _clone_signature(bit_wise_and) -""" -Bit-wise XOR the arguments element-wise. - -Null values return null. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -shift_left = _clone_signature(bit_wise_and) -""" -Left shift `x` by `y`. - -The shift operates as if on the two's complement representation of the number. -In other words, this is equivalent to multiplying `x` by 2 to the power `y`, -even if overflow occurs. -`x` is returned if `y` (the amount to shift by) is (1) negative or -(2) greater than or equal to the precision of `x`. -Use function "shift_left_checked" if you want an invalid shift amount -to return an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -shift_left_checked = _clone_signature(bit_wise_and) -""" -Left shift `x` by `y`. - -The shift operates as if on the two's complement representation of the number. -In other words, this is equivalent to multiplying `x` by 2 to the power `y`, -even if overflow occurs. -An error is raised if `y` (the amount to shift by) is (1) negative or -(2) greater than or equal to the precision of `x`. -See "shift_left" for a variant that doesn't fail for an invalid shift amount. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" -shift_right = _clone_signature(bit_wise_and) -""" -Right shift `x` by `y`. - -This is equivalent to dividing `x` by 2 to the power `y`. -`x` is returned if `y` (the amount to shift by) is: (1) negative or -(2) greater than or equal to the precision of `x`. -Use function "shift_right_checked" if you want an invalid shift amount -to return an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -shift_right_checked = _clone_signature(bit_wise_and) -""" -Right shift `x` by `y`. - -This is equivalent to dividing `x` by 2 to the power `y`. -An error is raised if `y` (the amount to shift by) is (1) negative or -(2) greater than or equal to the precision of `x`. -See "shift_right" for a variant that doesn't fail for an invalid shift amount - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.2 Rounding functions ========================= -@overload -def ceil(x: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatScalarT: ... -@overload -def ceil(x: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatArrayT: ... -@overload -def ceil(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def ceil(*args, **kwargs): - """ - Round up to the nearest integer. - - Compute the smallest integer value not less in magnitude than `x`. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -floor = _clone_signature(ceil) -""" -Round down to the nearest integer. - -Compute the largest integer value not greater in magnitude than `x`. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def round( - x: _NumericScalarT | int | float, - /, - ndigits: int = 0, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT: ... -@overload -def round( - x: _NumericArrayT | Sequence[int | float | None], - /, - ndigits: int = 0, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... 
-@overload -def round( - x: Expression, - /, - ndigits: int = 0, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def round(*args, **kwargs): - """ - Round to a given precision. - - Options are used to control the number of digits and rounding mode. - Default behavior is to round to the nearest integer and - use half-to-even rule to break ties. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - ndigits : int, default 0 - Number of fractional digits to round to. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - options : pyarrow.compute.RoundOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def round_to_multiple( - x: int | float | _NumericScalarT, - /, - multiple: int | float | _NumericScalarT = 0, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundToMultipleOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT: ... -@overload -def round_to_multiple( - x: _NumericArrayT | Sequence[int | float | None], - /, - multiple: int | float | _NumericScalarT = 0, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundToMultipleOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def round_to_multiple( - x: Expression, - /, - multiple: int | float | _NumericScalarT = 0, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundToMultipleOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def round_to_multiple(*args, **kwargs): - """ - Round to a given multiple. - - Options are used to control the rounding multiple and rounding mode. - Default behavior is to round to the nearest integer and - use half-to-even rule to break ties. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - multiple : numeric scalar, default 1.0 - Multiple to round to. Should be a scalar of a type compatible - with the argument to be rounded. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - options : pyarrow.compute.RoundToMultipleOptions, optional - Alternative way of passing options. 
- memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def round_binary( - x: _NumericScalarT | float, - s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar, - /, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundBinaryOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT: ... -@overload -def round_binary( - x: _NumericScalarT | float, - s: Iterable, - /, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundBinaryOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.NumericArray[_NumericScalarT]: ... -@overload -def round_binary( - x: _NumericArrayT | Sequence[float], - s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar | Iterable, - /, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundBinaryOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def round_binary( - x: Expression, - s: Iterable, - /, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundBinaryOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def round_binary(*args, **kwargs): - """ - Round to the given precision. - - Options are used to control the rounding mode. - Default behavior is to use the half-to-even rule to break ties. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - s : Array-like or scalar-like - Argument to compute function. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - options : pyarrow.compute.RoundBinaryOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -trunc = _clone_signature(ceil) -""" -Compute the integral part. - -Compute the nearest integer not greater in magnitude than `x`. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.3 Logarithmic functions ========================= -@overload -def ln( - x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatScalar | lib.DoubleScalar: ... -@overload -def ln( - x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... 
-@overload -def ln(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def ln(*args, **kwargs): - """ - Compute natural logarithm. - - Non-positive values return -inf or NaN. Null values return null. - Use function "ln_checked" if you want non-positive values to raise an error. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -ln_checked = _clone_signature(ln) -""" -Compute natural logarithm. - -Non-positive values raise an error. Null values return null. -Use function "ln" if you want non-positive values to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -log10 = _clone_signature(ln) -""" -Compute base 10 logarithm. - -Non-positive values return -inf or NaN. Null values return null. -Use function "log10_checked" if you want non-positive values -to raise an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -log10_checked = _clone_signature(ln) -""" -Compute base 10 logarithm. - -Non-positive values raise an error. Null values return null. -Use function "log10" if you want non-positive values -to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -log1p = _clone_signature(ln) -""" -Compute natural log of (1+x). - -Values <= -1 return -inf or NaN. Null values return null. -This function may be more precise than log(1 + x) for x close to zero. -Use function "log1p_checked" if you want invalid values to raise an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -log1p_checked = _clone_signature(ln) -""" -Compute natural log of (1+x). - -Values <= -1 return -inf or NaN. Null values return null. -This function may be more precise than log(1 + x) for x close to zero. -Use function "log1p" if you want invalid values to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -log2 = _clone_signature(ln) -""" -Compute base 2 logarithm. - -Non-positive values return -inf or NaN. Null values return null. -Use function "log2_checked" if you want non-positive values -to raise an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -log2_checked = _clone_signature(ln) -""" -Compute base 2 logarithm. - -Non-positive values raise an error. Null values return null. -Use function "log2" if you want non-positive values -to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def logb( - x: FloatScalar, b: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatScalar | lib.DoubleScalar: ... -@overload -def logb( - x: FloatArray, b: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... -@overload -def logb( - x: FloatScalar, - b: FloatArray, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... -@overload -def logb( - x: FloatArray, - b: FloatScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... -@overload -def logb( - x: Expression | Any, b: Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression | Any: ... -def logb(*args, **kwargs): - """ - Compute base `b` logarithm. - - Values <= 0 return -inf or NaN. Null values return null. - Use function "logb_checked" if you want non-positive values to raise an error. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - b : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -logb_checked = _clone_signature(logb) -""" -Compute base `b` logarithm. - -Values <= 0 return -inf or NaN. Null values return null. -Use function "logb" if you want non-positive values to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -b : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.4 Trigonometric functions ========================= -acos = _clone_signature(ln) -""" -Compute the inverse cosine. - -NaN is returned for invalid input values; -to raise an error instead, see "acos_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -acos_checked = _clone_signature(ln) -""" -Compute the inverse cosine. - -Invalid input values raise an error; -to return NaN instead, see "acos". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -asin = _clone_signature(ln) -""" -Compute the inverse sine. - -NaN is returned for invalid input values; -to raise an error instead, see "asin_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -asinh = _clone_signature(ln) -""" -Compute the inverse hyperbolic sine. -NaN is returned for invalid input values. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -asin_checked = _clone_signature(ln) -""" -Compute the inverse sine. 
- -Invalid input values raise an error; -to return NaN instead, see "asin". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -atan = _clone_signature(ln) -""" -Compute the inverse tangent of x. - -The return value is in the range [-pi/2, pi/2]; -for a full return range [-pi, pi], see "atan2". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -atanh = _clone_signature(ln) -""" -Compute the inverse hyperbolic tangent of x. -The return value is in the range [-1, 1]. -NaN is returned for invalid input values. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -cos = _clone_signature(ln) -""" -Compute the cosine. - -NaN is returned for invalid input values; -to raise an error instead, see "cos_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -cosh = _clone_signature(ln) -""" -Compute the hyperbolic cosine. -NaN is returned for invalid input values. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -acosh = _clone_signature(ln) -""" -Compute the inverse hyperbolic cosine. -NaN is returned for invalid input values. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -cos_checked = _clone_signature(ln) -""" -Compute the cosine. - -Infinite values raise an error; -to return NaN instead, see "cos". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -sin = _clone_signature(ln) -""" -Compute the sine. - -NaN is returned for invalid input values; -to raise an error instead, see "sin_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -sin_checked = _clone_signature(ln) -""" -Compute the sine. - -Invalid input values raise an error; -to return NaN instead, see "sin". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -sinh = _clone_signature(ln) -""" -Compute the hyperbolic sine. -NaN is returned for invalid input values. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -tan = _clone_signature(ln) -""" -Compute the tangent. - -NaN is returned for invalid input values; -to raise an error instead, see "tan_checked". 
- -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -tan_checked = _clone_signature(ln) -""" -Compute the tangent. - -Infinite values raise an error; -to return NaN instead, see "tan". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -tanh = _clone_signature(ln) -""" -Compute the hyperbolic tangent. -NaN is returned for invalid input values. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def atan2( - y: FloatScalar, x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatScalar | lib.DoubleScalar: ... -@overload -def atan2( - y: FloatArray, x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... -@overload -def atan2( - y: FloatArray, - x: FloatScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... -@overload -def atan2( - y: FloatScalar, - x: FloatArray, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... -@overload -def atan2( - y: Expression, x: Any, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -@overload -def atan2( - y: Any, x: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def atan2(*args, **kwargs): - """ - Compute the inverse tangent of y/x. - - The return value is in the range [-pi, pi]. - - Parameters - ---------- - y : Array-like or scalar-like - Argument to compute function. - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.5 Comparisons functions ========================= -@overload -def equal( - x: lib.Scalar, y: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def equal( - x: lib.Scalar | lib._AsPyType, - y: lib.Array | lib.ChunkedArray | list[lib._AsPyType], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def equal( - x: lib.Array | lib.ChunkedArray | list[lib._AsPyType], - y: lib.Scalar | lib._AsPyType, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def equal( - x: lib.Array | lib.ChunkedArray, - y: lib.Array | lib.ChunkedArray, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def equal( - x: Expression, - y: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def equal( - x: lib.Scalar, - y: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def equal( - x: Expression, - y: lib.Scalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def equal(*args, **kwargs): - """ - Compare values for equality (x == y). - - A null on either side emits a null comparison result. 
- - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - y : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -greater = _clone_signature(equal) -""" -Compare values for ordered inequality (x > y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -greater_equal = _clone_signature(equal) -""" -Compare values for ordered inequality (x >= y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -less = _clone_signature(equal) -""" -Compare values for ordered inequality (x < y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -less_equal = _clone_signature(equal) -""" -Compare values for ordered inequality (x <= y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -not_equal = _clone_signature(equal) -""" -Compare values for inequality (x != y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def max_element_wise( - *args: ScalarOrArray[_Scalar_CoT] | NDArray[Any] | float, - skip_nulls: bool = True, - options: ElementWiseAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT]: ... -@overload -def max_element_wise( - *args: Expression, - skip_nulls: bool = True, - options: ElementWiseAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def max_element_wise(*args, **kwargs): - """ - Find the element-wise maximum value. - - Nulls are ignored (by default) or propagated. - NaN is preferred over null, but not over any valid value. - - Parameters - ---------- - *args : Array-like or scalar-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - options : pyarrow.compute.ElementWiseAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
- """ - -min_element_wise = _clone_signature(max_element_wise) -""" -Find the element-wise minimum value. - -Nulls are ignored (by default) or propagated. -NaN is preferred over null, but not over any valid value. - -Parameters ----------- -*args : Array-like or scalar-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -options : pyarrow.compute.ElementWiseAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.6 Logical functions ========================= -@overload -def and_( - x: lib.BooleanScalar, y: lib.BooleanScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def and_( - x: BooleanArray, - y: BooleanArray, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def and_( - x: Expression, - y: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def and_( - x: lib.BooleanScalar, - y: BooleanArray, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def and_( - x: BooleanArray, - y: lib.BooleanScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def and_( - x: lib.BooleanScalar, - y: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def and_( - x: Expression, - y: lib.BooleanScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def and_( - x: ScalarOrArray[lib.BooleanScalar], - y: ScalarOrArray[lib.BooleanScalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> ScalarOrArray[lib.BooleanScalar]: ... -def and_(*args, **kwargs): - """ - Logical 'and' boolean values. - - When a null is encountered in either input, a null is output. - For a different null behavior, see function "and_kleene". - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - y : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -and_kleene = _clone_signature(and_) -""" -Logical 'and' boolean values (Kleene logic). - -This function behaves as follows with nulls: - -- true and null = null -- null and true = null -- false and null = false -- null and false = false -- null and null = null - -In other words, in this context a null value really means "unknown", -and an unknown value 'and' false is always false. -For a different null behavior, see function "and". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -and_not = _clone_signature(and_) -""" -Logical 'and not' boolean values. - -When a null is encountered in either input, a null is output. -For a different null behavior, see function "and_not_kleene". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -and_not_kleene = _clone_signature(and_) -""" -Logical 'and not' boolean values (Kleene logic). - -This function behaves as follows with nulls: - -- true and not null = null -- null and not false = null -- false and not null = false -- null and not true = false -- null and not null = null - -In other words, in this context a null value really means "unknown", -and an unknown value 'and not' true is always false, as is false -'and not' an unknown value. -For a different null behavior, see function "and_not". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -or_ = _clone_signature(and_) -""" -Logical 'or' boolean values. - -When a null is encountered in either input, a null is output. -For a different null behavior, see function "or_kleene". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -or_kleene = _clone_signature(and_) -""" -Logical 'or' boolean values (Kleene logic). - -This function behaves as follows with nulls: - -- true or null = true -- null or true = true -- false or null = null -- null or false = null -- null or null = null - -In other words, in this context a null value really means "unknown", -and an unknown value 'or' true is always true. -For a different null behavior, see function "or". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -xor = _clone_signature(and_) -""" -Logical 'xor' boolean values. - -When a null is encountered in either input, a null is output. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def invert( - x: lib.BooleanScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def invert( - x: _BooleanArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _BooleanArrayT: ... -@overload -def invert( - x: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def invert(*args, **kwargs): - """ - Invert boolean values. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.10 String predicates ========================= -@overload -def ascii_is_alnum( - strings: StringScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def ascii_is_alnum( - strings: StringArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanArray: ... 
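# A minimal usage sketch of the comparison and logical kernels documented above,
# assuming pyarrow is installed; the sample arrays are illustrative. It highlights
# how "and_" and "and_kleene" treat nulls differently and that comparisons
# propagate nulls.
import pyarrow as pa
import pyarrow.compute as pc

a = pa.array([True, False, None])
b = pa.array([None, None, None], type=pa.bool_())
pc.and_(a, b)        # [null, null, null]   (any null input yields null)
pc.and_kleene(a, b)  # [null, false, null]  (false AND unknown is false)
pc.invert(a)         # [false, true, null]

x = pa.array([1, 2, None])
pc.equal(x, 2)             # [false, true, null]  (null on either side emits null)
pc.max_element_wise(x, 3)  # [3, 3, 3]  (nulls skipped by default)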
-@overload -def ascii_is_alnum( - strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def ascii_is_alnum(*args, **kwargs): - """ - Classify strings as ASCII alphanumeric. - - For each string in `strings`, emit true iff the string is non-empty - and consists only of alphanumeric ASCII characters. Null strings emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -ascii_is_alpha = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII alphabetic. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of alphabetic ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_is_decimal = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII decimal. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of decimal ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_is_lower = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII lowercase. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of lowercase ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_is_printable = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII printable. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of printable ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_is_space = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII whitespace. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of whitespace ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_is_upper = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII uppercase. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of uppercase ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_alnum = _clone_signature(ascii_is_alnum) -""" -Classify strings as alphanumeric. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of alphanumeric Unicode characters. Null strings emit null. 
- -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_alpha = _clone_signature(ascii_is_alnum) -""" -Classify strings as alphabetic. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of alphabetic Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_decimal = _clone_signature(ascii_is_alnum) -""" -Classify strings as decimal. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of decimal Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_digit = _clone_signature(ascii_is_alnum) -""" -Classify strings as digits. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of Unicode digits. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_lower = _clone_signature(ascii_is_alnum) -""" -Classify strings as lowercase. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of lowercase Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_numeric = _clone_signature(ascii_is_alnum) -""" -Classify strings as numeric. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of numeric Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_printable = _clone_signature(ascii_is_alnum) -""" -Classify strings as printable. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of printable Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_space = _clone_signature(ascii_is_alnum) -""" -Classify strings as whitespace. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of whitespace Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_upper = _clone_signature(ascii_is_alnum) -""" -Classify strings as uppercase. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of uppercase Unicode characters. Null strings emit null. 
- -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_is_title = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII titlecase. - -For each string in `strings`, emit true iff the string is title-cased, -i.e. it has at least one cased character, each uppercase character -follows an uncased character, and each lowercase character follows -an uppercase character. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_title = _clone_signature(ascii_is_alnum) -""" -Classify strings as titlecase. - -For each string in `strings`, emit true iff the string is title-cased, -i.e. it has at least one cased character, each uppercase character -follows an uncased character, and each lowercase character follows -an uppercase character. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -string_is_ascii = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII. - -For each string in `strings`, emit true iff the string consists only -of ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.11 String transforms ========================= -@overload -def ascii_capitalize( - strings: _StringScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _StringScalarT: ... -@overload -def ascii_capitalize( - strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _StringArrayT: ... -@overload -def ascii_capitalize( - strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def ascii_capitalize(*args, **kwargs): - """ - Capitalize the first character of ASCII input. - - For each string in `strings`, return a capitalized version. - - This function assumes the input is fully ASCII. If it may contain - non-ASCII characters, use "utf8_capitalize" instead. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -ascii_lower = _clone_signature(ascii_capitalize) -""" -Transform ASCII input to lowercase. - -For each string in `strings`, return a lowercase version. - -This function assumes the input is fully ASCII. If it may contain -non-ASCII characters, use "utf8_lower" instead. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_reverse = _clone_signature(ascii_capitalize) -""" -Reverse ASCII input. - -For each ASCII string in `strings`, return a reversed version. - -This function assumes the input is fully ASCII. If it may contain -non-ASCII characters, use "utf8_reverse" instead. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. 
-memory_pool : pyarrow.MemoryPool, optional
-    If not passed, will allocate memory from the default memory pool.
-"""
-ascii_swapcase = _clone_signature(ascii_capitalize)
-"""
-Transform ASCII input by inverting casing.
-
-For each string in `strings`, return a string with opposite casing.
-
-This function assumes the input is fully ASCII. If it may contain
-non-ASCII characters, use "utf8_swapcase" instead.
-
-Parameters
-----------
-strings : Array-like or scalar-like
-    Argument to compute function.
-memory_pool : pyarrow.MemoryPool, optional
-    If not passed, will allocate memory from the default memory pool.
-"""
-ascii_title = _clone_signature(ascii_capitalize)
-"""
-Titlecase each word of ASCII input.
-
-For each string in `strings`, return a titlecased version.
-Each word in the output will start with an uppercase character and its
-remaining characters will be lowercase.
-
-This function assumes the input is fully ASCII. If it may contain
-non-ASCII characters, use "utf8_title" instead.
-
-Parameters
-----------
-strings : Array-like or scalar-like
-    Argument to compute function.
-memory_pool : pyarrow.MemoryPool, optional
-    If not passed, will allocate memory from the default memory pool.
-"""
-ascii_upper = _clone_signature(ascii_capitalize)
-"""
-Transform ASCII input to uppercase.
-
-For each string in `strings`, return an uppercase version.
-
-This function assumes the input is fully ASCII. If it may contain
-non-ASCII characters, use "utf8_upper" instead.
-
-Parameters
-----------
-strings : Array-like or scalar-like
-    Argument to compute function.
-memory_pool : pyarrow.MemoryPool, optional
-    If not passed, will allocate memory from the default memory pool.
-"""
-
-@overload
-def binary_length(
-    strings: lib.BinaryScalar | lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None
-) -> lib.Int32Scalar: ...
-@overload
-def binary_length(
-    strings: lib.LargeBinaryScalar | lib.LargeStringScalar,
-    /,
-    *,
-    memory_pool: lib.MemoryPool | None = None,
-) -> lib.Int64Scalar: ...
-@overload
-def binary_length(
-    strings: lib.BinaryArray
-    | lib.StringArray
-    | lib.ChunkedArray[lib.BinaryScalar]
-    | lib.ChunkedArray[lib.StringScalar],
-    /,
-    *,
-    memory_pool: lib.MemoryPool | None = None,
-) -> lib.Int32Array: ...
-@overload
-def binary_length(
-    strings: lib.LargeBinaryArray
-    | lib.LargeStringArray
-    | lib.ChunkedArray[lib.LargeBinaryScalar]
-    | lib.ChunkedArray[lib.LargeStringScalar],
-    /,
-    *,
-    memory_pool: lib.MemoryPool | None = None,
-) -> lib.Int64Array: ...
-@overload
-def binary_length(
-    strings: Expression,
-    /,
-    *,
-    memory_pool: lib.MemoryPool | None = None,
-) -> Expression: ...
-def binary_length(*args, **kwargs):
-    """
-    Compute string lengths.
-
-    For each string in `strings`, emit its length of bytes.
-    Null values emit null.
-
-    Parameters
-    ----------
-    strings : Array-like or scalar-like
-        Argument to compute function.
-    memory_pool : pyarrow.MemoryPool, optional
-        If not passed, will allocate memory from the default memory pool.
-    """
-
-@overload
-def binary_repeat(
-    strings: _StringOrBinaryScalarT,
-    num_repeats: int,
-    /,
-    *,
-    memory_pool: lib.MemoryPool | None = None,
-) -> _StringOrBinaryScalarT: ...
-@overload
-def binary_repeat(
-    strings: _StringOrBinaryScalarT,
-    num_repeats: list[int] | list[int | None],
-    /,
-    *,
-    memory_pool: lib.MemoryPool | None = None,
-) -> lib.Array[_StringOrBinaryScalarT]: ...
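# A minimal usage sketch of the ASCII transforms, string predicates and
# binary_length kernels documented above, assuming pyarrow is installed; the
# sample strings are illustrative. The ascii_* kernels only touch ASCII
# characters; other bytes pass through unchanged.
import pyarrow as pa
import pyarrow.compute as pc

s = pa.array(["arrow", "héllo", None])
pc.ascii_upper(s)      # ["ARROW", "HéLLO", null]  (the non-ASCII "é" is untouched)
pc.string_is_ascii(s)  # [true, false, null]
pc.utf8_is_alpha(s)    # [true, true, null]
pc.binary_length(s)    # [5, 6, null]  ("é" takes two bytes in UTF-8)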
-@overload -def binary_repeat( - strings: _StringOrBinaryArrayT, - num_repeats: int | list[int] | list[int | None], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryArrayT: ... -@overload -def binary_repeat( - strings: Expression, - num_repeats: int | list[int] | list[int | None], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def binary_repeat(*args, **kwargs): - """ - Repeat a binary string. - - For each binary string in `strings`, return a replicated version. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - num_repeats : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def binary_replace_slice( - strings: _StringOrBinaryScalarT, - /, - start: int, - stop: int, - replacement: str | bytes, - *, - options: ReplaceSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryScalarT: ... -@overload -def binary_replace_slice( - strings: _StringOrBinaryArrayT, - /, - start: int, - stop: int, - replacement: str | bytes, - *, - options: ReplaceSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryArrayT: ... -@overload -def binary_replace_slice( - strings: Expression, - /, - start: int, - stop: int, - replacement: str | bytes, - *, - options: ReplaceSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def binary_replace_slice(*args, **kwargs): - """ - Replace a slice of a binary string. - - For each string in `strings`, replace a slice of the string defined by `start` - and `stop` indices with the given `replacement`. `start` is inclusive - and `stop` is exclusive, and both are measured in bytes. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing at (inclusive). - stop : int - Index to stop slicing at (exclusive). - replacement : str - What to replace the slice with. - options : pyarrow.compute.ReplaceSliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def binary_reverse( - strings: _BinaryScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _BinaryScalarT: ... -@overload -def binary_reverse( - strings: _BinaryArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _BinaryArrayT: ... -@overload -def binary_reverse( - strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def binary_reverse(*args, **kwargs): - """ - Reverse binary input. - - For each binary string in `strings`, return a reversed version. - - This function reverses the binary data at a byte-level. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def replace_substring( - strings: _StringScalarT, - /, - pattern: str | bytes, - replacement: str | bytes, - *, - max_replacements: int | None = None, - options: ReplaceSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT: ... 
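# A minimal usage sketch of the byte-oriented kernels documented above
# (binary_repeat, binary_replace_slice, binary_reverse), assuming pyarrow is
# installed; the sample data is illustrative. Slice offsets are bytes: start is
# inclusive, stop is exclusive.
import pyarrow as pa
import pyarrow.compute as pc

b = pa.array([b"abcdef", None])
pc.binary_repeat(b, 2)                                         # [b"abcdefabcdef", null]
pc.binary_replace_slice(b, start=1, stop=3, replacement="XY")  # [b"aXYdef", null]
pc.binary_reverse(b)                                           # [b"fedcba", null]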
-@overload -def replace_substring( - strings: _StringArrayT, - /, - pattern: str | bytes, - replacement: str | bytes, - *, - max_replacements: int | None = None, - options: ReplaceSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringArrayT: ... -@overload -def replace_substring( - strings: Expression, - /, - pattern: str | bytes, - replacement: str | bytes, - *, - max_replacements: int | None = None, - options: ReplaceSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def replace_substring(*args, **kwargs): - """ - Replace matching non-overlapping substrings with replacement. - - For each string in `strings`, replace non-overlapping substrings that match - the given literal `pattern` with the given `replacement`. - If `max_replacements` is given and not equal to -1, it limits the - maximum amount replacements per input, counted from the left. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - Substring pattern to look for inside input values. - replacement : str - What to replace the pattern with. - max_replacements : int or None, default None - The maximum number of strings to replace in each - input value (unlimited if None). - options : pyarrow.compute.ReplaceSubstringOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -replace_substring_regex = _clone_signature(replace_substring) -""" -Replace matching non-overlapping substrings with replacement. - -For each string in `strings`, replace non-overlapping substrings that match -the given regular expression `pattern` with the given `replacement`. -If `max_replacements` is given and not equal to -1, it limits the -maximum amount replacements per input, counted from the left. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -replacement : str - What to replace the pattern with. -max_replacements : int or None, default None - The maximum number of strings to replace in each - input value (unlimited if None). -options : pyarrow.compute.ReplaceSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def utf8_capitalize( - strings: _StringScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _StringScalarT: ... -@overload -def utf8_capitalize( - strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _StringArrayT: ... -@overload -def utf8_capitalize( - strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def utf8_capitalize(*args, **kwargs): - """ - Capitalize the first character of input. - - For each string in `strings`, return a capitalized version, - with the first character uppercased and the others lowercased. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def utf8_length( - strings: lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int32Scalar: ... 
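# A minimal usage sketch of the substring replacement kernels documented above,
# assuming pyarrow is installed; the sample strings are illustrative.
# max_replacements limits replacements per value, counted from the left;
# None (the default) means unlimited.
import pyarrow as pa
import pyarrow.compute as pc

s = pa.array(["aaa-bbb-aaa", None])
pc.replace_substring(s, pattern="aaa", replacement="x")                      # ["x-bbb-x", null]
pc.replace_substring(s, pattern="aaa", replacement="x", max_replacements=1)  # ["x-bbb-aaa", null]
pc.replace_substring_regex(s, pattern=r"[ab]+", replacement="x")             # ["x-x-x", null]
pc.utf8_capitalize(s)                                                        # ["Aaa-bbb-aaa", null]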
-@overload -def utf8_length( - strings: lib.LargeStringScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... -@overload -def utf8_length( - strings: lib.StringArray | lib.ChunkedArray[lib.StringScalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Array: ... -@overload -def utf8_length( - strings: lib.LargeStringArray | lib.ChunkedArray[lib.LargeStringScalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def utf8_length( - strings: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def utf8_length(*args, **kwargs): - """ - Compute UTF8 string lengths. - - For each string in `strings`, emit its length in UTF8 characters. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -utf8_lower = _clone_signature(utf8_capitalize) -""" -Transform input to lowercase. - -For each string in `strings`, return a lowercase version. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def utf8_replace_slice( - strings: _StringScalarT, - /, - start: int, - stop: int, - replacement: str | bytes, - *, - options: ReplaceSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT: ... -@overload -def utf8_replace_slice( - strings: _StringArrayT, - /, - start: int, - stop: int, - replacement: str | bytes, - *, - options: ReplaceSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringArrayT: ... -@overload -def utf8_replace_slice( - strings: Expression, - /, - start: int, - stop: int, - replacement: str | bytes, - *, - options: ReplaceSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def utf8_replace_slice(*args, **kwargs): - """ - Replace a slice of a string. - - For each string in `strings`, replace a slice of the string defined by `start` - and `stop` indices with the given `replacement`. `start` is inclusive - and `stop` is exclusive, and both are measured in UTF8 characters. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing at (inclusive). - stop : int - Index to stop slicing at (exclusive). - replacement : str - What to replace the slice with. - options : pyarrow.compute.ReplaceSliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -utf8_reverse = _clone_signature(utf8_capitalize) -""" -Reverse input. - -For each string in `strings`, return a reversed version. - -This function operates on Unicode codepoints, not grapheme -clusters. Hence, it will not correctly reverse grapheme clusters -composed of multiple codepoints. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" -utf8_swapcase = _clone_signature(utf8_capitalize) -""" -Transform input lowercase characters to uppercase and uppercase characters to lowercase. - -For each string in `strings`, return an opposite case version. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_title = _clone_signature(utf8_capitalize) -""" -Titlecase each word of input. - -For each string in `strings`, return a titlecased version. -Each word in the output will start with an uppercase character and its -remaining characters will be lowercase. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_upper = _clone_signature(utf8_capitalize) -""" -Transform input to uppercase. - -For each string in `strings`, return an uppercase version. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory poo -""" - -def utf8_normalize( - strings: _StringArrayT, /, form: str, *, options: Utf8NormalizeOptions | None = None, memory_pool: lib.MemoryPool | None = None -) -> _StringArrayT: - """ - Utf8-normalize input - - For each string in `strings`, return the normal form. - The normalization form must be given in the Utf8NormalizeOptions. - Null inputs emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - form : str - Unicode normalization form. - Accepted values are "NFC", "NFKC", "NFD", NFKD". - """ - -# ========================= 2.12 String padding ========================= -@overload -def ascii_center( - strings: _StringScalarT, - /, - width: int, - padding: str = " ", - lean_left_on_odd_padding: bool = True, - *, - options: PadOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT: ... -@overload -def ascii_center( - strings: _StringArrayT, - /, - width: int, - padding: str = " ", - lean_left_on_odd_padding: bool = True, - *, - options: PadOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringArrayT: ... -@overload -def ascii_center( - strings: Expression, - /, - width: int, - padding: str = " ", - lean_left_on_odd_padding: bool = True, - *, - options: PadOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def ascii_center(*args, **kwargs): - """ - Center strings by padding with a given character. - - For each string in `strings`, emit a centered string by padding both sides - with the given ASCII character. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - width : int - Desired string length. - padding : str, default " " - What to pad the string with. Should be one byte or codepoint. - lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). - options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
- """ - -ascii_lpad = _clone_signature(ascii_center) -""" -Right-align strings by padding with a given character. - -For each string in `strings`, emit a right-aligned string by prepending -the given ASCII character. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_rpad = _clone_signature(ascii_center) -""" -Left-align strings by padding with a given character. - -For each string in `strings`, emit a left-aligned string by appending -the given ASCII character. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_center = _clone_signature(ascii_center) -""" -Center strings by padding with a given character. - -For each string in `strings`, emit a centered string by padding both sides -with the given UTF8 codeunit. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_lpad = _clone_signature(ascii_center) -""" -Right-align strings by padding with a given character. - -For each string in `strings`, emit a right-aligned string by prepending -the given UTF8 codeunit. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_rpad = _clone_signature(ascii_center) -""" -Left-align strings by padding with a given character. - -For each string in `strings`, emit a left-aligned string by appending -the given UTF8 codeunit. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def utf8_zero_fill( - strings: _StringScalarT, - /, - width: int, - padding: str = '0', - *, - options: ZeroFillOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT: ... -@overload -def utf8_zero_fill( - strings: _StringArrayT, - /, - width: int | None = None, - padding: str | None = '0', - *, - options: ZeroFillOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringArrayT: ... -@overload -def utf8_zero_fill( - strings: Expression, - /, - width: int, - padding: str = '0', - *, - options: ZeroFillOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def utf8_zero_fill(*args, **kwargs): - """ - Left-pad strings to a given width, preserving leading sign characters - - For each string in `strings`, emit a string of length `width` by - prepending the given padding character (defaults to '0' if not specified). - If the string starts with '+' or '-', the sign is preserved and padding - occurs after the sign. Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - width : int - Desired string length. - padding : str, default "0" - Padding character. Should be one Unicode codepoint. - options : pyarrow.compute.ZeroFillOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ -utf8_zfill = _clone_signature(utf8_zero_fill) - -# ========================= 2.13 String trimming ========================= -@overload -def ascii_ltrim( - strings: _StringScalarT, - /, - characters: str, - *, - options: TrimOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT: ... -@overload -def ascii_ltrim( - strings: _StringArrayT, - /, - characters: str, - *, - options: TrimOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringArrayT: ... -@overload -def ascii_ltrim( - strings: Expression, - /, - characters: str, - *, - options: TrimOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def ascii_ltrim(*args, **kwargs): - """ - Trim leading characters. - - For each string in `strings`, remove any leading characters - from the `characters` option (as given in TrimOptions). - Null values emit null. - Both the `strings` and the `characters` are interpreted as - ASCII; to trim non-ASCII characters, use `utf8_ltrim`. 
- - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - characters : str - Individual characters to be trimmed from the string. - options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -ascii_rtrim = _clone_signature(ascii_ltrim) -""" -Trim trailing characters. - -For each string in `strings`, remove any trailing characters -from the `characters` option (as given in TrimOptions). -Null values emit null. -Both the `strings` and the `characters` are interpreted as -ASCII; to trim non-ASCII characters, use `utf8_rtrim`. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_trim = _clone_signature(ascii_ltrim) -""" -Trim leading and trailing characters. - -For each string in `strings`, remove any leading or trailing characters -from the `characters` option (as given in TrimOptions). -Null values emit null. -Both the `strings` and the `characters` are interpreted as -ASCII; to trim non-ASCII characters, use `utf8_trim`. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_ltrim = _clone_signature(ascii_ltrim) -""" -Trim leading characters. - -For each string in `strings`, remove any leading characters -from the `characters` option (as given in TrimOptions). -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_rtrim = _clone_signature(ascii_ltrim) -""" -Trim trailing characters. - -For each string in `strings`, remove any trailing characters -from the `characters` option (as given in TrimOptions). -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_trim = _clone_signature(ascii_ltrim) -""" -Trim leading and trailing characters. - -For each string in `strings`, remove any leading or trailing characters -from the `characters` option (as given in TrimOptions). -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def ascii_ltrim_whitespace( - strings: _StringScalarT, - /, - *, - options: TrimOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT: ... -@overload -def ascii_ltrim_whitespace( - strings: _StringArrayT, - /, - *, - options: TrimOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringArrayT: ... -@overload -def ascii_ltrim_whitespace( - strings: Expression, - /, - *, - options: TrimOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def ascii_ltrim_whitespace(*args, **kwargs): - """ - Trim leading ASCII whitespace characters. - - For each string in `strings`, emit a string with leading ASCII whitespace - characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode - whitespace characters. Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim trailing ASCII whitespace characters. - -For each string in `strings`, emit a string with trailing ASCII whitespace -characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode -whitespace characters. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim leading and trailing ASCII whitespace characters. - -For each string in `strings`, emit a string with leading and trailing ASCII -whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode -whitespace characters. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_ltrim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim leading whitespace characters. - -For each string in `strings`, emit a string with leading whitespace -characters removed, where whitespace characters are defined by the Unicode -standard. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim trailing whitespace characters. - -For each string in `strings`, emit a string with trailing whitespace -characters removed, where whitespace characters are defined by the Unicode -standard. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim leading and trailing whitespace characters. - -For each string in `strings`, emit a string with leading and trailing -whitespace characters removed, where whitespace characters are defined -by the Unicode standard. 
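The character-trim and whitespace-trim kernels above can be exercised as follows (illustrative sketch):

```python
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(["--data--", "  padded  "])
pc.ascii_trim(arr, characters="-")    # ["data", "  padded  "]
pc.utf8_ltrim(arr, characters="- ")   # ["data--", "padded  "]
pc.utf8_trim_whitespace(arr)          # ["--data--", "padded"]
```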
Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.14 String splitting ========================= -@overload -def ascii_split_whitespace( - strings: _StringScalarT, - /, - *, - max_splits: int | None = None, - reverse: bool = False, - options: SplitOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[_StringScalarT]: ... -@overload -def ascii_split_whitespace( - strings: lib.Array[lib.Scalar[_DataTypeT]], - /, - *, - max_splits: int | None = None, - reverse: bool = False, - options: SplitOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ... -@overload -def ascii_split_whitespace( - strings: Expression, - /, - *, - max_splits: int | None = None, - reverse: bool = False, - options: SplitOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def ascii_split_whitespace(*args, **kwargs): - """ - Split string according to any ASCII whitespace. - - Split each string according any non-zero length sequence of ASCII - whitespace characters. The output for each string input is a list - of strings. - - The maximum number of splits and direction of splitting - (forward, reverse) can optionally be defined in SplitOptions. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). - reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. - options : pyarrow.compute.SplitOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def split_pattern( - strings: _StringOrBinaryScalarT, - /, - pattern: str, - *, - max_splits: int | None = None, - reverse: bool = False, - options: SplitOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[_StringOrBinaryScalarT]: ... -@overload -def split_pattern( - strings: lib.Array[lib.Scalar[_DataTypeT]], - /, - pattern: str, - *, - max_splits: int | None = None, - reverse: bool = False, - options: SplitPatternOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ... -@overload -def split_pattern( - strings: Expression, - /, - pattern: str, - *, - max_splits: int | None = None, - reverse: bool = False, - options: SplitPatternOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def split_pattern(*args, **kwargs): - """ - Split string according to separator. - - Split each string according to the exact `pattern` defined in - SplitPatternOptions. The output for each string input is a list - of strings. - - The maximum number of splits and direction of splitting - (forward, reverse) can optionally be defined in SplitPatternOptions. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - String pattern to split on. - max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). 
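A small sketch of the splitting kernels above (illustrative inputs; `max_splits` limits the number of split operations, not the number of output elements):

```python
import pyarrow as pa
import pyarrow.compute as pc

csv_like = pa.array(["a,b,c", "x,,y"])
pc.split_pattern(csv_like, pattern=",")                # [["a", "b", "c"], ["x", "", "y"]]
pc.split_pattern(csv_like, pattern=",", max_splits=1)  # [["a", "b,c"], ["x", ",y"]]

pc.ascii_split_whitespace(pa.array(["one  two three"]))
# [["one", "two", "three"]]  (a run of whitespace acts as a single separator)
```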
- reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. - options : pyarrow.compute.SplitPatternOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -split_pattern_regex = _clone_signature(split_pattern) -""" -Split string according to regex pattern. - -Split each string according to the regex `pattern` defined in -SplitPatternOptions. The output for each string input is a list -of strings. - -The maximum number of splits and direction of splitting -(forward, reverse) can optionally be defined in SplitPatternOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - String pattern to split on. -max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). -reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. -options : pyarrow.compute.SplitPatternOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_split_whitespace = _clone_signature(ascii_split_whitespace) -""" -Split string according to any Unicode whitespace. - -Split each string according any non-zero length sequence of Unicode -whitespace characters. The output for each string input is a list -of strings. - -The maximum number of splits and direction of splitting -(forward, reverse) can optionally be defined in SplitOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). -reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. -options : pyarrow.compute.SplitOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.15 String component extraction ========================= -@overload -def extract_regex( - strings: StringOrBinaryScalar, - /, - pattern: str, - *, - options: ExtractRegexOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructScalar: ... -@overload -def extract_regex( - strings: StringOrBinaryArray, - /, - pattern: str, - *, - options: ExtractRegexOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructArray: ... -@overload -def extract_regex( - strings: Expression, - /, - pattern: str, - *, - options: ExtractRegexOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def extract_regex(*args, **kwargs): - """ - Extract substrings captured by a regex pattern. - - For each string in `strings`, match the regular expression and, if - successful, emit a struct with field names and values coming from the - regular expression's named capture groups. If the input is null or the - regular expression fails matching, a null output value is emitted. - - Regular expression matching is done using the Google RE2 library. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. 
- pattern : str - Regular expression with named capture fields. - options : pyarrow.compute.ExtractRegexOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def extract_regex_span( - strings: StringOrBinaryArray, - /, - pattern: str, - *, - options: ExtractRegexSpanOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructArray: - """ - Extract string spans captured by a regex pattern - - For each string in `strings`, match the regular expression and, if - successful, emit a struct with field names and values coming from the - regular expression's named capture groups. Each struct field value - will be a fixed_size_list(offset_type, 2) where offset_type is int32 - or int64, depending on the input string type. The two elements in - each fixed-size list are the index and the length of the substring - matched by the corresponding named capture group. - - If the input is null or the regular expression fails matching, - a null output value is emitted. - - Regular expression matching is done using the Google RE2 library. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - Regular expression with named capture fields. - options : pyarrow.compute.ExtractRegexSpanOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.16 String join ========================= -def binary_join( - strings: ArrayOrChunkedArray[lib.ListType[lib.BinaryType]], separator, /, *, memory_pool: lib.MemoryPool | None = None, -) -> StringArray | BinaryArray: ... -""" -Join a list of strings together with a separator. - -Concatenate the strings in `list`. The `separator` is inserted -between each given string. -Any null input and any null `list` element emits a null output. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -separator : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def binary_join_element_wise( - *strings: _StringOrBinaryScalarT | str, - null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", - null_replacement: str = "", - options: JoinOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT | _BinaryScalarT: ... -@overload -def binary_join_element_wise( - *strings: _StringOrBinaryArrayT | Sequence[str | None], - null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", - null_replacement: str = "", - options: JoinOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryArrayT: ... -@overload -def binary_join_element_wise( - *strings: Expression, - null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", - null_replacement: str = "", - options: JoinOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def binary_join_element_wise(*args, **kwargs): - """ - Join string arguments together, with the last argument as separator. - - Concatenate the `strings` except for the last one. The last argument - in `strings` is inserted between each given string. - Any null separator element emits a null output. 
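A hedged sketch of regex extraction and joining (illustrative values; `extract_regex` needs named capture groups, and the last argument to `binary_join_element_wise` is the separator):

```python
import pyarrow as pa
import pyarrow.compute as pc

logs = pa.array(["key=a", "key=b", "no match"])
pc.extract_regex(logs, pattern=r"key=(?P<value>\w+)")
# [{"value": "a"}, {"value": "b"}, null]

pc.binary_join(pa.array([["usr", "local"], ["tmp"]]), "/")
# ["usr/local", "tmp"]

pc.binary_join_element_wise(pa.array(["2024", "2025"]), pa.array(["01", "12"]), "-")
# ["2024-01", "2025-12"]
```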
Null elements either - emit a null (the default), are skipped, or replaced with a given string. - - Parameters - ---------- - *strings : Array-like or scalar-like - Argument to compute function. - null_handling : str, default "emit_null" - How to handle null values in the inputs. - Accepted values are "emit_null", "skip", "replace". - null_replacement : str, default "" - Replacement string to emit for null inputs if `null_handling` - is "replace". - options : pyarrow.compute.JoinOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.17 String Slicing ========================= -@overload -def binary_slice( - strings: _BinaryScalarT, - /, - start: int, - stop: int | None = None, - step: int = 1, - *, - options: SliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _BinaryScalarT: ... -@overload -def binary_slice( - strings: _BinaryArrayT, - /, - start: int, - stop: int | None = None, - step: int = 1, - *, - options: SliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _BinaryArrayT: ... -@overload -def binary_slice( - strings: Expression, - /, - start: int, - stop: int | None = None, - step: int = 1, - *, - options: SliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def binary_slice(*args, **kwargs): - """ - Slice binary string. - - For each binary string in `strings`, emit the substring defined by - (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is - inclusive and `stop` is exclusive. All three values are measured in - bytes. - If `step` is negative, the string will be advanced in reversed order. - An error is raised if `step` is zero. - Null inputs emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing at (inclusive). - stop : int or None, default None - If given, index to stop slicing at (exclusive). - If not given, slicing will stop at the end. - step : int, default 1 - Slice step. - options : pyarrow.compute.SliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def utf8_slice_codeunits( - strings: _StringScalarT, - /, - start: int, - stop: int | None = None, - step: int = 1, - *, - options: SliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT: ... -@overload -def utf8_slice_codeunits( - strings: _StringArrayT, - /, - start: int, - stop: int | None = None, - step: int = 1, - *, - options: SliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringArrayT: ... -@overload -def utf8_slice_codeunits( - strings: Expression, - /, - start: int, - stop: int | None = None, - step: int = 1, - *, - options: SliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def utf8_slice_codeunits(*args, **kwargs): - """ - Slice string. - - For each string in `strings`, emit the substring defined by - (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is - inclusive and `stop` is exclusive. All three values are measured in - UTF8 codeunits. - If `step` is negative, the string will be advanced in reversed order. - An error is raised if `step` is zero. - Null inputs emit null. 
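A short sketch of the slicing kernels: `binary_slice` counts bytes while `utf8_slice_codeunits` counts UTF8 codeunits (illustrative values):

```python
import pyarrow as pa
import pyarrow.compute as pc

words = pa.array(["pyarrow", "compute"])
pc.utf8_slice_codeunits(words, start=0, stop=3)   # ["pya", "com"]
pc.utf8_slice_codeunits(words, start=-3)          # ["row", "ute"]

pc.binary_slice(pa.array([b"\x00\x01\x02\x03"]), start=1, stop=3)
# [b"\x01\x02"]
```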
- - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing at (inclusive). - stop : int or None, default None - If given, index to stop slicing at (exclusive). - If not given, slicing will stop at the end. - step : int, default 1 - Slice step. - options : pyarrow.compute.SliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.18 Containment tests ========================= -@overload -def count_substring( - strings: lib.Scalar[lib.StringType | lib.BinaryType | lib.LargeStringType | lib.LargeBinaryType], - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Scalar | lib.Int64Scalar: ... -@overload -def count_substring( - strings: lib.Array[lib.Scalar[lib.StringType | lib.BinaryType | lib.LargeStringType | lib.LargeBinaryType]] - | lib.ChunkedArray[lib.Scalar[lib.StringType | lib.BinaryType | lib.LargeStringType | lib.LargeBinaryType]], - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Array | lib.Int64Array: ... -@overload -def count_substring( - strings: Expression, - /, - pattern: Any, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def count_substring(*args, **kwargs): - """ - Count occurrences of substring. - - For each string in `strings`, emit the number of occurrences of the given - literal pattern. - Null inputs emit null. The pattern must be given in MatchSubstringOptions. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - Substring pattern to look for inside input values. - ignore_case : bool, default False - Whether to perform a case-insensitive match. - options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -count_substring_regex = _clone_signature(count_substring) -""" -Count occurrences of substring. - -For each string in `strings`, emit the number of occurrences of the given -regular expression pattern. -Null inputs emit null. The pattern must be given in MatchSubstringOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def ends_with( - strings: StringScalar | BinaryScalar, - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar: ... -@overload -def ends_with( - strings: StringArray | BinaryArray, - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... 
-@overload -def ends_with( - strings: Expression, - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def ends_with(*args, **kwargs): - """ - Check if strings end with a literal pattern. - - For each string in `strings`, emit true iff it ends with a given pattern. - The pattern must be given in MatchSubstringOptions. - If ignore_case is set, only simple case folding is performed. - - Null inputs emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - Substring pattern to look for inside input values. - ignore_case : bool, default False - Whether to perform a case-insensitive match. - options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -find_substring = _clone_signature(count_substring) -""" -Find first occurrence of substring. - -For each string in `strings`, emit the index in bytes of the first occurrence -of the given literal pattern, or -1 if not found. -Null inputs emit null. The pattern must be given in MatchSubstringOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -find_substring_regex = _clone_signature(count_substring) -""" -Find location of first match of regex pattern. - -For each string in `strings`, emit the index in bytes of the first occurrence -of the given literal pattern, or -1 if not found. -Null inputs emit null. The pattern must be given in MatchSubstringOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def index_in( - values: lib.Scalar, - /, - value_set: lib.Array | lib.ChunkedArray, - *, - skip_nulls: bool = False, - options: SetLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Scalar: ... -@overload -def index_in( - values: lib.Array | lib.ChunkedArray, - /, - value_set: lib.Array | lib.ChunkedArray, - *, - skip_nulls: bool = False, - options: SetLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Array: ... -@overload -def index_in( - values: Expression, - /, - value_set: lib.Array | lib.ChunkedArray, - *, - skip_nulls: bool = False, - options: SetLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def index_in(*args, **kwargs): - """ - Return index of each element in a set of values. - - For each element in `values`, return its index in a given set of - values, or null if it is not found there. - The set of values to look for must be given in SetLookupOptions. 
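A minimal sketch of the substring containment kernels above (illustrative inputs; `find_substring` reports -1 when the pattern is absent):

```python
import pyarrow as pa
import pyarrow.compute as pc

fruit = pa.array(["banana", "apple", None])
pc.count_substring(fruit, pattern="an")   # [2, 0, null]
pc.find_substring(fruit, pattern="an")    # [1, -1, null]
pc.ends_with(fruit, pattern="e")          # [false, true, null]
```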
- By default, nulls are matched against the value set, this can be - changed in SetLookupOptions. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - value_set : Array - Set of values to look for in the input. - skip_nulls : bool, default False - If False, nulls in the input are matched in the value_set just - like regular values. - If True, nulls in the input always fail matching. - options : pyarrow.compute.SetLookupOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def is_in( - values: lib.Scalar, - /, - value_set: lib.Array | lib.ChunkedArray, - *, - skip_nulls: bool = False, - options: SetLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar: ... -@overload -def is_in( - values: lib.Array | lib.ChunkedArray, - /, - value_set: lib.Array | lib.ChunkedArray, - *, - skip_nulls: bool = False, - options: SetLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def is_in( - values: Expression, - /, - value_set: lib.Array | lib.ChunkedArray, - *, - skip_nulls: bool = False, - options: SetLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def is_in(*args, **kwargs): - """ - Find each element in a set of values. - - For each element in `values`, return true if it is found in a given - set of values, false otherwise. - The set of values to look for must be given in SetLookupOptions. - By default, nulls are matched against the value set, this can be - changed in SetLookupOptions. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - value_set : Array - Set of values to look for in the input. - skip_nulls : bool, default False - If False, nulls in the input are matched in the value_set just - like regular values. - If True, nulls in the input always fail matching. - options : pyarrow.compute.SetLookupOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -match_like = _clone_signature(ends_with) -""" -Match strings against SQL-style LIKE pattern. - -For each string in `strings`, emit true iff it matches a given pattern -at any position. '%' will match any number of characters, '_' will -match exactly one character, and any other character matches itself. -To match a literal '%', '_', or '\', precede the character with a backslash. -Null inputs emit null. The pattern must be given in MatchSubstringOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -match_substring = _clone_signature(ends_with) -""" -Match strings against literal pattern. - -For each string in `strings`, emit true iff it contains a given pattern. -Null inputs emit null. -The pattern must be given in MatchSubstringOptions. -If ignore_case is set, only simple case folding is performed. 
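A quick sketch of the set-lookup kernels (`index_in` returns positions within `value_set`, `is_in` returns booleans); values are illustrative:

```python
import pyarrow as pa
import pyarrow.compute as pc

values = pa.array(["a", "b", "c", None])
value_set = pa.array(["c", "a"])
pc.index_in(values, value_set=value_set)  # [1, null, 0, null]
pc.is_in(values, value_set=value_set)     # [true, false, true, false]
```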
- -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -match_substring_regex = _clone_signature(ends_with) -""" -Match strings against regex pattern. - -For each string in `strings`, emit true iff it matches a given pattern -at any position. The pattern must be given in MatchSubstringOptions. -If ignore_case is set, only simple case folding is performed. - -Null inputs emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -starts_with = _clone_signature(ends_with) -""" -Check if strings start with a literal pattern. - -For each string in `strings`, emit true iff it starts with a given pattern. -The pattern must be given in MatchSubstringOptions. -If ignore_case is set, only simple case folding is performed. - -Null inputs emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.19 Categorizations ========================= -@overload -def is_finite( - values: NumericScalar | lib.NullScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def is_finite( - values: NumericArray | lib.NullArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanArray: ... -@overload -def is_finite( - values: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def is_finite(*args, **kwargs): - """ - Return true if value is finite. - - For each input value, emit true iff the value is finite - (i.e. neither NaN, inf, nor -inf). - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -is_inf = _clone_signature(is_finite) -""" -Return true if infinity. - -For each input value, emit true iff the value is infinite (inf or -inf). - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -is_nan = _clone_signature(is_finite) -""" -Return true if NaN. - -For each input value, emit true iff the value is NaN. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. 
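The pattern-matching kernels above, sketched with illustrative file names:

```python
import pyarrow as pa
import pyarrow.compute as pc

names = pa.array(["flight.py", "FLIGHT.PY", "notes.txt"])
pc.match_substring(names, pattern="flight")                    # [true, false, false]
pc.match_substring(names, pattern="flight", ignore_case=True)  # [true, true, false]
pc.match_like(names, pattern="%.py")                           # [true, false, false]
pc.starts_with(names, pattern="flight", ignore_case=True)      # [true, true, false]
```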
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def is_null( - values: lib.Scalar, - /, - *, - nan_is_null: bool = False, - options: NullOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar: ... -@overload -def is_null( - values: lib.Array | lib.ChunkedArray, - /, - *, - nan_is_null: bool = False, - options: NullOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def is_null( - values: Expression, - /, - *, - nan_is_null: bool = False, - options: NullOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def is_null(*args, **kwargs): - """ - Return true if null (and optionally NaN). - - For each input value, emit true iff the value is null. - True may also be emitted for NaN values by setting the `nan_is_null` flag. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - nan_is_null : bool, default False - Whether floating-point NaN values are considered null. - options : pyarrow.compute.NullOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def is_valid( - values: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def is_valid( - values: lib.Array | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanArray: ... -@overload -def is_valid( - values: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def is_valid(*args, **kwargs): - """ - Return true if non-null. - - For each input value, emit true iff the value is valid (i.e. non-null). - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -true_unless_null = _clone_signature(is_valid) -""" -Return true if non-null, else return null. - -For each input value, emit true iff the value -is valid (non-null), otherwise emit null. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.20 Selecting / multiplexing ========================= -def case_when(cond, /, *cases, memory_pool: lib.MemoryPool | None = None): - """ - Choose values based on multiple conditions. - - `cond` must be a struct of Boolean values. `cases` can be a mix - of scalar and array arguments (of any type, but all must be the - same type or castable to a common type), with either exactly one - datum per child of `cond`, or one more `cases` than children of - `cond` (in which case we have an "else" value). - - Each row of the output will be the corresponding value of the - first datum in `cases` for which the corresponding child of `cond` - is true, or otherwise the "else" value (if given), or null. - - Essentially, this implements a switch-case or if-else, if-else... statement. - - Parameters - ---------- - cond : Array-like or scalar-like - Argument to compute function. - *cases : Array-like or scalar-like - Argument to compute function. 
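A compact sketch of the categorization and validity kernels; note that `is_finite`/`is_nan` propagate nulls, while `is_null`/`is_valid` always emit non-null booleans:

```python
import pyarrow as pa
import pyarrow.compute as pc

vals = pa.array([1.5, float("nan"), float("inf"), None])
pc.is_finite(vals)                  # [true, false, false, null]
pc.is_nan(vals)                     # [false, true, false, null]
pc.is_null(vals)                    # [false, false, false, true]
pc.is_null(vals, nan_is_null=True)  # [false, true, false, true]
pc.is_valid(vals)                   # [true, true, true, false]
```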
- memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): - """ - Choose values from several arrays. - - For each row, the value of the first argument is used as a 0-based index - into the list of `values` arrays (i.e. index 0 selects the first of the - `values` arrays). The output value is the corresponding value of the - selected argument. - - If an index is null, the output will be null. - - Parameters - ---------- - indices : Array-like or scalar-like - Argument to compute function. - *values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def coalesce( - *values: _ScalarOrArrayT | Expression, memory_pool: lib.MemoryPool | None = None -) -> _ScalarOrArrayT: - """ - Select the first non-null value. - - Each row of the output will be the value from the first corresponding input - for which the value is not null. If all inputs are null in a row, the output - will be null. - - Parameters - ---------- - *values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -fill_null = coalesce -"""Replace each null element in values with a corresponding -element from fill_value. - -If fill_value is scalar-like, then every null element in values -will be replaced with fill_value. If fill_value is array-like, -then the i-th element in values will be replaced with the i-th -element in fill_value. - -The fill_value's type must be the same as that of values, or it -must be able to be implicitly casted to the array's type. - -This is an alias for :func:`coalesce`. - -Parameters ----------- -values : Array, ChunkedArray, or Scalar-like object - Each null element is replaced with the corresponding value - from fill_value. -fill_value : Array, ChunkedArray, or Scalar-like object - If not same type as values, will attempt to cast. - -Returns -------- -result : depends on inputs - Values with all null elements replaced - -Examples --------- ->>> import pyarrow as pa ->>> arr = pa.array([1, 2, None, 3], type=pa.int8()) ->>> fill_value = pa.scalar(5, type=pa.int8()) ->>> arr.fill_null(fill_value) - -[ - 1, - 2, - 5, - 3 -] ->>> arr = pa.array([1, 2, None, 4, None]) ->>> arr.fill_null(pa.array([10, 20, 30, 40, 50])) - -[ - 1, - 2, - 30, - 4, - 50 -] -""" - -def if_else( - cond: ArrayLike | ScalarLike, - left: ArrayLike | ScalarLike, - right: ArrayLike | ScalarLike, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> ArrayLike | ScalarLike: - """ - Choose values based on a condition. - - `cond` must be a Boolean scalar/ array. - `left` or `right` must be of the same type scalar/ array. - `null` values in `cond` will be promoted to the output. - - Parameters - ---------- - cond : Array-like or scalar-like - Argument to compute function. - left : Array-like or scalar-like - Argument to compute function. - right : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.21 Structural transforms ========================= - -@overload -def list_value_length( - lists: _ListArray[Any], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Array: ... 
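A small sketch of the selecting/multiplexing kernels above (illustrative values):

```python
import pyarrow as pa
import pyarrow.compute as pc

nums = pa.array([1, None, 3, None])
backup = pa.array([10, 20, 30, None])

pc.coalesce(nums, backup)                   # [1, 20, 3, null]
pc.fill_null(nums, 0)                       # [1, 0, 3, 0]
pc.if_else(pc.is_null(nums), backup, nums)  # [1, 20, 3, null]
```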
-@overload -def list_value_length( - lists: _LargeListArray[Any], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def list_value_length( - lists: ListArray[Any], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Array | lib.Int64Array: ... -@overload -def list_value_length( - lists: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def list_value_length(*args, **kwargs): - """ - Compute list lengths. - - `lists` must have a list-like type. - For each non-null value in `lists`, its length is emitted. - Null values emit a null in the output. - - Parameters - ---------- - lists : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def make_struct( - *args: lib.Scalar | lib._AsPyType, - field_names: list[str] | tuple[str, ...] = (), - field_nullability: bool | None = None, - field_metadata: list[lib.KeyValueMetadata] | None = None, - options: MakeStructOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructScalar: ... -@overload -def make_struct( - *args: lib.Array | lib.ChunkedArray | list[lib._AsPyType], - field_names: list[str] | tuple[str, ...] = (), - field_nullability: bool | None = None, - field_metadata: list[lib.KeyValueMetadata] | None = None, - options: MakeStructOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructArray: ... -@overload -def make_struct( - *args: Expression, - field_names: list[str] | tuple[str, ...] = (), - field_nullability: bool | None = None, - field_metadata: list[lib.KeyValueMetadata] | None = None, - options: MakeStructOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def make_struct(*args, **kwargs): - """ - Wrap Arrays into a StructArray. - - Names of the StructArray's fields are - specified through MakeStructOptions. - - Parameters - ---------- - *args : Array-like or scalar-like - Argument to compute function. - field_names : sequence of str - Names of the struct fields to create. - field_nullability : sequence of bool, optional - Nullability information for each struct field. - If omitted, all fields are nullable. - field_metadata : sequence of KeyValueMetadata, optional - Metadata for each struct field. - options : pyarrow.compute.MakeStructOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.22 Conversions ========================= - -def run_end_decode( - array: lib.Array, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: - """ - Decode run-end encoded array. - - Return a decoded version of a run-end encoded input array. - - Parameters - ---------- - array : Array-like - Argument to compute function. - - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - - -def run_end_encode( - array: lib.Array, - /, - run_end_type: lib.Type_INT16 | lib.Type_INT32 | lib.Type_INT64 = lib.Type_INT32, - *, - options: RunEndEncodeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: - """ - Run-end encode array. - - Return a run-end encoded version of the input array. - - Parameters - ---------- - - array : Array-like - Argument to compute function. 
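A brief sketch of the structural kernels `list_value_length` and `make_struct` (illustrative values):

```python
import pyarrow as pa
import pyarrow.compute as pc

lists = pa.array([[1, 2, 3], [], None])
pc.list_value_length(lists)   # [3, 0, null]

names = pa.array(["a", "b"])
scores = pa.array([1.0, 2.0])
pc.make_struct(names, scores, field_names=["name", "score"])
# [{"name": "a", "score": 1.0}, {"name": "b", "score": 2.0}]
```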
- - run_end_type : DataType, default pyarrow.int32() - The data type of the run_ends array. - - Accepted values are pyarrow.{int16(), int32(), int64()}. - - options : pyarrow.compute.RunEndEncodeOptions, optional - Alternative way of passing options. - - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def ceil_temporal( - timestamps: _TemporalScalarT, - /, - multiple: int = 1, - unit: Literal[ - "year", - "quarter", - "month", - "week", - "day", - "hour", - "minute", - "second", - "millisecond", - "microsecond", - "nanosecond", - ] = "day", - *, - week_starts_monday: bool = True, - ceil_is_strictly_greater: bool = False, - calendar_based_origin: bool = False, - options: RoundTemporalOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _TemporalScalarT: ... -@overload -def ceil_temporal( - timestamps: _TemporalArrayT, - /, - multiple: int = 1, - unit: Literal[ - "year", - "quarter", - "month", - "week", - "day", - "hour", - "minute", - "second", - "millisecond", - "microsecond", - "nanosecond", - ] = "day", - *, - week_starts_monday: bool = True, - ceil_is_strictly_greater: bool = False, - calendar_based_origin: bool = False, - options: RoundTemporalOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _TemporalArrayT: ... -@overload -def ceil_temporal( - timestamps: Expression, - /, - multiple: int = 1, - unit: Literal[ - "year", - "quarter", - "month", - "week", - "day", - "hour", - "minute", - "second", - "millisecond", - "microsecond", - "nanosecond", - ] = "day", - *, - week_starts_monday: bool = True, - ceil_is_strictly_greater: bool = False, - calendar_based_origin: bool = False, - options: RoundTemporalOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def ceil_temporal(*args, **kwargs): - """ - Round temporal values up to nearest multiple of specified time unit. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - timestamps : Array-like or scalar-like - Argument to compute function. - multiple : int, default 1 - Number of units to round to. - unit : str, default "day" - The unit in which `multiple` is expressed. - Accepted values are "year", "quarter", "month", "week", "day", - "hour", "minute", "second", "millisecond", "microsecond", - "nanosecond". - week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. - ceil_is_strictly_greater : bool, default False - If True, ceil returns a rounded value that is strictly greater than the - input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would - yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 - if set to False. - This applies to the ceil_temporal function only. - calendar_based_origin : bool, default False - By default, the origin is 1970-01-01T00:00:00. By setting this to True, - rounding origin will be beginning of one less precise calendar unit. - E.g.: rounding to hours will use beginning of day as origin. - - By default time is rounded to a multiple of units since - 1970-01-01T00:00:00. By setting calendar_based_origin to true, - time will be rounded to number of units since the last greater - calendar unit. - For example: rounding to multiple of days since the beginning of the - month or to hours since the beginning of the day. 
- Exceptions: week and quarter are not used as greater units, - therefore days will be rounded to the beginning of the month not - week. Greater unit of week is a year. - Note that ceiling and rounding might change sorting order of an array - near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to - 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to - YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will - ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the - order of an already ordered array. - options : pyarrow.compute.RoundTemporalOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -floor_temporal = _clone_signature(ceil_temporal) -""" -Round temporal values down to nearest multiple of specified time unit. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -timestamps : Array-like or scalar-like - Argument to compute function. -multiple : int, default 1 - Number of units to round to. -unit : str, default "day" - The unit in which `multiple` is expressed. - Accepted values are "year", "quarter", "month", "week", "day", - "hour", "minute", "second", "millisecond", "microsecond", - "nanosecond". -week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. -ceil_is_strictly_greater : bool, default False - If True, ceil returns a rounded value that is strictly greater than the - input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would - yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 - if set to False. - This applies to the ceil_temporal function only. -calendar_based_origin : bool, default False - By default, the origin is 1970-01-01T00:00:00. By setting this to True, - rounding origin will be beginning of one less precise calendar unit. - E.g.: rounding to hours will use beginning of day as origin. - - By default time is rounded to a multiple of units since - 1970-01-01T00:00:00. By setting calendar_based_origin to true, - time will be rounded to number of units since the last greater - calendar unit. - For example: rounding to multiple of days since the beginning of the - month or to hours since the beginning of the day. - Exceptions: week and quarter are not used as greater units, - therefore days will be rounded to the beginning of the month not - week. Greater unit of week is a year. - Note that ceiling and rounding might change sorting order of an array - near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to - 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to - YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will - ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the - order of an already ordered array. -options : pyarrow.compute.RoundTemporalOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -round_temporal = _clone_signature(ceil_temporal) -""" -Round temporal values to the nearest multiple of specified time unit. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -timestamps : Array-like or scalar-like - Argument to compute function. 
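A hedged sketch of the temporal rounding kernels with default options, using timezone-naive, illustrative timestamps:

```python
from datetime import datetime
import pyarrow as pa
import pyarrow.compute as pc

ts = pa.array([datetime(2024, 3, 1, 10, 37), datetime(2024, 3, 1, 23, 59)])
pc.floor_temporal(ts, multiple=15, unit="minute")  # [2024-03-01 10:30:00, 2024-03-01 23:45:00]
pc.ceil_temporal(ts, multiple=1, unit="hour")      # [2024-03-01 11:00:00, 2024-03-02 00:00:00]
pc.round_temporal(ts, unit="day")                  # [2024-03-01 00:00:00, 2024-03-02 00:00:00]
```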
-multiple : int, default 1 - Number of units to round to. -unit : str, default "day" - The unit in which `multiple` is expressed. - Accepted values are "year", "quarter", "month", "week", "day", - "hour", "minute", "second", "millisecond", "microsecond", - "nanosecond". -week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. -ceil_is_strictly_greater : bool, default False - If True, ceil returns a rounded value that is strictly greater than the - input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would - yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 - if set to False. - This applies to the ceil_temporal function only. -calendar_based_origin : bool, default False - By default, the origin is 1970-01-01T00:00:00. By setting this to True, - rounding origin will be beginning of one less precise calendar unit. - E.g.: rounding to hours will use beginning of day as origin. - - By default time is rounded to a multiple of units since - 1970-01-01T00:00:00. By setting calendar_based_origin to true, - time will be rounded to number of units since the last greater - calendar unit. - For example: rounding to multiple of days since the beginning of the - month or to hours since the beginning of the day. - Exceptions: week and quarter are not used as greater units, - therefore days will be rounded to the beginning of the month not - week. Greater unit of week is a year. - Note that ceiling and rounding might change sorting order of an array - near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to - 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to - YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will - ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the - order of an already ordered array. -options : pyarrow.compute.RoundTemporalOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def cast( - arr: lib.Scalar, - target_type: _DataTypeT | None = None, - safe: bool | None = None, - options: CastOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Scalar[_DataTypeT]: ... -@overload -def cast( - arr: lib.Array, - target_type: _DataTypeT | str | None = None, - safe: bool | None = None, - options: CastOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Array[lib.Scalar[_DataTypeT]]: ... -@overload -def cast( - arr: lib.ChunkedArray, - target_type: _DataTypeT | None = None, - safe: bool | None = None, - options: CastOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... -def cast(*args, **kwargs): - """ - Cast array values to another data type. Can also be invoked as an array - instance method. - - Parameters - ---------- - arr : Array-like - target_type : DataType or str - Type to cast to - safe : bool, default True - Check for overflows or other unsafe conversions - options : CastOptions, default None - Additional checks pass by CastOptions - memory_pool : MemoryPool, optional - memory pool to use for allocations during function execution. 
- - Examples - -------- - >>> from datetime import datetime - >>> import pyarrow as pa - >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)]) - >>> arr.type - TimestampType(timestamp[us]) - - You can use ``pyarrow.DataType`` objects to specify the target type: - - >>> cast(arr, pa.timestamp("ms")) - - [ - 2010-01-01 00:00:00.000, - 2015-01-01 00:00:00.000 - ] - - >>> cast(arr, pa.timestamp("ms")).type - TimestampType(timestamp[ms]) - - Alternatively, it is also supported to use the string aliases for these - types: - - >>> arr.cast("timestamp[ms]") - - [ - 2010-01-01 00:00:00.000, - 2015-01-01 00:00:00.000 - ] - >>> arr.cast("timestamp[ms]").type - TimestampType(timestamp[ms]) - - Returns - ------- - casted : Array - The cast result as a new Array - """ - -@overload -def strftime( - timestamps: _ZonedTimestampScalarT | _ZonelessTimestampScalarT, - /, - format: str = "%Y-%m-%dT%H:%M:%S", - locale: str = "C", - *, - options: StrftimeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StringScalar: ... -@overload -def strftime( - timestamps: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, - /, - format: str = "%Y-%m-%dT%H:%M:%S", - locale: str = "C", - *, - options: StrftimeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StringArray: ... -@overload -def strftime( - timestamps: Expression, - /, - format: str = "%Y-%m-%dT%H:%M:%S", - locale: str = "C", - *, - options: StrftimeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def strftime(*args, **kwargs): - """ - Format temporal values according to a format string. - - For each input value, emit a formatted string. - The time format string and locale can be set using StrftimeOptions. - The output precision of the "%S" (seconds) format code depends on - the input time precision: it is an integer for timestamps with - second precision, a real number with the required number of fractional - digits for higher precisions. - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database, or if the specified locale - does not exist on this system. - - Parameters - ---------- - timestamps : Array-like or scalar-like - Argument to compute function. - format : str, default "%Y-%m-%dT%H:%M:%S" - Pattern for formatting input values. - locale : str, default "C" - Locale to use for locale-specific format specifiers. - options : pyarrow.compute.StrftimeOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def strptime( - strings: StringScalar, - /, - format: str, - unit: Literal["s", "ms", "us", "ns"], - error_is_null: bool = False, - *, - options: StrptimeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.TimestampScalar: ... -@overload -def strptime( - strings: StringArray, - /, - format: str, - unit: Literal["s", "ms", "us", "ns"], - error_is_null: bool = False, - *, - options: StrptimeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.TimestampArray: ... -@overload -def strptime( - strings: Expression, - /, - format: str, - unit: Literal["s", "ms", "us", "ns"], - error_is_null: bool = False, - *, - options: StrptimeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def strptime(*args, **kwargs): - """ - Parse timestamps. 
- - For each string in `strings`, parse it as a timestamp. - The timestamp unit and the expected string pattern must be given - in StrptimeOptions. Null inputs emit null. If a non-null string - fails parsing, an error is returned by default. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - format : str - Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". - Note that the semantics of the format follow the C/C++ strptime, not the Python one. - There are differences in behavior, for example how the "%y" placeholder - handles years with less than four digits. - unit : str - Timestamp unit of the output. - Accepted values are "s", "ms", "us", "ns". - error_is_null : boolean, default False - Return null on parsing errors if true or raise if false. - options : pyarrow.compute.StrptimeOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.23 Temporal component extraction ========================= -@overload -def day( - values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Scalar: ... -@overload -def day( - values: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Array: ... -@overload -def day(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def day(*args, **kwargs): - """ - Extract day number. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def day_of_week( - values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT, - /, - *, - count_from_zero: bool = True, - week_start: int = 1, - options: DayOfWeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... -@overload -def day_of_week( - values: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, - /, - *, - count_from_zero: bool = True, - week_start: int = 1, - options: DayOfWeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def day_of_week( - values: Expression, - /, - *, - count_from_zero: bool = True, - week_start: int = 1, - options: DayOfWeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def day_of_week(*args, **kwargs): - """ - Extract day of the week number. - - By default, the week starts on Monday represented by 0 and ends on Sunday - represented by 6. - `DayOfWeekOptions.week_start` can be used to set another starting day using - the ISO numbering convention (1=start week on Monday, 7=start week on Sunday). - Day numbers can start at 0 or 1 based on `DayOfWeekOptions.count_from_zero`. - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - count_from_zero : bool, default True - If True, number days from 0, otherwise from 1. - week_start : int, default 1 - Which day does the week start with (Monday=1, Sunday=7). 
- How this value is numbered is unaffected by `count_from_zero`. - options : pyarrow.compute.DayOfWeekOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -day_of_year = _clone_signature(day) -""" -Extract day of year number. - -January 1st maps to day number 1, February 1st to 32, etc. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def hour( - values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT | lib.Time32Scalar[Any] | lib.Time64Scalar[Any], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... -@overload -def hour( - values: _ZonedTimestampArrayT - | _ZonelessTimestampArrayT - | lib.Time32Array[Any] - | lib.Time64Array[Any] - | lib.ChunkedArray[lib.Time32Scalar[Any]] - | lib.ChunkedArray[lib.Time64Scalar[Any]], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def hour( - values: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def hour(*args, **kwargs): - """ - Extract hour value. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def is_dst( - values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def is_dst( - values: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def is_dst(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def is_dst(*args, **kwargs): - """ - Extracts if currently observing daylight savings. - - IsDaylightSavings returns true if a timestamp has a daylight saving - offset in the given timezone. - Null values emit null. - An error is returned if the values do not have a defined timezone. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def iso_week( - values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Scalar: ... -@overload -def iso_week( - values: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def iso_week( - values: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def iso_week(*args, **kwargs): - """ - Extract ISO week of year number. - - First ISO week has the majority (4 or more) of its days in January. - ISO week starts on Monday. The week number starts with 1 and can run - up to 53. - Null values emit null. 
- An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -iso_year = _clone_signature(iso_week) -""" -Extract ISO year number. - -First week of an ISO year has the majority (4 or more) of its days in January. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def iso_calendar( - values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.StructScalar: ... -@overload -def iso_calendar( - values: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructArray: ... -@overload -def iso_calendar( - values: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def iso_calendar(*args, **kwargs): - """ - Extract (ISO year, ISO week, ISO day of week) struct. - - ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7. - Null values emit null. An error is returned if the values have a defined - timezone, but it cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def is_leap_year( - values: _ZonedTimestampScalarT | _ZonelessTimestampScalarT | lib.Date32Scalar | lib.Date64Scalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar: ... -@overload -def is_leap_year( - values: _ZonedTimestampArrayT - | _ZonelessTimestampArrayT - | lib.Date32Array - | lib.Date64Array - | lib.ChunkedArray[lib.TimestampScalar] - | lib.ChunkedArray[lib.Date32Scalar] - | lib.ChunkedArray[lib.Date64Scalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def is_leap_year( - values: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def is_leap_year(*args, **kwargs): - """ - Extract if year is a leap year. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -microsecond = _clone_signature(iso_week) -""" -Extract microsecond values. - -Microsecond returns number of microseconds since the last full millisecond. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -millisecond = _clone_signature(iso_week) -""" -Extract millisecond values. 
- -Millisecond returns number of milliseconds since the last full second. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -minute = _clone_signature(iso_week) -""" -Extract minute values. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -month = _clone_signature(day_of_week) -""" -Extract month number. - -Month is encoded as January=1, December=12. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -nanosecond = _clone_signature(hour) -""" -Extract nanosecond values. - -Nanosecond returns number of nanoseconds since the last full microsecond. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -quarter = _clone_signature(day_of_week) -""" -Extract quarter of year number. - -First quarter maps to 1 and fourth quarter maps to 4. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -second = _clone_signature(hour) -""" -Extract second values. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -subsecond = _clone_signature(hour) -""" -Extract subsecond values. - -Subsecond returns the fraction of a second since the last full second. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -us_week = _clone_signature(iso_week) -""" -Extract US week of year number. - -First US week has the majority (4 or more) of its days in January. -US week starts on Monday. The week number starts with 1 and can run -up to 53. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database.
- -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -us_year = _clone_signature(iso_week) -""" -Extract US epidemiological year number. - -First week of US epidemiological year has the majority (4 or more) of -its days in January. Last week of US epidemiological year has the -year's last Wednesday in it. US epidemiological week starts on Sunday. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -year = _clone_signature(iso_week) -""" -Extract year number. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def week( - values: lib.Scalar[lib.TimestampType[Any, Any]], - /, - *, - week_starts_monday: bool = True, - count_from_zero: bool = False, - first_week_is_fully_in_year: bool = False, - options: WeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... -@overload -def week( - values: _ZonedTimestampArrayT | _ZonelessTimestampArrayT, - /, - *, - week_starts_monday: bool = True, - count_from_zero: bool = False, - first_week_is_fully_in_year: bool = False, - options: WeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def week( - values: Expression, - /, - *, - week_starts_monday: bool = True, - count_from_zero: bool = False, - first_week_is_fully_in_year: bool = False, - options: WeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def week(*args, **kwargs): - """ - Extract week of year number. - - First week has the majority (4 or more) of its days in January. - Year can have 52 or 53 weeks. Week numbering can start with 0 or 1 using - DayOfWeekOptions.count_from_zero. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. - count_from_zero : bool, default False - If True, dates at the start of a year that fall into the last week - of the previous year emit 0. - If False, they emit 52 or 53 (the week number of the last week - of the previous year). - first_week_is_fully_in_year : bool, default False - If True, week number 0 is fully in January. - If False, a week that begins on December 29, 30 or 31 is considered - to be week number 0 of the following year. - options : pyarrow.compute.WeekOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def year_month_day( - values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.StructScalar: ...
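The temporal component extraction stubs in this section map onto existing pyarrow.compute kernels. A minimal usage sketch of how these declarations are exercised (values are illustrative and this snippet is not part of the stub file):

    from datetime import datetime

    import pyarrow as pa
    import pyarrow.compute as pc

    ts = pa.array([datetime(2023, 1, 2, 8, 30), datetime(2023, 12, 31, 23, 59)], pa.timestamp("s"))
    pc.day(ts)           # Int64Array: [2, 31]
    pc.day_of_week(ts)   # Monday counts as 0 with the default options
    pc.iso_calendar(ts)  # StructArray with iso_year, iso_week, iso_day_of_week fields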
-@overload -def year_month_day( - values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.StructArray: ... -@overload -def year_month_day( - values: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def year_month_day(*args, **kwargs): - """ - Extract (year, month, day) struct. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.24 Temporal difference ========================= -def day_time_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): - """ - Compute the number of days and milliseconds between two timestamps. - - Returns the number of days and milliseconds from `start` to `end`. - That is, first the difference in days is computed as if both - timestamps were truncated to the day, then the difference between the times - of the two timestamps is computed as if both times were truncated to the - millisecond. - Null values return null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def days_between( - start, end, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Scalar | lib.Int64Array: - """ - Compute the number of days between two timestamps. - - Returns the number of day boundaries crossed from `start` to `end`. - That is, the difference is calculated as if the timestamps were - truncated to the day. - Null values emit null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -hours_between = _clone_signature(days_between) -""" -Compute the number of hours between two timestamps. - -Returns the number of hour boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the hour. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -microseconds_between = _clone_signature(days_between) -""" -Compute the number of microseconds between two timestamps. - -Returns the number of microsecond boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the microsecond. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -milliseconds_between = _clone_signature(days_between) -""" -Compute the number of milliseconds between two timestamps.
- -Returns the number of millisecond boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the millisecond. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -minutes_between = _clone_signature(days_between) -""" -Compute the number of minutes between two timestamps. - -Returns the number of minute boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the minute. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -def month_day_nano_interval_between( - start, end, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray: - """ - Compute the number of months, days and nanoseconds between two timestamps. - - Returns the number of months, days, and nanoseconds from `start` to `end`. - That is, first the difference in months is computed as if both timestamps - were truncated to the months, then the difference between the days - is computed, and finally the difference between the times of the two - timestamps is computed as if both times were truncated to the nanosecond. - Null values return null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def month_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): - """ - Compute the number of months between two timestamps. - - Returns the number of month boundaries crossed from `start` to `end`. - That is, the difference is calculated as if the timestamps were - truncated to the month. - Null values emit null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -nanoseconds_between = _clone_signature(days_between) -""" -Compute the number of nanoseconds between two timestamps. - -Returns the number of nanosecond boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the nanosecond. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool.
-""" -quarters_between = _clone_signature(days_between) -""" -Compute the number of quarters between two timestamps. - -Returns the number of quarter start boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the quarter. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -seconds_between = _clone_signature(days_between) -""" -Compute the number of seconds between two timestamps. - -Returns the number of second boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the second. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -def weeks_between( - start, - end, - /, - *, - count_from_zero: bool = True, - week_start: int = 1, - options: DayOfWeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar | lib.Int64Array: - """ - Compute the number of weeks between two timestamps. - - Returns the number of week boundaries crossed from `start` to `end`. - That is, the difference is calculated as if the timestamps were - truncated to the week. - Null values emit null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - count_from_zero : bool, default True - If True, number days from 0, otherwise from 1. - week_start : int, default 1 - Which day does the week start with (Monday=1, Sunday=7). - How this value is numbered is unaffected by `count_from_zero`. - options : pyarrow.compute.DayOfWeekOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -years_between = _clone_signature(days_between) -""" -Compute the number of years between two timestamps. - -Returns the number of year boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the year. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.25 Timezone handling ========================= -@overload -def assume_timezone( - timestamps: _ZonelessTimestampScalarT, - /, - timezone: str, - *, - ambiguous: Literal["raise", "earliest", "latest"] = "raise", - nonexistent: Literal["raise", "earliest", "latest"] = "raise", - options: AssumeTimezoneOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _ZonedTimestampScalarT: ... 
-@overload -def assume_timezone( - timestamps: _ZonelessTimestampArrayT, - /, - timezone: str, - *, - ambiguous: Literal["raise", "earliest", "latest"] = "raise", - nonexistent: Literal["raise", "earliest", "latest"] = "raise", - options: AssumeTimezoneOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _ZonedTimestampArrayT: ... -@overload -def assume_timezone( - timestamps: _ZonelessTimestampScalarT, - /, - *, - options: AssumeTimezoneOptions, - memory_pool: lib.MemoryPool | None = None, -) -> _ZonedTimestampScalarT: ... -@overload -def assume_timezone( - timestamps: _ZonelessTimestampArrayT, - /, - *, - options: AssumeTimezoneOptions, - memory_pool: lib.MemoryPool | None = None, -) -> _ZonedTimestampArrayT: ... -@overload -def assume_timezone( - timestamps: Expression, - /, - timezone: str, - *, - ambiguous: Literal["raise", "earliest", "latest"] = "raise", - nonexistent: Literal["raise", "earliest", "latest"] = "raise", - options: AssumeTimezoneOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def assume_timezone(*args, **kwargs): - """ - Convert naive timestamp to timezone-aware timestamp. - - Input timestamps are assumed to be relative to the timezone given in the - `timezone` option. They are converted to UTC-relative timestamps and - the output type has its timezone set to the value of the `timezone` - option. Null values emit null. - This function is meant to be used when an external system produces - "timezone-naive" timestamps which need to be converted to - "timezone-aware" timestamps. An error is returned if the timestamps - already have a defined timezone. - - Parameters - ---------- - timestamps : Array-like or scalar-like - Argument to compute function. - timezone : str - Timezone to assume for the input. - ambiguous : str, default "raise" - How to handle timestamps that are ambiguous in the assumed timezone. - Accepted values are "raise", "earliest", "latest". - nonexistent : str, default "raise" - How to handle timestamps that don't exist in the assumed timezone. - Accepted values are "raise", "earliest", "latest". - options : pyarrow.compute.AssumeTimezoneOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def local_timestamp( - timestamps: _ZonedTimestampScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _ZonelessTimestampScalarT: ... -@overload -def local_timestamp( - timestamps: _ZonedTimestampArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _ZonelessTimestampArrayT: ... -@overload -def local_timestamp( - timestamps: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def local_timestamp(*args, **kwargs): - """ - Convert timestamp to a timezone-naive local time timestamp. - - LocalTimestamp converts timezone-aware timestamp to local timestamp - of the given timestamp's timezone and removes timezone metadata. - Alternative name for this timestamp is also wall clock time. - If input is in UTC or without timezone, then unchanged input values - without timezone metadata are returned. - Null values emit null. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
- """ - -# ========================= 2.26 Random number generation ========================= -def random( - n: int, - *, - initializer: Hashable = "system", - options: RandomOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: - """ - Generate numbers in the range [0, 1). - - Generated values are uniformly-distributed, double-precision - in range [0, 1). Algorithm and seed can be changed via RandomOptions. - - Parameters - ---------- - n : int - Number of values to generate, must be greater than or equal to 0 - initializer : int or str - How to initialize the underlying random generator. - If an integer is given, it is used as a seed. - If "system" is given, the random generator is initialized with - a system-specific source of (hopefully true) randomness. - Other values are invalid. - options : pyarrow.compute.RandomOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 3. Array-wise (“vector”) functions ========================= - -# ========================= 3.1 Cumulative Functions ========================= -@overload -def cumulative_sum( - values: _NumericArrayT, - /, - start: lib.Scalar | int | None = None, - *, - skip_nulls: bool = False, - options: CumulativeSumOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def cumulative_sum( - values: Expression, - /, - start: lib.Scalar | None = None, - *, - skip_nulls: bool = False, - options: CumulativeSumOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def cumulative_sum(*args, **kwargs): - """ - Compute the cumulative sum over a numeric input. - - `values` must be numeric. Return an array/chunked array which is the - cumulative sum computed over `values`. Results will wrap around on - integer overflow. Use function "cumulative_sum_checked" if you want - overflow to return an error. The default start is 0. - - Parameters - ---------- - values : Array-like - Argument to compute function. - start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. - skip_nulls : bool, default False - When false, the first encountered null is propagated. - options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -cumulative_sum_checked = _clone_signature(cumulative_sum) -""" -Compute the cumulative sum over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative sum computed over `values`. This function returns an error -on overflow. For a variant that doesn't fail on overflow, use -function "cumulative_sum". The default start is 0. - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" -cumulative_prod = _clone_signature(cumulative_sum) -""" -Compute the cumulative product over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative product computed over `values`. Results will wrap around on -integer overflow. Use function "cumulative_prod_checked" if you want -overflow to return an error. The default start is 1. - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -cumulative_prod_checked = _clone_signature(cumulative_sum) -""" -Compute the cumulative product over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative product computed over `values`. This function returns an error -on overflow. For a variant that doesn't fail on overflow, use -function "cumulative_prod". The default start is 1. - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -cumulative_max = _clone_signature(cumulative_sum) -""" -Compute the cumulative max over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative max computed over `values`. The default start is the minimum -value of input type (so that any other value will replace the -start as the new maximum). - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -cumulative_min = _clone_signature(cumulative_sum) -""" -Compute the cumulative min over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative min computed over `values`. The default start is the maximum -value of input type (so that any other value will replace the -start as the new minimum). - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -cumulative_mean = _clone_signature(cumulative_sum) -""" -Compute the cumulative mean over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative mean computed over `values`. - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -# ========================= 3.2 Associative transforms ========================= - -@overload -def dictionary_encode( - array: _ScalarOrArrayT, - /, - null_encoding: Literal["mask", "encode"] = "mask", - *, - options=None, - memory_pool: lib.MemoryPool | None = None, -) -> _ScalarOrArrayT: ... -@overload -def dictionary_encode( - array: Expression, - /, - null_encoding: Literal["mask", "encode"] = "mask", - *, - options=None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def dictionary_decode(array: _ScalarOrArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ScalarOrArrayT: ... -@overload -def dictionary_decode(array: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def dictionary_decode(*args, **kwargs): - """ - Decodes a DictionaryArray to an Array. - - Return a plain-encoded version of the array input. - This function does nothing if the input is not a dictionary. - - Parameters - ---------- - array : Array-like - Argument to compute function. - """ -@overload -def unique(array: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ... -@overload -def unique(array: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -@overload -def value_counts( - array: lib.Array | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.StructArray: ... -@overload -def value_counts( - array: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... - -# ========================= 3.3 Selections ========================= -@overload -def array_filter( - array: _ArrayT, - selection_filter: list[bool] | list[bool | None] | BooleanArray, - /, - null_selection_behavior: Literal["drop", "emit_null"] = "drop", - *, - options: FilterOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _ArrayT: ... -@overload -def array_filter( - array: Expression, - selection_filter: list[bool] | list[bool | None] | BooleanArray, - /, - null_selection_behavior: Literal["drop", "emit_null"] = "drop", - *, - options: FilterOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def array_take( - array: _ArrayT, - indices: Indices | list[int | None], - /, - *, - boundscheck: bool = True, - options: TakeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _ArrayT: ...
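A brief usage sketch of the associative-transform and selection kernels declared around this point (illustrative data, not part of the stub file):

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.array(["a", "b", None, "d"])
    pc.array_filter(arr, pa.array([True, False, True, True]))  # ["a", None, "d"]
    pc.array_take(arr, pa.array([3, 0, None]))                  # ["d", "a", None]
    pc.drop_null(arr)                                           # ["a", "b", "d"]
    pc.value_counts(arr)                                        # StructArray of {values, counts}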
-@overload -def array_take( - array: Expression, - indices: list[int] - | list[int | None] - | lib.Int16Array - | lib.Int32Array - | lib.Int64Array - | lib.ChunkedArray[lib.Int16Scalar] - | lib.ChunkedArray[lib.Int32Scalar] - | lib.ChunkedArray[lib.Int64Scalar], - /, - *, - boundscheck: bool = True, - options: TakeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def drop_null(input: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ... -@overload -def drop_null( - input: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... - -filter = array_filter -take = array_take -""" -Select values (or records) from array- or table-like data given integer -selection indices. - -The result will be of the same type(s) as the input, with elements taken -from the input array (or record batch / table fields) at the given -indices. If an index is null then the corresponding value in the output -will be null. - -Parameters ----------- -data : Array, ChunkedArray, RecordBatch, or Table -indices : Array, ChunkedArray - Must be of integer type -boundscheck : boolean, default True - Whether to boundscheck the indices. If False and there is an out of - bounds index, will likely cause the process to crash. -memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - -Returns -------- -result : depends on inputs - Selected values for the given indices - -Examples --------- ->>> import pyarrow as pa ->>> arr = pa.array(["a", "b", "c", None, "e", "f"]) ->>> indices = pa.array([0, None, 4, 3]) ->>> arr.take(indices) - -[ - "a", - null, - "e", - null -] -""" - -# ========================= 3.4 Containment tests ========================= -@overload -def indices_nonzero( - values: lib.BooleanArray - | lib.NullArray - | NumericArray - | lib.Decimal128Array - | lib.Decimal256Array, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: ... -@overload -def indices_nonzero( - values: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def indices_nonzero(*args, **kwargs): - """ - Return the indices of the values in the array that are non-zero. - - For each input value, check if it's zero, false or null. Emit the index - of the value in the array if it's none of those. - - Parameters - ---------- - values : Array-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 3.5 Sorts and partitions ========================= -@overload -def array_sort_indices( - array: lib.Array | lib.ChunkedArray, - /, - order: _Order = "ascending", - *, - null_placement: _Placement = "at_end", - options: ArraySortOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: ... -@overload -def array_sort_indices( - array: Expression, - /, - order: _Order = "ascending", - *, - null_placement: _Placement = "at_end", - options: ArraySortOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def array_sort_indices(*args, **kwargs): - """ - Return the indices that would sort an array. - - This function computes an array of indices that define a stable sort - of the input array. By default, null values are considered greater - than any other value and are therefore sorted at the end of the array.
- For floating-point types, NaNs are considered greater than any - other non-null value, but smaller than null values. - - The handling of nulls and NaNs can be changed in ArraySortOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - order : str, default "ascending" - Which order to sort values in. - Accepted values are "ascending", "descending". - null_placement : str, default "at_end" - Where nulls in the input should be sorted. - Accepted values are "at_start", "at_end". - options : pyarrow.compute.ArraySortOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def partition_nth_indices( - array: lib.Array | lib.ChunkedArray | Sequence[int | float | str | None], - /, - pivot: int, - *, - null_placement: _Placement = "at_end", - options: PartitionNthOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: ... -@overload -def partition_nth_indices( - array: Expression, - /, - pivot: int, - *, - null_placement: _Placement = "at_end", - options: PartitionNthOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def partition_nth_indices(*args, **kwargs): - """ - Return the indices that would partition an array around a pivot. - - This function computes an array of indices that define a non-stable - partial sort of the input array. - - The output is such that the `N`'th index points to the `N`'th element - of the input in sorted order, and all indices before the `N`'th point - to elements in the input less than or equal to elements at or after the `N`'th. - - By default, null values are considered greater than any other value - and are therefore partitioned towards the end of the array. - For floating-point types, NaNs are considered greater than any - other non-null value, but smaller than null values. - - The pivot index `N` must be given in PartitionNthOptions. - The handling of nulls and NaNs can also be changed in PartitionNthOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - pivot : int - Index into the equivalent sorted array of the pivot element. - null_placement : str, default "at_end" - Where nulls in the input should be partitioned. - Accepted values are "at_start", "at_end". - options : pyarrow.compute.PartitionNthOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def rank( - input: lib.Array | lib.ChunkedArray, - /, - sort_keys: _Order = "ascending", - *, - null_placement: _Placement = "at_end", - tiebreaker: Literal["min", "max", "first", "dense"] = "first", - options: RankOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: - """ - Compute ordinal ranks of an array (1-based). - - This function computes a rank of the input array. - By default, null values are considered greater than any other value and - are therefore sorted at the end of the input. For floating-point types, - NaNs are considered greater than any other non-null value, but smaller - than null values. The default tiebreaker is to assign ranks in order of - when ties appear in the input. - - The handling of nulls, NaNs and tiebreakers can be changed in RankOptions. - - Parameters - ---------- - input : Array-like or scalar-like - Argument to compute function.
- sort_keys : sequence of (name, order) tuples or str, default "ascending" - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - Alternatively, one can simply pass "ascending" or "descending" as a string - if the input is array-like. - null_placement : str, default "at_end" - Where nulls in input should be sorted. - Accepted values are "at_start", "at_end". - tiebreaker : str, default "first" - Configure how ties between equal values are handled. - Accepted values are: - - - "min": Ties get the smallest possible rank in sorted order. - - "max": Ties get the largest possible rank in sorted order. - - "first": Ranks are assigned in order of when ties appear in the - input. This ensures the ranks are a stable permutation - of the input. - - "dense": The ranks span a dense [1, M] interval where M is the - number of distinct values in the input. - options : pyarrow.compute.RankOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def rank_quantile( - input: lib.Array | lib.ChunkedArray, - /, - sort_keys: _Order = "ascending", - *, - null_placement: _Placement = "at_end", - options: RankQuantileOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: - """ - Compute quantile ranks of an array (1-based). - - This function computes a quantile rank of the input array. - By default, null values are considered greater than any other value and - are therefore sorted at the end of the input. For floating-point types, - NaNs are considered greater than any other non-null value, but smaller - than null values. - - The results are real values strictly between 0 and 1. They are - computed as in https://en.wikipedia.org/wiki/Quantile_rank - but without multiplying by 100. - - The handling of nulls and NaNs can be changed in RankQuantileOptions. - - Parameters - ---------- - input : Array-like or scalar-like - Argument to compute function. - sort_keys : sequence of (name, order) tuples or str, default "ascending" - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - Alternatively, one can simply pass "ascending" or "descending" as a string - if the input is array-like. - null_placement : str, default "at_end" - Where nulls in input should be sorted. - Accepted values are "at_start", "at_end". - options : pyarrow.compute.RankQuantileOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - - -rank_normal = _clone_signature(rank_quantile) -""" -Compute normal (gaussian) ranks of an array (1-based). - -This function computes a normal (gaussian) rank of the input array. -By default, null values are considered greater than any other value and -are therefore sorted at the end of the input. For floating-point types, -NaNs are considered greater than any other non-null value, but smaller -than null values. -The results are finite real values. They are obtained as if first -calling the "rank_quantile" function and then applying the normal -percent-point function (PPF) to the resulting quantile values. 
- -The handling of nulls and NaNs can be changed in RankQuantileOptions. - -Parameters ----------- -input : Array-like or scalar-like - Argument to compute function. -sort_keys : sequence of (name, order) tuples or str, default "ascending" - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - Alternatively, one can simply pass "ascending" or "descending" as a string - if the input is array-like. -null_placement : str, default "at_end" - Where nulls in input should be sorted. - Accepted values are "at_start", "at_end". -options : pyarrow.compute.RankQuantileOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def select_k_unstable( - input: lib.Array | lib.ChunkedArray | lib.Table, - /, - k: int, - sort_keys: list[tuple[str | Expression, _Order]] | None = None, - *, - options: SelectKOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: ... -@overload -def select_k_unstable( - input: Expression, - /, - k: int, - sort_keys: list[tuple[str | Expression, _Order]] | None = None, - *, - options: SelectKOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def select_k_unstable( - input: lib.Array | lib.ChunkedArray | lib.Table, - /, - options: SelectKOptions, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: ... -@overload -def select_k_unstable( - input: Expression, - /, - options: SelectKOptions, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def select_k_unstable(*args, **kwargs): - """ - Select the indices of the first `k` ordered elements from the input. - - This function selects an array of indices of the first `k` ordered elements - from the `input` array, record batch or table specified in the column keys - (`options.sort_keys`). Output is not guaranteed to be stable. - Null values are considered greater than any other value and are - therefore ordered at the end. For floating-point types, NaNs are considered - greater than any other non-null value, but smaller than null values. - - Parameters - ---------- - input : Array-like or scalar-like - Argument to compute function. - k : int - Number of leading values to select in sorted order - (i.e. the largest values if sort order is "descending", - the smallest otherwise). - sort_keys : sequence of (name, order) tuples - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - options : pyarrow.compute.SelectKOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def sort_indices( - input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, - /, - sort_keys: Sequence[tuple[str|Expression, _Order]] = (), - *, - null_placement: _Placement = "at_end", - options: SortOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: ... 
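A short sketch of how the sorting declarations around this point are used together (column names and data are illustrative, not part of the stub file):

    import pyarrow as pa
    import pyarrow.compute as pc

    table = pa.table({"k": ["b", "a", "c"], "v": [2, 1, 3]})
    idx = pc.sort_indices(table, sort_keys=[("k", "ascending")])  # UInt64Array [1, 0, 2]
    table.take(idx)  # rows reordered so that column k reads ["a", "b", "c"]
    pc.rank(pa.array([5, 2, 2, None]))  # [3, 1, 2, 4] with the default "first" tiebreaker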
-@overload -def sort_indices( - input: Expression, - /, - sort_keys: Sequence[tuple[str|Expression, _Order]] = (), - *, - null_placement: _Placement = "at_end", - options: SortOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def sort_indices(*args, **kwargs): - """ - Return the indices that would sort an array, record batch or table. - - This function computes an array of indices that define a stable sort - of the input array, record batch or table. By default, null values are - considered greater than any other value and are therefore sorted at the - end of the input. For floating-point types, NaNs are considered greater - than any other non-null value, but smaller than null values. - - The handling of nulls and NaNs can be changed in SortOptions. - - Parameters - ---------- - input : Array-like or scalar-like - Argument to compute function. - sort_keys : sequence of (name, order) tuples - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - null_placement : str, default "at_end" - Where nulls in input should be sorted, only applying to - columns/fields mentioned in `sort_keys`. - Accepted values are "at_start", "at_end". - options : pyarrow.compute.SortOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 3.6 Structural transforms ========================= -@overload -def list_element( - lists: Expression, index: ScalarLike, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -@overload -def list_element( - lists: lib.Array[ListScalar[_DataTypeT]] | lib.Array[lib.Scalar[lib.ListType[lib.StructType]]], - index: ScalarLike, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Array[lib.Scalar[_DataTypeT]]: ... -@overload -def list_element( - lists: lib.ChunkedArray[ListScalar[_DataTypeT]], - index: ScalarLike, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... -@overload -def list_element( - lists: ListScalar[_DataTypeT], - index: ScalarLike, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _DataTypeT: ... -def list_element(*args, **kwargs): - """ - Compute elements using of nested list values using an index. - - `lists` must have a list-like type. - For each value in each list of `lists`, the element at `index` - is emitted. Null values emit a null in the output. - - Parameters - ---------- - lists : Array-like or scalar-like - Argument to compute function. - index : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def list_flatten( - lists: Expression, - /, - recursive: bool = False, - *, - options: ListFlattenOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def list_flatten( - lists: ArrayOrChunkedArray[ListScalar[Any]], - /, - recursive: bool = False, - *, - options: ListFlattenOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[Any]: ... -def list_flatten(*args, **kwargs): - """ - Flatten list values. - - `lists` must have a list-like type (lists, list-views, and - fixed-size lists). 
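An illustrative sketch of sort_indices and list_element as typed above; the sample table and arrays are made up for the example.

import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"group": ["a", "b", "a"], "value": [3, 1, 2]})

# Indices of a stable sort over the requested keys; pair with take() to reorder.
idx = pc.sort_indices(table, sort_keys=[("group", "ascending"), ("value", "descending")])
sorted_table = table.take(idx)

# list_element picks one element per list; null lists emit a null in the output.
lists = pa.array([[1, 2, 3], [4, 5], None])
pc.list_element(lists, 0)  # -> [1, 4, null]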
- Return an array with the top list level flattened unless - `recursive` is set to true in ListFlattenOptions. When that - is that case, flattening happens recursively until a non-list - array is formed. - - Null list values do not emit anything to the output. - - Parameters - ---------- - lists : Array-like - Argument to compute function. - recursive : bool, default False - When True, the list array is flattened recursively until an array - of non-list values is formed. - options : pyarrow.compute.ListFlattenOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def list_parent_indices( - lists: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -@overload -def list_parent_indices( - lists: ArrayOrChunkedArray[Any], /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Array: ... -def list_parent_indices(*args, **kwargs): - """ - Compute parent indices of nested list values. - - `lists` must have a list-like or list-view type. - For each value in each list of `lists`, the top-level list index - is emitted. - - Parameters - ---------- - lists : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def list_slice( - lists: Expression, - /, - start: int, - stop: int | None = None, - step: int = 1, - return_fixed_size_list: bool | None = None, - *, - options: ListSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def list_slice( - lists: ArrayOrChunkedArray[Any], - /, - start: int, - stop: int | None = None, - step: int = 1, - return_fixed_size_list: bool | None = None, - *, - options: ListSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[Any]: ... -def list_slice(*args, **kwargs): - """ - Compute slice of list-like array. - - `lists` must have a list-like type. - For each list element, compute a slice, returning a new list array. - A variable or fixed size list array is returned, depending on options. - - Parameters - ---------- - lists : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing inner list elements (inclusive). - stop : Optional[int], default None - If given, index to stop slicing at (exclusive). - If not given, slicing will stop at the end. (NotImplemented) - step : int, default 1 - Slice step. - return_fixed_size_list : Optional[bool], default None - Whether to return a FixedSizeListArray. If true _and_ stop is after - a list element's length, nulls will be appended to create the - requested slice size. The default of `None` will return the same - type which was passed in. - options : pyarrow.compute.ListSliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def map_lookup( - container, - /, - query_key, - occurrence: str, - *, - options: MapLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -): - """ - Find the items corresponding to a given key in a Map. - - For a given query key (passed via MapLookupOptions), extract - either the FIRST, LAST or ALL items from a Map that have - matching keys. 
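A short sketch of the list kernels documented above, with illustrative input values.

import pyarrow as pa
import pyarrow.compute as pc

lists = pa.array([[1, 2], None, [3]])

pc.list_flatten(lists)           # [1, 2, 3]     null lists emit nothing
pc.list_parent_indices(lists)    # [0, 0, 2]     top-level index per element
pc.list_slice(lists, 0, stop=1)  # [[1], null, [3]]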
- - Parameters - ---------- - container : Array-like or scalar-like - Argument to compute function. - query_key : Scalar or Object can be converted to Scalar - The key to search for. - occurrence : str - The occurrence(s) to return from the Map - Accepted values are "first", "last", or "all". - options : pyarrow.compute.MapLookupOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def struct_field( - values, - /, - indices, - *, - options: StructFieldOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -): - """ - Extract children of a struct or union by index. - - Given a list of indices (passed via StructFieldOptions), extract - the child array or scalar with the given child index, recursively. - - For union inputs, nulls are emitted for union values that reference - a different child than specified. Also, the indices are always - in physical order, not logical type codes - for example, the first - child is always index 0. - - An empty list of indices returns the argument unchanged. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - indices : List[str], List[bytes], List[int], Expression, bytes, str, or int - List of indices for chained field lookup, for example `[4, 1]` - will look up the second nested field in the fifth outer field. - options : pyarrow.compute.StructFieldOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def fill_null_backward(values, /, *, memory_pool: lib.MemoryPool | None = None): - """ - Carry non-null values backward to fill null slots. - - Given an array, propagate next valid observation backward to previous valid - or nothing if all next values are null. - - Parameters - ---------- - values : Array-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def fill_null_forward(values, /, *, memory_pool: lib.MemoryPool | None = None): - """ - Carry non-null values forward to fill null slots. - - Given an array, propagate last valid observation forward to next valid - or nothing if all previous values are null. - - Parameters - ---------- - values : Array-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def replace_with_mask( - values, - mask: list[bool] | list[bool | None] | BooleanArray, - replacements, - /, - *, - memory_pool: lib.MemoryPool | None = None, -): - """ - Replace items selected with a mask. - - Given an array and a boolean mask (either scalar or of equal length), - along with replacement values (either scalar or array), - each element of the array for which the corresponding mask element is - true will be replaced by the next value from the replacements, - or with null if the mask is null. - Hence, for replacement arrays, len(replacements) == sum(mask == true). - - Parameters - ---------- - values : Array-like - Argument to compute function. - mask : Array-like - Argument to compute function. - replacements : Array-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
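The fill and replace kernels above in action; the values, mask and replacements are illustrative (note that len(replacements) must equal the number of true mask slots).

import pyarrow as pa
import pyarrow.compute as pc

values = pa.array([1, None, None, 4])

pc.fill_null_forward(values)    # [1, 1, 1, 4]
pc.fill_null_backward(values)   # [1, 4, 4, 4]

# One replacement value is consumed per true slot in the mask.
mask = pa.array([False, True, True, False])
pc.replace_with_mask(values, mask, pa.array([20, 30]))  # [1, 20, 30, 4]

# struct_field drills into nested children via a list of names or indices.
structs = pa.array([{"x": {"y": 7}}])
pc.struct_field(structs, ["x", "y"])  # [7]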
- """ - -# ========================= 3.7 Pairwise functions ========================= -@overload -def pairwise_diff( - input: _NumericOrTemporalArrayT, - /, - period: int = 1, - *, - options: PairwiseOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def pairwise_diff( - input: Expression, - /, - period: int = 1, - *, - options: PairwiseOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def pairwise_diff(*args, **kwargs): - """ - Compute first order difference of an array. - - Computes the first order difference of an array, It internally calls - the scalar function "subtract" to compute - differences, so its - behavior and supported types are the same as - "subtract". The period can be specified in :struct:`PairwiseOptions`. - - Results will wrap around on integer overflow. Use function - "pairwise_diff_checked" if you want overflow to return an error. - - Parameters - ---------- - input : Array-like - Argument to compute function. - period : int, default 1 - Period for applying the period function. - options : pyarrow.compute.PairwiseOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -pairwise_diff_checked = _clone_signature(pairwise_diff) -""" -Compute first order difference of an array. - -Computes the first order difference of an array, It internally calls -the scalar function "subtract_checked" (or the checked variant) to compute -differences, so its behavior and supported types are the same as -"subtract_checked". The period can be specified in :struct:`PairwiseOptions`. - -This function returns an error on overflow. For a variant that doesn't -fail on overflow, use function "pairwise_diff". - -Parameters ----------- -input : Array-like - Argument to compute function. -period : int, default 1 - Period for applying the period function. -options : pyarrow.compute.PairwiseOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" diff --git a/python/pyarrow/cuda.py b/python/pyarrow/cuda.py deleted file mode 100644 index 18c530d4afe..00000000000 --- a/python/pyarrow/cuda.py +++ /dev/null @@ -1,25 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# flake8: noqa - - -from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer, - HostBuffer, BufferReader, BufferWriter, - new_host_buffer, - serialize_record_batch, read_message, - read_record_batch) diff --git a/python/pyarrow/dataset.pyi b/python/pyarrow/dataset.pyi deleted file mode 100644 index 6cb7fed43e6..00000000000 --- a/python/pyarrow/dataset.pyi +++ /dev/null @@ -1,246 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload - -from _typeshed import StrPath -from pyarrow._dataset import ( - CsvFileFormat, - CsvFragmentScanOptions, - Dataset, - DatasetFactory, - DirectoryPartitioning, - FeatherFileFormat, - FileFormat, - FileFragment, - FilenamePartitioning, - FileSystemDataset, - FileSystemDatasetFactory, - FileSystemFactoryOptions, - FileWriteOptions, - Fragment, - FragmentScanOptions, - HivePartitioning, - InMemoryDataset, - IpcFileFormat, - IpcFileWriteOptions, - JsonFileFormat, - JsonFragmentScanOptions, - Partitioning, - PartitioningFactory, - Scanner, - TaggedRecordBatch, - UnionDataset, - UnionDatasetFactory, - WrittenFile, - get_partition_keys, -) -from pyarrow._dataset_orc import OrcFileFormat -from pyarrow._dataset_parquet import ( - ParquetDatasetFactory, - ParquetFactoryOptions, - ParquetFileFormat, - ParquetFileFragment, - ParquetFileWriteOptions, - ParquetFragmentScanOptions, - ParquetReadOptions, - RowGroupInfo, -) -from pyarrow._dataset_parquet_encryption import ( - ParquetDecryptionConfig, - ParquetEncryptionConfig, -) -from pyarrow.compute import Expression, field, scalar -from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table - -from ._fs import SupportedFileSystem - -_orc_available: bool -_parquet_available: bool - -__all__ = [ - "CsvFileFormat", - "CsvFragmentScanOptions", - "Dataset", - "DatasetFactory", - "DirectoryPartitioning", - "FeatherFileFormat", - "FileFormat", - "FileFragment", - "FilenamePartitioning", - "FileSystemDataset", - "FileSystemDatasetFactory", - "FileSystemFactoryOptions", - "FileWriteOptions", - "Fragment", - "FragmentScanOptions", - "HivePartitioning", - "InMemoryDataset", - "IpcFileFormat", - "IpcFileWriteOptions", - "JsonFileFormat", - "JsonFragmentScanOptions", - "Partitioning", - "PartitioningFactory", - "Scanner", - "TaggedRecordBatch", - "UnionDataset", - "UnionDatasetFactory", - "WrittenFile", - "get_partition_keys", - # Orc - "OrcFileFormat", - # Parquet - "ParquetDatasetFactory", - "ParquetFactoryOptions", - "ParquetFileFormat", - "ParquetFileFragment", - "ParquetFileWriteOptions", - "ParquetFragmentScanOptions", - "ParquetReadOptions", - "RowGroupInfo", - # Parquet Encryption - "ParquetDecryptionConfig", - "ParquetEncryptionConfig", - # Compute - "Expression", - "field", - "scalar", - # 
Dataset - "partitioning", - "parquet_dataset", - "write_dataset", -] - -_DatasetFormat: TypeAlias = Literal["parquet", "ipc", "arrow", "feather", "csv"] - -@overload -def partitioning( - schema: Schema, -) -> Partitioning: ... -@overload -def partitioning( - schema: Schema, - *, - flavor: Literal["filename"], - dictionaries: dict[str, Array] | None = None, -) -> Partitioning: ... -@overload -def partitioning( - schema: Schema, - *, - flavor: Literal["filename"], - dictionaries: Literal["infer"], -) -> PartitioningFactory: ... -@overload -def partitioning( - field_names: list[str], - *, - flavor: Literal["filename"], -) -> PartitioningFactory: ... -@overload -def partitioning( - schema: Schema, - *, - flavor: Literal["hive"], - dictionaries: Literal["infer"], -) -> PartitioningFactory: ... -@overload -def partitioning( - *, - flavor: Literal["hive"], -) -> PartitioningFactory: ... -@overload -def partitioning( - schema: Schema, - *, - flavor: Literal["hive"], - dictionaries: dict[str, Array] | None = None, -) -> Partitioning: ... -def parquet_dataset( - metadata_path: StrPath, - schema: Schema | None = None, - filesystem: SupportedFileSystem | None = None, - format: ParquetFileFormat | None = None, - partitioning: Partitioning | PartitioningFactory | None = None, - partition_base_dir: str | None = None, -) -> FileSystemDataset: ... -@overload -def dataset( - source: StrPath | Sequence[StrPath], - schema: Schema | None = None, - format: FileFormat | _DatasetFormat | None = None, - filesystem: SupportedFileSystem | str | None = None, - partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, - partition_base_dir: str | None = None, - exclude_invalid_files: bool | None = None, - ignore_prefixes: list[str] | None = None, -) -> FileSystemDataset: ... -@overload -def dataset( - source: list[Dataset], - schema: Schema | None = None, - format: FileFormat | _DatasetFormat | None = None, - filesystem: SupportedFileSystem | str | None = None, - partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, - partition_base_dir: str | None = None, - exclude_invalid_files: bool | None = None, - ignore_prefixes: list[str] | None = None, -) -> UnionDataset: ... -@overload -def dataset( - source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, - schema: Schema | None = None, - format: FileFormat | _DatasetFormat | None = None, - filesystem: SupportedFileSystem | str | None = None, - partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, - partition_base_dir: str | None = None, - exclude_invalid_files: bool | None = None, - ignore_prefixes: list[str] | None = None, -) -> InMemoryDataset: ... -@overload -def dataset( - source: RecordBatch | Table, - schema: Schema | None = None, - format: FileFormat | _DatasetFormat | None = None, - filesystem: SupportedFileSystem | str | None = None, - partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, - partition_base_dir: str | None = None, - exclude_invalid_files: bool | None = None, - ignore_prefixes: list[str] | None = None, -) -> InMemoryDataset: ... 
-def write_dataset( - data: Dataset | Table | RecordBatch | RecordBatchReader | list[Table] | Iterable[RecordBatch], - base_dir: StrPath, - *, - basename_template: str | None = None, - format: FileFormat | _DatasetFormat | None = None, - partitioning: Partitioning | list[str] | None = None, - partitioning_flavor: str | None = None, - schema: Schema | None = None, - filesystem: SupportedFileSystem | None = None, - file_options: FileWriteOptions | None = None, - use_threads: bool = True, - max_partitions: int = 1024, - max_open_files: int = 1024, - max_rows_per_file: int = 0, - min_rows_per_group: int = 0, - max_rows_per_group: int = 1024 * 1024, - file_visitor: Callable[[str], None] | None = None, - existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"] = "error", - create_dir: bool = True, -): ... diff --git a/python/pyarrow/feather.pyi b/python/pyarrow/feather.pyi deleted file mode 100644 index ce8d83dbcd9..00000000000 --- a/python/pyarrow/feather.pyi +++ /dev/null @@ -1,67 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import IO, Literal - -import pandas as pd - -from _typeshed import StrPath -from pyarrow._feather import FeatherError -from pyarrow.lib import Table - -__all__ = [ - "FeatherError", - "FeatherDataset", - "check_chunked_overflow", - "write_feather", - "read_feather", - "read_table", -] - -class FeatherDataset: - path_or_paths: str | list[str] - validate_schema: bool - - def __init__(self, path_or_paths: str | list[str], validate_schema: bool = True) -> None: ... - def read_table(self, columns: list[str] | None = None) -> Table: ... - def validate_schemas(self, piece, table: Table) -> None: ... - def read_pandas( - self, columns: list[str] | None = None, use_threads: bool = True - ) -> pd.DataFrame: ... - -def check_chunked_overflow(name: str, col) -> None: ... -def write_feather( - df: pd.DataFrame | Table, - dest: StrPath | IO, - compression: Literal["zstd", "lz4", "uncompressed"] | None = None, - compression_level: int | None = None, - chunksize: int | None = None, - version: Literal[1, 2] = 2, -) -> None: ... -def read_feather( - source: StrPath | IO, - columns: list[str] | None = None, - use_threads: bool = True, - memory_map: bool = False, - **kwargs, -) -> pd.DataFrame: ... -def read_table( - source: StrPath | IO, - columns: list[str] | None = None, - memory_map: bool = False, - use_threads: bool = True, -) -> Table: ... diff --git a/python/pyarrow/gandiva.pyi b/python/pyarrow/gandiva.pyi deleted file mode 100644 index bc07e15c4a6..00000000000 --- a/python/pyarrow/gandiva.pyi +++ /dev/null @@ -1,82 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import Iterable, Literal - -from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable - -class Node(_Weakrefable): - def return_type(self) -> DataType: ... - -class Expression(_Weakrefable): - def root(self) -> Node: ... - def result(self) -> Field: ... - -class Condition(_Weakrefable): - def root(self) -> Node: ... - def result(self) -> Field: ... - -class SelectionVector(_Weakrefable): - def to_array(self) -> Array: ... - -class Projector(_Weakrefable): - @property - def llvm_ir(self): ... - def evaluate( - self, batch: RecordBatch, selection: SelectionVector | None = None - ) -> list[Array]: ... - -class Filter(_Weakrefable): - @property - def llvm_ir(self): ... - def evaluate( - self, batch: RecordBatch, pool: MemoryPool, dtype: DataType | str = "int32" - ) -> SelectionVector: ... - -class TreeExprBuilder(_Weakrefable): - def make_literal(self, value: float | str | bytes | bool, dtype: DataType) -> Node: ... - def make_expression(self, root_node: Node, return_field: Field) -> Expression: ... - def make_function(self, name: str, children: list[Node], return_type: DataType) -> Node: ... - def make_field(self, field: Field) -> Node: ... - def make_if( - self, condition: Node, this_node: Node, else_node: Node, return_type: DataType - ) -> Node: ... - def make_and(self, children: list[Node]) -> Node: ... - def make_or(self, children: list[Node]) -> Node: ... - def make_in_expression(self, node: Node, values: Iterable, dtype: DataType) -> Node: ... - def make_condition(self, condition: Node) -> Condition: ... - -class Configuration(_Weakrefable): - def __init__(self, optimize: bool = True, dump_ir: bool = False) -> None: ... - -def make_projector( - schema: Schema, - children: list[Expression], - pool: MemoryPool, - selection_mode: Literal["NONE", "UINT16", "UINT32", "UINT64"] = "NONE", - configuration: Configuration | None = None, -) -> Projector: ... -def make_filter( - schema: Schema, condition: Condition, configuration: Configuration | None = None -) -> Filter: ... - -class FunctionSignature(_Weakrefable): - def return_type(self) -> DataType: ... - def param_types(self) -> list[DataType]: ... - def name(self) -> str: ... - -def get_registered_function_signatures() -> list[FunctionSignature]: ... diff --git a/python/pyarrow/ipc.pyi b/python/pyarrow/ipc.pyi deleted file mode 100644 index 985cf0678f9..00000000000 --- a/python/pyarrow/ipc.pyi +++ /dev/null @@ -1,140 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from io import IOBase - -import pandas as pd -import pyarrow.lib as lib - -from pyarrow.lib import ( - IpcReadOptions, - IpcWriteOptions, - Message, - MessageReader, - MetadataVersion, - ReadStats, - RecordBatchReader, - WriteStats, - _ReadPandasMixin, - get_record_batch_size, - get_tensor_size, - read_message, - read_record_batch, - read_schema, - read_tensor, - write_tensor, -) - -class RecordBatchStreamReader(lib._RecordBatchStreamReader): - def __init__( - self, - source: bytes | lib.Buffer | lib.NativeFile | IOBase, - *, - options: IpcReadOptions | None = None, - memory_pool: lib.MemoryPool | None = None, - ) -> None: ... - -class RecordBatchStreamWriter(lib._RecordBatchStreamWriter): - def __init__( - self, - sink: str | lib.NativeFile | IOBase, - schema: lib.Schema, - *, - use_legacy_format: bool | None = None, - options: IpcWriteOptions | None = None, - ) -> None: ... - -class RecordBatchFileReader(lib._RecordBatchFileReader): - def __init__( - self, - source: bytes | lib.Buffer | lib.NativeFile | IOBase, - footer_offset: int | None = None, - *, - options: IpcReadOptions | None, - memory_pool: lib.MemoryPool | None = None, - ) -> None: ... - -class RecordBatchFileWriter(lib._RecordBatchFileWriter): - def __init__( - self, - sink: str | lib.NativeFile | IOBase, - schema: lib.Schema, - *, - use_legacy_format: bool | None = None, - options: IpcWriteOptions | None = None, - ) -> None: ... - -def new_stream( - sink: str | lib.NativeFile | IOBase, - schema: lib.Schema, - *, - use_legacy_format: bool | None = None, - options: IpcWriteOptions | None = None, -) -> RecordBatchStreamWriter: ... -def open_stream( - source: bytes | lib.Buffer | lib.NativeFile | IOBase, - *, - options: IpcReadOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> RecordBatchStreamReader: ... -def new_file( - sink: str | lib.NativeFile | IOBase, - schema: lib.Schema, - *, - use_legacy_format: bool | None = None, - options: IpcWriteOptions | None = None, -) -> RecordBatchFileWriter: ... -def open_file( - source: bytes | lib.Buffer | lib.NativeFile | IOBase, - footer_offset: int | None = None, - *, - options: IpcReadOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> RecordBatchFileReader: ... -def serialize_pandas( - df: pd.DataFrame, *, nthreads: int | None = None, preserve_index: bool | None = None -) -> lib.Buffer: ... -def deserialize_pandas(buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ... 
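A round trip through the streaming IPC classes stubbed above, using illustrative data.

import pyarrow as pa
import pyarrow.ipc as ipc

batch = pa.record_batch({"x": [1, 2, 3]})

# Write a stream into an in-memory sink...
sink = pa.BufferOutputStream()
with ipc.new_stream(sink, batch.schema) as writer:
    writer.write_batch(batch)
buf = sink.getvalue()

# ...and read it back.
with ipc.open_stream(buf) as reader:
    roundtripped = reader.read_all()  # -> pyarrow.Table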
- -__all__ = [ - "IpcReadOptions", - "IpcWriteOptions", - "Message", - "MessageReader", - "MetadataVersion", - "ReadStats", - "RecordBatchReader", - "WriteStats", - "_ReadPandasMixin", - "get_record_batch_size", - "get_tensor_size", - "read_message", - "read_record_batch", - "read_schema", - "read_tensor", - "write_tensor", - "RecordBatchStreamReader", - "RecordBatchStreamWriter", - "RecordBatchFileReader", - "RecordBatchFileWriter", - "new_stream", - "open_stream", - "new_file", - "open_file", - "serialize_pandas", - "deserialize_pandas", -] diff --git a/python/pyarrow/json.pyi b/python/pyarrow/json.pyi deleted file mode 100644 index 67768db42e4..00000000000 --- a/python/pyarrow/json.pyi +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json - -__all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/pyarrow/orc.pyi b/python/pyarrow/orc.pyi deleted file mode 100644 index 557f38a2b9e..00000000000 --- a/python/pyarrow/orc.pyi +++ /dev/null @@ -1,296 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import sys - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -from typing import IO, Literal - -from _typeshed import StrPath - -from . import _orc -from ._fs import SupportedFileSystem -from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table - -class ORCFile: - """ - Reader interface for a single ORC file - - Parameters - ---------- - source : str or pyarrow.NativeFile - Readable source. For passing Python file objects or byte buffers, - see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. - """ - - reader: _orc.ORCReader - def __init__(self, source: StrPath | NativeFile | IO) -> None: ... 
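Sketch of the ORC interfaces stubbed here, assuming an ORC-enabled pyarrow build; the file name is made up.

import pyarrow as pa
from pyarrow import orc

table = pa.table({"a": [1, 2, 3]})
orc.write_table(table, "example.orc")   # hypothetical path

f = orc.ORCFile("example.orc")
f.schema                 # Arrow schema of the file
f.nstripes               # number of stripes
f.read_stripe(0)         # first stripe as a RecordBatch
f.read(columns=["a"])    # whole file as a Table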
- @property - def metadata(self) -> KeyValueMetadata: - """The file metadata, as an arrow KeyValueMetadata""" - @property - def schema(self) -> Schema: - """The file schema, as an arrow schema""" - @property - def nrows(self) -> int: - """The number of rows in the file""" - @property - def nstripes(self) -> int: - """The number of stripes in the file""" - @property - def file_version(self) -> str: - """Format version of the ORC file, must be 0.11 or 0.12""" - @property - def software_version(self) -> str: - """Software instance and version that wrote this file""" - @property - def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: - """Compression codec of the file""" - @property - def compression_size(self) -> int: - """Number of bytes to buffer for the compression codec in the file""" - @property - def writer(self) -> str: - """Name of the writer that wrote this file. - If the writer is unknown then its Writer ID - (a number) is returned""" - @property - def writer_version(self) -> str: - """Version of the writer""" - @property - def row_index_stride(self) -> int: - """Number of rows per an entry in the row index or 0 - if there is no row index""" - @property - def nstripe_statistics(self) -> int: - """Number of stripe statistics""" - @property - def content_length(self) -> int: - """Length of the data stripes in the file in bytes""" - @property - def stripe_statistics_length(self) -> int: - """The number of compressed bytes in the file stripe statistics""" - @property - def file_footer_length(self) -> int: - """The number of compressed bytes in the file footer""" - @property - def file_postscript_length(self) -> int: - """The number of bytes in the file postscript""" - @property - def file_length(self) -> int: - """The number of bytes in the file""" - def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: - """Read a single stripe from the file. - - Parameters - ---------- - n : int - The stripe index - columns : list - If not None, only these columns will be read from the stripe. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e' - - Returns - ------- - pyarrow.RecordBatch - Content of the stripe as a RecordBatch. - """ - def read(self, columns: list[str] | None = None) -> Table: - """Read the whole file. - - Parameters - ---------- - columns : list - If not None, only these columns will be read from the file. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. Output always follows the - ordering of the file and not the `columns` list. - - Returns - ------- - pyarrow.Table - Content of the file as a Table. - """ - -class ORCWriter: - """ - Writer interface for a single ORC file - - Parameters - ---------- - where : str or pyarrow.io.NativeFile - Writable target. For passing Python file objects or byte buffers, - see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream - or pyarrow.io.FixedSizeBufferWriter. - file_version : {"0.11", "0.12"}, default "0.12" - Determine which ORC file version to use. - `Hive 0.11 / ORC v0 `_ - is the older version - while `Hive 0.12 / ORC v1 `_ - is the newer one. - batch_size : int, default 1024 - Number of rows the ORC writer writes at a time. - stripe_size : int, default 64 * 1024 * 1024 - Size of each ORC stripe in bytes. - compression : string, default 'uncompressed' - The compression codec. 
- Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} - Note that LZ0 is currently not supported. - compression_block_size : int, default 64 * 1024 - Size of each compression block in bytes. - compression_strategy : string, default 'speed' - The compression strategy i.e. speed vs size reduction. - Valid values: {'SPEED', 'COMPRESSION'} - row_index_stride : int, default 10000 - The row index stride i.e. the number of rows per - an entry in the row index. - padding_tolerance : double, default 0.0 - The padding tolerance. - dictionary_key_size_threshold : double, default 0.0 - The dictionary key size threshold. 0 to disable dictionary encoding. - 1 to always enable dictionary encoding. - bloom_filter_columns : None, set-like or list-like, default None - Columns that use the bloom filter. - bloom_filter_fpp : double, default 0.05 - Upper limit of the false-positive rate of the bloom filter. - """ - - writer: _orc.ORCWriter - is_open: bool - def __init__( - self, - where: StrPath | NativeFile | IO, - *, - file_version: str = "0.12", - batch_size: int = 1024, - stripe_size: int = 64 * 1024 * 1024, - compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", - compression_block_size: int = 65536, - compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", - row_index_stride: int = 10000, - padding_tolerance: float = 0.0, - dictionary_key_size_threshold: float = 0.0, - bloom_filter_columns: list[int] | None = None, - bloom_filter_fpp: float = 0.05, - ): ... - def __enter__(self) -> Self: ... - def __exit__(self, *args, **kwargs) -> None: ... - def write(self, table: Table) -> None: - """ - Write the table into an ORC file. The schema of the table must - be equal to the schema used when opening the ORC file. - - Parameters - ---------- - table : pyarrow.Table - The table to be written into the ORC file - """ - def close(self) -> None: - """ - Close the ORC file - """ - -def read_table( - source: StrPath | NativeFile | IO, - columns: list[str] | None = None, - filesystem: SupportedFileSystem | None = None, -) -> Table: - """ - Read a Table from an ORC file. - - Parameters - ---------- - source : str, pyarrow.NativeFile, or file-like object - If a string passed, can be a single file name. For file-like objects, - only read a single file. Use pyarrow.BufferReader to read a file - contained in a bytes or buffer-like object. - columns : list - If not None, only these columns will be read from the file. A column - name may be a prefix of a nested field, e.g. 'a' will select 'a.b', - 'a.c', and 'a.d.e'. Output always follows the ordering of the file and - not the `columns` list. If empty, no columns will be read. Note - that the table will still have the correct num_rows set despite having - no columns. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. 
- """ - -def write_table( - table: Table, - where: StrPath | NativeFile | IO, - *, - file_version: str = "0.12", - batch_size: int = 1024, - stripe_size: int = 64 * 1024 * 1024, - compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", - compression_block_size: int = 65536, - compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", - row_index_stride: int = 10000, - padding_tolerance: float = 0.0, - dictionary_key_size_threshold: float = 0.0, - bloom_filter_columns: list[int] | None = None, - bloom_filter_fpp: float = 0.05, -) -> None: - """ - Write a table into an ORC file. - - Parameters - ---------- - table : pyarrow.lib.Table - The table to be written into the ORC file - where : str or pyarrow.io.NativeFile - Writable target. For passing Python file objects or byte buffers, - see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream - or pyarrow.io.FixedSizeBufferWriter. - file_version : {"0.11", "0.12"}, default "0.12" - Determine which ORC file version to use. - `Hive 0.11 / ORC v0 `_ - is the older version - while `Hive 0.12 / ORC v1 `_ - is the newer one. - batch_size : int, default 1024 - Number of rows the ORC writer writes at a time. - stripe_size : int, default 64 * 1024 * 1024 - Size of each ORC stripe in bytes. - compression : string, default 'uncompressed' - The compression codec. - Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} - Note that LZ0 is currently not supported. - compression_block_size : int, default 64 * 1024 - Size of each compression block in bytes. - compression_strategy : string, default 'speed' - The compression strategy i.e. speed vs size reduction. - Valid values: {'SPEED', 'COMPRESSION'} - row_index_stride : int, default 10000 - The row index stride i.e. the number of rows per - an entry in the row index. - padding_tolerance : double, default 0.0 - The padding tolerance. - dictionary_key_size_threshold : double, default 0.0 - The dictionary key size threshold. 0 to disable dictionary encoding. - 1 to always enable dictionary encoding. - bloom_filter_columns : None, set-like or list-like, default None - Columns that use the bloom filter. - bloom_filter_fpp : double, default 0.05 - Upper limit of the false-positive rate of the bloom filter. - """ diff --git a/python/pyarrow/pandas_compat.pyi b/python/pyarrow/pandas_compat.pyi deleted file mode 100644 index 82fcb19ad97..00000000000 --- a/python/pyarrow/pandas_compat.pyi +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import Any, TypedDict, TypeVar - -import numpy as np -import pandas as pd - -from pandas import DatetimeTZDtype - -from .lib import Array, DataType, Schema, Table - -_T = TypeVar("_T") - -def get_logical_type_map() -> dict[int, str]: ... 
-def get_logical_type(arrow_type: DataType) -> str: ... -def get_numpy_logical_type_map() -> dict[type[np.generic], str]: ... -def get_logical_type_from_numpy(pandas_collection) -> str: ... -def get_extension_dtype_info(column) -> tuple[str, dict[str, Any]]: ... - -class _ColumnMetadata(TypedDict): - name: str - field_name: str - pandas_type: int - numpy_type: str - metadata: dict | None - -def get_column_metadata( - column: pd.Series | pd.Index, name: str, arrow_type: DataType, field_name: str -) -> _ColumnMetadata: ... -def construct_metadata( - columns_to_convert: list[pd.Series], - df: pd.DataFrame, - column_names: list[str], - index_levels: list[pd.Index], - index_descriptors: list[dict], - preserve_index: bool, - types: list[DataType], - column_field_names: list[str] = ..., -) -> dict[bytes, bytes]: ... -def dataframe_to_types( - df: pd.DataFrame, preserve_index: bool | None, columns: list[str] | None = None -) -> tuple[list[str], list[DataType], dict[bytes, bytes]]: ... -def dataframe_to_arrays( - df: pd.DataFrame, - schema: Schema, - preserve_index: bool | None, - nthreads: int = 1, - columns: list[str] | None = None, - safe: bool = True, -) -> tuple[Array, Schema, int]: ... -def get_datetimetz_type(values: _T, dtype, type_) -> tuple[_T, DataType]: ... -def make_datetimetz(unit: str, tz: str) -> DatetimeTZDtype: ... -def table_to_dataframe( - options, table: Table, categories=None, ignore_metadata: bool = False, types_mapper=None -) -> pd.DataFrame: ... -def make_tz_aware(series: pd.Series, tz: str) -> pd.Series: ... diff --git a/python/pyarrow/substrait.pyi b/python/pyarrow/substrait.pyi deleted file mode 100644 index b78bbd8aebd..00000000000 --- a/python/pyarrow/substrait.pyi +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from pyarrow._substrait import ( - BoundExpressions, - SubstraitSchema, - deserialize_expressions, - deserialize_schema, - get_supported_functions, - run_query, - serialize_expressions, - serialize_schema, -) - -__all__ = [ - "BoundExpressions", - "get_supported_functions", - "run_query", - "deserialize_expressions", - "serialize_expressions", - "deserialize_schema", - "serialize_schema", - "SubstraitSchema", -] diff --git a/python/pyarrow/util.pyi b/python/pyarrow/util.pyi deleted file mode 100644 index 5c9687bb83f..00000000000 --- a/python/pyarrow/util.pyi +++ /dev/null @@ -1,44 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from collections.abc import Callable -from os import PathLike -from typing import Any, Protocol, Sequence, TypeVar - -_F = TypeVar("_F", bound=Callable) -_N = TypeVar("_N") - -class _DocStringComponents(Protocol): - _docstring_components: list[str] - -def doc( - *docstrings: str | _DocStringComponents | Callable | None, **params: Any -) -> Callable[[_F], _F]: ... -def _is_iterable(obj) -> bool: ... -def _is_path_like(path) -> bool: ... -def _stringify_path(path: str | PathLike) -> str: ... -def product(seq: Sequence[_N]) -> _N: ... -def get_contiguous_span( - shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int -) -> tuple[int, int]: ... -def find_free_port() -> int: ... -def guid() -> str: ... -def _download_urllib(url, out_path) -> None: ... -def _download_requests(url, out_path) -> None: ... -def download_tzdata_on_windows() -> None: ... -def _deprecate_api(old_name, new_name, api, next_version, type=...): ... -def _deprecate_class(old_name, new_class, next_version, instancecheck=True): ... From a7ca3d2c9c490383ee889abd8972f7c5f4cc46e9 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 12 Sep 2025 15:32:41 +0200 Subject: [PATCH 08/26] Remove overloads from stubs and other things --- python/pyarrow/__init__.pyi | 232 +-- python/pyarrow/__lib_pxi/array.pyi | 2281 ++++++++------------------ python/pyarrow/__lib_pxi/io.pyi | 347 ++-- python/pyarrow/__lib_pxi/memory.pyi | 7 +- python/pyarrow/__lib_pxi/scalar.pyi | 1055 +++++------- python/pyarrow/__lib_pxi/tensor.pyi | 51 +- python/pyarrow/__lib_pxi/types.pyi | 915 +++++------ python/pyarrow/_stubs_typing.pyi | 17 +- python/pyarrow/array.pxi | 2 +- python/pyarrow/lib.pyi | 71 +- python/pyarrow/scalar.pxi | 2 +- python/pyarrow/tests/strategies.py | 38 +- python/pyarrow/tests/test_array.py | 8 +- python/pyarrow/tests/test_compute.py | 323 ++-- python/pyarrow/tests/test_io.py | 20 +- python/pyarrow/types.pyi | 506 +++++- python/pyproject.toml | 13 - 17 files changed, 2434 insertions(+), 3454 deletions(-) diff --git a/python/pyarrow/__init__.pyi b/python/pyarrow/__init__.pyi index ed1cad1bf80..d366d1793ff 100644 --- a/python/pyarrow/__init__.pyi +++ b/python/pyarrow/__init__.pyi @@ -15,34 +15,43 @@ # specific language governing permissions and limitations # under the License. +# ruff: noqa: F401, I001, E402 __version__: str import pyarrow.lib as _lib _gc_enabled: bool +# TODO from pyarrow.lib import ( - BuildInfo, - RuntimeInfo, - set_timezone_db_path, + # BuildInfo, + # RuntimeInfo, + # set_timezone_db_path, MonthDayNano, - VersionInfo, - cpp_build_info, - cpp_version, - cpp_version_info, - runtime_info, + # VersionInfo, + # cpp_build_info, + # cpp_version, + # cpp_version_info, + # runtime_info, cpu_count, set_cpu_count, - enable_signal_handlers, + # enable_signal_handlers, io_thread_count, set_io_thread_count, ) -def show_versions() -> None: ... -def show_info() -> None: ... +def show_versions() -> None: + """ + Print various version information, to help with error reporting. 
+ """ +def show_info() -> None: + """ + Print detailed version and platform information, for error reporting + """ def _module_is_available(module: str) -> bool: ... def _filesystem_is_available(fs: str) -> bool: ... +# TODO from pyarrow.lib import ( null, bool_, @@ -123,7 +132,6 @@ from pyarrow.lib import ( UuidType, JsonType, OpaqueType, - PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, @@ -136,8 +144,8 @@ from pyarrow.lib import ( Array, Tensor, array, - chunked_array, - record_batch, + # chunked_array, + # record_batch, nulls, repeat, SparseCOOTensor, @@ -249,7 +257,7 @@ from pyarrow.lib import ( ) # Buffers, allocation -from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager +# from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager from pyarrow.lib import ( Buffer, @@ -303,79 +311,108 @@ from pyarrow.lib import ( have_libhdfs, ) +# TODO from pyarrow.lib import ( - ChunkedArray, - RecordBatch, - Table, - table, + # ChunkedArray, + # RecordBatch, + # Table, + # table, concat_arrays, - concat_tables, - TableGroupBy, - RecordBatchReader, + # concat_tables, + # TableGroupBy, + # RecordBatchReader, ) # Exceptions -from pyarrow.lib import ( - ArrowCancelled, - ArrowCapacityError, - ArrowException, - ArrowKeyError, - ArrowIndexError, - ArrowInvalid, - ArrowIOError, - ArrowMemoryError, - ArrowNotImplementedError, - ArrowTypeError, - ArrowSerializationError, -) +# from pyarrow.lib import ( +# ArrowCancelled, +# ArrowCapacityError, +# ArrowException, +# ArrowKeyError, +# ArrowIndexError, +# ArrowInvalid, +# ArrowIOError, +# ArrowMemoryError, +# ArrowNotImplementedError, +# ArrowTypeError, +# ArrowSerializationError, +# ) -from pyarrow.ipc import serialize_pandas, deserialize_pandas -import pyarrow.ipc as ipc +# TODO +# from ipc import serialize_pandas, deserialize_pandas +# import ipc as ipc -import pyarrow.types as types +import types as types # ---------------------------------------------------------------------- # Deprecations -from pyarrow.util import _deprecate_api, _deprecate_class +# from util import _deprecate_api, _deprecate_class -from pyarrow.ipc import ( - Message, - MessageReader, - MetadataVersion, - RecordBatchFileReader, - RecordBatchFileWriter, - RecordBatchStreamReader, - RecordBatchStreamWriter, -) +# TODO +# from pyarrow.ipc import ( +# Message, +# MessageReader, +# MetadataVersion, +# RecordBatchFileReader, +# RecordBatchFileWriter, +# RecordBatchStreamReader, +# RecordBatchStreamWriter, +# ) # ---------------------------------------------------------------------- # Returning absolute path to the pyarrow include directory (if bundled, e.g. in # wheels) -def get_include() -> str: ... +def get_include() -> str: + """ + Return absolute path to directory containing Arrow C++ include + headers. Similar to numpy.get_include + """ def _get_pkg_config_executable() -> str: ... def _has_pkg_config(pkgname: str) -> bool: ... def _read_pkg_config_variable(pkgname: str, cli_args: list[str]) -> str: ... -def get_libraries() -> list[str]: ... -def create_library_symlinks() -> None: ... -def get_library_dirs() -> list[str]: ... 
+def get_libraries() -> list[str]: + """ + Return list of library names to include in the `libraries` argument for C + or Cython extensions using pyarrow + """ +def create_library_symlinks() -> None: + """ + With Linux and macOS wheels, the bundled shared libraries have an embedded + ABI version like libarrow.so.17 or libarrow.17.dylib and so linking to them + with -larrow won't work unless we create symlinks at locations like + site-packages/pyarrow/libarrow.so. This unfortunate workaround addresses + prior problems we had with shipping two copies of the shared libraries to + permit third party projects like turbodbc to build their C++ extensions + against the pyarrow wheels. + + This function must only be invoked once and only when the shared libraries + are bundled with the Python package, which should only apply to wheel-based + installs. It requires write access to the site-packages/pyarrow directory + and so depending on your system may need to be run with root. + """ +def get_library_dirs() -> list[str]: + """ + Return lists of directories likely to contain Arrow C++ libraries for + linking C or Cython extensions using pyarrow + """ __all__ = [ "__version__", "_lib", "_gc_enabled", - "BuildInfo", - "RuntimeInfo", - "set_timezone_db_path", + # "BuildInfo", + # "RuntimeInfo", + # "set_timezone_db_path", "MonthDayNano", - "VersionInfo", - "cpp_build_info", - "cpp_version", - "cpp_version_info", - "runtime_info", + # "VersionInfo", + # "cpp_build_info", + # "cpp_version", + # "cpp_version_info", + # "runtime_info", "cpu_count", "set_cpu_count", - "enable_signal_handlers", + # "enable_signal_handlers", "io_thread_count", "set_io_thread_count", "show_versions", @@ -461,7 +498,6 @@ __all__ = [ "UuidType", "JsonType", "OpaqueType", - "PyExtensionType", "UnknownExtensionType", "register_extension_type", "unregister_extension_type", @@ -474,8 +510,8 @@ __all__ = [ "Array", "Tensor", "array", - "chunked_array", - "record_batch", + # "chunked_array", + # "record_batch", "nulls", "repeat", "SparseCOOTensor", @@ -584,10 +620,10 @@ __all__ = [ "UuidScalar", "JsonScalar", "OpaqueScalar", - "DeviceAllocationType", - "Device", - "MemoryManager", - "default_cpu_memory_manager", + # "DeviceAllocationType", + # "Device", + # "MemoryManager", + # "default_cpu_memory_manager", "Buffer", "ResizableBuffer", "foreign_buffer", @@ -630,38 +666,38 @@ __all__ = [ "input_stream", "output_stream", "have_libhdfs", - "ChunkedArray", - "RecordBatch", - "Table", - "table", + # "ChunkedArray", + # "RecordBatch", + # "Table", + # "table", "concat_arrays", - "concat_tables", - "TableGroupBy", - "RecordBatchReader", - "ArrowCancelled", - "ArrowCapacityError", - "ArrowException", - "ArrowKeyError", - "ArrowIndexError", - "ArrowInvalid", - "ArrowIOError", - "ArrowMemoryError", - "ArrowNotImplementedError", - "ArrowTypeError", - "ArrowSerializationError", - "serialize_pandas", - "deserialize_pandas", - "ipc", + # "concat_tables", + # "TableGroupBy", + # "RecordBatchReader", + # "ArrowCancelled", + # "ArrowCapacityError", + # "ArrowException", + # "ArrowKeyError", + # "ArrowIndexError", + # "ArrowInvalid", + # "ArrowIOError", + # "ArrowMemoryError", + # "ArrowNotImplementedError", + # "ArrowTypeError", + # "ArrowSerializationError", + # "serialize_pandas", + # "deserialize_pandas", + # "ipc", "types", - "_deprecate_api", - "_deprecate_class", - "Message", - "MessageReader", - "MetadataVersion", - "RecordBatchFileReader", - "RecordBatchFileWriter", - "RecordBatchStreamReader", - "RecordBatchStreamWriter", + # "_deprecate_api", 
+ # "_deprecate_class", + # "Message", + # "MessageReader", + # "MetadataVersion", + # "RecordBatchFileReader", + # "RecordBatchFileWriter", + # "RecordBatchStreamReader", + # "RecordBatchStreamWriter", "get_include", "_get_pkg_config_executable", "_has_pkg_config", diff --git a/python/pyarrow/__lib_pxi/array.pyi b/python/pyarrow/__lib_pxi/array.pyi index c14cd1b8c44..c6e8dfecb62 100644 --- a/python/pyarrow/__lib_pxi/array.pyi +++ b/python/pyarrow/__lib_pxi/array.pyi @@ -15,11 +15,9 @@ # specific language governing permissions and limitations # under the License. -import datetime as dt import sys from collections.abc import Callable -from decimal import Decimal if sys.version_info >= (3, 11): from typing import Self @@ -31,16 +29,14 @@ from typing import ( Iterable, Iterator, Literal, - LiteralString, TypeVar, - overload, ) import numpy as np import pandas as pd from pandas.core.dtypes.base import ExtensionDtype -from pyarrow._compute import CastOptions +from pyarrow._compute import CastOptions # type: ignore[import-not-found] from pyarrow._stubs_typing import ( ArrayLike, Indices, @@ -49,25 +45,23 @@ from pyarrow._stubs_typing import ( SupportArrowArray, SupportArrowDeviceArray, ) -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[attr-defined] Buffer, - Device, - MemoryManager, + Device, # type: ignore[reportAttributeAccessIssue] + MemoryManager, # type: ignore[reportAttributeAccessIssue] MemoryPool, - MonthDayNano, Tensor, _Weakrefable, ) from typing_extensions import deprecated from . import scalar, types -from .device import DeviceAllocationType -from .scalar import NullableCollection, Scalar +from .device import DeviceAllocationType # type: ignore[import-not-found] +from .scalar import Scalar from .types import ( DataType, Field, MapType, - ListType, _AsPyType, _BasicDataType, _BasicValueT, @@ -76,1308 +70,185 @@ from .types import ( _RunEndType, _Size, ) +from .._stubs_typing import NullableCollection -@overload def array( - values: NullableCollection[bool], - type: None = None, + values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Any | None = None, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> BooleanArray: ... -@overload -def array( - values: NullableCollection[int], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Int64Array: ... -@overload -def array( - values: NullableCollection[float], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DoubleArray: ... -@overload -def array( - values: NullableCollection[Decimal], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Decimal128Array: ... -@overload -def array( - values: NullableCollection[dict[str, Any]], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> StructArray: ... 
-@overload -def array( - values: NullableCollection[dt.date], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Date32Array: ... -@overload -def array( - values: NullableCollection[dt.time], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Time64Array[Literal["us"]]: ... -@overload -def array( - values: NullableCollection[dt.timedelta], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DurationArray[Literal["us"]]: ... -@overload -def array( - values: NullableCollection[MonthDayNano], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> MonthDayNanoIntervalArray: ... -@overload -def array( - values: NullableCollection[str], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> StringArray: ... -@overload -def array( - values: NullableCollection[bytes], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> BinaryArray: ... -@overload -def array( - values: NullableCollection[list[Any]], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> ListArray[Any]: ... -@overload -def array( - values: NullableCollection[_ScalarT], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Array[_ScalarT]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["null"] | types.NullType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> NullArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["bool", "boolean"] | types.BoolType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> BooleanArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i1", "int8"] | types.Int8Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Int8Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i2", "int16"] | types.Int16Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Int16Array: ... 
-@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i4", "int32"] | types.Int32Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Int32Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i8", "int64"] | types.Int64Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Int64Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u1", "uint8"] | types.UInt8Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> UInt8Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u2", "uint16"] | types.UInt16Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> UInt16Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u4", "uint32"] | types.Uint32Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> UInt32Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u8", "uint64"] | types.UInt64Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> UInt64Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f2", "halffloat", "float16"] | types.Float16Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> HalfFloatArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f4", "float", "float32"] | types.Float32Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> FloatArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f8", "double", "float64"] | types.Float64Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DoubleArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["string", "str", "utf8"] | types.StringType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> StringArray: ... 
-@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["binary"] | types.BinaryType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> BinaryArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> LargeStringArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["large_binary"] | types.LargeBinaryType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> LargeBinaryArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["binary_view"] | types.BinaryViewType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> BinaryViewArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["string_view"] | types.StringViewType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> StringViewArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["date32", "date32[day]"] | types.Date32Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Date32Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["date64", "date64[ms]"] | types.Date64Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Date64Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Time32Array[Literal["s"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Time32Array[Literal["ms"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Time64Array[Literal["us"]]: ... 
-@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Time64Array[Literal["ns"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> TimestampArray[Literal["s"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> TimestampArray[Literal["ms"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> TimestampArray[Literal["us"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[s]"] | types.DurationType[Literal["s"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DurationArray[Literal["s"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DurationArray[Literal["ms"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[us]"] | types.DurationType[Literal["us"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DurationArray[Literal["us"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DurationArray[Literal["ns"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> MonthDayNanoIntervalArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: _DataTypeT, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Array[Scalar[_DataTypeT]]: ... 
-def array(*args, **kawrgs): - """ - Create pyarrow.Array instance from a Python object. - - Parameters - ---------- - obj : sequence, iterable, ndarray, pandas.Series, Arrow-compatible array - If both type and size are specified may be a single use iterable. If - not strongly-typed, Arrow type will be inferred for resulting array. - Any Arrow-compatible array that implements the Arrow PyCapsule Protocol - (has an ``__arrow_c_array__`` or ``__arrow_c_device_array__`` method) - can be passed as well. - type : pyarrow.DataType - Explicit type to attempt to coerce to, otherwise will be inferred from - the data. - mask : array[bool], optional - Indicate which values are null (True) or not null (False). - size : int64, optional - Size of the elements. If the input is larger than size bail at this - length. For iterators, if size is larger than the input iterator this - will be treated as a "max size", but will involve an initial allocation - of size followed by a resize to the actual size (so if you know the - exact size specifying it correctly will give you better performance). - from_pandas : bool, default None - Use pandas's semantics for inferring nulls from values in - ndarray-like data. If passed, the mask tasks precedence, but - if a value is unmasked (not-null), but still null according to - pandas semantics, then it is null. Defaults to False if not - passed explicitly by user, or True if a pandas object is - passed in. - safe : bool, default True - Check for overflows or other unsafe conversions. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the currently-set default - memory pool. - - Returns - ------- - array : pyarrow.Array or pyarrow.ChunkedArray - A ChunkedArray instead of an Array is returned if: - - - the object data overflowed binary storage. - - the object's ``__arrow_array__`` protocol method returned a chunked - array. - - Notes - ----- - Timezone will be preserved in the returned array for timezone-aware data, - else no timezone will be returned for naive timestamps. - Internally, UTC values are stored for timezone-aware data with the - timezone set in the data type. - - Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by - default converted as MonthDayNanoIntervalArray. relativedelta leapdays - are ignored as are all absolute fields on both objects. datetime.timedelta - can also be converted to MonthDayNanoIntervalArray but this requires - passing MonthDayNanoIntervalType explicitly. - - Converting to dictionary array will promote to a wider integer type for - indices if the number of distinct values cannot be represented, even if - the index type was explicitly set. This means that if there are more than - 127 values the returned dictionary array's index type will be at least - pa.int16() even if pa.int8() was passed to the function. Note that an - explicit index type will not be demoted even if it is wider than required. - - Examples - -------- - >>> import pandas as pd - >>> import pyarrow as pa - >>> pa.array(pd.Series([1, 2])) - - [ - 1, - 2 - ] - - >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string())) - - ... 
- -- dictionary: - [ - "a", - "b" - ] - -- indices: - [ - 0, - 1, - 0 - ] - - >>> import numpy as np - >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool)) - - [ - 1, - null - ] - - >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())) - >>> arr.type.index_type - DataType(int16) - """ - -@overload -def asarray(values: NullableCollection[bool]) -> BooleanArray: ... -@overload -def asarray(values: NullableCollection[int]) -> Int64Array: ... -@overload -def asarray(values: NullableCollection[float]) -> DoubleArray: ... -@overload -def asarray(values: NullableCollection[Decimal]) -> Decimal128Array: ... -@overload -def asarray(values: NullableCollection[dict[str, Any]]) -> StructArray: ... -@overload -def asarray(values: NullableCollection[dt.date]) -> Date32Array: ... -@overload -def asarray(values: NullableCollection[dt.time]) -> Time64Array: ... -@overload -def asarray(values: NullableCollection[dt.timedelta]) -> DurationArray: ... -@overload -def asarray(values: NullableCollection[MonthDayNano]) -> MonthDayNanoIntervalArray: ... -@overload -def asarray(values: NullableCollection[list[Any]]) -> ListArray[Any]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["null"] | types.NullType, -) -> NullArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["bool", "boolean"] | types.BoolType, -) -> BooleanArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i1", "int8"] | types.Int8Type, -) -> Int8Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i2", "int16"] | types.Int16Type, -) -> Int16Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i4", "int32"] | types.Int32Type, -) -> Int32Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i8", "int64"] | types.Int64Type, -) -> Int64Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u1", "uint8"] | types.UInt8Type, -) -> UInt8Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u2", "uint16"] | types.UInt16Type, -) -> UInt16Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u4", "uint32"] | types.Uint32Type, -) -> UInt32Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u8", "uint64"] | types.UInt64Type, -) -> UInt64Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f2", "halffloat", "float16"] | types.Float16Type, -) -> HalfFloatArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f4", "float", "float32"] | types.Float32Type, -) -> FloatArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f8", "double", "float64"] | types.Float64Type, -) -> DoubleArray: ... 
-@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["string", "str", "utf8"] | types.StringType, -) -> StringArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["binary"] | types.BinaryType, -) -> BinaryArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, -) -> LargeStringArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["large_binary"] | types.LargeBinaryType, -) -> LargeBinaryArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["binary_view"] | types.BinaryViewType, -) -> BinaryViewArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["string_view"] | types.StringViewType, -) -> StringViewArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["date32", "date32[day]"] | types.Date32Type, -) -> Date32Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["date64", "date64[ms]"] | types.Date64Type, -) -> Date64Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], -) -> Time32Array[Literal["s"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], -) -> Time32Array[Literal["ms"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], -) -> Time64Array[Literal["us"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], -) -> Time64Array[Literal["ns"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], -) -> TimestampArray[Literal["s"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], -) -> TimestampArray[Literal["ms"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], -) -> TimestampArray[Literal["us"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[ns]"] | types.TimestampType[Literal["ns"]], -) -> TimestampArray[Literal["ns"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[s]"] | types.DurationType[Literal["s"]], -) -> DurationArray[Literal["s"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], -) -> DurationArray[Literal["ms"]]: ... 
-@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[us]"] | types.DurationType[Literal["us"]], -) -> DurationArray[Literal["us"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], -) -> DurationArray[Literal["ns"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, -) -> MonthDayNanoIntervalArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: _DataTypeT, -) -> Array[Scalar[_DataTypeT]]: ... -def asarray(*args, **kwargs): - """ - Convert to pyarrow.Array, inferring type if not provided. - - Parameters - ---------- - values : array-like - This can be a sequence, numpy.ndarray, pyarrow.Array or - pyarrow.ChunkedArray. If a ChunkedArray is passed, the output will be - a ChunkedArray, otherwise the output will be a Array. - type : string or DataType - Explicitly construct the array with this type. Attempt to cast if - indicated type is different. - - Returns - ------- - arr : Array or ChunkedArray - """ - -@overload -def nulls(size: int, memory_pool: MemoryPool | None = None) -> NullArray: ... -@overload -def nulls( - size: int, type: types.NullType | None, memory_pool: MemoryPool | None = None -) -> NullArray: ... -@overload -def nulls( - size: int, type: types.BoolType, memory_pool: MemoryPool | None = None -) -> BooleanArray: ... -@overload -def nulls(size: int, type: types.Int8Type, memory_pool: MemoryPool | None = None) -> Int8Array: ... -@overload -def nulls( - size: int, type: types.Int16Type, memory_pool: MemoryPool | None = None -) -> Int16Array: ... -@overload -def nulls( - size: int, type: types.Int32Type, memory_pool: MemoryPool | None = None -) -> Int32Array: ... -@overload -def nulls( - size: int, type: types.Int64Type, memory_pool: MemoryPool | None = None -) -> Int64Array: ... -@overload -def nulls( - size: int, type: types.UInt8Type, memory_pool: MemoryPool | None = None -) -> UInt8Array: ... -@overload -def nulls( - size: int, type: types.UInt16Type, memory_pool: MemoryPool | None = None -) -> UInt16Array: ... -@overload -def nulls( - size: int, type: types.Uint32Type, memory_pool: MemoryPool | None = None -) -> UInt32Array: ... -@overload -def nulls( - size: int, type: types.UInt64Type, memory_pool: MemoryPool | None = None -) -> UInt64Array: ... -@overload -def nulls( - size: int, type: types.Float16Type, memory_pool: MemoryPool | None = None -) -> HalfFloatArray: ... -@overload -def nulls( - size: int, type: types.Float32Type, memory_pool: MemoryPool | None = None -) -> FloatArray: ... -@overload -def nulls( - size: int, type: types.Float64Type, memory_pool: MemoryPool | None = None -) -> DoubleArray: ... -@overload -def nulls( - size: int, type: types.Decimal32Type, memory_pool: MemoryPool | None = None -) -> Decimal128Array: ... -@overload -def nulls( - size: int, type: types.Decimal64Type, memory_pool: MemoryPool | None = None -) -> Decimal128Array: ... -@overload -def nulls( - size: int, type: types.Decimal128Type, memory_pool: MemoryPool | None = None -) -> Decimal128Array: ... -@overload -def nulls( - size: int, type: types.Decimal256Type, memory_pool: MemoryPool | None = None -) -> Decimal256Array: ... 
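# Illustrative sketch, not part of this patch: asarray(), whose per-type
# overloads are consolidated in this hunk into a single signature further
# below, infers the Arrow type when none is given and attempts a cast when an
# explicit type is passed.
import pyarrow as pa

a = pa.asarray([1, 2, 3])                     # type inferred as int64
b = pa.asarray([1, 2, 3], type=pa.float32())  # cast to the requested float32
assert a.type == pa.int64()
assert b.type == pa.float32()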
-@overload -def nulls( - size: int, type: types.Date32Type, memory_pool: MemoryPool | None = None -) -> Date32Array: ... -@overload -def nulls( - size: int, type: types.Date64Type, memory_pool: MemoryPool | None = None -) -> Date64Array: ... -@overload -def nulls( - size: int, type: types.Time32Type[types._Time32Unit], memory_pool: MemoryPool | None = None -) -> Time32Array[types._Time32Unit]: ... -@overload -def nulls( - size: int, type: types.Time64Type[types._Time64Unit], memory_pool: MemoryPool | None = None -) -> Time64Array[types._Time64Unit]: ... -@overload -def nulls( - size: int, - type: types.TimestampType[types._Unit, types._Tz], - memory_pool: MemoryPool | None = None, -) -> TimestampArray[types._Unit, types._Tz]: ... -@overload -def nulls( - size: int, type: types.DurationType[types._Unit], memory_pool: MemoryPool | None = None -) -> DurationArray[types._Unit]: ... -@overload -def nulls( - size: int, type: types.MonthDayNanoIntervalType, memory_pool: MemoryPool | None = None -) -> MonthDayNanoIntervalArray: ... -@overload -def nulls( - size: int, - type: types.BinaryType, - memory_pool: MemoryPool | None = None, -) -> BinaryArray: ... -@overload -def nulls( - size: int, - type: types.LargeBinaryType, - memory_pool: MemoryPool | None = None, -) -> LargeBinaryArray: ... -@overload -def nulls( - size: int, - type: types.FixedSizeBinaryType, - memory_pool: MemoryPool | None = None, -) -> FixedSizeBinaryArray: ... -@overload -def nulls( - size: int, - type: types.StringType, - memory_pool: MemoryPool | None = None, -) -> StringArray: ... -@overload -def nulls( - size: int, - type: types.LargeStringType, - memory_pool: MemoryPool | None = None, -) -> LargeStringArray: ... -@overload -def nulls( - size: int, - type: types.BinaryViewType, - memory_pool: MemoryPool | None = None, -) -> BinaryViewArray: ... -@overload -def nulls( - size: int, - type: types.StringViewType, - memory_pool: MemoryPool | None = None, -) -> StringViewArray: ... -@overload -def nulls( - size: int, - type: types.LargeListType[_DataTypeT], - memory_pool: MemoryPool | None = None, -) -> LargeListArray[_DataTypeT]: ... -@overload -def nulls( - size: int, - type: types.ListViewType[_DataTypeT], - memory_pool: MemoryPool | None = None, -) -> ListViewArray[_DataTypeT]: ... -@overload -def nulls( - size: int, - type: types.LargeListViewType[_DataTypeT], - memory_pool: MemoryPool | None = None, -) -> LargeListViewArray[_DataTypeT]: ... -@overload -def nulls( - size: int, - type: types.FixedSizeListType[_DataTypeT, _Size], - memory_pool: MemoryPool | None = None, -) -> FixedSizeListArray[_DataTypeT, _Size]: ... -@overload -def nulls( - size: int, - type: types.ListType[_DataTypeT], - memory_pool: MemoryPool | None = None, -) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... -@overload -def nulls( - size: int, - type: types.StructType, - memory_pool: MemoryPool | None = None, -) -> StructArray: ... -@overload -def nulls( - size: int, - type: types.MapType[_MapKeyT, _MapItemT], - memory_pool: MemoryPool | None = None, -) -> MapArray[_MapKeyT, _MapItemT]: ... -@overload -def nulls( - size: int, - type: types.DictionaryType[_IndexT, _BasicValueT], - memory_pool: MemoryPool | None = None, -) -> DictionaryArray[_IndexT, _BasicValueT]: ... -@overload -def nulls( - size: int, - type: types.RunEndEncodedType[_RunEndType, _BasicValueT], - memory_pool: MemoryPool | None = None, -) -> RunEndEncodedArray[_RunEndType, _BasicValueT]: ... 
-@overload -def nulls( - size: int, - type: types.UnionType, - memory_pool: MemoryPool | None = None, -) -> UnionArray: ... -@overload -def nulls( - size: int, - type: types.FixedShapeTensorType[types._ValueT], - memory_pool: MemoryPool | None = None, -) -> FixedShapeTensorArray[Any]: ... -@overload -def nulls( - size: int, - type: types.Bool8Type, - memory_pool: MemoryPool | None = None, -) -> Bool8Array: ... -@overload -def nulls( - size: int, - type: types.UuidType, - memory_pool: MemoryPool | None = None, -) -> UuidArray[Any]: ... -@overload -def nulls( - size: int, - type: types.JsonType, - memory_pool: MemoryPool | None = None, -) -> JsonArray[Any]: ... -@overload -def nulls( - size: int, - type: types.OpaqueType, - memory_pool: MemoryPool | None = None, -) -> OpaqueArray[Any]: ... -@overload -def nulls( - size: int, - type: types.ExtensionType, - memory_pool: MemoryPool | None = None, -) -> ExtensionArray[Any]: ... -def nulls(*args, **kwargs): +) -> ArrayLike: """ - Create a strongly-typed Array instance with all elements null. + Create pyarrow.Array instance from a Python object. Parameters ---------- - size : int - Array length. - type : pyarrow.DataType, default None - Explicit type for the array. By default use NullType. - memory_pool : MemoryPool, default None - Arrow MemoryPool to use for allocations. Uses the default memory - pool if not passed. + obj : sequence, iterable, ndarray, pandas.Series, Arrow-compatible array + If both type and size are specified may be a single use iterable. If + not strongly-typed, Arrow type will be inferred for resulting array. + Any Arrow-compatible array that implements the Arrow PyCapsule Protocol + (has an ``__arrow_c_array__`` or ``__arrow_c_device_array__`` method) + can be passed as well. + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred from + the data. + mask : array[bool], optional + Indicate which values are null (True) or not null (False). + size : int64, optional + Size of the elements. If the input is larger than size bail at this + length. For iterators, if size is larger than the input iterator this + will be treated as a "max size", but will involve an initial allocation + of size followed by a resize to the actual size (so if you know the + exact size specifying it correctly will give you better performance). + from_pandas : bool, default None + Use pandas's semantics for inferring nulls from values in + ndarray-like data. If passed, the mask tasks precedence, but + if a value is unmasked (not-null), but still null according to + pandas semantics, then it is null. Defaults to False if not + passed explicitly by user, or True if a pandas object is + passed in. + safe : bool, default True + Check for overflows or other unsafe conversions. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. Returns ------- - arr : Array + array : pyarrow.Array or pyarrow.ChunkedArray + A ChunkedArray instead of an Array is returned if: + + - the object data overflowed binary storage. + - the object's ``__arrow_array__`` protocol method returned a chunked + array. + + Notes + ----- + Timezone will be preserved in the returned array for timezone-aware data, + else no timezone will be returned for naive timestamps. + Internally, UTC values are stored for timezone-aware data with the + timezone set in the data type. 
+ + Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by + default converted as MonthDayNanoIntervalArray. relativedelta leapdays + are ignored as are all absolute fields on both objects. datetime.timedelta + can also be converted to MonthDayNanoIntervalArray but this requires + passing MonthDayNanoIntervalType explicitly. + + Converting to dictionary array will promote to a wider integer type for + indices if the number of distinct values cannot be represented, even if + the index type was explicitly set. This means that if there are more than + 127 values the returned dictionary array's index type will be at least + pa.int16() even if pa.int8() was passed to the function. Note that an + explicit index type will not be demoted even if it is wider than required. Examples -------- + >>> import pandas as pd >>> import pyarrow as pa - >>> pa.nulls(10) - - 10 nulls - - >>> pa.nulls(3, pa.uint32()) - + >>> pa.array(pd.Series([1, 2])) + [ - null, - null, - null - ] - """ - -@overload -def repeat( - value: None | scalar.NullScalar, size: int, memory_pool: MemoryPool | None = None -) -> NullArray: ... -@overload -def repeat( # type: ignore[overload-overlap] - value: bool | scalar.BooleanScalar, size: int, memory_pool: MemoryPool | None = None -) -> BooleanArray: ... -@overload -def repeat( - value: scalar.Int8Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Int8Array: ... -@overload -def repeat( - value: scalar.Int16Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Int16Array: ... -@overload -def repeat( - value: scalar.Int32Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Int32Array: ... -@overload -def repeat( - value: int | scalar.Int64Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Int64Array: ... -@overload -def repeat( - value: scalar.UInt8Scalar, size: int, memory_pool: MemoryPool | None = None -) -> UInt8Array: ... -@overload -def repeat( - value: scalar.UInt16Scalar, size: int, memory_pool: MemoryPool | None = None -) -> UInt16Array: ... -@overload -def repeat( - value: scalar.UInt32Scalar, size: int, memory_pool: MemoryPool | None = None -) -> UInt32Array: ... -@overload -def repeat( - value: scalar.UInt64Scalar, size: int, memory_pool: MemoryPool | None = None -) -> UInt64Array: ... -@overload -def repeat( - value: scalar.HalfFloatScalar, size: int, memory_pool: MemoryPool | None = None -) -> HalfFloatArray: ... -@overload -def repeat( - value: scalar.FloatScalar, size: int, memory_pool: MemoryPool | None = None -) -> FloatArray: ... -@overload -def repeat( - value: float | scalar.DoubleScalar, size: int, memory_pool: MemoryPool | None = None -) -> DoubleArray: ... -@overload -def repeat( - value: Decimal | scalar.Decimal32Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Decimal32Array: ... -@overload -def repeat( - value: scalar.Decimal64Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Decimal64Array: ... -@overload -def repeat( - value: scalar.Decimal128Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Decimal128Array: ... -@overload -def repeat( - value: scalar.Decimal256Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Decimal256Array: ... -@overload -def repeat( - value: dt.date | scalar.Date32Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Date32Array: ... -@overload -def repeat( - value: scalar.Date64Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Date64Array: ... 
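# Illustrative sketch, not part of this patch: with the collapsed array()
# overloads above, the stub declares a broad ArrayLike return type, so callers
# that need a concrete array class narrow it themselves. Runtime behaviour of
# pyarrow is unchanged; only the static return type is wider.
import pyarrow as pa

arr = pa.array([1, 2, 3])  # runtime type is Int64Array; declared type is ArrayLike
assert isinstance(arr, pa.Int64Array)
assert arr.type == pa.int64()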
-@overload -def repeat( - value: scalar.Time32Scalar[types._Time32Unit], size: int, memory_pool: MemoryPool | None = None -) -> Time32Array[types._Time32Unit]: ... -@overload -def repeat( - value: dt.time | scalar.Time64Scalar[types._Time64Unit], - size: int, - memory_pool: MemoryPool | None = None, -) -> Time64Array[types._Time64Unit]: ... -@overload -def repeat( - value: scalar.TimestampScalar[types._Unit, types._Tz], - size: int, - memory_pool: MemoryPool | None = None, -) -> TimestampArray[types._Unit, types._Tz]: ... -@overload -def repeat( - value: dt.timedelta | scalar.DurationScalar[types._Unit], - size: int, - memory_pool: MemoryPool | None = None, -) -> DurationArray[types._Unit]: ... -@overload -def repeat( # pyright: ignore[reportOverlappingOverload] - value: MonthDayNano | scalar.MonthDayNanoIntervalScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> MonthDayNanoIntervalArray: ... -@overload -def repeat( - value: bytes | scalar.BinaryScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> BinaryArray: ... -@overload -def repeat( - value: scalar.LargeBinaryScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> LargeBinaryArray: ... -@overload -def repeat( - value: scalar.FixedSizeBinaryScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> FixedSizeBinaryArray: ... -@overload -def repeat( - value: str | scalar.StringScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> StringArray: ... -@overload -def repeat( - value: scalar.LargeStringScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> LargeStringArray: ... -@overload -def repeat( - value: scalar.BinaryViewScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> BinaryViewArray: ... -@overload -def repeat( - value: scalar.StringViewScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> StringViewArray: ... -@overload -def repeat( - value: list[Any] | tuple[Any] | scalar.ListScalar[_DataTypeT], - size: int, - memory_pool: MemoryPool | None = None, -) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... -@overload -def repeat( - value: scalar.FixedSizeListScalar[_DataTypeT, _Size], - size: int, - memory_pool: MemoryPool | None = None, -) -> FixedSizeListArray[_DataTypeT, _Size]: ... -@overload -def repeat( - value: scalar.LargeListScalar[_DataTypeT], - size: int, - memory_pool: MemoryPool | None = None, -) -> LargeListArray[_DataTypeT]: ... -@overload -def repeat( - value: scalar.ListViewScalar[_DataTypeT], - size: int, - memory_pool: MemoryPool | None = None, -) -> ListViewArray[_DataTypeT]: ... -@overload -def repeat( - value: scalar.LargeListViewScalar[_DataTypeT], - size: int, - memory_pool: MemoryPool | None = None, -) -> LargeListViewArray[_DataTypeT]: ... -@overload -def repeat( - value: dict[str, Any] | scalar.StructScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> StructArray: ... -@overload -def repeat( - value: scalar.MapScalar[_MapKeyT, _MapItemT], - size: int, - memory_pool: MemoryPool | None = None, -) -> MapArray[_MapKeyT, _MapItemT]: ... -@overload -def repeat( - value: scalar.DictionaryScalar[_IndexT, _BasicValueT], - size: int, - memory_pool: MemoryPool | None = None, -) -> DictionaryArray[_IndexT, _BasicValueT]: ... -@overload -def repeat( - value: scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT], - size: int, - memory_pool: MemoryPool | None = None, -) -> RunEndEncodedArray[_RunEndType, _BasicValueT]: ... 
-@overload -def repeat( - value: scalar.UnionScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> UnionArray: ... -@overload -def repeat( - value: scalar.FixedShapeTensorScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> FixedShapeTensorArray[Any]: ... -@overload -def repeat( - value: scalar.Bool8Scalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> Bool8Array: ... -@overload -def repeat( - value: scalar.UuidScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> UuidArray[Any]: ... -@overload -def repeat( - value: scalar.JsonScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> JsonArray[Any]: ... -@overload -def repeat( - value: scalar.OpaqueScalar, + 1, + 2 + ] + + >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string())) + + ... + -- dictionary: + [ + "a", + "b" + ] + -- indices: + [ + 0, + 1, + 0 + ] + + >>> import numpy as np + >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool)) + + [ + 1, + null + ] + + >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())) + >>> arr.type.index_type + DataType(int16) + """ + +def asarray( + values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: _DataTypeT | Any | None = None, +) -> Array[Scalar[_DataTypeT]] | ArrayLike: + """ + Convert to pyarrow.Array, inferring type if not provided. + + Parameters + ---------- + values : array-like + This can be a sequence, numpy.ndarray, pyarrow.Array or + pyarrow.ChunkedArray. If a ChunkedArray is passed, the output will be + a ChunkedArray, otherwise the output will be a Array. + type : string or DataType + Explicitly construct the array with this type. Attempt to cast if + indicated type is different. + + Returns + ------- + arr : Array or ChunkedArray + """ + +def nulls( size: int, + type: Any | None = None, memory_pool: MemoryPool | None = None, -) -> OpaqueArray[Any]: ... -@overload +) -> ArrayLike: + """ + Create a strongly-typed Array instance with all elements null. + + Parameters + ---------- + size : int + Array length. + type : pyarrow.DataType, default None + Explicit type for the array. By default use NullType. + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + + Returns + ------- + arr : Array + + Examples + -------- + >>> import pyarrow as pa + >>> pa.nulls(10) + + 10 nulls + + >>> pa.nulls(3, pa.uint32()) + + [ + null, + null, + null + ] + """ + def repeat( - value: scalar.ExtensionScalar, + value: Any, size: int, memory_pool: MemoryPool | None = None, -) -> ExtensionArray[Any]: ... -def repeat(*args, **kwargs): +) -> ArrayLike: """ Create an Array instance whose slots are the given scalar. @@ -1427,7 +298,7 @@ def repeat(*args, **kwargs): "string" ] - >>> pa.repeat(pa.scalar({"a": 1, "b": [1, 2]}), 2) + >>> pa.repeat(pa.scalar({'a': 1, 'b': [1, 2]}), 2) -- is_valid: all not null -- child 0 type: int64 @@ -1620,13 +491,10 @@ class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): Convert a Table to pandas DataFrame: - >>> table = pa.table( - ... [ - ... pa.array([2, 4, 5, 100]), - ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), - ... ], - ... names=["n_legs", "animals"], - ... ) + >>> table = pa.table([ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + ... 
], names=['n_legs', 'animals']) >>> table.to_pandas() n_legs animals 0 2 Flamingo @@ -1641,7 +509,8 @@ class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): >>> import pyarrow as pa >>> n_legs = pa.array([2, 4, 5, 100]) >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"]) + >>> batch = pa.record_batch([n_legs, animals], + ... names=["n_legs", "animals"]) >>> batch pyarrow.RecordBatch n_legs: int64 @@ -1705,7 +574,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): >>> import pyarrow as pa >>> left = pa.array(["one", "two", "three"]) >>> right = pa.array(["two", None, "two-and-a-half", "three"]) - >>> print(left.diff(right)) # doctest: +SKIP + >>> print(left.diff(right)) # doctest: +SKIP @@ -0, +0 @@ -"one" @@ -1798,7 +667,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): encoded : DictionaryArray A dictionary-encoded version of this array. """ - def value_count(self) -> StructArray: + def value_counts(self) -> StructArray: """ Compute counts of unique elements in array. @@ -1807,27 +676,15 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): StructArray An array of structs """ - @overload - @staticmethod - def from_pandas( - obj: pd.Series | np.ndarray | ArrayLike, - *, - mask: Mask | None = None, - type: _DataTypeT, - safe: bool = True, - memory_pool: MemoryPool | None = None, - ) -> Array[Scalar[_DataTypeT]]: ... - @overload @staticmethod def from_pandas( obj: pd.Series | np.ndarray | ArrayLike, *, mask: Mask | None = None, + type: _DataTypeT | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, - ) -> Array[Scalar]: ... - @staticmethod - def from_pandas(*args, **kwargs): + ) -> Array[Scalar[_DataTypeT]] | Array[Scalar]: """ Convert pandas.Series to an Arrow Array. @@ -1926,7 +783,10 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): only be counted once. """ def __sizeof__(self) -> int: ... - def __iter__(self) -> Iterator[_Scalar_co]: ... + def __iter__(self) -> Iterator[_Scalar_co]: + """ + Implement iter(self). + """ def to_string( self, *, @@ -1961,10 +821,24 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): skip_new_lines : bool If the array should be rendered as a single line of text or if each element should be on its own line. + element_size_limit : int, default 100 + Maximum number of characters of a single element before it is truncated. """ format = to_string - def equals(self, other: Self | Iterable[Any]) -> bool: ... - def __len__(self) -> int: ... + def equals(self, other: Self) -> bool: + """ + Parameters + ---------- + other : pyarrow.Array + + Returns + ------- + bool + """ + def __len__(self) -> int: + """ + Return len(self). + """ def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: """ Return BooleanArray indicating the null values. @@ -1991,7 +865,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): Return BooleanArray indicating the non-null values. """ def fill_null( - self: Array[Scalar[_BasicDataType[_AsPyType]]] | Array[Scalar[_DataTypeT]], fill_value: Scalar[_DataTypeT] | _AsPyType | str | None + self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: """ See :func:`pyarrow.compute.fill_null` for usage. @@ -2006,11 +880,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): result : Array A new array with nulls replaced by the given value. 
""" - @overload - def __getitem__(self, key: int) -> _Scalar_co: ... - @overload - def __getitem__(self, key: slice) -> Self: ... - def __getitem__(self, key): + def __getitem__(self, key: int | slice) -> _Scalar_co | Self: """ Slice or return value at given index @@ -2085,25 +955,15 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): An array of the same type, with only the elements selected by the boolean mask. """ - @overload - def index( - self: Array[_ScalarT], - value: _ScalarT, - start: int | None = None, - end: int | None = None, - *, - memory_pool: MemoryPool | None = None, - ) -> scalar.Int64Scalar: ... - @overload + def index( - self: Array[Scalar[_BasicDataType[_AsPyType]]], - value: _AsPyType | None, + self: Array[_ScalarT] | Array[Scalar[_BasicDataType[_AsPyType]]], + value: _ScalarT | _AsPyType, start: int | None = None, end: int | None = None, *, memory_pool: MemoryPool | None = None, - ) -> scalar.Int64Scalar: ... - def index(self, *args, **kwargs): + ) -> scalar.Int64Scalar | scalar.Int64Scalar: """ Find the first index of a value. @@ -2171,9 +1031,9 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): array : numpy.ndarray """ def to_pylist( - self: Array[Scalar[_BasicDataType[_AsPyType]]] | Array[Scalar[ListType[Any]]] | StructArray | DictionaryArray[Unknown, Unknown], + self: Array[Scalar[_BasicDataType[_AsPyType]]], *, - map_as_pydicts: Literal["lossy", "strict"] | None = None, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, ) -> list[_AsPyType | None]: """ Convert to a list of native Python objects. @@ -2363,7 +1223,8 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): @classmethod def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... def __dlpack__(self, stream: int | None = None) -> Any: - """Export a primitive array as a DLPack capsule. + """ + Export a primitive array as a DLPack capsule. Parameters ---------- @@ -2372,163 +1233,307 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): Stream is provided by the consumer to the producer to instruct the producer to ensure that operations can safely be performed on the array. - Returns - ------- - capsule : PyCapsule - A DLPack capsule for the array, pointing to a DLManagedTensor. + Returns + ------- + capsule : PyCapsule + A DLPack capsule for the array, pointing to a DLManagedTensor. + """ + def __dlpack_device__(self) -> tuple[int, int]: + """ + Return the DLPack device tuple this arrays resides on. + + Returns + ------- + tuple : Tuple[int, int] + Tuple with index specifying the type of the device (where + CPU = 1, see cpp/src/arrow/c/dpack_abi.h) and index of the + device which is 0 by default for CPU. + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + The device type where the array resides. + + Returns + ------- + DeviceAllocationType + """ + + @property + def is_cpu(self) -> bool: + """ + Whether the array is CPU-accessible. + """ + @property + def statistics(self) -> ArrayStatistics | None: + """ + Statistics of the array. + """ + +class NullArray(Array[scalar.NullScalar]): + """ + Concrete class for Arrow arrays of null data type. + """ + +class BooleanArray(Array[scalar.BooleanScalar]): + """ + Concrete class for Arrow arrays of boolean data type. + """ + @property + def false_count(self) -> int: ... + @property + def true_count(self) -> int: ... + +class NumericArray(Array[_ScalarT]): + """ + A base class for Arrow numeric arrays. 
+ """ +class IntegerArray(NumericArray[_ScalarT]): + """ + A base class for Arrow integer arrays. + """ +class FloatingPointArray(NumericArray[_ScalarT]): + """ + A base class for Arrow floating-point arrays. + """ +class Int8Array(IntegerArray[scalar.Int8Scalar]): + """ + Concrete class for Arrow arrays of int8 data type. + """ +class UInt8Array(IntegerArray[scalar.UInt8Scalar]): + """ + Concrete class for Arrow arrays of uint8 data type. + """ +class Int16Array(IntegerArray[scalar.Int16Scalar]): + """ + Concrete class for Arrow arrays of int16 data type. + """ +class UInt16Array(IntegerArray[scalar.UInt16Scalar]): + """ + Concrete class for Arrow arrays of uint16 data type. + """ +class Int32Array(IntegerArray[scalar.Int32Scalar]): + """ + Concrete class for Arrow arrays of int32 data type. + """ +class UInt32Array(IntegerArray[scalar.UInt32Scalar]): + """ + Concrete class for Arrow arrays of uint32 data type. + """ +class Int64Array(IntegerArray[scalar.Int64Scalar]): + """ + Concrete class for Arrow arrays of int64 data type. + """ +class UInt64Array(IntegerArray[scalar.UInt64Scalar]): + """ + Concrete class for Arrow arrays of uint64 data type. + """ +class Date32Array(NumericArray[scalar.Date32Scalar]): + """ + Concrete class for Arrow arrays of date32 data type. + """ +class Date64Array(NumericArray[scalar.Date64Scalar]): + """ + Concrete class for Arrow arrays of date64 data type. + """ +class TimestampArray(NumericArray[scalar.TimestampScalar[types._Unit, types._Tz]]): + """ + Concrete class for Arrow arrays of timestamp data type. + """ +class Time32Array(NumericArray[scalar.Time32Scalar[types._Time32Unit]]): + """ + Concrete class for Arrow arrays of time32 data type. + """ +class Time64Array(NumericArray[scalar.Time64Scalar[types._Time64Unit]]): + """ + Concrete class for Arrow arrays of time64 data type. + """ +class DurationArray(NumericArray[scalar.DurationScalar[types._Unit]]): + """ + Concrete class for Arrow arrays of duration data type. + """ +class MonthDayNanoIntervalArray(Array[scalar.MonthDayNanoIntervalScalar]): + """ + Concrete class for Arrow arrays of interval[MonthDayNano] type. + """ +class HalfFloatArray(FloatingPointArray[scalar.HalfFloatScalar]): + """ + Concrete class for Arrow arrays of float16 data type. + """ +class FloatArray(FloatingPointArray[scalar.FloatScalar]): + """ + Concrete class for Arrow arrays of float32 data type. + """ +class DoubleArray(FloatingPointArray[scalar.DoubleScalar]): + """ + Concrete class for Arrow arrays of float64 data type. + """ +class FixedSizeBinaryArray(Array[scalar.FixedSizeBinaryScalar]): + """ + Concrete class for Arrow arrays of a fixed-size binary data type. + """ +class Decimal32Array(FixedSizeBinaryArray): + """ + """ +class Decimal64Array(FixedSizeBinaryArray): + """ + Concrete class for Arrow arrays of decimal64 data type. + """ +class Decimal128Array(FixedSizeBinaryArray): + """ + Concrete class for Arrow arrays of decimal128 data type. + """ +class Decimal256Array(FixedSizeBinaryArray): + """ + Concrete class for Arrow arrays of decimal256 data type. + """ + +class BaseListArray(Array[_ScalarT]): + def flatten(self, recursive: bool = False) -> Array: + """ + Unnest this [Large]ListArray/[Large]ListViewArray/FixedSizeListArray + according to 'recursive'. + + Note that this method is different from ``self.values`` in that + it takes care of the slicing offset as well as null elements backed + by non-empty sub-lists. 
+ + Parameters + ---------- + recursive : bool, default False, optional + When True, flatten this logical list-array recursively until an + array of non-list values is formed. + + When False, flatten only the top level. + + Returns + ------- + result : Array + + Examples + -------- + + Basic logical list-array's flatten + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + + When recursive=True, nested list arrays are flattened recursively + until an array of non-list values is formed. + + >>> array = pa.array([ + ... None, + ... [ + ... [1, None, 2], + ... None, + ... [3, 4] + ... ], + ... [], + ... [ + ... [], + ... [5, 6], + ... None + ... ], + ... [ + ... [7, 8] + ... ] + ... ], type=pa.list_(pa.list_(pa.int64()))) + >>> array.flatten(True) + + [ + 1, + null, + 2, + 3, + 4, + 5, + 6, + 7, + 8 + ] """ - def __dlpack_device__(self) -> tuple[int, int]: + def value_parent_indices(self) -> Int64Array: """ - Return the DLPack device tuple this arrays resides on. + Return array of same length as list child values array where each + output value is the index of the parent list array slot containing each + child value. - Returns - ------- - tuple : Tuple[int, int] - Tuple with index specifying the type of the device (where - CPU = 1, see cpp/src/arrow/c/dpack_abi.h) and index of the - device which is 0 by default for CPU. - """ - @property - def device_type(self) -> DeviceAllocationType: + Examples + -------- + >>> import pyarrow as pa + >>> arr = pa.array([[1, 2, 3], [], None, [4]], + ... type=pa.list_(pa.int32())) + >>> arr.value_parent_indices() + + [ + 0, + 0, + 0, + 3 + ] """ - The device type where the array resides. - - Returns - ------- - DeviceAllocationType + def value_lengths(self) -> Int32Array: """ + Return integers array with values equal to the respective length of + each list element. Null list values are null in the output. - @property - def is_cpu(self) -> bool: - """ - Whether the array is CPU-accessible. - """ - @property - def statistics(self) -> ArrayStatistics | None: - """ - Statistics of the array. + Examples + -------- + >>> import pyarrow as pa + >>> arr = pa.array([[1, 2, 3], [], None, [4]], + ... type=pa.list_(pa.int32())) + >>> arr.value_lengths() + + [ + 3, + 0, + null, + 1 + ] """ -class NullArray(Array[scalar.NullScalar]): ... - -class BooleanArray(Array[scalar.BooleanScalar]): - @property - def false_count(self) -> int: ... - @property - def true_count(self) -> int: ... - -class NumericArray(Array[_ScalarT]): ... -class IntegerArray(NumericArray[_ScalarT]): ... -class FloatingPointArray(NumericArray[_ScalarT]): ... -class Int8Array(IntegerArray[scalar.Int8Scalar]): ... -class UInt8Array(IntegerArray[scalar.UInt8Scalar]): ... -class Int16Array(IntegerArray[scalar.Int16Scalar]): ... -class UInt16Array(IntegerArray[scalar.UInt16Scalar]): ... -class Int32Array(IntegerArray[scalar.Int32Scalar]): ... -class UInt32Array(IntegerArray[scalar.UInt32Scalar]): ... -class Int64Array(IntegerArray[scalar.Int64Scalar]): ... -class UInt64Array(IntegerArray[scalar.UInt64Scalar]): ... -class Date32Array(NumericArray[scalar.Date32Scalar]): ... -class Date64Array(NumericArray[scalar.Date64Scalar]): ... -class TimestampArray(NumericArray[scalar.TimestampScalar[types._Unit, types._Tz]]): ... 
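# A short sketch (assuming standard pyarrow type inference) of how the concrete
# array classes annotated above surface at runtime; the literal values are
# illustrative only.
import pyarrow as pa

ints = pa.array([1, 2, None])          # -> Int64Array, elements are Int64Scalar
floats = pa.array([1.5, 2.5])          # -> DoubleArray
flags = pa.array([True, False, None])  # -> BooleanArray
flags.true_count                       # 1, per the BooleanArray properties above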
-class Time32Array(NumericArray[scalar.Time32Scalar[types._Time32Unit]]): ... -class Time64Array(NumericArray[scalar.Time64Scalar[types._Time64Unit]]): ... -class DurationArray(NumericArray[scalar.DurationScalar[types._Unit]]): ... -class MonthDayNanoIntervalArray(Array[scalar.MonthDayNanoIntervalScalar]): ... -class HalfFloatArray(FloatingPointArray[scalar.HalfFloatScalar]): ... -class FloatArray(FloatingPointArray[scalar.FloatScalar]): ... -class DoubleArray(FloatingPointArray[scalar.DoubleScalar]): ... -class FixedSizeBinaryArray(Array[scalar.FixedSizeBinaryScalar]): ... -class Decimal32Array(FixedSizeBinaryArray): ... -class Decimal64Array(FixedSizeBinaryArray): ... -class Decimal128Array(FixedSizeBinaryArray): ... -class Decimal256Array(FixedSizeBinaryArray): ... - -class BaseListArray(Array[_ScalarT]): - def flatten(self, recursive: bool = False) -> Array: ... - def value_parent_indices(self) -> Int64Array: ... - def value_lengths(self) -> Int32Array: ... - class ListArray(BaseListArray[_ScalarT]): - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array | list[int], - values: Array[Scalar[_DataTypeT]], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array | list[int], - values: list[int], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListArray[scalar.ListScalar[types.Int64Type]]: ... - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array | list[int], - values: list[float], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListArray[scalar.ListScalar[types.Float64Type]]: ... - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array | list[int], - values: list[str], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListArray[scalar.ListScalar[types.StringType]]: ... - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array | list[int], - values: list[bytes], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListArray[scalar.ListScalar[types.BinaryType]]: ... - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array | list[int], - values: list, - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListArray: ... - @overload + """ + Concrete class for Arrow arrays of a list data type. + """ @classmethod def from_arrays( cls, offsets: Int32Array | list[int], - values: Array | list, + values: Array[Scalar[_DataTypeT]] | list[int] | list[float] | list[str] | list[bytes] | list, *, - type: _DataTypeT, + type: _DataTypeT | None = None, pool: MemoryPool | None = None, mask: Mask | None = None, - ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... - @classmethod - def from_arrays(cls, *args, **kwargs): + ) -> ListArray[scalar.ListScalar[_DataTypeT | types.Int64Type | types.Float64Type | types.StringType | types.BinaryType]] | ListArray: """ Construct ListArray from arrays of int32 offsets and values. 
@@ -2646,7 +1651,6 @@ class ListArray(BaseListArray[_ScalarT]): null, 6 ] - """ @property def offsets(self) -> Int32Array: @@ -2676,30 +1680,21 @@ class ListArray(BaseListArray[_ScalarT]): """ class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): - @overload - @classmethod - def from_arrays( - cls, - offsets: Int64Array, - values: Array[Scalar[_DataTypeT]], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> LargeListArray[_DataTypeT]: ... - @overload + """ + Concrete class for Arrow arrays of a large list data type. + + Identical to ListArray, but 64-bit offsets. + """ @classmethod def from_arrays( cls, offsets: Int64Array, - values: Array, + values: Array[Scalar[_DataTypeT]] | Array, *, - type: _DataTypeT, + type: _DataTypeT | None = None, pool: MemoryPool | None = None, mask: Mask | None = None, - ) -> LargeListArray[_DataTypeT]: ... - @classmethod - def from_arrays(cls, *args, **kwargs): + ) -> LargeListArray[_DataTypeT] | LargeListArray[_DataTypeT]: """ Construct LargeListArray from arrays of int64 offsets and values. @@ -2803,30 +1798,19 @@ class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): """ class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array, - values: Array[Scalar[_DataTypeT]], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListViewArray[_DataTypeT]: ... - @overload + """ + Concrete class for Arrow arrays of a list view data type. + """ @classmethod def from_arrays( cls, offsets: Int32Array, - values: Array, + values: Array[Scalar[_DataTypeT]] | Array, *, - type: _DataTypeT, + type: _DataTypeT | None = None, pool: MemoryPool | None = None, mask: Mask | None = None, - ) -> ListViewArray[_DataTypeT]: ... - @classmethod - def from_arrays(cls, *args, **kwargs): + ) -> ListViewArray[_DataTypeT] | ListViewArray[_DataTypeT]: """ Construct ListViewArray from arrays of int32 offsets, sizes, and values. @@ -3009,30 +1993,21 @@ class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): """ class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): - @overload - @classmethod - def from_arrays( - cls, - offsets: Int64Array, - values: Array[Scalar[_DataTypeT]], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> LargeListViewArray[_DataTypeT]: ... - @overload + """ + Concrete class for Arrow arrays of a large list view data type. + + Identical to ListViewArray, but with 64-bit offsets. + """ @classmethod def from_arrays( cls, offsets: Int64Array, - values: Array, + values: Array[Scalar[_DataTypeT]] | Array, *, - type: _DataTypeT, + type: _DataTypeT | None = None, pool: MemoryPool | None = None, mask: Mask | None = None, - ) -> LargeListViewArray[_DataTypeT]: ... - @classmethod - def from_arrays(cls, *args, **kwargs): + ) -> LargeListViewArray[_DataTypeT]: """ Construct LargeListViewArray from arrays of int64 offsets and values. @@ -3222,27 +2197,18 @@ class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): """ class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _Size]]): - @overload - @classmethod - def from_arrays( - cls, - values: Array[Scalar[_DataTypeT]], - *, - type: types.FixedSizeListType[_DataTypeT, Literal[int]] | None = None, - mask: Mask | None = None, - ) -> FixedSizeListArray[_DataTypeT, None]: ... 
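# A minimal construction sketch for the from_arrays class methods typed above,
# assuming standard pyarrow semantics; the offsets and values are illustrative.
import pyarrow as pa

values = pa.array([1, 2, 3, 4, 5, 6], type=pa.int64())
lst = pa.ListArray.from_arrays([0, 2, 4, 6], values)       # list<int64>, 3 rows
big = pa.LargeListArray.from_arrays(pa.array([0, 3, 6], type=pa.int64()), values)
fixed = pa.FixedSizeListArray.from_arrays(values, 2)        # fixed_size_list<int64>[2]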
- @overload + """ + Concrete class for Arrow arrays of a fixed size list data type. + """ @classmethod def from_arrays( cls, values: Array[Scalar[_DataTypeT]], - limit_size: _Size, + limit_size: _Size | None = None, *, type: None = None, mask: Mask | None = None, - ) -> FixedSizeListArray[_DataTypeT, _Size]: ... - @classmethod - def from_arrays(cls, *args, **kwargs): + ) -> FixedSizeListArray[_DataTypeT, _Size | None]: """ Construct FixedSizeListArray from array of values and a list length. @@ -3304,7 +2270,7 @@ class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _S def values(self) -> BaseListArray[scalar.ListScalar[_DataTypeT]]: """ Return the underlying array of values which backs the - FixedSizeListArray. + FixedSizeListArray ignoring the array's offset. Note even null elements are included. @@ -3322,7 +2288,10 @@ class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _S Examples -------- >>> import pyarrow as pa - >>> array = pa.array([[1, 2], None, [3, None]], type=pa.list_(pa.int32(), 2)) + >>> array = pa.array( + ... [[1, 2], None, [3, None]], + ... type=pa.list_(pa.int32(), 2) + ... ) >>> array.values [ @@ -3333,38 +2302,27 @@ class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _S 3, null ] - """ _MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) _MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) -class MapArray(ListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): - @overload +class MapArray(BaseListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): + """ + Concrete class for Arrow arrays of a map data type. + """ @classmethod def from_arrays( cls, - offsets: Int64Array, - keys: Array[Scalar[_MapKeyT]], - items: Array[Scalar[_MapItemT]], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> MapArray[_MapKeyT, _MapItemT]: ... - @overload - @classmethod - def from_arrays( # pyright: ignore[reportIncompatibleMethodOverride] - cls, - offsets: Int64Array, - values: Array, + offsets: Int64Array | list[int] | None, + keys: Array[Scalar[_MapKeyT]] | None = None, + items: Array[Scalar[_MapItemT]] | None = None, + values: Array | None = None, *, - type: MapType[_MapKeyT, _MapItemT], + type: MapType[_MapKeyT, _MapItemT] | None = None, pool: MemoryPool | None = None, mask: Mask | None = None, - ) -> MapArray[_MapKeyT, _MapItemT]: ... - @classmethod - def from_arrays(cls, *args, **kwargs): # pyright: ignore[reportIncompatibleMethodOverride] + ) -> MapArray[_MapKeyT, _MapItemT]: """ Construct MapArray from arrays of int32 offsets and key, item arrays. @@ -3391,43 +2349,41 @@ class MapArray(ListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): represents the null bitmask corresponding to the missing values in the integer array. >>> import pyarrow as pa - >>> movies_rectangular = np.ma.masked_array( - ... [[10, -1, -1], [8, 4, 5], [-1, 10, 3], [-1, -1, -1], [-1, -1, -1]], - ... [ - ... [False, True, True], - ... [False, False, False], - ... [True, False, False], - ... [True, True, True], - ... [True, True, True], - ... ], - ... ) + >>> movies_rectangular = np.ma.masked_array([ + ... [10, -1, -1], + ... [8, 4, 5], + ... [-1, 10, 3], + ... [-1, -1, -1], + ... [-1, -1, -1] + ... ], + ... [ + ... [False, True, True], + ... [False, False, False], + ... [True, False, False], + ... [True, True, True], + ... [True, True, True], + ... ]) To represent the same data with the MapArray and from_arrays, the data is formed like this: >>> offsets = [ - ... 0, # -- row 1 start - ... 
1, # -- row 2 start - ... 4, # -- row 3 start - ... 6, # -- row 4 start - ... 6, # -- row 5 start - ... 6, # -- row 5 end + ... 0, # -- row 1 start + ... 1, # -- row 2 start + ... 4, # -- row 3 start + ... 6, # -- row 4 start + ... 6, # -- row 5 start + ... 6, # -- row 5 end ... ] >>> movies = [ - ... "Dark Knight", # ---------------------------------- row 1 - ... "Dark Knight", - ... "Meet the Parents", - ... "Superman", # -- row 2 - ... "Meet the Parents", - ... "Superman", # ----------------- row 3 + ... "Dark Knight", # ---------------------------------- row 1 + ... "Dark Knight", "Meet the Parents", "Superman", # -- row 2 + ... "Meet the Parents", "Superman", # ----------------- row 3 ... ] >>> likings = [ - ... 10, # -------- row 1 - ... 8, - ... 4, - ... 5, # --- row 2 - ... 10, - ... 3, # ------ row 3 + ... 10, # -------- row 1 + ... 8, 4, 5, # --- row 2 + ... 10, 3 # ------ row 3 ... ] >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() 0 [(Dark Knight, 10)] @@ -3443,12 +2399,12 @@ class MapArray(ListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): offset still has to refer to the existing value from keys (and values): >>> offsets = [ - ... 0, # ----- row 1 start - ... 1, # ----- row 2 start - ... 4, # ----- row 3 start - ... None, # -- row 4 start - ... None, # -- row 5 start - ... 6, # ----- row 5 end + ... 0, # ----- row 1 start + ... 1, # ----- row 2 start + ... 4, # ----- row 3 start + ... None, # -- row 4 start + ... None, # -- row 5 start + ... 6, # ----- row 5 end ... ] >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() 0 [(Dark Knight, 10)] @@ -3460,12 +2416,19 @@ class MapArray(ListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): """ @property def keys(self) -> Array: - """Flattened array of keys across all maps in array""" + """ + Flattened array of keys across all maps in array + """ @property def items(self) -> Array: - """Flattened array of items across all maps in array""" + """ + Flattened array of items across all maps in array + """ class UnionArray(Array[scalar.UnionScalar]): + """ + Concrete class for Arrow arrays of a Union data type. + """ @deprecated("Use fields() instead") def child(self, pos: int) -> Field: """ @@ -3502,7 +2465,9 @@ class UnionArray(Array[scalar.UnionScalar]): """ @property def type_codes(self) -> Int8Array: - """Get the type codes array.""" + """ + Get the type codes array. + """ @property def offsets(self) -> Int32Array: """ @@ -3558,6 +2523,9 @@ class UnionArray(Array[scalar.UnionScalar]): """ class StringArray(Array[scalar.StringScalar]): + """ + Concrete class for Arrow arrays of string (or utf8) data type. + """ @staticmethod def from_buffers( # type: ignore[override] length: int, @@ -3587,6 +2555,9 @@ class StringArray(Array[scalar.StringScalar]): """ class LargeStringArray(Array[scalar.LargeStringScalar]): + """ + Concrete class for Arrow arrays of large string (or utf8) data type. + """ @staticmethod def from_buffers( # type: ignore[override] length: int, @@ -3615,9 +2586,15 @@ class LargeStringArray(Array[scalar.LargeStringScalar]): string_array : StringArray """ -class StringViewArray(Array[scalar.StringViewScalar]): ... +class StringViewArray(Array[scalar.StringViewScalar]): + """ + Concrete class for Arrow arrays of string (or utf8) view data type. + """ class BinaryArray(Array[scalar.BinaryScalar]): + """ + Concrete class for Arrow arrays of variable-sized binary data type. 
+ """ @property def total_values_length(self) -> int: """ @@ -3626,6 +2603,9 @@ class BinaryArray(Array[scalar.BinaryScalar]): """ class LargeBinaryArray(Array[scalar.LargeBinaryScalar]): + """ + Concrete class for Arrow arrays of large variable-sized binary data type. + """ @property def total_values_length(self) -> int: """ @@ -3633,9 +2613,15 @@ class LargeBinaryArray(Array[scalar.LargeBinaryScalar]): by the offsets of this LargeBinaryArray. """ -class BinaryViewArray(Array[scalar.BinaryViewScalar]): ... +class BinaryViewArray(Array[scalar.BinaryViewScalar]): + """ + Concrete class for Arrow arrays of variable-sized binary view data type. + """ class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): + """ + Concrete class for dictionary-encoded Arrow arrays. + """ def dictionary_encode(self) -> Self: ... # type: ignore[override] def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: """ @@ -3680,7 +2666,7 @@ class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): @staticmethod def from_arrays( indices: Indices, - dictionary: Array | np.ndarray | pd.Series | list[Any], + dictionary: Array | np.ndarray | pd.Series, mask: np.ndarray | pd.Series | BooleanArray | None = None, ordered: bool = False, from_pandas: bool = False, @@ -3715,6 +2701,9 @@ class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): """ class StructArray(Array[scalar.StructScalar]): + """ + Concrete class for Arrow arrays of a struct data type. + """ def field(self, index: int | str) -> Array: """ Retrieves the child array belonging to field. @@ -3743,8 +2732,8 @@ class StructArray(Array[scalar.StructScalar]): """ @staticmethod def from_arrays( - arrays: Iterable[Array] | list[list[Any]], - names: list[str] | list[LiteralString] | None = None, + arrays: Iterable[Array], + names: list[str] | None = None, fields: list[Field] | None = None, mask=None, memory_pool: MemoryPool | None = None, @@ -3796,29 +2785,15 @@ class StructArray(Array[scalar.StructScalar]): """ class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT]]): - @overload - @staticmethod - def from_arrays( - run_ends: Int16Array, - values: Array, - type: DataType | None = None, - ) -> RunEndEncodedArray[types.Int16Type, _BasicValueT]: ... - @overload - @staticmethod - def from_arrays( - run_ends: Int32Array, - values: Array, - type: DataType | None = None, - ) -> RunEndEncodedArray[types.Int32Type, _BasicValueT]: ... - @overload + """ + Concrete class for Arrow run-end encoded arrays. + """ @staticmethod def from_arrays( - run_ends: Int64Array, + run_ends: Int16Array | Int32Array | Int64Array, values: Array, type: DataType | None = None, - ) -> RunEndEncodedArray[types.Int64Type, _BasicValueT]: ... - @staticmethod - def from_arrays(*args, **kwargs): + ) -> RunEndEncodedArray[types.Int16Type | types.Int32Type | types.Int64Type, _BasicValueT]: # type: ignore[type-var] """ Construct RunEndEncodedArray from run_ends and values arrays. 
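# A usage sketch (assuming standard pyarrow behavior) for the from_arrays
# constructors whose signatures are collapsed above; the sample run ends,
# indices, and values are illustrative.
import pyarrow as pa

ree = pa.RunEndEncodedArray.from_arrays(pa.array([2, 5], type=pa.int32()),
                                        pa.array(["a", "b"]))
struct = pa.StructArray.from_arrays([pa.array([1, 2]), pa.array(["x", "y"])],
                                    names=["id", "label"])
dictionary = pa.DictionaryArray.from_arrays(pa.array([0, 1, 0], type=pa.int8()),
                                            pa.array(["low", "high"]))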
@@ -3836,7 +2811,7 @@ class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicVal RunEndEncodedArray """ @staticmethod - def from_buffers( # pyright: ignore[reportIncompatibleMethodOverride] + def from_buffers( # type: ignore[override] type: DataType, length: int, buffers: list[Buffer], @@ -3910,6 +2885,9 @@ class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicVal _ArrayT = TypeVar("_ArrayT", bound=Array) class ExtensionArray(Array[scalar.ExtensionScalar], Generic[_ArrayT]): + """ + Concrete class for Arrow extension arrays. + """ @property def storage(self) -> Any: ... @staticmethod @@ -3954,8 +2932,35 @@ class JsonArray(ExtensionArray[_ArrayT]): "{ "id":30, "values":["a", "b"] }" ] """ + """ + Concrete class for Arrow arrays of JSON data type. + + This does not guarantee that the JSON data actually + is valid JSON. + + Examples + -------- + Define the extension type for JSON array + + >>> import pyarrow as pa + >>> json_type = pa.json_(pa.large_utf8()) + + Create an extension array + + >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] + >>> storage = pa.array(arr, pa.large_utf8()) + >>> pa.ExtensionArray.from_storage(json_type, storage) + + [ + null, + "{ "id":30, "values":["a", "b"] }" + ] + """ -class UuidArray(ExtensionArray[_ArrayT]): ... +class UuidArray(ExtensionArray[_ArrayT]): + """ + Concrete class for Arrow arrays of UUID data type. + """ class FixedShapeTensorArray(ExtensionArray[_ArrayT]): """ @@ -4042,12 +3047,16 @@ class FixedShapeTensorArray(ExtensionArray[_ArrayT]): Parameters ---------- obj : numpy.ndarray + dim_names : tuple or list of strings, default None + Explicit names to tensor dimensions. Examples -------- >>> import pyarrow as pa >>> import numpy as np - >>> arr = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32) + >>> arr = np.array( + ... [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], + ... dtype=np.float32) >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr) [ @@ -4221,7 +3230,6 @@ def concat_arrays(arrays: Iterable[_ArrayT], memory_pool: MemoryPool | None = No 2, 4 ] - """ def _empty_array(type: _DataTypeT) -> Array[scalar.Scalar[_DataTypeT]]: @@ -4236,7 +3244,6 @@ __all__ = [ "repeat", "infer_type", "_PandasConvertible", - "_CastAs", "Array", "NullArray", "BooleanArray", diff --git a/python/pyarrow/__lib_pxi/io.pyi b/python/pyarrow/__lib_pxi/io.pyi index dca26a52940..ebcfa8c470b 100644 --- a/python/pyarrow/__lib_pxi/io.pyi +++ b/python/pyarrow/__lib_pxi/io.pyi @@ -31,12 +31,12 @@ if sys.version_info >= (3, 10): else: from typing_extensions import TypeAlias -from typing import Any, Literal, SupportsIndex, overload +from typing import Any, Literal, SupportsIndex from pyarrow._stubs_typing import Compression, SupportPyBuffer from pyarrow.lib import MemoryPool, _Weakrefable -from .device import Device, DeviceAllocationType, MemoryManager +# from .device import Device, DeviceAllocationType, MemoryManager from .types import KeyValueMetadata def have_libhdfs() -> bool: @@ -113,7 +113,10 @@ class NativeFile(_Weakrefable): def readable(self) -> bool: ... def seekable(self) -> bool: ... def isatty(self) -> bool: ... - def fileno(self) -> int: ... + def fileno(self) -> int: + """ + NOT IMPLEMENTED + """ @property def closed(self) -> bool: ... def close(self) -> None: ... @@ -216,7 +219,8 @@ class NativeFile(_Weakrefable): data : bytes """ def read1(self) -> bytes: - """Read and return up to n bytes. + """ + Read and return up to n bytes. 
Unlike read(), if *nbytes* is None then a chunk is read, not the entire file. @@ -259,14 +263,18 @@ class NativeFile(_Weakrefable): maximum number of bytes read """ def readlines(self, hint: int | None = None) -> list[bytes]: - """Read lines of the file + """ + NOT IMPLEMENTED. Read lines of the file Parameters ---------- hint : int maximum number of bytes read until we stop """ - def __iter__(self) -> Self: ... + def __iter__(self) -> Self: + """ + Implement iter(self). + """ def __next__(self) -> bytes: ... def read_buffer(self, nbytes: int | None = None) -> Buffer: """ @@ -277,7 +285,10 @@ class NativeFile(_Weakrefable): nbytes : int, optional maximum number of bytes read """ - def truncate(self) -> None: ... + def truncate(self) -> None: + """ + NOT IMPLEMENTED + """ def writelines(self, lines: list[bytes]): """ Write lines to the file. @@ -337,10 +348,10 @@ class PythonFile(NativeFile): Create a stream for writing: >>> buf = io.BytesIO() - >>> f = pa.PythonFile(buf, mode="w") + >>> f = pa.PythonFile(buf, mode = 'w') >>> f.writable() True - >>> f.write(b"PythonFile") + >>> f.write(b'PythonFile') 10 >>> buf.getvalue() b'PythonFile' @@ -350,8 +361,8 @@ class PythonFile(NativeFile): Create a stream for reading: - >>> buf = io.BytesIO(b"PythonFile") - >>> f = pa.PythonFile(buf, mode="r") + >>> buf = io.BytesIO(b'PythonFile') + >>> f = pa.PythonFile(buf, mode = 'r') >>> f.mode 'rb' >>> f.read() @@ -381,15 +392,16 @@ class MemoryMappedFile(NativeFile): Create a new file with memory map: >>> import pyarrow as pa - >>> mmap = pa.create_memory_map("example_mmap.dat", 10) + >>> mmap = pa.create_memory_map('example_mmap.dat', 10) >>> mmap >>> mmap.close() Open an existing file with memory map: - >>> with pa.memory_map("example_mmap.dat") as mmap: + >>> with pa.memory_map('example_mmap.dat') as mmap: ... mmap + ... """ @classmethod @@ -436,11 +448,13 @@ def memory_map( Reading from a memory map without any memory allocation or copying: >>> import pyarrow as pa - >>> with pa.output_stream("example_mmap.txt") as stream: - ... stream.write(b"Constructing a buffer referencing the mapped memory") + >>> with pa.output_stream('example_mmap.txt') as stream: + ... stream.write(b'Constructing a buffer referencing the mapped memory') + ... 51 - >>> with pa.memory_map("example_mmap.txt") as mmap: - ... mmap.read_at(6, 45) + >>> with pa.memory_map('example_mmap.txt') as mmap: + ... mmap.read_at(6,45) + ... b'memory' """ @@ -455,36 +469,40 @@ class OSFile(NativeFile): Create a new file to write to: >>> import pyarrow as pa - >>> with pa.OSFile("example_osfile.arrow", mode="w") as f: + >>> with pa.OSFile('example_osfile.arrow', mode='w') as f: ... f.writable() - ... f.write(b"OSFile") + ... f.write(b'OSFile') ... f.seekable() + ... True 6 False Open the file to read: - >>> with pa.OSFile("example_osfile.arrow", mode="r") as f: + >>> with pa.OSFile('example_osfile.arrow', mode='r') as f: ... f.mode ... f.read() + ... 'rb' b'OSFile' Open the file to append: - >>> with pa.OSFile("example_osfile.arrow", mode="ab") as f: + >>> with pa.OSFile('example_osfile.arrow', mode='ab') as f: ... f.mode - ... f.write(b" is super!") + ... f.write(b' is super!') + ... 'ab' 10 - >>> with pa.OSFile("example_osfile.arrow") as f: + >>> with pa.OSFile('example_osfile.arrow') as f: ... f.read() + ... b'OSFile is super!' 
Inspect created OSFile: - >>> pa.OSFile("example_osfile.arrow") + >>> pa.OSFile('example_osfile.arrow') """ def __init__( @@ -505,8 +523,9 @@ class FixedSizeBufferWriter(NativeFile): >>> import pyarrow as pa >>> buf = pa.allocate_buffer(5) >>> with pa.output_stream(buf) as stream: - ... stream.write(b"abcde") + ... stream.write(b'abcde') ... stream + ... 5 @@ -518,9 +537,24 @@ class FixedSizeBufferWriter(NativeFile): """ def __init__(self, buffer: Buffer) -> None: ... - def set_memcopy_threads(self, num_threads: int) -> None: ... - def set_memcopy_blocksize(self, blocksize: int) -> None: ... - def set_memcopy_threshold(self, threshold: int) -> None: ... + def set_memcopy_threads(self, num_threads: int) -> None: + """ + Parameters + ---------- + num_threads : int + """ + def set_memcopy_blocksize(self, blocksize: int) -> None: + """ + Parameters + ---------- + blocksize : int64 + """ + def set_memcopy_threshold(self, threshold: int) -> None: + """ + Parameters + ---------- + threshold : int64 + """ # ---------------------------------------------------------------------- # Arrow buffers @@ -532,7 +566,10 @@ class Buffer(_Weakrefable): A buffer represents a contiguous memory area. Many buffers will own their memory, though not all of them do. """ - def __len__(self) -> int: ... + def __len__(self) -> int: + """ + Return len(self). + """ def _assert_cpu(self) -> None: ... @property def size(self) -> int: @@ -565,39 +602,40 @@ class Buffer(_Weakrefable): """ Whether the buffer is CPU-accessible. """ + # TODO + # @property + # def device(self) -> Device: + # """ + # The device where the buffer resides. + # + # Returns + # ------- + # Device + # """ + # @property + # def memory_manager(self) -> MemoryManager: + # """ + # The memory manager associated with the buffer. + # + # Returns + # ------- + # MemoryManager + # """ + # @property + # def device_type(self) -> DeviceAllocationType: + # """ + # The device type where the buffer resides. + # + # Returns + # ------- + # DeviceAllocationType + # """ @property - def device(self) -> Device: - """ - The device where the buffer resides. - - Returns - ------- - Device - """ - @property - def memory_manager(self) -> MemoryManager: - """ - The memory manager associated with the buffer. - - Returns - ------- - MemoryManager - """ - @property - def device_type(self) -> DeviceAllocationType: + def parent(self) -> Buffer | None: ... + def __getitem__(self, key: slice | int) -> Self | int: """ - The device type where the buffer resides. - - Returns - ------- - DeviceAllocationType + Return self[key]. """ - @property - def parent(self) -> Buffer | None: ... - @overload - def __getitem__(self, key: slice) -> Self: ... - @overload - def __getitem__(self, key: int) -> int: ... def slice(self, offset: int = 0, length: int | None = None) -> Self: """ Slice this buffer. Memory is not copied. @@ -635,7 +673,6 @@ class Buffer(_Weakrefable): """ Return this buffer as a Python bytes object. Memory is copied. """ - def __buffer__(self, flags: int, /) -> memoryview: ... class ResizableBuffer(Buffer): """ @@ -656,17 +693,9 @@ class ResizableBuffer(Buffer): If this is false, the buffer is never shrunk. """ -@overload -def allocate_buffer(size: int, memory_pool: MemoryPool | None = None) -> Buffer: ... -@overload -def allocate_buffer( - size: int, memory_pool: MemoryPool | None, resizable: Literal[False] -) -> Buffer: ... -@overload def allocate_buffer( - size: int, memory_pool: MemoryPool | None, resizable: Literal[True] -) -> ResizableBuffer: ... 
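# A sketch of allocate_buffer under the merged signature above (assuming
# standard pyarrow behavior): with the overloads folded together, a checker
# now sees Buffer | ResizableBuffer for both calls; the sizes are illustrative.
import pyarrow as pa

buf = pa.allocate_buffer(64)                    # fixed-size, mutable Buffer
rbuf = pa.allocate_buffer(64, resizable=True)   # ResizableBuffer at runtime
rbuf.resize(128)                                # only valid on ResizableBuffer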
-def allocate_buffer(*args, **kwargs): + size: int, memory_pool: MemoryPool | None = None, resizable: Literal[False] | Literal[True] | None = None +) -> Buffer | ResizableBuffer: """ Allocate a mutable buffer. @@ -700,7 +729,7 @@ class BufferOutputStream(NativeFile): >>> import pyarrow as pa >>> f = pa.BufferOutputStream() - >>> f.write(b"pyarrow.Buffer") + >>> f.write(b'pyarrow.Buffer') 14 >>> f.closed False @@ -734,19 +763,23 @@ class BufferReader(NativeFile): Create an Arrow input stream and inspect it: >>> import pyarrow as pa - >>> data = b"reader data" + >>> data = b'reader data' >>> buf = memoryview(data) >>> with pa.input_stream(buf) as stream: ... stream.size() ... stream.read(6) ... stream.seek(7) ... stream.read(15) + ... 11 b'reader' 7 b'data' """ - def __init__(self, obj) -> None: ... + def __init__(self, obj) -> None: + """ + Initialize self. See help(type(self)) for accurate signature. + """ class CompressedInputStream(NativeFile): """ @@ -768,6 +801,7 @@ class CompressedInputStream(NativeFile): >>> raw = pa.BufferOutputStream() >>> with pa.CompressedOutputStream(raw, "gzip") as compressed: ... compressed.write(data) + ... 17 Create an input stream with decompression referencing the @@ -776,6 +810,7 @@ class CompressedInputStream(NativeFile): >>> cdata = raw.getvalue() >>> with pa.input_stream(cdata, compression="gzip") as compressed: ... compressed.read() + ... b'Compressed stream' which actually translates to the use of ``BufferReader``and @@ -784,6 +819,7 @@ class CompressedInputStream(NativeFile): >>> raw = pa.BufferReader(cdata) >>> with pa.CompressedInputStream(raw, "gzip") as compressed: ... compressed.read() + ... b'Compressed stream' """ @@ -791,7 +827,10 @@ class CompressedInputStream(NativeFile): self, stream: StrPath | NativeFile | IOBase, compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], - ) -> None: ... + ) -> None: + """ + Initialize self. See help(type(self)) for accurate signature. + """ class CompressedOutputStream(NativeFile): """ @@ -813,13 +852,17 @@ class CompressedOutputStream(NativeFile): >>> raw = pa.BufferOutputStream() >>> with pa.CompressedOutputStream(raw, "gzip") as compressed: ... compressed.write(data) + ... 17 """ def __init__( self, stream: StrPath | NativeFile | IOBase, compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], - ) -> None: ... + ) -> None: + """ + Initialize self. See help(type(self)) for accurate signature. + """ class BufferedInputStream(NativeFile): """ @@ -838,7 +881,10 @@ class BufferedInputStream(NativeFile): """ def __init__( self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None - ) -> None: ... + ) -> None: + """ + Initialize self. See help(type(self)) for accurate signature. + """ def detach(self) -> NativeFile: """ Release the raw InputStream. @@ -867,7 +913,10 @@ class BufferedOutputStream(NativeFile): """ def __init__( self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None - ) -> None: ... + ) -> None: + """ + Initialize self. See help(type(self)) for accurate signature. + """ def detach(self) -> NativeFile: """ Flush any buffered writes and release the raw OutputStream. @@ -890,7 +939,10 @@ class TransformInputStream(NativeFile): transform_func : callable The transformation to apply. """ - def __init__(self, stream: NativeFile, transform_func: Callable[[Buffer], Any]) -> None: ... + def __init__(self, stream: NativeFile, transform_func: Callable[[Buffer], Any]) -> None: + """ + Initialize self. See help(type(self)) for accurate signature. 
+ """ class Transcoder: def __init__(self, decoder, encoder) -> None: ... @@ -986,7 +1038,10 @@ class CacheOptions(_Weakrefable): range_size_limit: int | None = None, lazy: bool = True, prefetch_limit: int = 0, - ) -> None: ... + ) -> None: + """ + Initialize self. See help(type(self)) for accurate signature. + """ @classmethod def from_network_metrics( cls, @@ -1073,15 +1128,18 @@ class Codec(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> pa.Codec.is_available("gzip") + >>> pa.Codec.is_available('gzip') True - >>> codec = pa.Codec("gzip") + >>> codec = pa.Codec('gzip') >>> codec.name 'gzip' >>> codec.compression_level 9 """ - def __init__(self, compression: Compression, compression_level: int | None = None) -> None: ... + def __init__(self, compression: Compression, compression_level: int | None = None) -> None: + """ + Initialize self. See help(type(self)) for accurate signature. + """ @classmethod def detect(cls, path: StrPath) -> Self: """ @@ -1166,34 +1224,21 @@ class Codec(_Weakrefable): """ @property def name(self) -> Compression: - """Returns the name of the codec""" + """ + Returns the name of the codec + """ @property def compression_level(self) -> int: - """Returns the compression level parameter of the codec""" - @overload - def compress( - self, - buf: Buffer | bytes | SupportPyBuffer, - *, - memory_pool: MemoryPool | None = None, - ) -> Buffer: ... - @overload - def compress( - self, - buf: Buffer | bytes | SupportPyBuffer, - *, - asbytes: Literal[False], - memory_pool: MemoryPool | None = None, - ) -> Buffer: ... - @overload + """ + Returns the compression level parameter of the codec + """ def compress( self, buf: Buffer | bytes | SupportPyBuffer, *, - asbytes: Literal[True], + asbytes: Literal[False] | Literal[True] | None = None, memory_pool: MemoryPool | None = None, - ) -> bytes: ... - def compress(self, *args, **kwargs): + ) -> Buffer | bytes: """ Compress data from buffer-like object. @@ -1209,33 +1254,14 @@ class Codec(_Weakrefable): ------- compressed : pyarrow.Buffer or bytes (if asbytes=True) """ - @overload def decompress( self, buf: Buffer | bytes | SupportPyBuffer, decompressed_size: int | None = None, *, + asbytes: Literal[False] | Literal[True] | None = None, memory_pool: MemoryPool | None = None, - ) -> Buffer: ... - @overload - def decompress( - self, - buf: Buffer | bytes | SupportPyBuffer, - decompressed_size: int | None = None, - *, - asbytes: Literal[False], - memory_pool: MemoryPool | None = None, - ) -> Buffer: ... - @overload - def decompress( - self, - buf: Buffer | bytes | SupportPyBuffer, - decompressed_size: int | None = None, - *, - asbytes: Literal[True], - memory_pool: MemoryPool | None = None, - ) -> bytes: ... - def decompress(self, *args, **kwargs): + ) -> Buffer | bytes: """ Decompress data from buffer-like object. @@ -1254,30 +1280,13 @@ class Codec(_Weakrefable): uncompressed : pyarrow.Buffer or bytes (if asbytes=True) """ -@overload -def compress( - buf: Buffer | bytes | SupportPyBuffer, - codec: Compression = "lz4", - *, - memory_pool: MemoryPool | None = None, -) -> Buffer: ... -@overload def compress( buf: Buffer | bytes | SupportPyBuffer, codec: Compression = "lz4", *, - asbytes: Literal[False], + asbytes: Literal[False] | Literal[True] | None = None, memory_pool: MemoryPool | None = None, -) -> Buffer: ... -@overload -def compress( - buf: Buffer | bytes | SupportPyBuffer, - codec: Compression = "lz4", - *, - asbytes: Literal[True], - memory_pool: MemoryPool | None = None, -) -> bytes: ... 
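# A round-trip sketch for the Codec compress/decompress methods typed above,
# assuming standard pyarrow codecs; with asbytes collapsed into one parameter
# the result is typed Buffer | bytes. The payload is illustrative.
import pyarrow as pa

data = b"some bytes worth compressing, repeated " * 8
codec = pa.Codec("gzip")
packed = codec.compress(data, asbytes=True)
restored = codec.decompress(packed, decompressed_size=len(data), asbytes=True)
assert restored == data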
-def compress(*args, **kwargs): +) -> Buffer | bytes: """ Compress data from buffer-like object. @@ -1297,33 +1306,14 @@ def compress(*args, **kwargs): compressed : pyarrow.Buffer or bytes (if asbytes=True) """ -@overload -def decompress( - buf: Buffer | bytes | SupportPyBuffer, - decompressed_size: int | None = None, - codec: Compression = "lz4", - *, - memory_pool: MemoryPool | None = None, -) -> Buffer: ... -@overload -def decompress( - buf: Buffer | bytes | SupportPyBuffer, - decompressed_size: int | None = None, - codec: Compression = "lz4", - *, - asbytes: Literal[False], - memory_pool: MemoryPool | None = None, -) -> Buffer: ... -@overload def decompress( buf: Buffer | bytes | SupportPyBuffer, decompressed_size: int | None = None, codec: Compression = "lz4", *, - asbytes: Literal[True], + asbytes: Literal[False] | Literal[True] | None = None, memory_pool: MemoryPool | None = None, -) -> bytes: ... -def decompress(*args, **kwargs): +) -> Buffer | bytes: """ Decompress data from buffer-like object. @@ -1376,25 +1366,30 @@ def input_stream( >>> buf = memoryview(b"some data") >>> with pa.input_stream(buf) as stream: ... stream.read(4) + ... b'some' Create a readable OSFile (NativeFile) from a string or file path: >>> import gzip - >>> with gzip.open("example.gz", "wb") as f: - ... f.write(b"some data") + >>> with gzip.open('example.gz', 'wb') as f: + ... f.write(b'some data') + ... 9 - >>> with pa.input_stream("example.gz") as stream: + >>> with pa.input_stream('example.gz') as stream: ... stream.read() + ... b'some data' Create a readable PythonFile (NativeFile) from a a Python file object: - >>> with open("example.txt", mode="w") as f: - ... f.write("some text") + >>> with open('example.txt', mode='w') as f: + ... f.write('some text') + ... 9 - >>> with pa.input_stream("example.txt") as stream: + >>> with pa.input_stream('example.txt') as stream: ... stream.read(6) + ... b'some t' """ @@ -1430,9 +1425,11 @@ def output_stream( >>> buf = pa.py_buffer(empty_obj) >>> with pa.output_stream(buf) as stream: ... stream.write(data) + ... 11 >>> with pa.input_stream(buf) as stream: ... stream.read(6) + ... b'buffer' or from a memoryview object: @@ -1440,18 +1437,22 @@ def output_stream( >>> buf = memoryview(empty_obj) >>> with pa.output_stream(buf) as stream: ... stream.write(data) + ... 11 >>> with pa.input_stream(buf) as stream: ... stream.read() + ... b'buffer data' Create a writable NativeFile from a string or file path: - >>> with pa.output_stream("example_second.txt") as stream: - ... stream.write(b"Write some data") + >>> with pa.output_stream('example_second.txt') as stream: + ... stream.write(b'Write some data') + ... 15 - >>> with pa.input_stream("example_second.txt") as stream: + >>> with pa.input_stream('example_second.txt') as stream: ... stream.read() + ... b'Write some data' """ diff --git a/python/pyarrow/__lib_pxi/memory.pyi b/python/pyarrow/__lib_pxi/memory.pyi index e969e3738b8..4fc723a1950 100644 --- a/python/pyarrow/__lib_pxi/memory.pyi +++ b/python/pyarrow/__lib_pxi/memory.pyi @@ -73,7 +73,12 @@ class MemoryPool(_Weakrefable): """ class LoggingMemoryPool(MemoryPool): ... -class ProxyMemoryPool(MemoryPool): ... +class ProxyMemoryPool(MemoryPool): + """ + Memory pool implementation that tracks the number of bytes and + maximum memory allocated through its direct calls, while redirecting + to another memory pool. 
+ """ def default_memory_pool() -> MemoryPool: """ diff --git a/python/pyarrow/__lib_pxi/scalar.pyi b/python/pyarrow/__lib_pxi/scalar.pyi index c6819f7e863..b979ec43a3a 100644 --- a/python/pyarrow/__lib_pxi/scalar.pyi +++ b/python/pyarrow/__lib_pxi/scalar.pyi @@ -19,8 +19,6 @@ import collections.abc import datetime as dt import sys -from decimal import Decimal - if sys.version_info >= (3, 11): from typing import Self else: @@ -29,17 +27,17 @@ if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias -from typing import Any, Generic, Iterator, Literal, Mapping, overload +from typing import Any, Generic, Iterator, Literal import numpy as np -from pyarrow._compute import CastOptions +from pyarrow._compute import CastOptions # type: ignore[import-not-found] from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable -from typing_extensions import Protocol, TypeVar +from typing_extensions import TypeVar from . import types from .types import ( - _AsPyType, + # _AsPyType, _DataTypeT, _Time32Unit, _Time64Unit, @@ -65,23 +63,13 @@ class Scalar(_Weakrefable, Generic[_DataType_co]): """ Holds a valid (non-null) value. """ - @overload - def cast( - self, - target_type: None, - safe: bool = True, - options: CastOptions | None = None, - memory_pool: MemoryPool | None = None, - ) -> Self: ... - @overload def cast( self, - target_type: _DataTypeT, + target_type: None | _DataTypeT, safe: bool = True, options: CastOptions | None = None, memory_pool: MemoryPool | None = None, - ) -> Scalar[_DataTypeT]: ... - def cast(self, *args, **kwargs): + ) -> Self | Scalar[_DataTypeT]: """ Cast scalar value to another data type. @@ -118,77 +106,21 @@ class Scalar(_Weakrefable, Generic[_DataType_co]): ------ ArrowInvalid """ - def equals(self, other: Scalar) -> bool: ... - def __hash__(self) -> int: ... - @overload - def as_py( - self: Scalar[types._BasicDataType[_AsPyType]], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> _AsPyType: ... - @overload - def as_py( - self: Scalar[types.ListType[types._BasicDataType[_AsPyType]]], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[_AsPyType]: ... - @overload - def as_py( - self: Scalar[ - types.ListType[ - types.DictionaryType[types._IndexT, types._BasicDataType[_AsPyTypeV], Any] - ] - ], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[dict[int, _AsPyTypeV]]: ... - @overload - def as_py( - self: Scalar[ - types.ListType[types.DictionaryType[Any, types._BasicDataType[_AsPyTypeV], Any]], - ], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[dict[Any, _AsPyTypeV]]: ... - @overload - def as_py( - self: Scalar[types.ListType[types.DictionaryType[types._IndexT, Any, Any]],], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[dict[int, Any]]: ... - @overload - def as_py( - self: Scalar[types.StructType], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[dict[str, Any]]: ... - @overload - def as_py( - self: Scalar[ - types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]] - ], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[tuple[_AsPyTypeK, _AsPyTypeV]]: ... - @overload - def as_py( - self: Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]]], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[tuple[Any, _AsPyTypeV]]: ... 
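# A small sketch (assuming standard pyarrow behavior) of the Scalar methods
# retyped above; with the overloads collapsed, cast() is seen as
# Self | Scalar[...] and as_py() as Any. The values are illustrative.
import pyarrow as pa

s = pa.scalar(42)                 # Int64Scalar
s.cast(pa.float64()).as_py()      # 42.0
s.equals(pa.scalar(42))           # True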
- @overload - def as_py( - self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any]], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[tuple[_AsPyTypeK, Any]]: ... - @overload - def as_py( - self: Scalar[Any], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> Any: ... - def as_py(self, *args, **kwargs): + def equals(self, other: Scalar) -> bool: + """ + Parameters + ---------- + other : pyarrow.Scalar + + Returns + ------- + bool + """ + def __hash__(self) -> int: + """ + Return hash(self). + """ + def as_py(self: Scalar[Any], *, maps_as_pydicts: Literal["lossy", "strict"] | None = None) -> Any: """ Return this value as a Python representation. @@ -210,153 +142,413 @@ class Scalar(_Weakrefable, Generic[_DataType_co]): _NULL: TypeAlias = None NA = _NULL -class NullScalar(Scalar[types.NullType]): ... -class BooleanScalar(Scalar[types.BoolType]): ... -class UInt8Scalar(Scalar[types.UInt8Type]): ... -class Int8Scalar(Scalar[types.Int8Type]): ... -class UInt16Scalar(Scalar[types.UInt16Type]): ... -class Int16Scalar(Scalar[types.Int16Type]): ... -class UInt32Scalar(Scalar[types.Uint32Type]): ... -class Int32Scalar(Scalar[types.Int32Type]): ... -class UInt64Scalar(Scalar[types.UInt64Type]): ... -class Int64Scalar(Scalar[types.Int64Type]): ... -class HalfFloatScalar(Scalar[types.Float16Type]): ... -class FloatScalar(Scalar[types.Float32Type]): ... -class DoubleScalar(Scalar[types.Float64Type]): ... -class Decimal32Scalar(Scalar[types.Decimal32Type[types._Precision, types._Scale]]): ... -class Decimal64Scalar(Scalar[types.Decimal64Type[types._Precision, types._Scale]]): ... -class Decimal128Scalar(Scalar[types.Decimal128Type[types._Precision, types._Scale]]): ... -class Decimal256Scalar(Scalar[types.Decimal256Type[types._Precision, types._Scale]]): ... -class Date32Scalar(Scalar[types.Date32Type]): ... +class NullScalar(Scalar[types.NullType]): + """ + Concrete class for null scalars. + """ +class BooleanScalar(Scalar[types.BoolType]): + """ + Concrete class for boolean scalars. + """ +class UInt8Scalar(Scalar[types.UInt8Type]): + """ + Concrete class for uint8 scalars. + """ +class Int8Scalar(Scalar[types.Int8Type]): + """ + Concrete class for int8 scalars. + """ +class UInt16Scalar(Scalar[types.UInt16Type]): + """ + Concrete class for uint16 scalars. + """ +class Int16Scalar(Scalar[types.Int16Type]): + """ + Concrete class for int16 scalars. + """ +class UInt32Scalar(Scalar[types.Uint32Type]): + """ + Concrete class for uint32 scalars. + """ +class Int32Scalar(Scalar[types.Int32Type]): + """ + Concrete class for int32 scalars. + """ +class UInt64Scalar(Scalar[types.UInt64Type]): + """ + Concrete class for uint64 scalars. + """ +class Int64Scalar(Scalar[types.Int64Type]): + """ + Concrete class for int64 scalars. + """ +class HalfFloatScalar(Scalar[types.Float16Type]): + """ + Concrete class for float scalars. + """ +class FloatScalar(Scalar[types.Float32Type]): + """ + Concrete class for float scalars. + """ +class DoubleScalar(Scalar[types.Float64Type]): + """ + Concrete class for double scalars. + """ +class Decimal32Scalar(Scalar[types.Decimal32Type[types._Precision, types._Scale]]): + """ + Concrete class for decimal32 scalars. + """ +class Decimal64Scalar(Scalar[types.Decimal64Type[types._Precision, types._Scale]]): + """ + Concrete class for decimal64 scalars. + """ +class Decimal128Scalar(Scalar[types.Decimal128Type[types._Precision, types._Scale]]): + """ + Concrete class for decimal128 scalars. 
+ """ +class Decimal256Scalar(Scalar[types.Decimal256Type[types._Precision, types._Scale]]): + """ + Concrete class for decimal256 scalars. + """ +class Date32Scalar(Scalar[types.Date32Type]): + """ + Concrete class for date32 scalars. + """ class Date64Scalar(Scalar[types.Date64Type]): + """ + Concrete class for date64 scalars. + """ @property def value(self) -> dt.date | None: ... class Time32Scalar(Scalar[types.Time32Type[_Time32Unit]]): + """ + Concrete class for time32 scalars. + """ @property def value(self) -> dt.time | None: ... class Time64Scalar(Scalar[types.Time64Type[_Time64Unit]]): + """ + Concrete class for time64 scalars. + """ @property def value(self) -> dt.time | None: ... class TimestampScalar(Scalar[types.TimestampType[_Unit, _Tz]]): + """ + Concrete class for timestamp scalars. + """ @property def value(self) -> int | None: ... class DurationScalar(Scalar[types.DurationType[_Unit]]): + """ + Concrete class for duration scalars. + """ @property def value(self) -> dt.timedelta | None: ... class MonthDayNanoIntervalScalar(Scalar[types.MonthDayNanoIntervalType]): + """ + Concrete class for month, day, nanosecond interval scalars. + """ @property - def value(self) -> MonthDayNano | None: ... + def value(self) -> MonthDayNano | None: + """ + Same as self.as_py() + """ class BinaryScalar(Scalar[types.BinaryType]): - def as_buffer(self) -> Buffer: ... + """ + Concrete class for binary-like scalars. + """ + def as_buffer(self) -> Buffer: + """ + Return a view over this value as a Buffer object. + """ class LargeBinaryScalar(Scalar[types.LargeBinaryType]): - def as_buffer(self) -> Buffer: ... + """ + """ + def as_buffer(self) -> Buffer: + """ + BinaryScalar.as_buffer(self) + + Return a view over this value as a Buffer object. + """ class FixedSizeBinaryScalar(Scalar[types.FixedSizeBinaryType]): - def as_buffer(self) -> Buffer: ... + """ + """ + def as_buffer(self) -> Buffer: + """ + BinaryScalar.as_buffer(self) + + Return a view over this value as a Buffer object. + """ class StringScalar(Scalar[types.StringType]): - def as_buffer(self) -> Buffer: ... + """ + Concrete class for string-like (utf8) scalars. + """ + def as_buffer(self) -> Buffer: + """ + BinaryScalar.as_buffer(self) + + Return a view over this value as a Buffer object. + """ class LargeStringScalar(Scalar[types.LargeStringType]): - def as_buffer(self) -> Buffer: ... + """ + """ + def as_buffer(self) -> Buffer: + """ + BinaryScalar.as_buffer(self) + + Return a view over this value as a Buffer object. + """ class BinaryViewScalar(Scalar[types.BinaryViewType]): - def as_buffer(self) -> Buffer: ... + """ + """ + def as_buffer(self) -> Buffer: + """ + BinaryScalar.as_buffer(self) + + Return a view over this value as a Buffer object. + """ class StringViewScalar(Scalar[types.StringViewType]): - def as_buffer(self) -> Buffer: ... + """ + """ + def as_buffer(self) -> Buffer: + """ + BinaryScalar.as_buffer(self) + + Return a view over this value as a Buffer object. + """ class ListScalar(Scalar[types.ListType[_DataTypeT]]): + """ + Concrete class for list-like scalars. + """ @property def values(self) -> Array | None: ... - def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - def __iter__(self) -> Iterator[Array]: ... + def __len__(self) -> int: + """ + Return the number of values. + """ + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: + """ + Return the value at the given index. + """ + def __iter__(self) -> Iterator[Array]: + """ + Iterate over this element's values. 
+ """ class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataTypeT, types._Size]]): + """ + """ @property def values(self) -> Array | None: ... - def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - def __iter__(self) -> Iterator[Array]: ... + def __len__(self) -> int: + """ + ListScalar.__len__(self) + + Return the number of values. + """ + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: + """ + ListScalar.__getitem__(self, i) + + Return the value at the given index. + """ + def __iter__(self) -> Iterator[Array]: + """ + ListScalar.__iter__(self) + + Iterate over this element's values. + """ class LargeListScalar(Scalar[types.LargeListType[_DataTypeT]]): + """ + """ @property def values(self) -> Array | None: ... - def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - def __iter__(self) -> Iterator[Array]: ... + def __len__(self) -> int: + """ + ListScalar.__len__(self) + + Return the number of values. + """ + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: + """ + ListScalar.__getitem__(self, i) + + Return the value at the given index. + """ + def __iter__(self) -> Iterator[Array]: + """ + ListScalar.__iter__(self) + + Iterate over this element's values. + """ class ListViewScalar(Scalar[types.ListViewType[_DataTypeT]]): + """ + """ @property def values(self) -> Array | None: ... - def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - def __iter__(self) -> Iterator[Array]: ... + def __len__(self) -> int: + """ + ListScalar.__len__(self) + + Return the number of values. + """ + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: + """ + ListScalar.__getitem__(self, i) + + Return the value at the given index. + """ + def __iter__(self) -> Iterator[Array]: + """ + ListScalar.__iter__(self) + + Iterate over this element's values. + """ class LargeListViewScalar(Scalar[types.LargeListViewType[_DataTypeT]]): + """ + """ @property def values(self) -> Array | None: ... - def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - def __iter__(self) -> Iterator[Array]: ... + def __len__(self) -> int: + """ + ListScalar.__len__(self) + + Return the number of values. + """ + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: + """ + ListScalar.__getitem__(self, i) + + Return the value at the given index. + """ + def __iter__(self) -> Iterator[Array]: + """ + ListScalar.__iter__(self) + + Iterate over this element's values. + """ class StructScalar(Scalar[types.StructType], collections.abc.Mapping[str, Scalar]): - def __len__(self) -> int: ... - def __iter__(self) -> Iterator[str]: ... - def __getitem__(self, __key: str) -> Scalar[Any]: ... # type: ignore[override] + """ + Concrete class for struct scalars. + """ + def __len__(self) -> int: + """ + Return len(self). + """ + def __iter__(self) -> Iterator[str]: + """ + Implement iter(self). + """ + def __getitem__(self, key: int | str) -> Scalar[Any]: + """ + Return the child value for the given field. + + Parameters + ---------- + key : Union[int, str] + Index / position or name of the field. + + Returns + ------- + result : Scalar + """ def _as_py_tuple(self) -> list[tuple[str, Any]]: ... - def tolist(self) -> list[Any]: ... class MapScalar(Scalar[types.MapType[types._K, types._ValueT]]): + """ + Concrete class for map scalars. + """ @property def values(self) -> Array | None: ... - def __len__(self) -> int: ... 
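# An access sketch for the struct and map scalars documented above, assuming
# standard pyarrow conversion rules; the sample data is illustrative.
import pyarrow as pa

maps = pa.array([[("a", 1), ("b", 2)]], type=pa.map_(pa.string(), pa.int64()))
m = maps[0]                              # MapScalar
m.as_py()                                # [('a', 1), ('b', 2)]
m.as_py(maps_as_pydicts="strict")        # {'a': 1, 'b': 2}

structs = pa.array([{"id": 1, "label": "x"}])
structs[0]["label"].as_py()              # 'x' via StructScalar.__getitem__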
- def __getitem__(self, i: int) -> tuple[Scalar[types._K], types._ValueT, Any]: ... - @overload + def __len__(self) -> int: + """ + ListScalar.__len__(self) + + Return the number of values. + """ + def __getitem__(self, i: int) -> tuple[Scalar[types._K], types._ValueT, Any]: + """ + Return the value at the given index or key. + """ def __iter__( self: Scalar[ - types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]] - ], - ) -> Iterator[tuple[_AsPyTypeK, _AsPyTypeV]]: ... - @overload - def __iter__( - self: Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]],], - ) -> Iterator[tuple[Any, _AsPyTypeV]]: ... - @overload - def __iter__( - self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any],], - ) -> Iterator[tuple[_AsPyTypeK, Any]]: ... + types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]],] + | Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]]] + | Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any]] + ) -> Iterator[tuple[_AsPyTypeK, _AsPyTypeV]] | Iterator[tuple[Any, _AsPyTypeV]] | Iterator[tuple[_AsPyTypeK, Any]]: + """ + Iterate over this element's values. + """ class DictionaryScalar(Scalar[types.DictionaryType[types._IndexT, types._BasicValueT]]): + """ + Concrete class for dictionary-encoded scalars. + """ @property - def index(self) -> Scalar[types._IndexT]: ... + def index(self) -> Scalar[types._IndexT]: + """ + Return this value's underlying index as a scalar. + """ @property - def value(self) -> Scalar[types._BasicValueT]: ... + def value(self) -> Scalar[types._BasicValueT]: + """ + Return the encoded value as a scalar. + """ @property def dictionary(self) -> Array: ... class RunEndEncodedScalar(Scalar[types.RunEndEncodedType[types._RunEndType, types._BasicValueT]]): + """ + Concrete class for RunEndEncoded scalars. + """ @property - def value(self) -> tuple[int, types._BasicValueT] | None: ... + def value(self) -> tuple[int, types._BasicValueT] | None: + """ + Return underlying value as a scalar. + """ class UnionScalar(Scalar[types.UnionType]): + """ + Concrete class for Union scalars. + """ @property - def value(self) -> Any | None: ... + def value(self) -> Any | None: + """ + Return underlying value as a scalar. + """ @property - def type_code(self) -> str: ... + def type_code(self) -> str: + """ + Return the union type code for this scalar. + """ class ExtensionScalar(Scalar[types.ExtensionType]): + """ + Concrete class for Extension scalars. + """ @property - def value(self) -> Any | None: ... + def value(self) -> Any | None: + """ + Return storage value as a scalar. + """ @staticmethod def from_storage(typ: types.BaseExtensionType, value) -> ExtensionScalar: """ @@ -374,12 +566,27 @@ class ExtensionScalar(Scalar[types.ExtensionType]): ext_scalar : ExtensionScalar """ -class Bool8Scalar(Scalar[types.Bool8Type]): ... -class UuidScalar(Scalar[types.UuidType]): ... -class JsonScalar(Scalar[types.JsonType]): ... -class OpaqueScalar(Scalar[types.OpaqueType]): ... +class Bool8Scalar(Scalar[types.Bool8Type]): + """ + Concrete class for bool8 extension scalar. + """ +class UuidScalar(Scalar[types.UuidType]): + """ + Concrete class for Uuid extension scalar. + """ +class JsonScalar(Scalar[types.JsonType]): + """ + Concrete class for JSON extension scalar. + """ +class OpaqueScalar(Scalar[types.OpaqueType]): + """ + Concrete class for opaque extension scalar. + """ class FixedShapeTensorScalar(ExtensionScalar): + """ + Concrete class for fixed shape tensor extension scalar. 
+ """ def to_numpy(self) -> np.ndarray: """ Convert fixed shape tensor scalar to a numpy.ndarray. @@ -405,542 +612,13 @@ class FixedShapeTensorScalar(ExtensionScalar): Tensor represented stored in FixedShapeTensorScalar. """ -_V = TypeVar("_V") - -class NullableCollection(Protocol[_V]): # pyright: ignore[reportInvalidTypeVarUse] - def __iter__(self) -> Iterator[_V] | Iterator[_V | None]: ... - def __len__(self) -> int: ... - def __contains__(self, item: Any, /) -> bool: ... - -@overload -def scalar( - value: str, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> StringScalar: ... -@overload -def scalar( - value: bytes, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> BinaryScalar: ... -@overload -def scalar( # pyright: ignore[reportOverlappingOverload] - value: bool, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> BooleanScalar: ... -@overload -def scalar( - value: int, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Int64Scalar: ... -@overload -def scalar( - value: float, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> DoubleScalar: ... -@overload -def scalar( - value: Decimal, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Decimal128Scalar: ... -@overload -def scalar( # pyright: ignore[reportOverlappingOverload] - value: dt.datetime, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> TimestampScalar[Literal["us"]]: ... -@overload -def scalar( - value: dt.date, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Date32Scalar: ... -@overload -def scalar( - value: dt.time, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Time64Scalar[Literal["us"]]: ... -@overload -def scalar( - value: dt.timedelta, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> DurationScalar[Literal["us"]]: ... -@overload -def scalar( # pyright: ignore[reportOverlappingOverload] - value: MonthDayNano, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> MonthDayNanoIntervalScalar: ... -@overload -def scalar( - value: Mapping[str, Any], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> StructScalar: ... -@overload -def scalar( - value: NullableCollection[str], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.StringType]]: ... -@overload -def scalar( - value: NullableCollection[bytes], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.BinaryType]]: ... -@overload -def scalar( - value: NullableCollection[bool], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.BoolType]]: ... -@overload -def scalar( - value: NullableCollection[int], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.Int64Type]]: ... -@overload -def scalar( - value: NullableCollection[float], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.Float64Type]]: ... 
-@overload -def scalar( - value: NullableCollection[Decimal], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.Decimal32Type]]: ... -@overload -def scalar( - value: NullableCollection[dt.datetime], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.TimestampType[Literal["us"]]]]: ... -@overload -def scalar( - value: NullableCollection[dt.date], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.Date32Type]]: ... -@overload -def scalar( - value: NullableCollection[dt.time], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.Time64Type[Literal["us"]]]]: ... -@overload -def scalar( - value: NullableCollection[dt.timedelta], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.DurationType[Literal["us"]]]]: ... -@overload -def scalar( - value: NullableCollection[MonthDayNano], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.MonthDayNanoIntervalType]]: ... -@overload -def scalar( - value: NullableCollection[Any], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[Any]: ... -@overload -def scalar( - value: Any, - type: types.NullType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> NullScalar: ... -@overload -def scalar( - value: Any, - type: types.BoolType | Literal["bool"], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> BooleanScalar: ... -@overload -def scalar( - value: Any, - type: types.UInt8Type | Literal["uint8"], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> UInt8Scalar: ... -@overload -def scalar( - value: Any, - type: types.Int8Type | Literal["int8"], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Int8Scalar: ... -@overload -def scalar( - value: Any, - type: types.UInt16Type | Literal["uint16"], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> UInt16Scalar: ... -@overload -def scalar( - value: Any, - type: types.Int16Type | Literal["int16"], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Int16Scalar: ... -@overload -def scalar( - value: Any, - type: types.Uint32Type | Literal["uint32"], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> UInt32Scalar: ... -@overload -def scalar( - value: Any, - type: types.Int32Type | Literal["int32"], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Int32Scalar: ... -@overload -def scalar( - value: Any, - type: types.UInt64Type | Literal["uint64"], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> UInt64Scalar: ... -@overload -def scalar( - value: Any, - type: types.Int64Type | Literal["int64"], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Int64Scalar: ... -@overload -def scalar( - value: Any, - type: types.Float16Type | Literal["f16"], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> HalfFloatScalar: ... 
-@overload -def scalar( - value: Any, - type: types.Float32Type | Literal["f32"], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> FloatScalar: ... -@overload -def scalar( - value: Any, - type: types.Float64Type | Literal["f64"], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> DoubleScalar: ... -@overload -def scalar( - value: Any, - type: types.Date32Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Date32Scalar: ... -@overload -def scalar( - value: Any, - type: types.Date64Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Date64Scalar: ... -@overload -def scalar( - value: Any, - type: types.MonthDayNanoIntervalType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> MonthDayNanoIntervalScalar: ... -@overload -def scalar( - value: Any, - type: types.StringType | Literal["string"], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> StringScalar: ... -@overload -def scalar( - value: Any, - type: types.LargeStringType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> LargeStringScalar: ... -@overload -def scalar( - value: Any, - type: types.StringViewType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> StringViewScalar: ... -@overload -def scalar( - value: Any, - type: types.BinaryType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> BinaryScalar: ... -@overload -def scalar( - value: Any, - type: types.LargeBinaryType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> LargeBinaryScalar: ... -@overload -def scalar( - value: Any, - type: types.BinaryViewType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> BinaryViewScalar: ... -@overload -def scalar( - value: Any, - type: types.TimestampType[types._Unit, types._Tz], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> TimestampScalar[types._Unit, types._Tz]: ... -@overload -def scalar( - value: Any, - type: types.Time32Type[types._Time32Unit], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Time32Scalar[types._Time32Unit]: ... -@overload -def scalar( - value: Any, - type: types.Time64Type[types._Time64Unit], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Time64Scalar[types._Time64Unit]: ... -@overload -def scalar( - value: Any, - type: types.DurationType[types._Unit], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> DurationScalar[types._Unit]: ... -@overload -def scalar( - value: Any, - type: types.Decimal32Type[types._Precision, types._Scale], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Decimal32Scalar[types._Precision, types._Scale]: ... -@overload -def scalar( - value: Any, - type: types.Decimal64Type[types._Precision, types._Scale], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Decimal64Scalar[types._Precision, types._Scale]: ... -@overload -def scalar( - value: Any, - type: types.Decimal128Type[types._Precision, types._Scale], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Decimal128Scalar[types._Precision, types._Scale]: ... 
-@overload -def scalar( - value: Any, - type: types.Decimal256Type[types._Precision, types._Scale], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Decimal256Scalar[types._Precision, types._Scale]: ... -@overload -def scalar( - value: Any, - type: types.ListType[_DataTypeT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[_DataTypeT]: ... -@overload -def scalar( - value: Any, - type: types.LargeListType[_DataTypeT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> LargeListScalar[_DataTypeT]: ... -@overload -def scalar( - value: Any, - type: types.ListViewType[_DataTypeT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListViewScalar[_DataTypeT]: ... -@overload -def scalar( - value: Any, - type: types.LargeListViewType[_DataTypeT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> LargeListViewScalar[_DataTypeT]: ... -@overload -def scalar( - value: Any, - type: types.FixedSizeListType[_DataTypeT, types._Size], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> FixedSizeListScalar[_DataTypeT, types._Size]: ... -@overload -def scalar( - value: Any, - type: types.DictionaryType[types._IndexT, types._BasicValueT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> DictionaryScalar[types._IndexT, types._BasicValueT]: ... -@overload -def scalar( - value: Any, - type: types.MapType[types._K, types._ValueT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> MapScalar[types._K, types._ValueT]: ... -@overload -def scalar( - value: Any, - type: types.StructType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> StructScalar: ... -@overload -def scalar( - value: Any, - type: types.UnionType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> UnionScalar: ... -@overload -def scalar( - value: Any, - type: types.RunEndEncodedType[types._RunEndType, types._BasicValueT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> RunEndEncodedScalar[types._RunEndType, types._BasicValueT]: ... -@overload -def scalar( - value: Any, - type: types.Bool8Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Bool8Scalar: ... -@overload -def scalar( - value: Any, - type: types.UuidType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> UuidScalar: ... -@overload -def scalar( - value: Any, - type: types.JsonType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> JsonScalar: ... -@overload -def scalar( - value: Any, - type: types.OpaqueType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> OpaqueScalar: ... -@overload def scalar( value: Any, type: _DataTypeT, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, -) -> Scalar[_DataTypeT]: ... -def scalar(*args, **kwargs): +) -> Scalar[_DataTypeT]: """ Create a pyarrow.Scalar instance from a Python object. 
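Since the per-type scalar() overloads above collapse into a single generic signature, here is a small runtime sketch (not part of the stubs) of the mapping that signature is meant to annotate, using only public pyarrow calls:

import pyarrow as pa

# The concrete Scalar subclass follows the Arrow type passed (or inferred).
s = pa.scalar(1, type=pa.int32())
assert isinstance(s, pa.Int32Scalar) and s.as_py() == 1

lst = pa.scalar(["a", None, "c"], type=pa.list_(pa.string()))
assert isinstance(lst, pa.ListScalar)
assert len(lst) == 3           # ListScalar.__len__
assert lst[0].as_py() == "a"   # ListScalar.__getitem__ returns a StringScalar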
@@ -1032,5 +710,4 @@ __all__ = [ "JsonScalar", "OpaqueScalar", "scalar", - "NullableCollection", ] diff --git a/python/pyarrow/__lib_pxi/tensor.pyi b/python/pyarrow/__lib_pxi/tensor.pyi index 5ad950c84d0..ac34fa08ffc 100644 --- a/python/pyarrow/__lib_pxi/tensor.pyi +++ b/python/pyarrow/__lib_pxi/tensor.pyi @@ -26,7 +26,7 @@ import numpy as np from pyarrow.lib import _Weakrefable from scipy.sparse import coo_matrix, csr_matrix -from sparse import COO +from sparse import COO # type: ignore class Tensor(_Weakrefable): """ @@ -37,7 +37,7 @@ class Tensor(_Weakrefable): >>> import pyarrow as pa >>> import numpy as np >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) type: int32 shape: (2, 3) @@ -61,7 +61,7 @@ class Tensor(_Weakrefable): >>> import pyarrow as pa >>> import numpy as np >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) type: int32 shape: (2, 3) @@ -76,7 +76,7 @@ class Tensor(_Weakrefable): >>> import pyarrow as pa >>> import numpy as np >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) >>> tensor.to_numpy() array([[ 2, 2, 4], [ 4, 5, 100]], dtype=int32) @@ -95,9 +95,9 @@ class Tensor(_Weakrefable): >>> import pyarrow as pa >>> import numpy as np >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) >>> y = np.array([[2, 2, 4], [4, 5, 10]], np.int32) - >>> tensor2 = pa.Tensor.from_numpy(y, dim_names=["a", "b"]) + >>> tensor2 = pa.Tensor.from_numpy(y, dim_names=["a","b"]) >>> tensor.equals(tensor) True >>> tensor.equals(tensor2) @@ -117,7 +117,7 @@ class Tensor(_Weakrefable): >>> import pyarrow as pa >>> import numpy as np >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) >>> tensor.dim_name(0) 'dim1' >>> tensor.dim_name(1) @@ -133,7 +133,7 @@ class Tensor(_Weakrefable): >>> import pyarrow as pa >>> import numpy as np >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) >>> tensor.dim_names ['dim1', 'dim2'] """ @@ -147,7 +147,7 @@ class Tensor(_Weakrefable): >>> import pyarrow as pa >>> import numpy as np >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) >>> tensor.is_mutable True """ @@ -161,7 +161,7 @@ class Tensor(_Weakrefable): >>> import pyarrow as pa >>> import numpy as np >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) >>> tensor.is_contiguous True """ @@ -175,7 +175,7 @@ class Tensor(_Weakrefable): >>> import pyarrow as pa >>> import numpy as np >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) >>> tensor.ndim 2 """ 
@@ -189,7 +189,7 @@ class Tensor(_Weakrefable): >>> import pyarrow as pa >>> import numpy as np >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) >>> tensor.size 6 """ @@ -203,7 +203,7 @@ class Tensor(_Weakrefable): >>> import pyarrow as pa >>> import numpy as np >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) >>> tensor.shape (2, 3) """ @@ -217,12 +217,15 @@ class Tensor(_Weakrefable): >>> import pyarrow as pa >>> import numpy as np >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) >>> tensor.strides (12, 4) """ class SparseCOOTensor(_Weakrefable): + """ + A sparse COO tensor. + """ @classmethod def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: """ @@ -265,12 +268,12 @@ class SparseCOOTensor(_Weakrefable): @classmethod def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: """ - Convert scipy.sparse.coo_matrix to arrow::SparseCOOTensor + Convert scipy.sparse.coo_array or scipy.sparse.coo_matrix to arrow::SparseCOOTensor Parameters ---------- - obj : scipy.sparse.csr_matrix - The scipy matrix that should be converted. + obj : scipy.sparse.coo_array or scipy.sparse.coo_matrix + The scipy array or matrix that should be converted. dim_names : list, optional Names of the dimensions. """ @@ -302,7 +305,7 @@ class SparseCOOTensor(_Weakrefable): """ def to_scipy(self) -> coo_matrix: """ - Convert arrow::SparseCOOTensor to scipy.sparse.coo_matrix. + Convert arrow::SparseCOOTensor to scipy.sparse.coo_array. """ def to_pydata_sparse(self) -> COO: """ @@ -399,11 +402,11 @@ class SparseCSRMatrix(_Weakrefable): @classmethod def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: """ - Convert scipy.sparse.csr_matrix to arrow::SparseCSRMatrix. + Convert scipy.sparse.csr_array or scipy.sparse.csr_matrix to arrow::SparseCSRMatrix. Parameters ---------- - obj : scipy.sparse.csr_matrix + obj : scipy.sparse.csr_array or scipy.sparse.csr_matrix The scipy matrix that should be converted. dim_names : list, optional Names of the dimensions. @@ -424,7 +427,7 @@ class SparseCSRMatrix(_Weakrefable): """ def to_scipy(self) -> csr_matrix: """ - Convert arrow::SparseCSRMatrix to scipy.sparse.csr_matrix. + Convert arrow::SparseCSRMatrix to scipy.sparse.csr_array. """ def to_tensor(self) -> Tensor: """ @@ -515,11 +518,11 @@ class SparseCSCMatrix(_Weakrefable): @classmethod def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: """ - Convert scipy.sparse.csc_matrix to arrow::SparseCSCMatrix + Convert scipy.sparse.csc_array or scipy.sparse.csc_matrix to arrow::SparseCSCMatrix Parameters ---------- - obj : scipy.sparse.csc_matrix + obj : scipy.sparse.csc_array or scipy.sparse.csc_matrix The scipy matrix that should be converted. dim_names : list, optional Names of the dimensions. 
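The from_scipy/to_scipy docstring updates in these hunks (mentioning scipy.sparse.*_array alongside *_matrix) describe conversions along these lines; a minimal round-trip sketch, assuming scipy is installed:

import numpy as np
import pyarrow as pa
from scipy.sparse import coo_matrix

dense = np.array([[0, 1, 0], [2, 0, 3]], dtype=np.int64)
sparse = coo_matrix(dense)
tensor = pa.SparseCOOTensor.from_scipy(sparse, dim_names=["row", "col"])
assert tensor.shape == (2, 3)
assert (tensor.to_scipy().toarray() == dense).all()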
@@ -540,7 +543,7 @@ class SparseCSCMatrix(_Weakrefable): """ def to_scipy(self) -> csr_matrix: """ - Convert arrow::SparseCSCMatrix to scipy.sparse.csc_matrix + Convert arrow::SparseCSCMatrix to scipy.sparse.csc_array """ def to_tensor(self) -> Tensor: """ diff --git a/python/pyarrow/__lib_pxi/types.pyi b/python/pyarrow/__lib_pxi/types.pyi index aa965e3506c..27a2c75d68d 100644 --- a/python/pyarrow/__lib_pxi/types.pyi +++ b/python/pyarrow/__lib_pxi/types.pyi @@ -26,19 +26,20 @@ if sys.version_info >= (3, 11): else: from typing_extensions import Self -from typing import Any, Generic, Iterable, Iterator, Literal, overload +from typing import Any, Generic, Iterable, Iterator, Literal import numpy as np import pandas as pd from pyarrow._stubs_typing import SupportArrowSchema +# TODO from pyarrow.lib import ( Array, - ChunkedArray, + # ChunkedArray, ExtensionArray, MemoryPool, MonthDayNano, - Table, + # Table, ) from typing_extensions import TypeVar, deprecated @@ -119,7 +120,7 @@ class DataType(_Weakrefable): ListType(list) >>> pa.list_(pa.string()).num_fields 1 - >>> struct = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct = pa.struct({'x': pa.int32(), 'y': pa.string()}) >>> struct.num_fields 2 """ @@ -137,7 +138,10 @@ class DataType(_Weakrefable): >>> pa.string().num_buffers 3 """ - def __hash__(self) -> int: ... + def __hash__(self) -> int: + """ + Return hash(self). + """ def equals(self, other: DataType | str, *, check_metadata: bool = False) -> bool: """ Return true if type is equivalent to passed value. @@ -240,12 +244,12 @@ class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): Create an instance of timestamp type: - >>> pa.timestamp("us") + >>> pa.timestamp('us') TimestampType(timestamp[us]) Create an instance of timestamp type with timezone: - >>> pa.timestamp("s", tz="UTC") + >>> pa.timestamp('s', tz='UTC') TimestampType(timestamp[s, tz=UTC]) """ @property @@ -256,7 +260,7 @@ class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): Examples -------- >>> import pyarrow as pa - >>> t = pa.timestamp("us") + >>> t = pa.timestamp('us') >>> t.unit 'us' """ @@ -268,7 +272,7 @@ class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): Examples -------- >>> import pyarrow as pa - >>> t = pa.timestamp("s", tz="UTC") + >>> t = pa.timestamp('s', tz='UTC') >>> t.tz 'UTC' """ @@ -287,7 +291,7 @@ class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]): Create an instance of time32 type: >>> import pyarrow as pa - >>> pa.time32("ms") + >>> pa.time32('ms') Time32Type(time32[ms]) """ @property @@ -298,7 +302,7 @@ class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]): Examples -------- >>> import pyarrow as pa - >>> t = pa.time32("ms") + >>> t = pa.time32('ms') >>> t.unit 'ms' """ @@ -317,7 +321,7 @@ class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]): Create an instance of time64 type: >>> import pyarrow as pa - >>> pa.time64("us") + >>> pa.time64('us') Time64Type(time64[us]) """ @property @@ -328,7 +332,7 @@ class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]): Examples -------- >>> import pyarrow as pa - >>> t = pa.time64("us") + >>> t = pa.time64('us') >>> t.unit 'us' """ @@ -342,7 +346,7 @@ class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): Create an instance of duration type: >>> import pyarrow as pa - >>> pa.duration("s") + >>> pa.duration('s') DurationType(duration[s]) """ @property @@ -353,7 +357,7 @@ class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): Examples -------- >>> import pyarrow as pa 
- >>> t = pa.duration("s") + >>> t = pa.duration('s') >>> t.unit 's' """ @@ -860,17 +864,17 @@ class StructType(DataType): Accessing fields using direct indexing: - >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct_type = pa.struct({'x': pa.int32(), 'y': pa.string()}) >>> struct_type[0] pyarrow.Field - >>> struct_type["y"] + >>> struct_type['y'] pyarrow.Field Accessing fields using ``field()``: >>> struct_type.field(1) pyarrow.Field - >>> struct_type.field("x") + >>> struct_type.field('x') pyarrow.Field # Creating a schema from the struct type's fields: @@ -897,16 +901,16 @@ class StructType(DataType): Examples -------- >>> import pyarrow as pa - >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct_type = pa.struct({'x': pa.int32(), 'y': pa.string()}) Index of the field with a name 'y': - >>> struct_type.get_field_index("y") + >>> struct_type.get_field_index('y') 1 Index of the field that does not exist: - >>> struct_type.get_field_index("z") + >>> struct_type.get_field_index('z') -1 """ def field(self, i: int | str) -> Field: @@ -925,7 +929,7 @@ class StructType(DataType): -------- >>> import pyarrow as pa - >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct_type = pa.struct({'x': pa.int32(), 'y': pa.string()}) Select the second field: @@ -934,7 +938,7 @@ class StructType(DataType): Select the field named 'x': - >>> struct_type.field("x") + >>> struct_type.field('x') pyarrow.Field """ def get_all_field_indices(self, name: str) -> list[int]: @@ -953,12 +957,18 @@ class StructType(DataType): Examples -------- >>> import pyarrow as pa - >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) - >>> struct_type.get_all_field_indices("x") + >>> struct_type = pa.struct({'x': pa.int32(), 'y': pa.string()}) + >>> struct_type.get_all_field_indices('x') [0] """ - def __len__(self) -> int: ... - def __iter__(self) -> Iterator[Field]: ... + def __len__(self) -> int: + """ + Like num_fields(). + """ + def __iter__(self) -> Iterator[Field]: + """ + Iterate over struct fields, in order. + """ __getitem__ = field # pyright: ignore[reportUnknownVariableType] @property def names(self) -> list[str]: @@ -968,7 +978,7 @@ class StructType(DataType): Examples -------- >>> import pyarrow as pa - >>> struct_type = pa.struct([("a", pa.int64()), ("b", pa.float64()), ("c", pa.string())]) + >>> struct_type = pa.struct([('a', pa.int64()), ('b', pa.float64()), ('c', pa.string())]) >>> struct_type.names ['a', 'b', 'c'] """ @@ -980,7 +990,7 @@ class StructType(DataType): Examples -------- >>> import pyarrow as pa - >>> struct_type = pa.struct([("a", pa.int64()), ("b", pa.float64()), ("c", pa.string())]) + >>> struct_type = pa.struct([('a', pa.int64()), ('b', pa.float64()), ('c', pa.string())]) >>> struct_type.fields [pyarrow.Field, pyarrow.Field, pyarrow.Field] """ @@ -994,32 +1004,24 @@ class UnionType(DataType): Create an instance of a dense UnionType using ``pa.union``: >>> import pyarrow as pa - >>> ( - ... pa.union( - ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], - ... mode=pa.lib.UnionMode_DENSE, - ... ), - ... ) + >>> pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], + ... 
mode=pa.lib.UnionMode_DENSE), (DenseUnionType(dense_union),) Create an instance of a dense UnionType using ``pa.dense_union``: - >>> pa.dense_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> pa.dense_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) DenseUnionType(dense_union) Create an instance of a sparse UnionType using ``pa.union``: - >>> ( - ... pa.union( - ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], - ... mode=pa.lib.UnionMode_SPARSE, - ... ), - ... ) + >>> pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], + ... mode=pa.lib.UnionMode_SPARSE), (SparseUnionType(sparse_union),) Create an instance of a sparse UnionType using ``pa.sparse_union``: - >>> pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) SparseUnionType(sparse_union) """ @property @@ -1030,7 +1032,7 @@ class UnionType(DataType): Examples -------- >>> import pyarrow as pa - >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union = pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) >>> union.mode 'sparse' """ @@ -1042,12 +1044,18 @@ class UnionType(DataType): Examples -------- >>> import pyarrow as pa - >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union = pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) >>> union.type_codes [0, 1] """ - def __len__(self) -> int: ... - def __iter__(self) -> Iterator[Field]: ... + def __len__(self) -> int: + """ + Like num_fields(). + """ + def __iter__(self) -> Iterator[Field]: + """ + Iterate over union members, in order. + """ def field(self, i: int) -> Field: """ Return a child field by its numeric index. @@ -1063,7 +1071,7 @@ class UnionType(DataType): Examples -------- >>> import pyarrow as pa - >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union = pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) >>> union[0] pyarrow.Field """ @@ -1078,21 +1086,27 @@ class SparseUnionType(UnionType): Create an instance of a sparse UnionType using ``pa.union``: >>> import pyarrow as pa - >>> ( - ... pa.union( - ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], - ... mode=pa.lib.UnionMode_SPARSE, - ... ), - ... ) + >>> pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], + ... mode=pa.lib.UnionMode_SPARSE), (SparseUnionType(sparse_union),) Create an instance of a sparse UnionType using ``pa.sparse_union``: - >>> pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) SparseUnionType(sparse_union) """ @property - def mode(self) -> Literal["sparse"]: ... + def mode(self) -> Literal["sparse"]: + """ + The mode of the union ("dense" or "sparse"). + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) + >>> union.mode + 'sparse' + """ class DenseUnionType(UnionType): """ @@ -1103,22 +1117,28 @@ class DenseUnionType(UnionType): Create an instance of a dense UnionType using ``pa.union``: >>> import pyarrow as pa - >>> ( - ... pa.union( - ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], - ... mode=pa.lib.UnionMode_DENSE, - ... ), - ... 
) + >>> pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], + ... mode=pa.lib.UnionMode_DENSE), (DenseUnionType(dense_union),) Create an instance of a dense UnionType using ``pa.dense_union``: - >>> pa.dense_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> pa.dense_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) DenseUnionType(dense_union) """ @property - def mode(self) -> Literal["dense"]: ... + def mode(self) -> Literal["dense"]: + """ + The mode of the union ("dense" or "sparse"). + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) + >>> union.mode + 'sparse' + """ _RunEndType = TypeVar("_RunEndType", Int16Type, Int32Type, Int64Type) @@ -1131,7 +1151,9 @@ class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]): @property def value_type(self) -> _BasicValueT: ... -_StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) +# TODO: replace below with: +# _StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) +_StorageT = TypeVar("_StorageT", bound=Array | Any) class BaseExtensionType(DataType): """ @@ -1155,7 +1177,19 @@ class BaseExtensionType(DataType): """ The underlying storage type. """ - def wrap_array(self, storage: _StorageT) -> _StorageT: ... + def wrap_array(self, storage: _StorageT) -> _StorageT: + """ + Wrap the given storage array as an extension array. + + Parameters + ---------- + storage : Array or ChunkedArray + + Returns + ------- + array : Array or ChunkedArray + Extension array wrapping the storage array + """ class ExtensionType(BaseExtensionType): """ @@ -1219,7 +1253,7 @@ class ExtensionType(BaseExtensionType): ... {"numer": 10, "denom": 17}, ... {"numer": 20, "denom": 13}, ... ], - ... type=rational_type.storage_type, + ... type=rational_type.storage_type ... ) >>> rational_array = rational_type.wrap_array(storage_array) >>> rational_array @@ -1264,7 +1298,13 @@ class ExtensionType(BaseExtensionType): ``__arrow_ext_deserialize__``. """ - def __init__(self, storage_type: DataType, extension_name: str) -> None: ... + def __init__(self, storage_type: DataType, extension_name: str) -> None: + """ + Initialize an extension type instance. + + This should be called at the end of the subclass' + ``__init__`` method. + """ def __arrow_ext_serialize__(self) -> bytes: """ Serialized representation of metadata to reconstruct the type object. @@ -1301,7 +1341,8 @@ class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): Create an instance of fixed shape tensor extension type with permutation: - >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), + ... permutation=[0, 2, 1]) >>> tensor_type.permutation [0, 2, 1] """ @@ -1397,37 +1438,37 @@ class OpaqueType(BaseExtensionType): The name of the external system. """ -@deprecated( - "This class is deprecated and its deserialization is disabled by default. " - ":class:`ExtensionType` is recommended instead." -) -class PyExtensionType(ExtensionType): - """ - Concrete base class for Python-defined extension types based on pickle - for (de)serialization. - - .. warning:: - This class is deprecated and its deserialization is disabled by default. - :class:`ExtensionType` is recommended instead. - - Parameters - ---------- - storage_type : DataType - The storage type for which the extension is built. - """ - def __init__(self, storage_type: DataType) -> None: ... 
- @classmethod - def set_auto_load(cls, value: bool) -> None: - """ - Enable or disable auto-loading of serialized PyExtensionType instances. - - Parameters - ---------- - value : bool - Whether to enable auto-loading. - """ +# @deprecated( +# "This class is deprecated and its deserialization is disabled by default. " +# ":class:`ExtensionType` is recommended instead." +# ) +# class PyExtensionType(ExtensionType): +# """ +# Concrete base class for Python-defined extension types based on pickle +# for (de)serialization. +# +# .. warning:: +# This class is deprecated and its deserialization is disabled by default. +# :class:`ExtensionType` is recommended instead. +# +# Parameters +# ---------- +# storage_type : DataType +# The storage type for which the extension is built. +# """ +# def __init__(self, storage_type: DataType) -> None: ... +# @classmethod +# def set_auto_load(cls, value: bool) -> None: +# """ +# Enable or disable auto-loading of serialized PyExtensionType instances. +# +# Parameters +# ---------- +# value : bool +# Whether to enable auto-loading. +# """ -class UnknownExtensionType(PyExtensionType): # type: ignore +class UnknownExtensionType(ExtensionType): # type: ignore """ A concrete class for Python-defined extension types that refer to an unknown Python implementation. @@ -1439,9 +1480,12 @@ class UnknownExtensionType(PyExtensionType): # type: ignore serialized : bytes The serialised output. """ - def __init__(self, storage_type: DataType, serialized: bytes) -> None: ... + def __init__(self, storage_type: DataType, serialized: bytes) -> None: + """ + Initialize self. See help(type(self)) for accurate signature. + """ -def register_extension_type(ext_type: PyExtensionType) -> None: # type: ignore +def register_extension_type(ext_type: ExtensionType) -> None: # type: ignore """ Register a Python extension type. @@ -1549,23 +1593,52 @@ class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]): **kwargs : optional additional key-value metadata """ - def __init__(self, __arg0__: Mapping[bytes, bytes] | Mapping[str, str] | None = None, **kwargs) -> None: ... - def equals(self, other: KeyValueMetadata) -> bool: ... - def __len__(self) -> int: ... - def __contains__(self, __key: object) -> bool: ... - def __getitem__(self, __key: Any) -> Any: ... - def __iter__(self) -> Iterator[bytes]: ... - def get_all(self, key: str) -> list[bytes]: ... + def __init__(self, __arg0__: Mapping[bytes, bytes] | None = None, **kwargs) -> None: + """ + Initialize self. See help(type(self)) for accurate signature. + """ + def equals(self, other: KeyValueMetadata) -> bool: + """ + Parameters + ---------- + other : pyarrow.KeyValueMetadata + + Returns + ------- + bool + """ + def __len__(self) -> int: + """ + Return len(self). + """ + def __contains__(self, __key: object) -> bool: + """ + Return bool(key in self). + """ + def __getitem__(self, __key: Any) -> Any: + """ + Return self[key]. + """ + def __iter__(self) -> Iterator[bytes]: + """ + Implement iter(self). + """ + def get_all(self, key: str) -> list[bytes]: + """ + Parameters + ---------- + key : str + + Returns + ------- + list[byte] + """ def to_dict(self) -> dict[bytes, bytes]: """ Convert KeyValueMetadata to dict. If a key occurs twice, the value for the first one is returned """ -def ensure_metadata( - meta: Mapping[bytes | str, bytes | str] | KeyValueMetadata | None, allow_none: bool = False -) -> KeyValueMetadata | None: ... 
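register_extension_type() is now annotated against ExtensionType rather than the commented-out PyExtensionType. A minimal sketch of the subclass shape it expects (the LabelType class and the "example.label" name are made up for illustration):

import pyarrow as pa

class LabelType(pa.ExtensionType):
    """String storage tagged with an application-specific extension name."""

    def __init__(self):
        super().__init__(pa.string(), "example.label")

    def __arrow_ext_serialize__(self) -> bytes:
        return b""  # no parameters to persist

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        return cls()

label_type = LabelType()
pa.register_extension_type(label_type)
arr = label_type.wrap_array(pa.array(["a", "b"], type=pa.string()))
assert arr.type.extension_name == "example.label"
pa.unregister_extension_type("example.label")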
- class Field(_Weakrefable, Generic[_DataTypeT]): """ A named field, with a data type, nullability, and optional metadata. @@ -1579,11 +1652,12 @@ class Field(_Weakrefable, Generic[_DataTypeT]): Create an instance of pyarrow.Field: >>> import pyarrow as pa - >>> pa.field("key", pa.int32()) + >>> pa.field('key', pa.int32()) pyarrow.Field - >>> pa.field("key", pa.int32(), nullable=False) + >>> pa.field('key', pa.int32(), nullable=False) pyarrow.Field - >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field = pa.field('key', pa.int32(), + ... metadata={"key": "Something important"}) >>> field pyarrow.Field >>> field.metadata @@ -1612,14 +1686,17 @@ class Field(_Weakrefable, Generic[_DataTypeT]): Examples -------- >>> import pyarrow as pa - >>> f1 = pa.field("key", pa.int32()) - >>> f2 = pa.field("key", pa.int32(), nullable=False) + >>> f1 = pa.field('key', pa.int32()) + >>> f2 = pa.field('key', pa.int32(), nullable=False) >>> f1.equals(f2) False >>> f1.equals(f1) True """ - def __hash__(self) -> int: ... + def __hash__(self) -> int: + """ + Return hash(self). + """ @property def nullable(self) -> bool: """ @@ -1628,8 +1705,8 @@ class Field(_Weakrefable, Generic[_DataTypeT]): Examples -------- >>> import pyarrow as pa - >>> f1 = pa.field("key", pa.int32()) - >>> f2 = pa.field("key", pa.int32(), nullable=False) + >>> f1 = pa.field('key', pa.int32()) + >>> f2 = pa.field('key', pa.int32(), nullable=False) >>> f1.nullable True >>> f2.nullable @@ -1643,7 +1720,7 @@ class Field(_Weakrefable, Generic[_DataTypeT]): Examples -------- >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32()) + >>> field = pa.field('key', pa.int32()) >>> field.name 'key' """ @@ -1659,7 +1736,8 @@ class Field(_Weakrefable, Generic[_DataTypeT]): Examples -------- >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field = pa.field('key', pa.int32(), + ... metadata={"key": "Something important"}) >>> field.metadata {b'key': b'Something important'} """ @@ -1681,7 +1759,7 @@ class Field(_Weakrefable, Generic[_DataTypeT]): Examples -------- >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32()) + >>> field = pa.field('key', pa.int32()) Create new field by adding metadata to existing one: @@ -1702,7 +1780,8 @@ class Field(_Weakrefable, Generic[_DataTypeT]): Examples -------- >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field = pa.field('key', pa.int32(), + ... 
metadata={"key": "Something important"}) >>> field.metadata {b'key': b'Something important'} @@ -1726,7 +1805,7 @@ class Field(_Weakrefable, Generic[_DataTypeT]): Examples -------- >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32()) + >>> field = pa.field('key', pa.int32()) >>> field pyarrow.Field @@ -1751,13 +1830,13 @@ class Field(_Weakrefable, Generic[_DataTypeT]): Examples -------- >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32()) + >>> field = pa.field('key', pa.int32()) >>> field pyarrow.Field Create new field by replacing the name of an existing one: - >>> field_new = field.with_name("lock") + >>> field_new = field.with_name('lock') >>> field_new pyarrow.Field """ @@ -1776,7 +1855,7 @@ class Field(_Weakrefable, Generic[_DataTypeT]): Examples -------- >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32()) + >>> field = pa.field('key', pa.int32()) >>> field pyarrow.Field >>> field.nullable @@ -1802,9 +1881,9 @@ class Field(_Weakrefable, Generic[_DataTypeT]): Examples -------- >>> import pyarrow as pa - >>> f1 = pa.field("bar", pa.float64(), nullable=False) - >>> f2 = pa.field("foo", pa.int32()).with_metadata({"key": "Something important"}) - >>> ff = pa.field("ff", pa.struct([f1, f2]), nullable=False) + >>> f1 = pa.field('bar', pa.float64(), nullable=False) + >>> f2 = pa.field('foo', pa.int32()).with_metadata({"key": "Something important"}) + >>> ff = pa.field('ff', pa.struct([f1, f2]), nullable=False) Flatten a struct field: @@ -1865,27 +1944,42 @@ class Schema(_Weakrefable): Create a new Arrow Schema object: >>> import pyarrow as pa - >>> pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]) + >>> pa.schema([ + ... ('some_int', pa.int32()), + ... ('some_string', pa.string()) + ... ]) some_int: int32 some_string: string Create Arrow Schema with metadata: - >>> pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) + >>> pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}) n_legs: int64 animals: string -- schema metadata -- n_legs: 'Number of legs per animal' """ - def __len__(self) -> int: ... - def __getitem__(self, key: str) -> Field: ... + def __len__(self) -> int: + """ + Return len(self). + """ + def __getitem__(self, key: str) -> Field: + """ + Return self[key]. + """ _field = __getitem__ # pyright: ignore[reportUnknownVariableType] - def __iter__(self) -> Iterator[Field]: ... - def __hash__(self) -> int: ... + def __iter__(self) -> Iterator[Field]: + """ + Implement iter(self). + """ + def __hash__(self) -> int: + """ + Return hash(self). + """ def __sizeof__(self) -> int: ... @property def pandas_metadata(self) -> dict: @@ -1896,12 +1990,8 @@ class Schema(_Weakrefable): -------- >>> import pyarrow as pa >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) + >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], + ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) >>> schema = pa.Table.from_pandas(df).schema Select pandas metadata field from Arrow Schema: @@ -1921,7 +2011,9 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + >>> schema = pa.schema([ + ... 
pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())]) Get the names of the schema's fields: @@ -1940,7 +2032,9 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())]) Get the types of the schema's fields: @@ -1959,17 +2053,19 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}) Get the metadata of the schema's fields: >>> schema.metadata {b'n_legs': b'Number of legs per animal'} """ - def empty_table(self) -> Table: + # TODO: replace below with: + # def empty_table(self) -> Table: + def empty_table(self) -> Any: """ Provide an empty table according to the schema. @@ -1980,7 +2076,9 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())]) Create an empty table with schema's fields: @@ -2009,11 +2107,14 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema1 = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> schema2 = pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]) + >>> schema1 = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}) + >>> schema2 = pa.schema([ + ... ('some_int', pa.int32()), + ... ('some_string', pa.string()) + ... ]) Test two equal schemas: @@ -2048,7 +2149,10 @@ class Schema(_Weakrefable): -------- >>> import pandas as pd >>> import pyarrow as pa - >>> df = pd.DataFrame({"int": [1, 2], "str": ["a", "b"]}) + >>> df = pd.DataFrame({ + ... 'int': [1, 2], + ... 'str': ['a', 'b'] + ... }) Create an Arrow Schema from the schema of a pandas dataframe: @@ -2073,7 +2177,9 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())]) Select the second field: @@ -2082,7 +2188,7 @@ class Schema(_Weakrefable): Select the field of the column named 'n_legs': - >>> schema.field("n_legs") + >>> schema.field('n_legs') pyarrow.Field """ @deprecated("Use 'field' instead") @@ -2117,7 +2223,9 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())]) Get the index of the field named 'animals': @@ -2126,14 +2234,11 @@ class Schema(_Weakrefable): Index in case of several fields with the given name: - >>> schema = pa.schema( - ... [ - ... pa.field("n_legs", pa.int64()), - ... pa.field("animals", pa.string()), - ... 
pa.field("animals", pa.bool_()), - ... ], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string()), + ... pa.field('animals', pa.bool_())], + ... metadata={"n_legs": "Number of legs per animal"}) >>> schema.get_field_index("animals") -1 """ @@ -2153,13 +2258,10 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema( - ... [ - ... pa.field("n_legs", pa.int64()), - ... pa.field("animals", pa.string()), - ... pa.field("animals", pa.bool_()), - ... ] - ... ) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string()), + ... pa.field('animals', pa.bool_())]) Get the indexes of the fields named 'animals': @@ -2185,11 +2287,13 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())]) Append a field 'extra' at the end of the schema: - >>> schema_new = schema.append(pa.field("extra", pa.bool_())) + >>> schema_new = schema.append(pa.field('extra', pa.bool_())) >>> schema_new n_legs: int64 animals: string @@ -2217,11 +2321,13 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())]) Insert a new field on the second position: - >>> schema.insert(1, pa.field("extra", pa.bool_())) + >>> schema.insert(1, pa.field('extra', pa.bool_())) n_legs: int64 extra: bool animals: string @@ -2241,7 +2347,9 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())]) Remove the second field of the schema: @@ -2264,11 +2372,13 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())]) Replace the second field of the schema with a new field 'extra': - >>> schema.set(1, pa.field("replaced", pa.bool_())) + >>> schema.set(1, pa.field('replaced', pa.bool_())) n_legs: int64 replaced: bool """ @@ -2298,7 +2408,9 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())]) Add metadata to existing schema field: @@ -2324,7 +2436,9 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())]) Write schema to Buffer: @@ -2342,10 +2456,10 @@ class Schema(_Weakrefable): Examples -------- >>> import pyarrow as pa - >>> schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... 
) + >>> schema = pa.schema([ + ... pa.field('n_legs', pa.int64()), + ... pa.field('animals', pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}) >>> schema n_legs: int64 animals: string @@ -2376,6 +2490,8 @@ class Schema(_Weakrefable): Display Field-level KeyValueMetadata show_schema_metadata : boolean, default True Display Schema-level KeyValueMetadata + element_size_limit : int, default 100 + Maximum number of characters of a single element before it is truncated. Returns ------- @@ -2452,13 +2568,9 @@ def unify_schemas( If Fields of the same name are not mergeable. """ -@overload -def field(name: SupportArrowSchema) -> Field[Any]: ... -@overload def field( - name: str, type: _DataTypeT, nullable: bool = ..., metadata: dict[Any, Any] | None = None -) -> Field[_DataTypeT]: ... -def field(*args, **kwargs): + name: SupportArrowSchema | str, type: _DataTypeT, nullable: bool = ..., metadata: dict[Any, Any] | None = None +) -> Field[_DataTypeT] | Field[Any]: """ Create a pyarrow.Field instance. @@ -2485,12 +2597,13 @@ def field(*args, **kwargs): Create an instance of pyarrow.Field: >>> import pyarrow as pa - >>> pa.field("key", pa.int32()) + >>> pa.field('key', pa.int32()) pyarrow.Field - >>> pa.field("key", pa.int32(), nullable=False) + >>> pa.field('key', pa.int32(), nullable=False) pyarrow.Field - >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field = pa.field('key', pa.int32(), + ... metadata={"key": "Something important"}) >>> field pyarrow.Field >>> field.metadata @@ -2503,7 +2616,7 @@ def field(*args, **kwargs): A str can also be passed for the type parameter: - >>> pa.field("key", "int32") + >>> pa.field('key', 'int32') pyarrow.Field """ @@ -2523,7 +2636,7 @@ def null() -> NullType: Create a ``Field`` type with a null type and a name: - >>> pa.field("null_field", pa.null()) + >>> pa.field('null_field', pa.null()) pyarrow.Field """ @@ -2544,7 +2657,7 @@ def bool_() -> BoolType: Create a ``Field`` type with a boolean type and a name: - >>> pa.field("bool_field", pa.bool_()) + >>> pa.field('bool_field', pa.bool_()) pyarrow.Field """ @@ -2748,50 +2861,7 @@ def uint64() -> UInt64Type: ] """ -def tzinfo_to_string(tz: dt.tzinfo) -> str: - """ - Converts a time zone object into a string indicating the name of a time - zone, one of: - * As used in the Olson time zone database (the "tz database" or - "tzdata"), such as "America/New_York" - * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 - - Parameters - ---------- - tz : datetime.tzinfo - Time zone object - - Returns - ------- - name : str - Time zone name - """ - -def string_to_tzinfo(name: str) -> dt.tzinfo: - """ - Convert a time zone name into a time zone object. - - Supported input strings are: - * As used in the Olson time zone database (the "tz database" or - "tzdata"), such as "America/New_York" - * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 - - Parameters - ---------- - name: str - Time zone name. - - Returns - ------- - tz : datetime.tzinfo - Time zone object - """ - -@overload -def timestamp(unit: _Unit | str) -> TimestampType[_Unit, _Tz]: ... -@overload -def timestamp(unit: _Unit | str, tz: _Tz) -> TimestampType[_Unit, _Tz]: ... -def timestamp(*args, **kwargs): +def timestamp(unit: _Unit, tz: _Tz | None = None) -> TimestampType[_Unit, _Tz]: """ Create instance of timestamp type with resolution and optional time zone. 
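field() likewise collapses from two overloads into one signature in this region; a short runtime sketch (separate from the stubs) of the call styles it still has to cover:

import pyarrow as pa

f = pa.field("key", pa.int32(), nullable=False, metadata={"doc": "primary key"})
assert f.name == "key" and f.type == pa.int32() and not f.nullable
assert f.metadata == {b"doc": b"primary key"}

# A type given as a string is accepted too, as the docstring notes.
assert pa.field("key", "int32").type == pa.int32()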
@@ -2808,19 +2878,19 @@ def timestamp(*args, **kwargs): Create an instance of timestamp type: >>> import pyarrow as pa - >>> pa.timestamp("us") + >>> pa.timestamp('us') TimestampType(timestamp[us]) - >>> pa.timestamp("s", tz="America/New_York") + >>> pa.timestamp('s', tz='America/New_York') TimestampType(timestamp[s, tz=America/New_York]) - >>> pa.timestamp("s", tz="+07:30") + >>> pa.timestamp('s', tz='+07:30') TimestampType(timestamp[s, tz=+07:30]) Use timestamp type when creating a scalar object: >>> from datetime import datetime - >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp("s", tz="UTC")) + >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp('s', tz='UTC')) - >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp("us")) + >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp('us')) Returns @@ -2844,9 +2914,9 @@ def time32(unit: _Time32Unit) -> Time32Type[_Time32Unit]: Examples -------- >>> import pyarrow as pa - >>> pa.time32("s") + >>> pa.time32('s') Time32Type(time32[s]) - >>> pa.time32("ms") + >>> pa.time32('ms') Time32Type(time32[ms]) """ @@ -2866,9 +2936,9 @@ def time64(unit: _Time64Unit) -> Time64Type[_Time64Unit]: Examples -------- >>> import pyarrow as pa - >>> pa.time64("us") + >>> pa.time64('us') Time64Type(time64[us]) - >>> pa.time64("ns") + >>> pa.time64('ns') Time64Type(time64[ns]) """ @@ -2891,14 +2961,14 @@ def duration(unit: _Unit) -> DurationType[_Unit]: Create an instance of duration type: >>> import pyarrow as pa - >>> pa.duration("us") + >>> pa.duration('us') DurationType(duration[us]) - >>> pa.duration("s") + >>> pa.duration('s') DurationType(duration[s]) Create an array with duration type: - >>> pa.array([0, 1, 2], type=pa.duration("s")) + >>> pa.array([0, 1, 2], type=pa.duration('s')) [ 0, @@ -2985,15 +3055,15 @@ def float16() -> Float16Type: >>> a [ - 15872, - 32256 + 1.5, + nan ] Note that unlike other float types, if you convert this array to a python list, the types of its elements will be ``np.float16`` >>> [type(val) for val in a.to_pylist()] - [, ] + [, ] """ def float32() -> Float32Type: @@ -3046,11 +3116,7 @@ def float64() -> Float64Type: ] """ -@overload -def decimal32(precision: _Precision) -> Decimal32Type[_Precision, Literal[0]]: ... -@overload -def decimal32(precision: _Precision, scale: _Scale) -> Decimal32Type[_Precision, _Scale]: ... -def decimal32(*args, **kwargs): +def decimal32(precision: _Precision, scale: _Scale | None = None) -> Decimal32Type[_Precision, _Scale| Literal[0]]: """ Create decimal type with precision and scale and 32-bit width. @@ -3091,7 +3157,7 @@ def decimal32(*args, **kwargs): Create an array with decimal type: >>> import decimal - >>> a = decimal.Decimal("123.45") + >>> a = decimal.Decimal('123.45') >>> pa.array([a], pa.decimal32(5, 2)) [ @@ -3099,11 +3165,7 @@ def decimal32(*args, **kwargs): ] """ -@overload -def decimal64(precision: _Precision) -> Decimal64Type[_Precision, Literal[0]]: ... -@overload -def decimal64(precision: _Precision, scale: _Scale) -> Decimal64Type[_Precision, _Scale]: ... -def decimal64(*args, **kwargs): +def decimal64(precision: _Precision, scale: _Scale | None = None) -> Decimal64Type[_Precision, _Scale | Literal[0]]: """ Create decimal type with precision and scale and 64-bit width. 
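The decimal32/64/128/256 factories are likewise collapsed to a single signature with an optional scale; a quick runtime sketch of the behaviour the Literal[0] fallback in the return type refers to:

import decimal
import pyarrow as pa

t = pa.decimal128(5, 2)
assert (t.precision, t.scale) == (5, 2)
assert pa.decimal128(5).scale == 0   # omitted scale defaults to 0

arr = pa.array([decimal.Decimal("123.45")], type=t)
assert arr.type == t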
@@ -3144,7 +3206,7 @@ def decimal64(*args, **kwargs): Create an array with decimal type: >>> import decimal - >>> a = decimal.Decimal("123.45") + >>> a = decimal.Decimal('123.45') >>> pa.array([a], pa.decimal64(5, 2)) [ @@ -3152,11 +3214,7 @@ def decimal64(*args, **kwargs): ] """ -@overload -def decimal128(precision: _Precision) -> Decimal128Type[_Precision, Literal[0]]: ... -@overload -def decimal128(precision: _Precision, scale: _Scale) -> Decimal128Type[_Precision, _Scale]: ... -def decimal128(*args, **kwargs): +def decimal128(precision: _Precision, scale: _Scale | None = None) -> Decimal128Type[_Precision, _Scale | Literal[0]]: """ Create decimal type with precision and scale and 128-bit width. @@ -3197,7 +3255,7 @@ def decimal128(*args, **kwargs): Create an array with decimal type: >>> import decimal - >>> a = decimal.Decimal("123.45") + >>> a = decimal.Decimal('123.45') >>> pa.array([a], pa.decimal128(5, 2)) [ @@ -3205,11 +3263,7 @@ def decimal128(*args, **kwargs): ] """ -@overload -def decimal256(precision: _Precision) -> Decimal256Type[_Precision, Literal[0]]: ... -@overload -def decimal256(precision: _Precision, scale: _Scale) -> Decimal256Type[_Precision, _Scale]: ... -def decimal256(*args, **kwargs): +def decimal256(precision: _Precision, scale: _Scale | None = None) -> Decimal256Type[_Precision, _Scale | Literal[0]]: """ Create decimal type with precision and scale and 256-bit width. @@ -3248,7 +3302,7 @@ def string() -> StringType: and use the string type to create an array: - >>> pa.array(["foo", "bar", "baz"], type=pa.string()) + >>> pa.array(['foo', 'bar', 'baz'], type=pa.string()) [ "foo", @@ -3280,11 +3334,7 @@ and use the string type to create an array: ] """ -@overload -def binary(length: Literal[-1] = ...) -> BinaryType: ... -@overload -def binary(length: int) -> FixedSizeBinaryType: ... -def binary(length): +def binary(length: Literal[-1] | int = ...) -> BinaryType | FixedSizeBinaryType: """ Create variable-length or fixed size binary type. @@ -3305,7 +3355,7 @@ def binary(length): and use the variable-length binary type to create an array: - >>> pa.array(["foo", "bar", "baz"], type=pa.binary()) + >>> pa.array(['foo', 'bar', 'baz'], type=pa.binary()) [ 666F6F, @@ -3320,7 +3370,7 @@ def binary(length): and use the fixed-length binary type to create an array: - >>> pa.array(["foo", "bar", "baz"], type=pa.binary(3)) + >>> pa.array(['foo', 'bar', 'baz'], type=pa.binary(3)) [ 666F6F, @@ -3346,7 +3396,7 @@ def large_binary() -> LargeBinaryType: and use the type to create an array: - >>> pa.array(["foo", "bar", "baz"], type=pa.large_binary()) + >>> pa.array(['foo', 'bar', 'baz'], type=pa.large_binary()) [ 666F6F, @@ -3372,7 +3422,7 @@ def large_string() -> LargeStringType: and use the type to create an array: - >>> pa.array(["foo", "bar"] * 50, type=pa.large_string()) + >>> pa.array(['foo', 'bar'] * 50, type=pa.large_string()) [ "foo", @@ -3434,15 +3484,9 @@ def string_view() -> StringViewType: DataType(string_view) """ -@overload def list_( - value_type: _DataTypeT | Field[_DataTypeT], list_size: Literal[-1] = ... -) -> ListType[_DataTypeT]: ... -@overload -def list_( - value_type: _DataTypeT | Field[_DataTypeT], list_size: _Size -) -> FixedSizeListType[_DataTypeT, _Size]: ... -def list_(*args, **kwargs): + value_type: _DataTypeT | Field[_DataTypeT], list_size: Literal[-1] | _Size | None = None +) -> ListType[_DataTypeT] | FixedSizeListType[_DataTypeT, _Size]: """ Create ListType instance from child data type or field. 
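# Hedged sketch of the two factory unions typed above: with no length or
# size argument the variable-size type is returned, while a positive size
# selects the fixed-size variant.
import pyarrow as pa

assert pa.binary() == pa.binary(-1)                    # variable-length binary
assert pa.binary(3).byte_width == 3                    # FixedSizeBinaryType
assert pa.list_(pa.int64()).value_type == pa.int64()   # ListType
assert pa.list_(pa.int64(), 2).list_size == 2          # FixedSizeListType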
@@ -3469,7 +3513,7 @@ def list_(*args, **kwargs): Use the ListType to create a scalar: - >>> pa.scalar(["foo", None], type=pa.list_(pa.string(), 2)) + >>> pa.scalar(['foo', None], type=pa.list_(pa.string(), 2)) or an array: @@ -3578,13 +3622,9 @@ def large_list_view( LargeListViewType(large_list_view) """ -@overload -def map_(key_type: _K, item_type: _ValueT) -> MapType[_K, _ValueT, _Ordered]: ... -@overload def map_( - key_type: _K, item_type: _ValueT, key_sorted: _Ordered -) -> MapType[_K, _ValueT, _Ordered]: ... -def map_(*args, **kwargs): + key_type: _K, item_type: _ValueT, key_sorted: _Ordered | None = None +) -> MapType[_K, _ValueT, _Ordered]: """ Create MapType instance from key and item data types or fields. @@ -3610,7 +3650,7 @@ def map_(*args, **kwargs): Use MapType to create an array: - >>> data = [[{"key": "a", "value": 1}, {"key": "b", "value": 2}], [{"key": "c", "value": 3}]] + >>> data = [[{'key': 'a', 'value': 1}, {'key': 'b', 'value': 2}], [{'key': 'c', 'value': 3}]] >>> pa.array(data, type=pa.map_(pa.string(), pa.int32(), keys_sorted=True)) [ @@ -3635,15 +3675,9 @@ def map_(*args, **kwargs): ] """ -@overload -def dictionary( - index_type: _IndexT, value_type: _BasicValueT -) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... -@overload def dictionary( - index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered -) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... -def dictionary(*args, **kwargs): + index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered | None = None +) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: """ Dictionary (categorical, or simply encoded) type. @@ -3707,8 +3741,8 @@ def struct( >>> import pyarrow as pa >>> fields = [ - ... ("f1", pa.int32()), - ... ("f2", pa.string()), + ... ('f1', pa.int32()), + ... ('f2', pa.string()), ... ] >>> struct_type = pa.struct(fields) >>> struct_type @@ -3718,14 +3752,14 @@ def struct( >>> struct_type[0] pyarrow.Field - >>> struct_type["f1"] + >>> struct_type['f1'] pyarrow.Field Create an instance of StructType from an iterable of Fields: >>> fields = [ - ... pa.field("f1", pa.int32()), - ... pa.field("f2", pa.string(), nullable=False), + ... pa.field('f1', pa.int32()), + ... pa.field('f2', pa.string(), nullable=False), ... ] >>> pa.struct(fields) StructType(struct) @@ -3790,15 +3824,9 @@ def dense_union( type : DenseUnionType """ -@overload def union( - child_fields: list[Field[Any]], mode: Literal["sparse"], type_codes: list[int] | None = None -) -> SparseUnionType: ... -@overload -def union( - child_fields: list[Field[Any]], mode: Literal["dense"], type_codes: list[int] | None = None -) -> DenseUnionType: ... -def union(*args, **kwargs): + child_fields: list[Field[Any]], mode: Literal["sparse"] | Literal["dense"], type_codes: list[int] | None = None +) -> SparseUnionType | DenseUnionType: """ Create UnionType from child fields. @@ -3939,14 +3967,16 @@ def fixed_shape_tensor( Create an instance of fixed shape tensor extension type with names of tensor dimensions: - >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), dim_names=["C", "H", "W"]) + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), + ... dim_names=['C', 'H', 'W']) >>> tensor_type.dim_names ['C', 'H', 'W'] Create an instance of fixed shape tensor extension type with permutation: - >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), + ... 
permutation=[0, 2, 1]) >>> tensor_type.permutation [0, 2, 1] @@ -4036,77 +4066,7 @@ def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueTy type : OpaqueType """ -@overload -def type_for_alias(name: Literal["null"]) -> NullType: ... -@overload -def type_for_alias(name: Literal["bool", "boolean"]) -> BoolType: ... -@overload -def type_for_alias(name: Literal["i1", "int8"]) -> Int8Type: ... -@overload -def type_for_alias(name: Literal["i2", "int16"]) -> Int16Type: ... -@overload -def type_for_alias(name: Literal["i4", "int32"]) -> Int32Type: ... -@overload -def type_for_alias(name: Literal["i8", "int64"]) -> Int64Type: ... -@overload -def type_for_alias(name: Literal["u1", "uint8"]) -> UInt8Type: ... -@overload -def type_for_alias(name: Literal["u2", "uint16"]) -> UInt16Type: ... -@overload -def type_for_alias(name: Literal["u4", "uint32"]) -> Uint32Type: ... -@overload -def type_for_alias(name: Literal["u8", "uint64"]) -> UInt64Type: ... -@overload -def type_for_alias(name: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... -@overload -def type_for_alias(name: Literal["f4", "float", "float32"]) -> Float32Type: ... -@overload -def type_for_alias(name: Literal["f8", "double", "float64"]) -> Float64Type: ... -@overload -def type_for_alias(name: Literal["string", "str", "utf8"]) -> StringType: ... -@overload -def type_for_alias(name: Literal["binary"]) -> BinaryType: ... -@overload -def type_for_alias( - name: Literal["large_string", "large_str", "large_utf8"], -) -> LargeStringType: ... -@overload -def type_for_alias(name: Literal["large_binary"]) -> LargeBinaryType: ... -@overload -def type_for_alias(name: Literal["binary_view"]) -> BinaryViewType: ... -@overload -def type_for_alias(name: Literal["string_view"]) -> StringViewType: ... -@overload -def type_for_alias(name: Literal["date32", "date32[day]"]) -> Date32Type: ... -@overload -def type_for_alias(name: Literal["date64", "date64[ms]"]) -> Date64Type: ... -@overload -def type_for_alias(name: Literal["time32[s]"]) -> Time32Type[Literal["s"]]: ... -@overload -def type_for_alias(name: Literal["time32[ms]"]) -> Time32Type[Literal["ms"]]: ... -@overload -def type_for_alias(name: Literal["time64[us]"]) -> Time64Type[Literal["us"]]: ... -@overload -def type_for_alias(name: Literal["time64[ns]"]) -> Time64Type[Literal["ns"]]: ... -@overload -def type_for_alias(name: Literal["timestamp[s]"]) -> TimestampType[Literal["s"], Any]: ... -@overload -def type_for_alias(name: Literal["timestamp[ms]"]) -> TimestampType[Literal["ms"], Any]: ... -@overload -def type_for_alias(name: Literal["timestamp[us]"]) -> TimestampType[Literal["us"], Any]: ... -@overload -def type_for_alias(name: Literal["timestamp[ns]"]) -> TimestampType[Literal["ns"], Any]: ... -@overload -def type_for_alias(name: Literal["duration[s]"]) -> DurationType[Literal["s"]]: ... -@overload -def type_for_alias(name: Literal["duration[ms]"]) -> DurationType[Literal["ms"]]: ... -@overload -def type_for_alias(name: Literal["duration[us]"]) -> DurationType[Literal["us"]]: ... -@overload -def type_for_alias(name: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... -@overload -def type_for_alias(name: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... -def type_for_alias(name): +def type_for_alias(name: Any) -> DataType: """ Return DataType given a string alias if one exists. @@ -4120,80 +4080,6 @@ def type_for_alias(name): type : DataType """ -@overload -def ensure_type(ty: None, allow_none: Literal[True]) -> None: ... 
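# Small usage sketch for the alias lookup typed above: with the overloads
# collapsed to `name: Any -> DataType`, the concrete type is resolved only
# at runtime from the alias string.
import pyarrow as pa

assert pa.type_for_alias("i4") == pa.int32()
assert pa.type_for_alias("f8") == pa.float64()
assert pa.type_for_alias("string") == pa.string()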
-@overload -def ensure_type(ty: _DataTypeT) -> _DataTypeT: ... -@overload -def ensure_type(ty: Literal["null"]) -> NullType: ... -@overload -def ensure_type(ty: Literal["bool", "boolean"]) -> BoolType: ... -@overload -def ensure_type(ty: Literal["i1", "int8"]) -> Int8Type: ... -@overload -def ensure_type(ty: Literal["i2", "int16"]) -> Int16Type: ... -@overload -def ensure_type(ty: Literal["i4", "int32"]) -> Int32Type: ... -@overload -def ensure_type(ty: Literal["i8", "int64"]) -> Int64Type: ... -@overload -def ensure_type(ty: Literal["u1", "uint8"]) -> UInt8Type: ... -@overload -def ensure_type(ty: Literal["u2", "uint16"]) -> UInt16Type: ... -@overload -def ensure_type(ty: Literal["u4", "uint32"]) -> Uint32Type: ... -@overload -def ensure_type(ty: Literal["u8", "uint64"]) -> UInt64Type: ... -@overload -def ensure_type(ty: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... -@overload -def ensure_type(ty: Literal["f4", "float", "float32"]) -> Float32Type: ... -@overload -def ensure_type(ty: Literal["f8", "double", "float64"]) -> Float64Type: ... -@overload -def ensure_type(ty: Literal["string", "str", "utf8"]) -> StringType: ... -@overload -def ensure_type(ty: Literal["binary"]) -> BinaryType: ... -@overload -def ensure_type( - ty: Literal["large_string", "large_str", "large_utf8"], -) -> LargeStringType: ... -@overload -def ensure_type(ty: Literal["large_binary"]) -> LargeBinaryType: ... -@overload -def ensure_type(ty: Literal["binary_view"]) -> BinaryViewType: ... -@overload -def ensure_type(ty: Literal["string_view"]) -> StringViewType: ... -@overload -def ensure_type(ty: Literal["date32", "date32[day]"]) -> Date32Type: ... -@overload -def ensure_type(ty: Literal["date64", "date64[ms]"]) -> Date64Type: ... -@overload -def ensure_type(ty: Literal["time32[s]"]) -> Time32Type[Literal["s"]]: ... -@overload -def ensure_type(ty: Literal["time32[ms]"]) -> Time32Type[Literal["ms"]]: ... -@overload -def ensure_type(ty: Literal["time64[us]"]) -> Time64Type[Literal["us"]]: ... -@overload -def ensure_type(ty: Literal["time64[ns]"]) -> Time64Type[Literal["ns"]]: ... -@overload -def ensure_type(ty: Literal["timestamp[s]"]) -> TimestampType[Literal["s"], Any]: ... -@overload -def ensure_type(ty: Literal["timestamp[ms]"]) -> TimestampType[Literal["ms"], Any]: ... -@overload -def ensure_type(ty: Literal["timestamp[us]"]) -> TimestampType[Literal["us"], Any]: ... -@overload -def ensure_type(ty: Literal["timestamp[ns]"]) -> TimestampType[Literal["ns"], Any]: ... -@overload -def ensure_type(ty: Literal["duration[s]"]) -> DurationType[Literal["s"]]: ... -@overload -def ensure_type(ty: Literal["duration[ms]"]) -> DurationType[Literal["ms"]]: ... -@overload -def ensure_type(ty: Literal["duration[us]"]) -> DurationType[Literal["us"]]: ... -@overload -def ensure_type(ty: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... -@overload -def ensure_type(ty: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... def schema( fields: Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType], metadata: dict[bytes | str, bytes | str] | None = None, @@ -4214,33 +4100,40 @@ def schema( Create a Schema from iterable of tuples: >>> import pyarrow as pa - >>> pa.schema( - ... [ - ... ("some_int", pa.int32()), - ... ("some_string", pa.string()), - ... pa.field("some_required_string", pa.string(), nullable=False), - ... ] - ... ) + >>> pa.schema([ + ... ('some_int', pa.int32()), + ... ('some_string', pa.string()), + ... 
pa.field('some_required_string', pa.string(), nullable=False) + ... ]) some_int: int32 some_string: string some_required_string: string not null Create a Schema from iterable of Fields: - >>> pa.schema([pa.field("some_int", pa.int32()), pa.field("some_string", pa.string())]) + >>> pa.schema([ + ... pa.field('some_int', pa.int32()), + ... pa.field('some_string', pa.string()) + ... ]) some_int: int32 some_string: string DataTypes can also be passed as strings. The following is equivalent to the above example: - >>> pa.schema([pa.field("some_int", "int32"), pa.field("some_string", "string")]) + >>> pa.schema([ + ... pa.field('some_int', "int32"), + ... pa.field('some_string', "string") + ... ]) some_int: int32 some_string: string Or more concisely: - >>> pa.schema([("some_int", "int32"), ("some_string", "string")]) + >>> pa.schema([ + ... ('some_int', "int32"), + ... ('some_string', "string") + ... ]) some_int: int32 some_string: string @@ -4264,9 +4157,9 @@ def from_numpy_dtype(dtype: np.dtype[Any]) -> DataType: >>> import pyarrow as pa >>> import numpy as np - >>> pa.from_numpy_dtype(np.dtype("float16")) + >>> pa.from_numpy_dtype(np.dtype('float16')) DataType(halffloat) - >>> pa.from_numpy_dtype("U") + >>> pa.from_numpy_dtype('U') DataType(string) >>> pa.from_numpy_dtype(bool) DataType(bool) @@ -4274,42 +4167,10 @@ def from_numpy_dtype(dtype: np.dtype[Any]) -> DataType: DataType(string) """ -def is_boolean_value(obj: Any) -> bool: - """ - Check if the object is a boolean. - - Parameters - ---------- - obj : object - The object to check - """ - -def is_integer_value(obj: Any) -> bool: - """ - Check if the object is an integer. - - Parameters - ---------- - obj : object - The object to check - """ - -def is_float_value(obj: Any) -> bool: - """ - Check if the object is a float. 
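# Minimal sketch for `from_numpy_dtype` above, assuming NumPy is installed;
# both dtype objects and anything coercible to a dtype are accepted.
import numpy as np
import pyarrow as pa

assert pa.from_numpy_dtype(np.dtype("int64")) == pa.int64()
assert pa.from_numpy_dtype(np.float32) == pa.float32()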
- - Parameters - ---------- - obj : object - The object to check - """ - __all__ = [ "_Weakrefable", "_Metadata", - "_AsPyType", "DataType", - "_DataTypeT", "_BasicDataType", "NullType", "BoolType", @@ -4362,12 +4223,10 @@ __all__ = [ "UuidType", "JsonType", "OpaqueType", - "PyExtensionType", "UnknownExtensionType", "register_extension_type", "unregister_extension_type", "KeyValueMetadata", - "ensure_metadata", "Field", "Schema", "unify_schemas", @@ -4382,8 +4241,6 @@ __all__ = [ "int32", "int64", "uint64", - "tzinfo_to_string", - "string_to_tzinfo", "timestamp", "time32", "time64", @@ -4423,10 +4280,6 @@ __all__ = [ "bool8", "opaque", "type_for_alias", - "ensure_type", "schema", "from_numpy_dtype", - "is_boolean_value", - "is_integer_value", - "is_float_value", ] diff --git a/python/pyarrow/_stubs_typing.pyi b/python/pyarrow/_stubs_typing.pyi index 549dc4059c3..98479791103 100644 --- a/python/pyarrow/_stubs_typing.pyi +++ b/python/pyarrow/_stubs_typing.pyi @@ -19,13 +19,13 @@ import datetime as dt from collections.abc import Sequence from decimal import Decimal -from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar +from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar, Iterator import numpy as np from numpy.typing import NDArray -from .compute import BooleanArray, IntegerArray +from pyarrow.lib import BooleanArray, IntegerArray ArrayLike: TypeAlias = Any ScalarLike: TypeAlias = Any @@ -52,6 +52,8 @@ PyScalar: TypeAlias = ( ) _T = TypeVar("_T") +_V = TypeVar("_V", covariant=True) + SingleOrList: TypeAlias = list[_T] | _T class SupportEq(Protocol): @@ -78,11 +80,9 @@ FilterTuple: TypeAlias = ( | tuple[str, Literal["in", "not in"], Collection] ) -class Buffer(Protocol): - def __buffer__(self, flags: int, /) -> memoryview: ... +class Buffer(Protocol): ... -class SupportPyBuffer(Protocol): - def __buffer__(self, flags: int, /) -> memoryview: ... +class SupportPyBuffer(Protocol): ... class SupportArrowStream(Protocol): def __arrow_c_stream__(self, requested_schema=None) -> Any: ... @@ -95,3 +95,8 @@ class SupportArrowDeviceArray(Protocol): class SupportArrowSchema(Protocol): def __arrow_c_schema(self) -> Any: ... + +class NullableCollection(Protocol[_V]): # pyright: ignore[reportInvalidTypeVarUse] + def __iter__(self) -> Iterator[_V] | Iterator[_V | None]: ... + def __len__(self) -> int: ... + def __contains__(self, item: Any, /) -> bool: ... diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index bf5beab589d..109d8ebe597 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3634,7 +3634,7 @@ cdef class FixedSizeListArray(BaseListArray): Or create from a values array, list size and matching type: >>> typ = pa.list_(pa.field("values", pa.int64()), 2) - >>> arr = pa.FixedSizeListArray.from_arrays(values,type=typ) + >>> arr = pa.FixedSizeListArray.from_arrays(values, type=typ) >>> arr [ diff --git a/python/pyarrow/lib.pyi b/python/pyarrow/lib.pyi index 3292c52b2c0..9d5bd7bedb2 100644 --- a/python/pyarrow/lib.pyi +++ b/python/pyarrow/lib.pyi @@ -16,24 +16,30 @@ # under the License. 
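# Sketch of the intent behind the NullableCollection protocol added to
# _stubs_typing.pyi above: a plain Python sequence with optional None
# elements satisfies it structurally (it has __iter__, __len__ and
# __contains__), which is the kind of input factories such as pa.array take.
import pyarrow as pa

values = [1, None, 3]          # provides __iter__, __len__, __contains__
arr = pa.array(values)         # nulls are preserved in the resulting array
assert arr.null_count == 1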
# ruff: noqa: F403 -from typing import NamedTuple +from collections.abc import Mapping +import datetime as dt +from typing import NamedTuple, Literal +from typing_extensions import TypeVar from .__lib_pxi.array import * -from .__lib_pxi.benchmark import * -from .__lib_pxi.builder import * -from .__lib_pxi.compat import * -from .__lib_pxi.config import * -from .__lib_pxi.device import * -from .__lib_pxi.error import * +# TODO +# from .__lib_pxi.benchmark import * +# from .__lib_pxi.builder import * +# from .__lib_pxi.compat import * +# from .__lib_pxi.config import * +# from .__lib_pxi.device import * +# from .__lib_pxi.error import * from .__lib_pxi.io import * -from .__lib_pxi.ipc import * +# from .__lib_pxi.ipc import * from .__lib_pxi.memory import * -from .__lib_pxi.pandas_shim import * +# from .__lib_pxi.pandas_shim import * from .__lib_pxi.scalar import * -from .__lib_pxi.table import * +# from .__lib_pxi.table import * from .__lib_pxi.tensor import * from .__lib_pxi.types import * +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) + class MonthDayNano(NamedTuple): days: int months: int @@ -79,6 +85,51 @@ def is_threading_enabled() -> bool: threading doesn't work (e.g. Emscripten). """ +def ensure_metadata( + meta: Mapping[bytes | str, bytes | str] | KeyValueMetadata | None, allow_none: bool = False +) -> KeyValueMetadata | None: ... + +def tzinfo_to_string(tz: dt.tzinfo) -> str: + """ + Converts a time zone object into a string indicating the name of a time + zone, one of: + * As used in the Olson time zone database (the "tz database" or + "tzdata"), such as "America/New_York" + * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + + Parameters + ---------- + tz : datetime.tzinfo + Time zone object + + Returns + ------- + name : str + Time zone name + """ + +def string_to_tzinfo(name: str) -> dt.tzinfo: + """ + Convert a time zone name into a time zone object. + + Supported input strings are: + * As used in the Olson time zone database (the "tz database" or + "tzdata"), such as "America/New_York" + * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + + Parameters + ---------- + name: str + Time zone name. + + Returns + ------- + tz : datetime.tzinfo + Time zone object + """ + +def ensure_type(ty: _DataTypeT | None, allow_none: Literal[True] | Literal[False] | None = None) -> _DataTypeT | None: ... + Type_NA: int Type_BOOL: int Type_UINT8: int diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 5934a7aa8cf..d26933e3f39 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1036,7 +1036,7 @@ cdef class StructScalar(Scalar, Mapping): Parameters ---------- - index : Union[int, str] + key : Union[int, str] Index / position or name of the field. 
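# Hedged sketch for the StructScalar parameter documented above: per that
# docstring, __getitem__ accepts either the field position or the field name.
import pyarrow as pa

s = pa.scalar({"a": 1, "b": "x"})
assert s["a"].as_py() == 1      # lookup by field name
assert s[1].as_py() == "x"      # lookup by position, as the docstring states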
Returns diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 450cce74f1d..6d7ec6f724f 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -18,21 +18,21 @@ import datetime import sys -import pytest -import hypothesis as h -import hypothesis.strategies as st +import pytest # type: ignore[import-not-found] +import hypothesis as h # type: ignore[import-not-found] +import hypothesis.strategies as st # type: ignore[import-not-found] try: - import hypothesis.extra.numpy as npst + import hypothesis.extra.numpy as npst # type: ignore[import-not-found] except ImportError: - npst = None + npst = None # type: ignore[assignment] try: - import hypothesis.extra.pytz as tzst + import hypothesis.extra.pytz as tzst # type: ignore[import-not-found] except ImportError: - tzst = None + tzst = None # type: ignore[assignment] try: import zoneinfo except ImportError: - zoneinfo = None + zoneinfo = None # type: ignore[assignment] if sys.platform == 'win32': try: import tzdata # noqa:F401 @@ -41,7 +41,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa @@ -234,13 +234,13 @@ def schemas(type_strategy=primitive_types, max_fields=None): all_types = st.deferred( lambda: ( - primitive_types | - list_types() | - struct_types() | - dictionary_types() | - map_types() | - list_types(all_types) | - struct_types(all_types) + primitive_types + | list_types() + | struct_types() + | dictionary_types() + | map_types() + | list_types(all_types) # type: ignore[has-type] + | struct_types(all_types) # type: ignore[has-type] ) ) all_fields = fields(all_types) @@ -467,7 +467,9 @@ def pandas_compatible_list_types( dictionary_types( value_strategy=pandas_compatible_dictionary_value_types ), - pandas_compatible_list_types(pandas_compatible_types), - struct_types(pandas_compatible_types) + pandas_compatible_list_types( + pandas_compatible_types # type: ignore[has-type] + ), + struct_types(pandas_compatible_types) # type: ignore[has-type] ) ) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 009ab1e849b..5686420c688 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -18,10 +18,10 @@ from collections.abc import Iterable import datetime import decimal -import hypothesis as h -import hypothesis.strategies as st +import hypothesis as h # type: ignore[import-not-found] +import hypothesis.strategies as st # type: ignore[import-not-found] import itertools -import pytest +import pytest # type: ignore[import-not-found] import struct import subprocess import sys @@ -30,7 +30,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa import pyarrow.tests.strategies as past diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 7820111b70f..5441dd493d3 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -28,17 +28,31 @@ import sys import textwrap +try: + import numpy as np +except ImportError: + np = None + +try: + import pandas as pd +except ImportError: + pd = None import pyarrow as pa import pyarrow.compute as pc -from pyarrow.lib import ArrowNotImplementedError, ArrowTypeError +from pyarrow.lib import ArrowNotImplementedError + +try: + import pyarrow.substrait as pas +except ImportError: + pas = None exported_functions = [ - func for (_, func) in sorted(pc.__dict__.items()) + func 
for (name, func) in sorted(pc.__dict__.items()) if hasattr(func, '__arrow_compute_function__')] exported_option_classes = [ - cls for (_, cls) in sorted(pc.__dict__.items()) + cls for (name, cls) in sorted(pc.__dict__.items()) if (isinstance(cls, type) and cls is not pc.FunctionOptions and issubclass(cls, pc.FunctionOptions))] @@ -203,7 +217,7 @@ def test_option_class_equality(request): and cls != pc.AssumeTimezoneOptions ): try: - options.append(cls()) # type: ignore[reportArgumentType] + options.append(cls()) except TypeError: pytest.fail(f"Options class is not tested: {cls}") @@ -262,8 +276,6 @@ def test_get_function_hash_aggregate(): @pytest.mark.numpy def test_call_function_with_memory_pool(): - import numpy as np - arr = pa.array(["foo", "bar", "baz"]) indices = np.array([2, 2, 1]) result1 = arr.take(indices) @@ -787,11 +799,11 @@ def test_min_max(): # Wrong options type options = pc.TakeOptions() with pytest.raises(TypeError): - s = pc.min_max(data, options=options) # type: ignore[reportCallIssue] + s = pc.min_max(data, options=options) # Missing argument with pytest.raises(TypeError, match="min_max takes 1 positional"): - s = pc.min_max() # type: ignore[reportCallIssue] + s = pc.min_max() def test_any(): @@ -842,11 +854,11 @@ def test_all(): def test_is_valid(): # An example generated function wrapper without options - data = pa.array([4, 5, None]) + data = [4, 5, None] assert pc.is_valid(data).to_pylist() == [True, True, False] with pytest.raises(TypeError): - pc.is_valid(data, options=None) # type: ignore[no-matching-overload] + pc.is_valid(data, options=None) def test_generated_docstrings(): @@ -1025,6 +1037,21 @@ def find_new_unicode_codepoints(): 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c, 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7f8, 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, } +# utf8proc does not store if a codepoint is numeric +numeric_info_missing = { + 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, + 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, + 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70, + 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341, + 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2, + 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a, + 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10, + 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e, + 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621, + 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973, + 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5, + 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, + 0x10fcb, } # utf8proc has no no digit/numeric information digit_info_missing = { 0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c, @@ -1043,7 +1070,6 @@ def find_new_unicode_codepoints(): 0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41, 0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63, 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68, } -# utf8proc does not store if a codepoint is numeric numeric_info_missing = { 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, @@ -1078,8 +1104,7 @@ def test_string_py_compat_boolean(function_name, variant): py_name = function_name.replace('_', '') ignore = codepoints_ignore.get(function_name, set()) | \ find_new_unicode_codepoints() - for i in range(128 if ascii # type: ignore[reportUnnecessaryComparison] - else 0x11000): + for i in range(128 if ascii else 0x11000): if i in range(0xD800, 0xE000): continue # bug? 
pyarrow doesn't allow utf16 surrogates # the issues we know of, we skip @@ -1145,8 +1170,6 @@ def test_utf8_zfill(): @pytest.mark.pandas def test_replace_slice(): - import numpy as np - offsets = range(-3, 4) arr = pa.array([None, '', 'a', 'ab', 'abc', 'abcd', 'abcde']) @@ -1223,7 +1246,6 @@ def test_binary_join(): expected = pa.array(['a1b', 'c2d'], type=pa.binary()) ar_list = pa.array([['a', 'b'], ['c', 'd']], type=pa.list_(pa.binary())) assert pc.binary_join(ar_list, separator_array).equals(expected) - assert expected.equals(pc.binary_join(ar_list, separator_array)) def test_binary_join_element_wise(): @@ -1287,8 +1309,7 @@ def test_take_indices_types(): for indices_type in ['uint8', 'int8', 'uint16', 'int16', 'uint32', 'int32', 'uint64', 'int64']: - indices = pa.array( - [0, 4, 2, None], type=indices_type) # type: ignore[reportArgumentType] + indices = pa.array([0, 4, 2, None], type=indices_type) result = arr.take(indices) result.validate() expected = pa.array([0, 4, 2, None]) @@ -1297,7 +1318,7 @@ def test_take_indices_types(): for indices_type in [pa.float32(), pa.float64()]: indices = pa.array([0, 4, 2], type=indices_type) with pytest.raises(NotImplementedError): - arr.take(indices) # type: ignore[reportArgumentType] + arr.take(indices) def test_take_on_chunked_array(): @@ -1465,8 +1486,6 @@ def test_filter(ty, values): @pytest.mark.numpy @pytest.mark.parametrize(('ty', 'values'), all_array_types) def test_filter_numpy_array_mask(ty, values): - import numpy as np - arr = pa.array(values, type=ty) # same test as test_filter with different array type mask = np.array([True, False, False, True, None]) @@ -1543,7 +1562,7 @@ def test_filter_errors(): # non-boolean dtype mask = pa.array([0, 1, 0, 1, 0]) with pytest.raises(NotImplementedError): - obj.filter(mask) # type: ignore[reportArgumentType] + obj.filter(mask) # wrong length mask = pa.array([True, False, True]) @@ -1554,7 +1573,7 @@ def test_filter_errors(): scalar = pa.scalar(True) for filt in [batch, table, scalar]: with pytest.raises(TypeError): - table.filter(filt) # type: ignore[reportArgumentType] + table.filter(filt) def test_filter_null_type(): @@ -1573,10 +1592,11 @@ def test_filter_null_type(): @pytest.mark.parametrize("typ", ["array", "chunked_array"]) def test_compare_array(typ): - def con(values): - if typ == "array": + if typ == "array": + def con(values): return pa.array(values) - else: + else: + def con(values): return pa.chunked_array([values]) arr1 = con([1, 2, 3, 4, None]) @@ -1603,10 +1623,11 @@ def con(values): @pytest.mark.parametrize("typ", ["array", "chunked_array"]) def test_compare_string_scalar(typ): - def con(values): - if typ == "array": + if typ == "array": + def con(values): return pa.array(values) - else: + else: + def con(values): return pa.chunked_array([values]) arr = con(['a', 'b', 'c', None]) @@ -1639,10 +1660,11 @@ def con(values): @pytest.mark.parametrize("typ", ["array", "chunked_array"]) def test_compare_scalar(typ): - def con(values): - if typ == "array": + if typ == "array": + def con(values): return pa.array(values) - else: + else: + def con(values): return pa.chunked_array([values]) arr = con([1, 2, 3, None]) @@ -1735,17 +1757,14 @@ def test_round_to_integer(ty): "half_to_odd": [3, 3, 4, 5, -3, -3, -4, None], } for round_mode, expected in rmode_and_expected.items(): - options = RoundOptions( # type: ignore[reportPossiblyUnboundVariable] - round_mode=round_mode) # type: ignore[reportArgumentType] - result = round(values, options=options) # type: ignore[reportArgumentType] + options = 
RoundOptions(round_mode=round_mode) + result = round(values, options=options) expected_array = pa.array(expected, type=pa.float64()) assert expected_array.equals(result) @pytest.mark.numpy def test_round(): - import numpy as np - values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None] ndigits_and_expected = { -2: [300, 0, 0, 0, -0, -0, -0, None], @@ -1765,8 +1784,6 @@ def test_round(): @pytest.mark.numpy def test_round_to_multiple(): - import numpy as np - values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None] multiple_and_expected = { 0.05: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3.05, None], @@ -1790,8 +1807,7 @@ def test_round_to_multiple(): for multiple in [object, 99999999999999999999999]: with pytest.raises(TypeError, match="is not a valid multiple type"): - pc.round_to_multiple( - values, multiple=multiple) # type: ignore[reportArgumentType] + pc.round_to_multiple(values, multiple=multiple) def test_round_binary(): @@ -1862,7 +1878,7 @@ def test_fill_null(): fill_value = pa.array([5], type=pa.int8()) with pytest.raises(pa.ArrowInvalid, match="Array arguments must all be the same length"): - arr.fill_null(fill_value) # type: ignore[reportArgumentType] + arr.fill_null(fill_value) arr = pa.array([None, None, None, None], type=pa.null()) fill_value = pa.scalar(None, type=pa.null()) @@ -2059,16 +2075,14 @@ def test_fsl_to_fsl_cast(value_type): # Different sized FSL cast_type = pa.list_(pa.field("element", value_type), 3) err_msg = 'Size of FixedSizeList is not the same.' - with pytest.raises(ArrowTypeError, match=err_msg): + with pytest.raises(pa.lib.ArrowTypeError, match=err_msg): fsl.cast(cast_type) DecimalTypeTraits = namedtuple('DecimalTypeTraits', - # type: ignore[reportUntypedNamedTuple] ('name', 'factory', 'max_precision')) FloatToDecimalCase = namedtuple('FloatToDecimalCase', - # type: ignore[reportUntypedNamedTuple] ('precision', 'scale', 'float_val')) decimal_type_traits = [DecimalTypeTraits('decimal32', pa.decimal32, 9), @@ -2081,8 +2095,6 @@ def largest_scaled_float_not_above(val, scale): """ Find the largest float f such as `f * 10**scale <= val` """ - import numpy as np - assert val >= 0 assert scale >= 0 float_val = float(val) / 10**scale @@ -2143,7 +2155,7 @@ def random_float_to_decimal_cast_cases(float_ty, max_precision): r = random.Random(42) for precision in range(1, max_precision, 6): for scale in range(0, precision, 4): - for _ in range(20): + for i in range(20): unscaled = r.randrange(0, 10**precision) float_val = scaled_float(unscaled, scale) assert float_val * 10**scale < 10**precision @@ -2200,8 +2212,6 @@ def test_cast_float_to_decimal_random(float_ty, decimal_traits): """ Test float-to-decimal conversion against exactly generated values. """ - import numpy as np - r = random.Random(43) np_float_ty = { pa.float32(): np.float32, @@ -2242,13 +2252,10 @@ def test_cast_float_to_decimal_random(float_ty, decimal_traits): float_exp = (-mantissa_bits + math.floor(math.log2(10**(precision - scale)))) assert float_exp_min <= float_exp <= float_exp_max - for _ in range(5): + for i in range(5): mantissa = r.randrange(0, 2**mantissa_bits) - float_val = np.ldexp( - np_float_ty(mantissa), float_exp - ) - assert isinstance( - float_val, np_float_ty) # type: ignore[reportArgumentType] + float_val = np.ldexp(np_float_ty(mantissa), float_exp) + assert isinstance(float_val, np_float_ty) # Make sure we compute the exact expected value and # round by half-to-even when converting to the expected precision. 
if float_exp >= 0: @@ -2294,8 +2301,6 @@ def test_strptime(): @pytest.mark.pandas @pytest.mark.timezone_data def test_strftime(): - import pandas as pd - times = ["2018-03-10 09:00", "2038-01-31 12:23", None] timezones = ["CET", "UTC", "Europe/Ljubljana"] @@ -2306,8 +2311,7 @@ def test_strftime(): formats.extend(["%c", "%x", "%X"]) for timezone in timezones: - ts = pd.to_datetime(times # type: ignore[reportArgumentType] - ).tz_localize(timezone) + ts = pd.to_datetime(times).tz_localize(timezone) for unit in ["s", "ms", "us", "ns"]: tsa = pa.array(ts, type=pa.timestamp(unit, timezone)) for fmt in formats: @@ -2354,7 +2358,7 @@ def test_strftime(): # Test timestamps without timezone fmt = "%Y-%m-%dT%H:%M:%S" - ts = pd.to_datetime(times) # type: ignore[reportArgumentType] + ts = pd.to_datetime(times) tsa = pa.array(ts, type=pa.timestamp("s")) result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt)) expected = pa.array(ts.strftime(fmt)).cast(result.type) @@ -2373,7 +2377,6 @@ def test_strftime(): def _check_datetime_components(timestamps, timezone=None): from pyarrow.vendored.version import Version - import pandas as pd ts = pd.to_datetime(timestamps).tz_localize( "UTC").tz_convert(timezone).to_series() @@ -2389,15 +2392,9 @@ def _check_datetime_components(timestamps, timezone=None): if Version(pd.__version__) < Version("1.1.0"): # https://github.com/pandas-dev/pandas/issues/33206 - iso_year = ts.map( - lambda x: x.isocalendar()[0] # type: ignore[reportUnknownLambdaType] - ).astype("int64") - iso_week = ts.map( - lambda x: x.isocalendar()[1] # type: ignore[reportUnknownLambdaType] - ).astype("int64") - iso_day = ts.map( - lambda x: x.isocalendar()[2] # type: ignore[reportUnknownLambdaType] - ).astype("int64") + iso_year = ts.map(lambda x: x.isocalendar()[0]).astype("int64") + iso_week = ts.map(lambda x: x.isocalendar()[1]).astype("int64") + iso_day = ts.map(lambda x: x.isocalendar()[2]).astype("int64") else: # Casting is required because pandas isocalendar returns int32 # while arrow isocalendar returns int64. 
@@ -2447,8 +2444,7 @@ def _check_datetime_components(timestamps, timezone=None): # datetime with utc returns None for dst() is_dst = [False] * len(ts) else: - is_dst = ts.apply( - lambda x: x.dst().seconds > 0) # type: ignore[reportUnknownLambdaType] + is_dst = ts.apply(lambda x: x.dst().seconds > 0) assert pc.is_dst(tsa).equals(pa.array(is_dst)) day_of_week_options = pc.DayOfWeekOptions( @@ -2509,9 +2505,6 @@ def test_iso_calendar_longer_array(unit): @pytest.mark.pandas @pytest.mark.timezone_data def test_assume_timezone(): - import numpy as np - import pandas as pd - ts_type = pa.timestamp("ns") timestamps = pd.to_datetime(["1970-01-01T00:00:59.123456789", "2000-02-29T23:23:23.999999999", @@ -2536,9 +2529,9 @@ def test_assume_timezone(): ambiguous_array = pa.array(ambiguous, type=ts_type) nonexistent_array = pa.array(nonexistent, type=ts_type) - ta = pa.array(timestamps, type=ts_type) for timezone in ["UTC", "America/Chicago", "Asia/Kolkata"]: options = pc.AssumeTimezoneOptions(timezone) + ta = pa.array(timestamps, type=ts_type) expected = timestamps.tz_localize(timezone) result = pc.assume_timezone(ta, options=options) assert result.equals(pa.array(expected)) @@ -2547,8 +2540,7 @@ def test_assume_timezone(): ta_zoned = pa.array(timestamps, type=pa.timestamp("ns", timezone)) with pytest.raises(pa.ArrowInvalid, match="already have a timezone:"): - pc.assume_timezone( - ta_zoned, options=options) # type: ignore[reportArgumentType] + pc.assume_timezone(ta_zoned, options=options) invalid_options = pc.AssumeTimezoneOptions("Europe/Brusselsss") with pytest.raises(ValueError, match="not found in timezone database"): @@ -2591,22 +2583,18 @@ def test_assume_timezone(): f"timezone '{timezone}'"): pc.assume_timezone(ambiguous_array, options=options_ambiguous_raise) - expected = ambiguous.tz_localize(timezone, ambiguous=np.array([True, True, True])) + expected = ambiguous.tz_localize(timezone, ambiguous=[True, True, True]) result = pc.assume_timezone( ambiguous_array, options=options_ambiguous_earliest) result.equals(pa.array(expected)) - expected = ambiguous.tz_localize( - timezone, ambiguous=np.array([False, False, False])) + expected = ambiguous.tz_localize(timezone, ambiguous=[False, False, False]) result = pc.assume_timezone( ambiguous_array, options=options_ambiguous_latest) result.equals(pa.array(expected)) def _check_temporal_rounding(ts, values, unit): - import numpy as np - import pandas as pd - unit_shorthand = { "nanosecond": "ns", "microsecond": "us", @@ -2650,7 +2638,7 @@ def _check_temporal_rounding(ts, values, unit): value, unit, calendar_based_origin=True) origin = ts.dt.floor(greater_unit[unit]) - if not hasattr(ta.type, "tz"): + if ta.type.tz is None: result = pc.ceil_temporal(ta, options=options).to_pandas() expected = (ts - origin).dt.ceil(frequency) + origin np.testing.assert_array_equal(result, expected) @@ -2681,20 +2669,16 @@ def _check_temporal_rounding(ts, values, unit): # to regular ceiled timestamp if it is equal to the original timestamp. # This does not work if timestamp is zoned since our logic will not # account for DST jumps. 
- if not hasattr(ta.type, "tz"): + if ta.type.tz is None: options = pc.RoundTemporalOptions( - value, # type: ignore[reportPossiblyUnboundVariable] - ceil_is_strictly_greater=True, - unit=unit) # type: ignore[reportPossiblyUnboundVariable] + value, unit, ceil_is_strictly_greater=True) result = pc.ceil_temporal(ta, options=options) - expected = ts.dt.ceil(frequency) # type: ignore[reportPossiblyUnboundVariable] + expected = ts.dt.ceil(frequency) expected = np.where( expected == ts, - expected + pd.Timedelta( - value, # type: ignore[reportPossiblyUnboundVariable] - unit=unit_shorthand[unit]), expected # type: ignore[reportArgumentType] - ) + expected + pd.Timedelta(value, unit_shorthand[unit]), + expected) np.testing.assert_array_equal(result, expected) # Check RoundTemporalOptions defaults @@ -2719,10 +2703,8 @@ def _check_temporal_rounding(ts, values, unit): "second", "minute", "hour", "day")) @pytest.mark.pandas def test_round_temporal(unit): - import pandas as pd - values = (1, 2, 3, 4, 5, 6, 7, 10, 15, 24, 60, 250, 500, 750) - timestamps = pd.Series([ + timestamps = [ "1923-07-07 08:52:35.203790336", "1931-03-17 10:45:00.641559040", "1932-06-16 01:16:42.911994368", @@ -2735,7 +2717,7 @@ def test_round_temporal(unit): "1982-01-21 18:43:44.517366784", "1992-01-01 00:00:00.100000000", "1999-12-04 05:55:34.794991104", - "2026-10-26 08:39:00.316686848"]) + "2026-10-26 08:39:00.316686848"] ts = pd.Series([pd.Timestamp(x, unit="ns") for x in timestamps]) _check_temporal_rounding(ts, values, unit) @@ -2757,7 +2739,7 @@ def test_count(): with pytest.raises(ValueError, match='"something else" is not a valid count mode'): - pc.count(arr, 'something else') # type: ignore[invalid-argument-type] + pc.count(arr, 'something else') def test_index(): @@ -2807,7 +2789,7 @@ def test_partition_nth(): with pytest.raises( ValueError, match="'partition_nth_indices' cannot be called without options"): - pc.partition_nth_indices(data) # type: ignore[no-matching-overload] + pc.partition_nth_indices(data) def test_partition_nth_null_placement(): @@ -2834,13 +2816,10 @@ def validate_select_k(select_k_indices, arr, order, stable_sort=False): arr = pa.array([1, 2, None, 0]) for k in [0, 2, 4]: - result = pc.select_k_unstable( - arr, k=k, sort_keys=[("dummy", "ascending")]) - validate_select_k(result, arr, "ascending") - - result = pc.select_k_unstable( - arr, k=k, sort_keys=[("dummy", "descending")]) - validate_select_k(result, arr, "descending") + for order in ["descending", "ascending"]: + result = pc.select_k_unstable( + arr, k=k, sort_keys=[("dummy", order)]) + validate_select_k(result, arr, order) result = pc.top_k_unstable(arr, k=k) validate_select_k(result, arr, "descending") @@ -2897,7 +2876,7 @@ def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False): with pytest.raises( ValueError, match="'select_k_unstable' cannot be called without options"): - pc.select_k_unstable(table) # type: ignore[no-matching-overload] + pc.select_k_unstable(table) with pytest.raises(ValueError, match="select_k_unstable requires a nonnegative `k`"): @@ -2906,19 +2885,14 @@ def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False): with pytest.raises(ValueError, match="select_k_unstable requires a " "non-empty `sort_keys`"): - pc.select_k_unstable(table, sort_keys=[], - k=2 # type: ignore[reportPossiblyUnboundVariable] - ) + pc.select_k_unstable(table, k=2, sort_keys=[]) with pytest.raises(ValueError, match="not a valid sort order"): - pc.select_k_unstable( - table, k=k, # type: 
ignore[reportPossiblyUnboundVariable] - sort_keys=[("a", "nonscending")]) # type: ignore[reportArgumentType] + pc.select_k_unstable(table, k=k, sort_keys=[("a", "nonscending")]) with pytest.raises(ValueError, match="Invalid sort key column: No match for.*unknown"): - pc.select_k_unstable(table, k=k, # type: ignore[reportPossiblyUnboundVariable] - sort_keys=[("unknown", "ascending")]) + pc.select_k_unstable(table, k=k, sort_keys=[("unknown", "ascending")]) def test_array_sort_indices(): @@ -2937,9 +2911,7 @@ def test_array_sort_indices(): assert result.to_pylist() == [2, 1, 0, 3] with pytest.raises(ValueError, match="not a valid sort order"): - pc.array_sort_indices(arr, - order="nonscending" # type: ignore[reportArgumentType] - ) + pc.array_sort_indices(arr, order="nonscending") def test_sort_indices_array(): @@ -2995,19 +2967,14 @@ def test_sort_indices_table(): assert result.to_pylist() == [2, 1, 0, 3] with pytest.raises(ValueError, match="Must specify one or more sort keys"): - pc.sort_indices(table) # type: ignore[reportArgumentType] + pc.sort_indices(table) with pytest.raises(ValueError, match="Invalid sort key column: No match for.*unknown"): - pc.sort_indices( - table, - sort_keys=[("unknown", "ascending")] # type: ignore[reportArgumentType] - ) + pc.sort_indices(table, sort_keys=[("unknown", "ascending")]) with pytest.raises(ValueError, match="not a valid sort order"): - pc.sort_indices( - table, sort_keys=[("a", "nonscending")] # type: ignore[reportArgumentType] - ) + pc.sort_indices(table, sort_keys=[("a", "nonscending")]) def test_is_in(): @@ -3085,9 +3052,9 @@ def test_quantile(): assert result.to_pylist() == [1.25, 1.5, 1.75] with pytest.raises(ValueError, match="Quantile must be between 0 and 1"): - pc.quantile(arr, q=1.1) # type: ignore[invalid-argument-type] + pc.quantile(arr, q=1.1) with pytest.raises(ValueError, match="not a valid quantile interpolation"): - pc.quantile(arr, interpolation='zzz') # type: ignore[invalid-argument-type] + pc.quantile(arr, interpolation='zzz') def test_tdigest(): @@ -3153,8 +3120,6 @@ def test_min_max_element_wise(): @pytest.mark.parametrize('start', (1.25, 10.5, -10.5)) @pytest.mark.parametrize('skip_nulls', (True, False)) def test_cumulative_sum(start, skip_nulls): - import numpy as np - # Exact tests (e.g., integral types) start_int = int(start) starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), @@ -3203,15 +3168,13 @@ def test_cumulative_sum(start, skip_nulls): for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - pc.cumulative_sum([1, 2, 3], start=strt) # type: ignore[reportArgumentType] + pc.cumulative_sum([1, 2, 3], start=strt) @pytest.mark.numpy @pytest.mark.parametrize('start', (1.25, 10.5, -10.5)) @pytest.mark.parametrize('skip_nulls', (True, False)) def test_cumulative_prod(start, skip_nulls): - import numpy as np - # Exact tests (e.g., integral types) start_int = int(start) starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), @@ -3260,17 +3223,13 @@ def test_cumulative_prod(start, skip_nulls): for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - pc.cumulative_prod( - [1, 2, 3], start=strt # type: ignore[reportArgumentType] - ) + pc.cumulative_prod([1, 2, 3], start=strt) @pytest.mark.numpy @pytest.mark.parametrize('start', (0.5, 3.5, 6.5)) @pytest.mark.parametrize('skip_nulls', (True, False)) def test_cumulative_max(start, skip_nulls): - import numpy as np - # Exact tests (e.g., integral types) start_int = int(start) starts = [None, start_int, 
pa.scalar(start_int, type=pa.int8()), @@ -3322,15 +3281,13 @@ def test_cumulative_max(start, skip_nulls): for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - pc.cumulative_max([1, 2, 3], start=strt) # type: ignore[reportArgumentType] + pc.cumulative_max([1, 2, 3], start=strt) @pytest.mark.numpy @pytest.mark.parametrize('start', (0.5, 3.5, 6.5)) @pytest.mark.parametrize('skip_nulls', (True, False)) def test_cumulative_min(start, skip_nulls): - import numpy as np - # Exact tests (e.g., integral types) start_int = int(start) starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), @@ -3378,12 +3335,11 @@ def test_cumulative_min(start, skip_nulls): expected_arrays[i], strt if strt is not None else 1e9, skip_nulls=False) np.testing.assert_array_almost_equal(result.to_numpy( - # type: ignore[reportAttributeAccessIssue] zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - pc.cumulative_max([1, 2, 3], start=strt) # type: ignore[reportArgumentType] + pc.cumulative_max([1, 2, 3], start=strt) def test_make_struct(): @@ -3475,12 +3431,12 @@ def test_list_element(): lists = pa.array([l1, l2], list_type) index = 1 - result = pc.list_element(lists, index) + result = pa.compute.list_element(lists, index) expected = pa.array([None, {'a': 0.52, 'b': 3}], element_type) assert result.equals(expected) index = 4 - result = pc.list_element(lists, index) + result = pa.compute.list_element(lists, index) expected = pa.array([{'a': 5.6, 'b': 6}, {'a': .6, 'b': 8}], element_type) assert result.equals(expected) @@ -3519,7 +3475,7 @@ def test_random(): pa.array([], type=pa.float64()) # System random initialization => outputs all distinct - arrays = [tuple(pc.random(100).to_pylist()) for _ in range(10)] + arrays = [tuple(pc.random(100).to_pylist()) for i in range(10)] assert len(set(arrays)) == len(arrays) arrays = [tuple(pc.random(100, initializer=i % 7).to_pylist()) @@ -3528,14 +3484,15 @@ def test_random(): # Arbitrary hashable objects can be given as initializer initializers = [object(), (4, 5, 6), "foo"] - initializers.extend(os.urandom(10) for _ in range(10)) - arrays = [tuple(pc.random(100, initializer=i).to_pylist()) for i in initializers] + initializers.extend(os.urandom(10) for i in range(10)) + arrays = [tuple(pc.random(100, initializer=i).to_pylist()) + for i in initializers] assert len(set(arrays)) == len(arrays) with pytest.raises(TypeError, match=r"initializer should be 'system', an integer, " r"or a hashable object; got \[\]"): - pc.random(100, initializer=[]) # type: ignore[invalid-argument-type] + pc.random(100, initializer=[]) @pytest.mark.parametrize( @@ -3585,7 +3542,7 @@ def test_rank_options(): match=r'"NonExisting" is not a valid tiebreaker'): pc.RankOptions(sort_keys="descending", null_placement="at_end", - tiebreaker="NonExisting") # type: ignore[invalid-argument-type] + tiebreaker="NonExisting") def test_rank_quantile_options(): @@ -3615,7 +3572,7 @@ def test_rank_quantile_options(): assert result.equals(expected_descending) with pytest.raises(ValueError, match="not a valid sort order"): - pc.rank_quantile(arr, sort_keys="XXX") # type: ignore[reportArgumentType] + pc.rank_quantile(arr, sort_keys="XXX") def test_rank_normal_options(): @@ -3643,8 +3600,6 @@ def test_rank_normal_options(): def create_sample_expressions(): - import numpy as np - # We need a schema for substrait conversion schema = pa.schema([pa.field("i64", pa.int64()), pa.field( "foo", 
pa.struct([pa.field("bar", pa.string())]))]) @@ -3659,7 +3614,7 @@ def create_sample_expressions(): e = pc.scalar(None) f = pc.scalar({'a': 1}) g = pc.scalar(pa.scalar(1)) - h = pc.scalar(np.int64(2)) # type: ignore[reportOptionalMemberAccess] + h = pc.scalar(np.int64(2)) j = pc.scalar(False) k = pc.scalar(0) @@ -3734,22 +3689,20 @@ def test_expression_serialization_arrow(pickle_module): def test_expression_serialization_substrait(): exprs = create_sample_expressions() - schema = pa.schema(exprs["schema"]) # type: ignore[reportAttributeAccessIssue] + schema = exprs["schema"] # Basic literals don't change on binding and so they will round # trip without any change - for expr in exprs["literals"]: # type: ignore[reportAttributeAccessIssue] - serialized = \ - expr.to_substrait(schema) # type: ignore[reportAttributeAccessIssue] + for expr in exprs["literals"]: + serialized = expr.to_substrait(schema) deserialized = pc.Expression.from_substrait(serialized) - assert expr.equals(deserialized) # type: ignore[reportAttributeAccessIssue] + assert expr.equals(deserialized) # Expressions are bound when they get serialized. Since bound # expressions are not equal to their unbound variants we cannot # compare the round tripped with the original - for expr in exprs["calls"]: # type: ignore[reportAttributeAccessIssue] - serialized = \ - expr.to_substrait(schema) # type: ignore[reportAttributeAccessIssue] + for expr in exprs["calls"]: + serialized = expr.to_substrait(schema) deserialized = pc.Expression.from_substrait(serialized) # We can't compare the expressions themselves because of the bound # unbound difference. But we can compare the string representation @@ -3759,8 +3712,7 @@ def test_expression_serialization_substrait(): assert deserialized.equals(deserialized_again) for expr, expr_norm in zip(exprs["refs"], exprs["numeric_refs"]): - serialized = \ - expr.to_substrait(schema) # type: ignore[reportAttributeAccessIssue] + serialized = expr.to_substrait(schema) deserialized = pc.Expression.from_substrait(serialized) assert str(deserialized) == str(expr_norm) serialized_again = deserialized.to_substrait(schema) @@ -3770,16 +3722,15 @@ def test_expression_serialization_substrait(): # For the special cases we get various wrinkles in serialization but we # should always get the same thing from round tripping twice for expr in exprs["special"]: - serialized = \ - expr.to_substrait(schema) # type: ignore[reportAttributeAccessIssue] + serialized = expr.to_substrait(schema) deserialized = pc.Expression.from_substrait(serialized) serialized_again = deserialized.to_substrait(schema) deserialized_again = pc.Expression.from_substrait(serialized_again) assert deserialized.equals(deserialized_again) # Special case, we lose the field names of struct literals - f = exprs["special"][0] # type: ignore[reportAttributeAccessIssue] - serialized = f.to_substrait(schema) # type: ignore[reportAttributeAccessIssue] + f = exprs["special"][0] + serialized = f.to_substrait(schema) deserialized = pc.Expression.from_substrait(serialized) assert deserialized.equals(pc.scalar({'': 1})) @@ -3807,10 +3758,10 @@ def test_expression_construction(): nested_field = pc.field(("nested", "field")) nested_field2 = pc.field("nested", "field") - _ = zero | one == string - _ = ~true == false + zero | one == string + ~true == false for typ in ("bool", pa.bool_()): - _ = field.cast(typ) == true + field.cast(typ) == true field.isin([1, 2]) nested_mixed_types.isin(["foo", "bar"]) @@ -3818,10 +3769,10 @@ def test_expression_construction(): 
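# A small sketch of the Expression API exercised by the tests above:
# expressions are built lazily from pc.field / pc.scalar and evaluated when
# handed to a filter; the column names here are illustrative only.
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"i64": [1, 2, 3], "s": ["a", "b", "c"]})
expr = (pc.field("i64") > pc.scalar(1)) & pc.field("s").isin(["b", "c"])
assert table.filter(expr).num_rows == 2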
nested_field2.isin(["foo", "bar"]) with pytest.raises(TypeError): - field.isin(1) # type: ignore[invalid-argument-type] + field.isin(1) with pytest.raises(pa.ArrowInvalid): - _ = field != object() + field != object() def test_expression_boolean_operators(): @@ -3830,16 +3781,16 @@ def test_expression_boolean_operators(): false = pc.scalar(False) with pytest.raises(ValueError, match="cannot be evaluated to python True"): - _ = true and false + true and false with pytest.raises(ValueError, match="cannot be evaluated to python True"): - _ = true or false + true or false with pytest.raises(ValueError, match="cannot be evaluated to python True"): bool(true) with pytest.raises(ValueError, match="cannot be evaluated to python True"): - _ = not true + not true def test_expression_call_function(): @@ -3861,14 +3812,14 @@ def test_expression_call_function(): # Invalid pc.scalar input gives original error message msg = "only other expressions allowed as arguments" with pytest.raises(TypeError, match=msg): - pc.add(field, object) # type: ignore[reportArgumentType] + pc.add(field, object) def test_cast_table_raises(): table = pa.table({'a': [1, 2]}) - with pytest.raises(ArrowTypeError): - pc.cast(table, pa.int64()) # type: ignore[reportArgumentType] + with pytest.raises(pa.lib.ArrowTypeError): + pc.cast(table, pa.int64()) @pytest.mark.parametrize("start,stop,expected", ( @@ -4015,31 +3966,31 @@ def test_run_end_encode(value_type, option): def test_pairwise_diff(): arr = pa.array([1, 2, 3, None, 4, 5]) expected = pa.array([None, 1, 1, None, None, 1]) - result = pc.pairwise_diff(arr, period=1) + result = pa.compute.pairwise_diff(arr, period=1) assert result.equals(expected) arr = pa.array([1, 2, 3, None, 4, 5]) expected = pa.array([None, None, 2, None, 1, None]) - result = pc.pairwise_diff(arr, period=2) + result = pa.compute.pairwise_diff(arr, period=2) assert result.equals(expected) # negative period arr = pa.array([1, 2, 3, None, 4, 5], type=pa.int8()) expected = pa.array([-1, -1, None, None, -1, None], type=pa.int8()) - result = pc.pairwise_diff(arr, period=-1) + result = pa.compute.pairwise_diff(arr, period=-1) assert result.equals(expected) # wrap around overflow arr = pa.array([1, 2, 3, None, 4, 5], type=pa.uint8()) expected = pa.array([255, 255, None, None, 255, None], type=pa.uint8()) - result = pc.pairwise_diff(arr, period=-1) + result = pa.compute.pairwise_diff(arr, period=-1) assert result.equals(expected) # fail on overflow arr = pa.array([1, 2, 3, None, 4, 5], type=pa.uint8()) with pytest.raises(pa.ArrowInvalid, match="overflow"): - pc.pairwise_diff_checked(arr, period=-1) + pa.compute.pairwise_diff_checked(arr, period=-1) def test_pivot_wider(): diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index a6d3546e57c..7c86f37587c 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -24,7 +24,7 @@ import math import os import pathlib -import pytest +import pytest # type: ignore[import-not-found] import random import sys import tempfile @@ -33,7 +33,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] from pyarrow.util import guid from pyarrow import Codec @@ -811,8 +811,9 @@ def test_cache_options_pickling(pickle_module): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ - pytest.param( - "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("bz2", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) 
), "brotli", "gzip", @@ -852,8 +853,9 @@ def test_compress_decompress(compression): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ - pytest.param( - "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("bz2", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ), "brotli", "gzip", @@ -1748,9 +1750,9 @@ def test_unknown_compression_raises(): "gzip", "lz4", "zstd", - pytest.param( - "snappy", - marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("snappy", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ) ]) def test_compressed_roundtrip(compression): diff --git a/python/pyarrow/types.pyi b/python/pyarrow/types.pyi index 3ead6830421..1d1554da520 100644 --- a/python/pyarrow/types.pyi +++ b/python/pyarrow/types.pyi @@ -100,59 +100,459 @@ _Nested: TypeAlias = ( | _Union ) -def is_null(t: DataType) -> TypeIs[NullType]: ... -def is_boolean(t: DataType) -> TypeIs[BoolType]: ... -def is_integer(t: DataType) -> TypeIs[_Integer]: ... -def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: ... -def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: ... -def is_int8(t: DataType) -> TypeIs[Int8Type]: ... -def is_int16(t: DataType) -> TypeIs[Int16Type]: ... -def is_int32(t: DataType) -> TypeIs[Int32Type]: ... -def is_int64(t: DataType) -> TypeIs[Int64Type]: ... -def is_uint8(t: DataType) -> TypeIs[UInt8Type]: ... -def is_uint16(t: DataType) -> TypeIs[UInt16Type]: ... -def is_uint32(t: DataType) -> TypeIs[Uint32Type]: ... -def is_uint64(t: DataType) -> TypeIs[UInt64Type]: ... -def is_floating(t: DataType) -> TypeIs[_Floating]: ... -def is_float16(t: DataType) -> TypeIs[Float16Type]: ... -def is_float32(t: DataType) -> TypeIs[Float32Type]: ... -def is_float64(t: DataType) -> TypeIs[Float64Type]: ... -def is_list(t: DataType) -> TypeIs[ListType[Any]]: ... -def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: ... -def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: ... -def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: ... -def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: ... -def is_struct(t: DataType) -> TypeIs[StructType]: ... -def is_union(t: DataType) -> TypeIs[_Union]: ... -def is_nested(t: DataType) -> TypeIs[_Nested]: ... -def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: ... -def is_temporal(t: DataType) -> TypeIs[_Temporal]: ... -def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: ... -def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: ... -def is_time(t: DataType) -> TypeIs[_Time]: ... -def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: ... -def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: ... -def is_binary(t: DataType) -> TypeIs[BinaryType]: ... -def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: ... -def is_unicode(t: DataType) -> TypeIs[StringType]: ... -def is_string(t: DataType) -> TypeIs[StringType]: ... -def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: ... -def is_large_string(t: DataType) -> TypeIs[LargeStringType]: ... -def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: ... -def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: ... -def is_string_view(t: DataType) -> TypeIs[StringViewType]: ... -def is_date(t: DataType) -> TypeIs[_Date]: ... -def is_date32(t: DataType) -> TypeIs[Date32Type]: ... 
-def is_date64(t: DataType) -> TypeIs[Date64Type]: ... -def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: ... -def is_decimal(t: DataType) -> TypeIs[_Decimal]: ... -def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: ... -def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: ... -def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: ... -def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: ... -def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: ... -def is_interval(t: DataType) -> TypeIs[_Interval]: ... -def is_primitive(t: DataType) -> bool: ... +def is_null(t: DataType) -> TypeIs[NullType]: + """ + Return True if value is an instance of type: null. + + Parameters + ---------- + t : DataType + """ +def is_boolean(t: DataType) -> TypeIs[BoolType]: + """ + Return True if value is an instance of type: boolean. + + Parameters + ---------- + t : DataType + """ +def is_integer(t: DataType) -> TypeIs[_Integer]: + """ + Return True if value is an instance of type: any integer. + + Parameters + ---------- + t : DataType + """ +def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: + """ + Return True if value is an instance of type: signed integer. + + Parameters + ---------- + t : DataType + """ +def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: + """ + Return True if value is an instance of type: unsigned integer. + + Parameters + ---------- + t : DataType + """ +def is_int8(t: DataType) -> TypeIs[Int8Type]: + """ + Return True if value is an instance of type: int8. + + Parameters + ---------- + t : DataType + """ +def is_int16(t: DataType) -> TypeIs[Int16Type]: + """ + Return True if value is an instance of type: int16. + + Parameters + ---------- + t : DataType + """ +def is_int32(t: DataType) -> TypeIs[Int32Type]: + """ + Return True if value is an instance of type: int32. + + Parameters + ---------- + t : DataType + """ +def is_int64(t: DataType) -> TypeIs[Int64Type]: + """ + Return True if value is an instance of type: int64. + + Parameters + ---------- + t : DataType + """ +def is_uint8(t: DataType) -> TypeIs[UInt8Type]: + """ + Return True if value is an instance of type: uint8. + + Parameters + ---------- + t : DataType + """ +def is_uint16(t: DataType) -> TypeIs[UInt16Type]: + """ + Return True if value is an instance of type: uint16. + + Parameters + ---------- + t : DataType + """ +def is_uint32(t: DataType) -> TypeIs[Uint32Type]: + """ + Return True if value is an instance of type: uint32. + + Parameters + ---------- + t : DataType + """ +def is_uint64(t: DataType) -> TypeIs[UInt64Type]: + """ + Return True if value is an instance of type: uint64. + + Parameters + ---------- + t : DataType + """ +def is_floating(t: DataType) -> TypeIs[_Floating]: + """ + Return True if value is an instance of type: floating point numeric. + + Parameters + ---------- + t : DataType + """ +def is_float16(t: DataType) -> TypeIs[Float16Type]: + """ + Return True if value is an instance of type: float16 (half-precision). + + Parameters + ---------- + t : DataType + """ +def is_float32(t: DataType) -> TypeIs[Float32Type]: + """ + Return True if value is an instance of type: float32 (single precision). + + Parameters + ---------- + t : DataType + """ +def is_float64(t: DataType) -> TypeIs[Float64Type]: + """ + Return True if value is an instance of type: float64 (double precision). 
+ + Parameters + ---------- + t : DataType + """ +def is_list(t: DataType) -> TypeIs[ListType[Any]]: + """ + Return True if value is an instance of type: list. + + Parameters + ---------- + t : DataType + """ +def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: + """ + Return True if value is an instance of type: large list. + + Parameters + ---------- + t : DataType + """ +def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: + """ + Return True if value is an instance of type: fixed size list. + + Parameters + ---------- + t : DataType + """ +def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: + """ + Return True if value is an instance of type: list view. + + Parameters + ---------- + t : DataType + """ +def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: + """ + Return True if value is an instance of type: large list view. + + Parameters + ---------- + t : DataType + """ +def is_struct(t: DataType) -> TypeIs[StructType]: + """ + Return True if value is an instance of type: struct. + + Parameters + ---------- + t : DataType + """ +def is_union(t: DataType) -> TypeIs[_Union]: + """ + Return True if value is an instance of type: union. + + Parameters + ---------- + t : DataType + """ +def is_nested(t: DataType) -> TypeIs[_Nested]: + """ + Return True if value is an instance of type: nested type. + + Parameters + ---------- + t : DataType + """ +def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: + """ + Return True if value is an instance of type: run-end encoded. + + Parameters + ---------- + t : DataType + """ +def is_temporal(t: DataType) -> TypeIs[_Temporal]: + """ + Return True if value is an instance of type: date, time, timestamp or duration. + + Parameters + ---------- + t : DataType + """ +def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: + """ + Return True if value is an instance of type: timestamp. + + Parameters + ---------- + t : DataType + """ +def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: + """ + Return True if value is an instance of type: duration. + + Parameters + ---------- + t : DataType + """ +def is_time(t: DataType) -> TypeIs[_Time]: + """ + Return True if value is an instance of type: time. + + Parameters + ---------- + t : DataType + """ +def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: + """ + Return True if value is an instance of type: time32. + + Parameters + ---------- + t : DataType + """ +def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: + """ + Return True if value is an instance of type: time64. + + Parameters + ---------- + t : DataType + """ +def is_binary(t: DataType) -> TypeIs[BinaryType]: + """ + Return True if value is an instance of type: variable-length binary. + + Parameters + ---------- + t : DataType + """ +def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: + """ + Return True if value is an instance of type: large variable-length binary. + + Parameters + ---------- + t : DataType + """ +def is_unicode(t: DataType) -> TypeIs[StringType]: + """ + Alias for is_string. + + Parameters + ---------- + t : DataType + """ +def is_string(t: DataType) -> TypeIs[StringType]: + """ + Return True if value is an instance of type: string (utf8 unicode). + + Parameters + ---------- + t : DataType + """ +def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: + """ + Alias for is_large_string. 
+ + Parameters + ---------- + t : DataType + """ +def is_large_string(t: DataType) -> TypeIs[LargeStringType]: + """ + Return True if value is an instance of type: large string (utf8 unicode). + + Parameters + ---------- + t : DataType + """ +def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: + """ + Return True if value is an instance of type: fixed size binary. + + Parameters + ---------- + t : DataType + """ +def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: + """ + Return True if value is an instance of type: variable-length binary view. + + Parameters + ---------- + t : DataType + """ +def is_string_view(t: DataType) -> TypeIs[StringViewType]: + """ + Return True if value is an instance of type: variable-length string (utf-8) view. + + Parameters + ---------- + t : DataType + """ +def is_date(t: DataType) -> TypeIs[_Date]: + """ + Return True if value is an instance of type: date. + + Parameters + ---------- + t : DataType + """ +def is_date32(t: DataType) -> TypeIs[Date32Type]: + """ + Return True if value is an instance of type: date32 (days). + + Parameters + ---------- + t : DataType + """ +def is_date64(t: DataType) -> TypeIs[Date64Type]: + """ + Return True if value is an instance of type: date64 (milliseconds). + + Parameters + ---------- + t : DataType + """ +def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: + """ + Return True if value is an instance of type: map. + + Parameters + ---------- + t : DataType + """ +def is_decimal(t: DataType) -> TypeIs[_Decimal]: + """ + Return True if value is an instance of type: decimal. + + Parameters + ---------- + t : DataType + """ +def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: + """ + Return True if value is an instance of type: decimal32. + + Parameters + ---------- + t : DataType + """ +def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: + """ + Return True if value is an instance of type: decimal64. + + Parameters + ---------- + t : DataType + """ +def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: + """ + Return True if value is an instance of type: decimal128. + + Parameters + ---------- + t : DataType + """ +def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: + """ + Return True if value is an instance of type: decimal256. + + Parameters + ---------- + t : DataType + """ +def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: + """ + Return True if value is an instance of type: dictionary-encoded. + + Parameters + ---------- + t : DataType + """ +def is_interval(t: DataType) -> TypeIs[_Interval]: + """ + Return True if value is an instance of type: interval. + + Parameters + ---------- + t : DataType + """ +def is_primitive(t: DataType) -> bool: + """ + Return True if value is an instance of type: primitive type. + + Parameters + ---------- + t : DataType + """ +def is_boolean_value(obj: Any) -> bool: + """ + Check if the object is a boolean. + + Parameters + ---------- + obj : object + The object to check + """ + +def is_integer_value(obj: Any) -> bool: + """ + Check if the object is an integer. + + Parameters + ---------- + obj : object + The object to check + """ + +def is_float_value(obj: Any) -> bool: + """ + Check if the object is a float. 
+ + Parameters + ---------- + obj : object + The object to check + """ __all__ = [ "is_binary", diff --git a/python/pyproject.toml b/python/pyproject.toml index 598ddf7a75b..fac3b25c554 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -91,16 +91,3 @@ version_file = 'pyarrow/_generated_version.py' version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '22.0.0a0' - -[tool.pyright] -typeCheckingMode = "strict" -reportMissingImports = false -reportPrivateUsage = false -reportUnknownParameterType = false -reportMissingTypeArgument = false -reportMissingParameterType = false -reportMissingTypeStubs = false -reportUnknownVariableType = false -reportUnknownArgumentType = false -reportUnknownMemberType = false -include = ["pyarrow/tests/test_compute.py"] From e7c0202bb892e36cd2321a033fd9205bcd951682 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 12 Sep 2025 15:38:46 +0200 Subject: [PATCH 09/26] Remove more stubs --- .../pyarrow/_dataset_parquet_encryption.pyi | 102 - python/pyarrow/_feather.pyi | 46 - python/pyarrow/_gcsfs.pyi | 100 - python/pyarrow/_parquet_encryption.pyi | 84 - python/pyarrow/csv.pyi | 44 - python/pyarrow/cuda.pyi | 42 - python/pyarrow/flight.pyi | 112 - python/pyarrow/fs.pyi | 94 - python/pyarrow/interchange/__init__.pyi | 16 - python/pyarrow/interchange/buffer.pyi | 75 - python/pyarrow/interchange/column.pyi | 269 --- python/pyarrow/interchange/dataframe.pyi | 119 - python/pyarrow/interchange/from_dataframe.pyi | 261 --- python/pyarrow/parquet/__init__.pyi | 18 - python/pyarrow/parquet/core.pyi | 2078 ----------------- python/pyarrow/parquet/encryption.pyi | 32 - 16 files changed, 3492 deletions(-) delete mode 100644 python/pyarrow/_dataset_parquet_encryption.pyi delete mode 100644 python/pyarrow/_feather.pyi delete mode 100644 python/pyarrow/_gcsfs.pyi delete mode 100644 python/pyarrow/_parquet_encryption.pyi delete mode 100644 python/pyarrow/csv.pyi delete mode 100644 python/pyarrow/cuda.pyi delete mode 100644 python/pyarrow/flight.pyi delete mode 100644 python/pyarrow/fs.pyi delete mode 100644 python/pyarrow/interchange/__init__.pyi delete mode 100644 python/pyarrow/interchange/buffer.pyi delete mode 100644 python/pyarrow/interchange/column.pyi delete mode 100644 python/pyarrow/interchange/dataframe.pyi delete mode 100644 python/pyarrow/interchange/from_dataframe.pyi delete mode 100644 python/pyarrow/parquet/__init__.pyi delete mode 100644 python/pyarrow/parquet/core.pyi delete mode 100644 python/pyarrow/parquet/encryption.pyi diff --git a/python/pyarrow/_dataset_parquet_encryption.pyi b/python/pyarrow/_dataset_parquet_encryption.pyi deleted file mode 100644 index be40c0b39b3..00000000000 --- a/python/pyarrow/_dataset_parquet_encryption.pyi +++ /dev/null @@ -1,102 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions -from ._parquet import FileDecryptionProperties -from ._parquet_encryption import CryptoFactory, EncryptionConfiguration, KmsConnectionConfig -from .lib import _Weakrefable - -class ParquetEncryptionConfig(_Weakrefable): - """ - Core configuration class encapsulating parameters for high-level encryption - within the Parquet framework. - - The ParquetEncryptionConfig class serves as a bridge for passing encryption-related - parameters to the appropriate components within the Parquet library. It maintains references - to objects that define the encryption strategy, Key Management Service (KMS) configuration, - and specific encryption configurations for Parquet data. - - Parameters - ---------- - crypto_factory : pyarrow.parquet.encryption.CryptoFactory - Shared pointer to a `CryptoFactory` object. The `CryptoFactory` is responsible for - creating cryptographic components, such as encryptors and decryptors. - kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig - Shared pointer to a `KmsConnectionConfig` object. This object holds the configuration - parameters necessary for connecting to a Key Management Service (KMS). - encryption_config : pyarrow.parquet.encryption.EncryptionConfiguration - Shared pointer to an `EncryptionConfiguration` object. This object defines specific - encryption settings for Parquet data, including the keys assigned to different columns. - - Raises - ------ - ValueError - Raised if `encryption_config` is None. - """ - def __init__( - self, - crypto_factory: CryptoFactory, - kms_connection_config: KmsConnectionConfig, - encryption_config: EncryptionConfiguration, - ) -> None: ... - -class ParquetDecryptionConfig(_Weakrefable): - """ - Core configuration class encapsulating parameters for high-level decryption - within the Parquet framework. - - ParquetDecryptionConfig is designed to pass decryption-related parameters to - the appropriate decryption components within the Parquet library. It holds references to - objects that define the decryption strategy, Key Management Service (KMS) configuration, - and specific decryption configurations for reading encrypted Parquet data. - - Parameters - ---------- - crypto_factory : pyarrow.parquet.encryption.CryptoFactory - Shared pointer to a `CryptoFactory` object, pivotal in creating cryptographic - components for the decryption process. - kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig - Shared pointer to a `KmsConnectionConfig` object, containing parameters necessary - for connecting to a Key Management Service (KMS) during decryption. - decryption_config : pyarrow.parquet.encryption.DecryptionConfiguration - Shared pointer to a `DecryptionConfiguration` object, specifying decryption settings - for reading encrypted Parquet data. - - Raises - ------ - ValueError - Raised if `decryption_config` is None. - """ - def __init__( - self, - crypto_factory: CryptoFactory, - kms_connection_config: KmsConnectionConfig, - encryption_config: EncryptionConfiguration, - ) -> None: ... - -def set_encryption_config( - opts: ParquetFileWriteOptions, - config: ParquetEncryptionConfig, -) -> None: ... -def set_decryption_properties( - opts: ParquetFragmentScanOptions, - config: FileDecryptionProperties, -): ... -def set_decryption_config( - opts: ParquetFragmentScanOptions, - config: ParquetDecryptionConfig, -): ... 
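For readers unfamiliar with these classes, the stubs deleted above correspond to the documented pyarrow.parquet.encryption workflow: a CryptoFactory built from a user-supplied KmsClient callback turns a KmsConnectionConfig plus an EncryptionConfiguration into FileEncryptionProperties, which the Parquet writer then consumes. The following is a minimal sketch of that workflow, assuming only the public API shown in these stubs; the InMemoryKmsClient class, the key strings, and the file name are illustrative placeholders, not part of this patch.

    import base64
    from datetime import timedelta

    import pyarrow as pa
    import pyarrow.parquet as pq
    import pyarrow.parquet.encryption as pe


    class InMemoryKmsClient(pe.KmsClient):
        # Toy KMS for illustration only: "wraps" a data key by concatenating it
        # with the master key and base64-encoding the result. Not secure.
        def __init__(self, kms_connection_config):
            super().__init__()
            self._keys = kms_connection_config.custom_kms_conf

        def wrap_key(self, key_bytes, master_key_identifier):
            master = self._keys[master_key_identifier].encode("utf-8")
            return base64.b64encode(master + key_bytes).decode("utf-8")

        def unwrap_key(self, wrapped_key, master_key_identifier):
            master = self._keys[master_key_identifier].encode("utf-8")
            return base64.b64decode(wrapped_key)[len(master):]


    # Master keys are looked up by name through custom_kms_conf in this toy setup.
    kms_config = pe.KmsConnectionConfig(
        custom_kms_conf={"footer_key": "0123456789012345",
                         "col_key": "1234567890123450"})
    encryption_config = pe.EncryptionConfiguration(
        footer_key="footer_key",
        column_keys={"col_key": ["a"]},
        cache_lifetime=timedelta(minutes=5))
    crypto_factory = pe.CryptoFactory(lambda config: InMemoryKmsClient(config))
    props = crypto_factory.file_encryption_properties(kms_config, encryption_config)

    table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    with pq.ParquetWriter("example.parquet.encrypted", table.schema,
                          encryption_properties=props) as writer:
        writer.write_table(table)

The same three objects (crypto_factory, kms_connection_config, encryption_config) are what ParquetEncryptionConfig and ParquetDecryptionConfig bundle together for the dataset-level API typed above.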
diff --git a/python/pyarrow/_feather.pyi b/python/pyarrow/_feather.pyi deleted file mode 100644 index 373fe38cdce..00000000000 --- a/python/pyarrow/_feather.pyi +++ /dev/null @@ -1,46 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import IO - -from _typeshed import StrPath - -from .lib import Buffer, NativeFile, Table, _Weakrefable - -class FeatherError(Exception): ... - -def write_feather( - table: Table, - dest: StrPath | IO | NativeFile, - compression: str | None = None, - compression_level: int | None = None, - chunksize: int | None = None, - version: int = 2, -): ... - -class FeatherReader(_Weakrefable): - def __init__( - self, - source: StrPath | IO | NativeFile | Buffer, - use_memory_map: bool, - use_threads: bool, - ) -> None: ... - @property - def version(self) -> str: ... - def read(self) -> Table: ... - def read_indices(self, indices: list[int]) -> Table: ... - def read_names(self, names: list[str]) -> Table: ... diff --git a/python/pyarrow/_gcsfs.pyi b/python/pyarrow/_gcsfs.pyi deleted file mode 100644 index 0ced106615a..00000000000 --- a/python/pyarrow/_gcsfs.pyi +++ /dev/null @@ -1,100 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import datetime as dt - -from ._fs import FileSystem -from .lib import KeyValueMetadata - -class GcsFileSystem(FileSystem): - """ - Google Cloud Storage (GCS) backed FileSystem implementation - - By default uses the process described in https://google.aip.dev/auth/4110 - to resolve credentials. If not running on Google Cloud Platform (GCP), - this generally requires the environment variable - GOOGLE_APPLICATION_CREDENTIALS to point to a JSON file - containing credentials. - - Note: GCS buckets are special and the operations available on them may be - limited or more expensive than expected compared to local file systems. - - Note: When pickling a GcsFileSystem that uses default credentials, resolution - credentials are not stored in the serialized data. 
Therefore, when unpickling - it is assumed that the necessary credentials are in place for the target - process. - - Parameters - ---------- - anonymous : boolean, default False - Whether to connect anonymously. - If true, will not attempt to look up credentials using standard GCP - configuration methods. - access_token : str, default None - GCP access token. If provided, temporary credentials will be fetched by - assuming this role; also, a `credential_token_expiration` must be - specified as well. - target_service_account : str, default None - An optional service account to try to impersonate when accessing GCS. This - requires the specified credential user or service account to have the necessary - permissions. - credential_token_expiration : datetime, default None - Expiration for credential generated with an access token. Must be specified - if `access_token` is specified. - default_bucket_location : str, default 'US' - GCP region to create buckets in. - scheme : str, default 'https' - GCS connection transport scheme. - endpoint_override : str, default None - Override endpoint with a connect string such as "localhost:9000" - default_metadata : mapping or pyarrow.KeyValueMetadata, default None - Default metadata for `open_output_stream`. This will be ignored if - non-empty metadata is passed to `open_output_stream`. - retry_time_limit : timedelta, default None - Set the maximum amount of time the GCS client will attempt to retry - transient errors. Subsecond granularity is ignored. - project_id : str, default None - The GCP project identifier to use for creating buckets. - If not set, the library uses the GOOGLE_CLOUD_PROJECT environment - variable. Most I/O operations do not need a project id, only applications - that create new buckets need a project id. - """ - - def __init__( - self, - *, - anonymous: bool = False, - access_token: str | None = None, - target_service_account: str | None = None, - credential_token_expiration: dt.datetime | None = None, - default_bucket_location: str = "US", - scheme: str = "https", - endpoint_override: str | None = None, - default_metadata: dict | KeyValueMetadata | None = None, - retry_time_limit: dt.timedelta | None = None, - project_id: str | None = None, - ): ... - @property - def default_bucket_location(self) -> str: - """ - The GCP location this filesystem will write to. - """ - @property - def project_id(self) -> str: - """ - The GCP project id this filesystem will use. - """ diff --git a/python/pyarrow/_parquet_encryption.pyi b/python/pyarrow/_parquet_encryption.pyi deleted file mode 100644 index e1228cbdb5a..00000000000 --- a/python/pyarrow/_parquet_encryption.pyi +++ /dev/null @@ -1,84 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import datetime as dt - -from typing import Callable - -from ._parquet import FileDecryptionProperties, FileEncryptionProperties -from .lib import _Weakrefable - -class EncryptionConfiguration(_Weakrefable): - footer_key: str - column_keys: dict[str, list[str]] - encryption_algorithm: str - plaintext_footer: bool - double_wrapping: bool - cache_lifetime: dt.timedelta - internal_key_material: bool - data_key_length_bits: int - - def __init__( - self, - footer_key: str, - *, - column_keys: dict[str, str | list[str]] | None = None, - encryption_algorithm: str | None = None, - plaintext_footer: bool | None = None, - double_wrapping: bool | None = None, - cache_lifetime: dt.timedelta | None = None, - internal_key_material: bool | None = None, - data_key_length_bits: int | None = None, - ) -> None: ... - -class DecryptionConfiguration(_Weakrefable): - cache_lifetime: dt.timedelta - def __init__(self, *, cache_lifetime: dt.timedelta | None = None): ... - -class KmsConnectionConfig(_Weakrefable): - kms_instance_id: str - kms_instance_url: str - key_access_token: str - custom_kms_conf: dict[str, str] - def __init__( - self, - *, - kms_instance_id: str | None = None, - kms_instance_url: str | None = None, - key_access_token: str | None = None, - custom_kms_conf: dict[str, str] | None = None, - ) -> None: ... - def refresh_key_access_token(self, value: str) -> None: ... - -class KmsClient(_Weakrefable): - def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> str: ... - def unwrap_key(self, wrapped_key: str, master_key_identifier: str) -> str: ... - -class CryptoFactory(_Weakrefable): - def __init__(self, kms_client_factory: Callable[[KmsConnectionConfig], KmsClient]): ... - def file_encryption_properties( - self, - kms_connection_config: KmsConnectionConfig, - encryption_config: EncryptionConfiguration, - ) -> FileEncryptionProperties: ... - def file_decryption_properties( - self, - kms_connection_config: KmsConnectionConfig, - decryption_config: DecryptionConfiguration | None = None, - ) -> FileDecryptionProperties: ... - def remove_cache_entries_for_token(self, access_token: str) -> None: ... - def remove_cache_entries_for_all_tokens(self) -> None: ... diff --git a/python/pyarrow/csv.pyi b/python/pyarrow/csv.pyi deleted file mode 100644 index a7abd413aab..00000000000 --- a/python/pyarrow/csv.pyi +++ /dev/null @@ -1,44 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from pyarrow._csv import ( - ISO8601, - ConvertOptions, - CSVStreamingReader, - CSVWriter, - InvalidRow, - ParseOptions, - ReadOptions, - WriteOptions, - open_csv, - read_csv, - write_csv, -) - -__all__ = [ - "ISO8601", - "ConvertOptions", - "CSVStreamingReader", - "CSVWriter", - "InvalidRow", - "ParseOptions", - "ReadOptions", - "WriteOptions", - "open_csv", - "read_csv", - "write_csv", -] diff --git a/python/pyarrow/cuda.pyi b/python/pyarrow/cuda.pyi deleted file mode 100644 index 0394965bb73..00000000000 --- a/python/pyarrow/cuda.pyi +++ /dev/null @@ -1,42 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from pyarrow._cuda import ( - BufferReader, - BufferWriter, - Context, - CudaBuffer, - HostBuffer, - IpcMemHandle, - new_host_buffer, - read_message, - read_record_batch, - serialize_record_batch, -) - -__all__ = [ - "BufferReader", - "BufferWriter", - "Context", - "CudaBuffer", - "HostBuffer", - "IpcMemHandle", - "new_host_buffer", - "read_message", - "read_record_batch", - "serialize_record_batch", -] diff --git a/python/pyarrow/flight.pyi b/python/pyarrow/flight.pyi deleted file mode 100644 index dcc6ee2244b..00000000000 --- a/python/pyarrow/flight.pyi +++ /dev/null @@ -1,112 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from pyarrow._flight import ( - Action, - ActionType, - BasicAuth, - CallInfo, - CertKeyPair, - ClientAuthHandler, - ClientMiddleware, - ClientMiddlewareFactory, - DescriptorType, - FlightCallOptions, - FlightCancelledError, - FlightClient, - FlightDataStream, - FlightDescriptor, - FlightEndpoint, - FlightError, - FlightInfo, - FlightInternalError, - FlightMetadataReader, - FlightMetadataWriter, - FlightMethod, - FlightServerBase, - FlightServerError, - FlightStreamChunk, - FlightStreamReader, - FlightStreamWriter, - FlightTimedOutError, - FlightUnauthenticatedError, - FlightUnauthorizedError, - FlightUnavailableError, - FlightWriteSizeExceededError, - GeneratorStream, - Location, - MetadataRecordBatchReader, - MetadataRecordBatchWriter, - RecordBatchStream, - Result, - SchemaResult, - ServerAuthHandler, - ServerCallContext, - ServerMiddleware, - ServerMiddlewareFactory, - Ticket, - TracingServerMiddlewareFactory, - connect, -) - -__all__ = [ - "Action", - "ActionType", - "BasicAuth", - "CallInfo", - "CertKeyPair", - "ClientAuthHandler", - "ClientMiddleware", - "ClientMiddlewareFactory", - "DescriptorType", - "FlightCallOptions", - "FlightCancelledError", - "FlightClient", - "FlightDataStream", - "FlightDescriptor", - "FlightEndpoint", - "FlightError", - "FlightInfo", - "FlightInternalError", - "FlightMetadataReader", - "FlightMetadataWriter", - "FlightMethod", - "FlightServerBase", - "FlightServerError", - "FlightStreamChunk", - "FlightStreamReader", - "FlightStreamWriter", - "FlightTimedOutError", - "FlightUnauthenticatedError", - "FlightUnauthorizedError", - "FlightUnavailableError", - "FlightWriteSizeExceededError", - "GeneratorStream", - "Location", - "MetadataRecordBatchReader", - "MetadataRecordBatchWriter", - "RecordBatchStream", - "Result", - "SchemaResult", - "ServerAuthHandler", - "ServerCallContext", - "ServerMiddleware", - "ServerMiddlewareFactory", - "Ticket", - "TracingServerMiddlewareFactory", - "connect", -] diff --git a/python/pyarrow/fs.pyi b/python/pyarrow/fs.pyi deleted file mode 100644 index 6c5a0af8d19..00000000000 --- a/python/pyarrow/fs.pyi +++ /dev/null @@ -1,94 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from pyarrow._fs import ( # noqa - FileSelector, - FileType, - FileInfo, - FileSystem, - LocalFileSystem, - SubTreeFileSystem, - _MockFileSystem, - FileSystemHandler, - PyFileSystem, - SupportedFileSystem, -) -from pyarrow._azurefs import AzureFileSystem -from pyarrow._hdfs import HadoopFileSystem -from pyarrow._gcsfs import GcsFileSystem -from pyarrow._s3fs import ( # noqa - AwsDefaultS3RetryStrategy, - AwsStandardS3RetryStrategy, - S3FileSystem, - S3LogLevel, - S3RetryStrategy, - ensure_s3_initialized, - finalize_s3, - ensure_s3_finalized, - initialize_s3, - resolve_s3_region, -) - -FileStats = FileInfo - -def copy_files( - source: str, - destination: str, - source_filesystem: SupportedFileSystem | None = None, - destination_filesystem: SupportedFileSystem | None = None, - *, - chunk_size: int = 1024 * 1024, - use_threads: bool = True, -) -> None: ... - -class FSSpecHandler(FileSystemHandler): # type: ignore[misc] - fs: SupportedFileSystem - def __init__(self, fs: SupportedFileSystem) -> None: ... - -__all__ = [ - # _fs - "FileSelector", - "FileType", - "FileInfo", - "FileSystem", - "LocalFileSystem", - "SubTreeFileSystem", - "_MockFileSystem", - "FileSystemHandler", - "PyFileSystem", - # _azurefs - "AzureFileSystem", - # _hdfs - "HadoopFileSystem", - # _gcsfs - "GcsFileSystem", - # _s3fs - "AwsDefaultS3RetryStrategy", - "AwsStandardS3RetryStrategy", - "S3FileSystem", - "S3LogLevel", - "S3RetryStrategy", - "ensure_s3_initialized", - "finalize_s3", - "ensure_s3_finalized", - "initialize_s3", - "resolve_s3_region", - # fs - "FileStats", - "copy_files", - "FSSpecHandler", -] diff --git a/python/pyarrow/interchange/__init__.pyi b/python/pyarrow/interchange/__init__.pyi deleted file mode 100644 index 13a83393a91..00000000000 --- a/python/pyarrow/interchange/__init__.pyi +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. diff --git a/python/pyarrow/interchange/buffer.pyi b/python/pyarrow/interchange/buffer.pyi deleted file mode 100644 index 78d1dabb8b7..00000000000 --- a/python/pyarrow/interchange/buffer.pyi +++ /dev/null @@ -1,75 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -import enum - -from pyarrow.lib import Buffer - -class DlpackDeviceType(enum.IntEnum): - """Integer enum for device type codes matching DLPack.""" - - CPU = 1 - CUDA = 2 - CPU_PINNED = 3 - OPENCL = 4 - VULKAN = 7 - METAL = 8 - VPI = 9 - ROCM = 10 - -class _PyArrowBuffer: - """ - Data in the buffer is guaranteed to be contiguous in memory. - - Note that there is no dtype attribute present, a buffer can be thought of - as simply a block of memory. However, if the column that the buffer is - attached to has a dtype that's supported by DLPack and ``__dlpack__`` is - implemented, then that dtype information will be contained in the return - value from ``__dlpack__``. - - This distinction is useful to support both data exchange via DLPack on a - buffer and (b) dtypes like variable-length strings which do not have a - fixed number of bytes per element. - """ - def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ... - @property - def bufsize(self) -> int: - """ - Buffer size in bytes. - """ - @property - def ptr(self) -> int: - """ - Pointer to start of the buffer as an integer. - """ - def __dlpack__(self): - """ - Produce DLPack capsule (see array API standard). - - Raises: - - TypeError : if the buffer contains unsupported dtypes. - - NotImplementedError : if DLPack support is not implemented - - Useful to have to connect to array libraries. Support optional because - it's not completely trivial to implement for a Python-only library. - """ - def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: - """ - Device type and device ID for where the data in the buffer resides. - Uses device type codes matching DLPack. - Note: must be implemented even if ``__dlpack__`` is not. - """ diff --git a/python/pyarrow/interchange/column.pyi b/python/pyarrow/interchange/column.pyi deleted file mode 100644 index ce7e169bfb5..00000000000 --- a/python/pyarrow/interchange/column.pyi +++ /dev/null @@ -1,269 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import enum - -from typing import Any, Iterable, TypeAlias, TypedDict - -from pyarrow.lib import Array, ChunkedArray - -from .buffer import _PyArrowBuffer - -class DtypeKind(enum.IntEnum): - """ - Integer enum for data types. - - Attributes - ---------- - INT : int - Matches to signed integer data type. - UINT : int - Matches to unsigned integer data type. - FLOAT : int - Matches to floating point data type. - BOOL : int - Matches to boolean data type. - STRING : int - Matches to string data type (UTF-8 encoded). - DATETIME : int - Matches to datetime data type. - CATEGORICAL : int - Matches to categorical data type. 
- """ - - INT = 0 - UINT = 1 - FLOAT = 2 - BOOL = 20 - STRING = 21 # UTF-8 - DATETIME = 22 - CATEGORICAL = 23 - -Dtype: TypeAlias = tuple[DtypeKind, int, str, str] - -class ColumnNullType(enum.IntEnum): - """ - Integer enum for null type representation. - - Attributes - ---------- - NON_NULLABLE : int - Non-nullable column. - USE_NAN : int - Use explicit float NaN value. - USE_SENTINEL : int - Sentinel value besides NaN. - USE_BITMASK : int - The bit is set/unset representing a null on a certain position. - USE_BYTEMASK : int - The byte is set/unset representing a null on a certain position. - """ - - NON_NULLABLE = 0 - USE_NAN = 1 - USE_SENTINEL = 2 - USE_BITMASK = 3 - USE_BYTEMASK = 4 - -class ColumnBuffers(TypedDict): - data: tuple[_PyArrowBuffer, Dtype] - validity: tuple[_PyArrowBuffer, Dtype] | None - offsets: tuple[_PyArrowBuffer, Dtype] | None - -class CategoricalDescription(TypedDict): - is_ordered: bool - is_dictionary: bool - categories: _PyArrowColumn | None - -class Endianness(enum.Enum): - LITTLE = "<" - BIG = ">" - NATIVE = "=" - NA = "|" - -class NoBufferPresent(Exception): - """Exception to signal that there is no requested buffer.""" - -class _PyArrowColumn: - """ - A column object, with only the methods and properties required by the - interchange protocol defined. - - A column can contain one or more chunks. Each chunk can contain up to three - buffers - a data buffer, a mask buffer (depending on null representation), - and an offsets buffer (if variable-size binary; e.g., variable-length - strings). - - TBD: Arrow has a separate "null" dtype, and has no separate mask concept. - Instead, it seems to use "children" for both columns with a bit mask, - and for nested dtypes. Unclear whether this is elegant or confusing. - This design requires checking the null representation explicitly. - - The Arrow design requires checking: - 1. the ARROW_FLAG_NULLABLE (for sentinel values) - 2. if a column has two children, combined with one of those children - having a null dtype. - - Making the mask concept explicit seems useful. One null dtype would - not be enough to cover both bit and byte masks, so that would mean - even more checking if we did it the Arrow way. - - TBD: there's also the "chunk" concept here, which is implicit in Arrow as - multiple buffers per array (= column here). Semantically it may make - sense to have both: chunks were meant for example for lazy evaluation - of data which doesn't fit in memory, while multiple buffers per column - could also come from doing a selection operation on a single - contiguous buffer. - - Given these concepts, one would expect chunks to be all of the same - size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), - while multiple buffers could have data-dependent lengths. Not an issue - in pandas if one column is backed by a single NumPy array, but in - Arrow it seems possible. - Are multiple chunks *and* multiple buffers per column necessary for - the purposes of this interchange protocol, or must producers either - reuse the chunk concept for this or copy the data? - - Note: this Column object can only be produced by ``__dataframe__``, so - doesn't need its own version or ``__column__`` protocol. - """ - def __init__(self, column: Array | ChunkedArray, allow_copy: bool = True) -> None: ... - def size(self) -> int: - """ - Size of the column, in elements. - - Corresponds to DataFrame.num_rows() if column is a single chunk; - equal to size of this current chunk otherwise. 
- - Is a method rather than a property because it may cause a (potentially - expensive) computation for some dataframe implementations. - """ - @property - def offset(self) -> int: - """ - Offset of first element. - - May be > 0 if using chunks; for example for a column with N chunks of - equal size M (only the last chunk may be shorter), - ``offset = n * M``, ``n = 0 .. N-1``. - """ - @property - def dtype(self) -> tuple[DtypeKind, int, str, str]: - """ - Dtype description as a tuple ``(kind, bit-width, format string, - endianness)``. - - Bit-width : the number of bits as an integer - Format string : data type description format string in Apache Arrow C - Data Interface format. - Endianness : current only native endianness (``=``) is supported - - Notes: - - Kind specifiers are aligned with DLPack where possible (hence the - jump to 20, leave enough room for future extension) - - Masks must be specified as boolean with either bit width 1 (for - bit masks) or 8 (for byte masks). - - Dtype width in bits was preferred over bytes - - Endianness isn't too useful, but included now in case in the - future we need to support non-native endianness - - Went with Apache Arrow format strings over NumPy format strings - because they're more complete from a dataframe perspective - - Format strings are mostly useful for datetime specification, and - for categoricals. - - For categoricals, the format string describes the type of the - categorical in the data buffer. In case of a separate encoding of - the categorical (e.g. an integer to string mapping), this can - be derived from ``self.describe_categorical``. - - Data types not included: complex, Arrow-style null, binary, - decimal, and nested (list, struct, map, union) dtypes. - """ - @property - def describe_categorical(self) -> CategoricalDescription: - """ - If the dtype is categorical, there are two options: - - There are only values in the data buffer. - - There is a separate non-categorical Column encoding categorical - values. - - Raises TypeError if the dtype is not categorical - - Returns the dictionary with description on how to interpret the - data buffer: - - "is_ordered" : bool, whether the ordering of dictionary indices - is semantically meaningful. - - "is_dictionary" : bool, whether a mapping of - categorical values to other objects exists - - "categories" : Column representing the (implicit) mapping of - indices to category values (e.g. an array of - cat1, cat2, ...). None if not a dictionary-style - categorical. - - TBD: are there any other in-memory representations that are needed? - """ - @property - def describe_null(self) -> tuple[ColumnNullType, Any]: - """ - Return the missing value (or "null") representation the column dtype - uses, as a tuple ``(kind, value)``. - - Value : if kind is "sentinel value", the actual value. If kind is a bit - mask or a byte mask, the value (0 or 1) indicating a missing value. - None otherwise. - """ - @property - def null_count(self) -> int: - """ - Number of null elements, if known. - - Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. - """ - @property - def metadata(self) -> dict[str, Any]: - """ - The metadata for the column. See `DataFrame.metadata` for more details. - """ - def num_chunks(self) -> int: - """ - Return the number of chunks the column consists of. - """ - def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: - """ - Return an iterator yielding the chunks. - - See `DataFrame.get_chunks` for details on ``n_chunks``. 
- """ - def get_buffers(self) -> ColumnBuffers: - """ - Return a dictionary containing the underlying buffers. - - The returned dictionary has the following contents: - - - "data": a two-element tuple whose first element is a buffer - containing the data and whose second element is the data - buffer's associated dtype. - - "validity": a two-element tuple whose first element is a buffer - containing mask values indicating missing data and - whose second element is the mask value buffer's - associated dtype. None if the null representation is - not a bit or byte mask. - - "offsets": a two-element tuple whose first element is a buffer - containing the offset values for variable-size binary - data (e.g., variable-length strings) and whose second - element is the offsets buffer's associated dtype. None - if the data buffer does not have an associated offsets - buffer. - """ diff --git a/python/pyarrow/interchange/dataframe.pyi b/python/pyarrow/interchange/dataframe.pyi deleted file mode 100644 index a7ea6aeac74..00000000000 --- a/python/pyarrow/interchange/dataframe.pyi +++ /dev/null @@ -1,119 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import sys - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -from typing import Any, Iterable, Sequence - -from pyarrow.interchange.column import _PyArrowColumn -from pyarrow.lib import RecordBatch, Table - -class _PyArrowDataFrame: - """ - A data frame class, with only the methods required by the interchange - protocol defined. - - A "data frame" represents an ordered collection of named columns. - A column's "name" must be a unique string. - Columns may be accessed by name or by position. - - This could be a public data frame class, or an object with the methods and - attributes defined on this DataFrame class could be returned from the - ``__dataframe__`` method of a public data frame class in a library adhering - to the dataframe interchange protocol specification. - """ - - def __init__( - self, df: Table | RecordBatch, nan_as_null: bool = False, allow_copy: bool = True - ) -> None: ... - def __dataframe__( - self, nan_as_null: bool = False, allow_copy: bool = True - ) -> _PyArrowDataFrame: - """ - Construct a new exchange object, potentially changing the parameters. - ``nan_as_null`` is a keyword intended for the consumer to tell the - producer to overwrite null values in the data with ``NaN``. - It is intended for cases where the consumer does not support the bit - mask or byte mask that is the producer's native representation. - ``allow_copy`` is a keyword that defines whether or not the library is - allowed to make a copy of the data. 
For example, copying data would be - necessary if a library supports strided buffers, given that this - protocol specifies contiguous buffers. - """ - @property - def metadata(self) -> dict[str, Any]: - """ - The metadata for the data frame, as a dictionary with string keys. The - contents of `metadata` may be anything, they are meant for a library - to store information that it needs to, e.g., roundtrip losslessly or - for two implementations to share data that is not (yet) part of the - interchange protocol specification. For avoiding collisions with other - entries, please add name the keys with the name of the library - followed by a period and the desired name, e.g, ``pandas.indexcol``. - """ - def num_columns(self) -> int: - """ - Return the number of columns in the DataFrame. - """ - def num_rows(self) -> int: - """ - Return the number of rows in the DataFrame, if available. - """ - def num_chunks(self) -> int: - """ - Return the number of chunks the DataFrame consists of. - """ - def column_names(self) -> Iterable[str]: - """ - Return an iterator yielding the column names. - """ - def get_column(self, i: int) -> _PyArrowColumn: - """ - Return the column at the indicated position. - """ - def get_column_by_name(self, name: str) -> _PyArrowColumn: - """ - Return the column whose name is the indicated name. - """ - def get_columns(self) -> Iterable[_PyArrowColumn]: - """ - Return an iterator yielding the columns. - """ - def select_columns(self, indices: Sequence[int]) -> Self: - """ - Create a new DataFrame by selecting a subset of columns by index. - """ - def select_columns_by_name(self, names: Sequence[str]) -> Self: - """ - Create a new DataFrame by selecting a subset of columns by name. - """ - def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: - """ - Return an iterator yielding the chunks. - - By default (None), yields the chunks that the data is stored as by the - producer. If given, ``n_chunks`` must be a multiple of - ``self.num_chunks()``, meaning the producer must subdivide each chunk - before yielding it. - - Note that the producer must ensure that all columns are chunked the - same way. - """ diff --git a/python/pyarrow/interchange/from_dataframe.pyi b/python/pyarrow/interchange/from_dataframe.pyi deleted file mode 100644 index aa6217b6181..00000000000 --- a/python/pyarrow/interchange/from_dataframe.pyi +++ /dev/null @@ -1,261 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from typing import Any, Protocol, TypeAlias - -from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table - -from .column import ( - ColumnBuffers, - ColumnNullType, - Dtype, - DtypeKind, -) - -class DataFrameObject(Protocol): - def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> Any: ... - -ColumnObject: TypeAlias = Any - -def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: - """ - Build a ``pa.Table`` from any DataFrame supporting the interchange protocol. - - Parameters - ---------- - df : DataFrameObject - Object supporting the interchange protocol, i.e. `__dataframe__` - method. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Table - - Examples - -------- - >>> import pyarrow - >>> from pyarrow.interchange import from_dataframe - - Convert a pandas dataframe to a pyarrow table: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_attendees": [100, 10, 1], - ... "country": ["Italy", "Spain", "Slovenia"], - ... } - ... ) - >>> df - n_attendees country - 0 100 Italy - 1 10 Spain - 2 1 Slovenia - >>> from_dataframe(df) - pyarrow.Table - n_attendees: int64 - country: large_string - ---- - n_attendees: [[100,10,1]] - country: [["Italy","Spain","Slovenia"]] - """ - -def protocol_df_chunk_to_pyarrow(df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: - """ - Convert interchange protocol chunk to ``pa.RecordBatch``. - - Parameters - ---------- - df : DataFrameObject - Object supporting the interchange protocol, i.e. `__dataframe__` - method. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.RecordBatch - """ - -def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: - """ - Convert a column holding one of the primitive dtypes to a PyArrow array. - A primitive type is one of: int, uint, float, bool (1 bit). - - Parameters - ---------- - col : ColumnObject - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Array - """ - -def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: - """ - Convert a column holding boolean dtype to a PyArrow array. - - Parameters - ---------- - col : ColumnObject - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Array - """ - -def categorical_column_to_dictionary( - col: ColumnObject, allow_copy: bool = True -) -> DictionaryArray: - """ - Convert a column holding categorical data to a pa.DictionaryArray. - - Parameters - ---------- - col : ColumnObject - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). 
- - Returns - ------- - pa.DictionaryArray - """ - -def parse_datetime_format_str(format_str: str) -> tuple[str, str]: - """Parse datetime `format_str` to interpret the `data`.""" - -def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: - """Map column date type to pyarrow date type.""" - -def buffers_to_array( - buffers: ColumnBuffers, - data_type: tuple[DtypeKind, int, str, str], - length: int, - describe_null: ColumnNullType, - offset: int = 0, - allow_copy: bool = True, -) -> Array: - """ - Build a PyArrow array from the passed buffer. - - Parameters - ---------- - buffer : ColumnBuffers - Dictionary containing tuples of underlying buffers and - their associated dtype. - data_type : Tuple[DtypeKind, int, str, str], - Dtype description of the column as a tuple ``(kind, bit-width, format string, - endianness)``. - length : int - The number of values in the array. - describe_null: ColumnNullType - Null representation the column dtype uses, - as a tuple ``(kind, value)`` - offset : int, default: 0 - Number of elements to offset from the start of the buffer. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Array - - Notes - ----- - The returned array doesn't own the memory. The caller of this function - is responsible for keeping the memory owner object alive as long as - the returned PyArrow array is being used. - """ - -def validity_buffer_from_mask( - validity_buff: Buffer, - validity_dtype: Dtype, - describe_null: ColumnNullType, - length: int, - offset: int = 0, - allow_copy: bool = True, -) -> Buffer: - """ - Build a PyArrow buffer from the passed mask buffer. - - Parameters - ---------- - validity_buff : BufferObject - Tuple of underlying validity buffer and associated dtype. - validity_dtype : Dtype - Dtype description as a tuple ``(kind, bit-width, format string, - endianness)``. - describe_null : ColumnNullType - Null representation the column dtype uses, - as a tuple ``(kind, value)`` - length : int - The number of values in the array. - offset : int, default: 0 - Number of elements to offset from the start of the buffer. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Buffer - """ - -def validity_buffer_nan_sentinel( - data_pa_buffer: Buffer, - data_type: Dtype, - describe_null: ColumnNullType, - length: int, - offset: int = 0, - allow_copy: bool = True, -) -> Buffer: - """ - Build a PyArrow buffer from NaN or sentinel values. - - Parameters - ---------- - data_pa_buffer : pa.Buffer - PyArrow buffer for the column data. - data_type : Dtype - Dtype description as a tuple ``(kind, bit-width, format string, - endianness)``. - describe_null : ColumnNullType - Null representation the column dtype uses, - as a tuple ``(kind, value)`` - length : int - The number of values in the array. - offset : int, default: 0 - Number of elements to offset from the start of the buffer. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). 
- - Returns - ------- - pa.Buffer - """ diff --git a/python/pyarrow/parquet/__init__.pyi b/python/pyarrow/parquet/__init__.pyi deleted file mode 100644 index 8d0b5374ea0..00000000000 --- a/python/pyarrow/parquet/__init__.pyi +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from .core import * # noqa diff --git a/python/pyarrow/parquet/core.pyi b/python/pyarrow/parquet/core.pyi deleted file mode 100644 index f5ac0510ffc..00000000000 --- a/python/pyarrow/parquet/core.pyi +++ /dev/null @@ -1,2078 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
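The buffer-assembly helpers deleted above (``buffers_to_array`` and the validity-buffer functions) describe a column as raw data and validity buffers plus a dtype. A minimal sketch of the underlying building block in public pyarrow terms, using ``pa.Array.from_buffers``; the variable names are illustrative:

    import pyarrow as pa

    # Take apart a primitive array: buffers() yields [validity bitmap, data buffer].
    values = pa.array([1, 2, None, 4], type=pa.int64())
    validity_buf, data_buf = values.buffers()

    # Reassemble an equivalent array from the raw buffers, the same primitive
    # that buffers_to_array() wraps for interchange-protocol columns.
    rebuilt = pa.Array.from_buffers(
        pa.int64(),                 # data type
        4,                          # length
        [validity_buf, data_buf],   # validity bitmap first, then data
        null_count=1,
    )
    print(rebuilt.equals(values))   # True

As the docstrings above note, an array built this way does not own its buffers, so the owner (here ``values``) must stay alive while the rebuilt array is in use.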
- -import sys - -from pathlib import Path - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -from typing import IO, Callable, Iterator, Literal, Sequence - -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias - -from pyarrow import _parquet -from pyarrow._compute import Expression -from pyarrow._fs import FileSystem, SupportedFileSystem -from pyarrow._parquet import ( - ColumnChunkMetaData, - ColumnSchema, - FileDecryptionProperties, - FileEncryptionProperties, - FileMetaData, - ParquetLogicalType, - ParquetReader, - ParquetSchema, - RowGroupMetaData, - SortingColumn, - Statistics, -) -from pyarrow._stubs_typing import FilterTuple, SingleOrList -from pyarrow.dataset import ParquetFileFragment, Partitioning -from pyarrow.lib import NativeFile, RecordBatch, Schema, Table -from typing_extensions import deprecated - -__all__ = ( - "ColumnChunkMetaData", - "ColumnSchema", - "FileDecryptionProperties", - "FileEncryptionProperties", - "FileMetaData", - "ParquetDataset", - "ParquetFile", - "ParquetLogicalType", - "ParquetReader", - "ParquetSchema", - "ParquetWriter", - "RowGroupMetaData", - "SortingColumn", - "Statistics", - "read_metadata", - "read_pandas", - "read_schema", - "read_table", - "write_metadata", - "write_table", - "write_to_dataset", - "_filters_to_expression", - "filters_to_expression", -) - -def filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: - """ - Check if filters are well-formed and convert to an ``Expression``. - - Parameters - ---------- - filters : List[Tuple] or List[List[Tuple]] - - Notes - ----- - See internal ``pyarrow._DNF_filter_doc`` attribute for more details. - - Examples - -------- - - >>> filters_to_expression([("foo", "==", "bar")]) - - - Returns - ------- - pyarrow.compute.Expression - An Expression representing the filters - """ - -@deprecated("use filters_to_expression") -def _filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... - -_Compression: TypeAlias = Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"] - -class ParquetFile: - """ - Reader interface for a single Parquet file. - - Parameters - ---------- - source : str, pathlib.Path, pyarrow.NativeFile, or file-like object - Readable source. For passing bytes or buffer-like file containing a - Parquet file, use pyarrow.BufferReader. - metadata : FileMetaData, default None - Use existing metadata object, rather than reading from file. - common_metadata : FileMetaData, default None - Will be used in reads for pandas schema metadata if not found in the - main file's metadata, no other uses at the moment. - read_dictionary : list - List of column names to read directly as DictionaryArray. - memory_map : bool, default False - If the source is a file path, use a memory map to read file, which can - improve performance in some environments. - buffer_size : int, default 0 - If positive, perform read buffering when deserializing individual - column chunks. Otherwise IO calls are unbuffered. - pre_buffer : bool, default False - Coalesce and issue file reads in parallel to improve performance on - high-latency filesystems (e.g. S3). If True, Arrow will use a - background I/O thread pool. - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular - resolution (e.g. 'ms'). 
Setting to None is equivalent to 'ns' - and therefore INT96 timestamps will be inferred as timestamps - in nanoseconds. - decryption_properties : FileDecryptionProperties, default None - File decryption properties for Parquet Modular Encryption. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - page_checksum_verification : bool, default False - If True, verify the checksum for each page read from the file. - - Examples - -------- - - Generate an example PyArrow Table and write it to Parquet file: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - - Create a ``ParquetFile`` object from the Parquet file: - - >>> parquet_file = pq.ParquetFile("example.parquet") - - Read the data: - - >>> parquet_file.read() - pyarrow.Table - n_legs: int64 - animal: string - ---- - n_legs: [[2,2,4,4,5,100]] - animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] - - Create a ParquetFile object with "animal" column as DictionaryArray: - - >>> parquet_file = pq.ParquetFile("example.parquet", read_dictionary=["animal"]) - >>> parquet_file.read() - pyarrow.Table - n_legs: int64 - animal: dictionary - ---- - n_legs: [[2,2,4,4,5,100]] - animal: [ -- dictionary: - ["Flamingo","Parrot",...,"Brittle stars","Centipede"] -- indices: - [0,1,2,3,4,5]] - """ - - reader: ParquetReader - common_metadata: FileMetaData - - def __init__( - self, - source: str | Path | NativeFile | IO, - *, - metadata: FileMetaData | None = None, - common_metadata: FileMetaData | None = None, - read_dictionary: list[str] | None = None, - memory_map: bool = False, - buffer_size: int = 0, - pre_buffer: bool = False, - coerce_int96_timestamp_unit: str | None = None, - decryption_properties: FileDecryptionProperties | None = None, - thrift_string_size_limit: int | None = None, - thrift_container_size_limit: int | None = None, - filesystem: SupportedFileSystem | None = None, - page_checksum_verification: bool = False, - ): ... - def __enter__(self) -> Self: ... - def __exit__(self, *args, **kwargs) -> None: ... - @property - def metadata(self) -> FileMetaData: - """ - Return the Parquet metadata. - """ - @property - def schema(self) -> ParquetSchema: - """ - Return the Parquet schema, unconverted to Arrow types - """ - @property - def schema_arrow(self) -> Schema: - """ - Return the inferred Arrow schema, converted from the whole Parquet - file's schema - - Examples - -------- - Generate an example Parquet file: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - Read the Arrow schema: - - >>> parquet_file.schema_arrow - n_legs: int64 - animal: string - """ - @property - def num_row_groups(self) -> int: - """ - Return the number of row groups of the Parquet file. - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - >>> parquet_file.num_row_groups - 1 - """ - def close(self, force: bool = False) -> None: ... - @property - def closed(self) -> bool: ... - def read_row_group( - self, - i: int, - columns: list | None = None, - use_threads: bool = True, - use_pandas_metadata: bool = False, - ) -> Table: - """ - Read a single row group from a Parquet file. - - Parameters - ---------- - i : int - Index of the individual row group that we want to read. - columns : list - If not None, only these columns will be read from the row group. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. - use_threads : bool, default True - Perform multi-threaded column reads. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Returns - ------- - pyarrow.table.Table - Content of the row group as a table (of columns) - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - >>> parquet_file.read_row_group(0) - pyarrow.Table - n_legs: int64 - animal: string - ---- - n_legs: [[2,2,4,4,5,100]] - animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] - """ - def read_row_groups( - self, - row_groups: list, - columns: list | None = None, - use_threads: bool = True, - use_pandas_metadata: bool = False, - ) -> Table: - """ - Read a multiple row groups from a Parquet file. - - Parameters - ---------- - row_groups : list - Only these row groups will be read from the file. - columns : list - If not None, only these columns will be read from the row group. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. - use_threads : bool, default True - Perform multi-threaded column reads. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Returns - ------- - pyarrow.table.Table - Content of the row groups as a table (of columns). - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - >>> parquet_file.read_row_groups([0, 0]) - pyarrow.Table - n_legs: int64 - animal: string - ---- - n_legs: [[2,2,4,4,5,...,2,4,4,5,100]] - animal: [["Flamingo","Parrot","Dog",...,"Brittle stars","Centipede"]] - """ - def iter_batches( - self, - batch_size: int = 65536, - row_groups: list | None = None, - columns: list | None = None, - use_threads: bool = True, - use_pandas_metadata: bool = False, - ) -> Iterator[RecordBatch]: - """ - Read streaming batches from a Parquet file. - - Parameters - ---------- - batch_size : int, default 64K - Maximum number of records to yield per batch. Batches may be - smaller if there aren't enough rows in the file. - row_groups : list - Only these row groups will be read from the file. - columns : list - If not None, only these columns will be read from the file. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. - use_threads : boolean, default True - Perform multi-threaded column reads. - use_pandas_metadata : boolean, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Yields - ------ - pyarrow.RecordBatch - Contents of each batch as a record batch - - Examples - -------- - Generate an example Parquet file: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - >>> for i in parquet_file.iter_batches(): - ... print("RecordBatch") - ... print(i.to_pandas()) - RecordBatch - n_legs animal - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - """ - def read( - self, - columns: list | None = None, - use_threads: bool = True, - use_pandas_metadata: bool = False, - ) -> Table: - """ - Read a Table from Parquet format. - - Parameters - ---------- - columns : list - If not None, only these columns will be read from the file. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. - use_threads : bool, default True - Perform multi-threaded column reads. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Returns - ------- - pyarrow.table.Table - Content of the file as a table (of columns). - - Examples - -------- - Generate an example Parquet file: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - Read a Table: - - >>> parquet_file.read(columns=["animal"]) - pyarrow.Table - animal: string - ---- - animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] - """ - def scan_contents(self, columns: list | None = None, batch_size: int = 65536) -> int: - """ - Read contents of file for the given columns and batch size. - - Notes - ----- - This function's primary purpose is benchmarking. - The scan is executed on a single thread. 
- - Parameters - ---------- - columns : list of integers, default None - Select columns to read, if None scan all columns. - batch_size : int, default 64K - Number of rows to read at a time internally. - - Returns - ------- - num_rows : int - Number of rows in file - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - >>> parquet_file.scan_contents() - 6 - """ - -class ParquetWriter: - """ - Class for incrementally building a Parquet file for Arrow tables. - - Parameters - ---------- - where : path or file-like object - schema : pyarrow.Schema - version : {"1.0", "2.4", "2.6"}, default "2.6" - Determine which Parquet logical types are available for use, whether the - reduced set from the Parquet 1.x.x format or the expanded logical types - added in later format versions. - Files written with version='2.4' or '2.6' may not be readable in all - Parquet implementations, so version='1.0' is likely the choice that - maximizes file compatibility. - UINT32 and some logical types are only available with version '2.4'. - Nanosecond timestamps are only available with version '2.6'. - Other features such as compression algorithms or the new serialized - data page format must be enabled separately (see 'compression' and - 'data_page_version'). - use_dictionary : bool or list, default True - Specify if we should use dictionary encoding in general or only for - some columns. - When encoding the column, if the dictionary size is too large, the - column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type - doesn't support dictionary encoding. - compression : str or dict, default 'snappy' - Specify the compression codec, either on a general basis or per-column. - Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. - write_statistics : bool or list, default True - Specify if we should write statistics in general (default is True) or only - for some columns. - use_deprecated_int96_timestamps : bool, default None - Write timestamps to INT96 Parquet format. Defaults to False unless enabled - by flavor argument. This take priority over the coerce_timestamps option. - coerce_timestamps : str, default None - Cast timestamps to a particular resolution. If omitted, defaults are chosen - depending on `version`. For ``version='1.0'`` and ``version='2.4'``, - nanoseconds are cast to microseconds ('us'), while for - ``version='2.6'`` (the default), they are written natively without loss - of resolution. Seconds are always cast to milliseconds ('ms') by default, - as Parquet does not have any temporal type with seconds resolution. - If the casting results in loss of data, it will raise an exception - unless ``allow_truncated_timestamps=True`` is given. - Valid values: {None, 'ms', 'us'} - allow_truncated_timestamps : bool, default False - Allow loss of data when coercing timestamps to a particular - resolution. E.g. if microsecond or nanosecond data is lost when coercing to - 'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True`` - will NOT result in the truncation exception being ignored unless - ``coerce_timestamps`` is not None. 
- data_page_size : int, default None - Set a target threshold for the approximate encoded size of data - pages within a column chunk (in bytes). If None, use the default data page - size of 1MByte. - flavor : {'spark'}, default None - Sanitize schema or set other compatibility options to work with - various target systems. - filesystem : FileSystem, default None - If nothing passed, will be inferred from `where` if path-like, else - `where` is already a file-like object so no filesystem is needed. - compression_level : int or dict, default None - Specify the compression level for a codec, either on a general basis or - per-column. If None is passed, arrow selects the compression level for - the compression codec in use. The compression level has a different - meaning for each codec, so you have to read the documentation of the - codec you are using. - An exception is thrown if the compression codec does not allow specifying - a compression level. - use_byte_stream_split : bool or list, default False - Specify if the byte_stream_split encoding should be used in general or - only for some columns. If both dictionary and byte_stream_stream are - enabled, then dictionary is preferred. - The byte_stream_split encoding is valid for integer, floating-point - and fixed-size binary data types (including decimals); it should be - combined with a compression codec so as to achieve size reduction. - column_encoding : string or dict, default None - Specify the encoding scheme on a per column basis. - Can only be used when ``use_dictionary`` is set to False, and - cannot be used in combination with ``use_byte_stream_split``. - Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', - 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. - Certain encodings are only compatible with certain data types. - Please refer to the encodings section of `Reading and writing Parquet - files `_. - data_page_version : {"1.0", "2.0"}, default "1.0" - The serialized Parquet data page format version to write, defaults to - 1.0. This does not impact the file schema logical types and Arrow to - Parquet type casting behavior; for that use the "version" option. - use_compliant_nested_type : bool, default True - Whether to write compliant Parquet nested type (lists) as defined - `here `_, defaults to ``True``. - For ``use_compliant_nested_type=True``, this will write into a list - with 3-level structure where the middle level, named ``list``, - is a repeated group with a single field named ``element``:: - - group (LIST) { - repeated group list { - element; - } - } - - For ``use_compliant_nested_type=False``, this will also write into a list - with 3-level structure, where the name of the single field of the middle - level ``list`` is taken from the element name for nested columns in Arrow, - which defaults to ``item``:: - - group (LIST) { - repeated group list { - item; - } - } - encryption_properties : FileEncryptionProperties, default None - File encryption properties for Parquet Modular Encryption. - If None, no encryption will be done. - The encryption properties can be created using: - ``CryptoFactory.file_encryption_properties()``. - write_batch_size : int, default None - Number of values to write to a page at a time. If None, use the default of - 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages - are exceeding the ``data_page_size`` due to large column values, lowering - the batch size can help keep page sizes closer to the intended size. 
- dictionary_pagesize_limit : int, default None - Specify the dictionary page size limit per row group. If None, use the - default 1MB. - store_schema : bool, default True - By default, the Arrow schema is serialized and stored in the Parquet - file metadata (in the "ARROW:schema" key). When reading the file, - if this key is available, it will be used to more faithfully recreate - the original Arrow data. For example, for tz-aware timestamp columns - it will restore the timezone (Parquet only stores the UTC values without - timezone), or columns with duration type will be restored from the int64 - Parquet column. - write_page_index : bool, default False - Whether to write a page index in general for all columns. - Writing statistics to the page index disables the old method of writing - statistics to each data page header. The page index makes statistics-based - filtering more efficient than the page header, as it gathers all the - statistics for a Parquet file in a single place, avoiding scattered I/O. - Note that the page index is not yet used on the read size by PyArrow. - write_page_checksum : bool, default False - Whether to write page checksums in general for all columns. - Page checksums enable detection of data corruption, which might occur during - transmission or in the storage. - sorting_columns : Sequence of SortingColumn, default None - Specify the sort order of the data being written. The writer does not sort - the data nor does it verify that the data is sorted. The sort order is - written to the row group metadata, which can then be used by readers. - store_decimal_as_integer : bool, default False - Allow decimals with 1 <= precision <= 18 to be stored as integers. - In Parquet, DECIMAL can be stored in any of the following physical types: - - int32: for 1 <= precision <= 9. - - int64: for 10 <= precision <= 18. - - fixed_len_byte_array: precision is limited by the array size. - Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. - - binary: precision is unlimited. The minimum number of bytes to store the - unscaled value is used. - - By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. - When enabled, the writer will use the following physical types to store decimals: - - int32: for 1 <= precision <= 9. - - int64: for 10 <= precision <= 18. - - fixed_len_byte_array: for precision > 18. - - As a consequence, decimal columns stored in integer types are more compact. - writer_engine_version : unused - **options : dict - If options contains a key `metadata_collector` then the - corresponding value is assumed to be a list (or any object with - `.append` method) that will be filled with the file metadata instance - of the written file. - - Examples - -------- - Generate an example PyArrow Table and RecordBatch: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.record_batch( - ... [ - ... [2, 2, 4, 4, 5, 100], - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... ], - ... names=["n_legs", "animal"], - ... 
) - - create a ParquetWriter object: - - >>> import pyarrow.parquet as pq - >>> writer = pq.ParquetWriter("example.parquet", table.schema) - - and write the Table into the Parquet file: - - >>> writer.write_table(table) - >>> writer.close() - - >>> pq.read_table("example.parquet").to_pandas() - n_legs animal - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - create a ParquetWriter object for the RecordBatch: - - >>> writer2 = pq.ParquetWriter("example2.parquet", batch.schema) - - and write the RecordBatch into the Parquet file: - - >>> writer2.write_batch(batch) - >>> writer2.close() - - >>> pq.read_table("example2.parquet").to_pandas() - n_legs animal - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - """ - - flavor: str - schema_changed: bool - schema: ParquetSchema - where: str | Path | IO - file_handler: NativeFile | None - writer: _parquet.ParquetWriter - is_open: bool - - def __init__( - self, - where: str | Path | IO | NativeFile, - schema: Schema, - filesystem: SupportedFileSystem | None = None, - flavor: str | None = None, - version: Literal["1.0", "2.4", "2.6"] = ..., - use_dictionary: bool = True, - compression: _Compression | dict[str, _Compression] = "snappy", - write_statistics: bool | list = True, - use_deprecated_int96_timestamps: bool | None = None, - compression_level: int | dict | None = None, - use_byte_stream_split: bool | list = False, - column_encoding: str | dict | None = None, - writer_engine_version=None, - data_page_version: Literal["1.0", "2.0"] = ..., - use_compliant_nested_type: bool = True, - encryption_properties: FileEncryptionProperties | None = None, - write_batch_size: int | None = None, - dictionary_pagesize_limit: int | None = None, - store_schema: bool = True, - write_page_index: bool = False, - write_page_checksum: bool = False, - sorting_columns: Sequence[SortingColumn] | None = None, - store_decimal_as_integer: bool = False, - **options, - ) -> None: ... - def __enter__(self) -> Self: ... - def __exit__(self, *args, **kwargs) -> Literal[False]: ... - def write( - self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None - ) -> None: - """ - Write RecordBatch or Table to the Parquet file. - - Parameters - ---------- - table_or_batch : {RecordBatch, Table} - row_group_size : int, default None - Maximum number of rows in each written row group. If None, - the row group size will be the minimum of the input - table or batch length and 1024 * 1024. - """ - def write_batch(self, batch: RecordBatch, row_group_size: int | None = None) -> None: - """ - Write RecordBatch to the Parquet file. - - Parameters - ---------- - batch : RecordBatch - row_group_size : int, default None - Maximum number of rows in written row group. If None, the - row group size will be the minimum of the RecordBatch - size and 1024 * 1024. If set larger than 64Mi then 64Mi - will be used instead. - """ - def write_table(self, table: Table, row_group_size: int | None = None) -> None: - """ - Write Table to the Parquet file. - - Parameters - ---------- - table : Table - row_group_size : int, default None - Maximum number of rows in each written row group. If None, - the row group size will be the minimum of the Table size - and 1024 * 1024. If set larger than 64Mi then 64Mi will - be used instead. - - """ - def close(self) -> None: - """ - Close the connection to the Parquet file. 
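Taken together, ``ParquetFile.iter_batches`` and the ``ParquetWriter`` write methods above allow a bounded-memory copy from one Parquet file to another. A small sketch under assumed file names (``big_input.parquet`` and ``recompressed.parquet`` are placeholders):

    import pyarrow.parquet as pq

    # Placeholder paths; any readable source and writable destination work.
    source = pq.ParquetFile("big_input.parquet")

    # Stream record batches into a new file without materializing the whole table.
    with pq.ParquetWriter("recompressed.parquet", source.schema_arrow,
                          compression="zstd") as writer:
        for batch in source.iter_batches(batch_size=64 * 1024):
            writer.write_batch(batch)

    source.close()

Writing batch by batch keeps peak memory roughly proportional to ``batch_size`` rather than to the size of the input file.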
- """ - def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: - """ - Add key-value metadata to the file. - This will overwrite any existing metadata with the same key. - - Parameters - ---------- - key_value_metadata : dict - Keys and values must be string-like / coercible to bytes. - """ - -class ParquetDataset: - """ - Encapsulates details of reading a complete Parquet dataset possibly - consisting of multiple files and partitions in subdirectories. - - Parameters - ---------- - path_or_paths : str or List[str] - A directory name, single file name, or list of file names. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - schema : pyarrow.parquet.Schema - Optionally provide the Schema for the Dataset, in which case it will - not be inferred from the source. - filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None - Rows which do not match the filter predicate will be removed from scanned - data. Partition keys embedded in a nested directory structure will be - exploited to avoid loading files at all if they contain no matching rows. - Within-file level filtering and different partitioning schemes are supported. - - Predicates are expressed using an ``Expression`` or using - the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. - DNF allows arbitrary boolean logical combinations of single column predicates. - The innermost tuples each describe a single column predicate. The list of inner - predicates is interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the most outer list combines these filters - as a disjunction (OR). - - Predicates may also be passed as List[Tuple]. This form is interpreted - as a single conjunction. To express OR in predicates, one must - use the (preferred) List[List[Tuple]] notation. - - Each tuple has format: (``key``, ``op``, ``value``) and compares the - ``key`` with the ``value``. - The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, - ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the - ``value`` must be a collection such as a ``list``, a ``set`` or a - ``tuple``. - - Examples: - - Using the ``Expression`` API: - - .. code-block:: python - - import pyarrow.compute as pc - pc.field('x') = 0 - pc.field('y').isin(['a', 'b', 'c']) - ~pc.field('y').isin({'a', 'b'}) - - Using the DNF format: - - .. code-block:: python - - ("x", "=", 0) - ("y", "in", ["a", "b", "c"]) - ("z", "not in", {"a", "b"}) - - - read_dictionary : list, default None - List of names or column paths (for nested types) to read directly - as DictionaryArray. Only supported for BYTE_ARRAY storage. To read - a flat column as dictionary-encoded pass the column name. For - nested types, you must pass the full column "path", which could be - something like level1.level2.list.item. Refer to the Parquet - file's schema to obtain the paths. - memory_map : bool, default False - If the source is a file path, use a memory map to read file, which can - improve performance in some environments. - buffer_size : int, default 0 - If positive, perform read buffering when deserializing individual - column chunks. Otherwise IO calls are unbuffered. 
- partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" - The partitioning scheme for a partitioned dataset. The default of "hive" - assumes directory names with key=value pairs like "/year=2009/month=11". - In addition, a scheme like "/2009/11" is also supported, in which case - you need to specify the field names or a full schema. See the - ``pyarrow.dataset.partitioning()`` function for more details. - ignore_prefixes : list, optional - Files matching any of these prefixes will be ignored by the - discovery process. - This is matched to the basename of a path. - By default this is ['.', '_']. - Note that discovery happens only if a directory is passed as source. - pre_buffer : bool, default True - Coalesce and issue file reads in parallel to improve performance on - high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a - background I/O thread pool. If using a filesystem layer that itself - performs readahead (e.g. fsspec's S3FS), disable readahead for best - results. Set to False if you want to prioritize minimal memory usage - over maximum speed. - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular resolution - (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 - timestamps will be inferred as timestamps in nanoseconds. - decryption_properties : FileDecryptionProperties or None - File-level decryption properties. - The decryption properties can be created using - ``CryptoFactory.file_decryption_properties()``. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - page_checksum_verification : bool, default False - If True, verify the page checksum for each page read from the file. - - Examples - -------- - Generate an example PyArrow Table and write it to a partitioned dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2", partition_cols=["year"]) - - create a ParquetDataset object from the dataset source: - - >>> dataset = pq.ParquetDataset("dataset_v2/") - - and read the data: - - >>> dataset.read().to_pandas() - n_legs animal year - 0 5 Brittle stars 2019 - 1 2 Flamingo 2020 - 2 4 Dog 2021 - 3 100 Centipede 2021 - 4 2 Parrot 2022 - 5 4 Horse 2022 - - create a ParquetDataset object with filter: - - >>> dataset = pq.ParquetDataset("dataset_v2/", filters=[("n_legs", "=", 4)]) - >>> dataset.read().to_pandas() - n_legs animal year - 0 4 Dog 2021 - 1 4 Horse 2022 - """ - def __init__( - self, - path_or_paths: SingleOrList[str] - | SingleOrList[Path] - | SingleOrList[NativeFile] - | SingleOrList[IO], - filesystem: SupportedFileSystem | None = None, - schema: Schema | None = None, - *, - filters: Expression | FilterTuple | list[FilterTuple] | None = None, - read_dictionary: list[str] | None = None, - memory_map: bool = False, - buffer_size: int = 0, - partitioning: str | list[str] | Partitioning | None = "hive", - ignore_prefixes: list[str] | None = None, - pre_buffer: bool = True, - coerce_int96_timestamp_unit: str | None = None, - decryption_properties: FileDecryptionProperties | None = None, - thrift_string_size_limit: int | None = None, - thrift_container_size_limit: int | None = None, - page_checksum_verification: bool = False, - ): ... - def equals(self, other: ParquetDataset) -> bool: ... - @property - def schema(self) -> Schema: - """ - Schema of the Dataset. - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2_schema", partition_cols=["year"]) - >>> dataset = pq.ParquetDataset("dataset_v2_schema/") - - Read the schema: - - >>> dataset.schema - n_legs: int64 - animal: string - year: dictionary - """ - def read( - self, - columns: list[str] | None = None, - use_threads: bool = True, - use_pandas_metadata: bool = False, - ) -> Table: - """ - Read (multiple) Parquet files as a single pyarrow.Table. - - Parameters - ---------- - columns : List[str] - Names of columns to read from the dataset. The partition fields - are not automatically included. - use_threads : bool, default True - Perform multi-threaded column reads. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns). - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2_read", partition_cols=["year"]) - >>> dataset = pq.ParquetDataset("dataset_v2_read/") - - Read the dataset: - - >>> dataset.read(columns=["n_legs"]) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[5],[2],[4,100],[2,4]] - """ - def read_pandas(self, **kwargs) -> Table: - """ - Read dataset including pandas metadata, if any. 
Other arguments passed - through to :func:`read`, see docstring for further details. - - Parameters - ---------- - **kwargs : optional - Additional options for :func:`read` - - Examples - -------- - Generate an example parquet file: - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "table_V2.parquet") - >>> dataset = pq.ParquetDataset("table_V2.parquet") - - Read the dataset with pandas metadata: - - >>> dataset.read_pandas(columns=["n_legs"]) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[2,2,4,4,5,100]] - - >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata - {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, ...} - """ - @property - def fragments(self) -> list[ParquetFileFragment]: - """ - A list of the Dataset source fragments or pieces with absolute - file paths. - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2_fragments", partition_cols=["year"]) - >>> dataset = pq.ParquetDataset("dataset_v2_fragments/") - - List the fragments: - - >>> dataset.fragments - [<pyarrow.dataset.ParquetFileFragment path=dataset_v2_fragments/... - """ - @property - def files(self) -> list[str]: - """ - A list of absolute Parquet file paths in the Dataset source. - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2_files", partition_cols=["year"]) - >>> dataset = pq.ParquetDataset("dataset_v2_files/") - - List the files: - - >>> dataset.files - ['dataset_v2_files/year=2019/...-0.parquet', ... - """ - @property - def filesystem(self) -> FileSystem: - """ - The filesystem type of the Dataset source. - """ - @property - def partitioning(self) -> Partitioning: - """ - The partitioning of the Dataset source, if discovered.
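The ``filters`` argument documented above accepts either a compute ``Expression`` or the DNF tuple form. A short illustrative sketch showing both, together with the ``files`` and ``read`` accessors; the dataset path and data are made up, and note that ``Expression`` comparisons are written with ``==``:

    import pyarrow as pa
    import pyarrow.compute as pc
    import pyarrow.parquet as pq

    # Illustrative partitioned dataset.
    table = pa.table({
        "year": [2020, 2022, 2021, 2022, 2019, 2021],
        "n_legs": [2, 2, 4, 4, 5, 100],
        "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"],
    })
    pq.write_to_dataset(table, root_path="dataset_filters_demo", partition_cols=["year"])

    # Equivalent filters: a compute Expression ...
    ds_expr = pq.ParquetDataset("dataset_filters_demo", filters=pc.field("n_legs") == 4)
    # ... or the DNF tuple form described above.
    ds_dnf = pq.ParquetDataset("dataset_filters_demo", filters=[("n_legs", "=", 4)])

    print(ds_expr.files)               # Parquet file paths backing the dataset
    print(ds_dnf.read().to_pydict())   # only rows with n_legs == 4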
- """ - -def read_table( - source: SingleOrList[str] | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO], - *, - columns: list | None = None, - use_threads: bool = True, - schema: Schema | None = None, - use_pandas_metadata: bool = False, - read_dictionary: list[str] | None = None, - memory_map: bool = False, - buffer_size: int = 0, - partitioning: str | list[str] | Partitioning | None = "hive", - filesystem: SupportedFileSystem | None = None, - filters: Expression | FilterTuple | list[FilterTuple] | None = None, - ignore_prefixes: list[str] | None = None, - pre_buffer: bool = True, - coerce_int96_timestamp_unit: str | None = None, - decryption_properties: FileDecryptionProperties | None = None, - thrift_string_size_limit: int | None = None, - thrift_container_size_limit: int | None = None, - page_checksum_verification: bool = False, -) -> Table: - """ - Read a Table from Parquet format - - Parameters - ---------- - source : str, pyarrow.NativeFile, or file-like object - If a string passed, can be a single file name or directory name. For - file-like objects, only read a single file. Use pyarrow.BufferReader to - read a file contained in a bytes or buffer-like object. - columns : list - If not None, only these columns will be read from the file. A column - name may be a prefix of a nested field, e.g. 'a' will select 'a.b', - 'a.c', and 'a.d.e'. If empty, no columns will be read. Note - that the table will still have the correct num_rows set despite having - no columns. - use_threads : bool, default True - Perform multi-threaded column reads. - schema : Schema, optional - Optionally provide the Schema for the parquet dataset, in which case it - will not be inferred from the source. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - read_dictionary : list, default None - List of names or column paths (for nested types) to read directly - as DictionaryArray. Only supported for BYTE_ARRAY storage. To read - a flat column as dictionary-encoded pass the column name. For - nested types, you must pass the full column "path", which could be - something like level1.level2.list.item. Refer to the Parquet - file's schema to obtain the paths. - memory_map : bool, default False - If the source is a file path, use a memory map to read file, which can - improve performance in some environments. - buffer_size : int, default 0 - If positive, perform read buffering when deserializing individual - column chunks. Otherwise IO calls are unbuffered. - partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" - The partitioning scheme for a partitioned dataset. The default of "hive" - assumes directory names with key=value pairs like "/year=2009/month=11". - In addition, a scheme like "/2009/11" is also supported, in which case - you need to specify the field names or a full schema. See the - ``pyarrow.dataset.partitioning()`` function for more details. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None - Rows which do not match the filter predicate will be removed from scanned - data. Partition keys embedded in a nested directory structure will be - exploited to avoid loading files at all if they contain no matching rows. 
- Within-file level filtering and different partitioning schemes are supported. - - Predicates are expressed using an ``Expression`` or using - the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. - DNF allows arbitrary boolean logical combinations of single column predicates. - The innermost tuples each describe a single column predicate. The list of inner - predicates is interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the most outer list combines these filters - as a disjunction (OR). - - Predicates may also be passed as List[Tuple]. This form is interpreted - as a single conjunction. To express OR in predicates, one must - use the (preferred) List[List[Tuple]] notation. - - Each tuple has format: (``key``, ``op``, ``value``) and compares the - ``key`` with the ``value``. - The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, - ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the - ``value`` must be a collection such as a ``list``, a ``set`` or a - ``tuple``. - - Examples: - - Using the ``Expression`` API: - - .. code-block:: python - - import pyarrow.compute as pc - pc.field('x') = 0 - pc.field('y').isin(['a', 'b', 'c']) - ~pc.field('y').isin({'a', 'b'}) - - Using the DNF format: - - .. code-block:: python - - ("x", "=", 0) - ("y", "in", ["a", "b", "c"]) - ("z", "not in", {"a", "b"}) - - - ignore_prefixes : list, optional - Files matching any of these prefixes will be ignored by the - discovery process. - This is matched to the basename of a path. - By default this is ['.', '_']. - Note that discovery happens only if a directory is passed as source. - pre_buffer : bool, default True - Coalesce and issue file reads in parallel to improve performance on - high-latency filesystems (e.g. S3). If True, Arrow will use a - background I/O thread pool. If using a filesystem layer that itself - performs readahead (e.g. fsspec's S3FS), disable readahead for best - results. - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular - resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' - and therefore INT96 timestamps will be inferred as timestamps - in nanoseconds. - decryption_properties : FileDecryptionProperties or None - File-level decryption properties. - The decryption properties can be created using - ``CryptoFactory.file_decryption_properties()``. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - page_checksum_verification : bool, default False - If True, verify the checksum for each page read from the file. - - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns) - - - Examples - -------- - - Generate an example PyArrow Table and write it to a partitioned dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_name_2", partition_cols=["year"]) - - Read the data: - - >>> pq.read_table("dataset_name_2").to_pandas() - n_legs animal year - 0 5 Brittle stars 2019 - 1 2 Flamingo 2020 - 2 4 Dog 2021 - 3 100 Centipede 2021 - 4 2 Parrot 2022 - 5 4 Horse 2022 - - - Read only a subset of columns: - - >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"]) - pyarrow.Table - n_legs: int64 - animal: string - ---- - n_legs: [[5],[2],[4,100],[2,4]] - animal: [["Brittle stars"],["Flamingo"],["Dog","Centipede"],["Parrot","Horse"]] - - Read a subset of columns and read one column as DictionaryArray: - - >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"], read_dictionary=["animal"]) - pyarrow.Table - n_legs: int64 - animal: dictionary - ---- - n_legs: [[5],[2],[4,100],[2,4]] - animal: [ -- dictionary: - ["Brittle stars"] -- indices: - [0], -- dictionary: - ["Flamingo"] -- indices: - [0], -- dictionary: - ["Dog","Centipede"] -- indices: - [0,1], -- dictionary: - ["Parrot","Horse"] -- indices: - [0,1]] - - Read the table with filter: - - >>> pq.read_table( - ... "dataset_name_2", columns=["n_legs", "animal"], filters=[("n_legs", "<", 4)] - ... ).to_pandas() - n_legs animal - 0 2 Flamingo - 1 2 Parrot - - Read data from a single Parquet file: - - >>> pq.write_table(table, "example.parquet") - >>> pq.read_table("dataset_name_2").to_pandas() - n_legs animal year - 0 5 Brittle stars 2019 - 1 2 Flamingo 2020 - 2 4 Dog 2021 - 3 100 Centipede 2021 - 4 2 Parrot 2022 - 5 4 Horse 2022 - """ - -def read_pandas( - source: str | Path | NativeFile | IO, columns: list | None = None, **kwargs -) -> Table: - """ - - Read a Table from Parquet format, also reading DataFrame - index values if known in the file metadata - - Parameters - ---------- - source : str, pyarrow.NativeFile, or file-like object - If a string passed, can be a single file name or directory name. For - file-like objects, only read a single file. Use pyarrow.BufferReader to - read a file contained in a bytes or buffer-like object. - columns : list - If not None, only these columns will be read from the file. A column - name may be a prefix of a nested field, e.g. 'a' will select 'a.b', - 'a.c', and 'a.d.e'. If empty, no columns will be read. Note - that the table will still have the correct num_rows set despite having - no columns. - use_threads : bool, default True - Perform multi-threaded column reads. - schema : Schema, optional - Optionally provide the Schema for the parquet dataset, in which case it - will not be inferred from the source. - read_dictionary : list, default None - List of names or column paths (for nested types) to read directly - as DictionaryArray. Only supported for BYTE_ARRAY storage. To read - a flat column as dictionary-encoded pass the column name. For - nested types, you must pass the full column "path", which could be - something like level1.level2.list.item. Refer to the Parquet - file's schema to obtain the paths. - memory_map : bool, default False - If the source is a file path, use a memory map to read file, which can - improve performance in some environments. - buffer_size : int, default 0 - If positive, perform read buffering when deserializing individual - column chunks. Otherwise IO calls are unbuffered. - partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" - The partitioning scheme for a partitioned dataset. 
The default of "hive" - assumes directory names with key=value pairs like "/year=2009/month=11". - In addition, a scheme like "/2009/11" is also supported, in which case - you need to specify the field names or a full schema. See the - ``pyarrow.dataset.partitioning()`` function for more details. - **kwargs - additional options for :func:`read_table` - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None - Rows which do not match the filter predicate will be removed from scanned - data. Partition keys embedded in a nested directory structure will be - exploited to avoid loading files at all if they contain no matching rows. - Within-file level filtering and different partitioning schemes are supported. - - Predicates are expressed using an ``Expression`` or using - the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. - DNF allows arbitrary boolean logical combinations of single column predicates. - The innermost tuples each describe a single column predicate. The list of inner - predicates is interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the most outer list combines these filters - as a disjunction (OR). - - Predicates may also be passed as List[Tuple]. This form is interpreted - as a single conjunction. To express OR in predicates, one must - use the (preferred) List[List[Tuple]] notation. - - Each tuple has format: (``key``, ``op``, ``value``) and compares the - ``key`` with the ``value``. - The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, - ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the - ``value`` must be a collection such as a ``list``, a ``set`` or a - ``tuple``. - - Examples: - - Using the ``Expression`` API: - - .. code-block:: python - - import pyarrow.compute as pc - pc.field('x') = 0 - pc.field('y').isin(['a', 'b', 'c']) - ~pc.field('y').isin({'a', 'b'}) - - Using the DNF format: - - .. code-block:: python - - ("x", "=", 0) - ("y", "in", ["a", "b", "c"]) - ("z", "not in", {"a", "b"}) - - - ignore_prefixes : list, optional - Files matching any of these prefixes will be ignored by the - discovery process. - This is matched to the basename of a path. - By default this is ['.', '_']. - Note that discovery happens only if a directory is passed as source. - pre_buffer : bool, default True - Coalesce and issue file reads in parallel to improve performance on - high-latency filesystems (e.g. S3). If True, Arrow will use a - background I/O thread pool. If using a filesystem layer that itself - performs readahead (e.g. fsspec's S3FS), disable readahead for best - results. - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular - resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' - and therefore INT96 timestamps will be inferred as timestamps - in nanoseconds. - decryption_properties : FileDecryptionProperties or None - File-level decryption properties. - The decryption properties can be created using - ``CryptoFactory.file_decryption_properties()``. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. 
The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - page_checksum_verification : bool, default False - If True, verify the checksum for each page read from the file. - - Returns - ------- - pyarrow.Table - Content of the file as a Table of Columns, including DataFrame - indexes as columns - """ - -def write_table( - table: Table, - where: str | Path | NativeFile | IO, - row_group_size: int | None = None, - version: Literal["1.0", "2.4", "2.6"] = "2.6", - use_dictionary: bool = True, - compression: _Compression | dict[str, _Compression] = "snappy", - write_statistics: bool | list = True, - use_deprecated_int96_timestamps: bool | None = None, - coerce_timestamps: str | None = None, - allow_truncated_timestamps: bool = False, - data_page_size: int | None = None, - flavor: str | None = None, - filesystem: SupportedFileSystem | None = None, - compression_level: int | dict | None = None, - use_byte_stream_split: bool = False, - column_encoding: str | dict | None = None, - data_page_version: Literal["1.0", "2.0"] = ..., - use_compliant_nested_type: bool = True, - encryption_properties: FileEncryptionProperties | None = None, - write_batch_size: int | None = None, - dictionary_pagesize_limit: int | None = None, - store_schema: bool = True, - write_page_index: bool = False, - write_page_checksum: bool = False, - sorting_columns: Sequence[SortingColumn] | None = None, - store_decimal_as_integer: bool = False, - **kwargs, -) -> None: - """ - - Write a Table to Parquet format. - - Parameters - ---------- - table : pyarrow.Table - where : string or pyarrow.NativeFile - row_group_size : int - Maximum number of rows in each written row group. If None, the - row group size will be the minimum of the Table size and - 1024 * 1024. - version : {"1.0", "2.4", "2.6"}, default "2.6" - Determine which Parquet logical types are available for use, whether the - reduced set from the Parquet 1.x.x format or the expanded logical types - added in later format versions. - Files written with version='2.4' or '2.6' may not be readable in all - Parquet implementations, so version='1.0' is likely the choice that - maximizes file compatibility. - UINT32 and some logical types are only available with version '2.4'. - Nanosecond timestamps are only available with version '2.6'. - Other features such as compression algorithms or the new serialized - data page format must be enabled separately (see 'compression' and - 'data_page_version'). - use_dictionary : bool or list, default True - Specify if we should use dictionary encoding in general or only for - some columns. - When encoding the column, if the dictionary size is too large, the - column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type - doesn't support dictionary encoding. - compression : str or dict, default 'snappy' - Specify the compression codec, either on a general basis or per-column. - Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. - write_statistics : bool or list, default True - Specify if we should write statistics in general (default is True) or only - for some columns. - use_deprecated_int96_timestamps : bool, default None - Write timestamps to INT96 Parquet format. Defaults to False unless enabled - by flavor argument. This take priority over the coerce_timestamps option. 
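As a short, hedged sketch of how the timestamp-related options above interact (the file names and sample data are made up): writing nanosecond data with ``version='1.0'`` casts it to microseconds, while coercing further to milliseconds drops sub-millisecond precision and therefore raises unless truncation is explicitly allowed:

.. code-block:: python

    import datetime

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table(
        {
            "ts": pa.array(
                [datetime.datetime(2024, 1, 1, 12, 0, 0, 123456)],
                type=pa.timestamp("ns"),
            )
        }
    )

    # version="1.0" has no nanosecond type, so values are cast to microseconds
    # (lossless here, since the sample value only has microsecond precision).
    pq.write_table(table, "ts_us.parquet", version="1.0")

    # Coercing to milliseconds truncates the 123456 microseconds, which raises
    # unless allow_truncated_timestamps=True is also passed.
    pq.write_table(
        table,
        "ts_ms.parquet",
        coerce_timestamps="ms",
        allow_truncated_timestamps=True,
    )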
- coerce_timestamps : str, default None - Cast timestamps to a particular resolution. If omitted, defaults are chosen - depending on `version`. For ``version='1.0'`` and ``version='2.4'``, - nanoseconds are cast to microseconds ('us'), while for - ``version='2.6'`` (the default), they are written natively without loss - of resolution. Seconds are always cast to milliseconds ('ms') by default, - as Parquet does not have any temporal type with seconds resolution. - If the casting results in loss of data, it will raise an exception - unless ``allow_truncated_timestamps=True`` is given. - Valid values: {None, 'ms', 'us'} - allow_truncated_timestamps : bool, default False - Allow loss of data when coercing timestamps to a particular - resolution. E.g. if microsecond or nanosecond data is lost when coercing to - 'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True`` - will NOT result in the truncation exception being ignored unless - ``coerce_timestamps`` is not None. - data_page_size : int, default None - Set a target threshold for the approximate encoded size of data - pages within a column chunk (in bytes). If None, use the default data page - size of 1MByte. - flavor : {'spark'}, default None - Sanitize schema or set other compatibility options to work with - various target systems. - filesystem : FileSystem, default None - If nothing passed, will be inferred from `where` if path-like, else - `where` is already a file-like object so no filesystem is needed. - compression_level : int or dict, default None - Specify the compression level for a codec, either on a general basis or - per-column. If None is passed, arrow selects the compression level for - the compression codec in use. The compression level has a different - meaning for each codec, so you have to read the documentation of the - codec you are using. - An exception is thrown if the compression codec does not allow specifying - a compression level. - use_byte_stream_split : bool or list, default False - Specify if the byte_stream_split encoding should be used in general or - only for some columns. If both dictionary and byte_stream_stream are - enabled, then dictionary is preferred. - The byte_stream_split encoding is valid for integer, floating-point - and fixed-size binary data types (including decimals); it should be - combined with a compression codec so as to achieve size reduction. - column_encoding : string or dict, default None - Specify the encoding scheme on a per column basis. - Can only be used when ``use_dictionary`` is set to False, and - cannot be used in combination with ``use_byte_stream_split``. - Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', - 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. - Certain encodings are only compatible with certain data types. - Please refer to the encodings section of `Reading and writing Parquet - files `_. - data_page_version : {"1.0", "2.0"}, default "1.0" - The serialized Parquet data page format version to write, defaults to - 1.0. This does not impact the file schema logical types and Arrow to - Parquet type casting behavior; for that use the "version" option. - use_compliant_nested_type : bool, default True - Whether to write compliant Parquet nested type (lists) as defined - `here `_, defaults to ``True``. 
- For ``use_compliant_nested_type=True``, this will write into a list - with 3-level structure where the middle level, named ``list``, - is a repeated group with a single field named ``element``:: - - group (LIST) { - repeated group list { - element; - } - } - - For ``use_compliant_nested_type=False``, this will also write into a list - with 3-level structure, where the name of the single field of the middle - level ``list`` is taken from the element name for nested columns in Arrow, - which defaults to ``item``:: - - group (LIST) { - repeated group list { - item; - } - } - encryption_properties : FileEncryptionProperties, default None - File encryption properties for Parquet Modular Encryption. - If None, no encryption will be done. - The encryption properties can be created using: - ``CryptoFactory.file_encryption_properties()``. - write_batch_size : int, default None - Number of values to write to a page at a time. If None, use the default of - 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages - are exceeding the ``data_page_size`` due to large column values, lowering - the batch size can help keep page sizes closer to the intended size. - dictionary_pagesize_limit : int, default None - Specify the dictionary page size limit per row group. If None, use the - default 1MB. - store_schema : bool, default True - By default, the Arrow schema is serialized and stored in the Parquet - file metadata (in the "ARROW:schema" key). When reading the file, - if this key is available, it will be used to more faithfully recreate - the original Arrow data. For example, for tz-aware timestamp columns - it will restore the timezone (Parquet only stores the UTC values without - timezone), or columns with duration type will be restored from the int64 - Parquet column. - write_page_index : bool, default False - Whether to write a page index in general for all columns. - Writing statistics to the page index disables the old method of writing - statistics to each data page header. The page index makes statistics-based - filtering more efficient than the page header, as it gathers all the - statistics for a Parquet file in a single place, avoiding scattered I/O. - Note that the page index is not yet used on the read size by PyArrow. - write_page_checksum : bool, default False - Whether to write page checksums in general for all columns. - Page checksums enable detection of data corruption, which might occur during - transmission or in the storage. - sorting_columns : Sequence of SortingColumn, default None - Specify the sort order of the data being written. The writer does not sort - the data nor does it verify that the data is sorted. The sort order is - written to the row group metadata, which can then be used by readers. - store_decimal_as_integer : bool, default False - Allow decimals with 1 <= precision <= 18 to be stored as integers. - In Parquet, DECIMAL can be stored in any of the following physical types: - - int32: for 1 <= precision <= 9. - - int64: for 10 <= precision <= 18. - - fixed_len_byte_array: precision is limited by the array size. - Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. - - binary: precision is unlimited. The minimum number of bytes to store the - unscaled value is used. - - By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. - When enabled, the writer will use the following physical types to store decimals: - - int32: for 1 <= precision <= 9. - - int64: for 10 <= precision <= 18. 
- - fixed_len_byte_array: for precision > 18. - - As a consequence, decimal columns stored in integer types are more compact. - - **kwargs : optional - Additional options for ParquetWriter - - Examples - -------- - Generate an example PyArrow Table: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - and write the Table into Parquet file: - - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - - Defining row group size for the Parquet file: - - >>> pq.write_table(table, "example.parquet", row_group_size=3) - - Defining row group compression (default is Snappy): - - >>> pq.write_table(table, "example.parquet", compression="none") - - Defining row group compression and encoding per-column: - - >>> pq.write_table( - ... table, - ... "example.parquet", - ... compression={"n_legs": "snappy", "animal": "gzip"}, - ... use_dictionary=["n_legs", "animal"], - ... ) - - Defining column encoding per-column: - - >>> pq.write_table( - ... table, "example.parquet", column_encoding={"animal": "PLAIN"}, use_dictionary=False - ... ) - """ - -def write_to_dataset( - table: Table, - root_path: str | Path, - partition_cols: list[str] | None = None, - filesystem: SupportedFileSystem | None = None, - schema: Schema | None = None, - partitioning: Partitioning | list[str] | None = None, - basename_template: str | None = None, - use_threads: bool | None = None, - file_visitor: Callable[[str], None] | None = None, - existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] - | None = None, - **kwargs, -) -> None: - """ - Wrapper around dataset.write_dataset for writing a Table to - Parquet format by partitions. - For each combination of partition columns and values, - a subdirectories are created in the following - manner: - - root_dir/ - group1=value1 - group2=value1 - .parquet - group2=value2 - .parquet - group1=valueN - group2=value1 - .parquet - group2=valueN - .parquet - - Parameters - ---------- - table : pyarrow.Table - root_path : str, pathlib.Path - The root directory of the dataset. - partition_cols : list, - Column names by which to partition the dataset. - Columns are partitioned in the order they are given. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - schema : Schema, optional - This Schema of the dataset. - partitioning : Partitioning or list[str], optional - The partitioning scheme specified with the - ``pyarrow.dataset.partitioning()`` function or a list of field names. - When providing a list of field names, you can use - ``partitioning_flavor`` to drive which partitioning type should be - used. - basename_template : str, optional - A template string used to generate basenames of written data files. - The token '{i}' will be replaced with an automatically incremented - integer. If not specified, it defaults to "guid-{i}.parquet". - use_threads : bool, default True - Write files in parallel. If enabled, then maximum parallelism will be - used determined by the number of available CPU cores. - file_visitor : function - If set, this function will be called with a WrittenFile instance - for each file created during the call. This object will have both - a path attribute and a metadata attribute. 
- - The path attribute will be a string containing the path to - the created file. - - The metadata attribute will be the parquet metadata of the file. - This metadata will have the file path attribute set and can be used - to build a _metadata file. The metadata attribute will be None if - the format is not parquet. - - Example visitor which simple collects the filenames created:: - - visited_paths = [] - - def file_visitor(written_file): - visited_paths.append(written_file.path) - - existing_data_behavior : 'overwrite_or_ignore' | 'error' | 'delete_matching' - Controls how the dataset will handle data that already exists in - the destination. The default behaviour is 'overwrite_or_ignore'. - - 'overwrite_or_ignore' will ignore any existing data and will - overwrite files with the same name as an output file. Other - existing files will be ignored. This behavior, in combination - with a unique basename_template for each write, will allow for - an append workflow. - - 'error' will raise an error if any data exists in the destination. - - 'delete_matching' is useful when you are writing a partitioned - dataset. The first time each partition directory is encountered - the entire directory will be deleted. This allows you to overwrite - old partitions completely. - **kwargs : dict, - Used as additional kwargs for :func:`pyarrow.dataset.write_dataset` - function for matching kwargs, and remainder to - :func:`pyarrow.dataset.ParquetFileFormat.make_write_options`. - See the docstring of :func:`write_table` and - :func:`pyarrow.dataset.write_dataset` for the available options. - Using `metadata_collector` in kwargs allows one to collect the - file metadata instances of dataset pieces. The file paths in the - ColumnChunkMetaData will be set relative to `root_path`. - - Examples - -------- - Generate an example PyArrow Table: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - and write it to a partitioned dataset: - - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_name_3", partition_cols=["year"]) - >>> pq.ParquetDataset("dataset_name_3").files - ['dataset_name_3/year=2019/...-0.parquet', ... - - Write a single Parquet file into the root folder: - - >>> pq.write_to_dataset(table, root_path="dataset_name_4") - >>> pq.ParquetDataset("dataset_name_4/").files - ['dataset_name_4/...-0.parquet'] - """ - -def write_metadata( - schema: Schema, - where: str | NativeFile, - metadata_collector: list[FileMetaData] | None = None, - filesystem: SupportedFileSystem | None = None, - **kwargs, -) -> None: - """ - Write metadata-only Parquet file from schema. This can be used with - `write_to_dataset` to generate `_common_metadata` and `_metadata` sidecar - files. - - Parameters - ---------- - schema : pyarrow.Schema - where : string or pyarrow.NativeFile - metadata_collector : list - where to collect metadata information. - filesystem : FileSystem, default None - If nothing passed, will be inferred from `where` if path-like, else - `where` is already a file-like object so no filesystem is needed. - **kwargs : dict, - Additional kwargs for ParquetWriter class. See docstring for - `ParquetWriter` for more information. - - Examples - -------- - Generate example data: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... 
"animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - Write a dataset and collect metadata information. - - >>> metadata_collector = [] - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, "dataset_metadata", metadata_collector=metadata_collector) - - Write the `_common_metadata` parquet file without row groups statistics. - - >>> pq.write_metadata(table.schema, "dataset_metadata/_common_metadata") - - Write the `_metadata` parquet file with row groups statistics. - - >>> pq.write_metadata( - ... table.schema, "dataset_metadata/_metadata", metadata_collector=metadata_collector - ... ) - """ - -def read_metadata( - where: str | Path | IO | NativeFile, - memory_map: bool = False, - decryption_properties: FileDecryptionProperties | None = None, - filesystem: SupportedFileSystem | None = None, -) -> FileMetaData: - """ - Read FileMetaData from footer of a single Parquet file. - - Parameters - ---------- - where : str (file path) or file-like object - memory_map : bool, default False - Create memory map when the source is a file path. - decryption_properties : FileDecryptionProperties, default None - Decryption properties for reading encrypted Parquet files. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - - Returns - ------- - metadata : FileMetaData - The metadata of the Parquet file - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.parquet as pq - >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) - >>> pq.write_table(table, "example.parquet") - - >>> pq.read_metadata("example.parquet") - - created_by: parquet-cpp-arrow version ... - num_columns: 2 - num_rows: 3 - num_row_groups: 1 - format_version: 2.6 - serialized_size: ... - """ - -def read_schema( - where: str | Path | IO | NativeFile, - memory_map: bool = False, - decryption_properties: FileDecryptionProperties | None = None, - filesystem: SupportedFileSystem | None = None, -) -> Schema: - """ - Read effective Arrow schema from Parquet file metadata. - - Parameters - ---------- - where : str (file path) or file-like object - memory_map : bool, default False - Create memory map when the source is a file path. - decryption_properties : FileDecryptionProperties, default None - Decryption properties for reading encrypted Parquet files. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - - Returns - ------- - schema : pyarrow.Schema - The schema of the Parquet file - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.parquet as pq - >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) - >>> pq.write_table(table, "example.parquet") - - >>> pq.read_schema("example.parquet") - n_legs: int64 - animal: string - """ diff --git a/python/pyarrow/parquet/encryption.pyi b/python/pyarrow/parquet/encryption.pyi deleted file mode 100644 index fe9a454e593..00000000000 --- a/python/pyarrow/parquet/encryption.pyi +++ /dev/null @@ -1,32 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from pyarrow._parquet_encryption import ( - CryptoFactory, - DecryptionConfiguration, - EncryptionConfiguration, - KmsClient, - KmsConnectionConfig, -) - -__all__ = [ - "CryptoFactory", - "DecryptionConfiguration", - "EncryptionConfiguration", - "KmsClient", - "KmsConnectionConfig", -] From 73e3e3a2959fbd638f62862639672d328e9198c1 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 12 Sep 2025 15:46:24 +0200 Subject: [PATCH 10/26] Fix --- .github/workflows/python.yml | 11 +- dev/update_stub_docstrings.py | 206 +++++++++++++++++++--------------- python/pyarrow/cuda.py | 25 +++++ 3 files changed, 150 insertions(+), 92 deletions(-) create mode 100644 python/pyarrow/cuda.py diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index a4aa53e5cdc..8630dab7e93 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -138,10 +138,15 @@ jobs: continue-on-error: true run: archery docker push ${{ matrix.image }} - - name: Type check with pyright + - name: Type check with mypy and pyright run: |- - python -m pip install pyright - pushd python; python -m pyright + python -m pip install mypy pyright scipy-stubs pandas-stubs types-python-dateutil types-requests griffe libcst + pushd python; + pip install -e . + python -m mypy pyarrow/*.pyi pyarrow/__lib_pxi/*.pyi pyarrow/tests/test_array.py pyarrow/tests/test_io.py + python -m pyright pyarrow/*.pyi pyarrow/__lib_pxi/*.pyi + python ../dev/update_stub_docstrings.py -f ./pyarrow + git status --porcelain=1 macos: name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} Python 3 diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py index 72db8b0d000..17f7e8e1aa1 100644 --- a/dev/update_stub_docstrings.py +++ b/dev/update_stub_docstrings.py @@ -1,118 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + # Utility to extract docstrings from pyarrow and update # docstrings in stubfiles. 
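As a standalone illustration of the extraction half of this utility (the member path ``lib.Table.slice`` is just an example), griffe can load the installed pyarrow package and expose runtime docstrings by dotted path:

.. code-block:: python

    import griffe

    # force_inspection imports the compiled package so that docstrings defined
    # in Cython modules are visible to griffe.
    package = griffe.load(
        "pyarrow", try_relative_path=True, force_inspection=True, resolve_aliases=True
    )
    print(package.get_member("lib.Table.slice").docstring.value)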
# # Usage # ===== # -# python ../dev/update_stub_docstrings.py -s ./pyarrow/compute.pyi +# python ../dev/update_stub_docstrings.py -f ./pyarrow/ -import os from pathlib import Path from textwrap import indent import click +# TODO: perhaps replace griffe with importlib import griffe -import libcst as cst - -docstrings_map = {} - - -def extract_docstrings(pckg, path=""): - if "filepath" in pckg and pckg["filepath"].endswith(".pyi"): - return - if "docstring" in pckg: - docstrings_map[path] = pckg["docstring"].value - - for name, pckg in pckg.get("members", {}).items(): - extract_docstrings(pckg, path=f"{path}.{name}") - - -def _is_docstring_node(node): - """Checks if a node is a docstring.""" - return ( - isinstance(node, cst.SimpleStatementLine) and - isinstance(node.body[0], cst.Expr) and - isinstance(node.body[0].value, cst.SimpleString) - ) - - -class ClonedSignatureDocstringTransformer(cst.CSTTransformer): - def __init__(self, docstrings_map, module_name): - self.docstrings_map = docstrings_map - self.module_name = module_name - self.name_of_function = None - - def leave_Assign(self, original_node, updated_node): - target = original_node.targets[0].target - value = original_node.value - - if isinstance(target, cst.Name) and isinstance(value, cst.Call) and \ - value.func.value == "_clone_signature": - self.name_of_function = f"{self.module_name}.{target.value}" +import libcst + + +class DocUpdater(libcst.CSTTransformer): + def __init__(self, package, namespace): + self.stack = [namespace] if namespace else [] + self._docstring = None + self.indentation = 0 + self.package = package + + def _get_docstring(self, name): + # print("extract_docstrings", name) + try: + obj = self.package.get_member(name) + except KeyError: + # Some cython __init__ symbols can't be found + # e.g. pyarrow.lib.OSFile.__init__ + parent_name = ".".join(self.stack[:-1]) + + try: + obj = self.package.get_member(parent_name).all_members[self.stack[-1]] + except KeyError: + # print(f"{name} not found in {self.package.name}, it's probably ok.") + return None + + if obj.has_docstring: + docstring = obj.docstring.value + # remove signature if present in docstring + if docstring.startswith(obj.name) or ( + (hasattr(obj.parent, "name") and + docstring.startswith(f"{obj.parent.name}.{obj.name}"))): + return "\n".join(docstring.splitlines()[2:]) + else: + return docstring + return None + + def visit_ClassDef(self, node): + # TODO: class docstrings? 
+ self.stack.append(node.name.value) + self.indentation += 1 + node_name = ".".join(self.stack) + docstring = self._get_docstring(node_name) + + if docstring: + if not node.get_docstring(clean=False): + print("Missing docstring (in annotations) for:", node_name) + return False + self._docstring = f'"""{node.get_docstring(clean=False)}"""' + return True + return False + + def visit_FunctionDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + node_name = ".".join(self.stack) + docstring = self._get_docstring(node_name) + + if docstring: + if not node.get_docstring(clean=False): + print("Missing docstring (in annotations) for:", node_name) + return False + self._docstring = f'"""{node.get_docstring(clean=False)}"""' + return True + return False + + def leave_ClassDef(self, original_node, updated_node): + self.stack.pop() + self.indentation -= 1 return updated_node - def leave_SimpleStatementLine(self, original_node, updated_node): - if self.name_of_function: - if len(updated_node.body) > 0 and _is_docstring_node(updated_node): - comment_content = self.docstrings_map[self.name_of_function].strip() - self.name_of_function = None - - new_string_node = cst.SimpleString(value=f'"""\n{comment_content}\n"""') - new_expr_node = updated_node.body[0].with_changes(value=new_string_node) - new_body = [new_expr_node] + list(updated_node.body[1:]) - updated_node = updated_node.with_changes(body=new_body) - + def leave_FunctionDef(self, original_node, updated_node): + self.stack.pop() + self.indentation -= 1 return updated_node + def leave_SimpleString(self, original_node, updated_node): + node_name = ".".join(self.stack) -class FunctionDocstringTransformer(cst.CSTTransformer): - def __init__(self, docstrings_map, module_name): - self.docstrings_map = docstrings_map - self.module_name = module_name - - def leave_FunctionDef(self, original_node, updated_node): - full_name = f"{self.module_name}.{original_node.name.value}" - - # Check if we have a docstring for this function - if full_name in self.docstrings_map: - # Check if the function already has a docstring - body_list = list(updated_node.body.body) - has_docstring = len(body_list) > 0 and _is_docstring_node(body_list[0]) - - if has_docstring: - # Replace existing docstring - docstring = indent(self.docstrings_map[full_name], " ").strip() - docstring_value = f'"""\n {docstring}\n """' - new_docstring_node = cst.SimpleStatementLine( - body=[cst.Expr(value=cst.SimpleString(value=docstring_value))] - ) - new_body = [new_docstring_node] + body_list[1:] - return updated_node.with_changes( - body=updated_node.body.with_changes(body=new_body) - ) + if original_node.value == self._docstring: + indentation = self.indentation * " " + indented_docstring = indent(self._get_docstring(node_name), indentation) + docstring = f'"""\n{indented_docstring}\n{indentation}"""' + return updated_node.with_changes(value=docstring) return updated_node + @click.command() -@click.option('--stub_file', '-s', type=click.Path(resolve_path=True)) -def update_stub_file(stub_file): - package = griffe.load("pyarrow", try_relative_path=False, force_inspection=True, resolve_aliases=True) - extract_docstrings(package.as_dict(), "pyarrow") +@click.option('--pyarrow_folder', '-f', type=click.Path(resolve_path=True)) +def update_stub_files(pyarrow_folder): + print("Updating docstrings of stub files in:", pyarrow_folder) + package = griffe.load("pyarrow", try_relative_path=True, + force_inspection=True, resolve_aliases=True) - with open(stub_file, 'r') as f: - tree = 
cst.parse_module(f.read()) + for stub_file in Path(pyarrow_folder).rglob('*.pyi'): + if stub_file.name == "_stubs_typing.pyi": + continue - cloned_signature_transformer = ClonedSignatureDocstringTransformer(docstrings_map, "pyarrow.compute") - function_docstring_transformer = FunctionDocstringTransformer(docstrings_map, "pyarrow.compute") + print(f"[{stub_file}]") - modified_tree = tree.visit(function_docstring_transformer) - modified_tree = modified_tree.visit(cloned_signature_transformer) + with open(stub_file, 'r') as f: + tree = libcst.parse_module(f.read()) + if stub_file.name != "__init__.pyi": + modified_tree = tree.visit(DocUpdater(package, "lib")) + else: + modified_tree = tree.visit(DocUpdater(package, None)) + with open(stub_file, "w") as f: + f.write(modified_tree.code) - # Write the modified code - with open(stub_file, "w") as f: - f.write(modified_tree.code) if __name__ == "__main__": - update_stub_file(obj={}) + docstrings_map = {} + update_stub_files(obj={}) diff --git a/python/pyarrow/cuda.py b/python/pyarrow/cuda.py new file mode 100644 index 00000000000..18c530d4afe --- /dev/null +++ b/python/pyarrow/cuda.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# flake8: noqa + + +from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer, + HostBuffer, BufferReader, BufferWriter, + new_host_buffer, + serialize_record_batch, read_message, + read_record_batch) From 1c05a04550e6b41dac5781dc66eda2211d6e1399 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 15 Sep 2025 19:21:05 +0200 Subject: [PATCH 11/26] work --- .../{pyarrow => pyarrow-stubs}/__init__.pyi | 0 .../_stubs_typing.pyi | 0 .../types.pyi => pyarrow-stubs/_types.pyi} | 5 + .../__lib_pxi => pyarrow-stubs}/array.pyi | 111 ++-- .../__lib_pxi => pyarrow-stubs}/io.pyi | 0 python/{pyarrow => pyarrow-stubs}/lib.pyi | 28 +- .../__lib_pxi => pyarrow-stubs}/memory.pyi | 0 python/pyarrow-stubs/py.typed | 0 .../__lib_pxi => pyarrow-stubs}/scalar.pyi | 22 +- .../__lib_pxi => pyarrow-stubs}/tensor.pyi | 0 python/pyarrow-stubs/types.pyi | 214 ++++++ python/pyarrow/__lib_pxi/__init__.pyi | 16 - python/pyarrow/types.pyi | 611 ------------------ 13 files changed, 308 insertions(+), 699 deletions(-) rename python/{pyarrow => pyarrow-stubs}/__init__.pyi (100%) rename python/{pyarrow => pyarrow-stubs}/_stubs_typing.pyi (100%) rename python/{pyarrow/__lib_pxi/types.pyi => pyarrow-stubs/_types.pyi} (99%) rename python/{pyarrow/__lib_pxi => pyarrow-stubs}/array.pyi (96%) rename python/{pyarrow/__lib_pxi => pyarrow-stubs}/io.pyi (100%) rename python/{pyarrow => pyarrow-stubs}/lib.pyi (90%) rename python/{pyarrow/__lib_pxi => pyarrow-stubs}/memory.pyi (100%) create mode 100644 python/pyarrow-stubs/py.typed rename python/{pyarrow/__lib_pxi => pyarrow-stubs}/scalar.pyi (97%) rename python/{pyarrow/__lib_pxi => pyarrow-stubs}/tensor.pyi (100%) create mode 100644 python/pyarrow-stubs/types.pyi delete mode 100644 python/pyarrow/__lib_pxi/__init__.pyi delete mode 100644 python/pyarrow/types.pyi diff --git a/python/pyarrow/__init__.pyi b/python/pyarrow-stubs/__init__.pyi similarity index 100% rename from python/pyarrow/__init__.pyi rename to python/pyarrow-stubs/__init__.pyi diff --git a/python/pyarrow/_stubs_typing.pyi b/python/pyarrow-stubs/_stubs_typing.pyi similarity index 100% rename from python/pyarrow/_stubs_typing.pyi rename to python/pyarrow-stubs/_stubs_typing.pyi diff --git a/python/pyarrow/__lib_pxi/types.pyi b/python/pyarrow-stubs/_types.pyi similarity index 99% rename from python/pyarrow/__lib_pxi/types.pyi rename to python/pyarrow-stubs/_types.pyi index 27a2c75d68d..0c8afe2cbbb 100644 --- a/python/pyarrow/__lib_pxi/types.pyi +++ b/python/pyarrow-stubs/_types.pyi @@ -4282,4 +4282,9 @@ __all__ = [ "type_for_alias", "schema", "from_numpy_dtype", + "_Unit", + "_Tz", + "_Time32Unit", + "_Time64Unit", + ] diff --git a/python/pyarrow/__lib_pxi/array.pyi b/python/pyarrow-stubs/array.pyi similarity index 96% rename from python/pyarrow/__lib_pxi/array.pyi rename to python/pyarrow-stubs/array.pyi index c6e8dfecb62..c01f167029e 100644 --- a/python/pyarrow/__lib_pxi/array.pyi +++ b/python/pyarrow-stubs/array.pyi @@ -55,13 +55,20 @@ from pyarrow.lib import ( # type: ignore[attr-defined] ) from typing_extensions import deprecated -from . 
import scalar, types +from .scalar import * from .device import DeviceAllocationType # type: ignore[import-not-found] -from .scalar import Scalar -from .types import ( +from ._types import ( + BaseExtensionType, + BinaryType, DataType, Field, + Float64Type, + Int16Type, + Int32Type, + Int64Type, MapType, + StringType, + StructType, _AsPyType, _BasicDataType, _BasicValueT, @@ -69,8 +76,12 @@ from .types import ( _IndexT, _RunEndType, _Size, + _Time32Unit, + _Time64Unit, + _Tz, + _Unit, ) -from .._stubs_typing import NullableCollection +from ._stubs_typing import NullableCollection def array( values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, @@ -963,7 +974,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): end: int | None = None, *, memory_pool: MemoryPool | None = None, - ) -> scalar.Int64Scalar | scalar.Int64Scalar: + ) -> Int64Scalar: """ Find the first index of a value. @@ -1270,12 +1281,12 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): Statistics of the array. """ -class NullArray(Array[scalar.NullScalar]): +class NullArray(Array[NullScalar]): """ Concrete class for Arrow arrays of null data type. """ -class BooleanArray(Array[scalar.BooleanScalar]): +class BooleanArray(Array[BooleanScalar]): """ Concrete class for Arrow arrays of boolean data type. """ @@ -1296,79 +1307,79 @@ class FloatingPointArray(NumericArray[_ScalarT]): """ A base class for Arrow floating-point arrays. """ -class Int8Array(IntegerArray[scalar.Int8Scalar]): +class Int8Array(IntegerArray[Int8Scalar]): """ Concrete class for Arrow arrays of int8 data type. """ -class UInt8Array(IntegerArray[scalar.UInt8Scalar]): +class UInt8Array(IntegerArray[UInt8Scalar]): """ Concrete class for Arrow arrays of uint8 data type. """ -class Int16Array(IntegerArray[scalar.Int16Scalar]): +class Int16Array(IntegerArray[Int16Scalar]): """ Concrete class for Arrow arrays of int16 data type. """ -class UInt16Array(IntegerArray[scalar.UInt16Scalar]): +class UInt16Array(IntegerArray[UInt16Scalar]): """ Concrete class for Arrow arrays of uint16 data type. """ -class Int32Array(IntegerArray[scalar.Int32Scalar]): +class Int32Array(IntegerArray[Int32Scalar]): """ Concrete class for Arrow arrays of int32 data type. """ -class UInt32Array(IntegerArray[scalar.UInt32Scalar]): +class UInt32Array(IntegerArray[UInt32Scalar]): """ Concrete class for Arrow arrays of uint32 data type. """ -class Int64Array(IntegerArray[scalar.Int64Scalar]): +class Int64Array(IntegerArray[Int64Scalar]): """ Concrete class for Arrow arrays of int64 data type. """ -class UInt64Array(IntegerArray[scalar.UInt64Scalar]): +class UInt64Array(IntegerArray[UInt64Scalar]): """ Concrete class for Arrow arrays of uint64 data type. """ -class Date32Array(NumericArray[scalar.Date32Scalar]): +class Date32Array(NumericArray[Date32Scalar]): """ Concrete class for Arrow arrays of date32 data type. """ -class Date64Array(NumericArray[scalar.Date64Scalar]): +class Date64Array(NumericArray[Date64Scalar]): """ Concrete class for Arrow arrays of date64 data type. """ -class TimestampArray(NumericArray[scalar.TimestampScalar[types._Unit, types._Tz]]): +class TimestampArray(NumericArray[TimestampScalar[_Unit, _Tz]]): """ Concrete class for Arrow arrays of timestamp data type. """ -class Time32Array(NumericArray[scalar.Time32Scalar[types._Time32Unit]]): +class Time32Array(NumericArray[Time32Scalar[_Time32Unit]]): """ Concrete class for Arrow arrays of time32 data type. 
""" -class Time64Array(NumericArray[scalar.Time64Scalar[types._Time64Unit]]): +class Time64Array(NumericArray[Time64Scalar[_Time64Unit]]): """ Concrete class for Arrow arrays of time64 data type. """ -class DurationArray(NumericArray[scalar.DurationScalar[types._Unit]]): +class DurationArray(NumericArray[DurationScalar[_Unit]]): """ Concrete class for Arrow arrays of duration data type. """ -class MonthDayNanoIntervalArray(Array[scalar.MonthDayNanoIntervalScalar]): +class MonthDayNanoIntervalArray(Array[MonthDayNanoIntervalScalar]): """ Concrete class for Arrow arrays of interval[MonthDayNano] type. """ -class HalfFloatArray(FloatingPointArray[scalar.HalfFloatScalar]): +class HalfFloatArray(FloatingPointArray[HalfFloatScalar]): """ Concrete class for Arrow arrays of float16 data type. """ -class FloatArray(FloatingPointArray[scalar.FloatScalar]): +class FloatArray(FloatingPointArray[FloatScalar]): """ Concrete class for Arrow arrays of float32 data type. """ -class DoubleArray(FloatingPointArray[scalar.DoubleScalar]): +class DoubleArray(FloatingPointArray[DoubleScalar]): """ Concrete class for Arrow arrays of float64 data type. """ -class FixedSizeBinaryArray(Array[scalar.FixedSizeBinaryScalar]): +class FixedSizeBinaryArray(Array[FixedSizeBinaryScalar]): """ Concrete class for Arrow arrays of a fixed-size binary data type. """ @@ -1533,7 +1544,7 @@ class ListArray(BaseListArray[_ScalarT]): type: _DataTypeT | None = None, pool: MemoryPool | None = None, mask: Mask | None = None, - ) -> ListArray[scalar.ListScalar[_DataTypeT | types.Int64Type | types.Float64Type | types.StringType | types.BinaryType]] | ListArray: + ) -> ListArray[ListScalar[_DataTypeT | Int64Type | Float64Type | StringType | BinaryType]] | ListArray: """ Construct ListArray from arrays of int32 offsets and values. @@ -1679,7 +1690,7 @@ class ListArray(BaseListArray[_ScalarT]): ] """ -class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): +class LargeListArray(BaseListArray[LargeListScalar[_DataTypeT]]): """ Concrete class for Arrow arrays of a large list data type. @@ -1797,7 +1808,7 @@ class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): offsets : Int64Array """ -class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): +class ListViewArray(BaseListArray[ListViewScalar[_DataTypeT]]): """ Concrete class for Arrow arrays of a list view data type. """ @@ -1992,7 +2003,7 @@ class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): ] """ -class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): +class LargeListViewArray(BaseListArray[LargeListScalar[_DataTypeT]]): """ Concrete class for Arrow arrays of a large list view data type. @@ -2196,7 +2207,7 @@ class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): ] """ -class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _Size]]): +class FixedSizeListArray(BaseListArray[FixedSizeListScalar[_DataTypeT, _Size]]): """ Concrete class for Arrow arrays of a fixed size list data type. """ @@ -2267,7 +2278,7 @@ class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _S ] """ @property - def values(self) -> BaseListArray[scalar.ListScalar[_DataTypeT]]: + def values(self) -> BaseListArray[ListScalar[_DataTypeT]]: """ Return the underlying array of values which backs the FixedSizeListArray ignoring the array's offset. 
@@ -2307,7 +2318,7 @@ class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _S _MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) _MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) -class MapArray(BaseListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): +class MapArray(BaseListArray[MapScalar[_MapKeyT, _MapItemT]]): """ Concrete class for Arrow arrays of a map data type. """ @@ -2425,7 +2436,7 @@ class MapArray(BaseListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): Flattened array of items across all maps in array """ -class UnionArray(Array[scalar.UnionScalar]): +class UnionArray(Array[UnionScalar]): """ Concrete class for Arrow arrays of a Union data type. """ @@ -2522,7 +2533,7 @@ class UnionArray(Array[scalar.UnionScalar]): union_array : UnionArray """ -class StringArray(Array[scalar.StringScalar]): +class StringArray(Array[StringScalar]): """ Concrete class for Arrow arrays of string (or utf8) data type. """ @@ -2554,7 +2565,7 @@ class StringArray(Array[scalar.StringScalar]): string_array : StringArray """ -class LargeStringArray(Array[scalar.LargeStringScalar]): +class LargeStringArray(Array[LargeStringScalar]): """ Concrete class for Arrow arrays of large string (or utf8) data type. """ @@ -2586,12 +2597,12 @@ class LargeStringArray(Array[scalar.LargeStringScalar]): string_array : StringArray """ -class StringViewArray(Array[scalar.StringViewScalar]): +class StringViewArray(Array[StringViewScalar]): """ Concrete class for Arrow arrays of string (or utf8) view data type. """ -class BinaryArray(Array[scalar.BinaryScalar]): +class BinaryArray(Array[BinaryScalar]): """ Concrete class for Arrow arrays of variable-sized binary data type. """ @@ -2602,7 +2613,7 @@ class BinaryArray(Array[scalar.BinaryScalar]): by the offsets of this BinaryArray. """ -class LargeBinaryArray(Array[scalar.LargeBinaryScalar]): +class LargeBinaryArray(Array[LargeBinaryScalar]): """ Concrete class for Arrow arrays of large variable-sized binary data type. """ @@ -2613,12 +2624,12 @@ class LargeBinaryArray(Array[scalar.LargeBinaryScalar]): by the offsets of this LargeBinaryArray. """ -class BinaryViewArray(Array[scalar.BinaryViewScalar]): +class BinaryViewArray(Array[BinaryViewScalar]): """ Concrete class for Arrow arrays of variable-sized binary view data type. """ -class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): +class DictionaryArray(Array[DictionaryScalar[_IndexT, _BasicValueT]]): """ Concrete class for dictionary-encoded Arrow arrays. """ @@ -2700,7 +2711,7 @@ class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): dict_array : DictionaryArray """ -class StructArray(Array[scalar.StructScalar]): +class StructArray(Array[StructScalar]): """ Concrete class for Arrow arrays of a struct data type. """ @@ -2737,7 +2748,7 @@ class StructArray(Array[scalar.StructScalar]): fields: list[Field] | None = None, mask=None, memory_pool: MemoryPool | None = None, - type: types.StructType | None = None, + type: StructType | None = None, ) -> StructArray: """ Construct StructArray from collection of arrays representing @@ -2784,7 +2795,7 @@ class StructArray(Array[scalar.StructScalar]): result : StructArray """ -class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT]]): +class RunEndEncodedArray(Array[RunEndEncodedScalar[_RunEndType, _BasicValueT]]): """ Concrete class for Arrow run-end encoded arrays. 
""" @@ -2793,7 +2804,7 @@ class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicVal run_ends: Int16Array | Int32Array | Int64Array, values: Array, type: DataType | None = None, - ) -> RunEndEncodedArray[types.Int16Type | types.Int32Type | types.Int64Type, _BasicValueT]: # type: ignore[type-var] + ) -> RunEndEncodedArray[Int16Type | Int32Type | Int64Type, _BasicValueT]: # type: ignore[type-var] """ Construct RunEndEncodedArray from run_ends and values arrays. @@ -2849,14 +2860,14 @@ class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicVal RunEndEncodedArray """ @property - def run_ends(self) -> Array[scalar.Scalar[_RunEndType]]: + def run_ends(self) -> Array[Scalar[_RunEndType]]: """ An array holding the logical indexes of each run-end. The physical offset to the array is applied. """ @property - def values(self) -> Array[scalar.Scalar[_BasicValueT]]: + def values(self) -> Array[Scalar[_BasicValueT]]: """ An array holding the values of each run. @@ -2884,14 +2895,14 @@ class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicVal _ArrayT = TypeVar("_ArrayT", bound=Array) -class ExtensionArray(Array[scalar.ExtensionScalar], Generic[_ArrayT]): +class ExtensionArray(Array[ExtensionScalar], Generic[_ArrayT]): """ Concrete class for Arrow extension arrays. """ @property def storage(self) -> Any: ... @staticmethod - def from_storage(typ: types.BaseExtensionType, storage: _ArrayT) -> ExtensionArray[_ArrayT]: + def from_storage(typ: BaseExtensionType, storage: _ArrayT) -> ExtensionArray[_ArrayT]: """ Construct ExtensionArray from type and storage array. @@ -3232,7 +3243,7 @@ def concat_arrays(arrays: Iterable[_ArrayT], memory_pool: MemoryPool | None = No ] """ -def _empty_array(type: _DataTypeT) -> Array[scalar.Scalar[_DataTypeT]]: +def _empty_array(type: _DataTypeT) -> Array[Scalar[_DataTypeT]]: """ Create empty array of the given type. 
""" diff --git a/python/pyarrow/__lib_pxi/io.pyi b/python/pyarrow-stubs/io.pyi similarity index 100% rename from python/pyarrow/__lib_pxi/io.pyi rename to python/pyarrow-stubs/io.pyi diff --git a/python/pyarrow/lib.pyi b/python/pyarrow-stubs/lib.pyi similarity index 90% rename from python/pyarrow/lib.pyi rename to python/pyarrow-stubs/lib.pyi index 9d5bd7bedb2..a1a016ef2f2 100644 --- a/python/pyarrow/lib.pyi +++ b/python/pyarrow-stubs/lib.pyi @@ -21,22 +21,22 @@ import datetime as dt from typing import NamedTuple, Literal from typing_extensions import TypeVar -from .__lib_pxi.array import * +from .array import * # TODO -# from .__lib_pxi.benchmark import * -# from .__lib_pxi.builder import * -# from .__lib_pxi.compat import * -# from .__lib_pxi.config import * -# from .__lib_pxi.device import * -# from .__lib_pxi.error import * -from .__lib_pxi.io import * +# from .benchmark import * +# from .builder import * +# from .compat import * +# from .config import * +# from .device import * +# from .error import * +from .io import * # from .__lib_pxi.ipc import * -from .__lib_pxi.memory import * -# from .__lib_pxi.pandas_shim import * -from .__lib_pxi.scalar import * -# from .__lib_pxi.table import * -from .__lib_pxi.tensor import * -from .__lib_pxi.types import * +from .memory import * +# from .pandas_shim import * +from .scalar import * +# from .table import * +from .tensor import * +from ._types import * _DataTypeT = TypeVar("_DataTypeT", bound=DataType) diff --git a/python/pyarrow/__lib_pxi/memory.pyi b/python/pyarrow-stubs/memory.pyi similarity index 100% rename from python/pyarrow/__lib_pxi/memory.pyi rename to python/pyarrow-stubs/memory.pyi diff --git a/python/pyarrow-stubs/py.typed b/python/pyarrow-stubs/py.typed new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pyarrow/__lib_pxi/scalar.pyi b/python/pyarrow-stubs/scalar.pyi similarity index 97% rename from python/pyarrow/__lib_pxi/scalar.pyi rename to python/pyarrow-stubs/scalar.pyi index b979ec43a3a..2532026e7bc 100644 --- a/python/pyarrow/__lib_pxi/scalar.pyi +++ b/python/pyarrow-stubs/scalar.pyi @@ -35,19 +35,25 @@ from pyarrow._compute import CastOptions # type: ignore[import-not-found] from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable from typing_extensions import TypeVar -from . import types -from .types import ( +from ._types import ( # _AsPyType, _DataTypeT, _Time32Unit, _Time64Unit, _Tz, _Unit, + DataType, + ListType, + LargeListType, + ListViewType, + LargeListViewType, + FixedSizeListType, ) +from . import types _AsPyTypeK = TypeVar("_AsPyTypeK") _AsPyTypeV = TypeVar("_AsPyTypeV") -_DataType_co = TypeVar("_DataType_co", bound=types.DataType, covariant=True) +_DataType_co = TypeVar("_DataType_co", bound=DataType, covariant=True) class Scalar(_Weakrefable, Generic[_DataType_co]): """ @@ -330,7 +336,7 @@ class StringViewScalar(Scalar[types.StringViewType]): Return a view over this value as a Buffer object. """ -class ListScalar(Scalar[types.ListType[_DataTypeT]]): +class ListScalar(Scalar[ListType[_DataTypeT]]): """ Concrete class for list-like scalars. """ @@ -349,7 +355,7 @@ class ListScalar(Scalar[types.ListType[_DataTypeT]]): Iterate over this element's values. 
""" -class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataTypeT, types._Size]]): +class FixedSizeListScalar(Scalar[FixedSizeListType[_DataTypeT, types._Size]]): """ """ @property @@ -373,7 +379,7 @@ class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataTypeT, types._Size Iterate over this element's values. """ -class LargeListScalar(Scalar[types.LargeListType[_DataTypeT]]): +class LargeListScalar(Scalar[LargeListType[_DataTypeT]]): """ """ @property @@ -397,7 +403,7 @@ class LargeListScalar(Scalar[types.LargeListType[_DataTypeT]]): Iterate over this element's values. """ -class ListViewScalar(Scalar[types.ListViewType[_DataTypeT]]): +class ListViewScalar(Scalar[ListViewType[_DataTypeT]]): """ """ @property @@ -421,7 +427,7 @@ class ListViewScalar(Scalar[types.ListViewType[_DataTypeT]]): Iterate over this element's values. """ -class LargeListViewScalar(Scalar[types.LargeListViewType[_DataTypeT]]): +class LargeListViewScalar(Scalar[LargeListViewType[_DataTypeT]]): """ """ @property diff --git a/python/pyarrow/__lib_pxi/tensor.pyi b/python/pyarrow-stubs/tensor.pyi similarity index 100% rename from python/pyarrow/__lib_pxi/tensor.pyi rename to python/pyarrow-stubs/tensor.pyi diff --git a/python/pyarrow-stubs/types.pyi b/python/pyarrow-stubs/types.pyi new file mode 100644 index 00000000000..98181f6acc2 --- /dev/null +++ b/python/pyarrow-stubs/types.pyi @@ -0,0 +1,214 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import sys + +from typing import Any + +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from pyarrow.lib import ( + BinaryType, + BinaryViewType, + BoolType, + DataType, + Date32Type, + Date64Type, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + DenseUnionType, + DictionaryType, + DurationType, + FixedSizeBinaryType, + FixedSizeListType, + Float16Type, + Float32Type, + Float64Type, + Int8Type, + Int16Type, + Int32Type, + Int64Type, + LargeBinaryType, + LargeListType, + LargeListViewType, + LargeStringType, + ListType, + ListViewType, + MapType, + MonthDayNanoIntervalType, + NullType, + RunEndEncodedType, + SparseUnionType, + StringType, + StringViewType, + StructType, + Time32Type, + Time64Type, + TimestampType, + UInt8Type, + UInt16Type, + Uint32Type, + UInt64Type, +) + +_SignedInteger: TypeAlias = Int8Type | Int16Type | Int32Type | Int64Type +_UnsignedInteger: TypeAlias = UInt8Type | UInt16Type | Uint32Type | UInt64Type +_Integer: TypeAlias = _SignedInteger | _UnsignedInteger +_Floating: TypeAlias = Float16Type | Float32Type | Float64Type +_Decimal: TypeAlias = ( + Decimal32Type[Any, Any] + | Decimal64Type[Any, Any] + | Decimal128Type[Any, Any] + | Decimal256Type[Any, Any] +) +_Date: TypeAlias = Date32Type | Date64Type +_Time: TypeAlias = Time32Type[Any] | Time64Type[Any] +_Interval: TypeAlias = MonthDayNanoIntervalType +_Temporal: TypeAlias = TimestampType[Any, Any] | DurationType[Any] | _Time | _Date | _Interval +_Union: TypeAlias = SparseUnionType | DenseUnionType +_Nested: TypeAlias = ( + ListType[Any] + | FixedSizeListType[Any, Any] + | LargeListType[Any] + | ListViewType[Any] + | LargeListViewType[Any] + | StructType + | MapType[Any, Any, Any] + | _Union +) + +def is_null(t: DataType) -> TypeIs[NullType]: ... +def is_boolean(t: DataType) -> TypeIs[BoolType]: ... +def is_integer(t: DataType) -> TypeIs[_Integer]: ... +def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: ... +def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: ... +def is_int8(t: DataType) -> TypeIs[Int8Type]: ... +def is_int16(t: DataType) -> TypeIs[Int16Type]: ... +def is_int32(t: DataType) -> TypeIs[Int32Type]: ... +def is_int64(t: DataType) -> TypeIs[Int64Type]: ... +def is_uint8(t: DataType) -> TypeIs[UInt8Type]: ... +def is_uint16(t: DataType) -> TypeIs[UInt16Type]: ... +def is_uint32(t: DataType) -> TypeIs[Uint32Type]: ... +def is_uint64(t: DataType) -> TypeIs[UInt64Type]: ... +def is_floating(t: DataType) -> TypeIs[_Floating]: ... +def is_float16(t: DataType) -> TypeIs[Float16Type]: ... +def is_float32(t: DataType) -> TypeIs[Float32Type]: ... +def is_float64(t: DataType) -> TypeIs[Float64Type]: ... +def is_list(t: DataType) -> TypeIs[ListType[Any]]: ... +def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: ... +def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: ... +def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: ... +def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: ... +def is_struct(t: DataType) -> TypeIs[StructType]: ... +def is_union(t: DataType) -> TypeIs[_Union]: ... +def is_nested(t: DataType) -> TypeIs[_Nested]: ... +def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: ... +def is_temporal(t: DataType) -> TypeIs[_Temporal]: ... 
+def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: ... +def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: ... +def is_time(t: DataType) -> TypeIs[_Time]: ... +def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: ... +def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: ... +def is_binary(t: DataType) -> TypeIs[BinaryType]: ... +def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: ... +def is_unicode(t: DataType) -> TypeIs[StringType]: ... +def is_string(t: DataType) -> TypeIs[StringType]: ... +def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: ... +def is_large_string(t: DataType) -> TypeIs[LargeStringType]: ... +def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: ... +def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: ... +def is_string_view(t: DataType) -> TypeIs[StringViewType]: ... +def is_date(t: DataType) -> TypeIs[_Date]: ... +def is_date32(t: DataType) -> TypeIs[Date32Type]: ... +def is_date64(t: DataType) -> TypeIs[Date64Type]: ... +def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: ... +def is_decimal(t: DataType) -> TypeIs[_Decimal]: ... +def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: ... +def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: ... +def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: ... +def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: ... +def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: ... +def is_interval(t: DataType) -> TypeIs[_Interval]: ... +def is_primitive(t: DataType) -> bool: ... +def is_boolean_value(obj: Any) -> bool: ... +def is_integer_value(obj: Any) -> bool: ... +def is_float_value(obj: Any) -> bool: ... + +__all__ = [ + "is_binary", + "is_binary_view", + "is_boolean", + "is_date", + "is_date32", + "is_date64", + "is_decimal", + "is_decimal128", + "is_decimal256", + "is_decimal32", + "is_decimal64", + "is_dictionary", + "is_duration", + "is_fixed_size_binary", + "is_fixed_size_list", + "is_float16", + "is_float32", + "is_float64", + "is_floating", + "is_int16", + "is_int32", + "is_int64", + "is_int8", + "is_integer", + "is_interval", + "is_large_binary", + "is_large_list", + "is_large_list_view", + "is_large_string", + "is_large_unicode", + "is_list", + "is_list_view", + "is_map", + "is_nested", + "is_null", + "is_primitive", + "is_run_end_encoded", + "is_signed_integer", + "is_string", + "is_string_view", + "is_struct", + "is_temporal", + "is_time", + "is_time32", + "is_time64", + "is_timestamp", + "is_uint16", + "is_uint32", + "is_uint64", + "is_uint8", + "is_unicode", + "is_union", + "is_unsigned_integer", +] diff --git a/python/pyarrow/__lib_pxi/__init__.pyi b/python/pyarrow/__lib_pxi/__init__.pyi deleted file mode 100644 index 13a83393a91..00000000000 --- a/python/pyarrow/__lib_pxi/__init__.pyi +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. diff --git a/python/pyarrow/types.pyi b/python/pyarrow/types.pyi deleted file mode 100644 index 1d1554da520..00000000000 --- a/python/pyarrow/types.pyi +++ /dev/null @@ -1,611 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import sys - -from typing import Any - -if sys.version_info >= (3, 13): - from typing import TypeIs -else: - from typing_extensions import TypeIs -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias - -from pyarrow.lib import ( - BinaryType, - BinaryViewType, - BoolType, - DataType, - Date32Type, - Date64Type, - Decimal32Type, - Decimal64Type, - Decimal128Type, - Decimal256Type, - DenseUnionType, - DictionaryType, - DurationType, - FixedSizeBinaryType, - FixedSizeListType, - Float16Type, - Float32Type, - Float64Type, - Int8Type, - Int16Type, - Int32Type, - Int64Type, - LargeBinaryType, - LargeListType, - LargeListViewType, - LargeStringType, - ListType, - ListViewType, - MapType, - MonthDayNanoIntervalType, - NullType, - RunEndEncodedType, - SparseUnionType, - StringType, - StringViewType, - StructType, - Time32Type, - Time64Type, - TimestampType, - UInt8Type, - UInt16Type, - Uint32Type, - UInt64Type, -) - -_SignedInteger: TypeAlias = Int8Type | Int16Type | Int32Type | Int64Type -_UnsignedInteger: TypeAlias = UInt8Type | UInt16Type | Uint32Type | UInt64Type -_Integer: TypeAlias = _SignedInteger | _UnsignedInteger -_Floating: TypeAlias = Float16Type | Float32Type | Float64Type -_Decimal: TypeAlias = ( - Decimal32Type[Any, Any] - | Decimal64Type[Any, Any] - | Decimal128Type[Any, Any] - | Decimal256Type[Any, Any] -) -_Date: TypeAlias = Date32Type | Date64Type -_Time: TypeAlias = Time32Type[Any] | Time64Type[Any] -_Interval: TypeAlias = MonthDayNanoIntervalType -_Temporal: TypeAlias = TimestampType[Any, Any] | DurationType[Any] | _Time | _Date | _Interval -_Union: TypeAlias = SparseUnionType | DenseUnionType -_Nested: TypeAlias = ( - ListType[Any] - | FixedSizeListType[Any, Any] - | LargeListType[Any] - | ListViewType[Any] - | LargeListViewType[Any] - | StructType - | MapType[Any, Any, Any] - | _Union -) - -def is_null(t: DataType) -> TypeIs[NullType]: - """ - Return True if value is an instance of type: null. 
- - Parameters - ---------- - t : DataType - """ -def is_boolean(t: DataType) -> TypeIs[BoolType]: - """ - Return True if value is an instance of type: boolean. - - Parameters - ---------- - t : DataType - """ -def is_integer(t: DataType) -> TypeIs[_Integer]: - """ - Return True if value is an instance of type: any integer. - - Parameters - ---------- - t : DataType - """ -def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: - """ - Return True if value is an instance of type: signed integer. - - Parameters - ---------- - t : DataType - """ -def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: - """ - Return True if value is an instance of type: unsigned integer. - - Parameters - ---------- - t : DataType - """ -def is_int8(t: DataType) -> TypeIs[Int8Type]: - """ - Return True if value is an instance of type: int8. - - Parameters - ---------- - t : DataType - """ -def is_int16(t: DataType) -> TypeIs[Int16Type]: - """ - Return True if value is an instance of type: int16. - - Parameters - ---------- - t : DataType - """ -def is_int32(t: DataType) -> TypeIs[Int32Type]: - """ - Return True if value is an instance of type: int32. - - Parameters - ---------- - t : DataType - """ -def is_int64(t: DataType) -> TypeIs[Int64Type]: - """ - Return True if value is an instance of type: int64. - - Parameters - ---------- - t : DataType - """ -def is_uint8(t: DataType) -> TypeIs[UInt8Type]: - """ - Return True if value is an instance of type: uint8. - - Parameters - ---------- - t : DataType - """ -def is_uint16(t: DataType) -> TypeIs[UInt16Type]: - """ - Return True if value is an instance of type: uint16. - - Parameters - ---------- - t : DataType - """ -def is_uint32(t: DataType) -> TypeIs[Uint32Type]: - """ - Return True if value is an instance of type: uint32. - - Parameters - ---------- - t : DataType - """ -def is_uint64(t: DataType) -> TypeIs[UInt64Type]: - """ - Return True if value is an instance of type: uint64. - - Parameters - ---------- - t : DataType - """ -def is_floating(t: DataType) -> TypeIs[_Floating]: - """ - Return True if value is an instance of type: floating point numeric. - - Parameters - ---------- - t : DataType - """ -def is_float16(t: DataType) -> TypeIs[Float16Type]: - """ - Return True if value is an instance of type: float16 (half-precision). - - Parameters - ---------- - t : DataType - """ -def is_float32(t: DataType) -> TypeIs[Float32Type]: - """ - Return True if value is an instance of type: float32 (single precision). - - Parameters - ---------- - t : DataType - """ -def is_float64(t: DataType) -> TypeIs[Float64Type]: - """ - Return True if value is an instance of type: float64 (double precision). - - Parameters - ---------- - t : DataType - """ -def is_list(t: DataType) -> TypeIs[ListType[Any]]: - """ - Return True if value is an instance of type: list. - - Parameters - ---------- - t : DataType - """ -def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: - """ - Return True if value is an instance of type: large list. - - Parameters - ---------- - t : DataType - """ -def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: - """ - Return True if value is an instance of type: fixed size list. - - Parameters - ---------- - t : DataType - """ -def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: - """ - Return True if value is an instance of type: list view. 
- - Parameters - ---------- - t : DataType - """ -def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: - """ - Return True if value is an instance of type: large list view. - - Parameters - ---------- - t : DataType - """ -def is_struct(t: DataType) -> TypeIs[StructType]: - """ - Return True if value is an instance of type: struct. - - Parameters - ---------- - t : DataType - """ -def is_union(t: DataType) -> TypeIs[_Union]: - """ - Return True if value is an instance of type: union. - - Parameters - ---------- - t : DataType - """ -def is_nested(t: DataType) -> TypeIs[_Nested]: - """ - Return True if value is an instance of type: nested type. - - Parameters - ---------- - t : DataType - """ -def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: - """ - Return True if value is an instance of type: run-end encoded. - - Parameters - ---------- - t : DataType - """ -def is_temporal(t: DataType) -> TypeIs[_Temporal]: - """ - Return True if value is an instance of type: date, time, timestamp or duration. - - Parameters - ---------- - t : DataType - """ -def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: - """ - Return True if value is an instance of type: timestamp. - - Parameters - ---------- - t : DataType - """ -def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: - """ - Return True if value is an instance of type: duration. - - Parameters - ---------- - t : DataType - """ -def is_time(t: DataType) -> TypeIs[_Time]: - """ - Return True if value is an instance of type: time. - - Parameters - ---------- - t : DataType - """ -def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: - """ - Return True if value is an instance of type: time32. - - Parameters - ---------- - t : DataType - """ -def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: - """ - Return True if value is an instance of type: time64. - - Parameters - ---------- - t : DataType - """ -def is_binary(t: DataType) -> TypeIs[BinaryType]: - """ - Return True if value is an instance of type: variable-length binary. - - Parameters - ---------- - t : DataType - """ -def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: - """ - Return True if value is an instance of type: large variable-length binary. - - Parameters - ---------- - t : DataType - """ -def is_unicode(t: DataType) -> TypeIs[StringType]: - """ - Alias for is_string. - - Parameters - ---------- - t : DataType - """ -def is_string(t: DataType) -> TypeIs[StringType]: - """ - Return True if value is an instance of type: string (utf8 unicode). - - Parameters - ---------- - t : DataType - """ -def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: - """ - Alias for is_large_string. - - Parameters - ---------- - t : DataType - """ -def is_large_string(t: DataType) -> TypeIs[LargeStringType]: - """ - Return True if value is an instance of type: large string (utf8 unicode). - - Parameters - ---------- - t : DataType - """ -def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: - """ - Return True if value is an instance of type: fixed size binary. - - Parameters - ---------- - t : DataType - """ -def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: - """ - Return True if value is an instance of type: variable-length binary view. - - Parameters - ---------- - t : DataType - """ -def is_string_view(t: DataType) -> TypeIs[StringViewType]: - """ - Return True if value is an instance of type: variable-length string (utf-8) view. 
- - Parameters - ---------- - t : DataType - """ -def is_date(t: DataType) -> TypeIs[_Date]: - """ - Return True if value is an instance of type: date. - - Parameters - ---------- - t : DataType - """ -def is_date32(t: DataType) -> TypeIs[Date32Type]: - """ - Return True if value is an instance of type: date32 (days). - - Parameters - ---------- - t : DataType - """ -def is_date64(t: DataType) -> TypeIs[Date64Type]: - """ - Return True if value is an instance of type: date64 (milliseconds). - - Parameters - ---------- - t : DataType - """ -def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: - """ - Return True if value is an instance of type: map. - - Parameters - ---------- - t : DataType - """ -def is_decimal(t: DataType) -> TypeIs[_Decimal]: - """ - Return True if value is an instance of type: decimal. - - Parameters - ---------- - t : DataType - """ -def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: - """ - Return True if value is an instance of type: decimal32. - - Parameters - ---------- - t : DataType - """ -def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: - """ - Return True if value is an instance of type: decimal64. - - Parameters - ---------- - t : DataType - """ -def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: - """ - Return True if value is an instance of type: decimal128. - - Parameters - ---------- - t : DataType - """ -def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: - """ - Return True if value is an instance of type: decimal256. - - Parameters - ---------- - t : DataType - """ -def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: - """ - Return True if value is an instance of type: dictionary-encoded. - - Parameters - ---------- - t : DataType - """ -def is_interval(t: DataType) -> TypeIs[_Interval]: - """ - Return True if value is an instance of type: interval. - - Parameters - ---------- - t : DataType - """ -def is_primitive(t: DataType) -> bool: - """ - Return True if value is an instance of type: primitive type. - - Parameters - ---------- - t : DataType - """ -def is_boolean_value(obj: Any) -> bool: - """ - Check if the object is a boolean. - - Parameters - ---------- - obj : object - The object to check - """ - -def is_integer_value(obj: Any) -> bool: - """ - Check if the object is an integer. - - Parameters - ---------- - obj : object - The object to check - """ - -def is_float_value(obj: Any) -> bool: - """ - Check if the object is a float. 
- - Parameters - ---------- - obj : object - The object to check - """ - -__all__ = [ - "is_binary", - "is_binary_view", - "is_boolean", - "is_date", - "is_date32", - "is_date64", - "is_decimal", - "is_decimal128", - "is_decimal256", - "is_decimal32", - "is_decimal64", - "is_dictionary", - "is_duration", - "is_fixed_size_binary", - "is_fixed_size_list", - "is_float16", - "is_float32", - "is_float64", - "is_floating", - "is_int16", - "is_int32", - "is_int64", - "is_int8", - "is_integer", - "is_interval", - "is_large_binary", - "is_large_list", - "is_large_list_view", - "is_large_string", - "is_large_unicode", - "is_list", - "is_list_view", - "is_map", - "is_nested", - "is_null", - "is_primitive", - "is_run_end_encoded", - "is_signed_integer", - "is_string", - "is_string_view", - "is_struct", - "is_temporal", - "is_time", - "is_time32", - "is_time64", - "is_timestamp", - "is_uint16", - "is_uint32", - "is_uint64", - "is_uint8", - "is_unicode", - "is_union", - "is_unsigned_integer", -] From 6dacfe1fc43361eb0922a061cfde71eb635d42db Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 15 Sep 2025 22:01:18 +0200 Subject: [PATCH 12/26] work --- python/pyarrow-stubs/_types.pyi | 8 +-- python/pyarrow-stubs/array.pyi | 5 +- python/pyarrow-stubs/io.pyi | 11 ++-- python/pyarrow-stubs/lib.pyi | 1 - python/pyarrow-stubs/py.typed | 16 +++++ python/pyarrow-stubs/scalar.pyi | 113 +++++++++++++++++--------------- python/pyarrow-stubs/tensor.pyi | 1 + 7 files changed, 92 insertions(+), 63 deletions(-) diff --git a/python/pyarrow-stubs/_types.pyi b/python/pyarrow-stubs/_types.pyi index 0c8afe2cbbb..32543d4b04b 100644 --- a/python/pyarrow-stubs/_types.pyi +++ b/python/pyarrow-stubs/_types.pyi @@ -46,9 +46,6 @@ from typing_extensions import TypeVar, deprecated from .io import Buffer from .scalar import ExtensionScalar -_AsPyType = TypeVar("_AsPyType") -_DataTypeT = TypeVar("_DataTypeT", bound=DataType) - class _Weakrefable: ... class _Metadata(_Weakrefable): ... @@ -207,6 +204,9 @@ class DataType(_Weakrefable): ArrowSchema pointer. """ +_AsPyType = TypeVar("_AsPyType") +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) + class _BasicDataType(DataType, Generic[_AsPyType]): ... class NullType(_BasicDataType[None]): ... class BoolType(_BasicDataType[bool]): ... @@ -4286,5 +4286,5 @@ __all__ = [ "_Tz", "_Time32Unit", "_Time64Unit", - + "_DataTypeT", ] diff --git a/python/pyarrow-stubs/array.pyi b/python/pyarrow-stubs/array.pyi index c01f167029e..fcd9ec8f135 100644 --- a/python/pyarrow-stubs/array.pyi +++ b/python/pyarrow-stubs/array.pyi @@ -54,6 +54,7 @@ from pyarrow.lib import ( # type: ignore[attr-defined] _Weakrefable, ) from typing_extensions import deprecated +import builtins from .scalar import * from .device import DeviceAllocationType # type: ignore[import-not-found] @@ -891,7 +892,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): result : Array A new array with nulls replaced by the given value. 
""" - def __getitem__(self, key: int | slice) -> _Scalar_co | Self: + def __getitem__(self, key: int | builtins.slice) -> _Scalar_co | Self: """ Slice or return value at given index @@ -2488,7 +2489,7 @@ class UnionArray(Array[UnionScalar]): """ @staticmethod def from_dense( - type: Int8Array, + types: Int8Array, value_offsets: Int32Array, children: NullableCollection[Array], field_names: list[str] | None = None, diff --git a/python/pyarrow-stubs/io.pyi b/python/pyarrow-stubs/io.pyi index ebcfa8c470b..b8404225e18 100644 --- a/python/pyarrow-stubs/io.pyi +++ b/python/pyarrow-stubs/io.pyi @@ -32,12 +32,13 @@ else: from typing_extensions import TypeAlias from typing import Any, Literal, SupportsIndex +import builtins from pyarrow._stubs_typing import Compression, SupportPyBuffer from pyarrow.lib import MemoryPool, _Weakrefable # from .device import Device, DeviceAllocationType, MemoryManager -from .types import KeyValueMetadata +from ._types import KeyValueMetadata def have_libhdfs() -> bool: """ @@ -205,7 +206,7 @@ class NativeFile(_Weakrefable): ------- stream : NativeFile """ - def read_at(self) -> bytes: + def read_at(self, nbytes: int, offset: int) -> bytes: """ Read indicated number of bytes at offset from the file @@ -218,7 +219,7 @@ class NativeFile(_Weakrefable): ------- data : bytes """ - def read1(self) -> bytes: + def read1(self, nbytes: int | None = None) -> bytes: """ Read and return up to n bytes. @@ -324,6 +325,8 @@ class NativeFile(_Weakrefable): The buffer size to use for data transfers. """ + def writable(self): ... + # ---------------------------------------------------------------------- # Python file-like objects @@ -632,7 +635,7 @@ class Buffer(_Weakrefable): # """ @property def parent(self) -> Buffer | None: ... - def __getitem__(self, key: slice | int) -> Self | int: + def __getitem__(self, key: builtins.slice | int) -> Self | int: """ Return self[key]. """ diff --git a/python/pyarrow-stubs/lib.pyi b/python/pyarrow-stubs/lib.pyi index a1a016ef2f2..527f946b53a 100644 --- a/python/pyarrow-stubs/lib.pyi +++ b/python/pyarrow-stubs/lib.pyi @@ -38,7 +38,6 @@ from .scalar import * from .tensor import * from ._types import * -_DataTypeT = TypeVar("_DataTypeT", bound=DataType) class MonthDayNano(NamedTuple): days: int diff --git a/python/pyarrow-stubs/py.typed b/python/pyarrow-stubs/py.typed index e69de29bb2d..13a83393a91 100644 --- a/python/pyarrow-stubs/py.typed +++ b/python/pyarrow-stubs/py.typed @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
diff --git a/python/pyarrow-stubs/scalar.pyi b/python/pyarrow-stubs/scalar.pyi index 2532026e7bc..0bcd97dd038 100644 --- a/python/pyarrow-stubs/scalar.pyi +++ b/python/pyarrow-stubs/scalar.pyi @@ -36,7 +36,6 @@ from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakre from typing_extensions import TypeVar from ._types import ( - # _AsPyType, _DataTypeT, _Time32Unit, _Time64Unit, @@ -49,7 +48,17 @@ from ._types import ( LargeListViewType, FixedSizeListType, ) -from . import types +from ._types import ( + Decimal256Type, _Precision, _Scale, NullType, BoolType, UInt8Type, Int8Type, + UInt16Type, Int16Type, Uint32Type, Int32Type, UInt64Type, Int64Type, + Float16Type, Float32Type, Float64Type, Decimal32Type, Decimal64Type, + Decimal128Type, Date32Type, Date64Type, Time32Type, Time64Type, TimestampType, + _Size, DurationType, MonthDayNanoIntervalType, BinaryType, LargeBinaryType, + FixedSizeBinaryType, StringType, LargeStringType, BinaryViewType, StringViewType, + StructType, _K, _ValueT, _IndexT, _BasicValueT, RunEndEncodedType, _RunEndType, + UnionType, ExtensionType, BaseExtensionType, Bool8Type, UuidType, JsonType, + OpaqueType, DictionaryType, MapType, _BasicDataType, +) _AsPyTypeK = TypeVar("_AsPyTypeK") _AsPyTypeV = TypeVar("_AsPyTypeV") @@ -148,115 +157,115 @@ class Scalar(_Weakrefable, Generic[_DataType_co]): _NULL: TypeAlias = None NA = _NULL -class NullScalar(Scalar[types.NullType]): +class NullScalar(Scalar[NullType]): """ Concrete class for null scalars. """ -class BooleanScalar(Scalar[types.BoolType]): +class BooleanScalar(Scalar[BoolType]): """ Concrete class for boolean scalars. """ -class UInt8Scalar(Scalar[types.UInt8Type]): +class UInt8Scalar(Scalar[UInt8Type]): """ Concrete class for uint8 scalars. """ -class Int8Scalar(Scalar[types.Int8Type]): +class Int8Scalar(Scalar[Int8Type]): """ Concrete class for int8 scalars. """ -class UInt16Scalar(Scalar[types.UInt16Type]): +class UInt16Scalar(Scalar[UInt16Type]): """ Concrete class for uint16 scalars. """ -class Int16Scalar(Scalar[types.Int16Type]): +class Int16Scalar(Scalar[Int16Type]): """ Concrete class for int16 scalars. """ -class UInt32Scalar(Scalar[types.Uint32Type]): +class UInt32Scalar(Scalar[Uint32Type]): """ Concrete class for uint32 scalars. """ -class Int32Scalar(Scalar[types.Int32Type]): +class Int32Scalar(Scalar[Int32Type]): """ Concrete class for int32 scalars. """ -class UInt64Scalar(Scalar[types.UInt64Type]): +class UInt64Scalar(Scalar[UInt64Type]): """ Concrete class for uint64 scalars. """ -class Int64Scalar(Scalar[types.Int64Type]): +class Int64Scalar(Scalar[Int64Type]): """ Concrete class for int64 scalars. """ -class HalfFloatScalar(Scalar[types.Float16Type]): +class HalfFloatScalar(Scalar[Float16Type]): """ Concrete class for float scalars. """ -class FloatScalar(Scalar[types.Float32Type]): +class FloatScalar(Scalar[Float32Type]): """ Concrete class for float scalars. """ -class DoubleScalar(Scalar[types.Float64Type]): +class DoubleScalar(Scalar[Float64Type]): """ Concrete class for double scalars. """ -class Decimal32Scalar(Scalar[types.Decimal32Type[types._Precision, types._Scale]]): +class Decimal32Scalar(Scalar[Decimal32Type[_Precision, _Scale]]): """ Concrete class for decimal32 scalars. """ -class Decimal64Scalar(Scalar[types.Decimal64Type[types._Precision, types._Scale]]): +class Decimal64Scalar(Scalar[Decimal64Type[_Precision, _Scale]]): """ Concrete class for decimal64 scalars. 
""" -class Decimal128Scalar(Scalar[types.Decimal128Type[types._Precision, types._Scale]]): +class Decimal128Scalar(Scalar[Decimal128Type[_Precision, _Scale]]): """ Concrete class for decimal128 scalars. """ -class Decimal256Scalar(Scalar[types.Decimal256Type[types._Precision, types._Scale]]): +class Decimal256Scalar(Scalar[Decimal256Type[_Precision, _Scale]]): """ Concrete class for decimal256 scalars. """ -class Date32Scalar(Scalar[types.Date32Type]): +class Date32Scalar(Scalar[Date32Type]): """ Concrete class for date32 scalars. """ -class Date64Scalar(Scalar[types.Date64Type]): +class Date64Scalar(Scalar[Date64Type]): """ Concrete class for date64 scalars. """ @property def value(self) -> dt.date | None: ... -class Time32Scalar(Scalar[types.Time32Type[_Time32Unit]]): +class Time32Scalar(Scalar[Time32Type[_Time32Unit]]): """ Concrete class for time32 scalars. """ @property def value(self) -> dt.time | None: ... -class Time64Scalar(Scalar[types.Time64Type[_Time64Unit]]): +class Time64Scalar(Scalar[Time64Type[_Time64Unit]]): """ Concrete class for time64 scalars. """ @property def value(self) -> dt.time | None: ... -class TimestampScalar(Scalar[types.TimestampType[_Unit, _Tz]]): +class TimestampScalar(Scalar[TimestampType[_Unit, _Tz]]): """ Concrete class for timestamp scalars. """ @property def value(self) -> int | None: ... -class DurationScalar(Scalar[types.DurationType[_Unit]]): +class DurationScalar(Scalar[DurationType[_Unit]]): """ Concrete class for duration scalars. """ @property def value(self) -> dt.timedelta | None: ... -class MonthDayNanoIntervalScalar(Scalar[types.MonthDayNanoIntervalType]): +class MonthDayNanoIntervalScalar(Scalar[MonthDayNanoIntervalType]): """ Concrete class for month, day, nanosecond interval scalars. """ @@ -266,7 +275,7 @@ class MonthDayNanoIntervalScalar(Scalar[types.MonthDayNanoIntervalType]): Same as self.as_py() """ -class BinaryScalar(Scalar[types.BinaryType]): +class BinaryScalar(Scalar[BinaryType]): """ Concrete class for binary-like scalars. """ @@ -275,7 +284,7 @@ class BinaryScalar(Scalar[types.BinaryType]): Return a view over this value as a Buffer object. """ -class LargeBinaryScalar(Scalar[types.LargeBinaryType]): +class LargeBinaryScalar(Scalar[LargeBinaryType]): """ """ def as_buffer(self) -> Buffer: @@ -285,7 +294,7 @@ class LargeBinaryScalar(Scalar[types.LargeBinaryType]): Return a view over this value as a Buffer object. """ -class FixedSizeBinaryScalar(Scalar[types.FixedSizeBinaryType]): +class FixedSizeBinaryScalar(Scalar[FixedSizeBinaryType]): """ """ def as_buffer(self) -> Buffer: @@ -295,7 +304,7 @@ class FixedSizeBinaryScalar(Scalar[types.FixedSizeBinaryType]): Return a view over this value as a Buffer object. """ -class StringScalar(Scalar[types.StringType]): +class StringScalar(Scalar[StringType]): """ Concrete class for string-like (utf8) scalars. """ @@ -306,7 +315,7 @@ class StringScalar(Scalar[types.StringType]): Return a view over this value as a Buffer object. """ -class LargeStringScalar(Scalar[types.LargeStringType]): +class LargeStringScalar(Scalar[LargeStringType]): """ """ def as_buffer(self) -> Buffer: @@ -316,7 +325,7 @@ class LargeStringScalar(Scalar[types.LargeStringType]): Return a view over this value as a Buffer object. """ -class BinaryViewScalar(Scalar[types.BinaryViewType]): +class BinaryViewScalar(Scalar[BinaryViewType]): """ """ def as_buffer(self) -> Buffer: @@ -326,7 +335,7 @@ class BinaryViewScalar(Scalar[types.BinaryViewType]): Return a view over this value as a Buffer object. 
""" -class StringViewScalar(Scalar[types.StringViewType]): +class StringViewScalar(Scalar[StringViewType]): """ """ def as_buffer(self) -> Buffer: @@ -355,7 +364,7 @@ class ListScalar(Scalar[ListType[_DataTypeT]]): Iterate over this element's values. """ -class FixedSizeListScalar(Scalar[FixedSizeListType[_DataTypeT, types._Size]]): +class FixedSizeListScalar(Scalar[FixedSizeListType[_DataTypeT, _Size]]): """ """ @property @@ -451,7 +460,7 @@ class LargeListViewScalar(Scalar[LargeListViewType[_DataTypeT]]): Iterate over this element's values. """ -class StructScalar(Scalar[types.StructType], collections.abc.Mapping[str, Scalar]): +class StructScalar(Scalar[StructType], collections.abc.Mapping[str, Scalar]): """ Concrete class for struct scalars. """ @@ -478,7 +487,7 @@ class StructScalar(Scalar[types.StructType], collections.abc.Mapping[str, Scalar """ def _as_py_tuple(self) -> list[tuple[str, Any]]: ... -class MapScalar(Scalar[types.MapType[types._K, types._ValueT]]): +class MapScalar(Scalar[MapType[_K, _ValueT]]): """ Concrete class for map scalars. """ @@ -490,48 +499,48 @@ class MapScalar(Scalar[types.MapType[types._K, types._ValueT]]): Return the number of values. """ - def __getitem__(self, i: int) -> tuple[Scalar[types._K], types._ValueT, Any]: + def __getitem__(self, i: int) -> tuple[Scalar[_K], _ValueT, Any]: """ Return the value at the given index or key. """ def __iter__( self: Scalar[ - types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]],] - | Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]]] - | Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any]] + MapType[_BasicDataType[_AsPyTypeK], _BasicDataType[_AsPyTypeV]],] + | Scalar[MapType[Any, _BasicDataType[_AsPyTypeV]]] + | Scalar[MapType[_BasicDataType[_AsPyTypeK], Any]] ) -> Iterator[tuple[_AsPyTypeK, _AsPyTypeV]] | Iterator[tuple[Any, _AsPyTypeV]] | Iterator[tuple[_AsPyTypeK, Any]]: """ Iterate over this element's values. """ -class DictionaryScalar(Scalar[types.DictionaryType[types._IndexT, types._BasicValueT]]): +class DictionaryScalar(Scalar[DictionaryType[_IndexT, _BasicValueT]]): """ Concrete class for dictionary-encoded scalars. """ @property - def index(self) -> Scalar[types._IndexT]: + def index(self) -> Scalar[_IndexT]: """ Return this value's underlying index as a scalar. """ @property - def value(self) -> Scalar[types._BasicValueT]: + def value(self) -> Scalar[_BasicValueT]: """ Return the encoded value as a scalar. """ @property def dictionary(self) -> Array: ... -class RunEndEncodedScalar(Scalar[types.RunEndEncodedType[types._RunEndType, types._BasicValueT]]): +class RunEndEncodedScalar(Scalar[RunEndEncodedType[_RunEndType, _BasicValueT]]): """ Concrete class for RunEndEncoded scalars. """ @property - def value(self) -> tuple[int, types._BasicValueT] | None: + def value(self) -> tuple[int, _BasicValueT] | None: """ Return underlying value as a scalar. """ -class UnionScalar(Scalar[types.UnionType]): +class UnionScalar(Scalar[UnionType]): """ Concrete class for Union scalars. """ @@ -546,7 +555,7 @@ class UnionScalar(Scalar[types.UnionType]): Return the union type code for this scalar. """ -class ExtensionScalar(Scalar[types.ExtensionType]): +class ExtensionScalar(Scalar[ExtensionType]): """ Concrete class for Extension scalars. """ @@ -556,7 +565,7 @@ class ExtensionScalar(Scalar[types.ExtensionType]): Return storage value as a scalar. 
""" @staticmethod - def from_storage(typ: types.BaseExtensionType, value) -> ExtensionScalar: + def from_storage(typ: BaseExtensionType, value) -> ExtensionScalar: """ Construct ExtensionScalar from type and storage value. @@ -572,19 +581,19 @@ class ExtensionScalar(Scalar[types.ExtensionType]): ext_scalar : ExtensionScalar """ -class Bool8Scalar(Scalar[types.Bool8Type]): +class Bool8Scalar(Scalar[Bool8Type]): """ Concrete class for bool8 extension scalar. """ -class UuidScalar(Scalar[types.UuidType]): +class UuidScalar(Scalar[UuidType]): """ Concrete class for Uuid extension scalar. """ -class JsonScalar(Scalar[types.JsonType]): +class JsonScalar(Scalar[JsonType]): """ Concrete class for JSON extension scalar. """ -class OpaqueScalar(Scalar[types.OpaqueType]): +class OpaqueScalar(Scalar[OpaqueType]): """ Concrete class for opaque extension scalar. """ diff --git a/python/pyarrow-stubs/tensor.pyi b/python/pyarrow-stubs/tensor.pyi index ac34fa08ffc..7e9b86ea1cd 100644 --- a/python/pyarrow-stubs/tensor.pyi +++ b/python/pyarrow-stubs/tensor.pyi @@ -619,6 +619,7 @@ class SparseCSFTensor(_Weakrefable): indptr: np.ndarray, indices: np.ndarray, shape: tuple[int, ...], + axis_order: list[int] | None = None, dim_names: list[str] | None = None, ) -> Self: """ From 7a907f5ec4e3750a598e78578c3b97e84cf131b4 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 16 Sep 2025 01:43:36 +0200 Subject: [PATCH 13/26] work --- python/pyarrow-stubs/__init__.pyi | 183 +- python/pyarrow-stubs/_compute.pyi | 1721 +++++ python/pyarrow-stubs/_dataset.pyi | 2300 ++++++ python/pyarrow-stubs/_ipc.pyi | 709 ++ python/pyarrow-stubs/_types.pyi | 14 +- python/pyarrow-stubs/array.pyi | 1 + python/pyarrow-stubs/compute.pyi | 6168 +++++++++++++++++ python/pyarrow-stubs/config.pyi | 41 + python/pyarrow-stubs/dataset.pyi | 229 + python/pyarrow-stubs/device.pyi | 88 + python/pyarrow-stubs/error.pyi | 53 + python/pyarrow-stubs/interchange/__init__.pyi | 0 python/pyarrow-stubs/interchange/buffer.pyi | 58 + python/pyarrow-stubs/interchange/column.pyi | 252 + .../pyarrow-stubs/interchange/dataframe.pyi | 102 + .../interchange/from_dataframe.pyi | 244 + python/pyarrow-stubs/io.pyi | 57 +- python/pyarrow-stubs/ipc.pyi | 123 + python/pyarrow-stubs/lib.pyi | 10 +- python/pyarrow-stubs/table.pyi | 5154 ++++++++++++++ python/pyarrow-stubs/util.pyi | 27 + python/pyarrow/{ipc.py => _ipc.py} | 0 22 files changed, 17397 insertions(+), 137 deletions(-) create mode 100644 python/pyarrow-stubs/_compute.pyi create mode 100644 python/pyarrow-stubs/_dataset.pyi create mode 100644 python/pyarrow-stubs/_ipc.pyi create mode 100644 python/pyarrow-stubs/compute.pyi create mode 100644 python/pyarrow-stubs/config.pyi create mode 100644 python/pyarrow-stubs/dataset.pyi create mode 100644 python/pyarrow-stubs/device.pyi create mode 100644 python/pyarrow-stubs/error.pyi create mode 100644 python/pyarrow-stubs/interchange/__init__.pyi create mode 100644 python/pyarrow-stubs/interchange/buffer.pyi create mode 100644 python/pyarrow-stubs/interchange/column.pyi create mode 100644 python/pyarrow-stubs/interchange/dataframe.pyi create mode 100644 python/pyarrow-stubs/interchange/from_dataframe.pyi create mode 100644 python/pyarrow-stubs/ipc.pyi create mode 100644 python/pyarrow-stubs/table.pyi create mode 100644 python/pyarrow-stubs/util.pyi rename python/pyarrow/{ipc.py => _ipc.py} (100%) diff --git a/python/pyarrow-stubs/__init__.pyi b/python/pyarrow-stubs/__init__.pyi index d366d1793ff..3f5e3073fd8 100644 --- a/python/pyarrow-stubs/__init__.pyi +++ 
b/python/pyarrow-stubs/__init__.pyi @@ -22,20 +22,19 @@ import pyarrow.lib as _lib _gc_enabled: bool -# TODO from pyarrow.lib import ( - # BuildInfo, - # RuntimeInfo, - # set_timezone_db_path, + BuildInfo, + RuntimeInfo, + set_timezone_db_path, MonthDayNano, - # VersionInfo, - # cpp_build_info, - # cpp_version, - # cpp_version_info, - # runtime_info, + VersionInfo, + cpp_build_info, + cpp_version, + cpp_version_info, + runtime_info, cpu_count, set_cpu_count, - # enable_signal_handlers, + enable_signal_handlers, io_thread_count, set_io_thread_count, ) @@ -51,7 +50,6 @@ def show_info() -> None: def _module_is_available(module: str) -> bool: ... def _filesystem_is_available(fs: str) -> bool: ... -# TODO from pyarrow.lib import ( null, bool_, @@ -144,8 +142,8 @@ from pyarrow.lib import ( Array, Tensor, array, - # chunked_array, - # record_batch, + chunked_array, + record_batch, nulls, repeat, SparseCOOTensor, @@ -257,7 +255,7 @@ from pyarrow.lib import ( ) # Buffers, allocation -# from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager +from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager from pyarrow.lib import ( Buffer, @@ -311,54 +309,52 @@ from pyarrow.lib import ( have_libhdfs, ) -# TODO from pyarrow.lib import ( - # ChunkedArray, - # RecordBatch, - # Table, - # table, + ChunkedArray, + RecordBatch, + Table, + table, concat_arrays, - # concat_tables, - # TableGroupBy, - # RecordBatchReader, + concat_tables, + TableGroupBy, + RecordBatchReader, ) # Exceptions -# from pyarrow.lib import ( -# ArrowCancelled, -# ArrowCapacityError, -# ArrowException, -# ArrowKeyError, -# ArrowIndexError, -# ArrowInvalid, -# ArrowIOError, -# ArrowMemoryError, -# ArrowNotImplementedError, -# ArrowTypeError, -# ArrowSerializationError, -# ) +from pyarrow.lib import ( + ArrowCancelled, + ArrowCapacityError, + ArrowException, + ArrowKeyError, + ArrowIndexError, + ArrowInvalid, + ArrowIOError, + ArrowMemoryError, + ArrowNotImplementedError, + ArrowTypeError, + ArrowSerializationError, +) -# TODO -# from ipc import serialize_pandas, deserialize_pandas -# import ipc as ipc +from .ipc import serialize_pandas, deserialize_pandas +# TODO? +# import _ipc as ipc import types as types # ---------------------------------------------------------------------- # Deprecations -# from util import _deprecate_api, _deprecate_class +from .util import _deprecate_api, _deprecate_class -# TODO -# from pyarrow.ipc import ( -# Message, -# MessageReader, -# MetadataVersion, -# RecordBatchFileReader, -# RecordBatchFileWriter, -# RecordBatchStreamReader, -# RecordBatchStreamWriter, -# ) +from pyarrow.ipc import ( + Message, + MessageReader, + MetadataVersion, + RecordBatchFileReader, + RecordBatchFileWriter, + RecordBatchStreamReader, + RecordBatchStreamWriter, +) # ---------------------------------------------------------------------- # Returning absolute path to the pyarrow include directory (if bundled, e.g. 
in @@ -401,18 +397,18 @@ __all__ = [ "__version__", "_lib", "_gc_enabled", - # "BuildInfo", - # "RuntimeInfo", - # "set_timezone_db_path", + "BuildInfo", + "RuntimeInfo", + "set_timezone_db_path", "MonthDayNano", - # "VersionInfo", - # "cpp_build_info", - # "cpp_version", - # "cpp_version_info", - # "runtime_info", + "VersionInfo", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", "cpu_count", "set_cpu_count", - # "enable_signal_handlers", + "enable_signal_handlers", "io_thread_count", "set_io_thread_count", "show_versions", @@ -510,8 +506,8 @@ __all__ = [ "Array", "Tensor", "array", - # "chunked_array", - # "record_batch", + "chunked_array", + "record_batch", "nulls", "repeat", "SparseCOOTensor", @@ -620,10 +616,10 @@ __all__ = [ "UuidScalar", "JsonScalar", "OpaqueScalar", - # "DeviceAllocationType", - # "Device", - # "MemoryManager", - # "default_cpu_memory_manager", + "DeviceAllocationType", + "Device", + "MemoryManager", + "default_cpu_memory_manager", "Buffer", "ResizableBuffer", "foreign_buffer", @@ -666,38 +662,37 @@ __all__ = [ "input_stream", "output_stream", "have_libhdfs", - # "ChunkedArray", - # "RecordBatch", - # "Table", - # "table", + "ChunkedArray", + "RecordBatch", + "Table", + "table", "concat_arrays", - # "concat_tables", - # "TableGroupBy", - # "RecordBatchReader", - # "ArrowCancelled", - # "ArrowCapacityError", - # "ArrowException", - # "ArrowKeyError", - # "ArrowIndexError", - # "ArrowInvalid", - # "ArrowIOError", - # "ArrowMemoryError", - # "ArrowNotImplementedError", - # "ArrowTypeError", - # "ArrowSerializationError", - # "serialize_pandas", - # "deserialize_pandas", - # "ipc", + "concat_tables", + "TableGroupBy", + "RecordBatchReader", + "ArrowCancelled", + "ArrowCapacityError", + "ArrowException", + "ArrowKeyError", + "ArrowIndexError", + "ArrowInvalid", + "ArrowIOError", + "ArrowMemoryError", + "ArrowNotImplementedError", + "ArrowTypeError", + "ArrowSerializationError", + "serialize_pandas", + "deserialize_pandas", "types", - # "_deprecate_api", - # "_deprecate_class", - # "Message", - # "MessageReader", - # "MetadataVersion", - # "RecordBatchFileReader", - # "RecordBatchFileWriter", - # "RecordBatchStreamReader", - # "RecordBatchStreamWriter", + "_deprecate_api", + "_deprecate_class", + "Message", + "MessageReader", + "MetadataVersion", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", "get_include", "_get_pkg_config_executable", "_has_pkg_config", diff --git a/python/pyarrow-stubs/_compute.pyi b/python/pyarrow-stubs/_compute.pyi new file mode 100644 index 00000000000..3d61ae42787 --- /dev/null +++ b/python/pyarrow-stubs/_compute.pyi @@ -0,0 +1,1721 @@ +from typing import ( + Any, + Callable, + Iterable, + Literal, + Sequence, + TypeAlias, + TypedDict, + overload, +) + +from . import lib + +_Order: TypeAlias = Literal["ascending", "descending"] +_Placement: TypeAlias = Literal["at_start", "at_end"] + +class Kernel(lib._Weakrefable): + """ + A kernel object. + + Kernels handle the execution of a Function for a certain signature. + """ + +class Function(lib._Weakrefable): + """ + A compute function. + + A function implements a certain logical computation over a range of + possible input signatures. Each signature accepts a range of input + types and is implemented by a given Kernel. + + Functions can be of different kinds: + + * "scalar" functions apply an item-wise computation over all items + of their inputs. 
Each item in the output only depends on the values + of the inputs at the same position. Examples: addition, comparisons, + string predicates... + + * "vector" functions apply a collection-wise computation, such that + each item in the output may depend on the values of several items + in each input. Examples: dictionary encoding, sorting, extracting + unique values... + + * "scalar_aggregate" functions reduce the dimensionality of the inputs by + applying a reduction function. Examples: sum, min_max, mode... + + * "hash_aggregate" functions apply a reduction function to an input + subdivided by grouping criteria. They may not be directly called. + Examples: hash_sum, hash_min_max... + + * "meta" functions dispatch to other functions. + """ + @property + def arity(self) -> int: + """ + The function arity. + + If Ellipsis (i.e. `...`) is returned, the function takes a variable + number of arguments. + """ + @property + def kind( + self, + ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: + """ + The function kind. + """ + @property + def name(self) -> str: + """ + The function name. + """ + @property + def num_kernels(self) -> int: + """ + The number of kernels implementing this function. + """ + def call( + self, + args: Iterable, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, + ) -> Any: + """ + Call the function on the given arguments. + + Parameters + ---------- + args : iterable + The arguments to pass to the function. Accepted types depend + on the specific function. + options : FunctionOptions, optional + Options instance for executing this function. This should have + the right concrete options type. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + length : int, optional + Batch size for execution, for nullary (no argument) functions. If + not passed, will be inferred from passed data. + """ + +class FunctionOptions(lib._Weakrefable): + def serialize(self) -> lib.Buffer: ... + @classmethod + def deserialize(cls, buf: lib.Buffer) -> FunctionOptions: ... + +class FunctionRegistry(lib._Weakrefable): + def get_function(self, name: str) -> Function: + """ + Look up a function by name in the registry. + + Parameters + ---------- + name : str + The name of the function to lookup + """ + + def list_functions(self) -> list[str]: + """ + Return all function names in the registry. + """ + +class HashAggregateFunction(Function): ... +class HashAggregateKernel(Kernel): ... +class ScalarAggregateFunction(Function): ... +class ScalarAggregateKernel(Kernel): ... +class ScalarFunction(Function): ... +class ScalarKernel(Kernel): ... +class VectorFunction(Function): ... +class VectorKernel(Kernel): ... + +# ==================== _compute.pyx Option classes ==================== +class ArraySortOptions(FunctionOptions): + """ + Options for the `array_sort_indices` function. + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + null_placement : str, default "at_end" + Where nulls in the input should be sorted. + Accepted values are "at_start", "at_end". + """ + def __init__( + self, + order: _Order = "ascending", + null_placement: _Placement = "at_end", + ) -> None: ... + +class AssumeTimezoneOptions(FunctionOptions): + """ + Options for the `assume_timezone` function. + + Parameters + ---------- + timezone : str + Timezone to assume for the input. 
+ ambiguous : str, default "raise" + How to handle timestamps that are ambiguous in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + nonexistent : str, default "raise" + How to handle timestamps that don't exist in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + """ + + def __init__( + self, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + ) -> None: ... + +class CastOptions(FunctionOptions): + """ + Options for the `cast` function. + + Parameters + ---------- + target_type : DataType, optional + The PyArrow type to cast to. + allow_int_overflow : bool, default False + Whether integer overflow is allowed when casting. + allow_time_truncate : bool, default False + Whether time precision truncation is allowed when casting. + allow_time_overflow : bool, default False + Whether date/time range overflow is allowed when casting. + allow_decimal_truncate : bool, default False + Whether decimal precision truncation is allowed when casting. + allow_float_truncate : bool, default False + Whether floating-point precision truncation is allowed when casting. + allow_invalid_utf8 : bool, default False + Whether producing invalid utf8 data is allowed when casting. + """ + + allow_int_overflow: bool + allow_time_truncate: bool + allow_time_overflow: bool + allow_decimal_truncate: bool + allow_float_truncate: bool + allow_invalid_utf8: bool + + def __init__( + self, + target_type: lib.DataType | None = None, + *, + allow_int_overflow: bool | None = None, + allow_time_truncate: bool | None = None, + allow_time_overflow: bool | None = None, + allow_decimal_truncate: bool | None = None, + allow_float_truncate: bool | None = None, + allow_invalid_utf8: bool | None = None, + ) -> None: ... + @staticmethod + def safe(target_type: lib.DataType | None = None) -> CastOptions: ... + @staticmethod + def unsafe(target_type: lib.DataType | None = None) -> CastOptions: ... + def is_safe(self) -> bool: ... + +class CountOptions(FunctionOptions): + """ + Options for the `count` function. + + Parameters + ---------- + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". + """ + def __init__(self, mode: Literal["only_valid", "only_null", "all"] = "only_valid") -> None: ... + +class CumulativeOptions(FunctionOptions): + """ + Options for `cumulative_*` functions. + + - cumulative_sum + - cumulative_sum_checked + - cumulative_prod + - cumulative_prod_checked + - cumulative_max + - cumulative_min + + Parameters + ---------- + start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. + skip_nulls : bool, default False + When false, the first encountered null is propagated. + """ + def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... + +class CumulativeSumOptions(FunctionOptions): + """ + Options for `cumulative_sum` function. + + Parameters + ---------- + start : Scalar, default None + Starting value for sum computation + skip_nulls : bool, default False + When false, the first encountered null is propagated. + """ + def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... + +class DayOfWeekOptions(FunctionOptions): + """ + Options for the `day_of_week` function. 
+ + Parameters + ---------- + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + """ + + def __init__(self, *, count_from_zero: bool = True, week_start: int = 1) -> None: ... + +class DictionaryEncodeOptions(FunctionOptions): + """ + Options for dictionary encoding. + + Parameters + ---------- + null_encoding : str, default "mask" + How to encode nulls in the input. + Accepted values are "mask" (null inputs emit a null in the indices + array), "encode" (null inputs emit a non-null index pointing to + a null value in the dictionary array). + """ + def __init__(self, null_encoding: Literal["mask", "encode"] = "mask") -> None: ... + +class RunEndEncodeOptions(FunctionOptions): + """ + Options for run-end encoding. + + Parameters + ---------- + run_end_type : DataType, default pyarrow.int32() + The data type of the run_ends array. + + Accepted values are pyarrow.{int16(), int32(), int64()}. + """ + # TODO: default is DataType(int32) + def __init__(self, run_end_type: lib.DataType = ...) -> None: ... + +class ElementWiseAggregateOptions(FunctionOptions): + """ + Options for element-wise aggregate functions. + + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + """ + def __init__(self, *, skip_nulls: bool = True) -> None: ... + +class ExtractRegexOptions(FunctionOptions): + """ + Options for the `extract_regex` function. + + Parameters + ---------- + pattern : str + Regular expression with named capture fields. + """ + def __init__(self, pattern: str) -> None: ... + +class ExtractRegexSpanOptions(FunctionOptions): + """ + Options for the `extract_regex_span` function. + + Parameters + ---------- + pattern : str + Regular expression with named capture fields. + """ + def __init__(self, pattern: str) -> None: ... + +class FilterOptions(FunctionOptions): + """ + Options for selecting with a boolean filter. + + Parameters + ---------- + null_selection_behavior : str, default "drop" + How to handle nulls in the selection filter. + Accepted values are "drop", "emit_null". + """ + + def __init__(self, null_selection_behavior: Literal["drop", "emit_null"] = "drop") -> None: ... + +class IndexOptions(FunctionOptions): + """ + Options for the `index` function. + + Parameters + ---------- + value : Scalar + The value to search for. + """ + def __init__(self, value: lib.Scalar) -> None: ... + +class JoinOptions(FunctionOptions): + """ + Options for the `binary_join_element_wise` function. + + Parameters + ---------- + null_handling : str, default "emit_null" + How to handle null values in the inputs. + Accepted values are "emit_null", "skip", "replace". + null_replacement : str, default "" + Replacement string to emit for null inputs if `null_handling` + is "replace". + """ + @overload + def __init__(self, null_handling: Literal["emit_null", "skip"] = "emit_null") -> None: ... + @overload + def __init__(self, null_handling: Literal["replace"], null_replacement: str = "") -> None: ... + +class ListSliceOptions(FunctionOptions): + """ + Options for list array slicing. + + Parameters + ---------- + start : int + Index to start slicing inner list elements (inclusive). + stop : Optional[int], default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. 
(NotImplemented) + step : int, default 1 + Slice step. + return_fixed_size_list : Optional[bool], default None + Whether to return a FixedSizeListArray. If true _and_ stop is after + a list element's length, nulls will be appended to create the + requested slice size. The default of `None` will return the same + type which was passed in. + """ + def __init__( + self, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + ) -> None: ... + +class ListFlattenOptions(FunctionOptions): + """ + Options for `list_flatten` function + + Parameters + ---------- + recursive : bool, default False + When True, the list array is flattened recursively until an array + of non-list values is formed. + """ + def __init__(self, recursive: bool = False) -> None: ... + +class MakeStructOptions(FunctionOptions): + """ + Options for the `make_struct` function. + + Parameters + ---------- + field_names : sequence of str + Names of the struct fields to create. + field_nullability : sequence of bool, optional + Nullability information for each struct field. + If omitted, all fields are nullable. + field_metadata : sequence of KeyValueMetadata, optional + Metadata for each struct field. + """ + def __init__( + self, + field_names: Sequence[str] = (), + *, + field_nullability: Sequence[bool] | None = None, + field_metadata: Sequence[lib.KeyValueMetadata] | None = None, + ) -> None: ... + +class MapLookupOptions(FunctionOptions): + """ + Options for the `map_lookup` function. + + Parameters + ---------- + query_key : Scalar or Object can be converted to Scalar + The key to search for. + occurrence : str + The occurrence(s) to return from the Map + Accepted values are "first", "last", or "all". + """ + # TODO: query_key: Scalar or Object can be converted to Scalar + def __init__( + self, query_key: lib.Scalar, occurrence: Literal["first", "last", "all"] + ) -> None: ... + +class MatchSubstringOptions(FunctionOptions): + """ + Options for looking for a substring. + + Parameters + ---------- + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + """ + + def __init__(self, pattern: str, *, ignore_case: bool = False) -> None: ... + +class ModeOptions(FunctionOptions): + """ + Options for the `mode` function. + + Parameters + ---------- + n : int, default 1 + Number of distinct most-common values to return. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__(self, n: int = 1, *, skip_nulls: bool = True, min_count: int = 0) -> None: ... + +class NullOptions(FunctionOptions): + """ + Options for the `is_null` function. + + Parameters + ---------- + nan_is_null : bool, default False + Whether floating-point NaN values are considered null. + """ + def __init__(self, *, nan_is_null: bool = False) -> None: ... + +class PadOptions(FunctionOptions): + """ + Options for padding strings. + + Parameters + ---------- + width : int + Desired string length. + padding : str, default " " + What to pad the string with. Should be one byte or codepoint. + lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. 
adding + the extra padding character on the right). + """ + def __init__( + self, width: int, padding: str = " ", lean_left_on_odd_padding: bool = True + ) -> None: ... + +class PairwiseOptions(FunctionOptions): + """ + Options for `pairwise` functions. + + Parameters + ---------- + period : int, default 1 + Period for applying the period function. + """ + def __init__(self, period: int = 1) -> None: ... + +class PartitionNthOptions(FunctionOptions): + """ + Options for the `partition_nth_indices` function. + + Parameters + ---------- + pivot : int + Index into the equivalent sorted array of the pivot element. + null_placement : str, default "at_end" + Where nulls in the input should be partitioned. + Accepted values are "at_start", "at_end". + """ + def __init__(self, pivot: int, *, null_placement: _Placement = "at_end") -> None: ... + +class WinsorizeOptions(FunctionOptions): + """ + Options for the `winsorize` function. + + Parameters + ---------- + lower_limit : float, between 0 and 1 + The quantile below which all values are replaced with the quantile's value. + upper_limit : float, between 0 and 1 + The quantile above which all values are replaced with the quantile's value. + """ + def __init__(self, lower_limit: float, upper_limit: float) -> None: ... + +class QuantileOptions(FunctionOptions): + """ + Options for the `quantile` function. + + Parameters + ---------- + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to compute. All values must be in + [0, 1]. + interpolation : str, default "linear" + How to break ties between competing data points for a given quantile. + Accepted values are: + + - "linear": compute an interpolation + - "lower": always use the smallest of the two data points + - "higher": always use the largest of the two data points + - "nearest": select the data point that is closest to the quantile + - "midpoint": compute the (unweighted) mean of the two data points + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__( + self, + q: float | Sequence[float], + *, + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + +class RandomOptions(FunctionOptions): + """ + Options for random generation. + + Parameters + ---------- + initializer : int or str + How to initialize the underlying random generator. + If an integer is given, it is used as a seed. + If "system" is given, the random generator is initialized with + a system-specific source of (hopefully true) randomness. + Other values are invalid. + """ + def __init__(self, *, initializer: int | Literal["system"] = "system") -> None: ... + +class RankOptions(FunctionOptions): + """ + Options for the `rank` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. 
+ null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + tiebreaker : str, default "first" + Configure how ties between equal values are handled. + Accepted values are: + + - "min": Ties get the smallest possible rank in sorted order. + - "max": Ties get the largest possible rank in sorted order. + - "first": Ranks are assigned in order of when ties appear in the + input. This ensures the ranks are a stable permutation + of the input. + - "dense": The ranks span a dense [1, M] interval where M is the + number of distinct values in the input. + """ + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + ) -> None: ... + +class RankQuantileOptions(FunctionOptions): + """ + Options for the `rank_quantile` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + """ + + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + ) -> None: ... + +class PivotWiderOptions(FunctionOptions): + """ + Options for the `pivot_wider` function. + + Parameters + ---------- + key_names : sequence of str + The pivot key names expected in the pivot key column. + For each entry in `key_names`, a column with the same name is emitted + in the struct output. + unexpected_key_behavior : str, default "ignore" + The behavior when pivot keys not in `key_names` are encountered. + Accepted values are "ignore", "raise". + If "ignore", unexpected keys are silently ignored. + If "raise", unexpected keys raise a KeyError. + """ + def __init__( + self, + key_names: Sequence[str], + *, + unexpected_key_behavior: Literal["ignore", "raise"] = "ignore", + ) -> None: ... + +class ReplaceSliceOptions(FunctionOptions): + """ + Options for replacing slices. + + Parameters + ---------- + start : int + Index to start slicing at (inclusive). + stop : int + Index to stop slicing at (exclusive). + replacement : str + What to replace the slice with. + """ + def __init__(self, start: int, stop: int, replacement: str) -> None: ... + +class ReplaceSubstringOptions(FunctionOptions): + """ + Options for replacing matched substrings. + + Parameters + ---------- + pattern : str + Substring pattern to look for inside input values. + replacement : str + What to replace the pattern with. + max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). + """ + def __init__( + self, pattern: str, replacement: str, *, max_replacements: int | None = None + ) -> None: ... 
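+# Editorial usage sketch (not part of the original stub): assuming the runtime
+# pyarrow.compute kernels match the option classes above, each FunctionOptions
+# subclass can either be passed explicitly via ``options=`` or spelled out as
+# the equivalent keyword arguments.
+#
+#     import pyarrow as pa
+#     import pyarrow.compute as pc
+#
+#     s = pa.array(["Hello", "WORLD", None])
+#     pc.match_substring(s, pattern="o", ignore_case=True)
+#     pc.match_substring(s, options=pc.MatchSubstringOptions("o", ignore_case=True))
+#
+#     x = pa.array([1.0, 2.0, 2.0, None])
+#     pc.quantile(x, q=[0.25, 0.75], interpolation="nearest")
+#     pc.mode(x, n=2, skip_nulls=True)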
+ +_RoundMode: TypeAlias = Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", +] + +class RoundBinaryOptions(FunctionOptions): + """ + Options for rounding numbers when ndigits is provided by a second array + + Parameters + ---------- + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ + def __init__( + self, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + +class RoundOptions(FunctionOptions): + """ + Options for rounding numbers. + + Parameters + ---------- + ndigits : int, default 0 + Number of fractional digits to round to. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ + def __init__( + self, + ndigits: int = 0, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + +_DateTimeUint: TypeAlias = Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + +class RoundTemporalOptions(FunctionOptions): + """ + Options for rounding temporal values. + + Parameters + ---------- + multiple : int, default 1 + Number of units to round to. + unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. + calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. 
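+
+ A usage sketch (editorial addition, not from the upstream docstring; it
+ assumes the runtime ``floor_temporal``/``round_temporal`` kernels match this
+ stub):
+
+     from datetime import datetime
+     import pyarrow as pa
+     import pyarrow.compute as pc
+
+     ts = pa.array([datetime(2022, 5, 18, 14, 47)])
+     pc.floor_temporal(ts, multiple=15, unit="minute")
+     pc.round_temporal(ts, multiple=1, unit="hour", calendar_based_origin=True)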
+ """ + def __init__( + self, + multiple: int = 1, + unit: _DateTimeUint = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + ) -> None: ... + +class RoundToMultipleOptions(FunctionOptions): + """ + Options for rounding numbers to a multiple. + + Parameters + ---------- + multiple : numeric scalar, default 1.0 + Multiple to round to. Should be a scalar of a type compatible + with the argument to be rounded. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ + def __init__(self, multiple: float = 1.0, round_mode: _RoundMode = "half_to_even") -> None: ... + +class ScalarAggregateOptions(FunctionOptions): + """ + Options for scalar aggregations. + + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__(self, *, skip_nulls: bool = True, min_count: int = 1) -> None: ... + +class SelectKOptions(FunctionOptions): + """ + Options for top/bottom k-selection. + + Parameters + ---------- + k : int + Number of leading values to select in sorted order + (i.e. the largest values if sort order is "descending", + the smallest otherwise). + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + """ + + def __init__(self, k: int, sort_keys: Sequence[tuple[str, _Order]]) -> None: ... + +class SetLookupOptions(FunctionOptions): + """ + Options for the `is_in` and `index_in` functions. + + Parameters + ---------- + value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. + """ + def __init__(self, value_set: lib.Array, *, skip_nulls: bool = True) -> None: ... + +class SliceOptions(FunctionOptions): + """ + Options for slicing. + + Parameters + ---------- + start : int + Index to start slicing at (inclusive). + stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + """ + + def __init__(self, start: int, stop: int | None = None, step: int = 1) -> None: ... + +class SortOptions(FunctionOptions): + """ + Options for the `sort_indices` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + null_placement : str, default "at_end" + Where nulls in input should be sorted, only applying to + columns/fields mentioned in `sort_keys`. + Accepted values are "at_start", "at_end". 
+ """ + def __init__( + self, sort_keys: Sequence[tuple[str, _Order]], *, null_placement: _Placement = "at_end" + ) -> None: ... + +class SplitOptions(FunctionOptions): + """ + Options for splitting on whitespace. + + Parameters + ---------- + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + """ + + def __init__(self, *, max_splits: int | None = None, reverse: bool = False) -> None: ... + +class SplitPatternOptions(FunctionOptions): + """ + Options for splitting on a string pattern. + + Parameters + ---------- + pattern : str + String pattern to split on. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + """ + def __init__( + self, pattern: str, *, max_splits: int | None = None, reverse: bool = False + ) -> None: ... + +class StrftimeOptions(FunctionOptions): + """ + Options for the `strftime` function. + + Parameters + ---------- + format : str, default "%Y-%m-%dT%H:%M:%S" + Pattern for formatting input values. + locale : str, default "C" + Locale to use for locale-specific format specifiers. + """ + def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S", locale: str = "C") -> None: ... + +class StrptimeOptions(FunctionOptions): + """ + Options for the `strptime` function. + + Parameters + ---------- + format : str + Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". + Note that the semantics of the format follow the C/C++ strptime, not the Python one. + There are differences in behavior, for example how the "%y" placeholder + handles years with less than four digits. + unit : str + Timestamp unit of the output. + Accepted values are "s", "ms", "us", "ns". + error_is_null : boolean, default False + Return null on parsing errors if true or raise if false. + """ + def __init__( + self, format: str, unit: Literal["s", "ms", "us", "ns"], error_is_null: bool = False + ) -> None: ... + +class StructFieldOptions(FunctionOptions): + """ + Options for the `struct_field` function. + + Parameters + ---------- + indices : List[str], List[bytes], List[int], Expression, bytes, str, or int + List of indices for chained field lookup, for example `[4, 1]` + will look up the second nested field in the fifth outer field. + """ + def __init__( + self, indices: list[str] | list[bytes] | list[int] | Expression | bytes | str | int + ) -> None: ... + +class TakeOptions(FunctionOptions): + """ + Options for the `take` and `array_take` functions. + + Parameters + ---------- + boundscheck : boolean, default True + Whether to check indices are within bounds. If False and an + index is out of bounds, behavior is undefined (the process + may crash). + """ + def __init__(self, boundscheck: bool = True) -> None: ... + +class TDigestOptions(FunctionOptions): + """ + Options for the `tdigest` function. + + Parameters + ---------- + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to approximate. All values must be + in [0, 1]. + delta : int, default 100 + Compression parameter for the T-digest algorithm. + buffer_size : int, default 500 + Buffer size for the T-digest algorithm. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. 
+ If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__( + self, + q: float | Sequence[float] = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + +class TrimOptions(FunctionOptions): + """ + Options for trimming characters from strings. + + Parameters + ---------- + characters : str + Individual characters to be trimmed from the string. + """ + def __init__(self, characters: str) -> None: ... + +class Utf8NormalizeOptions(FunctionOptions): + """ + Options for the `utf8_normalize` function. + + Parameters + ---------- + form : str + Unicode normalization form. + Accepted values are "NFC", "NFKC", "NFD", "NFKD". + """ + + def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ... + +class VarianceOptions(FunctionOptions): + """ + Options for the `variance` and `stddev` functions. + + Parameters + ---------- + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, min_count: int = 0) -> None: ... + +class SkewOptions(FunctionOptions): + """ + Options for the `skew` and `kurtosis` functions. + + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + biased : bool, default True + Whether the calculated value is biased. + If False, the value computed includes a correction factor to reduce bias. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__( + self, *, skip_nulls: bool = True, biased: bool = True, min_count: int = 0 + ) -> None: ... + +class WeekOptions(FunctionOptions): + """ + Options for the `week` function. + + Parameters + ---------- + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + count_from_zero : bool, default False + If True, dates at the start of a year that fall into the last week + of the previous year emit 0. + If False, they emit 52 or 53 (the week number of the last week + of the previous year). + first_week_is_fully_in_year : bool, default False + If True, week number 0 is fully in January. + If False, a week that begins on December 29, 30 or 31 is considered + to be week number 0 of the following year. + """ + def __init__( + self, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + ) -> None: ... + +# ==================== _compute.pyx Functions ==================== + +def call_function( + name: str, + args: list, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, +) -> Any: + """ + Call a named function. + + The function is looked up in the global registry + (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to call. + args : list + The arguments to the function.
+ options : optional + options provided to the function. + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + length : int, optional + Batch size for execution, for nullary (no argument) functions. If not + passed, inferred from data. + """ + +def function_registry() -> FunctionRegistry: ... +def get_function(name: str) -> Function: + """ + Get a function by name. + + The function is looked up in the global registry + (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to lookup + """ + +def list_functions() -> list[str]: + """ + Return all function names in the global registry. + """ + +# ==================== _compute.pyx Udf ==================== + +def call_tabular_function( + function_name: str, args: Iterable | None = None, func_registry: FunctionRegistry | None = None +) -> lib.RecordBatchReader: + """ + Get a record batch iterator from a tabular function. + + Parameters + ---------- + function_name : str + Name of the function. + args : iterable + The arguments to pass to the function. Accepted types depend + on the specific function. Currently, only an empty args is supported. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + """ + +class _FunctionDoc(TypedDict): + summary: str + description: str + +def register_scalar_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: + """ + Register a user-defined scalar function. + + This API is EXPERIMENTAL. + + A scalar function is a function that executes elementwise + operations on arrays or scalars, i.e. a scalar function must + be computed row-by-row with no state where each output row + is computed only from its corresponding input row. + In other words, all argument arrays have the same length, + and the output array is of the same length as the arguments. + Scalar functions are the only functions allowed in query engine + expressions. + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return an Array or Scalar + matching the out_type. It must return a Scalar if + all arguments are scalar, else it must return an Array. + + To define a varargs function, pass a callable that takes + *args. The last in_type will be the type of all varargs + arguments. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. 
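+
+ A varargs variant (editorial sketch; the function name and doc values below
+ are illustrative, and the call assumes the varargs behaviour described
+ above):
+
+     import pyarrow as pa
+     import pyarrow.compute as pc
+
+     def add_all(ctx, *arrays):
+         # Sum an arbitrary number of equally long int64 arrays.
+         result = arrays[0]
+         for a in arrays[1:]:
+             result = pc.add(result, a, memory_pool=ctx.memory_pool)
+         return result
+
+     pc.register_scalar_function(
+         add_all,
+         "py_add_all",  # hypothetical name
+         {"summary": "varargs udf", "description": "sum all inputs"},
+         {"arrays": pa.int64()},  # the last in_type is the varargs type
+         pa.int64(),
+     )
+     pc.call_function("py_add_all", [pa.array([1, 2]), pa.array([10, 20])])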
+ + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "simple udf" + >>> func_doc["description"] = "add a constant to a scalar" + >>> + >>> def add_constant(ctx, array): + ... return pc.add(array, 1, memory_pool=ctx.memory_pool) + >>> + >>> func_name = "py_add_func" + >>> in_types = {"array": pa.int64()} + >>> out_type = pa.int64() + >>> pc.register_scalar_function(add_constant, func_name, func_doc, in_types, out_type) + >>> + >>> func = pc.get_function(func_name) + >>> func.name + 'py_add_func' + >>> answer = pc.call_function(func_name, [pa.array([20])]) + >>> answer + <pyarrow.lib.Int64Array object at ...> + [ + 21 + ] + """ + +def register_tabular_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: + """ + Register a user-defined tabular function. + + This API is EXPERIMENTAL. + + A tabular function is one accepting a context argument of type + UdfContext and returning a generator of struct arrays. + The in_types argument must be empty and the out_type argument + specifies a schema. Each struct array must have field types + corresponding to the schema. + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The only argument is the context argument of type + UdfContext. It must return a callable that + returns on each invocation a StructArray matching + the out_type, where an empty array indicates end. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + Must be an empty dictionary (reserved for future use). + out_type : Union[Schema, DataType] + Schema of the function's output, or a corresponding flat struct type. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + """ + +def register_aggregate_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: + """ + Register a user-defined non-decomposable aggregate function. + + This API is EXPERIMENTAL. + + A non-decomposable aggregation function is a function that executes + aggregate operations on the whole data that it is aggregating. + In other words, a non-decomposable aggregate function cannot be + split into consume/merge/finalize steps. + + This is often used with ordered or segmented aggregation where groups + can be emitted before accumulating all of the input data. + + Note that currently the size of any input column cannot exceed 2 GB + for a single segment (all groups combined). + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return a Scalar matching the + out_type. + To define a varargs function, pass a callable that takes + *args. The in_type needs to match the type of the inputs when + the function gets called. + function_name : str + Name of the function. This name must be unique, i.e., + there should only be one function registered with + this name in the function registry.
+ function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import numpy as np + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "simple median udf" + >>> func_doc["description"] = "compute median" + >>> + >>> def compute_median(ctx, array): + ... return pa.scalar(np.median(array)) + >>> + >>> func_name = "py_compute_median" + >>> in_types = {"array": pa.int64()} + >>> out_type = pa.float64() + >>> pc.register_aggregate_function(compute_median, func_name, func_doc, in_types, out_type) + >>> + >>> func = pc.get_function(func_name) + >>> func.name + 'py_compute_median' + >>> answer = pc.call_function(func_name, [pa.array([20, 40])]) + >>> answer + <pyarrow.DoubleScalar: 30.0> + >>> table = pa.table([pa.array([1, 1, 2, 2]), pa.array([10, 20, 30, 40])], names=["k", "v"]) + >>> result = table.group_by("k").aggregate([("v", "py_compute_median")]) + >>> result + pyarrow.Table + k: int64 + v_py_compute_median: double + ---- + k: [[1,2]] + v_py_compute_median: [[15,35]] + """ + +def register_vector_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: + """ + Register a user-defined vector function. + + This API is EXPERIMENTAL. + + A vector function is a function that executes vector + operations on arrays. A vector function is often used + when the computation doesn't fit other more specific types of + functions (e.g., scalar and aggregate). + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return an Array or Scalar + matching the out_type. It must return a Scalar if + all arguments are scalar, else it must return an Array. + + To define a varargs function, pass a callable that takes + *args. The last in_type will be the type of all varargs + arguments. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "percent rank" + >>> func_doc["description"] = "compute percent rank" + >>> + >>> def list_flatten_udf(ctx, x): + ...
return pc.list_flatten(x) + >>> + >>> func_name = "list_flatten_udf" + >>> in_types = {"array": pa.list_(pa.int64())} + >>> out_type = pa.int64() + >>> pc.register_vector_function(list_flatten_udf, func_name, func_doc, in_types, out_type) + >>> + >>> answer = pc.call_function(func_name, [pa.array([[1, 2], [3, 4]])]) + >>> answer + <pyarrow.lib.Int64Array object at ...> + [ + 1, + 2, + 3, + 4 + ] + """ + +class UdfContext: + """ + Per-invocation function context/state. + + This object will always be the first argument to a user-defined + function. It should not be used outside of a call to the function. + """ + + @property + def batch_length(self) -> int: + """ + The common length of all input arguments (int). + + In the case that all arguments are scalars, this value + is used to pass the "actual length" of the arguments, + e.g. because the scalar values are encoding a column + with a constant value. + """ + @property + def memory_pool(self) -> lib.MemoryPool: + """ + A memory pool for allocations (:class:`MemoryPool`). + + This is the memory pool supplied by the user when they invoked + the function and it should be used in any calls to arrow that the + UDF makes if that call accepts a memory_pool. + """ + +# ==================== _compute.pyx Expression ==================== +class Expression(lib._Weakrefable): + """ + A logical expression to be evaluated against some input. + + To create an expression: + + - Use the factory function ``pyarrow.compute.scalar()`` to create a + scalar (not necessary when combined, see example below). + - Use the factory function ``pyarrow.compute.field()`` to reference + a field (column in table). + - Compare fields and scalars with ``<``, ``<=``, ``==``, ``>=``, ``>``. + - Combine expressions using python operators ``&`` (logical and), + ``|`` (logical or) and ``~`` (logical not). + Note: python keywords ``and``, ``or`` and ``not`` cannot be used + to combine expressions. + - Create expression predicates using Expression methods such as + ``pyarrow.compute.Expression.isin()``. + + Examples + -------- + + >>> import pyarrow.compute as pc + >>> (pc.field("a") < pc.scalar(3)) | (pc.field("b") > 7) + <pyarrow.compute.Expression ((a < 3) or (b > 7))> + >>> pc.field("a") != 3 + <pyarrow.compute.Expression (a != 3)> + >>> pc.field("a").isin([1, 2, 3]) + <pyarrow.compute.Expression is_in(a, {value_set=int64:[ + 1, + 2, + 3 + ], null_matching_behavior=MATCH})> + """ + + @staticmethod + def from_substrait(buffer: bytes | lib.Buffer) -> Expression: + """ + Deserialize an expression from Substrait + + The serialized message must be an ExtendedExpression message that has + only a single expression. The name of the expression and the schema + the expression was bound to will be ignored. Use + pyarrow.substrait.deserialize_expressions if this information is needed + or if the message might contain multiple expressions. + + Parameters + ---------- + message : bytes or Buffer or a protobuf Message + The Substrait message to deserialize + + Returns + ------- + Expression + The deserialized expression + """ + def to_substrait(self, schema: lib.Schema, allow_arrow_extensions: bool = False) -> lib.Buffer: + """ + Serialize the expression using Substrait + + The expression will be serialized as an ExtendedExpression message that has a + single expression named "expression" + + Parameters + ---------- + schema : Schema + The input schema the expression will be bound to + allow_arrow_extensions : bool, default False + If False then only functions that are part of the core Substrait function + definitions will be allowed. Set this to True to allow pyarrow-specific functions + but the result may not be accepted by other compute libraries.
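+
+ A round-trip sketch (editorial addition; it assumes a pyarrow build with
+ Substrait support):
+
+     import pyarrow as pa
+     import pyarrow.compute as pc
+
+     expr = pc.field("a") > 1
+     buf = expr.to_substrait(pa.schema([("a", pa.int64())]))
+     restored = pc.Expression.from_substrait(buf)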
+ + Returns + ------- + Buffer + A buffer containing the serialized Protobuf plan. + """ + def __invert__(self) -> Expression: ... + def __and__(self, other) -> Expression: ... + def __or__(self, other) -> Expression: ... + def __add__(self, other) -> Expression: ... + def __mul__(self, other) -> Expression: ... + def __sub__(self, other) -> Expression: ... + def __eq__(self, value: object) -> Expression: ... # type: ignore[override] + def __ne__(self, value: object) -> Expression: ... # type: ignore[override] + def __gt__(self, value: object) -> Expression: ... # type: ignore[override] + def __lt__(self, value: object) -> Expression: ... # type: ignore[override] + def __ge__(self, value: object) -> Expression: ... # type: ignore[override] + def __le__(self, value: object) -> Expression: ... # type: ignore[override] + def __truediv__(self, other) -> Expression: ... + def is_valid(self) -> bool: + """ + Check whether the expression is not-null (valid). + + This creates a new expression equivalent to calling the + `is_valid` compute function on this expression. + + Returns + ------- + is_valid : Expression + """ + def is_null(self, nan_is_null: bool = False) -> Expression: + """ + Check whether the expression is null. + + This creates a new expression equivalent to calling the + `is_null` compute function on this expression. + + Parameters + ---------- + nan_is_null : boolean, default False + Whether floating-point NaNs are considered null. + + Returns + ------- + is_null : Expression + """ + def is_nan(self) -> Expression: + """ + Check whether the expression is NaN. + + This creates a new expression equivalent to calling the + `is_nan` compute function on this expression. + + Returns + ------- + is_nan : Expression + """ + def cast( + self, type: lib.DataType, safe: bool = True, options: CastOptions | None = None + ) -> Expression: + """ + Explicitly set or change the expression's data type. + + This creates a new expression equivalent to calling the + `cast` compute function on this expression. + + Parameters + ---------- + type : DataType, default None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + cast : Expression + """ + def isin(self, values: lib.Array | Iterable) -> Expression: + """ + Check whether the expression is contained in values. + + This creates a new expression equivalent to calling the + `is_in` compute function on this expression. + + Parameters + ---------- + values : Array or iterable + The values to check for. + + Returns + ------- + isin : Expression + A new expression that, when evaluated, checks whether + this expression's value is contained in `values`. + """ + +# ==================== _compute.py ==================== diff --git a/python/pyarrow-stubs/_dataset.pyi b/python/pyarrow-stubs/_dataset.pyi new file mode 100644 index 00000000000..03e7762b6df --- /dev/null +++ b/python/pyarrow-stubs/_dataset.pyi @@ -0,0 +1,2300 @@ +# import sys +# +# if sys.version_info >= (3, 11): +# from typing import Self +# else: +# from typing_extensions import Self +# from typing import ( +# IO, +# Any, +# Callable, +# Generic, +# Iterator, +# Literal, +# NamedTuple, +# TypeVar, +# overload, +# ) +# +# from _typeshed import StrPath +# +# from . 
import _csv, _json, _parquet, lib +# from ._fs import FileSelector, FileSystem, SupportedFileSystem +# from ._stubs_typing import Indices, JoinType, Order +# from .acero import ExecNodeOptions +# from .compute import Expression +# from .ipc import IpcWriteOptions, RecordBatchReader +# +# class Dataset(lib._Weakrefable): +# """ +# Collection of data fragments and potentially child datasets. +# +# Arrow Datasets allow you to query against data that has been split across +# multiple files. This sharding of data may indicate partitioning, which +# can accelerate queries that only touch some partitions (files). +# """ +# +# @property +# def partition_expression(self) -> Expression: +# """ +# An Expression which evaluates to true for all data viewed by this +# Dataset. +# """ +# def replace_schema(self, schema: lib.Schema) -> None: +# """ +# Return a copy of this Dataset with a different schema. +# +# The copy will view the same Fragments. If the new schema is not +# compatible with the original dataset's schema then an error will +# be raised. +# +# Parameters +# ---------- +# schema : Schema +# The new dataset schema. +# """ +# def get_fragments(self, filter: Expression | None = None): +# """Returns an iterator over the fragments in this dataset. +# +# Parameters +# ---------- +# filter : Expression, default None +# Return fragments matching the optional filter, either using the +# partition_expression or internal information like Parquet's +# statistics. +# +# Returns +# ------- +# fragments : iterator of Fragment +# """ +# def scanner( +# self, +# columns: list[str] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> Scanner: +# """ +# Build a scan operation against the dataset. +# +# Data is not loaded immediately. Instead, this produces a Scanner, +# which exposes further operations (e.g. loading all data as a +# table, counting rows). +# +# See the :meth:`Scanner.from_dataset` method for further information. +# +# Parameters +# ---------- +# columns : list of str, default None +# The columns to project. This can be a list of column names to +# include (order and duplicates will be preserved), or a dictionary +# with {new_column_name: expression} values for more advanced +# projections. +# +# The list of columns or expressions may use the special fields +# `__batch_index` (the index of the batch within the fragment), +# `__fragment_index` (the index of the fragment within the dataset), +# `__last_in_fragment` (whether the batch is last in fragment), and +# `__filename` (the name of the source file or a description of the +# source fragment). +# +# The columns will be passed down to Datasets and corresponding data +# fragments to avoid loading, copying, and deserializing columns +# that will not be required further down the compute chain. +# By default all of the available columns are projected. Raises +# an exception if any of the referenced column names does not exist +# in the dataset's Schema. +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. 
+# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. +# +# Returns +# ------- +# scanner : Scanner +# +# Examples +# -------- +# >>> import pyarrow as pa +# >>> table = pa.table( +# ... { +# ... "year": [2020, 2022, 2021, 2022, 2019, 2021], +# ... "n_legs": [2, 2, 4, 4, 5, 100], +# ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], +# ... } +# ... ) +# >>> +# >>> import pyarrow.parquet as pq +# >>> pq.write_table(table, "dataset_scanner.parquet") +# +# >>> import pyarrow.dataset as ds +# >>> dataset = ds.dataset("dataset_scanner.parquet") +# +# Selecting a subset of the columns: +# +# >>> dataset.scanner(columns=["year", "n_legs"]).to_table() +# pyarrow.Table +# year: int64 +# n_legs: int64 +# ---- +# year: [[2020,2022,2021,2022,2019,2021]] +# n_legs: [[2,2,4,4,5,100]] +# +# Projecting selected columns using an expression: +# +# >>> dataset.scanner( +# ... columns={ +# ... "n_legs_uint": ds.field("n_legs").cast("uint8"), +# ... } +# ... ).to_table() +# pyarrow.Table +# n_legs_uint: uint8 +# ---- +# n_legs_uint: [[2,2,4,4,5,100]] +# +# Filtering rows while scanning: +# +# >>> dataset.scanner(filter=ds.field("year") > 2020).to_table() +# pyarrow.Table +# year: int64 +# n_legs: int64 +# animal: string +# ---- +# year: [[2022,2021,2022,2021]] +# n_legs: [[2,4,4,100]] +# animal: [["Parrot","Dog","Horse","Centipede"]] +# """ +# def to_batches( +# self, +# columns: list[str] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> Iterator[lib.RecordBatch]: +# """ +# Read the dataset as materialized record batches. +# +# Parameters +# ---------- +# columns : list of str, default None +# The columns to project. This can be a list of column names to +# include (order and duplicates will be preserved), or a dictionary +# with {new_column_name: expression} values for more advanced +# projections. 
+# +# The list of columns or expressions may use the special fields +# `__batch_index` (the index of the batch within the fragment), +# `__fragment_index` (the index of the fragment within the dataset), +# `__last_in_fragment` (whether the batch is last in fragment), and +# `__filename` (the name of the source file or a description of the +# source fragment). +# +# The columns will be passed down to Datasets and corresponding data +# fragments to avoid loading, copying, and deserializing columns +# that will not be required further down the compute chain. +# By default all of the available columns are projected. Raises +# an exception if any of the referenced column names does not exist +# in the dataset's Schema. +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. +# +# Returns +# ------- +# record_batches : iterator of RecordBatch +# """ +# def to_table( +# self, +# columns: list[str] | dict[str, Expression] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> lib.Table: +# """ +# Read the dataset to an Arrow table. +# +# Note that this method reads all the selected data from the dataset +# into memory. +# +# Parameters +# ---------- +# columns : list of str, default None +# The columns to project. This can be a list of column names to +# include (order and duplicates will be preserved), or a dictionary +# with {new_column_name: expression} values for more advanced +# projections. +# +# The list of columns or expressions may use the special fields +# `__batch_index` (the index of the batch within the fragment), +# `__fragment_index` (the index of the fragment within the dataset), +# `__last_in_fragment` (whether the batch is last in fragment), and +# `__filename` (the name of the source file or a description of the +# source fragment). 
+# +# The columns will be passed down to Datasets and corresponding data +# fragments to avoid loading, copying, and deserializing columns +# that will not be required further down the compute chain. +# By default all of the available columns are projected. Raises +# an exception if any of the referenced column names does not exist +# in the dataset's Schema. +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. +# +# Returns +# ------- +# table : Table +# """ +# def take( +# self, +# indices: Indices, +# columns: list[str] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> lib.Table: +# """ +# Select rows of data by index. +# +# Parameters +# ---------- +# indices : Array or array-like +# indices of rows to select in the dataset. +# columns : list of str, default None +# The columns to project. This can be a list of column names to +# include (order and duplicates will be preserved), or a dictionary +# with {new_column_name: expression} values for more advanced +# projections. +# +# The list of columns or expressions may use the special fields +# `__batch_index` (the index of the batch within the fragment), +# `__fragment_index` (the index of the fragment within the dataset), +# `__last_in_fragment` (whether the batch is last in fragment), and +# `__filename` (the name of the source file or a description of the +# source fragment). +# +# The columns will be passed down to Datasets and corresponding data +# fragments to avoid loading, copying, and deserializing columns +# that will not be required further down the compute chain. +# By default all of the available columns are projected. Raises +# an exception if any of the referenced column names does not exist +# in the dataset's Schema. +# filter : Expression, default None +# Scan will return only the rows matching the filter. 
+# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. +# +# Returns +# ------- +# table : Table +# """ +# def head( +# self, +# num_rows: int, +# columns: list[str] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> lib.Table: +# """ +# Load the first N rows of the dataset. +# +# Parameters +# ---------- +# num_rows : int +# The number of rows to load. +# columns : list of str, default None +# The columns to project. This can be a list of column names to +# include (order and duplicates will be preserved), or a dictionary +# with {new_column_name: expression} values for more advanced +# projections. +# +# The list of columns or expressions may use the special fields +# `__batch_index` (the index of the batch within the fragment), +# `__fragment_index` (the index of the fragment within the dataset), +# `__last_in_fragment` (whether the batch is last in fragment), and +# `__filename` (the name of the source file or a description of the +# source fragment). +# +# The columns will be passed down to Datasets and corresponding data +# fragments to avoid loading, copying, and deserializing columns +# that will not be required further down the compute chain. +# By default all of the available columns are projected. Raises +# an exception if any of the referenced column names does not exist +# in the dataset's Schema. +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. 
This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. +# +# Returns +# ------- +# table : Table +# """ +# def count_rows( +# self, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> int: +# """ +# Count rows matching the scanner filter. +# +# Parameters +# ---------- +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. +# +# Returns +# ------- +# count : int +# """ +# @property +# def schema(self) -> lib.Schema: +# """The common schema of the full Dataset""" +# def filter(self, expression: Expression) -> Self: +# """ +# Apply a row filter to the dataset. +# +# Parameters +# ---------- +# expression : Expression +# The filter that should be applied to the dataset. +# +# Returns +# ------- +# Dataset +# """ +# def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> InMemoryDataset: +# """ +# Sort the Dataset by one or multiple columns. 
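+# (Editorial usage sketch, not part of the original stub: the scan helpers
+# documented above, assuming pyarrow.dataset's runtime behaviour.)
+#
+#     import pyarrow as pa
+#     import pyarrow.dataset as ds
+#
+#     dataset = ds.dataset(pa.table({"year": [2019, 2021, 2022], "n": [5, 100, 2]}))
+#     dataset.count_rows(filter=ds.field("year") > 2020)
+#     dataset.head(2, columns=["year"])
+#     dataset.filter(ds.field("n") < 50).to_table()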
+# +# Parameters +# ---------- +# sorting : str or list[tuple(name, order)] +# Name of the column to use to sort (ascending), or +# a list of multiple sorting conditions where +# each entry is a tuple with column name +# and sorting order ("ascending" or "descending") +# **kwargs : dict, optional +# Additional sorting options. +# As allowed by :class:`SortOptions` +# +# Returns +# ------- +# InMemoryDataset +# A new dataset sorted according to the sort keys. +# """ +# def join( +# self, +# right_dataset: Dataset, +# keys: str | list[str], +# right_keys: str | list[str] | None = None, +# join_type: JoinType = "left outer", +# left_suffix: str | None = None, +# right_suffix: str | None = None, +# coalesce_keys: bool = True, +# use_threads: bool = True, +# ) -> InMemoryDataset: +# """ +# Perform a join between this dataset and another one. +# +# Result of the join will be a new dataset, where further +# operations can be applied. +# +# Parameters +# ---------- +# right_dataset : dataset +# The dataset to join to the current one, acting as the right dataset +# in the join operation. +# keys : str or list[str] +# The columns from current dataset that should be used as keys +# of the join operation left side. +# right_keys : str or list[str], default None +# The columns from the right_dataset that should be used as keys +# on the join operation right side. +# When ``None`` use the same key names as the left dataset. +# join_type : str, default "left outer" +# The kind of join that should be performed, one of +# ("left semi", "right semi", "left anti", "right anti", +# "inner", "left outer", "right outer", "full outer") +# left_suffix : str, default None +# Which suffix to add to right column names. This prevents confusion +# when the columns in left and right datasets have colliding names. +# right_suffix : str, default None +# Which suffix to add to the left column names. This prevents confusion +# when the columns in left and right datasets have colliding names. +# coalesce_keys : bool, default True +# If the duplicated keys should be omitted from one of the sides +# in the join result. +# use_threads : bool, default True +# Whenever to use multithreading or not. +# +# Returns +# ------- +# InMemoryDataset +# """ +# def join_asof( +# self, +# right_dataset: Dataset, +# on: str, +# by: str | list[str], +# tolerance: int, +# right_on: str | list[str] | None = None, +# right_by: str | list[str] | None = None, +# ) -> InMemoryDataset: +# """ +# Perform an asof join between this dataset and another one. +# +# This is similar to a left-join except that we match on nearest key rather +# than equal keys. Both datasets must be sorted by the key. This type of join +# is most useful for time series data that are not perfectly aligned. +# +# Optionally match on equivalent keys with "by" before searching with "on". +# +# Result of the join will be a new Dataset, where further +# operations can be applied. +# +# Parameters +# ---------- +# right_dataset : dataset +# The dataset to join to the current one, acting as the right dataset +# in the join operation. +# on : str +# The column from current dataset that should be used as the "on" key +# of the join operation left side. +# +# An inexact match is used on the "on" key, i.e. a row is considered a +# match if and only if left_on - tolerance <= right_on <= left_on. +# +# The input table must be sorted by the "on" key. Must be a single +# field of a common type. +# +# Currently, the "on" key must be an integer, date, or timestamp type. 
+# by : str or list[str] +# The columns from current dataset that should be used as the keys +# of the join operation left side. The join operation is then done +# only for the matches in these columns. +# tolerance : int +# The tolerance for inexact "on" key matching. A right row is considered +# a match with the left row `right.on - left.on <= tolerance`. The +# `tolerance` may be: +# +# - negative, in which case a past-as-of-join occurs; +# - or positive, in which case a future-as-of-join occurs; +# - or zero, in which case an exact-as-of-join occurs. +# +# The tolerance is interpreted in the same units as the "on" key. +# right_on : str or list[str], default None +# The columns from the right_dataset that should be used as the on key +# on the join operation right side. +# When ``None`` use the same key name as the left dataset. +# right_by : str or list[str], default None +# The columns from the right_dataset that should be used as by keys +# on the join operation right side. +# When ``None`` use the same key names as the left dataset. +# +# Returns +# ------- +# InMemoryDataset +# """ +# +# class InMemoryDataset(Dataset): +# """ +# A Dataset wrapping in-memory data. +# +# Parameters +# ---------- +# source : RecordBatch, Table, list, tuple +# The data for this dataset. Can be a RecordBatch, Table, list of +# RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader +# If an iterable is provided, the schema must also be provided. +# schema : Schema, optional +# Only required if passing an iterable as the source +# """ +# +# class UnionDataset(Dataset): +# """ +# A Dataset wrapping child datasets. +# +# Children's schemas must agree with the provided schema. +# +# Parameters +# ---------- +# schema : Schema +# A known schema to conform to. +# children : list of Dataset +# One or more input children +# """ +# +# @property +# def children(self) -> list[Dataset]: ... +# +# class FileSystemDataset(Dataset): +# """ +# A Dataset of file fragments. +# +# A FileSystemDataset is composed of one or more FileFragment. +# +# Parameters +# ---------- +# fragments : list[Fragments] +# List of fragments to consume. +# schema : Schema +# The top-level schema of the Dataset. +# format : FileFormat +# File format of the fragments, currently only ParquetFileFormat, +# IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. +# filesystem : FileSystem +# FileSystem of the fragments. +# root_partition : Expression, optional +# The top-level partition of the DataDataset. +# """ +# +# def __init__( +# self, +# fragments: list[Fragment], +# schema: lib.Schema, +# format: FileFormat, +# filesystem: SupportedFileSystem | None = None, +# root_partition: Expression | None = None, +# ) -> None: ... +# @classmethod +# def from_paths( +# cls, +# paths: list[str], +# schema: lib.Schema | None = None, +# format: FileFormat | None = None, +# filesystem: SupportedFileSystem | None = None, +# partitions: list[Expression] | None = None, +# root_partition: Expression | None = None, +# ) -> FileSystemDataset: +# """ +# A Dataset created from a list of paths on a particular filesystem. +# +# Parameters +# ---------- +# paths : list of str +# List of file paths to create the fragments from. +# schema : Schema +# The top-level schema of the DataDataset. +# format : FileFormat +# File format to create fragments from, currently only +# ParquetFileFormat, IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. +# filesystem : FileSystem +# The filesystem which files are from. 
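+# For example, a ``pyarrow.fs.LocalFileSystem`` or ``pyarrow.fs.S3FileSystem``
+# instance (illustrative choices; any supported FileSystem works).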
+# partitions : list[Expression], optional +# Attach additional partition information for the file paths. +# root_partition : Expression, optional +# The top-level partition of the DataDataset. +# """ +# @property +# def filesystem(self) -> FileSystem: ... +# @property +# def partitioning(self) -> Partitioning | None: +# """ +# The partitioning of the Dataset source, if discovered. +# +# If the FileSystemDataset is created using the ``dataset()`` factory +# function with a partitioning specified, this will return the +# finalized Partitioning object from the dataset discovery. In all +# other cases, this returns None. +# """ +# @property +# def files(self) -> list[str]: +# """List of the files""" +# @property +# def format(self) -> FileFormat: +# """The FileFormat of this source.""" +# +# class FileWriteOptions(lib._Weakrefable): +# @property +# def format(self) -> FileFormat: ... +# +# class FileFormat(lib._Weakrefable): +# def inspect( +# self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None +# ) -> lib.Schema: +# """ +# Infer the schema of a file. +# +# Parameters +# ---------- +# file : file-like object, path-like or str +# The file or file path to infer a schema from. +# filesystem : Filesystem, optional +# If `filesystem` is given, `file` must be a string and specifies +# the path of the file to read from the filesystem. +# +# Returns +# ------- +# schema : Schema +# The schema inferred from the file +# """ +# def make_fragment( +# self, +# file: StrPath | IO, +# filesystem: SupportedFileSystem | None = None, +# partition_expression: Expression | None = None, +# *, +# file_size: int | None = None, +# ) -> Fragment: +# """ +# Make a FileFragment from a given file. +# +# Parameters +# ---------- +# file : file-like object, path-like or str +# The file or file path to make a fragment from. +# filesystem : Filesystem, optional +# If `filesystem` is given, `file` must be a string and specifies +# the path of the file to read from the filesystem. +# partition_expression : Expression, optional +# An expression that is guaranteed true for all rows in the fragment. Allows +# fragment to be potentially skipped while scanning with a filter. +# file_size : int, optional +# The size of the file in bytes. Can improve performance with high-latency filesystems +# when file size needs to be known before reading. +# +# Returns +# ------- +# fragment : Fragment +# The file fragment +# """ +# def make_write_options(self) -> FileWriteOptions: ... +# @property +# def default_extname(self) -> str: ... +# @property +# def default_fragment_scan_options(self) -> FragmentScanOptions: ... +# @default_fragment_scan_options.setter +# def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... +# +# class Fragment(lib._Weakrefable): +# """Fragment of data from a Dataset.""" +# @property +# def physical_schema(self) -> lib.Schema: +# """Return the physical schema of this Fragment. This schema can be +# different from the dataset read schema.""" +# @property +# def partition_expression(self) -> Expression: +# """An Expression which evaluates to true for all data viewed by this +# Fragment. 
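+#
+# For example, a file fragment discovered under a Hive-style path such as
+# ``year=2020/part-0.parquet`` (an illustrative path) typically carries the
+# expression ``field("year") == 2020``.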
+# """ +# def scanner( +# self, +# schema: lib.Schema | None = None, +# columns: list[str] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> Scanner: +# """ +# Build a scan operation against the fragment. +# +# Data is not loaded immediately. Instead, this produces a Scanner, +# which exposes further operations (e.g. loading all data as a +# table, counting rows). +# +# Parameters +# ---------- +# schema : Schema +# Schema to use for scanning. This is used to unify a Fragment to +# its Dataset's schema. If not specified this will use the +# Fragment's physical schema which might differ for each Fragment. +# columns : list of str, default None +# The columns to project. This can be a list of column names to +# include (order and duplicates will be preserved), or a dictionary +# with {new_column_name: expression} values for more advanced +# projections. +# +# The list of columns or expressions may use the special fields +# `__batch_index` (the index of the batch within the fragment), +# `__fragment_index` (the index of the fragment within the dataset), +# `__last_in_fragment` (whether the batch is last in fragment), and +# `__filename` (the name of the source file or a description of the +# source fragment). +# +# The columns will be passed down to Datasets and corresponding data +# fragments to avoid loading, copying, and deserializing columns +# that will not be required further down the compute chain. +# By default all of the available columns are projected. Raises +# an exception if any of the referenced column names does not exist +# in the dataset's Schema. +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. 
+# +# Returns +# ------- +# scanner : Scanner +# """ +# def to_batches( +# self, +# schema: lib.Schema | None = None, +# columns: list[str] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> Iterator[lib.RecordBatch]: +# """ +# Read the fragment as materialized record batches. +# +# Parameters +# ---------- +# schema : Schema, optional +# Concrete schema to use for scanning. +# columns : list of str, default None +# The columns to project. This can be a list of column names to +# include (order and duplicates will be preserved), or a dictionary +# with {new_column_name: expression} values for more advanced +# projections. +# +# The list of columns or expressions may use the special fields +# `__batch_index` (the index of the batch within the fragment), +# `__fragment_index` (the index of the fragment within the dataset), +# `__last_in_fragment` (whether the batch is last in fragment), and +# `__filename` (the name of the source file or a description of the +# source fragment). +# +# The columns will be passed down to Datasets and corresponding data +# fragments to avoid loading, copying, and deserializing columns +# that will not be required further down the compute chain. +# By default all of the available columns are projected. Raises +# an exception if any of the referenced column names does not exist +# in the dataset's Schema. +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. 
+# +# Returns +# ------- +# record_batches : iterator of RecordBatch +# """ +# def to_table( +# self, +# schema: lib.Schema | None = None, +# columns: list[str] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> lib.Table: +# """ +# Convert this Fragment into a Table. +# +# Use this convenience utility with care. This will serially materialize +# the Scan result in memory before creating the Table. +# +# Parameters +# ---------- +# schema : Schema, optional +# Concrete schema to use for scanning. +# columns : list of str, default None +# The columns to project. This can be a list of column names to +# include (order and duplicates will be preserved), or a dictionary +# with {new_column_name: expression} values for more advanced +# projections. +# +# The list of columns or expressions may use the special fields +# `__batch_index` (the index of the batch within the fragment), +# `__fragment_index` (the index of the fragment within the dataset), +# `__last_in_fragment` (whether the batch is last in fragment), and +# `__filename` (the name of the source file or a description of the +# source fragment). +# +# The columns will be passed down to Datasets and corresponding data +# fragments to avoid loading, copying, and deserializing columns +# that will not be required further down the compute chain. +# By default all of the available columns are projected. Raises +# an exception if any of the referenced column names does not exist +# in the dataset's Schema. +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. 
+# +# Returns +# ------- +# table : Table +# """ +# def take( +# self, +# indices: Indices, +# columns: list[str] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> lib.Table: +# """ +# Select rows of data by index. +# +# Parameters +# ---------- +# indices : Array or array-like +# The indices of row to select in the dataset. +# columns : list of str, default None +# The columns to project. This can be a list of column names to +# include (order and duplicates will be preserved), or a dictionary +# with {new_column_name: expression} values for more advanced +# projections. +# +# The list of columns or expressions may use the special fields +# `__batch_index` (the index of the batch within the fragment), +# `__fragment_index` (the index of the fragment within the dataset), +# `__last_in_fragment` (whether the batch is last in fragment), and +# `__filename` (the name of the source file or a description of the +# source fragment). +# +# The columns will be passed down to Datasets and corresponding data +# fragments to avoid loading, copying, and deserializing columns +# that will not be required further down the compute chain. +# By default all of the available columns are projected. Raises +# an exception if any of the referenced column names does not exist +# in the dataset's Schema. +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. +# +# Returns +# ------- +# Table +# """ +# def head( +# self, +# num_rows: int, +# columns: list[str] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> lib.Table: +# """ +# Load the first N rows of the fragment. 
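+#
+# For example, ``fragment.head(5)`` (an illustrative call) materializes at
+# most the first five rows of this fragment as a Table.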
+# +# Parameters +# ---------- +# num_rows : int +# The number of rows to load. +# columns : list of str, default None +# The columns to project. This can be a list of column names to +# include (order and duplicates will be preserved), or a dictionary +# with {new_column_name: expression} values for more advanced +# projections. +# +# The list of columns or expressions may use the special fields +# `__batch_index` (the index of the batch within the fragment), +# `__fragment_index` (the index of the fragment within the dataset), +# `__last_in_fragment` (whether the batch is last in fragment), and +# `__filename` (the name of the source file or a description of the +# source fragment). +# +# The columns will be passed down to Datasets and corresponding data +# fragments to avoid loading, copying, and deserializing columns +# that will not be required further down the compute chain. +# By default all of the available columns are projected. Raises +# an exception if any of the referenced column names does not exist +# in the dataset's Schema. +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. +# +# Returns +# ------- +# Table +# """ +# def count_rows( +# self, +# columns: list[str] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> int: +# """ +# Count rows matching the scanner filter. +# +# Parameters +# ---------- +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. 
+# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. +# +# Returns +# ------- +# count : int +# """ +# +# class FileFragment(Fragment): +# """A Fragment representing a data file.""" +# +# def open(self) -> lib.NativeFile: +# """ +# Open a NativeFile of the buffer or file viewed by this fragment. +# """ +# @property +# def path(self) -> str: +# """ +# The path of the data file viewed by this fragment, if it views a +# file. If instead it views a buffer, this will be "". +# """ +# @property +# def filesystem(self) -> FileSystem: +# """ +# The FileSystem containing the data file viewed by this fragment, if +# it views a file. If instead it views a buffer, this will be None. +# """ +# @property +# def buffer(self) -> lib.Buffer: +# """ +# The buffer viewed by this fragment, if it views a buffer. If +# instead it views a file, this will be None. +# """ +# @property +# def format(self) -> FileFormat: +# """ +# The format of the data file viewed by this fragment. +# """ +# +# class FragmentScanOptions(lib._Weakrefable): +# """Scan options specific to a particular fragment and scan operation.""" +# +# @property +# def type_name(self) -> str: ... +# +# class IpcFileWriteOptions(FileWriteOptions): +# @property +# def write_options(self) -> IpcWriteOptions: ... +# @write_options.setter +# def write_options(self, write_options: IpcWriteOptions) -> None: ... +# +# class IpcFileFormat(FileFormat): +# def equals(self, other: IpcFileFormat) -> bool: ... +# def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ... +# @property +# def default_extname(self) -> str: ... +# +# class FeatherFileFormat(IpcFileFormat): ... +# +# class CsvFileFormat(FileFormat): +# """ +# FileFormat for CSV files. +# +# Parameters +# ---------- +# parse_options : pyarrow.csv.ParseOptions +# Options regarding CSV parsing. +# default_fragment_scan_options : CsvFragmentScanOptions +# Default options for fragments scan. +# convert_options : pyarrow.csv.ConvertOptions +# Options regarding value conversion. +# read_options : pyarrow.csv.ReadOptions +# General read options. +# """ +# def __init__( +# self, +# parse_options: _csv.ParseOptions | None = None, +# default_fragment_scan_options: CsvFragmentScanOptions | None = None, +# convert_options: _csv.ConvertOptions | None = None, +# read_options: _csv.ReadOptions | None = None, +# ) -> None: ... +# def make_write_options(self) -> _csv.WriteOptions: ... # type: ignore[override] +# @property +# def parse_options(self) -> _csv.ParseOptions: ... +# @parse_options.setter +# def parse_options(self, parse_options: _csv.ParseOptions) -> None: ... 
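+# # A minimal usage sketch of this format class (the ``data/`` directory and
+# # the ``;`` delimiter are illustrative assumptions, not part of this API):
+# #
+# #     import pyarrow.csv as csv
+# #     import pyarrow.dataset as ds
+# #
+# #     fmt = ds.CsvFileFormat(parse_options=csv.ParseOptions(delimiter=";"))
+# #     dataset = ds.dataset("data/", format=fmt)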
+# def equals(self, other: CsvFileFormat) -> bool: ... +# +# class CsvFragmentScanOptions(FragmentScanOptions): +# """ +# Scan-specific options for CSV fragments. +# +# Parameters +# ---------- +# convert_options : pyarrow.csv.ConvertOptions +# Options regarding value conversion. +# read_options : pyarrow.csv.ReadOptions +# General read options. +# """ +# +# convert_options: _csv.ConvertOptions +# read_options: _csv.ReadOptions +# +# def __init__( +# self, convert_options: _csv.ConvertOptions, read_options: _csv.ReadOptions +# ) -> None: ... +# def equals(self, other: CsvFragmentScanOptions) -> bool: ... +# +# class CsvFileWriteOptions(FileWriteOptions): +# write_options: _csv.WriteOptions +# +# class JsonFileFormat(FileFormat): +# """ +# FileFormat for JSON files. +# +# Parameters +# ---------- +# default_fragment_scan_options : JsonFragmentScanOptions +# Default options for fragments scan. +# parse_options : pyarrow.json.ParseOptions +# Options regarding json parsing. +# read_options : pyarrow.json.ReadOptions +# General read options. +# """ +# def __init__( +# self, +# default_fragment_scan_options: JsonFragmentScanOptions | None = None, +# parse_options: _json.ParseOptions | None = None, +# read_options: _json.ReadOptions | None = None, +# ) -> None: ... +# def equals(self, other: JsonFileFormat) -> bool: ... +# +# class JsonFragmentScanOptions(FragmentScanOptions): +# """ +# Scan-specific options for JSON fragments. +# +# Parameters +# ---------- +# parse_options : pyarrow.json.ParseOptions +# Options regarding JSON parsing. +# read_options : pyarrow.json.ReadOptions +# General read options. +# """ +# +# parse_options: _json.ParseOptions +# read_options: _json.ReadOptions +# def __init__( +# self, parse_options: _json.ParseOptions, read_options: _json.ReadOptions +# ) -> None: ... +# def equals(self, other: JsonFragmentScanOptions) -> bool: ... +# +# class Partitioning(lib._Weakrefable): +# def parse(self, path: str) -> Expression: +# """ +# Parse a path into a partition expression. +# +# Parameters +# ---------- +# path : str +# +# Returns +# ------- +# pyarrow.dataset.Expression +# """ +# def format(self, expr: Expression) -> tuple[str, str]: +# """ +# Convert a filter expression into a tuple of (directory, filename) using +# the current partitioning scheme +# +# Parameters +# ---------- +# expr : pyarrow.dataset.Expression +# +# Returns +# ------- +# tuple[str, str] +# +# Examples +# -------- +# +# Specify the Schema for paths like "/2009/June": +# +# >>> import pyarrow as pa +# >>> import pyarrow.dataset as ds +# >>> import pyarrow.compute as pc +# >>> part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())])) +# >>> part.format((pc.field("year") == 1862) & (pc.field("month") == "Jan")) +# ('1862/Jan', '') +# """ +# @property +# def schema(self) -> lib.Schema: +# """The arrow Schema attached to the partitioning.""" +# +# class PartitioningFactory(lib._Weakrefable): +# @property +# def type_name(self) -> str: ... +# +# class KeyValuePartitioning(Partitioning): +# @property +# def dictionaries(self) -> list[lib.Array | None]: +# """ +# The unique values for each partition field, if available. +# +# Those values are only available if the Partitioning object was +# created through dataset discovery from a PartitioningFactory, or +# if the dictionaries were manually specified in the constructor. +# If no dictionary field is available, this returns an empty list. 
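+#
+# For example, a partitioning discovered with ``infer_dictionary=True`` over
+# directories named ``year=2020`` and ``year=2021`` (illustrative paths) would
+# typically expose a dictionary array holding those two year values for the
+# ``year`` field.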
+# """ +# +# class DirectoryPartitioning(KeyValuePartitioning): +# """ +# A Partitioning based on a specified Schema. +# +# The DirectoryPartitioning expects one segment in the file path for each +# field in the schema (all fields are required to be present). +# For example given schema the path "/2009/11" would +# be parsed to ("year"_ == 2009 and "month"_ == 11). +# +# Parameters +# ---------- +# schema : Schema +# The schema that describes the partitions present in the file path. +# dictionaries : dict[str, Array] +# If the type of any field of `schema` is a dictionary type, the +# corresponding entry of `dictionaries` must be an array containing +# every value which may be taken by the corresponding column or an +# error will be raised in parsing. +# segment_encoding : str, default "uri" +# After splitting paths into segments, decode the segments. Valid +# values are "uri" (URI-decode segments) and "none" (leave as-is). +# +# Returns +# ------- +# DirectoryPartitioning +# +# Examples +# -------- +# >>> from pyarrow.dataset import DirectoryPartitioning +# >>> partitioning = DirectoryPartitioning( +# ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) +# ... ) +# >>> print(partitioning.parse("/2009/11/")) +# ((year == 2009) and (month == 11)) +# """ +# +# @staticmethod +# def discover( +# field_names: list[str] | None = None, +# infer_dictionary: bool = False, +# max_partition_dictionary_size: int = 0, +# schema: lib.Schema | None = None, +# segment_encoding: Literal["uri", "none"] = "uri", +# ) -> PartitioningFactory: +# """ +# Discover a DirectoryPartitioning. +# +# Parameters +# ---------- +# field_names : list of str +# The names to associate with the values from the subdirectory names. +# If schema is given, will be populated from the schema. +# infer_dictionary : bool, default False +# When inferring a schema for partition fields, yield dictionary +# encoded types instead of plain types. This can be more efficient +# when materializing virtual columns, and Expressions parsed by the +# finished Partitioning will include dictionaries of all unique +# inspected values for each field. +# max_partition_dictionary_size : int, default 0 +# Synonymous with infer_dictionary for backwards compatibility with +# 1.0: setting this to -1 or None is equivalent to passing +# infer_dictionary=True. +# schema : Schema, default None +# Use this schema instead of inferring a schema from partition +# values. Partition values will be validated against this schema +# before accumulation into the Partitioning's dictionary. +# segment_encoding : str, default "uri" +# After splitting paths into segments, decode the segments. Valid +# values are "uri" (URI-decode segments) and "none" (leave as-is). +# +# Returns +# ------- +# PartitioningFactory +# To be used in the FileSystemFactoryOptions. +# """ +# def __init__( +# self, +# schema: lib.Schema, +# dictionaries: dict[str, lib.Array] | None = None, +# segment_encoding: Literal["uri", "none"] = "uri", +# ) -> None: ... +# +# class HivePartitioning(KeyValuePartitioning): +# """ +# A Partitioning for "/$key=$value/" nested directories as found in +# Apache Hive. +# +# Multi-level, directory based partitioning scheme originating from +# Apache Hive with all data files stored in the leaf directories. Data is +# partitioned by static values of a particular column in the schema. +# Partition keys are represented in the form $key=$value in directory names. +# Field order is ignored, as are missing or unrecognized field names. 
+# +# For example, given schema, a possible +# path would be "/year=2009/month=11/day=15". +# +# Parameters +# ---------- +# schema : Schema +# The schema that describes the partitions present in the file path. +# dictionaries : dict[str, Array] +# If the type of any field of `schema` is a dictionary type, the +# corresponding entry of `dictionaries` must be an array containing +# every value which may be taken by the corresponding column or an +# error will be raised in parsing. +# null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" +# If any field is None then this fallback will be used as a label +# segment_encoding : str, default "uri" +# After splitting paths into segments, decode the segments. Valid +# values are "uri" (URI-decode segments) and "none" (leave as-is). +# +# Returns +# ------- +# HivePartitioning +# +# Examples +# -------- +# >>> from pyarrow.dataset import HivePartitioning +# >>> partitioning = HivePartitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())])) +# >>> print(partitioning.parse("/year=2009/month=11/")) +# ((year == 2009) and (month == 11)) +# +# """ +# def __init__( +# self, +# schema: lib.Schema, +# dictionaries: dict[str, lib.Array] | None = None, +# null_fallback: str = "__HIVE_DEFAULT_PARTITION__", +# segment_encoding: Literal["uri", "none"] = "uri", +# ) -> None: ... +# @staticmethod +# def discover( +# infer_dictionary: bool = False, +# max_partition_dictionary_size: int = 0, +# null_fallback="__HIVE_DEFAULT_PARTITION__", +# schema: lib.Schema | None = None, +# segment_encoding: Literal["uri", "none"] = "uri", +# ) -> PartitioningFactory: +# """ +# Discover a HivePartitioning. +# +# Parameters +# ---------- +# infer_dictionary : bool, default False +# When inferring a schema for partition fields, yield dictionary +# encoded types instead of plain. This can be more efficient when +# materializing virtual columns, and Expressions parsed by the +# finished Partitioning will include dictionaries of all unique +# inspected values for each field. +# max_partition_dictionary_size : int, default 0 +# Synonymous with infer_dictionary for backwards compatibility with +# 1.0: setting this to -1 or None is equivalent to passing +# infer_dictionary=True. +# null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" +# When inferring a schema for partition fields this value will be +# replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__ +# for compatibility with Spark +# schema : Schema, default None +# Use this schema instead of inferring a schema from partition +# values. Partition values will be validated against this schema +# before accumulation into the Partitioning's dictionary. +# segment_encoding : str, default "uri" +# After splitting paths into segments, decode the segments. Valid +# values are "uri" (URI-decode segments) and "none" (leave as-is). +# +# Returns +# ------- +# PartitioningFactory +# To be used in the FileSystemFactoryOptions. +# """ +# +# class FilenamePartitioning(KeyValuePartitioning): +# """ +# A Partitioning based on a specified Schema. +# +# The FilenamePartitioning expects one segment in the file name for each +# field in the schema (all fields are required to be present) separated +# by '_'. For example given schema the name +# ``"2009_11_"`` would be parsed to ("year" == 2009 and "month" == 11). +# +# Parameters +# ---------- +# schema : Schema +# The schema that describes the partitions present in the file path. 
+# dictionaries : dict[str, Array] +# If the type of any field of `schema` is a dictionary type, the +# corresponding entry of `dictionaries` must be an array containing +# every value which may be taken by the corresponding column or an +# error will be raised in parsing. +# segment_encoding : str, default "uri" +# After splitting paths into segments, decode the segments. Valid +# values are "uri" (URI-decode segments) and "none" (leave as-is). +# +# Returns +# ------- +# FilenamePartitioning +# +# Examples +# -------- +# >>> from pyarrow.dataset import FilenamePartitioning +# >>> partitioning = FilenamePartitioning( +# ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) +# ... ) +# >>> print(partitioning.parse("2009_11_data.parquet")) +# ((year == 2009) and (month == 11)) +# """ +# +# def __init__( +# self, +# schema: lib.Schema, +# dictionaries: dict[str, lib.Array] | None = None, +# segment_encoding: Literal["uri", "none"] = "uri", +# ) -> None: ... +# @staticmethod +# def discover( +# field_names: list[str] | None = None, +# infer_dictionary: bool = False, +# schema: lib.Schema | None = None, +# segment_encoding: Literal["uri", "none"] = "uri", +# ) -> PartitioningFactory: +# """ +# Discover a FilenamePartitioning. +# +# Parameters +# ---------- +# field_names : list of str +# The names to associate with the values from the subdirectory names. +# If schema is given, will be populated from the schema. +# infer_dictionary : bool, default False +# When inferring a schema for partition fields, yield dictionary +# encoded types instead of plain types. This can be more efficient +# when materializing virtual columns, and Expressions parsed by the +# finished Partitioning will include dictionaries of all unique +# inspected values for each field. +# schema : Schema, default None +# Use this schema instead of inferring a schema from partition +# values. Partition values will be validated against this schema +# before accumulation into the Partitioning's dictionary. +# segment_encoding : str, default "uri" +# After splitting paths into segments, decode the segments. Valid +# values are "uri" (URI-decode segments) and "none" (leave as-is). +# +# Returns +# ------- +# PartitioningFactory +# To be used in the FileSystemFactoryOptions. +# """ +# +# class DatasetFactory(lib._Weakrefable): +# """ +# DatasetFactory is used to create a Dataset, inspect the Schema +# of the fragments contained in it, and declare a partitioning. +# """ +# +# root_partition: Expression +# def finish(self, schema: lib.Schema | None = None) -> Dataset: +# """ +# Create a Dataset using the inspected schema or an explicit schema +# (if given). +# +# Parameters +# ---------- +# schema : Schema, default None +# The schema to conform the source to. If None, the inspected +# schema is used. +# +# Returns +# ------- +# Dataset +# """ +# def inspect(self) -> lib.Schema: +# """ +# Inspect all data fragments and return a common Schema. +# +# Returns +# ------- +# Schema +# """ +# def inspect_schemas(self) -> list[lib.Schema]: ... +# +# class FileSystemFactoryOptions(lib._Weakrefable): +# """ +# Influences the discovery of filesystem paths. +# +# Parameters +# ---------- +# partition_base_dir : str, optional +# For the purposes of applying the partitioning, paths will be +# stripped of the partition_base_dir. Files not matching the +# partition_base_dir prefix will be skipped for partitioning discovery. +# The ignored files will still be part of the Dataset, but will not +# have partition information. 
+# partitioning : Partitioning/PartitioningFactory, optional +# Apply the Partitioning to every discovered Fragment. See Partitioning or +# PartitioningFactory documentation. +# exclude_invalid_files : bool, optional (default True) +# If True, invalid files will be excluded (file format specific check). +# This will incur IO for each files in a serial and single threaded +# fashion. Disabling this feature will skip the IO, but unsupported +# files may be present in the Dataset (resulting in an error at scan +# time). +# selector_ignore_prefixes : list, optional +# When discovering from a Selector (and not from an explicit file list), +# ignore files and directories matching any of these prefixes. +# By default this is ['.', '_']. +# """ +# +# partitioning: Partitioning +# partitioning_factory: PartitioningFactory +# partition_base_dir: str +# exclude_invalid_files: bool +# selector_ignore_prefixes: list[str] +# +# def __init__( +# self, +# artition_base_dir: str | None = None, +# partitioning: Partitioning | PartitioningFactory | None = None, +# exclude_invalid_files: bool = True, +# selector_ignore_prefixes: list[str] | None = None, +# ) -> None: ... +# +# class FileSystemDatasetFactory(DatasetFactory): +# """ +# Create a DatasetFactory from a list of paths with schema inspection. +# +# Parameters +# ---------- +# filesystem : pyarrow.fs.FileSystem +# Filesystem to discover. +# paths_or_selector : pyarrow.fs.FileSelector or list of path-likes +# Either a Selector object or a list of path-like objects. +# format : FileFormat +# Currently only ParquetFileFormat and IpcFileFormat are supported. +# options : FileSystemFactoryOptions, optional +# Various flags influencing the discovery of filesystem paths. +# """ +# +# def __init__( +# self, +# filesystem: SupportedFileSystem, +# paths_or_selector: FileSelector, +# format: FileFormat, +# options: FileSystemFactoryOptions | None = None, +# ) -> None: ... +# +# class UnionDatasetFactory(DatasetFactory): +# """ +# Provides a way to inspect/discover a Dataset's expected schema before +# materialization. +# +# Parameters +# ---------- +# factories : list of DatasetFactory +# """ +# def __init__(self, factories: list[DatasetFactory]) -> None: ... +# +# _RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch) +# +# class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]): +# """An iterator over a sequence of record batches.""" +# def __iter__(self) -> Self: ... +# def __next__(self) -> _RecordBatchT: ... +# +# class TaggedRecordBatch(NamedTuple): +# """ +# A combination of a record batch and the fragment it came from. +# +# Parameters +# ---------- +# record_batch : RecordBatch +# The record batch. +# fragment : Fragment +# Fragment of the record batch. +# """ +# +# record_batch: lib.RecordBatch +# fragment: Fragment +# +# class TaggedRecordBatchIterator(lib._Weakrefable): +# """An iterator over a sequence of record batches with fragments.""" +# def __iter__(self) -> Self: ... +# def __next__(self) -> TaggedRecordBatch: ... +# +# class Scanner(lib._Weakrefable): +# """A materialized scan operation with context and options bound. +# +# A scanner is the class that glues the scan tasks, data fragments and data +# sources together. 
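+#
+# A minimal usage sketch (the ``"data/"`` path and the ``year`` column are
+# illustrative assumptions, not part of this API)::
+#
+#     import pyarrow.compute as pc
+#     import pyarrow.dataset as ds
+#
+#     dataset = ds.dataset("data/", format="parquet")
+#     scanner = ds.Scanner.from_dataset(
+#         dataset, columns=["year"], filter=pc.field("year") > 2000
+#     )
+#     table = scanner.to_table()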
+# """ +# @staticmethod +# def from_dataset( +# dataset: Dataset, +# *, +# columns: list[str] | dict[str, Expression] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> Scanner: +# """ +# Create Scanner from Dataset, +# +# Parameters +# ---------- +# dataset : Dataset +# Dataset to scan. +# columns : list[str] or dict[str, Expression], default None +# The columns to project. This can be a list of column names to +# include (order and duplicates will be preserved), or a dictionary +# with {new_column_name: expression} values for more advanced +# projections. +# +# The list of columns or expressions may use the special fields +# `__batch_index` (the index of the batch within the fragment), +# `__fragment_index` (the index of the fragment within the dataset), +# `__last_in_fragment` (whether the batch is last in fragment), and +# `__filename` (the name of the source file or a description of the +# source fragment). +# +# The columns will be passed down to Datasets and corresponding data +# fragments to avoid loading, copying, and deserializing columns +# that will not be required further down the compute chain. +# By default all of the available columns are projected. Raises +# an exception if any of the referenced column names does not exist +# in the dataset's Schema. +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. 
+# """ +# @staticmethod +# def from_fragment( +# fragment: Fragment, +# *, +# schema: lib.Schema | None = None, +# columns: list[str] | dict[str, Expression] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> Scanner: +# """ +# Create Scanner from Fragment, +# +# Parameters +# ---------- +# fragment : Fragment +# fragment to scan. +# schema : Schema, optional +# The schema of the fragment. +# columns : list[str] or dict[str, Expression], default None +# The columns to project. This can be a list of column names to +# include (order and duplicates will be preserved), or a dictionary +# with {new_column_name: expression} values for more advanced +# projections. +# +# The list of columns or expressions may use the special fields +# `__batch_index` (the index of the batch within the fragment), +# `__fragment_index` (the index of the fragment within the dataset), +# `__last_in_fragment` (whether the batch is last in fragment), and +# `__filename` (the name of the source file or a description of the +# source fragment). +# +# The columns will be passed down to Datasets and corresponding data +# fragments to avoid loading, copying, and deserializing columns +# that will not be required further down the compute chain. +# By default all of the available columns are projected. Raises +# an exception if any of the referenced column names does not exist +# in the dataset's Schema. +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. 
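+#
+# Examples
+# --------
+# A sketch only (assumes ``dataset`` is an already constructed
+# FileSystemDataset; the ``year`` column is illustrative)::
+#
+#     fragment = next(iter(dataset.get_fragments()))
+#     scanner = Scanner.from_fragment(fragment, columns=["year"])
+#     table = scanner.to_table()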
+# """ +# @overload +# @staticmethod +# def from_batches( +# source: Iterator[lib.RecordBatch], +# *, +# schema: lib.Schema, +# columns: list[str] | dict[str, Expression] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> Scanner: ... +# @overload +# @staticmethod +# def from_batches( +# source: RecordBatchReader, +# *, +# columns: list[str] | dict[str, Expression] | None = None, +# filter: Expression | None = None, +# batch_size: int = ..., +# batch_readahead: int = 16, +# fragment_readahead: int = 4, +# fragment_scan_options: FragmentScanOptions | None = None, +# use_threads: bool = True, +# cache_metadata: bool = True, +# memory_pool: lib.MemoryPool | None = None, +# ) -> Scanner: ... +# @staticmethod +# def from_batches(*args, **kwargs): +# """ +# Create a Scanner from an iterator of batches. +# +# This creates a scanner which can be used only once. It is +# intended to support writing a dataset (which takes a scanner) +# from a source which can be read only once (e.g. a +# RecordBatchReader or generator). +# +# Parameters +# ---------- +# source : Iterator or Arrow-compatible stream object +# The iterator of Batches. This can be a pyarrow RecordBatchReader, +# any object that implements the Arrow PyCapsule Protocol for +# streams, or an actual Python iterator of RecordBatches. +# schema : Schema +# The schema of the batches (required when passing a Python +# iterator). +# columns : list[str] or dict[str, Expression], default None +# The columns to project. This can be a list of column names to +# include (order and duplicates will be preserved), or a dictionary +# with {new_column_name: expression} values for more advanced +# projections. +# +# The list of columns or expressions may use the special fields +# `__batch_index` (the index of the batch within the fragment), +# `__fragment_index` (the index of the fragment within the dataset), +# `__last_in_fragment` (whether the batch is last in fragment), and +# `__filename` (the name of the source file or a description of the +# source fragment). +# +# The columns will be passed down to Datasets and corresponding data +# fragments to avoid loading, copying, and deserializing columns +# that will not be required further down the compute chain. +# By default all of the available columns are projected. Raises +# an exception if any of the referenced column names does not exist +# in the dataset's Schema. +# filter : Expression, default None +# Scan will return only the rows matching the filter. +# If possible the predicate will be pushed down to exploit the +# partition information or internal metadata found in the data +# source, e.g. Parquet statistics. Otherwise filters the loaded +# RecordBatches before yielding them. +# batch_size : int, default 131_072 +# The maximum row count for scanned record batches. If scanned +# record batches are overflowing memory then this method can be +# called to reduce their size. +# batch_readahead : int, default 16 +# The number of batches to read ahead in a file. This might not work +# for all file formats. Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_readahead : int, default 4 +# The number of files to read ahead. 
Increasing this number will increase +# RAM usage but could also improve IO utilization. +# fragment_scan_options : FragmentScanOptions, default None +# Options specific to a particular scan and fragment type, which +# can change between different scans of the same dataset. +# use_threads : bool, default True +# If enabled, then maximum parallelism will be used determined by +# the number of available CPU cores. +# cache_metadata : bool, default True +# If enabled, metadata may be cached when scanning to speed up +# repeated scans. +# memory_pool : MemoryPool, default None +# For memory allocations, if required. If not specified, uses the +# default pool. +# """ +# @property +# def dataset_schema(self) -> lib.Schema: +# """The schema with which batches will be read from fragments.""" +# @property +# def projected_schema(self) -> lib.Schema: +# """ +# The materialized schema of the data, accounting for projections. +# +# This is the schema of any data returned from the scanner. +# """ +# def to_batches(self) -> Iterator[lib.RecordBatch]: +# """ +# Consume a Scanner in record batches. +# +# Returns +# ------- +# record_batches : iterator of RecordBatch +# """ +# def scan_batches(self) -> TaggedRecordBatchIterator: +# """ +# Consume a Scanner in record batches with corresponding fragments. +# +# Returns +# ------- +# record_batches : iterator of TaggedRecordBatch +# """ +# def to_table(self) -> lib.Table: +# """ +# Convert a Scanner into a Table. +# +# Use this convenience utility with care. This will serially materialize +# the Scan result in memory before creating the Table. +# +# Returns +# ------- +# Table +# """ +# def take(self, indices: Indices) -> lib.Table: +# """ +# Select rows of data by index. +# +# Will only consume as many batches of the underlying dataset as +# needed. Otherwise, this is equivalent to +# ``to_table().take(indices)``. +# +# Parameters +# ---------- +# indices : Array or array-like +# indices of rows to select in the dataset. +# +# Returns +# ------- +# Table +# """ +# def head(self, num_rows: int) -> lib.Table: +# """ +# Load the first N rows of the dataset. +# +# Parameters +# ---------- +# num_rows : int +# The number of rows to load. +# +# Returns +# ------- +# Table +# """ +# def count_rows(self) -> int: +# """ +# Count rows matching the scanner filter. +# +# Returns +# ------- +# count : int +# """ +# def to_reader(self) -> RecordBatchReader: +# """Consume this scanner as a RecordBatchReader. +# +# Returns +# ------- +# RecordBatchReader +# """ +# +# def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: +# """ +# Extract partition keys (equality constraints between a field and a scalar) +# from an expression as a dict mapping the field's name to its value. +# +# NB: All expressions yielded by a HivePartitioning or DirectoryPartitioning +# will be conjunctions of equality conditions and are accessible through this +# function. Other subexpressions will be ignored. +# +# Parameters +# ---------- +# partition_expression : pyarrow.dataset.Expression +# +# Returns +# ------- +# dict +# +# Examples +# -------- +# +# For example, an expression of +# +# is converted to {'part': 'A', 'year': 2016} +# """ +# +# class WrittenFile(lib._Weakrefable): +# """ +# Metadata information about files written as +# part of a dataset write operation +# +# Parameters +# ---------- +# path : str +# Path to the file. +# metadata : pyarrow.parquet.FileMetaData, optional +# For Parquet files, the Parquet file metadata. 
+# size : int +# The size of the file in bytes. +# """ +# def __init__(self, path: str, metadata: _parquet.FileMetaData | None, size: int) -> None: ... +# +# def _filesystemdataset_write( +# data: Scanner, +# base_dir: StrPath, +# basename_template: str, +# filesystem: SupportedFileSystem, +# partitioning: Partitioning, +# file_options: FileWriteOptions, +# max_partitions: int, +# file_visitor: Callable[[str], None], +# existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"], +# max_open_files: int, +# max_rows_per_file: int, +# min_rows_per_group: int, +# max_rows_per_group: int, +# create_dir: bool, +# ): ... +# +# class _ScanNodeOptions(ExecNodeOptions): +# def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ... +# +# class ScanNodeOptions(_ScanNodeOptions): +# """ +# A Source node which yields batches from a Dataset scan. +# +# This is the option class for the "scan" node factory. +# +# This node is capable of applying pushdown projections or filters +# to the file readers which reduce the amount of data that needs to +# be read (if supported by the file format). But note that this does not +# construct associated filter or project nodes to perform the final +# filtering or projection. Rather, you may supply the same filter +# expression or projection to the scan node that you also supply +# to the filter or project node. +# +# Yielded batches will be augmented with fragment/batch indices when +# implicit_ordering=True to enable stable ordering for simple ExecPlans. +# +# Parameters +# ---------- +# dataset : pyarrow.dataset.Dataset +# The table which acts as the data source. +# **kwargs : dict, optional +# Scan options. See `Scanner.from_dataset` for possible arguments. +# require_sequenced_output : bool, default False +# Batches are yielded sequentially, like single-threaded +# implicit_ordering : bool, default False +# Preserve implicit ordering of data. +# """ +# +# def __init__( +# self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs +# ) -> None: ... diff --git a/python/pyarrow-stubs/_ipc.pyi b/python/pyarrow-stubs/_ipc.pyi new file mode 100644 index 00000000000..fc48cae3c04 --- /dev/null +++ b/python/pyarrow-stubs/_ipc.pyi @@ -0,0 +1,709 @@ +import enum +import sys + +from io import IOBase + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Iterable, Iterator, Literal, Mapping, NamedTuple + +import pandas as pd + +from pyarrow._stubs_typing import SupportArrowStream, SupportPyBuffer +from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable + +from .io import Buffer, Codec, NativeFile +from ._types import DictionaryMemo, KeyValueMetadata + +class MetadataVersion(enum.IntEnum): + V1 = enum.auto() + V2 = enum.auto() + V3 = enum.auto() + V4 = enum.auto() + V5 = enum.auto() + +class WriteStats(NamedTuple): + """IPC write statistics + + Parameters + ---------- + num_messages : int + Number of messages. + num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. + num_replaced_dictionaries : int + Number of replaced dictionaries. + """ + + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + +class ReadStats(NamedTuple): + """IPC read statistics + + Parameters + ---------- + num_messages : int + Number of messages. 
+ num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. + num_replaced_dictionaries : int + Number of replaced dictionaries. + """ + + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + +class IpcReadOptions(_Weakrefable): + """ + Serialization options for reading IPC format. + + Parameters + ---------- + ensure_native_endian : bool, default True + Whether to convert incoming data to platform-native endianness. + use_threads : bool + Whether to use the global CPU thread pool to parallelize any + computational tasks like decompression + included_fields : list + If empty (the default), return all deserialized fields. + If non-empty, the values are the indices of fields to read on + the top-level schema + """ + + ensure_native_endian: bool + use_threads: bool + included_fields: list[int] + def __init__( + self, + *, + ensure_native_endian: bool = True, + use_threads: bool = True, + included_fields: list[int] | None = None, + ) -> None: ... + +class IpcWriteOptions(_Weakrefable): + """ + Serialization options for the IPC format. + + Parameters + ---------- + metadata_version : MetadataVersion, default MetadataVersion.V5 + The metadata version to write. V5 is the current and latest, + V4 is the pre-1.0 metadata version (with incompatible Union layout). + allow_64bit : bool, default False + If true, allow field lengths that don't fit in a signed 32-bit int. + use_legacy_format : bool, default False + Whether to use the pre-Arrow 0.15 IPC format. + compression : str, Codec, or None + compression codec to use for record batch buffers. + If None then batch buffers will be uncompressed. + Must be "lz4", "zstd" or None. + To specify a compression_level use `pyarrow.Codec` + use_threads : bool + Whether to use the global CPU thread pool to parallelize any + computational tasks like compression. + emit_dictionary_deltas : bool + Whether to emit dictionary deltas. Default is false for maximum + stream compatibility. + unify_dictionaries : bool + If true then calls to write_table will attempt to unify dictionaries + across all batches in the table. This can help avoid the need for + replacement dictionaries (which the file format does not support) + but requires computing the unified dictionary and then remapping + the indices arrays. + + This parameter is ignored when writing to the IPC stream format as + the IPC stream format can support replacement dictionaries. + """ + + metadata_version: MetadataVersion + allow_64bit: bool + use_legacy_format: bool + compression: Codec | Literal["lz4", "zstd"] | None + use_threads: bool + emit_dictionary_deltas: bool + unify_dictionaries: bool + def __init__( + self, + *, + metadata_version: MetadataVersion = MetadataVersion.V5, + allow_64bit: bool = False, + use_legacy_format: bool = False, + compression: Codec | Literal["lz4", "zstd"] | None = None, + use_threads: bool = True, + emit_dictionary_deltas: bool = False, + unify_dictionaries: bool = False, + ) -> None: ... + +class Message(_Weakrefable): + """ + Container for an Arrow IPC message with metadata and optional body + """ + + @property + def type(self) -> str: ... + @property + def metadata(self) -> Buffer: ... + @property + def metadata_version(self) -> MetadataVersion: ... + @property + def body(self) -> Buffer | None: ... + def equals(self, other: Message) -> bool: ... 
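+    # A minimal usage sketch, assuming ``buf`` is a Buffer holding one
+    # encapsulated IPC message (for example produced by ``serialize()``):
+    #
+    #   import pyarrow as pa
+    #   msg = pa.ipc.read_message(buf)   # -> Message
+    #   msg.type                         # e.g. "record batch"
+    #   msg.serialize()                  # round-trip back to a Buffer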
+ def serialize_to( + self, sink: NativeFile, alignment: int = 8, memory_pool: MemoryPool | None = None + ): + """ + Write message to generic OutputStream + + Parameters + ---------- + sink : NativeFile + alignment : int, default 8 + Byte alignment for metadata and body + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + """ + def serialize(self, alignment: int = 8, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write message as encapsulated IPC message + + Parameters + ---------- + alignment : int, default 8 + Byte alignment for metadata and body + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + """ + +class MessageReader(_Weakrefable): + """ + Interface for reading Message objects from some source (like an + InputStream) + """ + @classmethod + def open_stream(cls, source: bytes | NativeFile | IOBase | SupportPyBuffer) -> Self: + """ + Open stream from source, if you want to use memory map use + MemoryMappedFile as source. + + Parameters + ---------- + source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object + A readable source, like an InputStream + """ + def __iter__(self) -> Self: ... + def read_next_message(self) -> Message: + """ + Read next Message from the stream. + + Raises + ------ + StopIteration + At end of stream + """ + __next__ = read_next_message + +# ---------------------------------------------------------------------- +# File and stream readers and writers + +class _CRecordBatchWriter(_Weakrefable): + """The base RecordBatchWriter wrapper. + + Provides common implementations of convenience methods. Should not + be instantiated directly by user code. + """ + def write(self, table_or_batch: Table | RecordBatch): + """ + Write RecordBatch or Table to stream. + + Parameters + ---------- + table_or_batch : {RecordBatch, Table} + """ + def write_batch( + self, + batch: RecordBatch, + custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None, + ): + """ + Write RecordBatch to stream. + + Parameters + ---------- + batch : RecordBatch + custom_metadata : mapping or KeyValueMetadata + Keys and values must be string-like / coercible to bytes + """ + def write_table(self, table: Table, max_chunksize: int | None = None) -> None: + """ + Write Table to stream in (contiguous) RecordBatch objects. + + Parameters + ---------- + table : Table + max_chunksize : int, default None + Maximum number of rows for RecordBatch chunks. Individual chunks may + be smaller depending on the chunk layout of individual columns. + """ + def close(self) -> None: + """ + Close stream and write end-of-stream 0 marker. + """ + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def stats(self) -> WriteStats: + """ + Current IPC write statistics. + """ + +class _RecordBatchStreamWriter(_CRecordBatchWriter): + @property + def _use_legacy_format(self) -> bool: ... + @property + def _metadata_version(self) -> MetadataVersion: ... + def _open(self, sink, schema: Schema, options: IpcWriteOptions = IpcWriteOptions()): ... + + +class _ReadPandasMixin: + def read_pandas(self, **options) -> pd.DataFrame: + """ + Read contents of stream to a pandas.DataFrame. + + Read all record batches as a pyarrow.Table then convert it to a + pandas.DataFrame using Table.to_pandas. + + Parameters + ---------- + **options + Arguments to forward to :meth:`Table.to_pandas`. 
+ + Returns + ------- + df : pandas.DataFrame + """ + +class RecordBatchReader(_Weakrefable): + """Base class for reading stream of record batches. + + Record batch readers function as iterators of record batches that also + provide the schema (without the need to get any batches). + + Warnings + -------- + Do not call this class's constructor directly, use one of the + ``RecordBatchReader.from_*`` functions instead. + + Notes + ----- + To import and export using the Arrow C stream interface, use the + ``_import_from_c`` and ``_export_to_c`` methods. However, keep in mind this + interface is intended for expert users. + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([("x", pa.int64())]) + >>> def iter_record_batches(): + ... for i in range(2): + ... yield pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], schema=schema) + >>> reader = pa.RecordBatchReader.from_batches(schema, iter_record_batches()) + >>> print(reader.schema) + x: int64 + >>> for batch in reader: + ... print(batch) + pyarrow.RecordBatch + x: int64 + ---- + x: [1,2,3] + pyarrow.RecordBatch + x: int64 + ---- + x: [1,2,3] + """ + + def __iter__(self) -> Self: ... + def read_next_batch(self) -> RecordBatch: + """ + Read next RecordBatch from the stream. + + Raises + ------ + StopIteration: + At end of stream. + + Returns + ------- + RecordBatch + """ + __next__ = read_next_batch + @property + def schema(self) -> Schema: + """ + Shared schema of the record batches in the stream. + + Returns + ------- + Schema + """ + def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: + """ + Read next RecordBatch from the stream along with its custom metadata. + + Raises + ------ + StopIteration: + At end of stream. + + Returns + ------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + def iter_batches_with_custom_metadata( + self, + ) -> Iterator[RecordBatchWithMetadata]: + """ + Iterate over record batches from the stream along with their custom + metadata. + + Yields + ------ + RecordBatchWithMetadata + """ + def read_all(self) -> Table: + """ + Read all record batches as a pyarrow.Table. + + Returns + ------- + Table + """ + read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] + def close(self) -> None: + """ + Release any resources associated with the reader. + """ + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + def cast(self, target_schema: Schema) -> Self: + """ + Wrap this reader with one that casts each batch lazily as it is pulled. + Currently only a safe cast to target_schema is implemented. + + Parameters + ---------- + target_schema : Schema + Schema to cast to, the names and order of fields must match. + + Returns + ------- + RecordBatchReader + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowArrayStream struct, given its pointer. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArrayStream struct. + + Be careful: if you don't pass the ArrowArrayStream struct to a + consumer, array memory will leak. This is a low-level function + intended for expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: + """ + Import RecordBatchReader from a C ArrowArrayStream struct, + given its pointer. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArrayStream struct. + + This is a low-level function intended for expert users. 
+ """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export to a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: + """ + Import RecordBatchReader from a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + stream: PyCapsule + A capsule containing a C ArrowArrayStream PyCapsule. + + Returns + ------- + RecordBatchReader + """ + @classmethod + def from_stream(cls, data: SupportArrowStream, schema: Schema | None = None) -> Self: + """ + Create RecordBatchReader from a Arrow-compatible stream object. + + This accepts objects implementing the Arrow PyCapsule Protocol for + streams, i.e. objects that have a ``__arrow_c_stream__`` method. + + Parameters + ---------- + data : Arrow-compatible stream object + Any object that implements the Arrow PyCapsule Protocol for + streams. + schema : Schema, default None + The schema to which the stream should be casted, if supported + by the stream object. + + Returns + ------- + RecordBatchReader + """ + @classmethod + def from_batches(cls, schema: Schema, batches: Iterable[RecordBatch]) -> Self: + """ + Create RecordBatchReader from an iterable of batches. + + Parameters + ---------- + schema : Schema + The shared schema of the record batches + batches : Iterable[RecordBatch] + The batches that this reader will return. + + Returns + ------- + reader : RecordBatchReader + """ + +class _RecordBatchStreamReader(RecordBatchReader): + @property + def stats(self) -> ReadStats: + """ + Current IPC read statistics. + """ + +class _RecordBatchFileWriter(_RecordBatchStreamWriter): ... + +class RecordBatchWithMetadata(NamedTuple): + """RecordBatch with its custom metadata + + Parameters + ---------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + + batch: RecordBatch + custom_metadata: KeyValueMetadata + +class _RecordBatchFileReader(_Weakrefable): + @property + def num_record_batches(self) -> int: + """ + The number of record batches in the IPC file. + """ + def get_batch(self, i: int) -> RecordBatch: + """ + Read the record batch with the given index. + + Parameters + ---------- + i : int + The index of the record batch in the IPC file. + + Returns + ------- + batch : RecordBatch + """ + get_record_batch = get_batch + def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: + """ + Read the record batch with the given index along with + its custom metadata + + Parameters + ---------- + i : int + The index of the record batch in the IPC file. + + Returns + ------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + def read_all(self) -> Table: + """ + Read all record batches as a pyarrow.Table + """ + read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def schema(self) -> Schema: ... + @property + def stats(self) -> ReadStats: ... + +def get_tensor_size(tensor: Tensor) -> int: + """ + Return total size of serialized Tensor including metadata and padding. + + Parameters + ---------- + tensor : Tensor + The tensor for which we want to known the size. 
+ """ + +def get_record_batch_size(batch: RecordBatch) -> int: + """ + Return total size of serialized RecordBatch including metadata and padding. + + Parameters + ---------- + batch : RecordBatch + The recordbatch for which we want to know the size. + """ + +def write_tensor(tensor: Tensor, dest: NativeFile) -> int: + """ + Write pyarrow.Tensor to pyarrow.NativeFile object its current position. + + Parameters + ---------- + tensor : pyarrow.Tensor + dest : pyarrow.NativeFile + + Returns + ------- + bytes_written : int + Total number of bytes written to the file + """ + +def read_tensor(source: NativeFile) -> Tensor: + """Read pyarrow.Tensor from pyarrow.NativeFile object from current + position. If the file source supports zero copy (e.g. a memory map), then + this operation does not allocate any memory. This function not assume that + the stream is aligned + + Parameters + ---------- + source : pyarrow.NativeFile + + Returns + ------- + tensor : Tensor + + """ + +def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: + """ + Read length-prefixed message from file or buffer-like object + + Parameters + ---------- + source : pyarrow.NativeFile, file-like object, or buffer-like object + + Returns + ------- + message : Message + """ + +def read_schema(obj: Buffer | Message, dictionary_memo: DictionaryMemo | None = None) -> Schema: + """ + Read Schema from message or buffer + + Parameters + ---------- + obj : buffer or Message + dictionary_memo : DictionaryMemo, optional + Needed to be able to reconstruct dictionary-encoded fields + with read_record_batch + + Returns + ------- + schema : Schema + """ + +def read_record_batch( + obj: Message | SupportPyBuffer, schema: Schema, dictionary_memo: DictionaryMemo | None = None +) -> RecordBatch: + """ + Read RecordBatch from message, given a known schema. If reading data from a + complete IPC stream, use ipc.open_stream instead + + Parameters + ---------- + obj : Message or Buffer-like + schema : Schema + dictionary_memo : DictionaryMemo, optional + If message contains dictionaries, must pass a populated + DictionaryMemo + + Returns + ------- + batch : RecordBatch + """ + +__all__ = [ + "MetadataVersion", + "WriteStats", + "ReadStats", + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "_CRecordBatchWriter", + "_RecordBatchStreamWriter", + "_ReadPandasMixin", + "RecordBatchReader", + "_RecordBatchStreamReader", + "_RecordBatchFileWriter", + "RecordBatchWithMetadata", + "_RecordBatchFileReader", + "get_tensor_size", + "get_record_batch_size", + "write_tensor", + "read_tensor", + "read_message", + "read_schema", + "read_record_batch", +] diff --git a/python/pyarrow-stubs/_types.pyi b/python/pyarrow-stubs/_types.pyi index 32543d4b04b..6596fb3e1d1 100644 --- a/python/pyarrow-stubs/_types.pyi +++ b/python/pyarrow-stubs/_types.pyi @@ -32,14 +32,13 @@ import numpy as np import pandas as pd from pyarrow._stubs_typing import SupportArrowSchema -# TODO from pyarrow.lib import ( Array, - # ChunkedArray, + ChunkedArray, ExtensionArray, MemoryPool, MonthDayNano, - # Table, + Table, ) from typing_extensions import TypeVar, deprecated @@ -1151,9 +1150,7 @@ class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]): @property def value_type(self) -> _BasicValueT: ... 
-# TODO: replace below with: -# _StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) -_StorageT = TypeVar("_StorageT", bound=Array | Any) +_StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) class BaseExtensionType(DataType): """ @@ -1438,6 +1435,7 @@ class OpaqueType(BaseExtensionType): The name of the external system. """ +# TODO # @deprecated( # "This class is deprecated and its deserialization is disabled by default. " # ":class:`ExtensionType` is recommended instead." @@ -2063,9 +2061,7 @@ class Schema(_Weakrefable): >>> schema.metadata {b'n_legs': b'Number of legs per animal'} """ - # TODO: replace below with: - # def empty_table(self) -> Table: - def empty_table(self) -> Any: + def empty_table(self) -> Table: """ Provide an empty table according to the schema. diff --git a/python/pyarrow-stubs/array.pyi b/python/pyarrow-stubs/array.pyi index fcd9ec8f135..3027d689372 100644 --- a/python/pyarrow-stubs/array.pyi +++ b/python/pyarrow-stubs/array.pyi @@ -3310,4 +3310,5 @@ __all__ = [ "FixedShapeTensorArray", "concat_arrays", "_empty_array", + "_CastAs", ] diff --git a/python/pyarrow-stubs/compute.pyi b/python/pyarrow-stubs/compute.pyi new file mode 100644 index 00000000000..5c816773c62 --- /dev/null +++ b/python/pyarrow-stubs/compute.pyi @@ -0,0 +1,6168 @@ +# ruff: noqa: I001 +from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence +from collections.abc import Callable + +# Option classes +from pyarrow._compute import ArraySortOptions as ArraySortOptions +from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions +from pyarrow._compute import CastOptions as CastOptions +from pyarrow._compute import CountOptions as CountOptions +from pyarrow._compute import CumulativeOptions as CumulativeOptions +from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions +from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions +from pyarrow._compute import DictionaryEncodeOptions as DictionaryEncodeOptions +from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions + +# Expressions +from pyarrow._compute import Expression as Expression +from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions +from pyarrow._compute import ExtractRegexSpanOptions as ExtractRegexSpanOptions +from pyarrow._compute import FilterOptions as FilterOptions +from pyarrow._compute import Function as Function +from pyarrow._compute import FunctionOptions as FunctionOptions +from pyarrow._compute import FunctionRegistry as FunctionRegistry +from pyarrow._compute import HashAggregateFunction as HashAggregateFunction +from pyarrow._compute import HashAggregateKernel as HashAggregateKernel +from pyarrow._compute import IndexOptions as IndexOptions +from pyarrow._compute import JoinOptions as JoinOptions +from pyarrow._compute import Kernel as Kernel +from pyarrow._compute import ListFlattenOptions as ListFlattenOptions +from pyarrow._compute import ListSliceOptions as ListSliceOptions +from pyarrow._compute import MakeStructOptions as MakeStructOptions +from pyarrow._compute import MapLookupOptions as MapLookupOptions +from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions +from pyarrow._compute import ModeOptions as ModeOptions +from pyarrow._compute import NullOptions as NullOptions +from pyarrow._compute import PadOptions as PadOptions +from pyarrow._compute import PairwiseOptions as PairwiseOptions +from pyarrow._compute import PartitionNthOptions as PartitionNthOptions 
+from pyarrow._compute import PivotWiderOptions as PivotWiderOptions +from pyarrow._compute import QuantileOptions as QuantileOptions +from pyarrow._compute import RandomOptions as RandomOptions +from pyarrow._compute import RankOptions as RankOptions +from pyarrow._compute import RankQuantileOptions as RankQuantileOptions +from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions +from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions +from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions +from pyarrow._compute import RoundOptions as RoundOptions +from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions +from pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions +from pyarrow._compute import RunEndEncodeOptions as RunEndEncodeOptions +from pyarrow._compute import ScalarAggregateFunction as ScalarAggregateFunction +from pyarrow._compute import ScalarAggregateKernel as ScalarAggregateKernel +from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions +from pyarrow._compute import ScalarFunction as ScalarFunction +from pyarrow._compute import ScalarKernel as ScalarKernel +from pyarrow._compute import SelectKOptions as SelectKOptions +from pyarrow._compute import SetLookupOptions as SetLookupOptions +from pyarrow._compute import SkewOptions as SkewOptions +from pyarrow._compute import SliceOptions as SliceOptions +from pyarrow._compute import SortOptions as SortOptions +from pyarrow._compute import SplitOptions as SplitOptions +from pyarrow._compute import SplitPatternOptions as SplitPatternOptions +from pyarrow._compute import StrftimeOptions as StrftimeOptions +from pyarrow._compute import StrptimeOptions as StrptimeOptions +from pyarrow._compute import StructFieldOptions as StructFieldOptions +from pyarrow._compute import TakeOptions as TakeOptions +from pyarrow._compute import TDigestOptions as TDigestOptions +from pyarrow._compute import TrimOptions as TrimOptions +from pyarrow._compute import UdfContext as UdfContext +from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions +from pyarrow._compute import VarianceOptions as VarianceOptions +from pyarrow._compute import VectorFunction as VectorFunction +from pyarrow._compute import VectorKernel as VectorKernel +from pyarrow._compute import WeekOptions as WeekOptions +from pyarrow._compute import WinsorizeOptions as WinsorizeOptions + +# Functions +from pyarrow._compute import call_function as call_function + +# Udf +from pyarrow._compute import call_tabular_function as call_tabular_function +from pyarrow._compute import function_registry as function_registry +from pyarrow._compute import get_function as get_function +from pyarrow._compute import list_functions as list_functions +from pyarrow._compute import register_aggregate_function as register_aggregate_function +from pyarrow._compute import register_scalar_function as register_scalar_function +from pyarrow._compute import register_tabular_function as register_tabular_function +from pyarrow._compute import register_vector_function as register_vector_function + +from pyarrow._compute import _Order, _Placement +from pyarrow._stubs_typing import ArrayLike, ScalarLike +from . import lib + +_P = ParamSpec("_P") +_R = TypeVar("_R") + +def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: + """Reference a column of the dataset. + + Stores only the field's name. 
Type and other information is known only when
+    the expression is bound to a dataset having an explicit schema.
+
+    Nested references are allowed by passing multiple names or a tuple of
+    names. For example ``('foo', 'bar')`` references the field named "bar"
+    inside the field named "foo".
+
+    Parameters
+    ----------
+    *name_or_index : string, multiple strings, tuple or int
+        The name or index of the (possibly nested) field the expression
+        references to.
+
+    Returns
+    -------
+    field_expr : Expression
+        Reference to the given field
+
+    Examples
+    --------
+    >>> import pyarrow.compute as pc
+    >>> pc.field("a")
+    >>> pc.field(1)
+    >>> pc.field(("a", "b"))
+    >>> pc.field("a", "b")
+    """
+
+def scalar(value: bool | float | str) -> Expression:
+    """Expression representing a scalar value.
+
+    Creates an Expression object representing a scalar value that can be used
+    in compute expressions and predicates.
+
+    Parameters
+    ----------
+    value : bool, int, float or string
+        Python value of the scalar. This function accepts any value that can be
+        converted to a ``pyarrow.Scalar`` using ``pa.scalar()``.
+
+    Notes
+    -----
+    This function differs from ``pyarrow.scalar()`` in the following way:
+
+    * ``pyarrow.scalar()`` creates a ``pyarrow.Scalar`` object that represents
+      a single value in Arrow's memory model.
+    * ``pyarrow.compute.scalar()`` creates an ``Expression`` object representing
+      a scalar value that can be used in compute expressions, predicates, and
+      dataset filtering operations.
+
+    Returns
+    -------
+    scalar_expr : Expression
+        An Expression representing the scalar value
+    """
+
+def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ...
+
+# ============= compute functions =============
+_DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType)
+_Scalar_CoT = TypeVar("_Scalar_CoT", bound=lib.Scalar, covariant=True)
+_ScalarT = TypeVar("_ScalarT", bound=lib.Scalar)
+_ArrayT = TypeVar("_ArrayT", bound=lib.Array | lib.ChunkedArray)
+_ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | lib.Scalar | lib.ChunkedArray)
+ArrayOrChunkedArray: TypeAlias = lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT]
+ScalarOrArray: TypeAlias = ArrayOrChunkedArray[_Scalar_CoT] | _Scalar_CoT
+
+SignedIntegerScalar: TypeAlias = (
+    lib.Scalar[lib.Int8Type]
+    | lib.Scalar[lib.Int16Type]
+    | lib.Scalar[lib.Int32Type]
+    | lib.Scalar[lib.Int64Type]
+)
+UnsignedIntegerScalar: TypeAlias = (
+    lib.Scalar[lib.UInt8Type]
+    | lib.Scalar[lib.UInt16Type]
+    | lib.Scalar[lib.UInt32Type]
+    | lib.Scalar[lib.UInt64Type]
+)
+IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar
+FloatScalar: TypeAlias = (
+    lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type]
+)
+DecimalScalar: TypeAlias = (
+    lib.Scalar[lib.Decimal32Type]
+    | lib.Scalar[lib.Decimal64Type]
+    | lib.Scalar[lib.Decimal128Type]
+    | lib.Scalar[lib.Decimal256Type]
+)
+NonFloatNumericScalar: TypeAlias = IntegerScalar | DecimalScalar
+NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar
+BinaryScalar: TypeAlias = (
+    lib.Scalar[lib.BinaryType]
+    | lib.Scalar[lib.LargeBinaryType]
+    | lib.Scalar[lib.FixedSizeBinaryType]
+)
+StringScalar: TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType]
+StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar
+_ListScalar: TypeAlias = lib.ListViewScalar[_DataTypeT] | lib.FixedSizeListScalar[_DataTypeT, Any]
+_LargeListScalar: TypeAlias = lib.LargeListScalar[_DataTypeT] | lib.LargeListViewScalar[_DataTypeT]
+ListScalar: TypeAlias = (
+    lib.ListScalar[_DataTypeT] |
_ListScalar[_DataTypeT] | _LargeListScalar[_DataTypeT] +) +TemporalScalar: TypeAlias = ( + lib.Date32Scalar + | lib.Date64Scalar + | lib.Time32Scalar[Any] + | lib.Time64Scalar[Any] + | lib.TimestampScalar[Any] + | lib.DurationScalar[Any] + | lib.MonthDayNanoIntervalScalar +) +NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar +NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar + +_NumericOrTemporalScalarT = TypeVar("_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) +NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] +_NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) +_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) +_NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) +NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] +_NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) +NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalScalarT] +_NumericOrTemporalArrayT = TypeVar("_NumericOrTemporalArrayT", bound=NumericOrTemporalArray) +BooleanArray: TypeAlias = ArrayOrChunkedArray[lib.BooleanScalar] +_BooleanArrayT = TypeVar("_BooleanArrayT", bound=BooleanArray) +IntegerArray: TypeAlias = ArrayOrChunkedArray[IntegerScalar] +_FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar) +FloatArray: TypeAlias = ArrayOrChunkedArray[FloatScalar] +_FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray) +_StringScalarT = TypeVar("_StringScalarT", bound=StringScalar) +StringArray: TypeAlias = ArrayOrChunkedArray[StringScalar] +_StringArrayT = TypeVar("_StringArrayT", bound=StringArray) +_BinaryScalarT = TypeVar("_BinaryScalarT", bound=BinaryScalar) +BinaryArray: TypeAlias = ArrayOrChunkedArray[BinaryScalar] +_BinaryArrayT = TypeVar("_BinaryArrayT", bound=BinaryArray) +_StringOrBinaryScalarT = TypeVar("_StringOrBinaryScalarT", bound=StringOrBinaryScalar) +StringOrBinaryArray: TypeAlias = StringArray | BinaryArray +_StringOrBinaryArrayT = TypeVar("_StringOrBinaryArrayT", bound=StringOrBinaryArray) +_TemporalScalarT = TypeVar("_TemporalScalarT", bound=TemporalScalar) +TemporalArray: TypeAlias = ArrayOrChunkedArray[TemporalScalar] +_TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray) +_ListArray: TypeAlias = ArrayOrChunkedArray[_ListScalar[_DataTypeT]] +_LargeListArray: TypeAlias = ArrayOrChunkedArray[_LargeListScalar[_DataTypeT]] +ListArray: TypeAlias = ArrayOrChunkedArray[ListScalar[_DataTypeT]] +# =============================== 1. Aggregation =============================== + +# ========================= 1.1 functions ========================= + +def all( + array: lib.BooleanScalar | BooleanArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: + """ + Test whether all elements in a boolean array evaluate to true. + + Null values are ignored by default. + If the `skip_nulls` option is set to false, then Kleene logic is used. + See "kleene_and" for more details on Kleene logic. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. 
+ options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +any = _clone_signature(all) +""" +Test whether any element in a boolean array evaluates to true. + +Null values are ignored by default. +If the `skip_nulls` option is set to false, then Kleene logic is used. +See "kleene_or" for more details on Kleene logic. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def approximate_median( + array: NumericScalar | NumericArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: + """ + Approximate median of a numeric array with T-Digest algorithm. + + Nulls and NaNs are ignored. + A null scalar is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def count( + array: lib.Array | lib.ChunkedArray, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: + """ + Count the number of null / non-null values. + + By default, only non-null values are counted. + This can be changed through CountOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". + options : pyarrow.compute.CountOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def count_distinct( + array: lib.Array | lib.ChunkedArray, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: + """ + Count the number of unique values. + + By default, only non-null values are counted. + This can be changed through CountOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". + options : pyarrow.compute.CountOptions, optional + Alternative way of passing options. 
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def first(
+    array: lib.Array[_ScalarT] | lib.ChunkedArray[_ScalarT],
+    /,
+    *,
+    skip_nulls: bool = True,
+    min_count: int = 1,
+    options: ScalarAggregateOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarT:
+    """
+    Compute the first value in each group.
+
+    Null values are ignored by default.
+    If skip_nulls = false, then this will return the first and last values
+    regardless if it is null
+
+    Parameters
+    ----------
+    array : Array-like
+        Argument to compute function.
+    skip_nulls : bool, default True
+        Whether to skip (ignore) nulls in the input.
+        If False, any null in the input forces the output to null.
+    min_count : int, default 1
+        Minimum number of non-null values in the input. If the number
+        of non-null values is below `min_count`, the output is null.
+    options : pyarrow.compute.ScalarAggregateOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def first_last(
+    array: lib.Array[Any] | lib.ChunkedArray[Any],
+    /,
+    *,
+    skip_nulls: bool = True,
+    min_count: int = 1,
+    options: ScalarAggregateOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.StructScalar:
+    """
+    Compute the first and last values of an array.
+
+    Null values are ignored by default.
+    If skip_nulls = false, then this will return the first and last values
+    regardless if it is null
+
+    Parameters
+    ----------
+    array : Array-like
+        Argument to compute function.
+    skip_nulls : bool, default True
+        Whether to skip (ignore) nulls in the input.
+        If False, any null in the input forces the output to null.
+    min_count : int, default 1
+        Minimum number of non-null values in the input. If the number
+        of non-null values is below `min_count`, the output is null.
+    options : pyarrow.compute.ScalarAggregateOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def index(
+    data: lib.Array[Any] | lib.ChunkedArray[Any],
+    value,
+    start: int | None = None,
+    end: int | None = None,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar:
+    """
+    Find the index of the first occurrence of a given value.
+
+    Parameters
+    ----------
+    data : Array-like
+    value : Scalar-like object
+        The value to search for.
+    start : int, optional
+    end : int, optional
+    memory_pool : MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+
+    Returns
+    -------
+    index : int
+        the index, or -1 if not found
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array(["Lorem", "ipsum", "dolor", "sit", "Lorem", "ipsum"])
+    >>> pc.index(arr, "ipsum")
+    <pyarrow.Int64Scalar: 1>
+    >>> pc.index(arr, "ipsum", start=2)
+    <pyarrow.Int64Scalar: 5>
+    >>> pc.index(arr, "amet")
+    <pyarrow.Int64Scalar: -1>
+    """
+
+last = _clone_signature(first)
+"""
+Compute the first value in each group.
+
+Null values are ignored by default.
+If skip_nulls = false, then this will return the first and last values +regardless if it is null + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +max = _clone_signature(first) +""" +Compute the minimum or maximum values of a numeric array. + +Null values are ignored by default. +This can be changed through ScalarAggregateOptions. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +min = _clone_signature(first) +""" +Compute the minimum or maximum values of a numeric array. + +Null values are ignored by default. +This can be changed through ScalarAggregateOptions. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +min_max = _clone_signature(first_last) +""" +Compute the minimum and maximum values of a numeric array. + +Null values are ignored by default. +This can be changed through ScalarAggregateOptions. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def mean( + array: FloatScalar | FloatArray + | lib.NumericArray[lib.Scalar[Any]] + | lib.ChunkedArray[lib.Scalar[Any]] + | lib.Scalar[Any], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Scalar[Any]: + """ + Compute the mean of a numeric array. + + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. 
+ This can be changed through ScalarAggregateOptions. + The result is a double for integer and floating point arguments, + and a decimal with the same bit-width/precision/scale for decimal arguments. + For integers and floats, NaN is returned if min_count = 0 and + there are no values. For decimals, null is returned instead. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def mode( + array: NumericScalar | NumericArray, + /, + n: int = 1, + *, + skip_nulls: bool = True, + min_count: int = 0, + options: ModeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: + """ + Compute the modal (most common) values of a numeric array. + + Compute the n most common values and their respective occurrence counts. + The output has type `struct`, where T is the + input type. + The results are ordered by descending `count` first, and ascending `mode` + when breaking ties. + Nulls are ignored. If there are no non-null values in the array, + an empty array is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + n : int, default 1 + Number of distinct most-common values to return. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ModeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2]) + >>> modes = pc.mode(arr, 2) + >>> modes[0] + + >>> modes[1] + + """ + +def product( + array: _ScalarT | lib.NumericArray[_ScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarT: + """ + Compute the product of values in a numeric array. + + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
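+
+    Examples
+    --------
+    A minimal example, assuming default type inference (int64):
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> pc.product(pa.array([1, 2, 3, 4]))
+    <pyarrow.Int64Scalar: 24>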
+ """ + +def quantile( + array: NumericScalar | NumericArray, + /, + q: float = 0.5, + *, + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + options: QuantileOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: + """ + Compute an array of quantiles of a numeric array or chunked array. + + By default, 0.5 quantile (median) is returned. + If quantile lies between two data points, an interpolated value is + returned based on selected interpolation method. + Nulls and NaNs are ignored. + An array of nulls is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to compute. All values must be in + [0, 1]. + interpolation : str, default "linear" + How to break ties between competing data points for a given quantile. + Accepted values are: + + - "linear": compute an interpolation + - "lower": always use the smallest of the two data points + - "higher": always use the largest of the two data points + - "nearest": select the data point that is closest to the quantile + - "midpoint": compute the (unweighted) mean of the two data points + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.QuantileOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def stddev( + array: NumericScalar | NumericArray, + /, + *, + ddof: float = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: + """ + Calculate the standard deviation of a numeric array. + + The number of degrees of freedom can be controlled using VarianceOptions. + By default (`ddof` = 0), the population standard deviation is calculated. + Nulls are ignored. If there are not enough non-null values in the array + to satisfy `ddof`, null is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.VarianceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def sum( + array: _NumericScalarT | NumericArray[_NumericScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: + """ + Compute the sum of a numeric array. + + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. 
+ + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def tdigest( + array: NumericScalar | NumericArray, + /, + q: float = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + options: TDigestOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: + """ + Approximate quantiles of a numeric array with T-Digest algorithm. + + By default, 0.5 quantile (median) is returned. + Nulls and NaNs are ignored. + An array of nulls is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to approximate. All values must be + in [0, 1]. + delta : int, default 100 + Compression parameter for the T-digest algorithm. + buffer_size : int, default 500 + Buffer size for the T-digest algorithm. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.TDigestOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +def variance( + array: NumericScalar | NumericArray, + /, + *, + ddof: int = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: + """ + Calculate the variance of a numeric array. + + The number of degrees of freedom can be controlled using VarianceOptions. + By default (`ddof` = 0), the population variance is calculated. + Nulls are ignored. If there are not enough non-null values in the array + to satisfy `ddof`, null is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.VarianceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def top_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: + """ + Select the indices of the top-k ordered elements from array- or table-like + data. + + This is a specialization for :func:`select_k_unstable`. 
Output is not + guaranteed to be stable. + + Parameters + ---------- + values : Array, ChunkedArray, RecordBatch, or Table + Data to sort and get top indices from. + k : int + The number of `k` elements to keep. + sort_keys : List-like + Column key names to order by when input is table-like data. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array + Indices of the top-k ordered elements + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) + >>> pc.top_k_unstable(arr, k=3) + + [ + 5, + 4, + 2 + ] + """ + +def bottom_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: + """ + Select the indices of the bottom-k ordered elements from + array- or table-like data. + + This is a specialization for :func:`select_k_unstable`. Output is not + guaranteed to be stable. + + Parameters + ---------- + values : Array, ChunkedArray, RecordBatch, or Table + Data to sort and get bottom indices from. + k : int + The number of `k` elements to keep. + sort_keys : List-like + Column key names to order by when input is table-like data. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array of indices + Indices of the bottom-k ordered elements + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) + >>> pc.bottom_k_unstable(arr, k=3) + + [ + 0, + 1, + 2 + ] + """ + +# ========================= 2. Element-wise (“scalar”) functions ========================= + +# ========================= 2.1 Arithmetic ========================= +def abs( + x: _NumericOrDurationT | _NumericOrDurationArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationT | _NumericOrDurationArrayT | Expression: + """ + Calculate the absolute value of the argument element-wise. + + Results will wrap around on integer overflow. + Use function "abs_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +abs_checked = _clone_signature(abs) +""" +Calculate the absolute value of the argument element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "abs". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def add( + x: _NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT | Expression, + y: _NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: + """ + Add the arguments element-wise. + + Results will wrap around on integer overflow. + Use function "add_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. 
+ y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +add_checked = _clone_signature(add) +""" +Add the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "add". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + +""" + +def divide( + x: _NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT | Expression, + y: _NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: + """ + Divide the arguments element-wise. + + Integer division by zero returns an error. However, integer overflow + wraps around, and floating-point division by zero returns an infinite. + Use function "divide_checked" if you want to get an error + in all the aforementioned cases. + + Parameters + ---------- + dividend : Array-like or scalar-like + Argument to compute function. + divisor : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +divide_checked = _clone_signature(divide) +""" +Divide the arguments element-wise. + +An error is returned when trying to divide by zero, or when +integer overflow is encountered. + +Parameters +---------- +dividend : Array-like or scalar-like + Argument to compute function. +divisor : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def exp( + exponent: _FloatArrayT | ArrayOrChunkedArray[NonFloatNumericScalar] | _FloatScalarT | NonFloatNumericScalar | lib.DoubleScalar, + /, *, memory_pool: lib.MemoryPool | None = None +) -> _FloatArrayT | lib.DoubleArray | _FloatScalarT | lib.DoubleScalar | Expression: + """ + Compute Euler's number raised to the power of specified exponent, element-wise. + + If exponent is null the result will be null. + + Parameters + ---------- + exponent : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +multiply = _clone_signature(add) +""" +Multiply the arguments element-wise. + +Results will wrap around on integer overflow. +Use function "multiply_checked" if you want overflow +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +multiply_checked = _clone_signature(add) +""" +Multiply the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "multiply". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def negate( + x: _NumericOrDurationT | _NumericOrDurationArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationT | _NumericOrDurationArrayT | Expression: + """ + Negate the argument element-wise. + + Results will wrap around on integer overflow. + Use function "negate_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +negate_checked = _clone_signature(negate) +""" +Negate the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "negate". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def power( + base: _NumericScalarT | _NumericArrayT | Expression | _NumericArrayT | NumericScalar, + exponent: _NumericScalarT | _NumericArrayT | Expression | _NumericArrayT | NumericScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT | _NumericArrayT | Expression: + """ + Raise arguments to power element-wise. + + Integer to negative integer power returns an error. However, integer overflow + wraps around. If either base or exponent is null the result will be null. + + Parameters + ---------- + base : Array-like or scalar-like + Argument to compute function. + exponent : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +power_checked = _clone_signature(power) +""" +Raise arguments to power element-wise. + +An error is returned when integer to negative integer power is encountered, +or integer overflow is encountered. + +Parameters +---------- +base : Array-like or scalar-like + Argument to compute function. +exponent : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def sign( + x: NumericOrDurationArray | NumericOrDurationScalar | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + lib.NumericArray[lib.Int8Scalar] + | lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] + | lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar | Expression +): + """ + Get the signedness of the arguments element-wise. + + Output is any of (-1,1) for nonzero inputs and 0 for zero input. + NaN values return NaN. Integral values return signedness as Int8 and + floating-point values return it with the same type as the input values. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +def sqrt(x: NumericArray | NumericScalar | Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatArray | FloatScalar | Expression: + """ + Takes the square root of arguments element-wise. + + A negative argument returns a NaN. For a variant that returns an + error, use function "sqrt_checked". 
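+
+ For instance (an illustrative sketch, assuming a working pyarrow
+ installation; the sample value is arbitrary):
+
+ >>> import pyarrow.compute as pc
+ >>> pc.sqrt(4.0).as_py()
+ 2.0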
+ + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +sqrt_checked = _clone_signature(sqrt) +""" +Takes the square root of arguments element-wise. + +A negative argument returns an error. For a variant that returns a +NaN, use function "sqrt". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +subtract = _clone_signature(add) +""" +Subtract the arguments element-wise. + +Results will wrap around on integer overflow. +Use function "subtract_checked" if you want overflow +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +subtract_checked = _clone_signature(add) +""" +Subtract the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "subtract". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.1 Bit-wise functions ========================= +def bit_wise_and( + x: _NumericScalarT | _NumericArrayT | NumericScalar | Expression | ArrayOrChunkedArray[NumericScalar], + y: _NumericScalarT | _NumericArrayT | NumericScalar | Expression | ArrayOrChunkedArray[NumericScalar], + /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT | _NumericArrayT | Expression: + """ + Bit-wise AND the arguments element-wise. + + Null values return null. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def bit_wise_not( + x: _NumericScalarT | _NumericArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT | _NumericArrayT | Expression: + """ + Bit-wise negate the arguments element-wise. + + Null values return null. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +bit_wise_or = _clone_signature(bit_wise_and) +""" +Bit-wise OR the arguments element-wise. + +Null values return null. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +bit_wise_xor = _clone_signature(bit_wise_and) +""" +Bit-wise XOR the arguments element-wise. + +Null values return null. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_left = _clone_signature(bit_wise_and) +""" +Left shift `x` by `y`. + +The shift operates as if on the two's complement representation of the number. +In other words, this is equivalent to multiplying `x` by 2 to the power `y`, +even if overflow occurs. +`x` is returned if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +Use function "shift_left_checked" if you want an invalid shift amount +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_left_checked = _clone_signature(bit_wise_and) +""" +Left shift `x` by `y`. + +The shift operates as if on the two's complement representation of the number. +In other words, this is equivalent to multiplying `x` by 2 to the power `y`, +even if overflow occurs. +An error is raised if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +See "shift_left" for a variant that doesn't fail for an invalid shift amount. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_right = _clone_signature(bit_wise_and) +""" +Right shift `x` by `y`. + +This is equivalent to dividing `x` by 2 to the power `y`. +`x` is returned if `y` (the amount to shift by) is: (1) negative or +(2) greater than or equal to the precision of `x`. +Use function "shift_right_checked" if you want an invalid shift amount +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_right_checked = _clone_signature(bit_wise_and) +""" +Right shift `x` by `y`. + +This is equivalent to dividing `x` by 2 to the power `y`. +An error is raised if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +See "shift_right" for a variant that doesn't fail for an invalid shift amount + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.2 Rounding functions ========================= +def ceil(x: _FloatScalarT | _FloatArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatScalarT | _FloatArrayT | Expression: + """ + Round up to the nearest integer. + + Compute the smallest integer value not less in magnitude than `x`. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +floor = _clone_signature(ceil) +""" +Round down to the nearest integer. + +Compute the largest integer value not greater in magnitude than `x`. 
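+
+For instance (an illustrative sketch, assuming a working pyarrow
+installation; the sample value is arbitrary):
+
+>>> import pyarrow.compute as pc
+>>> pc.floor(2.7).as_py()
+2.0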
+ +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def round( + x: _NumericScalarT | _NumericArrayT | Expression, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT | _NumericArrayT | Expression: + """ + Round to a given precision. + + Options are used to control the number of digits and rounding mode. + Default behavior is to round to the nearest integer and + use half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + ndigits : int, default 0 + Number of fractional digits to round to. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def round_to_multiple( + x: _NumericScalarT | _NumericArrayT | Expression, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT | _NumericArrayT | Expression: + """ + Round to a given multiple. + + Options are used to control the rounding multiple and rounding mode. + Default behavior is to round to the nearest integer and + use half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + multiple : numeric scalar, default 1.0 + Multiple to round to. Should be a scalar of a type compatible + with the argument to be rounded. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundToMultipleOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def round_binary( + x: _NumericScalarT | _NumericArrayT | Expression, + s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar | Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT | lib.NumericArray[_NumericScalarT] | _NumericArrayT | Expression: + """ + Round to the given precision. + + Options are used to control the rounding mode. 
+ Default behavior is to use the half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + s : Array-like or scalar-like + Argument to compute function. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundBinaryOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +trunc = _clone_signature(ceil) +""" +Compute the integral part. + +Compute the nearest integer not greater in magnitude than `x`. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.3 Logarithmic functions ========================= +def ln( + x: FloatScalar | FloatArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] | Expression: + """ + Compute natural logarithm. + + Non-positive values return -inf or NaN. Null values return null. + Use function "ln_checked" if you want non-positive values to raise an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ln_checked = _clone_signature(ln) +""" +Compute natural logarithm. + +Non-positive values raise an error. Null values return null. +Use function "ln" if you want non-positive values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log10 = _clone_signature(ln) +""" +Compute base 10 logarithm. + +Non-positive values return -inf or NaN. Null values return null. +Use function "log10_checked" if you want non-positive values +to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log10_checked = _clone_signature(ln) +""" +Compute base 10 logarithm. + +Non-positive values raise an error. Null values return null. +Use function "log10" if you want non-positive values +to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log1p = _clone_signature(ln) +""" +Compute natural log of (1+x). + +Values <= -1 return -inf or NaN. Null values return null. +This function may be more precise than log(1 + x) for x close to zero. +Use function "log1p_checked" if you want invalid values to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log1p_checked = _clone_signature(ln) +""" +Compute natural log of (1+x). 
+ +Values <= -1 return -inf or NaN. Null values return null. +This function may be more precise than log(1 + x) for x close to zero. +Use function "log1p" if you want invalid values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log2 = _clone_signature(ln) +""" +Compute base 2 logarithm. + +Non-positive values return -inf or NaN. Null values return null. +Use function "log2_checked" if you want non-positive values +to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log2_checked = _clone_signature(ln) +""" +Compute base 2 logarithm. + +Non-positive values raise an error. Null values return null. +Use function "log2" if you want non-positive values +to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def logb( + x: FloatScalar | FloatArray | Expression | Any, b: FloatScalar | FloatArray | Expression | Any, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] | Expression | Any: + """ + Compute base `b` logarithm. + + Values <= 0 return -inf or NaN. Null values return null. + Use function "logb_checked" if you want non-positive values to raise an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + b : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +logb_checked = _clone_signature(logb) +""" +Compute base `b` logarithm. + +Values <= 0 return -inf or NaN. Null values return null. +Use function "logb" if you want non-positive values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +b : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.4 Trigonometric functions ========================= +acos = _clone_signature(ln) +""" +Compute the inverse cosine. + +NaN is returned for invalid input values; +to raise an error instead, see "acos_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +acos_checked = _clone_signature(ln) +""" +Compute the inverse cosine. + +Invalid input values raise an error; +to return NaN instead, see "acos". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +asin = _clone_signature(ln) +""" +Compute the inverse sine. + +NaN is returned for invalid input values; +to raise an error instead, see "asin_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +asin_checked = _clone_signature(ln) +""" +Compute the inverse sine. + +Invalid input values raise an error; +to return NaN instead, see "asin". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +atan = _clone_signature(ln) +""" +Compute the inverse tangent of x. + +The return value is in the range [-pi/2, pi/2]; +for a full return range [-pi, pi], see "atan2". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cos = _clone_signature(ln) +""" +Compute the cosine. + +NaN is returned for invalid input values; +to raise an error instead, see "cos_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cos_checked = _clone_signature(ln) +""" +Compute the cosine. + +Infinite values raise an error; +to return NaN instead, see "cos". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +sin = _clone_signature(ln) +""" +Compute the sine. + +NaN is returned for invalid input values; +to raise an error instead, see "sin_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +sin_checked = _clone_signature(ln) +""" +Compute the sine. + +Invalid input values raise an error; +to return NaN instead, see "sin". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +tan = _clone_signature(ln) +""" +Compute the tangent. + +NaN is returned for invalid input values; +to raise an error instead, see "tan_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +tan_checked = _clone_signature(ln) +""" +Compute the tangent. + +Infinite values raise an error; +to return NaN instead, see "tan". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def atan2( + y: FloatScalar | FloatArray | Expression | Any, x: FloatScalar | FloatArray | Expression | Any, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] | Expression: + """ + Compute the inverse tangent of y/x. + + The return value is in the range [-pi, pi]. + + Parameters + ---------- + y : Array-like or scalar-like + Argument to compute function. + x : Array-like or scalar-like + Argument to compute function. 
+ memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.5 Comparisons functions ========================= +def equal( + x: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + y: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: + """ + Compare values for equality (x == y). + + A null on either side emits a null comparison result. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +greater = _clone_signature(equal) +""" +Compare values for ordered inequality (x > y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +greater_equal = _clone_signature(equal) +""" +Compare values for ordered inequality (x >= y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +less = _clone_signature(equal) +""" +Compare values for ordered inequality (x < y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +less_equal = _clone_signature(equal) +""" +Compare values for ordered inequality (x <= y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +not_equal = _clone_signature(equal) +""" +Compare values for inequality (x != y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def max_element_wise( + *args: ScalarOrArray[_Scalar_CoT] | Expression, + skip_nulls: bool = True, + options: ElementWiseAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _Scalar_CoT | Expression: + """ + Find the element-wise maximum value. + + Nulls are ignored (by default) or propagated. + NaN is preferred over null, but not over any valid value. + + Parameters + ---------- + *args : Array-like or scalar-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. 
+ options : pyarrow.compute.ElementWiseAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +min_element_wise = _clone_signature(max_element_wise) +""" +Find the element-wise minimum value. + +Nulls are ignored (by default) or propagated. +NaN is preferred over null, but not over any valid value. + +Parameters +---------- +*args : Array-like or scalar-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +options : pyarrow.compute.ElementWiseAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.6 Logical functions ========================= +def and_( + x: lib.BooleanScalar | BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar], + y: lib.BooleanScalar | BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar], + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar]: + """ + Logical 'and' boolean values. + + When a null is encountered in either input, a null is output. + For a different null behavior, see function "and_kleene". + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +and_kleene = _clone_signature(and_) +""" +Logical 'and' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true and null = null +- null and true = null +- false and null = false +- null and false = false +- null and null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'and' false is always false. +For a different null behavior, see function "and". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +and_not = _clone_signature(and_) +""" +Logical 'and not' boolean values. + +When a null is encountered in either input, a null is output. +For a different null behavior, see function "and_not_kleene". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +and_not_kleene = _clone_signature(and_) +""" +Logical 'and not' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true and not null = null +- null and not false = null +- false and not null = false +- null and not true = false +- null and not null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'and not' true is always false, as is false +'and not' an unknown value. +For a different null behavior, see function "and_not". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. 
+y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +or_ = _clone_signature(and_) +""" +Logical 'or' boolean values. + +When a null is encountered in either input, a null is output. +For a different null behavior, see function "or_kleene". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +or_kleene = _clone_signature(and_) +""" +Logical 'or' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true or null = true +- null or true = true +- false or null = null +- null or false = null +- null or null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'or' true is always true. +For a different null behavior, see function "or". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +xor = _clone_signature(and_) +""" +Logical 'xor' boolean values. + +When a null is encountered in either input, a null is output. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def invert( + x: lib.BooleanScalar | _BooleanArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | _BooleanArrayT | Expression: + """ + Invert boolean values. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.10 String predicates ========================= +def ascii_is_alnum( + strings: StringScalar | StringArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: + """ + Classify strings as ASCII alphanumeric. + + For each string in `strings`, emit true iff the string is non-empty + and consists only of alphanumeric ASCII characters. Null strings emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_is_alpha = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII alphabetic. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphabetic ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_decimal = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII decimal. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of decimal ASCII characters. Null strings emit null. 
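+
+For instance (an illustrative sketch, assuming a working pyarrow
+installation; the sample values are arbitrary), "123" is classified as
+decimal while "1a" and the empty string are not:
+
+>>> import pyarrow as pa
+>>> import pyarrow.compute as pc
+>>> pc.ascii_is_decimal(pa.array(["123", "1a", ""])).to_pylist()
+[True, False, False]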
+ +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_lower = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII lowercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of lowercase ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_printable = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII printable. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of printable ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_space = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII whitespace. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of whitespace ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_upper = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII uppercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of uppercase ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_alnum = _clone_signature(ascii_is_alnum) +""" +Classify strings as alphanumeric. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphanumeric Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_alpha = _clone_signature(ascii_is_alnum) +""" +Classify strings as alphabetic. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphabetic Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_decimal = _clone_signature(ascii_is_alnum) +""" +Classify strings as decimal. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of decimal Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_digit = _clone_signature(ascii_is_alnum) +""" +Classify strings as digits. 
+ +For each string in `strings`, emit true iff the string is non-empty +and consists only of Unicode digits. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_lower = _clone_signature(ascii_is_alnum) +""" +Classify strings as lowercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of lowercase Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_numeric = _clone_signature(ascii_is_alnum) +""" +Classify strings as numeric. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of numeric Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_printable = _clone_signature(ascii_is_alnum) +""" +Classify strings as printable. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of printable Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_space = _clone_signature(ascii_is_alnum) +""" +Classify strings as whitespace. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of whitespace Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_upper = _clone_signature(ascii_is_alnum) +""" +Classify strings as uppercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of uppercase Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_title = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII titlecase. + +For each string in `strings`, emit true iff the string is title-cased, +i.e. it has at least one cased character, each uppercase character +follows an uncased character, and each lowercase character follows +an uppercase character. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_title = _clone_signature(ascii_is_alnum) +""" +Classify strings as titlecase. + +For each string in `strings`, emit true iff the string is title-cased, +i.e. it has at least one cased character, each uppercase character +follows an uncased character, and each lowercase character follows +an uppercase character. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +string_is_ascii = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII. + +For each string in `strings`, emit true iff the string consists only +of ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.11 String transforms ========================= +def ascii_capitalize( + strings: _StringScalarT | _StringArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT | _StringArrayT | Expression: + """ + Capitalize the first character of ASCII input. + + For each string in `strings`, return a capitalized version. + + This function assumes the input is fully ASCII. If it may contain + non-ASCII characters, use "utf8_capitalize" instead. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_lower = _clone_signature(ascii_capitalize) +""" +Transform ASCII input to lowercase. + +For each string in `strings`, return a lowercase version. + +This function assumes the input is fully ASCII. If it may contain +non-ASCII characters, use "utf8_lower" instead. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_reverse = _clone_signature(ascii_capitalize) +""" +Reverse ASCII input. + +For each ASCII string in `strings`, return a reversed version. + +This function assumes the input is fully ASCII. If it may contain +non-ASCII characters, use "utf8_reverse" instead. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_swapcase = _clone_signature(ascii_capitalize) +""" +Transform ASCII input by inverting casing. + +For each string in `strings`, return a string with opposite casing. + +This function assumes the input is fully ASCII. If it may contain +non-ASCII characters, use "utf8_swapcase" instead. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_title = _clone_signature(ascii_capitalize) +""" +Titlecase each word of ASCII input. + +For each string in `strings`, return a titlecased version. +Each word in the output will start with an uppercase character and its +remaining characters will be lowercase. + +This function assumes the input is fully ASCII. If it may contain +non-ASCII characters, use "utf8_title" instead. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_upper = _clone_signature(ascii_capitalize) +""" +Transform ASCII input to uppercase. + +For each string in `strings`, return an uppercase version. + +This function assumes the input is fully ASCII. 
If it may contain
+non-ASCII characters, use "utf8_upper" instead.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+ Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
+"""
+
+def binary_length(
+ strings: lib.BinaryScalar | lib.StringScalar | lib.LargeBinaryScalar | lib.LargeStringScalar
+ | lib.BinaryArray | lib.StringArray
+ | lib.ChunkedArray[lib.BinaryScalar] | lib.ChunkedArray[lib.StringScalar]
+ | lib.LargeBinaryArray | lib.LargeStringArray
+ | lib.ChunkedArray[lib.LargeBinaryScalar] | lib.ChunkedArray[lib.LargeStringScalar]
+ | Expression,
+ /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array | Expression:
+ """
+ Compute string lengths.
+
+ For each string in `strings`, emit its length of bytes.
+ Null values emit null.
+
+ Parameters
+ ----------
+ strings : Array-like or scalar-like
+ Argument to compute function.
+ memory_pool : pyarrow.MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
+ """
+
+def binary_repeat(
+ strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression,
+ num_repeats: int | list[int] | list[int | None],
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _StringOrBinaryScalarT | lib.Array[_StringOrBinaryScalarT] | _StringOrBinaryArrayT | Expression:
+ """
+ Repeat a binary string.
+
+ For each binary string in `strings`, return a replicated version.
+
+ Parameters
+ ----------
+ strings : Array-like or scalar-like
+ Argument to compute function.
+ num_repeats : Array-like or scalar-like
+ Argument to compute function.
+ memory_pool : pyarrow.MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
+ """
+
+def binary_replace_slice(
+ strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression,
+ /,
+ start: int,
+ stop: int,
+ replacement: str | bytes,
+ *,
+ options: ReplaceSliceOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression:
+ """
+ Replace a slice of a binary string.
+
+ For each string in `strings`, replace a slice of the string defined by `start`
+ and `stop` indices with the given `replacement`. `start` is inclusive
+ and `stop` is exclusive, and both are measured in bytes.
+ Null values emit null.
+
+ Parameters
+ ----------
+ strings : Array-like or scalar-like
+ Argument to compute function.
+ start : int
+ Index to start slicing at (inclusive).
+ stop : int
+ Index to stop slicing at (exclusive).
+ replacement : str
+ What to replace the slice with.
+ options : pyarrow.compute.ReplaceSliceOptions, optional
+ Alternative way of passing options.
+ memory_pool : pyarrow.MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
+ """
+
+def binary_reverse(
+ strings: _BinaryScalarT | _BinaryArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None
+) -> _BinaryScalarT | _BinaryArrayT | Expression:
+ """
+ Reverse binary input.
+
+ For each binary string in `strings`, return a reversed version.
+
+ This function reverses the binary data at a byte-level.
+
+ Parameters
+ ----------
+ strings : Array-like or scalar-like
+ Argument to compute function.
+ memory_pool : pyarrow.MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
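+
+ Examples
+ --------
+ An illustrative sketch, assuming a working pyarrow installation
+ (the sample values are arbitrary):
+
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> pc.binary_reverse(pa.array([b"abc", b"de"]))[0].as_py()
+ b'cba'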
+ """ + +def replace_substring( + strings: _StringScalarT | _StringArrayT | Expression, + /, + pattern: str | bytes, + replacement: str | bytes, + *, + max_replacements: int | None = None, + options: ReplaceSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: + """ + Replace matching non-overlapping substrings with replacement. + + For each string in `strings`, replace non-overlapping substrings that match + the given literal `pattern` with the given `replacement`. + If `max_replacements` is given and not equal to -1, it limits the + maximum amount replacements per input, counted from the left. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Substring pattern to look for inside input values. + replacement : str + What to replace the pattern with. + max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). + options : pyarrow.compute.ReplaceSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +replace_substring_regex = _clone_signature(replace_substring) +""" +Replace matching non-overlapping substrings with replacement. + +For each string in `strings`, replace non-overlapping substrings that match +the given regular expression `pattern` with the given `replacement`. +If `max_replacements` is given and not equal to -1, it limits the +maximum amount replacements per input, counted from the left. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +replacement : str + What to replace the pattern with. +max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). +options : pyarrow.compute.ReplaceSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def utf8_capitalize( + strings: _StringScalarT | _StringArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT | _StringArrayT | Expression: + """ + Capitalize the first character of input. + + For each string in `strings`, return a capitalized version, + with the first character uppercased and the others lowercased. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def utf8_length( + strings: lib.StringScalar | lib.LargeStringScalar | lib.StringArray | lib.ChunkedArray[lib.StringScalar] + | lib.LargeStringArray | lib.ChunkedArray[lib.LargeStringScalar] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array | Expression: + """ + Compute UTF8 string lengths. + + For each string in `strings`, emit its length in UTF8 characters. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+ """ + +utf8_lower = _clone_signature(utf8_capitalize) +""" +Transform input to lowercase. + +For each string in `strings`, return a lowercase version. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def utf8_replace_slice( + strings: _StringScalarT | _StringArrayT | Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: + """ + Replace a slice of a string. + + For each string in `strings`, replace a slice of the string defined by `start` + and `stop` indices with the given `replacement`. `start` is inclusive + and `stop` is exclusive, and both are measured in UTF8 characters. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int + Index to stop slicing at (exclusive). + replacement : str + What to replace the slice with. + options : pyarrow.compute.ReplaceSliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +utf8_reverse = _clone_signature(utf8_capitalize) +""" +Reverse input. + +For each string in `strings`, return a reversed version. + +This function operates on Unicode codepoints, not grapheme +clusters. Hence, it will not correctly reverse grapheme clusters +composed of multiple codepoints. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_swapcase = _clone_signature(utf8_capitalize) +""" +Transform input lowercase characters to uppercase and uppercase characters to lowercase. + +For each string in `strings`, return an opposite case version. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_title = _clone_signature(utf8_capitalize) +""" +Titlecase each word of input. + +For each string in `strings`, return a titlecased version. +Each word in the output will start with an uppercase character and its +remaining characters will be lowercase. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_upper = _clone_signature(utf8_capitalize) +""" +Transform input to uppercase. + +For each string in `strings`, return an uppercase version. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
+"""
+
+# ========================= 2.12 String padding =========================
+def ascii_center(
+ strings: _StringScalarT | _StringArrayT | Expression,
+ /,
+ width: int,
+ padding: str = " ",
+ lean_left_on_odd_padding: bool = True,
+ *,
+ options: PadOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _StringScalarT | _StringArrayT | Expression:
+ """
+ Center strings by padding with a given character.
+
+ For each string in `strings`, emit a centered string by padding both sides
+ with the given ASCII character.
+ Null values emit null.
+
+ Parameters
+ ----------
+ strings : Array-like or scalar-like
+ Argument to compute function.
+ width : int
+ Desired string length.
+ padding : str, default " "
+ What to pad the string with. Should be one byte or codepoint.
+ lean_left_on_odd_padding : bool, default True
+ What to do if there is an odd number of padding characters (in case
+ of centered padding). Defaults to aligning on the left (i.e. adding
+ the extra padding character on the right).
+ options : pyarrow.compute.PadOptions, optional
+ Alternative way of passing options.
+ memory_pool : pyarrow.MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
+ """
+
+ascii_lpad = _clone_signature(ascii_center)
+"""
+Right-align strings by padding with a given character.
+
+For each string in `strings`, emit a right-aligned string by prepending
+the given ASCII character.
+Null values emit null.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+ Argument to compute function.
+width : int
+ Desired string length.
+padding : str, default " "
+ What to pad the string with. Should be one byte or codepoint.
+lean_left_on_odd_padding : bool, default True
+ What to do if there is an odd number of padding characters (in case
+ of centered padding). Defaults to aligning on the left (i.e. adding
+ the extra padding character on the right).
+options : pyarrow.compute.PadOptions, optional
+ Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
+"""
+ascii_rpad = _clone_signature(ascii_center)
+"""
+Left-align strings by padding with a given character.
+
+For each string in `strings`, emit a left-aligned string by appending
+the given ASCII character.
+Null values emit null.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+ Argument to compute function.
+width : int
+ Desired string length.
+padding : str, default " "
+ What to pad the string with. Should be one byte or codepoint.
+lean_left_on_odd_padding : bool, default True
+ What to do if there is an odd number of padding characters (in case
+ of centered padding). Defaults to aligning on the left (i.e. adding
+ the extra padding character on the right).
+options : pyarrow.compute.PadOptions, optional
+ Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
+"""
+utf8_center = _clone_signature(ascii_center)
+"""
+Center strings by padding with a given character.
+
+For each string in `strings`, emit a centered string by padding both sides
+with the given UTF8 codeunit.
+Null values emit null.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+ Argument to compute function.
+width : int
+ Desired string length.
+padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_lpad = _clone_signature(ascii_center) +""" +Right-align strings by padding with a given character. + +For each string in `strings`, emit a right-aligned string by prepending +the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_rpad = _clone_signature(ascii_center) +""" +Left-align strings by padding with a given character. + +For each string in `strings`, emit a left-aligned string by appending +the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.13 String trimming ========================= +def ascii_ltrim( + strings: _StringScalarT | _StringArrayT | Expression, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: + """ + Trim leading characters. + + For each string in `strings`, remove any leading characters + from the `characters` option (as given in TrimOptions). + Null values emit null. + Both the `strings` and the `characters` are interpreted as + ASCII; to trim non-ASCII characters, use `utf8_ltrim`. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + characters : str + Individual characters to be trimmed from the string. + options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_rtrim = _clone_signature(ascii_ltrim) +""" +Trim trailing characters. + +For each string in `strings`, remove any trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. 
+Both the `strings` and the `characters` are interpreted as +ASCII; to trim non-ASCII characters, use `utf8_rtrim`. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_trim = _clone_signature(ascii_ltrim) +""" +Trim leading and trailing characters. + +For each string in `strings`, remove any leading or trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. +Both the `strings` and the `characters` are interpreted as +ASCII; to trim non-ASCII characters, use `utf8_trim`. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_ltrim = _clone_signature(ascii_ltrim) +""" +Trim leading characters. + +For each string in `strings`, remove any leading characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_rtrim = _clone_signature(ascii_ltrim) +""" +Trim trailing characters. + +For each string in `strings`, remove any trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_trim = _clone_signature(ascii_ltrim) +""" +Trim leading and trailing characters. + +For each string in `strings`, remove any leading or trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def ascii_ltrim_whitespace( + strings: _StringScalarT | _StringArrayT | Expression, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: + """ + Trim leading ASCII whitespace characters. + + For each string in `strings`, emit a string with leading ASCII whitespace + characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode + whitespace characters. Null values emit null. 
+ + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim trailing ASCII whitespace characters. + +For each string in `strings`, emit a string with trailing ASCII whitespace +characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode +whitespace characters. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading and trailing ASCII whitespace characters. + +For each string in `strings`, emit a string with leading and trailing ASCII +whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode +whitespace characters. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_ltrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading whitespace characters. + +For each string in `strings`, emit a string with leading whitespace +characters removed, where whitespace characters are defined by the Unicode +standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim trailing whitespace characters. + +For each string in `strings`, emit a string with trailing whitespace +characters removed, where whitespace characters are defined by the Unicode +standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading and trailing whitespace characters. + +For each string in `strings`, emit a string with leading and trailing +whitespace characters removed, where whitespace characters are defined +by the Unicode standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.14 String splitting ========================= +def ascii_split_whitespace( + strings: _StringScalarT | lib.Array[lib.Scalar[_DataTypeT]] | Expression, + /, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[_StringScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]] | Expression: + """ + Split string according to any ASCII whitespace. + + Split each string according any non-zero length sequence of ASCII + whitespace characters. The output for each string input is a list + of strings. 
+ + The maximum number of splits and direction of splitting + (forward, reverse) can optionally be defined in SplitOptions. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + options : pyarrow.compute.SplitOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def split_pattern( + strings: _StringOrBinaryScalarT | lib.Array[lib.Scalar[_DataTypeT]] | Expression, + /, + pattern: str, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[_StringOrBinaryScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]] | Expression: + """ + Split string according to separator. + + Split each string according to the exact `pattern` defined in + SplitPatternOptions. The output for each string input is a list + of strings. + + The maximum number of splits and direction of splitting + (forward, reverse) can optionally be defined in SplitPatternOptions. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + String pattern to split on. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + options : pyarrow.compute.SplitPatternOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +split_pattern_regex = _clone_signature(split_pattern) +""" +Split string according to regex pattern. + +Split each string according to the regex `pattern` defined in +SplitPatternOptions. The output for each string input is a list +of strings. + +The maximum number of splits and direction of splitting +(forward, reverse) can optionally be defined in SplitPatternOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + String pattern to split on. +max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). +reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. +options : pyarrow.compute.SplitPatternOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_split_whitespace = _clone_signature(ascii_split_whitespace) +""" +Split string according to any Unicode whitespace. + +Split each string according any non-zero length sequence of Unicode +whitespace characters. The output for each string input is a list +of strings. + +The maximum number of splits and direction of splitting +(forward, reverse) can optionally be defined in SplitOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. 
+max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). +reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. +options : pyarrow.compute.SplitOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.15 String component extraction ========================= +def extract_regex( + strings: StringOrBinaryScalar | StringOrBinaryArray | Expression, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar | lib.StructArray | Expression: + """ + Extract substrings captured by a regex pattern. + + For each string in `strings`, match the regular expression and, if + successful, emit a struct with field names and values coming from the + regular expression's named capture groups. If the input is null or the + regular expression fails matching, a null output value is emitted. + + Regular expression matching is done using the Google RE2 library. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Regular expression with named capture fields. + options : pyarrow.compute.ExtractRegexOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.16 String join ========================= +def binary_join( + strings, separator, /, *, memory_pool: lib.MemoryPool | None = None +) -> StringScalar | StringArray: + """ + Join a list of strings together with a separator. + + Concatenate the strings in `list`. The `separator` is inserted + between each given string. + Any null input and any null `list` element emits a null output. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + separator : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def binary_join_element_wise( + *strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression: + """ + Join string arguments together, with the last argument as separator. + + Concatenate the `strings` except for the last one. The last argument + in `strings` is inserted between each given string. + Any null separator element emits a null output. Null elements either + emit a null (the default), are skipped, or replaced with a given string. + + Parameters + ---------- + *strings : Array-like or scalar-like + Argument to compute function. + null_handling : str, default "emit_null" + How to handle null values in the inputs. + Accepted values are "emit_null", "skip", "replace". + null_replacement : str, default "" + Replacement string to emit for null inputs if `null_handling` + is "replace". + options : pyarrow.compute.JoinOptions, optional + Alternative way of passing options. 
+ memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.17 String Slicing ========================= +def binary_slice( + strings: _BinaryScalarT | _BinaryArrayT | Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _BinaryScalarT | _BinaryArrayT | Expression: + """ + Slice binary string. + + For each binary string in `strings`, emit the substring defined by + (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is + inclusive and `stop` is exclusive. All three values are measured in + bytes. + If `step` is negative, the string will be advanced in reversed order. + An error is raised if `step` is zero. + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + options : pyarrow.compute.SliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def utf8_slice_codeunits( + strings: _StringScalarT | _StringArrayT | Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: + """ + Slice string. + + For each string in `strings`, emit the substring defined by + (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is + inclusive and `stop` is exclusive. All three values are measured in + UTF8 codeunits. + If `step` is negative, the string will be advanced in reversed order. + An error is raised if `step` is zero. + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + options : pyarrow.compute.SliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.18 Containment tests ========================= +def count_substring( + strings: lib.StringScalar | lib.BinaryScalar | lib.LargeStringScalar | lib.LargeBinaryScalar + | lib.StringArray | lib.BinaryArray + | lib.ChunkedArray[lib.StringScalar] | lib.ChunkedArray[lib.BinaryScalar] + | lib.LargeStringArray | lib.LargeBinaryArray + | lib.ChunkedArray[lib.LargeStringScalar] | lib.ChunkedArray[lib.LargeBinaryScalar] + | Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array | Expression: + """ + Count occurrences of substring. + + For each string in `strings`, emit the number of occurrences of the given + literal pattern. + Null inputs emit null. The pattern must be given in MatchSubstringOptions. 
+ + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +count_substring_regex = _clone_signature(count_substring) +""" +Count occurrences of substring. + +For each string in `strings`, emit the number of occurrences of the given +regular expression pattern. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def ends_with( + strings: StringScalar | BinaryScalar | StringArray | BinaryArray | Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: + """ + Check if strings end with a literal pattern. + + For each string in `strings`, emit true iff it ends with a given pattern. + The pattern must be given in MatchSubstringOptions. + If ignore_case is set, only simple case folding is performed. + + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +find_substring = _clone_signature(count_substring) +""" +Find first occurrence of substring. + +For each string in `strings`, emit the index in bytes of the first occurrence +of the given literal pattern, or -1 if not found. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +find_substring_regex = _clone_signature(count_substring) +""" +Find location of first match of regex pattern. + +For each string in `strings`, emit the index in bytes of the first occurrence +of the given literal pattern, or -1 if not found. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. 
+ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def index_in( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, + value_set: lib.Array | lib.ChunkedArray | Expression, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar | lib.Int32Array | Expression: + """ + Return index of each element in a set of values. + + For each element in `values`, return its index in a given set of + values, or null if it is not found there. + The set of values to look for must be given in SetLookupOptions. + By default, nulls are matched against the value set, this can be + changed in SetLookupOptions. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. + options : pyarrow.compute.SetLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def is_in( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, + value_set: lib.Array | lib.ChunkedArray | Expression, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray: + """ + Find each element in a set of values. + + For each element in `values`, return true if it is found in a given + set of values, false otherwise. + The set of values to look for must be given in SetLookupOptions. + By default, nulls are matched against the value set, this can be + changed in SetLookupOptions. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. + options : pyarrow.compute.SetLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +match_like = _clone_signature(ends_with) +""" +Match strings against SQL-style LIKE pattern. + +For each string in `strings`, emit true iff it matches a given pattern +at any position. '%' will match any number of characters, '_' will +match exactly one character, and any other character matches itself. +To match a literal '%', '_', or '\', precede the character with a backslash. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +match_substring = _clone_signature(ends_with) +""" +Match strings against literal pattern. + +For each string in `strings`, emit true iff it contains a given pattern. +Null inputs emit null. +The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +match_substring_regex = _clone_signature(ends_with) +""" +Match strings against regex pattern. + +For each string in `strings`, emit true iff it matches a given pattern +at any position. The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Null inputs emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +starts_with = _clone_signature(ends_with) +""" +Check if strings start with a literal pattern. + +For each string in `strings`, emit true iff it starts with a given pattern. +The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Null inputs emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.19 Categorizations ========================= +def is_finite( + values: NumericScalar | lib.NullScalar | NumericArray | lib.NullArray | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: + """ + Return true if value is finite. + + For each input value, emit true iff the value is finite + (i.e. neither NaN, inf, nor -inf). + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +is_inf = _clone_signature(is_finite) +""" +Return true if infinity. + +For each input value, emit true iff the value is infinite (inf or -inf). + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +is_nan = _clone_signature(is_finite) +""" +Return true if NaN. 
+ +For each input value, emit true iff the value is NaN. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def is_null( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: + """ + Return true if null (and optionally NaN). + + For each input value, emit true iff the value is null. + True may also be emitted for NaN values by setting the `nan_is_null` flag. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + nan_is_null : bool, default False + Whether floating-point NaN values are considered null. + options : pyarrow.compute.NullOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def is_valid( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: + """ + Return true if non-null. + + For each input value, emit true iff the value is valid (i.e. non-null). + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +true_unless_null = _clone_signature(is_valid) +""" +Return true if non-null, else return null. + +For each input value, emit true iff the value +is valid (non-null), otherwise emit null. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.20 Selecting / multiplexing ========================= +def case_when(cond, /, *cases, memory_pool: lib.MemoryPool | None = None): + """ + Choose values based on multiple conditions. + + `cond` must be a struct of Boolean values. `cases` can be a mix + of scalar and array arguments (of any type, but all must be the + same type or castable to a common type), with either exactly one + datum per child of `cond`, or one more `cases` than children of + `cond` (in which case we have an "else" value). + + Each row of the output will be the corresponding value of the + first datum in `cases` for which the corresponding child of `cond` + is true, or otherwise the "else" value (if given), or null. + + Essentially, this implements a switch-case or if-else, if-else... statement. + + Parameters + ---------- + cond : Array-like or scalar-like + Argument to compute function. + *cases : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): + """ + Choose values from several arrays. + + For each row, the value of the first argument is used as a 0-based index + into the list of `values` arrays (i.e. index 0 selects the first of the + `values` arrays). The output value is the corresponding value of the + selected argument. + + If an index is null, the output will be null. 
+ + Parameters + ---------- + indices : Array-like or scalar-like + Argument to compute function. + *values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def coalesce( + *values: _ScalarOrArrayT, memory_pool: lib.MemoryPool | None = None +) -> _ScalarOrArrayT: + """ + Select the first non-null value. + + Each row of the output will be the value from the first corresponding input + for which the value is not null. If all inputs are null in a row, the output + will be null. + + Parameters + ---------- + *values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +fill_null = coalesce +"""Replace each null element in values with a corresponding +element from fill_value. + +If fill_value is scalar-like, then every null element in values +will be replaced with fill_value. If fill_value is array-like, +then the i-th element in values will be replaced with the i-th +element in fill_value. + +The fill_value's type must be the same as that of values, or it +must be able to be implicitly casted to the array's type. + +This is an alias for :func:`coalesce`. + +Parameters +---------- +values : Array, ChunkedArray, or Scalar-like object + Each null element is replaced with the corresponding value + from fill_value. +fill_value : Array, ChunkedArray, or Scalar-like object + If not same type as values, will attempt to cast. + +Returns +------- +result : depends on inputs + Values with all null elements replaced + +Examples +-------- +>>> import pyarrow as pa +>>> arr = pa.array([1, 2, None, 3], type=pa.int8()) +>>> fill_value = pa.scalar(5, type=pa.int8()) +>>> arr.fill_null(fill_value) + +[ + 1, + 2, + 5, + 3 +] +>>> arr = pa.array([1, 2, None, 4, None]) +>>> arr.fill_null(pa.array([10, 20, 30, 40, 50])) + +[ + 1, + 2, + 30, + 4, + 50 +] +""" + +def if_else( + cond: ArrayLike | ScalarLike, + left: ArrayLike | ScalarLike, + right: ArrayLike | ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ArrayLike | ScalarLike: + """ + Choose values based on a condition. + + `cond` must be a Boolean scalar/ array. + `left` or `right` must be of the same type scalar/ array. + `null` values in `cond` will be promoted to the output. + + Parameters + ---------- + cond : Array-like or scalar-like + Argument to compute function. + left : Array-like or scalar-like + Argument to compute function. + right : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.21 Structural transforms ========================= + +def list_value_length( + lists: _ListArray[Any] | _LargeListArray[Any] | ListArray[Any] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array | lib.Int64Array | Expression: + """ + Compute list lengths. + + `lists` must have a list-like type. + For each non-null value in `lists`, its length is emitted. + Null values emit a null in the output. + + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
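+
+ Examples
+ --------
+ A minimal usage sketch (illustrative only; the exact repr of the result
+ may differ across pyarrow versions):
+
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> pc.list_value_length(pa.array([[1, 2, 3], None, []]))  # doctest: +SKIP
+
+ [
+   3,
+   null,
+   0
+ ]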
+ """ + +def make_struct( + *args: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar | lib.StructArray | Expression: + """ + Wrap Arrays into a StructArray. + + Names of the StructArray's fields are + specified through MakeStructOptions. + + Parameters + ---------- + *args : Array-like or scalar-like + Argument to compute function. + field_names : sequence of str + Names of the struct fields to create. + field_nullability : sequence of bool, optional + Nullability information for each struct field. + If omitted, all fields are nullable. + field_metadata : sequence of KeyValueMetadata, optional + Metadata for each struct field. + options : pyarrow.compute.MakeStructOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.22 Conversions ========================= +def ceil_temporal( + timestamps: _TemporalScalarT | _TemporalArrayT | Expression, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _TemporalScalarT | _TemporalArrayT | Expression: + """ + Round temporal values up to nearest multiple of specified time unit. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + multiple : int, default 1 + Number of units to round to. + unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. + calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. 
+ Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. + options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +floor_temporal = _clone_signature(ceil_temporal) +""" +Round temporal values down to nearest multiple of specified time unit. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +timestamps : Array-like or scalar-like + Argument to compute function. +multiple : int, default 1 + Number of units to round to. +unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". +week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. +ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. +calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. +options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +round_temporal = _clone_signature(ceil_temporal) +""" +Round temporal values to the nearest multiple of specified time unit. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +timestamps : Array-like or scalar-like + Argument to compute function. +multiple : int, default 1 + Number of units to round to. +unit : str, default "day" + The unit in which `multiple` is expressed. 
+ Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". +week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. +ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. +calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. +options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def cast( + arr: lib.Scalar | lib.Array | lib.ChunkedArray, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Scalar[_DataTypeT] | lib.Array[lib.Scalar[_DataTypeT]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]]: + """ + Cast array values to another data type. Can also be invoked as an array + instance method. + + Parameters + ---------- + arr : Array-like + target_type : DataType or str + Type to cast to + safe : bool, default True + Check for overflows or other unsafe conversions + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. 
+ + Examples + -------- + >>> from datetime import datetime + >>> import pyarrow as pa + >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)]) + >>> arr.type + TimestampType(timestamp[us]) + + You can use ``pyarrow.DataType`` objects to specify the target type: + + >>> cast(arr, pa.timestamp("ms")) + + [ + 2010-01-01 00:00:00.000, + 2015-01-01 00:00:00.000 + ] + + >>> cast(arr, pa.timestamp("ms")).type + TimestampType(timestamp[ms]) + + Alternatively, it is also supported to use the string aliases for these + types: + + >>> arr.cast("timestamp[ms]") + + [ + 2010-01-01 00:00:00.000, + 2015-01-01 00:00:00.000 + ] + >>> arr.cast("timestamp[ms]").type + TimestampType(timestamp[ms]) + + Returns + ------- + casted : Array + The cast result as a new Array + """ + +def strftime( + timestamps: TemporalScalar | TemporalArray | Expression, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StringScalar | lib.StringArray | Expression: + """ + Format temporal values according to a format string. + + For each input value, emit a formatted string. + The time format string and locale can be set using StrftimeOptions. + The output precision of the "%S" (seconds) format code depends on + the input time precision: it is an integer for timestamps with + second precision, a real number with the required number of fractional + digits for higher precisions. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database, or if the specified locale + does not exist on this system. + + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + format : str, default "%Y-%m-%dT%H:%M:%S" + Pattern for formatting input values. + locale : str, default "C" + Locale to use for locale-specific format specifiers. + options : pyarrow.compute.StrftimeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def strptime( + strings: StringScalar | StringArray | Expression, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar | lib.TimestampArray | Expression: + """ + Parse timestamps. + + For each string in `strings`, parse it as a timestamp. + The timestamp unit and the expected string pattern must be given + in StrptimeOptions. Null inputs emit null. If a non-null string + fails parsing, an error is returned by default. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + format : str + Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". + Note that the semantics of the format follow the C/C++ strptime, not the Python one. + There are differences in behavior, for example how the "%y" placeholder + handles years with less than four digits. + unit : str + Timestamp unit of the output. + Accepted values are "s", "ms", "us", "ns". + error_is_null : boolean, default False + Return null on parsing errors if true or raise if false. + options : pyarrow.compute.StrptimeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
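+
+ Examples
+ --------
+ A minimal usage sketch (illustrative only; repr formatting may vary by
+ pyarrow version):
+
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> pc.strptime(pa.array(["2023-01-02"]), format="%Y-%m-%d", unit="s")  # doctest: +SKIP
+
+ [
+   2023-01-02 00:00:00
+ ]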
+ """ + +# ========================= 2.23 Temporal component extraction ========================= +def day( + values: TemporalScalar | TemporalArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar | lib.Int64Array | Expression: + """ + Extract day number. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def day_of_week( + values: TemporalScalar | TemporalArray | Expression, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array | Expression: + """ + Extract day of the week number. + + By default, the week starts on Monday represented by 0 and ends on Sunday + represented by 6. + `DayOfWeekOptions.week_start` can be used to set another starting day using + the ISO numbering convention (1=start week on Monday, 7=start week on Sunday). + Day numbers can start at 0 or 1 based on `DayOfWeekOptions.count_from_zero`. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + options : pyarrow.compute.DayOfWeekOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +day_of_year = _clone_signature(day) +""" +Extract day of year number. + +January 1st maps to day number 1, February 1st to 32, etc. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def hour( + values: lib.TimestampScalar[Any] | lib.Time32Scalar[Any] | lib.Time64Scalar[Any] + | lib.TimestampArray[Any] | lib.Time32Array[Any] | lib.Time64Array[Any] + | lib.ChunkedArray[lib.TimestampScalar[Any]] + | lib.ChunkedArray[lib.Time32Scalar[Any]] + | lib.ChunkedArray[lib.Time64Scalar[Any]] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array | Expression: + """ + Extract hour value. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+ """ + +def is_dst( + values: lib.TimestampScalar | lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: + """ + Extracts if currently observing daylight savings. + + IsDaylightSavings returns true if a timestamp has a daylight saving + offset in the given timezone. + Null values emit null. + An error is returned if the values do not have a defined timezone. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def iso_week( + values: lib.TimestampScalar | lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar | lib.Int64Array | Expression: + """ + Extract ISO week of year number. + + First ISO week has the majority (4 or more) of its days in January. + ISO week starts on Monday. The week number starts with 1 and can run + up to 53. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +iso_year = _clone_signature(iso_week) +""" +Extract ISO year number. + +First week of an ISO year has the majority (4 or more) of its days in January. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def is_leap_year( + values: lib.TimestampScalar[Any] | lib.Date32Scalar | lib.Date64Scalar | lib.TimestampArray + | lib.Date32Array + | lib.Date64Array + | lib.ChunkedArray[lib.TimestampScalar] + | lib.ChunkedArray[lib.Date32Scalar] + | lib.ChunkedArray[lib.Date64Scalar] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: + """ + Extract if year is a leap year. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +microsecond = _clone_signature(iso_week) +""" +Extract microsecond values. + +Microsecond returns number of microseconds since the last full millisecond. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +millisecond = _clone_signature(iso_week) +""" +Extract millisecond values. + +Millisecond returns number of milliseconds since the last full second. +Null values emit null. 
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+minute = _clone_signature(iso_week)
+"""
+Extract minute values.
+
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+month = _clone_signature(day_of_week)
+"""
+Extract month number.
+
+Month is encoded as January=1, December=12.
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+nanosecond = _clone_signature(hour)
+"""
+Extract nanosecond values.
+
+Nanosecond returns number of nanoseconds since the last full microsecond.
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+quarter = _clone_signature(day_of_week)
+"""
+Extract quarter of year number.
+
+First quarter maps to 1 and fourth quarter maps to 4.
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+second = _clone_signature(hour)
+"""
+Extract second values.
+
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+subsecond = _clone_signature(hour)
+"""
+Extract subsecond values.
+
+Subsecond returns the fraction of a second since the last full second.
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+us_week = _clone_signature(iso_week)
+"""
+Extract US week of year number.
+
+First US week has the majority (4 or more) of its days in January.
+US week starts on Monday. The week number starts with 1 and can run
+up to 53.
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+us_year = _clone_signature(iso_week)
+"""
+Extract US epidemiological year number.
+
+First week of US epidemiological year has the majority (4 or more) of
+its days in January. Last week of US epidemiological year has the
+year's last Wednesday in it. US epidemiological week starts on Sunday.
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+year = _clone_signature(iso_week)
+"""
+Extract year number.
+
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+
+def week(
+    values: lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] | Expression,
+    /,
+    *,
+    week_starts_monday: bool = True,
+    count_from_zero: bool = False,
+    first_week_is_fully_in_year: bool = False,
+    options: WeekOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar | lib.Int64Array | Expression:
+    """
+    Extract week of year number.
+
+    First week has the majority (4 or more) of its days in January.
+    Year can have 52 or 53 weeks. Week numbering can start with 0 or 1 using
+    DayOfWeekOptions.count_from_zero.
+    An error is returned if the values have a defined timezone but it
+    cannot be found in the timezone database.
+
+    Parameters
+    ----------
+    values : Array-like or scalar-like
+        Argument to compute function.
+    week_starts_monday : bool, default True
+        If True, weeks start on Monday; if False, on Sunday.
+    count_from_zero : bool, default False
+        If True, dates at the start of a year that fall into the last week
+        of the previous year emit 0.
+        If False, they emit 52 or 53 (the week number of the last week
+        of the previous year).
+    first_week_is_fully_in_year : bool, default False
+        If True, week number 0 is fully in January.
+        If False, a week that begins on December 29, 30 or 31 is considered
+        to be week number 0 of the following year.
+    options : pyarrow.compute.WeekOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def year_month_day(
+    values: TemporalScalar | TemporalArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.StructScalar | lib.StructArray | Expression:
+    """
+    Extract (year, month, day) struct.
+
+    Null values emit null.
+    An error is returned if the values have a defined timezone but it
+    cannot be found in the timezone database.
+
+    Parameters
+    ----------
+    values : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+# ========================= 2.24 Temporal difference =========================
+def day_time_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None):
+    """
+    Compute the number of days and milliseconds between two timestamps.
+
+    Returns the number of days and milliseconds from `start` to `end`.
+    That is, first the difference in days is computed as if both
+    timestamps were truncated to the day, then the difference between the times
+    of the two timestamps is computed as if both times were truncated to the
+    millisecond.
+    Null values return null.
+
+    Parameters
+    ----------
+    start : Array-like or scalar-like
+        Argument to compute function.
+    end : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def days_between(
+    start, end, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Int64Scalar | lib.Int64Array:
+    """
+    Compute the number of days between two timestamps.
+
+    Returns the number of day boundaries crossed from `start` to `end`.
+    That is, the difference is calculated as if the timestamps were
+    truncated to the day.
+    Null values emit null.
+
+    Parameters
+    ----------
+    start : Array-like or scalar-like
+        Argument to compute function.
+    end : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+hours_between = _clone_signature(days_between)
+"""
+Compute the number of hours between two timestamps.
+
+Returns the number of hour boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the hour.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+microseconds_between = _clone_signature(days_between)
+"""
+Compute the number of microseconds between two timestamps.
+
+Returns the number of microsecond boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the microsecond.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+milliseconds_between = _clone_signature(days_between)
+"""
+Compute the number of millisecond boundaries between two timestamps.
+
+Returns the number of millisecond boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the millisecond.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+minutes_between = _clone_signature(days_between)
+"""
+Compute the number of minutes between two timestamps.
+
+Returns the number of minute boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were +truncated to the minute. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def month_day_nano_interval_between( + start, end, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray: + """ + Compute the number of months, days and nanoseconds between two timestamps. + + Returns the number of months, days, and nanoseconds from `start` to `end`. + That is, first the difference in months is computed as if both timestamps + were truncated to the months, then the difference between the days + is computed, and finally the difference between the times of the two + timestamps is computed as if both times were truncated to the nanosecond. + Null values return null. + + Parameters + ---------- + start : Array-like or scalar-like + Argument to compute function. + end : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def month_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Compute the number of months between two timestamps. + + Returns the number of month boundaries crossed from `start` to `end`. + That is, the difference is calculated as if the timestamps were + truncated to the month. + Null values emit null. + + Parameters + ---------- + start : Array-like or scalar-like + Argument to compute function. + end : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +nanoseconds_between = _clone_signature(days_between) +""" +Compute the number of nanoseconds between two timestamps. + +Returns the number of nanosecond boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the nanosecond. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +quarters_between = _clone_signature(days_between) +""" +Compute the number of quarters between two timestamps. + +Returns the number of quarter start boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the quarter. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +seconds_between = _clone_signature(days_between) +""" +Compute the number of seconds between two timestamps. + +Returns the number of second boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the second. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. 
+end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def weeks_between( + start, + end, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array: + """ + Compute the number of weeks between two timestamps. + + Returns the number of week boundaries crossed from `start` to `end`. + That is, the difference is calculated as if the timestamps were + truncated to the week. + Null values emit null. + + Parameters + ---------- + start : Array-like or scalar-like + Argument to compute function. + end : Array-like or scalar-like + Argument to compute function. + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + options : pyarrow.compute.DayOfWeekOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +years_between = _clone_signature(days_between) +""" +Compute the number of years between two timestamps. + +Returns the number of year boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the year. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.25 Timezone handling ========================= +def assume_timezone( + timestamps: lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] | Expression: + """ + Convert naive timestamp to timezone-aware timestamp. + + Input timestamps are assumed to be relative to the timezone given in the + `timezone` option. They are converted to UTC-relative timestamps and + the output type has its timezone set to the value of the `timezone` + option. Null values emit null. + This function is meant to be used when an external system produces + "timezone-naive" timestamps which need to be converted to + "timezone-aware" timestamps. An error is returned if the timestamps + already have a defined timezone. + + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + timezone : str + Timezone to assume for the input. + ambiguous : str, default "raise" + How to handle timestamps that are ambiguous in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + nonexistent : str, default "raise" + How to handle timestamps that don't exist in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + options : pyarrow.compute.AssumeTimezoneOptions, optional + Alternative way of passing options. 
+ memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def local_timestamp( + timestamps: lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.TimestampScalar | lib.TimestampArray | Expression: + """ + Convert timestamp to a timezone-naive local time timestamp. + + LocalTimestamp converts timezone-aware timestamp to local timestamp + of the given timestamp's timezone and removes timezone metadata. + Alternative name for this timestamp is also wall clock time. + If input is in UTC or without timezone, then unchanged input values + without timezone metadata are returned. + Null values emit null. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.26 Random number generation ========================= +def random( + n: int, + *, + initializer: Literal["system"] | int = "system", + options: RandomOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: + """ + Generate numbers in the range [0, 1). + + Generated values are uniformly-distributed, double-precision + in range [0, 1). Algorithm and seed can be changed via RandomOptions. + + Parameters + ---------- + n : int + Number of values to generate, must be greater than or equal to 0 + initializer : int or str + How to initialize the underlying random generator. + If an integer is given, it is used as a seed. + If "system" is given, the random generator is initialized with + a system-specific source of (hopefully true) randomness. + Other values are invalid. + options : pyarrow.compute.RandomOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 3. Array-wise (“vector”) functions ========================= + +# ========================= 3.1 Cumulative Functions ========================= +def cumulative_sum( + values: _NumericArrayT | Expression, + /, + start: lib.Scalar | None = None, + *, + skip_nulls: bool = False, + options: CumulativeSumOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT | Expression: + """ + Compute the cumulative sum over a numeric input. + + `values` must be numeric. Return an array/chunked array which is the + cumulative sum computed over `values`. Results will wrap around on + integer overflow. Use function "cumulative_sum_checked" if you want + overflow to return an error. The default start is 0. + + Parameters + ---------- + values : Array-like + Argument to compute function. + start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. + skip_nulls : bool, default False + When false, the first encountered null is propagated. + options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +cumulative_sum_checked = _clone_signature(cumulative_sum) +""" +Compute the cumulative sum over a numeric input. + +`values` must be numeric. 
Return an array/chunked array which is the +cumulative sum computed over `values`. This function returns an error +on overflow. For a variant that doesn't fail on overflow, use +function "cumulative_sum". The default start is 0. + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cumulative_prod = _clone_signature(cumulative_sum) +""" +Compute the cumulative product over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative product computed over `values`. Results will wrap around on +integer overflow. Use function "cumulative_prod_checked" if you want +overflow to return an error. The default start is 1. + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cumulative_prod_checked = _clone_signature(cumulative_sum) +""" +Compute the cumulative product over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative product computed over `values`. This function returns an error +on overflow. For a variant that doesn't fail on overflow, use +function "cumulative_prod". The default start is 1. + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cumulative_max = _clone_signature(cumulative_sum) +""" +Compute the cumulative max over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative max computed over `values`. The default start is the minimum +value of input type (so that any other value will replace the +start as the new maximum). + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
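+
+Examples
+--------
+A small illustrative sketch (object-repr line omitted; assumes a pyarrow
+version that provides the cumulative kernels declared in this stub):
+
+>>> import pyarrow as pa
+>>> import pyarrow.compute as pc
+>>> pc.cumulative_max(pa.array([1, 3, 2, 5, 4]))
+[
+  1,
+  3,
+  3,
+  5,
+  5
+]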
+""" +cumulative_min = _clone_signature(cumulative_sum) +""" +Compute the cumulative min over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative min computed over `values`. The default start is the maximum +value of input type (so that any other value will replace the +start as the new minimum). + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cumulative_mean = _clone_signature(cumulative_sum) +""" +Compute the cumulative max over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative max computed over `values`. The default start is the minimum +value of input type (so that any other value will replace the +start as the new maximum). + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +# ========================= 3.2 Associative transforms ========================= + +def dictionary_encode( + array: _ScalarOrArrayT | Expression, + /, + null_encoding: Literal["mask", "encode"] = "mask", + *, + options=None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarOrArrayT | Expression: ... +def unique(array: _ArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT | Expression: ... +def value_counts( + array: lib.Array | lib.ChunkedArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.StructArray | Expression: ... + +# ========================= 3.3 Selections ========================= +@overload +def array_filter( + array: _ArrayT, + selection_filter: list[bool] | list[bool | None] | BooleanArray, + /, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + *, + options: FilterOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT: ... +@overload +def array_filter( + array: Expression, + selection_filter: list[bool] | list[bool | None] | BooleanArray, + /, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + *, + options: FilterOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def array_take( + array: _ArrayT, + indices: list[int] + | list[int | None] + | lib.Int16Array + | lib.Int32Array + | lib.Int64Array + | lib.ChunkedArray[lib.Int16Scalar] + | lib.ChunkedArray[lib.Int32Scalar] + | lib.ChunkedArray[lib.Int64Scalar], + /, + *, + boundscheck: bool = True, + options: TakeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT: ... 
+@overload
+def array_take(
+    array: Expression,
+    indices: list[int]
+    | list[int | None]
+    | lib.Int16Array
+    | lib.Int32Array
+    | lib.Int64Array
+    | lib.ChunkedArray[lib.Int16Scalar]
+    | lib.ChunkedArray[lib.Int32Scalar]
+    | lib.ChunkedArray[lib.Int64Scalar],
+    /,
+    *,
+    boundscheck: bool = True,
+    options: TakeOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+@overload
+def drop_null(input: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ...
+@overload
+def drop_null(
+    input: Expression, /, *, memory_pool: lib.MemoryPool | None = None
+) -> Expression: ...
+
+filter = array_filter
+take = array_take
+"""
+Select values (or records) from array- or table-like data given integer
+selection indices.
+
+The result will be of the same type(s) as the input, with elements taken
+from the input array (or record batch / table fields) at the given
+indices. If an index is null then the corresponding value in the output
+will be null.
+
+Parameters
+----------
+data : Array, ChunkedArray, RecordBatch, or Table
+indices : Array, ChunkedArray
+    Must be of integer type
+boundscheck : boolean, default True
+    Whether to boundscheck the indices. If False and there is an out of
+    bounds index, will likely cause the process to crash.
+memory_pool : MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+
+Returns
+-------
+result : depends on inputs
+    Selected values for the given indices
+
+Examples
+--------
+>>> import pyarrow as pa
+>>> arr = pa.array(["a", "b", "c", None, "e", "f"])
+>>> indices = pa.array([0, None, 4, 3])
+>>> arr.take(indices)
+
+[
+  "a",
+  null,
+  "e",
+  null
+]
+"""
+
+# ========================= 3.4 Containment tests =========================
+def indices_nonzero(
+    values: lib.BooleanArray
+    | lib.NullArray
+    | NumericArray
+    | lib.Decimal128Array
+    | lib.Decimal256Array | Expression,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.UInt64Array | Expression:
+    """
+    Return the indices of the values in the array that are non-zero.
+
+    For each input value, check if it's zero, false or null. Emit the index
+    of the value in the array if it's none of those.
+
+    Parameters
+    ----------
+    values : Array-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+# ========================= 3.5 Sorts and partitions =========================
+def array_sort_indices(
+    array: lib.Array | lib.ChunkedArray | Expression,
+    /,
+    order: _Order = "ascending",
+    *,
+    null_placement: _Placement = "at_end",
+    options: ArraySortOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.UInt64Array | Expression:
+    """
+    Return the indices that would sort an array.
+
+    This function computes an array of indices that define a stable sort
+    of the input array. By default, null values are considered greater
+    than any other value and are therefore sorted at the end of the array.
+    For floating-point types, NaNs are considered greater than any
+    other non-null value, but smaller than null values.
+
+    The handling of nulls and NaNs can be changed in ArraySortOptions.
+
+    Parameters
+    ----------
+    array : Array-like
+        Argument to compute function.
+    order : str, default "ascending"
+        Which order to sort values in.
+        Accepted values are "ascending", "descending".
+    null_placement : str, default "at_end"
+        Where nulls in the input should be sorted.
+        Accepted values are "at_start", "at_end".
+    options : pyarrow.compute.ArraySortOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def partition_nth_indices(
+    array: lib.Array | lib.ChunkedArray | Expression,
+    /,
+    pivot: int,
+    *,
+    null_placement: _Placement = "at_end",
+    options: PartitionNthOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.UInt64Array | Expression:
+    """
+    Return the indices that would partition an array around a pivot.
+
+    This function computes an array of indices that define a non-stable
+    partial sort of the input array.
+
+    The output is such that the `N`'th index points to the `N`'th element
+    of the input in sorted order, and all indices before the `N`'th point
+    to elements in the input less or equal to elements at or after the `N`'th.
+
+    By default, null values are considered greater than any other value
+    and are therefore partitioned towards the end of the array.
+    For floating-point types, NaNs are considered greater than any
+    other non-null value, but smaller than null values.
+
+    The pivot index `N` must be given in PartitionNthOptions.
+    The handling of nulls and NaNs can also be changed in PartitionNthOptions.
+
+    Parameters
+    ----------
+    array : Array-like
+        Argument to compute function.
+    pivot : int
+        Index into the equivalent sorted array of the pivot element.
+    null_placement : str, default "at_end"
+        Where nulls in the input should be partitioned.
+        Accepted values are "at_start", "at_end".
+    options : pyarrow.compute.PartitionNthOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def rank(
+    input: lib.Array | lib.ChunkedArray,
+    /,
+    sort_keys: _Order = "ascending",
+    *,
+    null_placement: _Placement = "at_end",
+    tiebreaker: Literal["min", "max", "first", "dense"] = "first",
+    options: RankOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.UInt64Array:
+    """
+    Compute ordinal ranks of an array (1-based).
+
+    This function computes a rank of the input array.
+    By default, null values are considered greater than any other value and
+    are therefore sorted at the end of the input. For floating-point types,
+    NaNs are considered greater than any other non-null value, but smaller
+    than null values. The default tiebreaker is to assign ranks in order of
+    when ties appear in the input.
+
+    The handling of nulls, NaNs and tiebreakers can be changed in RankOptions.
+
+    Parameters
+    ----------
+    input : Array-like or scalar-like
+        Argument to compute function.
+    sort_keys : sequence of (name, order) tuples or str, default "ascending"
+        Names of field/column keys to sort the input on,
+        along with the order each field/column is sorted in.
+        Accepted values for `order` are "ascending", "descending".
+        The field name can be a string column name or expression.
+        Alternatively, one can simply pass "ascending" or "descending" as a string
+        if the input is array-like.
+    null_placement : str, default "at_end"
+        Where nulls in input should be sorted.
+        Accepted values are "at_start", "at_end".
+    tiebreaker : str, default "first"
+        Configure how ties between equal values are handled.
+        Accepted values are:
+
+        - "min": Ties get the smallest possible rank in sorted order.
+        - "max": Ties get the largest possible rank in sorted order.
+ - "first": Ranks are assigned in order of when ties appear in the + input. This ensures the ranks are a stable permutation + of the input. + - "dense": The ranks span a dense [1, M] interval where M is the + number of distinct values in the input. + options : pyarrow.compute.RankOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def select_k_unstable( + input: lib.Array | lib.ChunkedArray | Expression, + /, + k: int, + sort_keys: list[tuple[str, _Order]], + *, + options: SelectKOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: + """ + Select the indices of the first `k` ordered elements from the input. + + This function selects an array of indices of the first `k` ordered elements + from the `input` array, record batch or table specified in the column keys + (`options.sort_keys`). Output is not guaranteed to be stable. + Null values are considered greater than any other value and are + therefore ordered at the end. For floating-point types, NaNs are considered + greater than any other non-null value, but smaller than null values. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + k : int + Number of leading values to select in sorted order + (i.e. the largest values if sort order is "descending", + the smallest otherwise). + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + options : pyarrow.compute.SelectKOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def sort_indices( + input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table | Expression, + /, + sort_keys: Sequence[tuple[str, _Order]] = (), + *, + null_placement: _Placement = "at_end", + options: SortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: + """ + Return the indices that would sort an array, record batch or table. + + This function computes an array of indices that define a stable sort + of the input array, record batch or table. By default, null values are + considered greater than any other value and are therefore sorted at the + end of the input. For floating-point types, NaNs are considered greater + than any other non-null value, but smaller than null values. + + The handling of nulls and NaNs can be changed in SortOptions. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + null_placement : str, default "at_end" + Where nulls in input should be sorted, only applying to + columns/fields mentioned in `sort_keys`. + Accepted values are "at_start", "at_end". + options : pyarrow.compute.SortOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
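+
+    Examples
+    --------
+    A small illustrative sketch (object-repr line omitted; the column name
+    "a" is just an example):
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> table = pa.table({"a": [3, 1, 2]})
+    >>> pc.sort_indices(table, sort_keys=[("a", "ascending")])
+    [
+      1,
+      2,
+      0
+    ]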
+    """
+
+# ========================= 3.6 Structural transforms =========================
+def list_element(
+    lists: lib.Array[ListScalar[_DataTypeT]] | lib.ChunkedArray[ListScalar[_DataTypeT]] | ListScalar[_DataTypeT] | Expression,
+    index: ScalarLike, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Array[lib.Scalar[_DataTypeT]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]] | _DataTypeT | Expression:
+    """
+    Compute elements of nested list values using an index.
+
+    `lists` must have a list-like type.
+    For each value in each list of `lists`, the element at `index`
+    is emitted. Null values emit a null in the output.
+
+    Parameters
+    ----------
+    lists : Array-like or scalar-like
+        Argument to compute function.
+    index : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def list_flatten(
+    lists: ArrayOrChunkedArray[ListScalar[Any]] | Expression,
+    /,
+    recursive: bool = False,
+    *,
+    options: ListFlattenOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.ListArray[Any] | Expression:
+    """
+    Flatten list values.
+
+    `lists` must have a list-like type (lists, list-views, and
+    fixed-size lists).
+    Return an array with the top list level flattened unless
+    `recursive` is set to true in ListFlattenOptions. When that
+    is the case, flattening happens recursively until a non-list
+    array is formed.
+
+    Null list values do not emit anything to the output.
+
+    Parameters
+    ----------
+    lists : Array-like
+        Argument to compute function.
+    recursive : bool, default False
+        When True, the list array is flattened recursively until an array
+        of non-list values is formed.
+    options : pyarrow.compute.ListFlattenOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def list_parent_indices(
+    lists: ArrayOrChunkedArray[Any] | Expression, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Int64Array | Expression:
+    """
+    Compute parent indices of nested list values.
+
+    `lists` must have a list-like or list-view type.
+    For each value in each list of `lists`, the top-level list index
+    is emitted.
+
+    Parameters
+    ----------
+    lists : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def list_slice(
+    lists: ArrayOrChunkedArray[Any] | Expression,
+    /,
+    start: int,
+    stop: int | None = None,
+    step: int = 1,
+    return_fixed_size_list: bool | None = None,
+    *,
+    options: ListSliceOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.ListArray[Any] | Expression:
+    """
+    Compute slice of list-like array.
+
+    `lists` must have a list-like type.
+    For each list element, compute a slice, returning a new list array.
+    A variable or fixed size list array is returned, depending on options.
+
+    Parameters
+    ----------
+    lists : Array-like or scalar-like
+        Argument to compute function.
+    start : int
+        Index to start slicing inner list elements (inclusive).
+    stop : Optional[int], default None
+        If given, index to stop slicing at (exclusive).
+        If not given, slicing will stop at the end. (NotImplemented)
+    step : int, default 1
+        Slice step.
+    return_fixed_size_list : Optional[bool], default None
+        Whether to return a FixedSizeListArray.
If true _and_ stop is after + a list element's length, nulls will be appended to create the + requested slice size. The default of `None` will return the same + type which was passed in. + options : pyarrow.compute.ListSliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def map_lookup( + container, + /, + query_key, + occurrence: str, + *, + options: MapLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): + """ + Find the items corresponding to a given key in a Map. + + For a given query key (passed via MapLookupOptions), extract + either the FIRST, LAST or ALL items from a Map that have + matching keys. + + Parameters + ---------- + container : Array-like or scalar-like + Argument to compute function. + query_key : Scalar or Object can be converted to Scalar + The key to search for. + occurrence : str + The occurrence(s) to return from the Map + Accepted values are "first", "last", or "all". + options : pyarrow.compute.MapLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def struct_field( + values, + /, + indices, + *, + options: StructFieldOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): + """ + Extract children of a struct or union by index. + + Given a list of indices (passed via StructFieldOptions), extract + the child array or scalar with the given child index, recursively. + + For union inputs, nulls are emitted for union values that reference + a different child than specified. Also, the indices are always + in physical order, not logical type codes - for example, the first + child is always index 0. + + An empty list of indices returns the argument unchanged. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + indices : List[str], List[bytes], List[int], Expression, bytes, str, or int + List of indices for chained field lookup, for example `[4, 1]` + will look up the second nested field in the fifth outer field. + options : pyarrow.compute.StructFieldOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def fill_null_backward(values, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Carry non-null values backward to fill null slots. + + Given an array, propagate next valid observation backward to previous valid + or nothing if all next values are null. + + Parameters + ---------- + values : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def fill_null_forward(values, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Carry non-null values forward to fill null slots. + + Given an array, propagate last valid observation forward to next valid + or nothing if all previous values are null. + + Parameters + ---------- + values : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
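+
+    Examples
+    --------
+    A small illustrative sketch (object-repr line omitted):
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> pc.fill_null_forward(pa.array([1, None, None, 4]))
+    [
+      1,
+      1,
+      1,
+      4
+    ]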
+    """
+
+def replace_with_mask(
+    values,
+    mask: list[bool] | list[bool | None] | BooleanArray,
+    replacements,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+):
+    """
+    Replace items selected with a mask.
+
+    Given an array and a boolean mask (either scalar or of equal length),
+    along with replacement values (either scalar or array),
+    each element of the array for which the corresponding mask element is
+    true will be replaced by the next value from the replacements,
+    or with null if the mask is null.
+    Hence, for replacement arrays, len(replacements) == sum(mask == true).
+
+    Parameters
+    ----------
+    values : Array-like
+        Argument to compute function.
+    mask : Array-like
+        Argument to compute function.
+    replacements : Array-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+# ========================= 3.7 Pairwise functions =========================
+def pairwise_diff(
+    input: _NumericOrTemporalArrayT | Expression,
+    /,
+    period: int = 1,
+    *,
+    options: PairwiseOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _NumericOrTemporalArrayT | Expression:
+    """
+    Compute first order difference of an array.
+
+    Computes the first order difference of an array. It internally calls
+    the scalar function "subtract" to compute differences, so its
+    behavior and supported types are the same as "subtract".
+    The period can be specified in :struct:`PairwiseOptions`.
+
+    Results will wrap around on integer overflow. Use function
+    "pairwise_diff_checked" if you want overflow to return an error.
+
+    Parameters
+    ----------
+    input : Array-like
+        Argument to compute function.
+    period : int, default 1
+        Period for applying the period function.
+    options : pyarrow.compute.PairwiseOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+pairwise_diff_checked = _clone_signature(pairwise_diff)
+"""
+Compute first order difference of an array.
+
+Computes the first order difference of an array. It internally calls
+the scalar function "subtract_checked" to compute differences, so its
+behavior and supported types are the same as "subtract_checked".
+The period can be specified in :struct:`PairwiseOptions`.
+
+This function returns an error on overflow. For a variant that doesn't
+fail on overflow, use function "pairwise_diff".
+
+Parameters
+----------
+input : Array-like
+    Argument to compute function.
+period : int, default 1
+    Period for applying the period function.
+options : pyarrow.compute.PairwiseOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
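+
+Examples
+--------
+A small illustrative sketch (object-repr line omitted; assumes a pyarrow
+version that provides pairwise_diff_checked):
+
+>>> import pyarrow as pa
+>>> import pyarrow.compute as pc
+>>> pc.pairwise_diff_checked(pa.array([2, 5, 9]), period=1)
+[
+  null,
+  3,
+  4
+]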
+""" diff --git a/python/pyarrow-stubs/config.pyi b/python/pyarrow-stubs/config.pyi new file mode 100644 index 00000000000..166e10c9734 --- /dev/null +++ b/python/pyarrow-stubs/config.pyi @@ -0,0 +1,41 @@ +from typing import NamedTuple + +class VersionInfo(NamedTuple): + major: int + minor: int + patch: int + +class BuildInfo(NamedTuple): + version: str + version_info: VersionInfo + so_version: str + full_so_version: str + compiler_id: str + compiler_version: str + compiler_flags: str + git_id: str + git_description: str + package_kind: str + build_type: str + +class RuntimeInfo(NamedTuple): + simd_level: str + detected_simd_level: str + +cpp_build_info: BuildInfo +cpp_version: str +cpp_version_info: VersionInfo + +def runtime_info() -> RuntimeInfo: ... +def set_timezone_db_path(path: str) -> None: ... + +__all__ = [ + "VersionInfo", + "BuildInfo", + "RuntimeInfo", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "set_timezone_db_path", +] diff --git a/python/pyarrow-stubs/dataset.pyi b/python/pyarrow-stubs/dataset.pyi new file mode 100644 index 00000000000..a145437bb52 --- /dev/null +++ b/python/pyarrow-stubs/dataset.pyi @@ -0,0 +1,229 @@ +# from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload +# +# from _typeshed import StrPath +# from pyarrow._dataset import ( +# CsvFileFormat, +# CsvFragmentScanOptions, +# Dataset, +# DatasetFactory, +# DirectoryPartitioning, +# FeatherFileFormat, +# FileFormat, +# FileFragment, +# FilenamePartitioning, +# FileSystemDataset, +# FileSystemDatasetFactory, +# FileSystemFactoryOptions, +# FileWriteOptions, +# Fragment, +# FragmentScanOptions, +# HivePartitioning, +# InMemoryDataset, +# IpcFileFormat, +# IpcFileWriteOptions, +# JsonFileFormat, +# JsonFragmentScanOptions, +# Partitioning, +# PartitioningFactory, +# Scanner, +# TaggedRecordBatch, +# UnionDataset, +# UnionDatasetFactory, +# WrittenFile, +# get_partition_keys, +# ) +# from pyarrow._dataset_orc import OrcFileFormat +# from pyarrow._dataset_parquet import ( +# ParquetDatasetFactory, +# ParquetFactoryOptions, +# ParquetFileFormat, +# ParquetFileFragment, +# ParquetFileWriteOptions, +# ParquetFragmentScanOptions, +# ParquetReadOptions, +# RowGroupInfo, +# ) +# from pyarrow._dataset_parquet_encryption import ( +# ParquetDecryptionConfig, +# ParquetEncryptionConfig, +# ) +# from pyarrow.compute import Expression, field, scalar +# from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table +# +# from ._fs import SupportedFileSystem +# +# _orc_available: bool +# _parquet_available: bool +# +# __all__ = [ +# "CsvFileFormat", +# "CsvFragmentScanOptions", +# "Dataset", +# "DatasetFactory", +# "DirectoryPartitioning", +# "FeatherFileFormat", +# "FileFormat", +# "FileFragment", +# "FilenamePartitioning", +# "FileSystemDataset", +# "FileSystemDatasetFactory", +# "FileSystemFactoryOptions", +# "FileWriteOptions", +# "Fragment", +# "FragmentScanOptions", +# "HivePartitioning", +# "InMemoryDataset", +# "IpcFileFormat", +# "IpcFileWriteOptions", +# "JsonFileFormat", +# "JsonFragmentScanOptions", +# "Partitioning", +# "PartitioningFactory", +# "Scanner", +# "TaggedRecordBatch", +# "UnionDataset", +# "UnionDatasetFactory", +# "WrittenFile", +# "get_partition_keys", +# # Orc +# "OrcFileFormat", +# # Parquet +# "ParquetDatasetFactory", +# "ParquetFactoryOptions", +# "ParquetFileFormat", +# "ParquetFileFragment", +# "ParquetFileWriteOptions", +# "ParquetFragmentScanOptions", +# "ParquetReadOptions", +# "RowGroupInfo", +# # Parquet Encryption +# 
"ParquetDecryptionConfig", +# "ParquetEncryptionConfig", +# # Compute +# "Expression", +# "field", +# "scalar", +# # Dataset +# "partitioning", +# "parquet_dataset", +# "write_dataset", +# ] +# +# _DatasetFormat: TypeAlias = Literal["parquet", "ipc", "arrow", "feather", "csv"] +# +# @overload +# def partitioning( +# schema: Schema, +# ) -> Partitioning: ... +# @overload +# def partitioning( +# schema: Schema, +# *, +# flavor: Literal["filename"], +# dictionaries: dict[str, Array] | None = None, +# ) -> Partitioning: ... +# @overload +# def partitioning( +# schema: Schema, +# *, +# flavor: Literal["filename"], +# dictionaries: Literal["infer"], +# ) -> PartitioningFactory: ... +# @overload +# def partitioning( +# field_names: list[str], +# *, +# flavor: Literal["filename"], +# ) -> PartitioningFactory: ... +# @overload +# def partitioning( +# schema: Schema, +# *, +# flavor: Literal["hive"], +# dictionaries: Literal["infer"], +# ) -> PartitioningFactory: ... +# @overload +# def partitioning( +# *, +# flavor: Literal["hive"], +# ) -> PartitioningFactory: ... +# @overload +# def partitioning( +# schema: Schema, +# *, +# flavor: Literal["hive"], +# dictionaries: dict[str, Array] | None = None, +# ) -> Partitioning: ... +# def parquet_dataset( +# metadata_path: StrPath, +# schema: Schema | None = None, +# filesystem: SupportedFileSystem | None = None, +# format: ParquetFileFormat | None = None, +# partitioning: Partitioning | PartitioningFactory | None = None, +# partition_base_dir: str | None = None, +# ) -> FileSystemDataset: ... +# @overload +# def dataset( +# source: StrPath | Sequence[StrPath], +# schema: Schema | None = None, +# format: FileFormat | _DatasetFormat | None = None, +# filesystem: SupportedFileSystem | str | None = None, +# partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, +# partition_base_dir: str | None = None, +# exclude_invalid_files: bool | None = None, +# ignore_prefixes: list[str] | None = None, +# ) -> FileSystemDataset: ... +# @overload +# def dataset( +# source: list[Dataset], +# schema: Schema | None = None, +# format: FileFormat | _DatasetFormat | None = None, +# filesystem: SupportedFileSystem | str | None = None, +# partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, +# partition_base_dir: str | None = None, +# exclude_invalid_files: bool | None = None, +# ignore_prefixes: list[str] | None = None, +# ) -> UnionDataset: ... +# @overload +# def dataset( +# source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, +# schema: Schema | None = None, +# format: FileFormat | _DatasetFormat | None = None, +# filesystem: SupportedFileSystem | str | None = None, +# partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, +# partition_base_dir: str | None = None, +# exclude_invalid_files: bool | None = None, +# ignore_prefixes: list[str] | None = None, +# ) -> InMemoryDataset: ... +# @overload +# def dataset( +# source: RecordBatch | Table, +# schema: Schema | None = None, +# format: FileFormat | _DatasetFormat | None = None, +# filesystem: SupportedFileSystem | str | None = None, +# partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, +# partition_base_dir: str | None = None, +# exclude_invalid_files: bool | None = None, +# ignore_prefixes: list[str] | None = None, +# ) -> InMemoryDataset: ... 
+# def write_dataset( +# data: Dataset | Table | RecordBatch | RecordBatchReader | list[Table] | Iterable[RecordBatch], +# base_dir: StrPath, +# *, +# basename_template: str | None = None, +# format: FileFormat | _DatasetFormat | None = None, +# partitioning: Partitioning | list[str] | None = None, +# partitioning_flavor: str | None = None, +# schema: Schema | None = None, +# filesystem: SupportedFileSystem | None = None, +# file_options: FileWriteOptions | None = None, +# use_threads: bool = True, +# max_partitions: int = 1024, +# max_open_files: int = 1024, +# max_rows_per_file: int = 0, +# min_rows_per_group: int = 0, +# max_rows_per_group: int = 1024 * 1024, +# file_visitor: Callable[[str], None] | None = None, +# existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"] = "error", +# create_dir: bool = True, +# ): ... diff --git a/python/pyarrow-stubs/device.pyi b/python/pyarrow-stubs/device.pyi new file mode 100644 index 00000000000..d1b9f39eedd --- /dev/null +++ b/python/pyarrow-stubs/device.pyi @@ -0,0 +1,88 @@ +import enum + +from pyarrow.lib import _Weakrefable + +class DeviceAllocationType(enum.Flag): + CPU = enum.auto() + CUDA = enum.auto() + CUDA_HOST = enum.auto() + OPENCL = enum.auto() + VULKAN = enum.auto() + METAL = enum.auto() + VPI = enum.auto() + ROCM = enum.auto() + ROCM_HOST = enum.auto() + EXT_DEV = enum.auto() + CUDA_MANAGED = enum.auto() + ONEAPI = enum.auto() + WEBGPU = enum.auto() + HEXAGON = enum.auto() + +class Device(_Weakrefable): + """ + Abstract interface for hardware devices + + This object represents a device with access to some memory spaces. + When handling a Buffer or raw memory address, it allows deciding in which + context the raw memory address should be interpreted + (e.g. CPU-accessible memory, or embedded memory on some particular GPU). + """ + + @property + def type_name(self) -> str: + """ + A shorthand for this device's type. + """ + @property + def device_id(self) -> int: + """ + A device ID to identify this device if there are multiple of this type. + + If there is no "device_id" equivalent (such as for the main CPU device on + non-numa systems) returns -1. + """ + @property + def is_cpu(self) -> bool: + """ + Whether this device is the main CPU device. + + This shorthand method is very useful when deciding whether a memory address + is CPU-accessible. + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + Return the DeviceAllocationType of this device. + """ + +class MemoryManager(_Weakrefable): + """ + An object that provides memory management primitives. + + A MemoryManager is always tied to a particular Device instance. + It can also have additional parameters (such as a MemoryPool to + allocate CPU memory). + + """ + @property + def device(self) -> Device: + """ + The device this MemoryManager is tied to. + """ + @property + def is_cpu(self) -> bool: + """ + Whether this MemoryManager is tied to the main CPU device. + + This shorthand method is very useful when deciding whether a memory + address is CPU-accessible. + """ + +def default_cpu_memory_manager() -> MemoryManager: + """ + Return the default CPU MemoryManager instance. + + The returned singleton instance uses the default MemoryPool. 
+ """ + +__all__ = ["DeviceAllocationType", "Device", "MemoryManager", "default_cpu_memory_manager"] diff --git a/python/pyarrow-stubs/error.pyi b/python/pyarrow-stubs/error.pyi new file mode 100644 index 00000000000..981ed51e680 --- /dev/null +++ b/python/pyarrow-stubs/error.pyi @@ -0,0 +1,53 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +class ArrowException(Exception): ... +class ArrowInvalid(ValueError, ArrowException): ... +class ArrowMemoryError(MemoryError, ArrowException): ... +class ArrowKeyError(KeyError, ArrowException): ... +class ArrowTypeError(TypeError, ArrowException): ... +class ArrowNotImplementedError(NotImplementedError, ArrowException): ... +class ArrowCapacityError(ArrowException): ... +class ArrowIndexError(IndexError, ArrowException): ... +class ArrowSerializationError(ArrowException): ... + +class ArrowCancelled(ArrowException): + signum: int | None + def __init__(self, message: str, signum: int | None = None) -> None: ... + +ArrowIOError = IOError + +class StopToken: ... + +def enable_signal_handlers(enable: bool) -> None: ... + +have_signal_refcycle: bool + +class SignalStopHandler: + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, exc_tb) -> None: ... + def __dealloc__(self) -> None: ... + @property + def stop_token(self) -> StopToken: ... + +__all__ = [ + "ArrowException", + "ArrowInvalid", + "ArrowMemoryError", + "ArrowKeyError", + "ArrowTypeError", + "ArrowNotImplementedError", + "ArrowCapacityError", + "ArrowIndexError", + "ArrowSerializationError", + "ArrowCancelled", + "ArrowIOError", + "StopToken", + "enable_signal_handlers", + "have_signal_refcycle", + "SignalStopHandler", +] diff --git a/python/pyarrow-stubs/interchange/__init__.pyi b/python/pyarrow-stubs/interchange/__init__.pyi new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pyarrow-stubs/interchange/buffer.pyi b/python/pyarrow-stubs/interchange/buffer.pyi new file mode 100644 index 00000000000..46673961a75 --- /dev/null +++ b/python/pyarrow-stubs/interchange/buffer.pyi @@ -0,0 +1,58 @@ +import enum + +from pyarrow.lib import Buffer + +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + +class _PyArrowBuffer: + """ + Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ + def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ... + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + def __dlpack__(self): + """ + Produce DLPack capsule (see array API standard). + + Raises: + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + + Useful to have to connect to array libraries. 
Support optional because + it's not completely trivial to implement for a Python-only library. + """ + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. + """ diff --git a/python/pyarrow-stubs/interchange/column.pyi b/python/pyarrow-stubs/interchange/column.pyi new file mode 100644 index 00000000000..e6662867b6b --- /dev/null +++ b/python/pyarrow-stubs/interchange/column.pyi @@ -0,0 +1,252 @@ +import enum + +from typing import Any, Iterable, TypeAlias, TypedDict + +from pyarrow.lib import Array, ChunkedArray + +from .buffer import _PyArrowBuffer + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + +Dtype: TypeAlias = tuple[DtypeKind, int, str, str] + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + +class ColumnBuffers(TypedDict): + data: tuple[_PyArrowBuffer, Dtype] + validity: tuple[_PyArrowBuffer, Dtype] | None + offsets: tuple[_PyArrowBuffer, Dtype] | None + +class CategoricalDescription(TypedDict): + is_ordered: bool + is_dictionary: bool + categories: _PyArrowColumn | None + +class Endianness(enum.Enum): + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" + +class _PyArrowColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). 
Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + def __init__(self, column: Array | ChunkedArray, allow_copy: bool = True) -> None: ... + def size(self) -> int: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + @property + def offset(self) -> int: + """ + Offset of first element. + + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + @property + def dtype(self) -> tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. + """ + @property + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding categorical + values. + + Raises TypeError if the dtype is not categorical + + Returns the dictionary with description on how to interpret the + data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices + is semantically meaningful. 
+ - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of + indices to category values (e.g. an array of + cat1, cat2, ...). None if not a dictionary-style + categorical. + + TBD: are there any other in-memory representations that are needed? + """ + @property + def describe_null(self) -> tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. + None otherwise. + """ + @property + def null_count(self) -> int: + """ + Number of null elements, if known. + + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + @property + def metadata(self) -> dict[str, Any]: + """ + The metadata for the column. See `DataFrame.metadata` for more details. + """ + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ diff --git a/python/pyarrow-stubs/interchange/dataframe.pyi b/python/pyarrow-stubs/interchange/dataframe.pyi new file mode 100644 index 00000000000..526a58926a9 --- /dev/null +++ b/python/pyarrow-stubs/interchange/dataframe.pyi @@ -0,0 +1,102 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Any, Iterable, Sequence + +from pyarrow.interchange.column import _PyArrowColumn +from pyarrow.lib import RecordBatch, Table + +class _PyArrowDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + + def __init__( + self, df: Table | RecordBatch, nan_as_null: bool = False, allow_copy: bool = True + ) -> None: ... 
+ def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: + """ + Construct a new exchange object, potentially changing the parameters. + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN``. + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this + protocol specifies contiguous buffers. + """ + @property + def metadata(self) -> dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ + def num_rows(self) -> int: + """ + Return the number of rows in the DataFrame, if available. + """ + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + def get_column(self, i: int) -> _PyArrowColumn: + """ + Return the column at the indicated position. + """ + def get_column_by_name(self, name: str) -> _PyArrowColumn: + """ + Return the column whose name is the indicated name. + """ + def get_columns(self) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the columns. + """ + def select_columns(self, indices: Sequence[int]) -> Self: + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + def select_columns_by_name(self, names: Sequence[str]) -> Self: + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: + """ + Return an iterator yielding the chunks. + + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + + Note that the producer must ensure that all columns are chunked the + same way. + """ diff --git a/python/pyarrow-stubs/interchange/from_dataframe.pyi b/python/pyarrow-stubs/interchange/from_dataframe.pyi new file mode 100644 index 00000000000..b04b6268975 --- /dev/null +++ b/python/pyarrow-stubs/interchange/from_dataframe.pyi @@ -0,0 +1,244 @@ +from typing import Any, Protocol, TypeAlias + +from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table + +from .column import ( + ColumnBuffers, + ColumnNullType, + Dtype, + DtypeKind, +) + +class DataFrameObject(Protocol): + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> Any: ... + +ColumnObject: TypeAlias = Any + +def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: + """ + Build a ``pa.Table`` from any DataFrame supporting the interchange protocol. 
+ + Parameters + ---------- + df : DataFrameObject + Object supporting the interchange protocol, i.e. `__dataframe__` + method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Table + + Examples + -------- + >>> import pyarrow + >>> from pyarrow.interchange import from_dataframe + + Convert a pandas dataframe to a pyarrow table: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_attendees": [100, 10, 1], + ... "country": ["Italy", "Spain", "Slovenia"], + ... } + ... ) + >>> df + n_attendees country + 0 100 Italy + 1 10 Spain + 2 1 Slovenia + >>> from_dataframe(df) + pyarrow.Table + n_attendees: int64 + country: large_string + ---- + n_attendees: [[100,10,1]] + country: [["Italy","Spain","Slovenia"]] + """ + +def protocol_df_chunk_to_pyarrow(df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: + """ + Convert interchange protocol chunk to ``pa.RecordBatch``. + + Parameters + ---------- + df : DataFrameObject + Object supporting the interchange protocol, i.e. `__dataframe__` + method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.RecordBatch + """ + +def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: + """ + Convert a column holding one of the primitive dtypes to a PyArrow array. + A primitive type is one of: int, uint, float, bool (1 bit). + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + """ + +def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: + """ + Convert a column holding boolean dtype to a PyArrow array. + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + """ + +def categorical_column_to_dictionary( + col: ColumnObject, allow_copy: bool = True +) -> DictionaryArray: + """ + Convert a column holding categorical data to a pa.DictionaryArray. + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.DictionaryArray + """ + +def parse_datetime_format_str(format_str: str) -> tuple[str, str]: + """Parse datetime `format_str` to interpret the `data`.""" + +def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: + """Map column date type to pyarrow date type.""" + +def buffers_to_array( + buffers: ColumnBuffers, + data_type: tuple[DtypeKind, int, str, str], + length: int, + describe_null: ColumnNullType, + offset: int = 0, + allow_copy: bool = True, +) -> Array: + """ + Build a PyArrow array from the passed buffer. + + Parameters + ---------- + buffer : ColumnBuffers + Dictionary containing tuples of underlying buffers and + their associated dtype. + data_type : Tuple[DtypeKind, int, str, str], + Dtype description of the column as a tuple ``(kind, bit-width, format string, + endianness)``. + length : int + The number of values in the array. 
+ describe_null: ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + + Notes + ----- + The returned array doesn't own the memory. The caller of this function + is responsible for keeping the memory owner object alive as long as + the returned PyArrow array is being used. + """ + +def validity_buffer_from_mask( + validity_buff: Buffer, + validity_dtype: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: + """ + Build a PyArrow buffer from the passed mask buffer. + + Parameters + ---------- + validity_buff : BufferObject + Tuple of underlying validity buffer and associated dtype. + validity_dtype : Dtype + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + describe_null : ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + length : int + The number of values in the array. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Buffer + """ + +def validity_buffer_nan_sentinel( + data_pa_buffer: Buffer, + data_type: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: + """ + Build a PyArrow buffer from NaN or sentinel values. + + Parameters + ---------- + data_pa_buffer : pa.Buffer + PyArrow buffer for the column data. + data_type : Dtype + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + describe_null : ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + length : int + The number of values in the array. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Buffer + """ diff --git a/python/pyarrow-stubs/io.pyi b/python/pyarrow-stubs/io.pyi index b8404225e18..3d630498a1d 100644 --- a/python/pyarrow-stubs/io.pyi +++ b/python/pyarrow-stubs/io.pyi @@ -37,7 +37,7 @@ import builtins from pyarrow._stubs_typing import Compression, SupportPyBuffer from pyarrow.lib import MemoryPool, _Weakrefable -# from .device import Device, DeviceAllocationType, MemoryManager +from .device import Device, DeviceAllocationType, MemoryManager from ._types import KeyValueMetadata def have_libhdfs() -> bool: @@ -605,34 +605,33 @@ class Buffer(_Weakrefable): """ Whether the buffer is CPU-accessible. """ - # TODO - # @property - # def device(self) -> Device: - # """ - # The device where the buffer resides. - # - # Returns - # ------- - # Device - # """ - # @property - # def memory_manager(self) -> MemoryManager: - # """ - # The memory manager associated with the buffer. - # - # Returns - # ------- - # MemoryManager - # """ - # @property - # def device_type(self) -> DeviceAllocationType: - # """ - # The device type where the buffer resides. 
- # - # Returns - # ------- - # DeviceAllocationType - # """ + @property + def device(self) -> Device: + """ + The device where the buffer resides. + + Returns + ------- + Device + """ + @property + def memory_manager(self) -> MemoryManager: + """ + The memory manager associated with the buffer. + + Returns + ------- + MemoryManager + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + The device type where the buffer resides. + + Returns + ------- + DeviceAllocationType + """ @property def parent(self) -> Buffer | None: ... def __getitem__(self, key: builtins.slice | int) -> Self | int: diff --git a/python/pyarrow-stubs/ipc.pyi b/python/pyarrow-stubs/ipc.pyi new file mode 100644 index 00000000000..c7f2af004d4 --- /dev/null +++ b/python/pyarrow-stubs/ipc.pyi @@ -0,0 +1,123 @@ +from io import IOBase + +import pandas as pd +import pyarrow.lib as lib + +from pyarrow.lib import ( + IpcReadOptions, + IpcWriteOptions, + Message, + MessageReader, + MetadataVersion, + ReadStats, + RecordBatchReader, + WriteStats, + _ReadPandasMixin, + get_record_batch_size, + get_tensor_size, + read_message, + read_record_batch, + read_schema, + read_tensor, + write_tensor, +) + +class RecordBatchStreamReader(lib._RecordBatchStreamReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + +class RecordBatchStreamWriter(lib._RecordBatchStreamWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + +class RecordBatchFileReader(lib._RecordBatchFileReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: IpcReadOptions | None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + +class RecordBatchFileWriter(lib._RecordBatchFileWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + +def new_stream( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, +) -> RecordBatchStreamWriter: ... +def open_stream( + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchStreamReader: ... +def new_file( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, +) -> RecordBatchFileWriter: ... +def open_file( + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchFileReader: ... +def serialize_pandas( + df: pd.DataFrame, *, nthreads: int | None = None, preserve_index: bool | None = None +) -> lib.Buffer: ... +def deserialize_pandas(buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ... 
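The reader/writer factories stubbed above mirror `pyarrow.ipc`; a small round-trip sketch through an in-memory sink (the record batch data is invented for illustration):

```python
import pyarrow as pa
import pyarrow.ipc as ipc

batch = pa.record_batch({"n_legs": [2, 4, 100]})

# Write an IPC stream to an in-memory sink.
sink = pa.BufferOutputStream()
with ipc.new_stream(sink, batch.schema) as writer:
    writer.write_batch(batch)
buf = sink.getvalue()  # pyarrow.Buffer

# Read it back; open_stream accepts bytes, Buffer, NativeFile or file-like objects.
with ipc.open_stream(buf) as reader:
    table = reader.read_all()

assert table.num_rows == 3
```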
+ +__all__ = [ + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "MetadataVersion", + "ReadStats", + "RecordBatchReader", + "WriteStats", + "_ReadPandasMixin", + "get_record_batch_size", + "get_tensor_size", + "read_message", + "read_record_batch", + "read_schema", + "read_tensor", + "write_tensor", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "new_stream", + "open_stream", + "new_file", + "open_file", + "serialize_pandas", + "deserialize_pandas", +] diff --git a/python/pyarrow-stubs/lib.pyi b/python/pyarrow-stubs/lib.pyi index 527f946b53a..c0a3cd08386 100644 --- a/python/pyarrow-stubs/lib.pyi +++ b/python/pyarrow-stubs/lib.pyi @@ -26,15 +26,15 @@ from .array import * # from .benchmark import * # from .builder import * # from .compat import * -# from .config import * -# from .device import * -# from .error import * +from .config import * +from .device import * +from .error import * from .io import * -# from .__lib_pxi.ipc import * +from ._ipc import * from .memory import * # from .pandas_shim import * from .scalar import * -# from .table import * +from .table import * from .tensor import * from ._types import * diff --git a/python/pyarrow-stubs/table.pyi b/python/pyarrow-stubs/table.pyi new file mode 100644 index 00000000000..685ae725d4b --- /dev/null +++ b/python/pyarrow-stubs/table.pyi @@ -0,0 +1,5154 @@ +import datetime as dt +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import ( + Any, + Collection, + Generator, + Generic, + Iterable, + Iterator, + Literal, + Mapping, + Sequence, + TypeVar, +) +import builtins + +import numpy as np +import pandas as pd + +from numpy.typing import NDArray +from pyarrow._compute import ( + CastOptions, + CountOptions, + FunctionOptions, + ScalarAggregateOptions, + TDigestOptions, + VarianceOptions, +) +from pyarrow._stubs_typing import ( + Indices, + Mask, + NullEncoding, + NullSelectionBehavior, + Order, + SupportArrowArray, + SupportArrowDeviceArray, + SupportArrowStream, +) +from pyarrow.compute import ArrayOrChunkedArray, Expression +from pyarrow.interchange.dataframe import _PyArrowDataFrame +from pyarrow.lib import Device, MemoryManager, MemoryPool, Schema +from pyarrow.lib import Field as _Field + +from .array import Array, StructArray, _CastAs, _PandasConvertible +from .device import DeviceAllocationType +from .io import Buffer +from ._ipc import RecordBatchReader +from .scalar import BooleanScalar, Int64Scalar, Scalar, StructScalar +from .tensor import Tensor +from ._stubs_typing import NullableCollection +from ._types import DataType, _AsPyType, _BasicDataType, _DataTypeT + +Field: TypeAlias = _Field[DataType] +_ScalarT = TypeVar("_ScalarT", bound=Scalar) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) + +_Aggregation: TypeAlias = Literal[ + "all", + "any", + "approximate_median", + "count", + "count_all", + "count_distinct", + "distinct", + "first", + "first_last", + "last", + "list", + "max", + "mean", + "min", + "min_max", + "one", + "product", + "stddev", + "sum", + "tdigest", + "variance", +] +_AggregationPrefixed: TypeAlias = Literal[ + "hash_all", + "hash_any", + "hash_approximate_median", + "hash_count", + "hash_count_all", + "hash_count_distinct", + "hash_distinct", + "hash_first", + "hash_first_last", + "hash_last", + "hash_list", 
+ "hash_max", + "hash_mean", + "hash_min", + "hash_min_max", + "hash_one", + "hash_product", + "hash_stddev", + "hash_sum", + "hash_tdigest", + "hash_variance", +] +Aggregation: TypeAlias = _Aggregation | _AggregationPrefixed +AggregateOptions: TypeAlias = ( + ScalarAggregateOptions | CountOptions | TDigestOptions | VarianceOptions | FunctionOptions +) + +UnarySelector: TypeAlias = str +NullarySelector: TypeAlias = tuple[()] +NarySelector: TypeAlias = list[str] | tuple[str, ...] +ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector + +class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + """ + An array-like composed from a (possibly empty) collection of pyarrow.Arrays + + Warnings + -------- + Do not call this class's constructor directly. + + Examples + -------- + To construct a ChunkedArray object use :func:`pyarrow.chunked_array`: + + >>> import pyarrow as pa + >>> pa.chunked_array([], type=pa.int8()) + + [ + ... + ] + + >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> isinstance(pa.chunked_array([[2, 2, 4], [4, 5, 100]]), pa.ChunkedArray) + True + """ + + @property + def data(self) -> Self: ... + @property + def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: + """ + Return data type of a ChunkedArray. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.type + DataType(int64) + """ + def length(self) -> int: + """ + Return length of a ChunkedArray. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.length() + 6 + """ + __len__ = length + def to_string( + self, + *, + indent: int = 0, + window: int = 5, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: + """ + Render a "pretty-printed" string representation of the ChunkedArray + + Parameters + ---------- + indent : int + How much to indent right the content of the array, + by default ``0``. + window : int + How many items to preview within each chunk at the begin and end + of the chunk when the chunk is bigger than the window. + The other elements will be ellipsed. + container_window : int + How many chunks to preview at the begin and end + of the array when the array is bigger than the window. + The other elements will be ellipsed. + This setting also applies to list columns. + skip_new_lines : bool + If the array should be rendered as a single line of text + or if each element should be on its own line. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_string(skip_new_lines=True) + '[[2,2,4],[4,5,100]]' + """ + format = to_string + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. 
+ + Raises + ------ + ArrowInvalid + """ + @property + def null_count(self) -> int: + """ + Number of null entries + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.null_count + 1 + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the chunked array. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.nbytes + 49 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the chunked array. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.get_total_buffer_size() + 49 + """ + def __sizeof__(self) -> int: ... + def __getitem__(self, key: int | builtins.slice) -> Self | _Scalar_co: + """ + Slice or return value at given index + + Parameters + ---------- + key : integer or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + value : Scalar (index) or ChunkedArray (slice) + """ + def getitem(self, i: int) -> Scalar: ... + def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[BooleanScalar]: + """ + Return boolean array indicating the null values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array or ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.is_null() + + [ + [ + false, + false, + false, + false, + true, + false + ] + ] + """ + def is_nan(self) -> ChunkedArray[BooleanScalar]: + """ + Return boolean array indicating the NaN values. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = pa.chunked_array([[2, np.nan, 4], [4, None, 100]]) + >>> arr.is_nan() + + [ + [ + false, + true, + false, + false, + null, + false + ] + ] + """ + def is_valid(self) -> ChunkedArray[BooleanScalar]: + """ + Return boolean array indicating the non-null values. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.is_valid() + + [ + [ + true, + true, + true + ], + [ + true, + false, + true + ] + ] + """ + def fill_null(self, fill_value: Scalar[_DataTypeT]) -> Self: + """ + Replace each null element in values with fill_value. + + See :func:`pyarrow.compute.fill_null` for full usage. + + Parameters + ---------- + fill_value : any + The replacement value for null entries. + + Returns + ------- + result : Array or ChunkedArray + A new array with nulls replaced by the given value. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> fill_value = pa.scalar(5, type=pa.int8()) + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.fill_null(fill_value) + + [ + [ + 2, + 2, + 4, + 4, + 5, + 100 + ] + ] + """ + def equals(self, other: Self) -> bool: + """ + Return whether the contents of two chunked arrays are equal. + + Parameters + ---------- + other : pyarrow.ChunkedArray + Chunked array to compare against. + + Returns + ------- + are_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) + ... ) + >>> n_legs.equals(n_legs) + True + >>> n_legs.equals(animals) + False + """ + def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: + """ + Return a NumPy copy of this array (experimental). + + Parameters + ---------- + zero_copy_only : bool, default False + Introduced for signature consistence with pyarrow.Array.to_numpy. + This must be False here since NumPy arrays' buffer must be contiguous. + + Returns + ------- + array : numpy.ndarray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_numpy() + array([ 2, 2, 4, 4, 5, 100]) + """ + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... + def cast( + self, + target_type: None | _CastAs = None, + safe: bool | None = None, + options: CastOptions | None = None, + ) -> Self | ChunkedArray[Scalar[_CastAs]]: + """ + Cast array values to another data type + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + cast : Array or ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.type + DataType(int64) + + Change the data type of an array: + + >>> n_legs_seconds = n_legs.cast(pa.duration("s")) + >>> n_legs_seconds.type + DurationType(duration[s]) + """ + def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: + """ + Compute dictionary-encoded representation of array. + + See :func:`pyarrow.compute.dictionary_encode` for full usage. + + Parameters + ---------- + null_encoding : str, default "mask" + How to handle null entries. + + Returns + ------- + encoded : ChunkedArray + A dictionary-encoded version of this array. + + Examples + -------- + >>> import pyarrow as pa + >>> animals = pa.chunked_array( + ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) + ... ) + >>> animals.dictionary_encode() + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 3, + 4, + 5 + ] + ] + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: + """ + Flatten this ChunkedArray. If it has a struct type, the column is + flattened into one array per struct field. 
+ + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : list of ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> c_arr = pa.chunked_array(n_legs.value_counts()) + >>> c_arr + + [ + -- is_valid: all not null + -- child 0 type: int64 + [ + 2, + 4, + 5, + 100 + ] + -- child 1 type: int64 + [ + 2, + 2, + 1, + 1 + ] + ] + >>> c_arr.flatten() + [ + [ + [ + 2, + 4, + 5, + 100 + ] + ], + [ + [ + 2, + 2, + 1, + 1 + ] + ]] + >>> c_arr.type + StructType(struct) + >>> n_legs.type + DataType(int64) + """ + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_Scalar_co]: + """ + Flatten this ChunkedArray into a single non-chunked array. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.combine_chunks() + + [ + 2, + 2, + 4, + 4, + 5, + 100 + ] + """ + def unique(self) -> ChunkedArray[_Scalar_co]: + """ + Compute distinct elements in array + + Returns + ------- + pyarrow.Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.unique() + + [ + 2, + 4, + 5, + 100 + ] + """ + def value_counts(self) -> StructArray: + """ + Compute counts of unique elements in array. + + Returns + ------- + An array of structs + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.value_counts() + + -- is_valid: all not null + -- child 0 type: int64 + [ + 2, + 4, + 5, + 100 + ] + -- child 1 type: int64 + [ + 2, + 2, + 1, + 1 + ] + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this ChunkedArray + + Parameters + ---------- + offset : int, default 0 + Offset from start of array to slice + length : int, default None + Length of slice (default is until end of batch starting from + offset) + + Returns + ------- + sliced : ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.slice(2, 2) + + [ + [ + 4 + ], + [ + 4 + ] + ] + """ + def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop") -> Self: + """ + Select values from the chunked array. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the chunked array with. + null_selection_behavior : str, default "drop" + How nulls in the mask should be handled. + + Returns + ------- + filtered : Array or ChunkedArray + An array of the same type, with only the elements selected by + the boolean mask. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> mask = pa.array([True, False, None, True, False, True]) + >>> n_legs.filter(mask) + + [ + [ + 2 + ], + [ + 4, + 100 + ] + ] + >>> n_legs.filter(mask, null_selection_behavior="emit_null") + + [ + [ + 2, + null + ], + [ + 4, + 100 + ] + ] + """ + def index( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + value: Scalar[_DataTypeT] | _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: + """ + Find the first index of a value. + + See :func:`pyarrow.compute.index` for full usage. + + Parameters + ---------- + value : Scalar or object + The value to look for in the array. + start : int, optional + The start index where to look for `value`. + end : int, optional + The end index where to look for `value`. + memory_pool : MemoryPool, optional + A memory pool for potential memory allocations. + + Returns + ------- + index : Int64Scalar + The index of the value in the array (-1 if not found). + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.index(4) + + >>> n_legs.index(4, start=3) + + """ + def take(self, indices: Indices) -> Self: + """ + Select values from the chunked array. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the array whose values will be returned. + + Returns + ------- + taken : Array or ChunkedArray + An array with the same datatype, containing the taken values. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.take([1, 4, 5]) + + [ + [ + 2, + 5, + 100 + ] + ] + """ + def drop_null(self) -> Self: + """ + Remove missing values from a chunked array. + See :func:`pyarrow.compute.drop_null` for full description. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.drop_null() + + [ + [ + 2, + 2 + ], + [ + 4, + 5, + 100 + ] + ] + """ + def sort(self, order: Order = "ascending", **kwargs) -> Self: + """ + Sort the ChunkedArray + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : ChunkedArray + """ + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Unify dictionaries across all chunks. + + This method returns an equivalent chunked array, but where all + chunks share the same dictionary values. Dictionary indices are + transposed accordingly. + + If there are no dictionaries in the chunked array, it is returned + unchanged. 
+ + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() + >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() + >>> c_arr = pa.chunked_array([arr_1, arr_2]) + >>> c_arr + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ] + ] + >>> c_arr.unify_dictionaries() + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 3, + 4, + 5 + ] + ] + """ + @property + def num_chunks(self) -> int: + """ + Number of underlying chunks. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs.num_chunks + 2 + """ + def chunk(self, i: int) -> ChunkedArray[_Scalar_co]: + """ + Select a chunk by its index. + + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs.chunk(1) + + [ + 4, + 5, + 100 + ] + """ + @property + def chunks(self) -> list[Array[_Scalar_co]]: + """ + Convert to a list of single-chunked arrays. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.chunks + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ]] + """ + def iterchunks( + self: ArrayOrChunkedArray[_ScalarT], + ) -> Generator[Array, None, None]: + """ + Convert to an iterator of ChunkArrays. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> for i in n_legs.iterchunks(): + ... print(i.null_count) + 0 + 1 + + """ + def __iter__(self) -> Iterator[_Scalar_co]: ... + def to_pylist( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[_AsPyType | None]: + """ + Convert to a list of native Python objects. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.to_pylist() + [2, 2, 4, 4, None, 100] + """ + def __arrow_c_stream__(self, requested_schema=None) -> Any: + """ + Export to a C ArrowArrayStream PyCapsule. 
+ + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: + """ + Import ChunkedArray from a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + stream: PyCapsule + A capsule containing a C ArrowArrayStream PyCapsule. + + Returns + ------- + ChunkedArray + """ + @property + def is_cpu(self) -> bool: + """ + Whether all chunks in the ChunkedArray are CPU-accessible. + """ + +def chunked_array( + arrays: Iterable[NullableCollection[Any]] | Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray] | Iterable[Array[_ScalarT]], + type: DataType | str | None = None, +) -> ChunkedArray[Scalar[Any]] | ChunkedArray[_ScalarT]: + """ + Construct chunked array from list of array-like objects + + Parameters + ---------- + arrays : Array, list of Array, or array-like + Must all be the same data type. Can be empty only if type also passed. + Any Arrow-compatible array that implements the Arrow PyCapsule Protocol + (has an ``__arrow_c_array__`` or ``__arrow_c_stream__`` method) can be + passed as well. + type : DataType or string coercible to DataType + + Returns + ------- + ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> pa.chunked_array([], type=pa.int8()) + + [ + ... + ] + + >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + """ + +_ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any]) + +class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: + """ + Return the dataframe interchange object implementing the interchange protocol. + + Parameters + ---------- + nan_as_null : bool, default False + Whether to tell the DataFrame to overwrite null values in the data + with ``NaN`` (or ``NaT``). + allow_copy : bool, default True + Whether to allow memory copying when exporting. If set to False + it would cause non-zero-copy exports to fail. + + Returns + ------- + DataFrame interchange object + The object which consuming library can use to ingress the dataframe. + + Notes + ----- + Details on the interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + """ + def __getitem__(self, key: int | str | slice) -> _ColumnT | Self: + """ + Slice or return column at given index or column name + + Parameters + ---------- + key : integer, str, or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + Array (from RecordBatch) or ChunkedArray (from Table) for column input. + RecordBatch or Table for slice input. + """ + def __len__(self) -> int: ... + def column(self, i: int | str) -> _ColumnT: + """ + Select single column from Table or RecordBatch. + + Parameters + ---------- + i : int or string + The index or name of the column to retrieve. 
+ + Returns + ------- + column : Array (for RecordBatch) or ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Select a column by numeric index: + + >>> table.column(0) + + [ + [ + 2, + 4, + 5, + 100 + ] + ] + + Select a column by its name: + + >>> table.column("animals") + + [ + [ + "Flamingo", + "Horse", + "Brittle stars", + "Centipede" + ] + ] + """ + @property + def column_names(self) -> list[str]: + """ + Names of the Table or RecordBatch columns. + + Returns + ------- + list of str + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=["n_legs", "animals"], + ... ) + >>> table.column_names + ['n_legs', 'animals'] + """ + @property + def columns(self) -> list[_ColumnT]: + """ + List of all columns in numerical order. + + Returns + ------- + columns : list of Array (for RecordBatch) or list of ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.columns + [ + [ + [ + null, + 4, + 5, + null + ] + ], + [ + [ + "Flamingo", + "Horse", + null, + "Centipede" + ] + ]] + """ + def drop_null(self) -> Self: + """ + Remove rows that contain missing values from a Table or RecordBatch. + + See :func:`pyarrow.compute.drop_null` for full usage. + + Returns + ------- + Table or RecordBatch + A tabular object with the same schema, with rows containing + no missing values. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [None, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", None, "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.drop_null() + pyarrow.Table + year: double + n_legs: int64 + animals: string + ---- + year: [[2022,2021]] + n_legs: [[4,100]] + animals: [["Horse","Centipede"]] + """ + def field(self, i: int | str) -> Field: + """ + Select a schema field by its column name or numeric index. + + Parameters + ---------- + i : int or string + The index or name of the field to retrieve. + + Returns + ------- + Field + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.field(0) + pyarrow.Field + >>> table.field(1) + pyarrow.Field + """ + @classmethod + def from_pydict( + cls, + mapping: Mapping[str, ArrayOrChunkedArray[Any] | list[Any] | np.ndarray], + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: + """ + Construct a Table or RecordBatch from Arrow arrays or columns. + + Parameters + ---------- + mapping : dict or Mapping + A mapping of strings to Arrays or Python lists. 
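# Editor's note: illustrative sketch of the column-access helpers documented
# above (column, column_names, columns, field, drop_null); not part of the stubs.
import pyarrow as pa

table = pa.table({
    "n_legs": [2, 4, None, 100],
    "animals": ["Flamingo", "Horse", None, "Centipede"],
})
table.column_names               # ['n_legs', 'animals']
table.column(0)                  # ChunkedArray for "n_legs"
table.column("animals")          # lookup by name works as well
table.field("n_legs").type       # schema-level Field, here int64
clean = table.drop_null()        # keeps only rows with no missing values
assert clean.num_rows == 3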
+ schema : Schema, default None + If not passed, will be inferred from the Mapping values. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). + + Returns + ------- + Table or RecordBatch + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> pydict = {"n_legs": n_legs, "animals": animals} + + Construct a Table from a dictionary of arrays: + + >>> pa.Table.from_pydict(pydict) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_pydict(pydict).schema + n_legs: int64 + animals: string + + Construct a Table from a dictionary of arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a dictionary of arrays with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.Table.from_pydict(pydict, schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + @classmethod + def from_pylist( + cls, + mapping: Sequence[Mapping[str, Any]], + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: + """ + Construct a Table or RecordBatch from list of rows / dictionaries. + + Parameters + ---------- + mapping : list of dicts of rows + A mapping of strings to row values. + schema : Schema, default None + If not passed, will be inferred from the first row of the + mapping values. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). + + Returns + ------- + Table or RecordBatch + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] + + Construct a Table from a list of rows: + + >>> pa.Table.from_pylist(pylist) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4]] + animals: [["Flamingo","Dog"]] + + Construct a Table from a list of rows with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pylist(pylist, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a list of rows with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.Table.from_pylist(pylist, schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + def itercolumns(self) -> Generator[_ColumnT, None, None]: + """ + Iterator over all columns in their numerical order. + + Yields + ------ + Array (for RecordBatch) or ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... 
{"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> for i in table.itercolumns(): + ... print(i.null_count) + 2 + 1 + """ + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def shape(self) -> tuple[int, int]: + """ + Dimensions of the table or record batch: (#rows, #columns). + + Returns + ------- + (int, int) + Number of rows and number of columns. + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table.shape + (4, 2) + """ + @property + def schema(self) -> Schema: ... + @property + def nbytes(self) -> int: ... + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: + """ + Sort the Table or RecordBatch by one or multiple columns. + + Parameters + ---------- + sorting : str or list[tuple(name, order)] + Name of the column to use to sort (ascending), or + a list of multiple sorting conditions where + each entry is a tuple with column name + and sorting order ("ascending" or "descending") + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + Table or RecordBatch + A new tabular object sorted according to the sort keys. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.sort_by("animal") + pyarrow.Table + year: int64 + n_legs: int64 + animal: string + ---- + year: [[2019,2021,2021,2020,2022,2022]] + n_legs: [[5,100,4,2,4,2]] + animal: [["Brittle stars","Centipede","Dog","Flamingo","Horse","Parrot"]] + """ + def take(self, indices: Indices) -> Self: + """ + Select rows from a Table or RecordBatch. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the tabular object whose rows will be returned. + + Returns + ------- + Table or RecordBatch + A tabular object with the same schema, containing the taken rows. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.take([1, 3]) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2022,2021]] + n_legs: [[4,100]] + animals: [["Horse","Centipede"]] + """ + def filter( + self, mask: Mask | Expression, null_selection_behavior: NullSelectionBehavior = "drop" + ) -> Self: + """ + Select rows from the table or record batch based on a boolean mask. + + The Table can be filtered based on a mask, which will be passed to + :func:`pyarrow.compute.filter` to perform the filtering, or it can + be filtered through a boolean :class:`.Expression` + + Parameters + ---------- + mask : Array or array-like or .Expression + The boolean mask or the :class:`.Expression` to filter the table with. 
+ null_selection_behavior : str, default "drop" + How nulls in the mask should be handled, does nothing if + an :class:`.Expression` is used. + + Returns + ------- + filtered : Table or RecordBatch + A tabular object of the same schema, with only the rows selected + by applied filtering + + Examples + -------- + Using a Table (works similarly for RecordBatch): + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Define an expression and select rows: + + >>> import pyarrow.compute as pc + >>> expr = pc.field("year") <= 2020 + >>> table.filter(expr) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2019]] + n_legs: [[2,5]] + animals: [["Flamingo","Brittle stars"]] + + Define a mask and select rows: + + >>> mask = [True, True, False, None] + >>> table.filter(mask) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022]] + n_legs: [[2,4]] + animals: [["Flamingo","Horse"]] + >>> table.filter(mask, null_selection_behavior="emit_null") + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,null]] + n_legs: [[2,4,null]] + animals: [["Flamingo","Horse",null]] + """ + def to_pydict( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> dict[str, list[Any]]: + """ + Convert the Table or RecordBatch to a dict or OrderedDict. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + + Returns + ------- + dict + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> table = pa.Table.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> table.to_pydict() + {'n_legs': [2, 2, 4, 4, 5, 100], 'animals': ['Flamingo', 'Parrot', ..., 'Centipede']} + """ + def to_pylist( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> list[dict[str, Any]]: + """ + Convert the Table or RecordBatch to a list of rows / dictionaries. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. 
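# Editor's note: sketch of Table.filter with an Expression and with a boolean
# mask (including null_selection_behavior), plus to_pydict; illustrative only.
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({
    "year": [2020, 2022, 2019, 2021],
    "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"],
})
table.filter(pc.field("year") >= 2021)                    # expression-based filter
mask = [True, False, None, True]
table.filter(mask)                                        # null mask entries dropped
table.filter(mask, null_selection_behavior="emit_null")   # null entries kept as null rows
table.to_pydict()                                         # {'year': [...], 'animals': [...]}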
+ + Returns + ------- + list + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> data = [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]] + >>> table = pa.table(data, names=["n_legs", "animals"]) + >>> table.to_pylist() + [{'n_legs': 2, 'animals': 'Flamingo'}, {'n_legs': 4, 'animals': 'Horse'}, ... + """ + def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: + """ + Return human-readable string representation of Table or RecordBatch. + + Parameters + ---------- + show_metadata : bool, default False + Display Field-level and Schema-level KeyValueMetadata. + preview_cols : int, default 0 + Display values of the columns for the first N columns. + + Returns + ------- + str + """ + def remove_column(self, i: int) -> Self: ... + def drop_columns(self, columns: str | list[str]) -> Self: + """ + Drop one or more columns and return a new Table or RecordBatch. + + Parameters + ---------- + columns : str or list[str] + Field name(s) referencing existing column(s). + + Raises + ------ + KeyError + If any of the passed column names do not exist. + + Returns + ------- + Table or RecordBatch + A tabular object without the column(s). + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Drop one column: + + >>> table.drop_columns("animals") + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,4,5,100]] + + Drop one or more columns: + + >>> table.drop_columns(["n_legs", "animals"]) + pyarrow.Table + ... + ---- + """ + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: ... + def append_column( + self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: + """ + Append column at end of columns. + + Parameters + ---------- + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + Table or RecordBatch + New table or record batch with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Append column at the end: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.append_column("year", [year]) + pyarrow.Table + n_legs: int64 + animals: string + year: int64 + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + year: [[2021,2022,2019,2021]] + """ + +class RecordBatch(_Tabular[Array]): + """ + Batch of rows of columns of equal length + + Warnings + -------- + Do not call this class's constructor directly, use one of the + ``RecordBatch.from_*`` functions instead. 
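# Editor's note: illustrative sketch of the column-mutation helpers described
# above (add_column, append_column, drop_columns); each call returns a new
# object and leaves the original unchanged.
import pyarrow as pa

table = pa.table({"n_legs": [2, 4, 5, 100],
                  "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"]})
year = [2021, 2022, 2019, 2021]
with_year = table.append_column("year", [year])   # list-of-chunks for a Table column
reordered = table.add_column(0, "year", [year])   # insert at position 0
trimmed = table.drop_columns(["animals"])         # drop one or more columns by name
assert table.column_names == ["n_legs", "animals"]  # original table untouched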
+ + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Constructing a RecordBatch from arrays: + + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Constructing a RecordBatch from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.RecordBatch.from_pandas(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_pandas(df).to_pandas() + year month day n_legs animals + 0 2020 3 1 2 Flamingo + 1 2022 5 5 4 Horse + 2 2021 7 9 5 Brittle stars + 3 2022 9 13 100 Centipede + + Constructing a RecordBatch from pylist: + + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] + >>> pa.RecordBatch.from_pylist(pylist).to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Dog + + You can also construct a RecordBatch using :func:`pyarrow.record_batch`: + + >>> pa.record_batch([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + >>> pa.record_batch(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + def replace_schema_metadata( + self, metadata: dict[str | bytes, str | bytes] | None = None + ) -> Self: + """ + Create shallow copy of record batch by replacing schema + key-value metadata with the indicated new metadata (which may be None, + which deletes any existing metadata + + Parameters + ---------- + metadata : dict, default None + + Returns + ------- + shallow_copy : RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + + Constructing a RecordBatch with schema and metadata: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64())], metadata={"n_legs": "Number of legs per animal"} + ... 
) + >>> batch = pa.RecordBatch.from_arrays([n_legs], schema=my_schema) + >>> batch.schema + n_legs: int64 + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Shallow copy of a RecordBatch with deleted schema metadata: + + >>> batch.replace_schema_metadata().schema + n_legs: int64 + """ + @property + def num_columns(self) -> int: + """ + Number of columns + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.num_columns + 2 + """ + + @property + def num_rows(self) -> int: + """ + Number of rows + + Due to the definition of a RecordBatch, all columns have the same + number of rows. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.num_rows + 6 + """ + @property + def schema(self) -> Schema: + """ + Schema of the RecordBatch and its columns + + Returns + ------- + pyarrow.Schema + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.schema + n_legs: int64 + animals: string + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the record batch. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.nbytes + 116 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the record batch + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.get_total_buffer_size() + 120 + """ + + def __sizeof__(self) -> int: ... + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: + """ + Add column to RecordBatch at position i. 
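# Editor's note: a sketch contrasting RecordBatch.nbytes (accounts for array
# offsets) with get_total_buffer_size (counts whole referenced buffers);
# illustrative only, exact byte counts depend on layout and pyarrow version.
import pyarrow as pa

n_legs = pa.array([2, 2, 4, 4, 5, 100])
animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"])
batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"])

batch.nbytes                     # bytes actually referenced by the batch
batch.get_total_buffer_size()    # may be larger: full size of every buffer touched

tail = batch.slice(offset=3)     # zero-copy view of the last 3 rows
tail.nbytes                      # shrinks with the slice
tail.get_total_buffer_size()     # still counts the shared buffers in full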
+ + A new record batch is returned with the column added, the original record batch + object is left unchanged. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + RecordBatch + New record batch with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + + Add column: + + >>> year = [2021, 2022, 2019, 2021] + >>> batch.add_column(0, "year", year) + pyarrow.RecordBatch + year: int64 + n_legs: int64 + animals: string + ---- + year: [2021,2022,2019,2021] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Original record batch is left unchanged: + + >>> batch + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + def remove_column(self, i: int) -> Self: + """ + Create new RecordBatch with the indicated column removed. + + Parameters + ---------- + i : int + Index of column to remove. + + Returns + ------- + Table + New record batch without the column. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch.remove_column(1) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,4,5,100] + """ + def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: + """ + Replace column in RecordBatch at position. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + RecordBatch + New record batch with the passed column set. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + + Replace a column: + + >>> year = [2021, 2022, 2019, 2021] + >>> batch.set_column(1, "year", year) + pyarrow.RecordBatch + n_legs: int64 + year: int64 + ---- + n_legs: [2,4,5,100] + year: [2021,2022,2019,2021] + """ + def rename_columns(self, names: list[str] | dict[str, str]) -> Self: + """ + Create new record batch with columns renamed to provided names. + + Parameters + ---------- + names : list[str] or dict[str, str] + List of new column names or mapping of old column names to new column names. + + If a mapping of old to new column names is passed, then all columns which are + found to match a provided old column name will be renamed to the new column name. + If any column names are not found in the mapping, a KeyError will be raised. + + Raises + ------ + KeyError + If any of the column names passed in the names mapping do not exist. + + Returns + ------- + RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... 
"n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> new_names = ["n", "name"] + >>> batch.rename_columns(new_names) + pyarrow.RecordBatch + n: int64 + name: string + ---- + n: [2,4,5,100] + name: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> new_names = {"n_legs": "n", "animals": "name"} + >>> batch.rename_columns(new_names) + pyarrow.RecordBatch + n: int64 + name: string + ---- + n: [2,4,5,100] + name: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write RecordBatch to Buffer as encapsulated IPC message, which does not + include a Schema. + + To reconstruct a RecordBatch from the encapsulated IPC message Buffer + returned by this function, a Schema must be passed separately. See + Examples. + + Parameters + ---------- + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> buf = batch.serialize() + >>> buf + + + Reconstruct RecordBatch from IPC message Buffer and original Schema + + >>> pa.ipc.read_record_batch(buf, batch.schema) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this RecordBatch + + Parameters + ---------- + offset : int, default 0 + Offset from start of record batch to slice + length : int, default None + Length of slice (default is until end of batch starting from + offset) + + Returns + ------- + sliced : RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + >>> batch.slice(offset=3).to_pandas() + n_legs animals + 0 4 Horse + 1 5 Brittle stars + 2 100 Centipede + >>> batch.slice(length=2).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + >>> batch.slice(offset=3, length=1).to_pandas() + n_legs animals + 0 4 Horse + """ + def equals(self, other: Self, check_metadata: bool = False) -> bool: + """ + Check if contents of two record batches are equal. + + Parameters + ---------- + other : pyarrow.RecordBatch + RecordBatch to compare against. + check_metadata : bool, default False + Whether schema metadata equality should be checked as well. + + Returns + ------- + are_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch_0 = pa.record_batch([]) + >>> batch_1 = pa.RecordBatch.from_arrays( + ... [n_legs, animals], + ... 
names=["n_legs", "animals"], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> batch.equals(batch) + True + >>> batch.equals(batch_0) + False + >>> batch.equals(batch_1) + True + >>> batch.equals(batch_1, check_metadata=True) + False + """ + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: + """ + Select columns of the RecordBatch. + + Returns a new RecordBatch with the specified columns, and metadata + preserved. + + Parameters + ---------- + columns : list-like + The column names or integer indices to select. + + Returns + ------- + RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"]) + + Select columns my indices: + + >>> batch.select([1]) + pyarrow.RecordBatch + animals: string + ---- + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + + Select columns by names: + + >>> batch.select(["n_legs"]) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,2,4,4,5,100] + """ + def cast( + self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None + ) -> Self: + """ + Cast record batch values to another schema. + + Parameters + ---------- + target_schema : Schema + Schema to cast to, the names and order of fields must match. + safe : bool, default True + Check for overflows or other unsafe conversions. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... + + Define new schema and cast batch values: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] + ... ) + >>> batch.cast(target_schema=my_schema) + pyarrow.RecordBatch + n_legs: duration[s] + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + @classmethod + def from_arrays( + cls, + arrays: Collection[Array], + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: + """ + Construct a RecordBatch from multiple pyarrow.Arrays + + Parameters + ---------- + arrays : list of pyarrow.Array + One for each field in RecordBatch + names : list of str, optional + Names for the batch fields. If not passed, schema must be passed + schema : Schema, default None + Schema for the created batch. If not passed, names must be passed + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). + + Returns + ------- + pyarrow.RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... 
) + >>> names = ["n_legs", "animals"] + + Construct a RecordBatch from pyarrow Arrays using names: + + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Construct a RecordBatch from pyarrow Arrays using schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: list[str] | None = None, + ) -> Self: + """ + Convert pandas.DataFrame to an Arrow RecordBatch + + Parameters + ---------- + df : pandas.DataFrame + schema : pyarrow.Schema, optional + The expected schema of the RecordBatch. This can be used to + indicate the type of columns if we cannot infer it automatically. + If passed, the output will have exactly this schema. Columns + specified in the schema that are not found in the DataFrame columns + or its index will raise an error. Additional columns or index + levels in the DataFrame which are not specified in the schema will + be ignored. + preserve_index : bool, optional + Whether to store the index as an additional column in the resulting + ``RecordBatch``. The default of None will store the index as a + column, except for RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + nthreads : int, default None + If greater than 1, convert columns to Arrow in parallel using + indicated number of threads. By default, this follows + :func:`pyarrow.cpu_count` (may use up to system CPU count threads). + columns : list, optional + List of column to be converted. If None, use all columns. + + Returns + ------- + pyarrow.RecordBatch + + + Examples + -------- + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Convert pandas DataFrame to RecordBatch: + + >>> import pyarrow as pa + >>> pa.RecordBatch.from_pandas(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Convert pandas DataFrame to RecordBatch using schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... 
) + >>> pa.RecordBatch.from_pandas(df, schema=my_schema) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Convert pandas DataFrame to RecordBatch specifying columns: + + >>> pa.RecordBatch.from_pandas(df, columns=["n_legs"]) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,4,5,100] + """ + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[StructScalar] + ) -> Self: + """ + Construct a RecordBatch from a StructArray. + + Each field in the StructArray will become a column in the resulting + ``RecordBatch``. + + Parameters + ---------- + struct_array : StructArray + Array to construct the record batch from. + + Returns + ------- + pyarrow.RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> pa.RecordBatch.from_struct_array(struct).to_pandas() + animals n_legs year + 0 Parrot 2 NaN + 1 None 4 2022.0 + """ + def to_struct_array(self) -> StructArray: + """ + Convert to a struct array. + """ + def to_tensor( + self, + null_to_nan: bool = False, + row_major: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Tensor: + """ + Convert to a :class:`~pyarrow.Tensor`. + + RecordBatches that can be converted have fields of type signed or unsigned + integer or float, including all bit-widths. + + ``null_to_nan`` is ``False`` by default and this method will raise an error in case + any nulls are present. RecordBatches with nulls can be converted with ``null_to_nan`` + set to ``True``. In this case null values are converted to ``NaN`` and integer type + arrays are promoted to the appropriate float type. + + Parameters + ---------- + null_to_nan : bool, default False + Whether to write null values in the result as ``NaN``. + row_major : bool, default True + Whether resulting Tensor is row-major or column-major + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Examples + -------- + >>> import pyarrow as pa + >>> batch = pa.record_batch( + ... [ + ... pa.array([1, 2, 3, 4, None], type=pa.int32()), + ... pa.array([10, 20, 30, 40, None], type=pa.float32()), + ... ], + ... names=["a", "b"], + ... ) + + >>> batch + pyarrow.RecordBatch + a: int32 + b: float + ---- + a: [1,2,3,4,null] + b: [10,20,30,40,null] + + Convert a RecordBatch to row-major Tensor with null values + written as ``NaN``s + + >>> batch.to_tensor(null_to_nan=True) + + type: double + shape: (5, 2) + strides: (16, 8) + >>> batch.to_tensor(null_to_nan=True).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + + Convert a RecordBatch to column-major Tensor + + >>> batch.to_tensor(null_to_nan=True, row_major=False) + + type: double + shape: (5, 2) + strides: (8, 40) + >>> batch.to_tensor(null_to_nan=True, row_major=False).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + """ + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): + """ + Export to a C ArrowArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the record batch + schema is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. 
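# Editor's note: illustrative sketch of RecordBatch.from_struct_array and
# to_tensor(null_to_nan=...) as documented above; to_tensor requires numeric
# columns only, and nulls are promoted to NaN in a floating-point tensor.
import pyarrow as pa

struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}])
from_struct = pa.RecordBatch.from_struct_array(struct)   # one column per struct field

numeric = pa.record_batch(
    [pa.array([1, 2, None], type=pa.int32()),
     pa.array([10.0, 20.0, None], type=pa.float32())],
    names=["a", "b"],
)
tensor = numeric.to_tensor(null_to_nan=True)   # nulls become NaN, ints promoted to float
matrix = tensor.to_numpy()                     # (3, 2) NumPy array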
+ + Be careful: if you don't pass the ArrowArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: + """ + Import RecordBatch from a C ArrowArray struct, given its pointer + and the imported schema. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArray struct. + type: Schema or int + Either a Schema object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_array__(self, requested_schema=None): + """ + Get a pair of PyCapsules containing a C ArrowArray representation of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the batch to this schema. + If None, the batch will be returned as-is, with a schema matching the + one returned by :meth:`__arrow_c_schema__()`. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowArray, + respectively. + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the batch as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. + + Returns + ------- + PyCapsule + """ + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: + """ + Import RecordBatch from a pair of PyCapsules containing a C ArrowSchema + and ArrowArray, respectively. + + Parameters + ---------- + schema_capsule : PyCapsule + A PyCapsule containing a C ArrowSchema representation of the schema. + array_capsule : PyCapsule + A PyCapsule containing a C ArrowArray representation of the array. + + Returns + ------- + pyarrow.RecordBatch + """ + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: + """ + Export to a C ArrowDeviceArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the record batch + schema is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: + """ + Import RecordBatch from a C ArrowDeviceArray struct, given its pointer + and the imported schema. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + type: Schema or int + Either a Schema object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_device_array__(self, requested_schema=None, **kwargs): + """ + Get a pair of PyCapsules containing a C ArrowDeviceArray representation + of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. 
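# Editor's note: sketch of the Arrow PyCapsule interface described above.
# __arrow_c_array__ is the public protocol method; _import_from_c_capsule is an
# internal helper declared in these stubs, used here only to illustrate the
# round trip. Not part of the generated stubs.
import pyarrow as pa

batch = pa.record_batch([pa.array([1, 2, 3])], names=["x"])
schema_capsule, array_capsule = batch.__arrow_c_array__()   # pair of PyCapsules
roundtripped = pa.RecordBatch._import_from_c_capsule(schema_capsule, array_capsule)
assert roundtripped.equals(batch)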
PyArrow will attempt to cast the batch to this data type. + If None, the batch will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + kwargs + Currently no additional keyword arguments are supported, but + this method will accept any keyword with a value of ``None`` + for compatibility with future keywords. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, + respectively. + """ + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: + """ + Import RecordBatch from a pair of PyCapsules containing a + C ArrowSchema and ArrowDeviceArray, respectively. + + Parameters + ---------- + schema_capsule : PyCapsule + A PyCapsule containing a C ArrowSchema representation of the schema. + array_capsule : PyCapsule + A PyCapsule containing a C ArrowDeviceArray representation of the array. + + Returns + ------- + pyarrow.RecordBatch + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + The device type where the arrays in the RecordBatch reside. + + Returns + ------- + DeviceAllocationType + """ + @property + def is_cpu(self) -> bool: + """ + Whether the RecordBatch's arrays are CPU-accessible. + """ + def copy_to(self, destination: MemoryManager | Device) -> Self: + """ + Copy the entire RecordBatch to destination device. + + This copies each column of the record batch to create + a new record batch where all underlying buffers for the columns have + been copied to the destination MemoryManager. + + Parameters + ---------- + destination : pyarrow.MemoryManager or pyarrow.Device + The destination device to copy the array to. + + Returns + ------- + RecordBatch + """ + +def table_to_blocks(options, table: Table, categories, extension_columns): ... + +JoinType: TypeAlias = Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", +] + +class Table(_Tabular[ChunkedArray[Any]]): + """ + A collection of top-level named, equal length Arrow arrays. + + Warnings + -------- + Do not call this class's constructor directly, use one of the ``from_*`` + methods instead. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from arrays: + + >>> pa.Table.from_arrays([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a RecordBatch: + + >>> batch = pa.record_batch([n_legs, animals], names=names) + >>> pa.Table.from_batches([batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> pa.Table.from_pandas(df) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a dictionary of arrays: + + >>> pydict = {"n_legs": n_legs, "animals": animals} + >>> pa.Table.from_pydict(pydict) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_pydict(pydict).schema + n_legs: int64 + animals: string + + Construct a Table from a dictionary of arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a list of rows: + + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"year": 2021, "animals": "Centipede"}] + >>> pa.Table.from_pylist(pylist) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,null]] + animals: [["Flamingo","Centipede"]] + + Construct a Table from a list of rows with pyarrow schema: + + >>> my_schema = pa.schema( + ... [ + ... pa.field("year", pa.int64()), + ... pa.field("n_legs", pa.int64()), + ... pa.field("animals", pa.string()), + ... ], + ... metadata={"year": "Year of entry"}, + ... ) + >>> pa.Table.from_pylist(pylist, schema=my_schema).schema + year: int64 + n_legs: int64 + animals: string + -- schema metadata -- + year: 'Year of entry' + + Construct a Table with :func:`pyarrow.table`: + + >>> pa.table([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this Table. + + Parameters + ---------- + offset : int, default 0 + Offset from start of table to slice. + length : int, default None + Length of slice (default is until end of table starting from + offset). + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + >>> table.slice(length=3) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019]] + n_legs: [[2,4,5]] + animals: [["Flamingo","Horse","Brittle stars"]] + >>> table.slice(offset=2) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2019,2021]] + n_legs: [[5,100]] + animals: [["Brittle stars","Centipede"]] + >>> table.slice(offset=2, length=1) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2019]] + n_legs: [[5]] + animals: [["Brittle stars"]] + """ + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: + """ + Select columns of the Table. + + Returns a new Table with the specified columns, and metadata + preserved. + + Parameters + ---------- + columns : list-like + The column names or integer indices to select. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.select([0, 1]) + pyarrow.Table + year: int64 + n_legs: int64 + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + >>> table.select(["year"]) + pyarrow.Table + year: int64 + ---- + year: [[2020,2022,2019,2021]] + """ + def replace_schema_metadata( + self, metadata: dict[str | bytes, str | bytes] | None = None + ) -> Self: + """ + Create shallow copy of table by replacing schema + key-value metadata with the indicated new metadata (which may be None), + which deletes any existing metadata. + + Parameters + ---------- + metadata : dict, default None + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Constructing a Table with pyarrow schema and metadata: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> table = pa.table(df, my_schema) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: ... + + Create a shallow copy of a Table with deleted schema metadata: + + >>> table.replace_schema_metadata().schema + n_legs: int64 + animals: string + + Create a shallow copy of a Table with new schema metadata: + + >>> metadata = {"animals": "Which animal"} + >>> table.replace_schema_metadata(metadata=metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + animals: 'Which animal' + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Flatten this Table. + + Each column with a struct type is flattened + into one column per struct field. Other columns are left unchanged. 
+ + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> month = pa.array([4, 6]) + >>> table = pa.Table.from_arrays([struct, month], names=["a", "month"]) + >>> table + pyarrow.Table + a: struct + child 0, animals: string + child 1, n_legs: int64 + child 2, year: int64 + month: int64 + ---- + a: [ + -- is_valid: all not null + -- child 0 type: string + ["Parrot",null] + -- child 1 type: int64 + [2,4] + -- child 2 type: int64 + [null,2022]] + month: [[4,6]] + + Flatten the columns with struct field: + + >>> table.flatten() + pyarrow.Table + a.animals: string + a.n_legs: int64 + a.year: int64 + month: int64 + ---- + a.animals: [["Parrot",null]] + a.n_legs: [[2,4]] + a.year: [[null,2022]] + month: [[4,6]] + """ + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Make a new table by combining the chunks this table has. + + All the underlying chunks in the ChunkedArray of each column are + concatenated into zero or one chunk. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] + ... ) + >>> names = ["n_legs", "animals"] + >>> table = pa.table([n_legs, animals], names=names) + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4],[4,5,100]] + animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] + >>> table.combine_chunks() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4,4,5,100]] + animals: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] + """ + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Unify dictionaries across all chunks. + + This method returns an equivalent table, but where all chunks of + each column share the same dictionary values. Dictionary indices + are transposed accordingly. + + Columns without dictionaries are returned unchanged. 
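# Editor's note: sketch of Table.combine_chunks and Table.unify_dictionaries as
# documented above; illustrative only.
import pyarrow as pa

animals = pa.chunked_array([
    pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode(),
    pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode(),
])
n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
table = pa.table({"animals": animals, "n_legs": n_legs})

unified = table.unify_dictionaries()     # chunks of "animals" now share one dictionary
compact = table.combine_chunks()         # at most one chunk per column
assert compact.column("n_legs").num_chunks == 1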
+ + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() + >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() + >>> c_arr = pa.chunked_array([arr_1, arr_2]) + >>> table = pa.table([c_arr], names=["animals"]) + >>> table + pyarrow.Table + animals: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Parrot","Dog"] -- indices: + [0,1,2], -- dictionary: + ["Horse","Brittle stars","Centipede"] -- indices: + [0,1,2]] + + Unify dictionaries across both chunks: + + >>> table.unify_dictionaries() + pyarrow.Table + animals: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: + [0,1,2], -- dictionary: + ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: + [3,4,5]] + """ + def equals(self, other: Self, check_metadata: bool = False) -> Self: + """ + Check if contents of two tables are equal. + + Parameters + ---------- + other : pyarrow.Table + Table to compare against. + check_metadata : bool, default False + Whether schema metadata equality should be checked as well. + + Returns + ------- + bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> names = ["n_legs", "animals"] + >>> table = pa.Table.from_arrays([n_legs, animals], names=names) + >>> table_0 = pa.Table.from_arrays([]) + >>> table_1 = pa.Table.from_arrays( + ... [n_legs, animals], names=names, metadata={"n_legs": "Number of legs per animal"} + ... ) + >>> table.equals(table) + True + >>> table.equals(table_0) + False + >>> table.equals(table_1) + True + >>> table.equals(table_1, check_metadata=True) + False + """ + def cast( + self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None + ) -> Self: + """ + Cast table values to another schema. + + Parameters + ---------- + target_schema : Schema + Schema to cast to, the names and order of fields must match. + safe : bool, default True + Check for overflows or other unsafe conversions. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... + + Define new schema and cast table values: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] + ... ) + >>> table.cast(target_schema=my_schema) + pyarrow.Table + n_legs: duration[s] + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: list[str] | None = None, + safe: bool = True, + ) -> Self: + """ + Convert pandas.DataFrame to an Arrow Table. 
+ + The column types in the resulting Arrow Table are inferred from the + dtypes of the pandas.Series in the DataFrame. In the case of non-object + Series, the NumPy dtype is translated to its Arrow equivalent. In the + case of `object`, we need to guess the datatype by looking at the + Python objects in this Series. + + Be aware that Series of the `object` dtype don't carry enough + information to always lead to a meaningful Arrow type. In the case that + we cannot infer a type, e.g. because the DataFrame is of length 0 or + the Series only contains None/nan objects, the type is set to + null. This behavior can be avoided by constructing an explicit schema + and passing it to this function. + + Parameters + ---------- + df : pandas.DataFrame + schema : pyarrow.Schema, optional + The expected schema of the Arrow Table. This can be used to + indicate the type of columns if we cannot infer it automatically. + If passed, the output will have exactly this schema. Columns + specified in the schema that are not found in the DataFrame columns + or its index will raise an error. Additional columns or index + levels in the DataFrame which are not specified in the schema will + be ignored. + preserve_index : bool, optional + Whether to store the index as an additional column in the resulting + ``Table``. The default of None will store the index as a column, + except for RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + nthreads : int, default None + If greater than 1, convert columns to Arrow in parallel using + indicated number of threads. By default, this follows + :func:`pyarrow.cpu_count` (may use up to system CPU count threads). + columns : list, optional + List of column to be converted. If None, use all columns. + safe : bool, default True + Check for overflows or other unsafe conversions. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.Table.from_pandas(df) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + @classmethod + def from_arrays( + cls, + arrays: Collection[ArrayOrChunkedArray[Any]], + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: + """ + Construct a Table from Arrow arrays. + + Parameters + ---------- + arrays : list of pyarrow.Array or pyarrow.ChunkedArray + Equal-length arrays that should form the table. + names : list of str, optional + Names for the table columns. If not passed, schema must be passed. + schema : Schema, default None + Schema for the created table. If not passed, names must be passed. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). 
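+
+        As a compact sketch (illustrative only), passing ``schema`` instead of
+        ``names`` fixes both the column names and the types::
+
+            import pyarrow as pa
+
+            schema = pa.schema([("n_legs", pa.int64()), ("animals", pa.string())])
+            t = pa.Table.from_arrays(
+                [pa.array([2, 4]), pa.array(["Parrot", "Dog"])], schema=schema
+            )
+            assert t.column_names == ["n_legs", "animals"]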
+ + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from arrays: + + >>> pa.Table.from_arrays([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from arrays with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"animals": "Name of the animal species"}, + ... ) + >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + animals: 'Name of the animal species' + """ + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[StructScalar] + ) -> Self: + """ + Construct a Table from a StructArray. + + Each field in the StructArray will become a column in the resulting + ``Table``. + + Parameters + ---------- + struct_array : StructArray or ChunkedArray + Array to construct the table from. + + Returns + ------- + pyarrow.Table + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> pa.Table.from_struct_array(struct).to_pandas() + animals n_legs year + 0 Parrot 2 NaN + 1 None 4 2022.0 + """ + def to_struct_array( + self, max_chunksize: int | None = None + ) -> ChunkedArray[StructScalar]: + """ + Convert to a chunked array of struct type. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for ChunkedArray chunks. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + ChunkedArray + """ + @classmethod + def from_batches(cls, batches: Iterable[RecordBatch], schema: Schema | None = None) -> Self: + """ + Construct a Table from a sequence or iterator of Arrow RecordBatches. + + Parameters + ---------- + batches : sequence or iterator of RecordBatch + Sequence of RecordBatch to be converted, all schemas must be equal. + schema : Schema, default None + If not passed, will be inferred from the first RecordBatch. 
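+
+        A small sketch (illustrative only) showing that a generator of batches
+        is accepted and that an explicit ``schema`` avoids relying on the first
+        batch for inference::
+
+            import pyarrow as pa
+
+            schema = pa.schema([("x", pa.int64())])
+
+            def gen():
+                yield pa.record_batch([pa.array([1, 2])], schema=schema)
+                yield pa.record_batch([pa.array([3])], schema=schema)
+
+            t = pa.Table.from_batches(gen(), schema=schema)
+            assert t.num_rows == 3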
+ + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + >>> batch = pa.record_batch([n_legs, animals], names=names) + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + + Construct a Table from a RecordBatch: + + >>> pa.Table.from_batches([batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a sequence of RecordBatches: + + >>> pa.Table.from_batches([batch, batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100],[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: + """ + Convert Table to a list of RecordBatch objects. + + Note that this method is zero-copy, it merely exposes the same data + under a different API. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for each RecordBatch chunk. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + list[RecordBatch] + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Convert a Table to a RecordBatch: + + >>> table.to_batches()[0].to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + + Convert a Table to a list of RecordBatches: + + >>> table.to_batches(max_chunksize=2)[0].to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + >>> table.to_batches(max_chunksize=2)[1].to_pandas() + n_legs animals + 0 5 Brittle stars + 1 100 Centipede + """ + def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: + """ + Convert the Table to a RecordBatchReader. + + Note that this method is zero-copy, it merely exposes the same data + under a different API. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for each RecordBatch chunk. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + RecordBatchReader + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Convert a Table to a RecordBatchReader: + + >>> table.to_reader() + + + >>> reader = table.to_reader() + >>> reader.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... + >>> reader.read_all() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + @property + def schema(self) -> Schema: + """ + Schema of the table and its columns. + + Returns + ------- + Schema + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... 
"animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' ... + """ + @property + def num_columns(self) -> int: + """ + Number of columns in this table. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.num_columns + 2 + """ + @property + def num_rows(self) -> int: + """ + Number of rows in this table. + + Due to the definition of a table, all columns have the same number of + rows. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.num_rows + 4 + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the table. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.nbytes + 72 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the table. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.get_total_buffer_size() + 76 + """ + def __sizeof__(self) -> int: ... + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: + """ + Add column to Table at position. + + A new table is returned with the column added, the original table + object is left unchanged. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array, list of Array, or values coercible to arrays + Column data. + + Returns + ------- + Table + New table with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + + Add column: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.add_column(0, "year", [year]) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2021,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Original table is left unchanged: + + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def remove_column(self, i: int) -> Self: + """ + Create new Table with the indicated column removed. + + Parameters + ---------- + i : int + Index of column to remove. + + Returns + ------- + Table + New table without the column. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.remove_column(1) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,4,5,100]] + """ + def set_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: + """ + Replace column in Table at position. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array, list of Array, or values coercible to arrays + Column data. + + Returns + ------- + Table + New table with the passed column set. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Replace a column: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.set_column(1, "year", [year]) + pyarrow.Table + n_legs: int64 + year: int64 + ---- + n_legs: [[2,4,5,100]] + year: [[2021,2022,2019,2021]] + """ + def rename_columns(self, names: list[str] | dict[str, str]) -> Self: + """ + Create new table with columns renamed to provided names. + + Parameters + ---------- + names : list[str] or dict[str, str] + List of new column names or mapping of old column names to new column names. + + If a mapping of old to new column names is passed, then all columns which are + found to match a provided old column name will be renamed to the new column name. + If any column names are not found in the mapping, a KeyError will be raised. + + Raises + ------ + KeyError + If any of the column names passed in the names mapping do not exist. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> new_names = ["n", "name"] + >>> table.rename_columns(new_names) + pyarrow.Table + n: int64 + name: string + ---- + n: [[2,4,5,100]] + name: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> new_names = {"n_legs": "n", "animals": "name"} + >>> table.rename_columns(new_names) + pyarrow.Table + n: int64 + name: string + ---- + n: [[2,4,5,100]] + name: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def drop(self, columns: str | list[str]) -> Self: + """ + Drop one or more columns and return a new table. 
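+
+        A one-line sketch (illustrative only)::
+
+            import pyarrow as pa
+
+            t = pa.table({"n_legs": [2, 4], "animals": ["Parrot", "Dog"]})
+            assert t.drop(["animals"]).column_names == ["n_legs"]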
+ + Alias of Table.drop_columns, but kept for backwards compatibility. + + Parameters + ---------- + columns : str or list[str] + Field name(s) referencing existing column(s). + + Returns + ------- + Table + New table without the column(s). + """ + def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: + """ + Declare a grouping over the columns of the table. + + Resulting grouping can then be used to perform aggregations + with a subsequent ``aggregate()`` method. + + Parameters + ---------- + keys : str or list[str] + Name of the columns that should be used as the grouping key. + use_threads : bool, default True + Whether to use multithreading or not. When set to True (the + default), no stable ordering of the output is guaranteed. + + Returns + ------- + TableGroupBy + + See Also + -------- + TableGroupBy.aggregate + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.group_by("year").aggregate([("n_legs", "sum")]) + pyarrow.Table + year: int64 + n_legs_sum: int64 + ---- + year: [[2020,2022,2021,2019]] + n_legs_sum: [[2,6,104,5]] + """ + def join( + self, + right_table: Self, + keys: str | list[str], + right_keys: str | list[str] | None = None, + join_type: JoinType = "left outer", + left_suffix: str | None = None, + right_suffix: str | None = None, + coalesce_keys: bool = True, + use_threads: bool = True, + ) -> Self: + """ + Perform a join between this table and another one. + + Result of the join will be a new Table, where further + operations can be applied. + + Parameters + ---------- + right_table : Table + The table to join to the current one, acting as the right table + in the join operation. + keys : str or list[str] + The columns from current table that should be used as keys + of the join operation left side. + right_keys : str or list[str], default None + The columns from the right_table that should be used as keys + on the join operation right side. + When ``None`` use the same key names as the left table. + join_type : str, default "left outer" + The kind of join that should be performed, one of + ("left semi", "right semi", "left anti", "right anti", + "inner", "left outer", "right outer", "full outer") + left_suffix : str, default None + Which suffix to add to left column names. This prevents confusion + when the columns in left and right tables have colliding names. + right_suffix : str, default None + Which suffix to add to the right column names. This prevents confusion + when the columns in left and right tables have colliding names. + coalesce_keys : bool, default True + If the duplicated keys should be omitted from one of the sides + in the join result. + use_threads : bool, default True + Whether to use multithreading or not. + + Returns + ------- + Table + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df1 = pd.DataFrame({"id": [1, 2, 3], "year": [2020, 2022, 2019]}) + >>> df2 = pd.DataFrame( + ... {"id": [3, 4], "n_legs": [5, 100], "animal": ["Brittle stars", "Centipede"]} + ... 
) + >>> t1 = pa.Table.from_pandas(df1) + >>> t2 = pa.Table.from_pandas(df2) + + Left outer join: + + >>> t1.join(t2, "id").combine_chunks().sort_by("year") + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[3,1,2]] + year: [[2019,2020,2022]] + n_legs: [[5,null,null]] + animal: [["Brittle stars",null,null]] + + Full outer join: + + >>> t1.join(t2, "id", join_type="full outer").combine_chunks().sort_by("year") + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[3,1,2,4]] + year: [[2019,2020,2022,null]] + n_legs: [[5,null,null,100]] + animal: [["Brittle stars",null,null,"Centipede"]] + + Right outer join: + + >>> t1.join(t2, "id", join_type="right outer").combine_chunks().sort_by("year") + pyarrow.Table + year: int64 + id: int64 + n_legs: int64 + animal: string + ---- + year: [[2019,null]] + id: [[3,4]] + n_legs: [[5,100]] + animal: [["Brittle stars","Centipede"]] + + Right anti join + + >>> t1.join(t2, "id", join_type="right anti") + pyarrow.Table + id: int64 + n_legs: int64 + animal: string + ---- + id: [[4]] + n_legs: [[100]] + animal: [["Centipede"]] + """ + def join_asof( + self, + right_table: Self, + on: str, + by: str | list[str], + tolerance: int, + right_on: str | list[str] | None = None, + right_by: str | list[str] | None = None, + ) -> Self: + """ + Perform an asof join between this table and another one. + + This is similar to a left-join except that we match on nearest key rather + than equal keys. Both tables must be sorted by the key. This type of join + is most useful for time series data that are not perfectly aligned. + + Optionally match on equivalent keys with "by" before searching with "on". + + Result of the join will be a new Table, where further + operations can be applied. + + Parameters + ---------- + right_table : Table + The table to join to the current one, acting as the right table + in the join operation. + on : str + The column from current table that should be used as the "on" key + of the join operation left side. + + An inexact match is used on the "on" key, i.e. a row is considered a + match if and only if left_on - tolerance <= right_on <= left_on. + + The input dataset must be sorted by the "on" key. Must be a single + field of a common type. + + Currently, the "on" key must be an integer, date, or timestamp type. + by : str or list[str] + The columns from current table that should be used as the keys + of the join operation left side. The join operation is then done + only for the matches in these columns. + tolerance : int + The tolerance for inexact "on" key matching. A right row is considered + a match with the left row ``right.on - left.on <= tolerance``. The + ``tolerance`` may be: + + - negative, in which case a past-as-of-join occurs; + - or positive, in which case a future-as-of-join occurs; + - or zero, in which case an exact-as-of-join occurs. + + The tolerance is interpreted in the same units as the "on" key. + right_on : str or list[str], default None + The columns from the right_table that should be used as the on key + on the join operation right side. + When ``None`` use the same key name as the left table. + right_by : str or list[str], default None + The columns from the right_table that should be used as keys + on the join operation right side. + When ``None`` use the same key names as the left table. 
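+
+        A minimal sketch (illustrative only; it only asserts the left-join-like
+        shape, i.e. one output row per left row)::
+
+            import pyarrow as pa
+
+            left = pa.table({"id": [1, 1], "ts": [1, 5]})
+            right = pa.table({"id": [1], "ts": [4], "v": [10]})
+            joined = left.join_asof(right, on="ts", by="id", tolerance=-2)
+            assert joined.num_rows == left.num_rows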
+ + Returns + ------- + Table + + Example + -------- + >>> import pyarrow as pa + >>> t1 = pa.table({"id": [1, 3, 2, 3, 3], "year": [2020, 2021, 2022, 2022, 2023]}) + >>> t2 = pa.table( + ... { + ... "id": [3, 4], + ... "year": [2020, 2021], + ... "n_legs": [5, 100], + ... "animal": ["Brittle stars", "Centipede"], + ... } + ... ) + + >>> t1.join_asof(t2, on="year", by="id", tolerance=-2) + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[1,3,2,3,3]] + year: [[2020,2021,2022,2022,2023]] + n_legs: [[null,5,null,5,null]] + animal: [[null,"Brittle stars",null,"Brittle stars",null]] + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the table as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. + + Returns + ------- + PyCapsule + """ + @property + def is_cpu(self) -> bool: + """ + Whether all ChunkedArrays are CPU-accessible. + """ + +def record_batch( + data: dict[str, list[Any] | Array[Any]] + | Collection[Array[Any]] + | pd.DataFrame + | SupportArrowArray + | SupportArrowDeviceArray, + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, +) -> RecordBatch: + """ + Create a pyarrow.RecordBatch from another Python data structure or sequence + of arrays. + + Parameters + ---------- + data : dict, list, pandas.DataFrame, Arrow-compatible table + A mapping of strings to Arrays or Python lists, a list of Arrays, + a pandas DataFame, or any tabular object implementing the + Arrow PyCapsule Protocol (has an ``__arrow_c_array__`` or + ``__arrow_c_device_array__`` method). + names : list, default None + Column names if list of arrays passed as data. Mutually exclusive with + 'schema' argument. + schema : Schema, default None + The expected schema of the RecordBatch. If not passed, will be inferred + from the data. Mutually exclusive with 'names' argument. + metadata : dict or Mapping, default None + Optional metadata for the schema (if schema not passed). 
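+
+    A small sketch (illustrative only) of the dict form with an inferred schema
+    plus user metadata::
+
+        import pyarrow as pa
+
+        batch = pa.record_batch({"x": [1, 2, 3]}, metadata={"source": "sketch"})
+        assert batch.num_rows == 3
+        assert batch.schema.metadata == {b"source": b"sketch"}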
+ + Returns + ------- + RecordBatch + + See Also + -------- + RecordBatch.from_arrays, RecordBatch.from_pandas, table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a RecordBatch from a python dictionary: + + >>> pa.record_batch({"n_legs": n_legs, "animals": animals}) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.record_batch({"n_legs": n_legs, "animals": animals}).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Creating a RecordBatch from a list of arrays with names: + + >>> pa.record_batch([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + + Creating a RecordBatch from a list of arrays with names and metadata: + + >>> my_metadata = {"n_legs": "How many legs does an animal have?"} + >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'How many legs does an animal have?' + + Creating a RecordBatch from a pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.record_batch(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + >>> pa.record_batch(df).to_pandas() + year month day n_legs animals + 0 2020 3 1 2 Flamingo + 1 2022 5 5 4 Horse + 2 2021 7 9 5 Brittle stars + 3 2022 9 13 100 Centipede + + Creating a RecordBatch from a pandas DataFrame with schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.record_batch(df, my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: ... + >>> pa.record_batch(df, my_schema).to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + """ + +def table( + data: dict[str, list[Any] | Array[Any]] + | Collection[ArrayOrChunkedArray[Any]] + | pd.DataFrame + | SupportArrowArray + | SupportArrowStream + | SupportArrowDeviceArray, + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + nthreads: int | None = None, +) -> Table: + """ + Create a pyarrow.Table from a Python data structure or sequence of arrays. 
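+
+    A short sketch (illustrative only) of the behaviour described under
+    ``schema`` below: for dict input, keys not named in the schema are ignored::
+
+        import pyarrow as pa
+
+        schema = pa.schema([("n_legs", pa.int64())])
+        t = pa.table({"n_legs": [2, 4], "extra": [0, 1]}, schema=schema)
+        assert t.column_names == ["n_legs"]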
+ + Parameters + ---------- + data : dict, list, pandas.DataFrame, Arrow-compatible table + A mapping of strings to Arrays or Python lists, a list of arrays or + chunked arrays, a pandas DataFame, or any tabular object implementing + the Arrow PyCapsule Protocol (has an ``__arrow_c_array__``, + ``__arrow_c_device_array__`` or ``__arrow_c_stream__`` method). + names : list, default None + Column names if list of arrays passed as data. Mutually exclusive with + 'schema' argument. + schema : Schema, default None + The expected schema of the Arrow Table. If not passed, will be inferred + from the data. Mutually exclusive with 'names' argument. + If passed, the output will have exactly this schema (raising an error + when columns are not found in the data and ignoring additional data not + specified in the schema, when data is a dict or DataFrame). + metadata : dict or Mapping, default None + Optional metadata for the schema (if schema not passed). + nthreads : int, default None + For pandas.DataFrame inputs: if greater than 1, convert columns to + Arrow in parallel using indicated number of threads. By default, + this follows :func:`pyarrow.cpu_count` (may use up to system CPU count + threads). + + Returns + ------- + Table + + See Also + -------- + Table.from_arrays, Table.from_pandas, Table.from_pydict + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from a python dictionary: + + >>> pa.table({"n_legs": n_legs, "animals": animals}) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays: + + >>> pa.table([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.table([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.table(df) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from pandas DataFrame with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.table(df, my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: '{"index_columns": [], "column_indexes": [{"name": null, ... + + Construct a Table from chunked arrays: + + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] + ... 
) + >>> table = pa.table([n_legs, animals], names=names) + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4],[4,5,100]] + animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] + """ + +def concat_tables( + tables: Iterable[Table], + memory_pool: MemoryPool | None = None, + promote_options: Literal["none", "default", "permissive"] = "none", + **kwargs: Any, +) -> Table: + """ + Concatenate pyarrow.Table objects. + + If promote_options="none", a zero-copy concatenation will be performed. The schemas + of all the Tables must be the same (except the metadata), otherwise an + exception will be raised. The result Table will share the metadata with the + first table. + + If promote_options="default", any null type arrays will be casted to the type of other + arrays in the column of the same name. If a table is missing a particular + field, null values of the appropriate type will be generated to take the + place of the missing field. The new schema will share the metadata with the + first table. Each field in the new schema will share the metadata with the + first table which has the field defined. Note that type promotions may + involve additional allocations on the given ``memory_pool``. + + If promote_options="permissive", the behavior of default plus types will be promoted + to the common denominator that fits all the fields. + + Parameters + ---------- + tables : iterable of pyarrow.Table objects + Pyarrow tables to concatenate into a single Table. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + promote_options : str, default none + Accepts strings "none", "default" and "permissive". + **kwargs : dict, optional + + Examples + -------- + >>> import pyarrow as pa + >>> t1 = pa.table( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... ) + >>> t2 = pa.table([pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"]) + >>> pa.concat_tables([t1, t2]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100],[2,4]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Parrot","Dog"]] + + """ + +class TableGroupBy: + """ + A grouping of columns in a table on which to perform aggregations. + + Parameters + ---------- + table : pyarrow.Table + Input table to execute the aggregation on. + keys : str or list[str] + Name of the grouped columns. + use_threads : bool, default True + Whether to use multithreading or not. When set to True (the default), + no stable ordering of the output is guaranteed. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.table( + ... [ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], + ... names=["keys", "values"], + ... ) + + Grouping of columns: + + >>> pa.TableGroupBy(t, "keys") + + + Perform aggregations: + + >>> pa.TableGroupBy(t, "keys").aggregate([("values", "sum")]) + pyarrow.Table + keys: string + values_sum: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + """ + + keys: str | list[str] + def __init__(self, table: Table, keys: str | list[str], use_threads: bool = True): ... + def aggregate( + self, + aggregations: Iterable[ + tuple[ColumnSelector, Aggregation] + | tuple[ColumnSelector, Aggregation, AggregateOptions | None] + ], + ) -> Table: + """ + Perform an aggregation over the grouped columns of the table. 
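+
+        A minimal sketch (illustrative only; ``use_threads=False`` is passed at
+        ``group_by`` time when a stable output order matters)::
+
+            import pyarrow as pa
+
+            t = pa.table({"keys": ["a", "b", "a"], "values": [1, 2, 3]})
+            agg = t.group_by("keys", use_threads=False).aggregate([("values", "sum")])
+            assert agg.num_rows == 2
+            assert "values_sum" in agg.column_names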
+ + Parameters + ---------- + aggregations : list[tuple(str, str)] or \ +list[tuple(str, str, FunctionOptions)] + List of tuples, where each tuple is one aggregation specification + and consists of: aggregation column name followed + by function name and optionally aggregation function option. + Pass empty list to get a single row for each group. + The column name can be a string, an empty list or a list of + column names, for unary, nullary and n-ary aggregation functions + respectively. + + For the list of function names and respective aggregation + function options see :ref:`py-grouped-aggrs`. + + Returns + ------- + Table + Results of the aggregation functions. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.table([ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], names=["keys", "values"]) + + Sum the column "values" over the grouped column "keys": + + >>> t.group_by("keys").aggregate([("values", "sum")]) + pyarrow.Table + keys: string + values_sum: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + + Count the rows over the grouped column "keys": + + >>> t.group_by("keys").aggregate([([], "count_all")]) + pyarrow.Table + keys: string + count_all: int64 + ---- + keys: [["a","b","c"]] + count_all: [[2,2,1]] + + Do multiple aggregations: + + >>> t.group_by("keys").aggregate([ + ... ("values", "sum"), + ... ("keys", "count") + ... ]) + pyarrow.Table + keys: string + values_sum: int64 + keys_count: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + keys_count: [[2,2,1]] + + Count the number of non-null values for column "values" + over the grouped column "keys": + + >>> import pyarrow.compute as pc + >>> t.group_by(["keys"]).aggregate([ + ... ("values", "count", pc.CountOptions(mode="only_valid")) + ... ]) + pyarrow.Table + keys: string + values_count: int64 + ---- + keys: [["a","b","c"]] + values_count: [[2,2,1]] + + Get a single row for each group in column "keys": + + >>> t.group_by("keys").aggregate([]) + pyarrow.Table + keys: string + ---- + keys: [["a","b","c"]] + """ + def _table(self) -> Table: ... + @property + def _use_threads(self) -> bool: ... + +def concat_batches( + recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None +) -> RecordBatch: + """ + Concatenate pyarrow.RecordBatch objects. + + All recordbatches must share the same Schema, + the operation implies a copy of the data to merge + the arrays of the different RecordBatches. + + Parameters + ---------- + recordbatches : iterable of pyarrow.RecordBatch objects + Pyarrow record batches to concatenate into a single RecordBatch. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Examples + -------- + >>> import pyarrow as pa + >>> t1 = pa.record_batch( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... ) + >>> t2 = pa.record_batch( + ... [pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"] + ... 
) + >>> pa.concat_batches([t1, t2]) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100,2,4] + animals: ["Flamingo","Horse","Brittle stars","Centipede","Parrot","Dog"] + + """ + +__all__ = [ + "ChunkedArray", + "chunked_array", + "_Tabular", + "RecordBatch", + "table_to_blocks", + "Table", + "record_batch", + "table", + "concat_tables", + "TableGroupBy", + "concat_batches", +] diff --git a/python/pyarrow-stubs/util.pyi b/python/pyarrow-stubs/util.pyi new file mode 100644 index 00000000000..c2ecf7d6b61 --- /dev/null +++ b/python/pyarrow-stubs/util.pyi @@ -0,0 +1,27 @@ +from collections.abc import Callable +from os import PathLike +from typing import Any, Protocol, Sequence, TypeVar + +_F = TypeVar("_F", bound=Callable) +_N = TypeVar("_N") + +class _DocStringComponents(Protocol): + _docstring_components: list[str] + +def doc( + *docstrings: str | _DocStringComponents | Callable | None, **params: Any +) -> Callable[[_F], _F]: ... +def _is_iterable(obj) -> bool: ... +def _is_path_like(path) -> bool: ... +def _stringify_path(path: str | PathLike) -> str: ... +def product(seq: Sequence[_N]) -> _N: ... +def get_contiguous_span( + shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int +) -> tuple[int, int]: ... +def find_free_port() -> int: ... +def guid() -> str: ... +def _download_urllib(url, out_path) -> None: ... +def _download_requests(url, out_path) -> None: ... +def download_tzdata_on_windows() -> None: ... +def _deprecate_api(old_name, new_name, api, next_version, type=...): ... +def _deprecate_class(old_name, new_class, next_version, instancecheck=True): ... diff --git a/python/pyarrow/ipc.py b/python/pyarrow/_ipc.py similarity index 100% rename from python/pyarrow/ipc.py rename to python/pyarrow/_ipc.py From a50d6f8bde3b3c783c349ff5ad74d890d82d883e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 16 Sep 2025 02:57:46 +0200 Subject: [PATCH 14/26] Remaining stubs --- python/pyarrow-stubs/__init__.pyi | 2 - python/pyarrow-stubs/_azurefs.pyi | 91 + python/pyarrow-stubs/_benchmark.pyi | 3 + python/pyarrow-stubs/_csv.pyi | 556 ++ python/pyarrow-stubs/_cuda.pyi | 557 ++ python/pyarrow-stubs/_dataset.pyi | 4582 ++++++++--------- python/pyarrow-stubs/_dataset_orc.pyi | 6 + python/pyarrow-stubs/_dataset_parquet.pyi | 314 ++ .../_dataset_parquet_encryption.pyi | 85 + python/pyarrow-stubs/_feather.pyi | 29 + python/pyarrow-stubs/_flight.pyi | 1380 +++++ python/pyarrow-stubs/_fs.pyi | 1001 ++++ python/pyarrow-stubs/_gcsfs.pyi | 83 + python/pyarrow-stubs/_hdfs.pyi | 75 + python/pyarrow-stubs/_json.pyi | 169 + python/pyarrow-stubs/_orc.pyi | 56 + python/pyarrow-stubs/_parquet.pyi | 445 ++ python/pyarrow-stubs/_parquet_encryption.pyi | 67 + python/pyarrow-stubs/_s3fs.pyi | 75 + python/pyarrow-stubs/_substrait.pyi | 39 + python/pyarrow-stubs/acero.pyi | 85 + python/pyarrow-stubs/builder.pyi | 89 + python/pyarrow-stubs/cffi.pyi | 4 + python/pyarrow-stubs/compat.pyi | 22 + python/pyarrow-stubs/csv.pyi | 27 + python/pyarrow-stubs/cuda.pyi | 25 + python/pyarrow-stubs/dataset.pyi | 458 +- python/pyarrow-stubs/feather.pyi | 50 + python/pyarrow-stubs/flight.pyi | 95 + python/pyarrow-stubs/fs.pyi | 77 + python/pyarrow-stubs/gandiva.pyi | 65 + python/pyarrow-stubs/json.pyi | 3 + python/pyarrow-stubs/lib.pyi | 9 +- python/pyarrow-stubs/orc.pyi | 279 + python/pyarrow-stubs/pandas_compat.pyi | 54 + python/pyarrow-stubs/pandas_shim.pyi | 51 + python/pyarrow-stubs/parquet/__init__.pyi | 1 + python/pyarrow-stubs/parquet/core.pyi | 2061 ++++++++ 
python/pyarrow-stubs/parquet/encryption.pyi | 15 + python/pyarrow-stubs/substrait.pyi | 21 + 40 files changed, 10570 insertions(+), 2536 deletions(-) create mode 100644 python/pyarrow-stubs/_azurefs.pyi create mode 100644 python/pyarrow-stubs/_benchmark.pyi create mode 100644 python/pyarrow-stubs/_csv.pyi create mode 100644 python/pyarrow-stubs/_cuda.pyi create mode 100644 python/pyarrow-stubs/_dataset_orc.pyi create mode 100644 python/pyarrow-stubs/_dataset_parquet.pyi create mode 100644 python/pyarrow-stubs/_dataset_parquet_encryption.pyi create mode 100644 python/pyarrow-stubs/_feather.pyi create mode 100644 python/pyarrow-stubs/_flight.pyi create mode 100644 python/pyarrow-stubs/_fs.pyi create mode 100644 python/pyarrow-stubs/_gcsfs.pyi create mode 100644 python/pyarrow-stubs/_hdfs.pyi create mode 100644 python/pyarrow-stubs/_json.pyi create mode 100644 python/pyarrow-stubs/_orc.pyi create mode 100644 python/pyarrow-stubs/_parquet.pyi create mode 100644 python/pyarrow-stubs/_parquet_encryption.pyi create mode 100644 python/pyarrow-stubs/_s3fs.pyi create mode 100644 python/pyarrow-stubs/_substrait.pyi create mode 100644 python/pyarrow-stubs/acero.pyi create mode 100644 python/pyarrow-stubs/builder.pyi create mode 100644 python/pyarrow-stubs/cffi.pyi create mode 100644 python/pyarrow-stubs/compat.pyi create mode 100644 python/pyarrow-stubs/csv.pyi create mode 100644 python/pyarrow-stubs/cuda.pyi create mode 100644 python/pyarrow-stubs/feather.pyi create mode 100644 python/pyarrow-stubs/flight.pyi create mode 100644 python/pyarrow-stubs/fs.pyi create mode 100644 python/pyarrow-stubs/gandiva.pyi create mode 100644 python/pyarrow-stubs/json.pyi create mode 100644 python/pyarrow-stubs/orc.pyi create mode 100644 python/pyarrow-stubs/pandas_compat.pyi create mode 100644 python/pyarrow-stubs/pandas_shim.pyi create mode 100644 python/pyarrow-stubs/parquet/__init__.pyi create mode 100644 python/pyarrow-stubs/parquet/core.pyi create mode 100644 python/pyarrow-stubs/parquet/encryption.pyi create mode 100644 python/pyarrow-stubs/substrait.pyi diff --git a/python/pyarrow-stubs/__init__.pyi b/python/pyarrow-stubs/__init__.pyi index 3f5e3073fd8..d74b486fd55 100644 --- a/python/pyarrow-stubs/__init__.pyi +++ b/python/pyarrow-stubs/__init__.pyi @@ -336,8 +336,6 @@ from pyarrow.lib import ( ) from .ipc import serialize_pandas, deserialize_pandas -# TODO? -# import _ipc as ipc import types as types diff --git a/python/pyarrow-stubs/_azurefs.pyi b/python/pyarrow-stubs/_azurefs.pyi new file mode 100644 index 00000000000..b9a83f01c56 --- /dev/null +++ b/python/pyarrow-stubs/_azurefs.pyi @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+
+from typing import Literal
+
+from ._fs import FileSystem
+
+class AzureFileSystem(FileSystem):
+    """
+    Azure Blob Storage backed FileSystem implementation
+
+    This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a.
+    Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific
+    features will be used when they provide a performance advantage. Azurite emulator is
+    also supported. Note: `/` is the only supported delimiter.
+
+    The storage account is considered the root of the filesystem. When enabled, containers
+    will be created or deleted during relevant directory operations. Obviously, this also
+    requires authentication with the additional permissions.
+
+    By default `DefaultAzureCredential `__
+    is used for authentication. This means it will try several types of authentication
+    and go with the first one that works. If any authentication parameters are provided when
+    initialising the FileSystem, they will be used instead of the default credential.
+
+    Parameters
+    ----------
+    account_name : str
+        Azure Blob Storage account name. This is the globally unique identifier for the
+        storage account.
+    account_key : str, default None
+        Account key of the storage account. If sas_token and account_key are None the
+        default credential will be used. The parameters account_key and sas_token are
+        mutually exclusive.
+    blob_storage_authority : str, default None
+        hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful
+        for connecting to a local emulator, like Azurite.
+    dfs_storage_authority : str, default None
+        hostname[:port] of the Data Lake Gen 2 Service. Defaults to
+        `.dfs.core.windows.net`. Useful for connecting to a local emulator, like Azurite.
+    blob_storage_scheme : str, default None
+        Either `http` or `https`. Defaults to `https`. Useful for connecting to a local
+        emulator, like Azurite.
+    dfs_storage_scheme : str, default None
+        Either `http` or `https`. Defaults to `https`. Useful for connecting to a local
+        emulator, like Azurite.
+    sas_token : str, default None
+        SAS token for the storage account, used as an alternative to account_key. If sas_token
+        and account_key are None the default credential will be used. The parameters
+        account_key and sas_token are mutually exclusive.
+
+    Examples
+    --------
+    >>> from pyarrow import fs
+    >>> azure_fs = fs.AzureFileSystem(account_name="myaccount")
+    >>> azurite_fs = fs.AzureFileSystem(
+    ...     account_name="devstoreaccount1",
+    ...     account_key="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==",
+    ...     blob_storage_authority="127.0.0.1:10000",
+    ...     dfs_storage_authority="127.0.0.1:10000",
+    ...     blob_storage_scheme="http",
+    ...     dfs_storage_scheme="http",
+    ... )
+
+    For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`.
+    """
+
+    def __init__(
+        self,
+        account_name: str,
+        account_key: str | None = None,
+        blob_storage_authority: str | None = None,
+        dfs_storage_authority: str | None = None,
+        blob_storage_scheme: Literal["http", "https"] = "https",
+        dfs_storage_scheme: Literal["http", "https"] = "https",
+        sas_token: str | None = None,
+    ) -> None: ...
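+
+# A hypothetical usage sketch (requires an Arrow build with Azure support and real
+# credentials; "myaccount" and "my-container" are placeholders):
+#
+#     from pyarrow import fs
+#     import pyarrow.dataset as ds
+#
+#     azure = fs.AzureFileSystem(account_name="myaccount")
+#     dataset = ds.dataset("my-container/data", format="parquet", filesystem=azure)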
diff --git a/python/pyarrow-stubs/_benchmark.pyi b/python/pyarrow-stubs/_benchmark.pyi new file mode 100644 index 00000000000..048973301dc --- /dev/null +++ b/python/pyarrow-stubs/_benchmark.pyi @@ -0,0 +1,3 @@ +from pyarrow.lib import benchmark_PandasObjectIsNull + +__all__ = ["benchmark_PandasObjectIsNull"] diff --git a/python/pyarrow-stubs/_csv.pyi b/python/pyarrow-stubs/_csv.pyi new file mode 100644 index 00000000000..ad52b2f380f --- /dev/null +++ b/python/pyarrow-stubs/_csv.pyi @@ -0,0 +1,556 @@ +from typing import Any + +import cuda # type: ignore[import-not-found] + +from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-not-found] + +from . import lib +from ._stubs_typing import ArrayLike + +class Context(lib._Weakrefable): + """ + CUDA driver context. + """ + + def __init__(self, device_number: int = 0, handle: int | None = None) -> None: + """ + Create a CUDA driver context for a particular device. + + If a CUDA context handle is passed, it is wrapped, otherwise + a default CUDA context for the given device is requested. + + Parameters + ---------- + device_number : int (default 0) + Specify the GPU device for which the CUDA driver context is + requested. + handle : int, optional + Specify CUDA handle for a shared context that has been created + by another library. + """ + @staticmethod + def from_numba(context: _numba_driver.Context | None = None) -> Context: + """ + Create a Context instance from a Numba CUDA context. + + Parameters + ---------- + context : {numba.cuda.cudadrv.driver.Context, None} + A Numba CUDA context instance. + If None, the current Numba context is used. + + Returns + ------- + shared_context : pyarrow.cuda.Context + Context instance. + """ + def to_numba(self) -> _numba_driver.Context: + """ + Convert Context to a Numba CUDA context. + + Returns + ------- + context : numba.cuda.cudadrv.driver.Context + Numba CUDA context instance. + """ + @staticmethod + def get_num_devices() -> int: + """Return the number of GPU devices.""" + @property + def device_number(self) -> int: + """Return context device number.""" + @property + def handle(self) -> int: + """Return pointer to context handle.""" + def synchronize(self) -> None: + """Blocks until the device has completed all preceding requested + tasks. + """ + @property + def bytes_allocated(self) -> int: + """Return the number of allocated bytes.""" + def get_device_address(self, address: int) -> int: + """Return the device address that is reachable from kernels running in + the context + + Parameters + ---------- + address : int + Specify memory address value + + Returns + ------- + device_address : int + Device address accessible from device context + + Notes + ----- + The device address is defined as a memory address accessible + by device. While it is often a device memory address but it + can be also a host memory address, for instance, when the + memory is allocated as host memory (using cudaMallocHost or + cudaHostAlloc) or as managed memory (using cudaMallocManaged) + or the host memory is page-locked (using cudaHostRegister). + """ + def new_buffer(self, nbytes: int) -> CudaBuffer: + """Return new device buffer. + + Parameters + ---------- + nbytes : int + Specify the number of bytes to be allocated. + + Returns + ------- + buf : CudaBuffer + Allocated buffer. + """ + @property + def memory_manager(self) -> lib.MemoryManager: + """ + The default memory manager tied to this context's device. 
+ + Returns + ------- + MemoryManager + """ + @property + def device(self) -> lib.Device: + """ + The device instance associated with this context. + + Returns + ------- + Device + """ + def foreign_buffer(self, address: int, size: int, base: Any | None = None) -> CudaBuffer: + """ + Create device buffer from address and size as a view. + + The caller is responsible for allocating and freeing the + memory. When `address==size==0` then a new zero-sized buffer + is returned. + + Parameters + ---------- + address : int + Specify the starting address of the buffer. The address can + refer to both device or host memory but it must be + accessible from device after mapping it with + `get_device_address` method. + size : int + Specify the size of device buffer in bytes. + base : {None, object} + Specify object that owns the referenced memory. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device reachable memory. + + """ + def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: + """Open existing CUDA IPC memory handle + + Parameters + ---------- + ipc_handle : IpcMemHandle + Specify opaque pointer to CUipcMemHandle (driver API). + + Returns + ------- + buf : CudaBuffer + referencing device buffer + """ + def buffer_from_data( + self, + data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, + offset: int = 0, + size: int = -1, + ) -> CudaBuffer: + """Create device buffer and initialize with data. + + Parameters + ---------- + data : {CudaBuffer, HostBuffer, Buffer, array-like} + Specify data to be copied to device buffer. + offset : int + Specify the offset of input buffer for device data + buffering. Default: 0. + size : int + Specify the size of device buffer in bytes. Default: all + (starting from input offset) + + Returns + ------- + cbuf : CudaBuffer + Device buffer with copied data. + """ + def buffer_from_object(self, obj: Any) -> CudaBuffer: + """Create device buffer view of arbitrary object that references + device accessible memory. + + When the object contains a non-contiguous view of device + accessible memory then the returned device buffer will contain + contiguous view of the memory, that is, including the + intermediate data that is otherwise invisible to the input + object. + + Parameters + ---------- + obj : {object, Buffer, HostBuffer, CudaBuffer, ...} + Specify an object that holds (device or host) address that + can be accessed from device. This includes objects with + types defined in pyarrow.cuda as well as arbitrary objects + that implement the CUDA array interface as defined by numba. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device accessible memory. + + """ + +class IpcMemHandle(lib._Weakrefable): + """A serializable container for a CUDA IPC handle.""" + @staticmethod + def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: + """Create IpcMemHandle from opaque buffer (e.g. from another + process) + + Parameters + ---------- + opaque_handle : + a CUipcMemHandle as a const void* + + Returns + ------- + ipc_handle : IpcMemHandle + """ + def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: + """Write IpcMemHandle to a Buffer + + Parameters + ---------- + pool : {MemoryPool, None} + Specify a pool to allocate memory from + + Returns + ------- + buf : Buffer + The serialized buffer. + """ + +class CudaBuffer(lib.Buffer): + """An Arrow buffer with data located in a GPU device. + + To create a CudaBuffer instance, use Context.device_buffer(). 
+ + The memory allocated in a CudaBuffer is freed when the buffer object + is deleted. + """ + + @staticmethod + def from_buffer(buf: lib.Buffer) -> CudaBuffer: + """Convert back generic buffer into CudaBuffer + + Parameters + ---------- + buf : Buffer + Specify buffer containing CudaBuffer + + Returns + ------- + dbuf : CudaBuffer + Resulting device buffer. + """ + @staticmethod + def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: + """Create a CudaBuffer view from numba MemoryPointer instance. + + Parameters + ---------- + mem : numba.cuda.cudadrv.driver.MemoryPointer + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of numba MemoryPointer. + """ + def to_numba(self) -> _numba_driver.MemoryPointer: + """Return numba memory pointer of CudaBuffer instance.""" + def copy_to_host( + self, + position: int = 0, + nbytes: int = -1, + buf: lib.Buffer | None = None, + memory_pool: lib.MemoryPool | None = None, + resizable: bool = False, + ) -> lib.Buffer: + """Copy memory from GPU device to CPU host + + Caller is responsible for ensuring that all tasks affecting + the memory are finished. Use + + `.context.synchronize()` + + when needed. + + Parameters + ---------- + position : int + Specify the starting position of the source data in GPU + device buffer. Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + the position until host buffer is full). + buf : Buffer + Specify a pre-allocated output buffer in host. Default: None + (allocate new output buffer). + memory_pool : MemoryPool + resizable : bool + Specify extra arguments to allocate_buffer. Used only when + buf is None. + + Returns + ------- + buf : Buffer + Output buffer in host. + + """ + def copy_from_host( + self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 + ) -> int: + """Copy data from host to device. + + The device buffer must be pre-allocated. + + Parameters + ---------- + data : {Buffer, array-like} + Specify data in host. It can be array-like that is valid + argument to py_buffer + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. + """ + def copy_from_device(self, buf: CudaBuffer, position: int = 0, nbytes: int = -1) -> int: + """Copy data from device to device. + + Parameters + ---------- + buf : CudaBuffer + Specify source device buffer. + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. + + """ + def export_for_ipc(self) -> IpcMemHandle: + """ + Expose this device buffer as IPC memory which can be used in other + processes. + + After calling this function, this device memory will not be + freed when the CudaBuffer is destructed. 
+ + Returns + ------- + ipc_handle : IpcMemHandle + The exported IPC handle + + """ + @property + def context(self) -> Context: + """Returns the CUDA driver context of this buffer.""" + def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: + """Return slice of device buffer + + Parameters + ---------- + offset : int, default 0 + Specify offset from the start of device buffer to slice + length : int, default None + Specify the length of slice (default is until end of device + buffer starting from offset). If the length is larger than + the data available, the returned slice will have a size of + the available data starting from the offset. + + Returns + ------- + sliced : CudaBuffer + Zero-copy slice of device buffer. + + """ + def to_pybytes(self) -> bytes: + """Return device buffer content as Python bytes.""" + +class HostBuffer(lib.Buffer): + """Device-accessible CPU memory created using cudaHostAlloc. + + To create a HostBuffer instance, use + + cuda.new_host_buffer() + """ + @property + def size(self) -> int: ... + +class BufferReader(lib.NativeFile): + """File interface for zero-copy read from CUDA buffers. + + Note: Read methods return pointers to device memory. This means + you must be careful using this interface with any Arrow code which + may expect to be able to do anything other than pointer arithmetic + on the returned buffers. + """ + def __init__(self, obj: CudaBuffer) -> None: ... + def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: + """Return a slice view of the underlying device buffer. + + The slice will start at the current reader position and will + have specified size in bytes. + + Parameters + ---------- + nbytes : int, default None + Specify the number of bytes to read. Default: None (read all + remaining bytes). + + Returns + ------- + cbuf : CudaBuffer + New device buffer. + + """ + +class BufferWriter(lib.NativeFile): + """File interface for writing to CUDA buffers. + + By default writes are unbuffered. Use set_buffer_size to enable + buffering. + """ + def __init__(self, obj: CudaBuffer) -> None: ... + def writeat(self, position: int, data: ArrayLike) -> None: + """Write data to buffer starting from position. + + Parameters + ---------- + position : int + Specify device buffer position where the data will be + written. + data : array-like + Specify data, the data instance must implement buffer + protocol. + """ + @property + def buffer_size(self) -> int: + """Returns size of host (CPU) buffer, 0 for unbuffered""" + @buffer_size.setter + def buffer_size(self, buffer_size: int): + """Set CPU buffer size to limit calls to cudaMemcpy + + Parameters + ---------- + buffer_size : int + Specify the size of CPU buffer to allocate in bytes. + """ + @property + def num_bytes_buffered(self) -> int: + """Returns number of bytes buffered on host""" + +def new_host_buffer(size: int, device: int = 0) -> HostBuffer: + """Return buffer with CUDA-accessible memory on CPU host + + Parameters + ---------- + size : int + Specify the number of bytes to be allocated. + device : int + Specify GPU device number. 
+
+    Returns
+    -------
+    dbuf : HostBuffer
+        Allocated host buffer
+    """
+
+def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer:
+    """Write record batch message to GPU device memory
+
+    Parameters
+    ----------
+    batch : RecordBatch
+        Record batch to write
+    ctx : Context
+        CUDA Context to allocate device memory from
+
+    Returns
+    -------
+    dbuf : CudaBuffer
+        device buffer which contains the record batch message
+    """
+
+def read_message(
+    source: CudaBuffer | BufferReader, pool: lib.MemoryPool | None = None
+) -> lib.Message:
+    """Read Arrow IPC message located on GPU device
+
+    Parameters
+    ----------
+    source : {CudaBuffer, cuda.BufferReader}
+        Device buffer or reader of device buffer.
+    pool : MemoryPool (optional)
+        Pool to allocate CPU memory for the metadata
+
+    Returns
+    -------
+    message : Message
+        The deserialized message, body still on device
+    """
+
+def read_record_batch(
+    buffer: lib.Buffer,
+    schema: lib.Schema,
+    *,
+    dictionary_memo: lib.DictionaryMemo | None = None,
+    pool: lib.MemoryPool | None = None,
+) -> lib.RecordBatch:
+    """Construct RecordBatch referencing IPC message located on CUDA device.
+
+    While the metadata is copied to host memory for deserialization,
+    the record batch data remains on the device.
+
+    Parameters
+    ----------
+    buffer : Buffer
+        Device buffer containing the complete IPC message
+    schema : Schema
+        The schema for the record batch
+    dictionary_memo : DictionaryMemo, optional
+        If message contains dictionaries, must pass a populated
+        DictionaryMemo
+    pool : MemoryPool (optional)
+        Pool to allocate metadata from
+
+    Returns
+    -------
+    batch : RecordBatch
+        Reconstructed record batch, with device pointers
+
+    """
diff --git a/python/pyarrow-stubs/_cuda.pyi b/python/pyarrow-stubs/_cuda.pyi
new file mode 100644
index 00000000000..94f1b33e2e0
--- /dev/null
+++ b/python/pyarrow-stubs/_cuda.pyi
@@ -0,0 +1,557 @@
+from typing import Any
+
+import cuda  # type: ignore[import-not-found]
+
+from numba.cuda.cudadrv import driver as _numba_driver  # type: ignore[import-not-found]
+
+# from . import lib
+from .lib import _Weakrefable, Buffer, MemoryPool, NativeFile, RecordBatch, Schema, DictionaryMemo, Message, MemoryManager, Device
+from ._stubs_typing import ArrayLike
+
+class Context(_Weakrefable):
+    """
+    CUDA driver context.
+    """
+
+    def __init__(self, device_number: int = 0, handle: int | None = None) -> None:
+        """
+        Create a CUDA driver context for a particular device.
+
+        If a CUDA context handle is passed, it is wrapped; otherwise
+        a default CUDA context for the given device is requested.
+
+        Parameters
+        ----------
+        device_number : int (default 0)
+            Specify the GPU device for which the CUDA driver context is
+            requested.
+        handle : int, optional
+            Specify CUDA handle for a shared context that has been created
+            by another library.
+        """
+    @staticmethod
+    def from_numba(context: _numba_driver.Context | None = None) -> Context:
+        """
+        Create a Context instance from a Numba CUDA context.
+
+        Parameters
+        ----------
+        context : {numba.cuda.cudadrv.driver.Context, None}
+            A Numba CUDA context instance.
+            If None, the current Numba context is used.
+
+        Returns
+        -------
+        shared_context : pyarrow.cuda.Context
+            Context instance.
+        """
+    def to_numba(self) -> _numba_driver.Context:
+        """
+        Convert Context to a Numba CUDA context.
+
+        Returns
+        -------
+        context : numba.cuda.cudadrv.driver.Context
+            Numba CUDA context instance.
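+
+        A hedged round-trip sketch (requires numba built with CUDA support;
+        an active numba context is assumed):
+
+        >>> from pyarrow import cuda              # doctest: +SKIP
+        >>> ctx = cuda.Context.from_numba()       # doctest: +SKIP
+        >>> nb_ctx = ctx.to_numba()               # doctest: +SKIP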
+ """ + @staticmethod + def get_num_devices() -> int: + """Return the number of GPU devices.""" + @property + def device_number(self) -> int: + """Return context device number.""" + @property + def handle(self) -> int: + """Return pointer to context handle.""" + def synchronize(self) -> None: + """Blocks until the device has completed all preceding requested + tasks. + """ + @property + def bytes_allocated(self) -> int: + """Return the number of allocated bytes.""" + def get_device_address(self, address: int) -> int: + """Return the device address that is reachable from kernels running in + the context + + Parameters + ---------- + address : int + Specify memory address value + + Returns + ------- + device_address : int + Device address accessible from device context + + Notes + ----- + The device address is defined as a memory address accessible + by device. While it is often a device memory address but it + can be also a host memory address, for instance, when the + memory is allocated as host memory (using cudaMallocHost or + cudaHostAlloc) or as managed memory (using cudaMallocManaged) + or the host memory is page-locked (using cudaHostRegister). + """ + def new_buffer(self, nbytes: int) -> CudaBuffer: + """Return new device buffer. + + Parameters + ---------- + nbytes : int + Specify the number of bytes to be allocated. + + Returns + ------- + buf : CudaBuffer + Allocated buffer. + """ + @property + def memory_manager(self) -> MemoryManager: + """ + The default memory manager tied to this context's device. + + Returns + ------- + MemoryManager + """ + @property + def device(self) -> Device: + """ + The device instance associated with this context. + + Returns + ------- + Device + """ + def foreign_buffer(self, address: int, size: int, base: Any | None = None) -> CudaBuffer: + """ + Create device buffer from address and size as a view. + + The caller is responsible for allocating and freeing the + memory. When `address==size==0` then a new zero-sized buffer + is returned. + + Parameters + ---------- + address : int + Specify the starting address of the buffer. The address can + refer to both device or host memory but it must be + accessible from device after mapping it with + `get_device_address` method. + size : int + Specify the size of device buffer in bytes. + base : {None, object} + Specify object that owns the referenced memory. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device reachable memory. + + """ + def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: + """Open existing CUDA IPC memory handle + + Parameters + ---------- + ipc_handle : IpcMemHandle + Specify opaque pointer to CUipcMemHandle (driver API). + + Returns + ------- + buf : CudaBuffer + referencing device buffer + """ + def buffer_from_data( + self, + data: CudaBuffer | HostBuffer | Buffer | ArrayLike, + offset: int = 0, + size: int = -1, + ) -> CudaBuffer: + """Create device buffer and initialize with data. + + Parameters + ---------- + data : {CudaBuffer, HostBuffer, Buffer, array-like} + Specify data to be copied to device buffer. + offset : int + Specify the offset of input buffer for device data + buffering. Default: 0. + size : int + Specify the size of device buffer in bytes. Default: all + (starting from input offset) + + Returns + ------- + cbuf : CudaBuffer + Device buffer with copied data. + """ + def buffer_from_object(self, obj: Any) -> CudaBuffer: + """Create device buffer view of arbitrary object that references + device accessible memory. 
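+
+        For instance, a numba device array (an object exposing the CUDA
+        array interface) could be wrapped without copying (a sketch only;
+        assumes numba is installed and ``ctx`` is an existing Context):
+
+        >>> import numpy as np                                        # doctest: +SKIP
+        >>> from numba import cuda as nb_cuda                         # doctest: +SKIP
+        >>> darr = nb_cuda.to_device(np.arange(4, dtype=np.uint8))    # doctest: +SKIP
+        >>> cbuf = ctx.buffer_from_object(darr)                       # doctest: +SKIP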
+ + When the object contains a non-contiguous view of device + accessible memory then the returned device buffer will contain + contiguous view of the memory, that is, including the + intermediate data that is otherwise invisible to the input + object. + + Parameters + ---------- + obj : {object, Buffer, HostBuffer, CudaBuffer, ...} + Specify an object that holds (device or host) address that + can be accessed from device. This includes objects with + types defined in pyarrow.cuda as well as arbitrary objects + that implement the CUDA array interface as defined by numba. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device accessible memory. + + """ + +class IpcMemHandle(_Weakrefable): + """A serializable container for a CUDA IPC handle.""" + @staticmethod + def from_buffer(opaque_handle: Buffer) -> IpcMemHandle: + """Create IpcMemHandle from opaque buffer (e.g. from another + process) + + Parameters + ---------- + opaque_handle : + a CUipcMemHandle as a const void* + + Returns + ------- + ipc_handle : IpcMemHandle + """ + def serialize(self, pool: MemoryPool | None = None) -> Buffer: + """Write IpcMemHandle to a Buffer + + Parameters + ---------- + pool : {MemoryPool, None} + Specify a pool to allocate memory from + + Returns + ------- + buf : Buffer + The serialized buffer. + """ + +class CudaBuffer(Buffer): + """An Arrow buffer with data located in a GPU device. + + To create a CudaBuffer instance, use Context.device_buffer(). + + The memory allocated in a CudaBuffer is freed when the buffer object + is deleted. + """ + + @staticmethod + def from_buffer(buf: Buffer) -> CudaBuffer: + """Convert back generic buffer into CudaBuffer + + Parameters + ---------- + buf : Buffer + Specify buffer containing CudaBuffer + + Returns + ------- + dbuf : CudaBuffer + Resulting device buffer. + """ + @staticmethod + def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: + """Create a CudaBuffer view from numba MemoryPointer instance. + + Parameters + ---------- + mem : numba.cuda.cudadrv.driver.MemoryPointer + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of numba MemoryPointer. + """ + def to_numba(self) -> _numba_driver.MemoryPointer: + """Return numba memory pointer of CudaBuffer instance.""" + def copy_to_host( + self, + position: int = 0, + nbytes: int = -1, + buf: Buffer | None = None, + memory_pool: MemoryPool | None = None, + resizable: bool = False, + ) -> Buffer: + """Copy memory from GPU device to CPU host + + Caller is responsible for ensuring that all tasks affecting + the memory are finished. Use + + `.context.synchronize()` + + when needed. + + Parameters + ---------- + position : int + Specify the starting position of the source data in GPU + device buffer. Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + the position until host buffer is full). + buf : Buffer + Specify a pre-allocated output buffer in host. Default: None + (allocate new output buffer). + memory_pool : MemoryPool + resizable : bool + Specify extra arguments to allocate_buffer. Used only when + buf is None. + + Returns + ------- + buf : Buffer + Output buffer in host. + + """ + def copy_from_host( + self, data: Buffer | ArrayLike, position: int = 0, nbytes: int = -1 + ) -> int: + """Copy data from host to device. + + The device buffer must be pre-allocated. + + Parameters + ---------- + data : {Buffer, array-like} + Specify data in host. 
It can be array-like that is valid + argument to py_buffer + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. + """ + def copy_from_device(self, buf: CudaBuffer, position: int = 0, nbytes: int = -1) -> int: + """Copy data from device to device. + + Parameters + ---------- + buf : CudaBuffer + Specify source device buffer. + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. + + """ + def export_for_ipc(self) -> IpcMemHandle: + """ + Expose this device buffer as IPC memory which can be used in other + processes. + + After calling this function, this device memory will not be + freed when the CudaBuffer is destructed. + + Returns + ------- + ipc_handle : IpcMemHandle + The exported IPC handle + + """ + @property + def context(self) -> Context: + """Returns the CUDA driver context of this buffer.""" + def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: + """Return slice of device buffer + + Parameters + ---------- + offset : int, default 0 + Specify offset from the start of device buffer to slice + length : int, default None + Specify the length of slice (default is until end of device + buffer starting from offset). If the length is larger than + the data available, the returned slice will have a size of + the available data starting from the offset. + + Returns + ------- + sliced : CudaBuffer + Zero-copy slice of device buffer. + + """ + def to_pybytes(self) -> bytes: + """Return device buffer content as Python bytes.""" + +class HostBuffer(Buffer): + """Device-accessible CPU memory created using cudaHostAlloc. + + To create a HostBuffer instance, use + + cuda.new_host_buffer() + """ + @property + def size(self) -> int: ... + +class BufferReader(NativeFile): + """File interface for zero-copy read from CUDA buffers. + + Note: Read methods return pointers to device memory. This means + you must be careful using this interface with any Arrow code which + may expect to be able to do anything other than pointer arithmetic + on the returned buffers. + """ + def __init__(self, obj: CudaBuffer) -> None: ... + def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: + """Return a slice view of the underlying device buffer. + + The slice will start at the current reader position and will + have specified size in bytes. + + Parameters + ---------- + nbytes : int, default None + Specify the number of bytes to read. Default: None (read all + remaining bytes). + + Returns + ------- + cbuf : CudaBuffer + New device buffer. + + """ + +class BufferWriter(NativeFile): + """File interface for writing to CUDA buffers. + + By default writes are unbuffered. Use set_buffer_size to enable + buffering. + """ + def __init__(self, obj: CudaBuffer) -> None: ... + def writeat(self, position: int, data: ArrayLike) -> None: + """Write data to buffer starting from position. + + Parameters + ---------- + position : int + Specify device buffer position where the data will be + written. + data : array-like + Specify data, the data instance must implement buffer + protocol. 
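+
+        A minimal sketch, assuming ``cbuf`` is a pre-allocated CudaBuffer
+        large enough for the written bytes:
+
+        >>> writer = cuda.BufferWriter(cbuf)    # doctest: +SKIP
+        >>> writer.writeat(0, b"abcd")          # doctest: +SKIP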
+        """
+    @property
+    def buffer_size(self) -> int:
+        """Returns size of host (CPU) buffer, 0 for unbuffered"""
+    @buffer_size.setter
+    def buffer_size(self, buffer_size: int):
+        """Set CPU buffer size to limit calls to cudaMemcpy
+
+        Parameters
+        ----------
+        buffer_size : int
+            Specify the size of CPU buffer to allocate in bytes.
+        """
+    @property
+    def num_bytes_buffered(self) -> int:
+        """Returns number of bytes buffered on host"""
+
+def new_host_buffer(size: int, device: int = 0) -> HostBuffer:
+    """Return buffer with CUDA-accessible memory on CPU host
+
+    Parameters
+    ----------
+    size : int
+        Specify the number of bytes to be allocated.
+    device : int
+        Specify GPU device number.
+
+    Returns
+    -------
+    dbuf : HostBuffer
+        Allocated host buffer
+    """
+
+def serialize_record_batch(batch: RecordBatch, ctx: Context) -> CudaBuffer:
+    """Write record batch message to GPU device memory
+
+    Parameters
+    ----------
+    batch : RecordBatch
+        Record batch to write
+    ctx : Context
+        CUDA Context to allocate device memory from
+
+    Returns
+    -------
+    dbuf : CudaBuffer
+        device buffer which contains the record batch message
+    """
+
+def read_message(
+    source: CudaBuffer | BufferReader, pool: MemoryPool | None = None
+) -> Message:
+    """Read Arrow IPC message located on GPU device
+
+    Parameters
+    ----------
+    source : {CudaBuffer, cuda.BufferReader}
+        Device buffer or reader of device buffer.
+    pool : MemoryPool (optional)
+        Pool to allocate CPU memory for the metadata
+
+    Returns
+    -------
+    message : Message
+        The deserialized message, body still on device
+    """
+
+def read_record_batch(
+    buffer: Buffer,
+    schema: Schema,
+    *,
+    dictionary_memo: DictionaryMemo | None = None,
+    pool: MemoryPool | None = None,
+) -> RecordBatch:
+    """Construct RecordBatch referencing IPC message located on CUDA device.
+
+    While the metadata is copied to host memory for deserialization,
+    the record batch data remains on the device.
+
+    Parameters
+    ----------
+    buffer : Buffer
+        Device buffer containing the complete IPC message
+    schema : Schema
+        The schema for the record batch
+    dictionary_memo : DictionaryMemo, optional
+        If message contains dictionaries, must pass a populated
+        DictionaryMemo
+    pool : MemoryPool (optional)
+        Pool to allocate metadata from
+
+    Returns
+    -------
+    batch : RecordBatch
+        Reconstructed record batch, with device pointers
+
+    """
diff --git a/python/pyarrow-stubs/_dataset.pyi b/python/pyarrow-stubs/_dataset.pyi
index 03e7762b6df..e0f38d54eff 100644
--- a/python/pyarrow-stubs/_dataset.pyi
+++ b/python/pyarrow-stubs/_dataset.pyi
@@ -1,2300 +1,2282 @@
-# import sys
-#
-# if sys.version_info >= (3, 11):
-#     from typing import Self
-# else:
-#     from typing_extensions import Self
-# from typing import (
-#     IO,
-#     Any,
-#     Callable,
-#     Generic,
-#     Iterator,
-#     Literal,
-#     NamedTuple,
-#     TypeVar,
-#     overload,
-# )
-#
-# from _typeshed import StrPath
-#
-# from . import _csv, _json, _parquet, lib
-# from ._fs import FileSelector, FileSystem, SupportedFileSystem
-# from ._stubs_typing import Indices, JoinType, Order
-# from .acero import ExecNodeOptions
-# from .compute import Expression
-# from .ipc import IpcWriteOptions, RecordBatchReader
-#
-# class Dataset(lib._Weakrefable):
-#     """
-#     Collection of data fragments and potentially child datasets.
-#
-#     Arrow Datasets allow you to query against data that has been split across
-#     multiple files. This sharding of data may indicate partitioning, which
-#     can accelerate queries that only touch some partitions (files).
-# """ -# -# @property -# def partition_expression(self) -> Expression: -# """ -# An Expression which evaluates to true for all data viewed by this -# Dataset. -# """ -# def replace_schema(self, schema: lib.Schema) -> None: -# """ -# Return a copy of this Dataset with a different schema. -# -# The copy will view the same Fragments. If the new schema is not -# compatible with the original dataset's schema then an error will -# be raised. -# -# Parameters -# ---------- -# schema : Schema -# The new dataset schema. -# """ -# def get_fragments(self, filter: Expression | None = None): -# """Returns an iterator over the fragments in this dataset. -# -# Parameters -# ---------- -# filter : Expression, default None -# Return fragments matching the optional filter, either using the -# partition_expression or internal information like Parquet's -# statistics. -# -# Returns -# ------- -# fragments : iterator of Fragment -# """ -# def scanner( -# self, -# columns: list[str] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> Scanner: -# """ -# Build a scan operation against the dataset. -# -# Data is not loaded immediately. Instead, this produces a Scanner, -# which exposes further operations (e.g. loading all data as a -# table, counting rows). -# -# See the :meth:`Scanner.from_dataset` method for further information. -# -# Parameters -# ---------- -# columns : list of str, default None -# The columns to project. This can be a list of column names to -# include (order and duplicates will be preserved), or a dictionary -# with {new_column_name: expression} values for more advanced -# projections. -# -# The list of columns or expressions may use the special fields -# `__batch_index` (the index of the batch within the fragment), -# `__fragment_index` (the index of the fragment within the dataset), -# `__last_in_fragment` (whether the batch is last in fragment), and -# `__filename` (the name of the source file or a description of the -# source fragment). -# -# The columns will be passed down to Datasets and corresponding data -# fragments to avoid loading, copying, and deserializing columns -# that will not be required further down the compute chain. -# By default all of the available columns are projected. Raises -# an exception if any of the referenced column names does not exist -# in the dataset's Schema. -# filter : Expression, default None -# Scan will return only the rows matching the filter. -# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. 
-# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# -# Returns -# ------- -# scanner : Scanner -# -# Examples -# -------- -# >>> import pyarrow as pa -# >>> table = pa.table( -# ... { -# ... "year": [2020, 2022, 2021, 2022, 2019, 2021], -# ... "n_legs": [2, 2, 4, 4, 5, 100], -# ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], -# ... } -# ... ) -# >>> -# >>> import pyarrow.parquet as pq -# >>> pq.write_table(table, "dataset_scanner.parquet") -# -# >>> import pyarrow.dataset as ds -# >>> dataset = ds.dataset("dataset_scanner.parquet") -# -# Selecting a subset of the columns: -# -# >>> dataset.scanner(columns=["year", "n_legs"]).to_table() -# pyarrow.Table -# year: int64 -# n_legs: int64 -# ---- -# year: [[2020,2022,2021,2022,2019,2021]] -# n_legs: [[2,2,4,4,5,100]] -# -# Projecting selected columns using an expression: -# -# >>> dataset.scanner( -# ... columns={ -# ... "n_legs_uint": ds.field("n_legs").cast("uint8"), -# ... } -# ... ).to_table() -# pyarrow.Table -# n_legs_uint: uint8 -# ---- -# n_legs_uint: [[2,2,4,4,5,100]] -# -# Filtering rows while scanning: -# -# >>> dataset.scanner(filter=ds.field("year") > 2020).to_table() -# pyarrow.Table -# year: int64 -# n_legs: int64 -# animal: string -# ---- -# year: [[2022,2021,2022,2021]] -# n_legs: [[2,4,4,100]] -# animal: [["Parrot","Dog","Horse","Centipede"]] -# """ -# def to_batches( -# self, -# columns: list[str] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> Iterator[lib.RecordBatch]: -# """ -# Read the dataset as materialized record batches. -# -# Parameters -# ---------- -# columns : list of str, default None -# The columns to project. This can be a list of column names to -# include (order and duplicates will be preserved), or a dictionary -# with {new_column_name: expression} values for more advanced -# projections. -# -# The list of columns or expressions may use the special fields -# `__batch_index` (the index of the batch within the fragment), -# `__fragment_index` (the index of the fragment within the dataset), -# `__last_in_fragment` (whether the batch is last in fragment), and -# `__filename` (the name of the source file or a description of the -# source fragment). -# -# The columns will be passed down to Datasets and corresponding data -# fragments to avoid loading, copying, and deserializing columns -# that will not be required further down the compute chain. -# By default all of the available columns are projected. Raises -# an exception if any of the referenced column names does not exist -# in the dataset's Schema. -# filter : Expression, default None -# Scan will return only the rows matching the filter. 
-# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# -# Returns -# ------- -# record_batches : iterator of RecordBatch -# """ -# def to_table( -# self, -# columns: list[str] | dict[str, Expression] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> lib.Table: -# """ -# Read the dataset to an Arrow table. -# -# Note that this method reads all the selected data from the dataset -# into memory. -# -# Parameters -# ---------- -# columns : list of str, default None -# The columns to project. This can be a list of column names to -# include (order and duplicates will be preserved), or a dictionary -# with {new_column_name: expression} values for more advanced -# projections. -# -# The list of columns or expressions may use the special fields -# `__batch_index` (the index of the batch within the fragment), -# `__fragment_index` (the index of the fragment within the dataset), -# `__last_in_fragment` (whether the batch is last in fragment), and -# `__filename` (the name of the source file or a description of the -# source fragment). -# -# The columns will be passed down to Datasets and corresponding data -# fragments to avoid loading, copying, and deserializing columns -# that will not be required further down the compute chain. -# By default all of the available columns are projected. Raises -# an exception if any of the referenced column names does not exist -# in the dataset's Schema. -# filter : Expression, default None -# Scan will return only the rows matching the filter. -# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. 
-# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# -# Returns -# ------- -# table : Table -# """ -# def take( -# self, -# indices: Indices, -# columns: list[str] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> lib.Table: -# """ -# Select rows of data by index. -# -# Parameters -# ---------- -# indices : Array or array-like -# indices of rows to select in the dataset. -# columns : list of str, default None -# The columns to project. This can be a list of column names to -# include (order and duplicates will be preserved), or a dictionary -# with {new_column_name: expression} values for more advanced -# projections. -# -# The list of columns or expressions may use the special fields -# `__batch_index` (the index of the batch within the fragment), -# `__fragment_index` (the index of the fragment within the dataset), -# `__last_in_fragment` (whether the batch is last in fragment), and -# `__filename` (the name of the source file or a description of the -# source fragment). -# -# The columns will be passed down to Datasets and corresponding data -# fragments to avoid loading, copying, and deserializing columns -# that will not be required further down the compute chain. -# By default all of the available columns are projected. Raises -# an exception if any of the referenced column names does not exist -# in the dataset's Schema. -# filter : Expression, default None -# Scan will return only the rows matching the filter. -# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. 
-# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# -# Returns -# ------- -# table : Table -# """ -# def head( -# self, -# num_rows: int, -# columns: list[str] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> lib.Table: -# """ -# Load the first N rows of the dataset. -# -# Parameters -# ---------- -# num_rows : int -# The number of rows to load. -# columns : list of str, default None -# The columns to project. This can be a list of column names to -# include (order and duplicates will be preserved), or a dictionary -# with {new_column_name: expression} values for more advanced -# projections. -# -# The list of columns or expressions may use the special fields -# `__batch_index` (the index of the batch within the fragment), -# `__fragment_index` (the index of the fragment within the dataset), -# `__last_in_fragment` (whether the batch is last in fragment), and -# `__filename` (the name of the source file or a description of the -# source fragment). -# -# The columns will be passed down to Datasets and corresponding data -# fragments to avoid loading, copying, and deserializing columns -# that will not be required further down the compute chain. -# By default all of the available columns are projected. Raises -# an exception if any of the referenced column names does not exist -# in the dataset's Schema. -# filter : Expression, default None -# Scan will return only the rows matching the filter. -# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. 
-# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# -# Returns -# ------- -# table : Table -# """ -# def count_rows( -# self, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> int: -# """ -# Count rows matching the scanner filter. -# -# Parameters -# ---------- -# filter : Expression, default None -# Scan will return only the rows matching the filter. -# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# -# Returns -# ------- -# count : int -# """ -# @property -# def schema(self) -> lib.Schema: -# """The common schema of the full Dataset""" -# def filter(self, expression: Expression) -> Self: -# """ -# Apply a row filter to the dataset. -# -# Parameters -# ---------- -# expression : Expression -# The filter that should be applied to the dataset. -# -# Returns -# ------- -# Dataset -# """ -# def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> InMemoryDataset: -# """ -# Sort the Dataset by one or multiple columns. -# -# Parameters -# ---------- -# sorting : str or list[tuple(name, order)] -# Name of the column to use to sort (ascending), or -# a list of multiple sorting conditions where -# each entry is a tuple with column name -# and sorting order ("ascending" or "descending") -# **kwargs : dict, optional -# Additional sorting options. -# As allowed by :class:`SortOptions` -# -# Returns -# ------- -# InMemoryDataset -# A new dataset sorted according to the sort keys. -# """ -# def join( -# self, -# right_dataset: Dataset, -# keys: str | list[str], -# right_keys: str | list[str] | None = None, -# join_type: JoinType = "left outer", -# left_suffix: str | None = None, -# right_suffix: str | None = None, -# coalesce_keys: bool = True, -# use_threads: bool = True, -# ) -> InMemoryDataset: -# """ -# Perform a join between this dataset and another one. -# -# Result of the join will be a new dataset, where further -# operations can be applied. 
-# -# Parameters -# ---------- -# right_dataset : dataset -# The dataset to join to the current one, acting as the right dataset -# in the join operation. -# keys : str or list[str] -# The columns from current dataset that should be used as keys -# of the join operation left side. -# right_keys : str or list[str], default None -# The columns from the right_dataset that should be used as keys -# on the join operation right side. -# When ``None`` use the same key names as the left dataset. -# join_type : str, default "left outer" -# The kind of join that should be performed, one of -# ("left semi", "right semi", "left anti", "right anti", -# "inner", "left outer", "right outer", "full outer") -# left_suffix : str, default None -# Which suffix to add to right column names. This prevents confusion -# when the columns in left and right datasets have colliding names. -# right_suffix : str, default None -# Which suffix to add to the left column names. This prevents confusion -# when the columns in left and right datasets have colliding names. -# coalesce_keys : bool, default True -# If the duplicated keys should be omitted from one of the sides -# in the join result. -# use_threads : bool, default True -# Whenever to use multithreading or not. -# -# Returns -# ------- -# InMemoryDataset -# """ -# def join_asof( -# self, -# right_dataset: Dataset, -# on: str, -# by: str | list[str], -# tolerance: int, -# right_on: str | list[str] | None = None, -# right_by: str | list[str] | None = None, -# ) -> InMemoryDataset: -# """ -# Perform an asof join between this dataset and another one. -# -# This is similar to a left-join except that we match on nearest key rather -# than equal keys. Both datasets must be sorted by the key. This type of join -# is most useful for time series data that are not perfectly aligned. -# -# Optionally match on equivalent keys with "by" before searching with "on". -# -# Result of the join will be a new Dataset, where further -# operations can be applied. -# -# Parameters -# ---------- -# right_dataset : dataset -# The dataset to join to the current one, acting as the right dataset -# in the join operation. -# on : str -# The column from current dataset that should be used as the "on" key -# of the join operation left side. -# -# An inexact match is used on the "on" key, i.e. a row is considered a -# match if and only if left_on - tolerance <= right_on <= left_on. -# -# The input table must be sorted by the "on" key. Must be a single -# field of a common type. -# -# Currently, the "on" key must be an integer, date, or timestamp type. -# by : str or list[str] -# The columns from current dataset that should be used as the keys -# of the join operation left side. The join operation is then done -# only for the matches in these columns. -# tolerance : int -# The tolerance for inexact "on" key matching. A right row is considered -# a match with the left row `right.on - left.on <= tolerance`. The -# `tolerance` may be: -# -# - negative, in which case a past-as-of-join occurs; -# - or positive, in which case a future-as-of-join occurs; -# - or zero, in which case an exact-as-of-join occurs. -# -# The tolerance is interpreted in the same units as the "on" key. -# right_on : str or list[str], default None -# The columns from the right_dataset that should be used as the on key -# on the join operation right side. -# When ``None`` use the same key name as the left dataset. 
-# right_by : str or list[str], default None -# The columns from the right_dataset that should be used as by keys -# on the join operation right side. -# When ``None`` use the same key names as the left dataset. -# -# Returns -# ------- -# InMemoryDataset -# """ -# -# class InMemoryDataset(Dataset): -# """ -# A Dataset wrapping in-memory data. -# -# Parameters -# ---------- -# source : RecordBatch, Table, list, tuple -# The data for this dataset. Can be a RecordBatch, Table, list of -# RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader -# If an iterable is provided, the schema must also be provided. -# schema : Schema, optional -# Only required if passing an iterable as the source -# """ -# -# class UnionDataset(Dataset): -# """ -# A Dataset wrapping child datasets. -# -# Children's schemas must agree with the provided schema. -# -# Parameters -# ---------- -# schema : Schema -# A known schema to conform to. -# children : list of Dataset -# One or more input children -# """ -# -# @property -# def children(self) -> list[Dataset]: ... -# -# class FileSystemDataset(Dataset): -# """ -# A Dataset of file fragments. -# -# A FileSystemDataset is composed of one or more FileFragment. -# -# Parameters -# ---------- -# fragments : list[Fragments] -# List of fragments to consume. -# schema : Schema -# The top-level schema of the Dataset. -# format : FileFormat -# File format of the fragments, currently only ParquetFileFormat, -# IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. -# filesystem : FileSystem -# FileSystem of the fragments. -# root_partition : Expression, optional -# The top-level partition of the DataDataset. -# """ -# -# def __init__( -# self, -# fragments: list[Fragment], -# schema: lib.Schema, -# format: FileFormat, -# filesystem: SupportedFileSystem | None = None, -# root_partition: Expression | None = None, -# ) -> None: ... -# @classmethod -# def from_paths( -# cls, -# paths: list[str], -# schema: lib.Schema | None = None, -# format: FileFormat | None = None, -# filesystem: SupportedFileSystem | None = None, -# partitions: list[Expression] | None = None, -# root_partition: Expression | None = None, -# ) -> FileSystemDataset: -# """ -# A Dataset created from a list of paths on a particular filesystem. -# -# Parameters -# ---------- -# paths : list of str -# List of file paths to create the fragments from. -# schema : Schema -# The top-level schema of the DataDataset. -# format : FileFormat -# File format to create fragments from, currently only -# ParquetFileFormat, IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. -# filesystem : FileSystem -# The filesystem which files are from. -# partitions : list[Expression], optional -# Attach additional partition information for the file paths. -# root_partition : Expression, optional -# The top-level partition of the DataDataset. -# """ -# @property -# def filesystem(self) -> FileSystem: ... -# @property -# def partitioning(self) -> Partitioning | None: -# """ -# The partitioning of the Dataset source, if discovered. -# -# If the FileSystemDataset is created using the ``dataset()`` factory -# function with a partitioning specified, this will return the -# finalized Partitioning object from the dataset discovery. In all -# other cases, this returns None. 
-# """ -# @property -# def files(self) -> list[str]: -# """List of the files""" -# @property -# def format(self) -> FileFormat: -# """The FileFormat of this source.""" -# -# class FileWriteOptions(lib._Weakrefable): -# @property -# def format(self) -> FileFormat: ... -# -# class FileFormat(lib._Weakrefable): -# def inspect( -# self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None -# ) -> lib.Schema: -# """ -# Infer the schema of a file. -# -# Parameters -# ---------- -# file : file-like object, path-like or str -# The file or file path to infer a schema from. -# filesystem : Filesystem, optional -# If `filesystem` is given, `file` must be a string and specifies -# the path of the file to read from the filesystem. -# -# Returns -# ------- -# schema : Schema -# The schema inferred from the file -# """ -# def make_fragment( -# self, -# file: StrPath | IO, -# filesystem: SupportedFileSystem | None = None, -# partition_expression: Expression | None = None, -# *, -# file_size: int | None = None, -# ) -> Fragment: -# """ -# Make a FileFragment from a given file. -# -# Parameters -# ---------- -# file : file-like object, path-like or str -# The file or file path to make a fragment from. -# filesystem : Filesystem, optional -# If `filesystem` is given, `file` must be a string and specifies -# the path of the file to read from the filesystem. -# partition_expression : Expression, optional -# An expression that is guaranteed true for all rows in the fragment. Allows -# fragment to be potentially skipped while scanning with a filter. -# file_size : int, optional -# The size of the file in bytes. Can improve performance with high-latency filesystems -# when file size needs to be known before reading. -# -# Returns -# ------- -# fragment : Fragment -# The file fragment -# """ -# def make_write_options(self) -> FileWriteOptions: ... -# @property -# def default_extname(self) -> str: ... -# @property -# def default_fragment_scan_options(self) -> FragmentScanOptions: ... -# @default_fragment_scan_options.setter -# def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... -# -# class Fragment(lib._Weakrefable): -# """Fragment of data from a Dataset.""" -# @property -# def physical_schema(self) -> lib.Schema: -# """Return the physical schema of this Fragment. This schema can be -# different from the dataset read schema.""" -# @property -# def partition_expression(self) -> Expression: -# """An Expression which evaluates to true for all data viewed by this -# Fragment. -# """ -# def scanner( -# self, -# schema: lib.Schema | None = None, -# columns: list[str] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> Scanner: -# """ -# Build a scan operation against the fragment. -# -# Data is not loaded immediately. Instead, this produces a Scanner, -# which exposes further operations (e.g. loading all data as a -# table, counting rows). -# -# Parameters -# ---------- -# schema : Schema -# Schema to use for scanning. This is used to unify a Fragment to -# its Dataset's schema. If not specified this will use the -# Fragment's physical schema which might differ for each Fragment. -# columns : list of str, default None -# The columns to project. 
This can be a list of column names to -# include (order and duplicates will be preserved), or a dictionary -# with {new_column_name: expression} values for more advanced -# projections. -# -# The list of columns or expressions may use the special fields -# `__batch_index` (the index of the batch within the fragment), -# `__fragment_index` (the index of the fragment within the dataset), -# `__last_in_fragment` (whether the batch is last in fragment), and -# `__filename` (the name of the source file or a description of the -# source fragment). -# -# The columns will be passed down to Datasets and corresponding data -# fragments to avoid loading, copying, and deserializing columns -# that will not be required further down the compute chain. -# By default all of the available columns are projected. Raises -# an exception if any of the referenced column names does not exist -# in the dataset's Schema. -# filter : Expression, default None -# Scan will return only the rows matching the filter. -# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# -# Returns -# ------- -# scanner : Scanner -# """ -# def to_batches( -# self, -# schema: lib.Schema | None = None, -# columns: list[str] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> Iterator[lib.RecordBatch]: -# """ -# Read the fragment as materialized record batches. -# -# Parameters -# ---------- -# schema : Schema, optional -# Concrete schema to use for scanning. -# columns : list of str, default None -# The columns to project. This can be a list of column names to -# include (order and duplicates will be preserved), or a dictionary -# with {new_column_name: expression} values for more advanced -# projections. 
-# -# The list of columns or expressions may use the special fields -# `__batch_index` (the index of the batch within the fragment), -# `__fragment_index` (the index of the fragment within the dataset), -# `__last_in_fragment` (whether the batch is last in fragment), and -# `__filename` (the name of the source file or a description of the -# source fragment). -# -# The columns will be passed down to Datasets and corresponding data -# fragments to avoid loading, copying, and deserializing columns -# that will not be required further down the compute chain. -# By default all of the available columns are projected. Raises -# an exception if any of the referenced column names does not exist -# in the dataset's Schema. -# filter : Expression, default None -# Scan will return only the rows matching the filter. -# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# -# Returns -# ------- -# record_batches : iterator of RecordBatch -# """ -# def to_table( -# self, -# schema: lib.Schema | None = None, -# columns: list[str] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> lib.Table: -# """ -# Convert this Fragment into a Table. -# -# Use this convenience utility with care. This will serially materialize -# the Scan result in memory before creating the Table. -# -# Parameters -# ---------- -# schema : Schema, optional -# Concrete schema to use for scanning. -# columns : list of str, default None -# The columns to project. This can be a list of column names to -# include (order and duplicates will be preserved), or a dictionary -# with {new_column_name: expression} values for more advanced -# projections. 
-# -# The list of columns or expressions may use the special fields -# `__batch_index` (the index of the batch within the fragment), -# `__fragment_index` (the index of the fragment within the dataset), -# `__last_in_fragment` (whether the batch is last in fragment), and -# `__filename` (the name of the source file or a description of the -# source fragment). -# -# The columns will be passed down to Datasets and corresponding data -# fragments to avoid loading, copying, and deserializing columns -# that will not be required further down the compute chain. -# By default all of the available columns are projected. Raises -# an exception if any of the referenced column names does not exist -# in the dataset's Schema. -# filter : Expression, default None -# Scan will return only the rows matching the filter. -# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# -# Returns -# ------- -# table : Table -# """ -# def take( -# self, -# indices: Indices, -# columns: list[str] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> lib.Table: -# """ -# Select rows of data by index. -# -# Parameters -# ---------- -# indices : Array or array-like -# The indices of row to select in the dataset. -# columns : list of str, default None -# The columns to project. This can be a list of column names to -# include (order and duplicates will be preserved), or a dictionary -# with {new_column_name: expression} values for more advanced -# projections. -# -# The list of columns or expressions may use the special fields -# `__batch_index` (the index of the batch within the fragment), -# `__fragment_index` (the index of the fragment within the dataset), -# `__last_in_fragment` (whether the batch is last in fragment), and -# `__filename` (the name of the source file or a description of the -# source fragment). 
-# -# The columns will be passed down to Datasets and corresponding data -# fragments to avoid loading, copying, and deserializing columns -# that will not be required further down the compute chain. -# By default all of the available columns are projected. Raises -# an exception if any of the referenced column names does not exist -# in the dataset's Schema. -# filter : Expression, default None -# Scan will return only the rows matching the filter. -# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# -# Returns -# ------- -# Table -# """ -# def head( -# self, -# num_rows: int, -# columns: list[str] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> lib.Table: -# """ -# Load the first N rows of the fragment. -# -# Parameters -# ---------- -# num_rows : int -# The number of rows to load. -# columns : list of str, default None -# The columns to project. This can be a list of column names to -# include (order and duplicates will be preserved), or a dictionary -# with {new_column_name: expression} values for more advanced -# projections. -# -# The list of columns or expressions may use the special fields -# `__batch_index` (the index of the batch within the fragment), -# `__fragment_index` (the index of the fragment within the dataset), -# `__last_in_fragment` (whether the batch is last in fragment), and -# `__filename` (the name of the source file or a description of the -# source fragment). -# -# The columns will be passed down to Datasets and corresponding data -# fragments to avoid loading, copying, and deserializing columns -# that will not be required further down the compute chain. -# By default all of the available columns are projected. Raises -# an exception if any of the referenced column names does not exist -# in the dataset's Schema. -# filter : Expression, default None -# Scan will return only the rows matching the filter. 
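A hedged sketch of the Fragment.take()/head() methods stubbed above (illustrative only; the same assumed Parquet file is reused).

import pyarrow.dataset as ds

dataset = ds.dataset("dataset_scanner.parquet", format="parquet")
fragment = next(dataset.get_fragments())
first_rows = fragment.head(2)        # first N rows of this fragment as a Table
selected = fragment.take([0, 3, 5])  # rows selected by index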
-# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# -# Returns -# ------- -# Table -# """ -# def count_rows( -# self, -# columns: list[str] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> int: -# """ -# Count rows matching the scanner filter. -# -# Parameters -# ---------- -# filter : Expression, default None -# Scan will return only the rows matching the filter. -# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. 
-# -# Returns -# ------- -# count : int -# """ -# -# class FileFragment(Fragment): -# """A Fragment representing a data file.""" -# -# def open(self) -> lib.NativeFile: -# """ -# Open a NativeFile of the buffer or file viewed by this fragment. -# """ -# @property -# def path(self) -> str: -# """ -# The path of the data file viewed by this fragment, if it views a -# file. If instead it views a buffer, this will be "". -# """ -# @property -# def filesystem(self) -> FileSystem: -# """ -# The FileSystem containing the data file viewed by this fragment, if -# it views a file. If instead it views a buffer, this will be None. -# """ -# @property -# def buffer(self) -> lib.Buffer: -# """ -# The buffer viewed by this fragment, if it views a buffer. If -# instead it views a file, this will be None. -# """ -# @property -# def format(self) -> FileFormat: -# """ -# The format of the data file viewed by this fragment. -# """ -# -# class FragmentScanOptions(lib._Weakrefable): -# """Scan options specific to a particular fragment and scan operation.""" -# -# @property -# def type_name(self) -> str: ... -# -# class IpcFileWriteOptions(FileWriteOptions): -# @property -# def write_options(self) -> IpcWriteOptions: ... -# @write_options.setter -# def write_options(self, write_options: IpcWriteOptions) -> None: ... -# -# class IpcFileFormat(FileFormat): -# def equals(self, other: IpcFileFormat) -> bool: ... -# def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ... -# @property -# def default_extname(self) -> str: ... -# -# class FeatherFileFormat(IpcFileFormat): ... -# -# class CsvFileFormat(FileFormat): -# """ -# FileFormat for CSV files. -# -# Parameters -# ---------- -# parse_options : pyarrow.csv.ParseOptions -# Options regarding CSV parsing. -# default_fragment_scan_options : CsvFragmentScanOptions -# Default options for fragments scan. -# convert_options : pyarrow.csv.ConvertOptions -# Options regarding value conversion. -# read_options : pyarrow.csv.ReadOptions -# General read options. -# """ -# def __init__( -# self, -# parse_options: _csv.ParseOptions | None = None, -# default_fragment_scan_options: CsvFragmentScanOptions | None = None, -# convert_options: _csv.ConvertOptions | None = None, -# read_options: _csv.ReadOptions | None = None, -# ) -> None: ... -# def make_write_options(self) -> _csv.WriteOptions: ... # type: ignore[override] -# @property -# def parse_options(self) -> _csv.ParseOptions: ... -# @parse_options.setter -# def parse_options(self, parse_options: _csv.ParseOptions) -> None: ... -# def equals(self, other: CsvFileFormat) -> bool: ... -# -# class CsvFragmentScanOptions(FragmentScanOptions): -# """ -# Scan-specific options for CSV fragments. -# -# Parameters -# ---------- -# convert_options : pyarrow.csv.ConvertOptions -# Options regarding value conversion. -# read_options : pyarrow.csv.ReadOptions -# General read options. -# """ -# -# convert_options: _csv.ConvertOptions -# read_options: _csv.ReadOptions -# -# def __init__( -# self, convert_options: _csv.ConvertOptions, read_options: _csv.ReadOptions -# ) -> None: ... -# def equals(self, other: CsvFragmentScanOptions) -> bool: ... -# -# class CsvFileWriteOptions(FileWriteOptions): -# write_options: _csv.WriteOptions -# -# class JsonFileFormat(FileFormat): -# """ -# FileFormat for JSON files. -# -# Parameters -# ---------- -# default_fragment_scan_options : JsonFragmentScanOptions -# Default options for fragments scan. -# parse_options : pyarrow.json.ParseOptions -# Options regarding json parsing. 
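Illustrative sketch (not part of the stubs) of the CSV classes stubbed above: a CsvFileFormat with custom ParseOptions driving dataset discovery. The data/ directory and the ';' delimiter are assumptions.

import pyarrow.dataset as ds
from pyarrow import csv

csv_format = ds.CsvFileFormat(parse_options=csv.ParseOptions(delimiter=";"))
dataset = ds.dataset("data/", format=csv_format)
for fragment in dataset.get_fragments():
    # Each FileFragment exposes the file it views and the format used to read it.
    print(fragment.path, fragment.format)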
-# read_options : pyarrow.json.ReadOptions -# General read options. -# """ -# def __init__( -# self, -# default_fragment_scan_options: JsonFragmentScanOptions | None = None, -# parse_options: _json.ParseOptions | None = None, -# read_options: _json.ReadOptions | None = None, -# ) -> None: ... -# def equals(self, other: JsonFileFormat) -> bool: ... -# -# class JsonFragmentScanOptions(FragmentScanOptions): -# """ -# Scan-specific options for JSON fragments. -# -# Parameters -# ---------- -# parse_options : pyarrow.json.ParseOptions -# Options regarding JSON parsing. -# read_options : pyarrow.json.ReadOptions -# General read options. -# """ -# -# parse_options: _json.ParseOptions -# read_options: _json.ReadOptions -# def __init__( -# self, parse_options: _json.ParseOptions, read_options: _json.ReadOptions -# ) -> None: ... -# def equals(self, other: JsonFragmentScanOptions) -> bool: ... -# -# class Partitioning(lib._Weakrefable): -# def parse(self, path: str) -> Expression: -# """ -# Parse a path into a partition expression. -# -# Parameters -# ---------- -# path : str -# -# Returns -# ------- -# pyarrow.dataset.Expression -# """ -# def format(self, expr: Expression) -> tuple[str, str]: -# """ -# Convert a filter expression into a tuple of (directory, filename) using -# the current partitioning scheme -# -# Parameters -# ---------- -# expr : pyarrow.dataset.Expression -# -# Returns -# ------- -# tuple[str, str] -# -# Examples -# -------- -# -# Specify the Schema for paths like "/2009/June": -# -# >>> import pyarrow as pa -# >>> import pyarrow.dataset as ds -# >>> import pyarrow.compute as pc -# >>> part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())])) -# >>> part.format((pc.field("year") == 1862) & (pc.field("month") == "Jan")) -# ('1862/Jan', '') -# """ -# @property -# def schema(self) -> lib.Schema: -# """The arrow Schema attached to the partitioning.""" -# -# class PartitioningFactory(lib._Weakrefable): -# @property -# def type_name(self) -> str: ... -# -# class KeyValuePartitioning(Partitioning): -# @property -# def dictionaries(self) -> list[lib.Array | None]: -# """ -# The unique values for each partition field, if available. -# -# Those values are only available if the Partitioning object was -# created through dataset discovery from a PartitioningFactory, or -# if the dictionaries were manually specified in the constructor. -# If no dictionary field is available, this returns an empty list. -# """ -# -# class DirectoryPartitioning(KeyValuePartitioning): -# """ -# A Partitioning based on a specified Schema. -# -# The DirectoryPartitioning expects one segment in the file path for each -# field in the schema (all fields are required to be present). -# For example given schema the path "/2009/11" would -# be parsed to ("year"_ == 2009 and "month"_ == 11). -# -# Parameters -# ---------- -# schema : Schema -# The schema that describes the partitions present in the file path. -# dictionaries : dict[str, Array] -# If the type of any field of `schema` is a dictionary type, the -# corresponding entry of `dictionaries` must be an array containing -# every value which may be taken by the corresponding column or an -# error will be raised in parsing. -# segment_encoding : str, default "uri" -# After splitting paths into segments, decode the segments. Valid -# values are "uri" (URI-decode segments) and "none" (leave as-is). 
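A sketch of the Partitioning.parse()/format() pair described above (illustrative only); the schema and the /2009/June path follow the docstring's own example.

import pyarrow as pa
import pyarrow.dataset as ds

part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())]))
expr = part.parse("/2009/June")          # path -> partition Expression
directory, filename = part.format(expr)  # Expression -> ("2009/June", "") expected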
-# -# Returns -# ------- -# DirectoryPartitioning -# -# Examples -# -------- -# >>> from pyarrow.dataset import DirectoryPartitioning -# >>> partitioning = DirectoryPartitioning( -# ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) -# ... ) -# >>> print(partitioning.parse("/2009/11/")) -# ((year == 2009) and (month == 11)) -# """ -# -# @staticmethod -# def discover( -# field_names: list[str] | None = None, -# infer_dictionary: bool = False, -# max_partition_dictionary_size: int = 0, -# schema: lib.Schema | None = None, -# segment_encoding: Literal["uri", "none"] = "uri", -# ) -> PartitioningFactory: -# """ -# Discover a DirectoryPartitioning. -# -# Parameters -# ---------- -# field_names : list of str -# The names to associate with the values from the subdirectory names. -# If schema is given, will be populated from the schema. -# infer_dictionary : bool, default False -# When inferring a schema for partition fields, yield dictionary -# encoded types instead of plain types. This can be more efficient -# when materializing virtual columns, and Expressions parsed by the -# finished Partitioning will include dictionaries of all unique -# inspected values for each field. -# max_partition_dictionary_size : int, default 0 -# Synonymous with infer_dictionary for backwards compatibility with -# 1.0: setting this to -1 or None is equivalent to passing -# infer_dictionary=True. -# schema : Schema, default None -# Use this schema instead of inferring a schema from partition -# values. Partition values will be validated against this schema -# before accumulation into the Partitioning's dictionary. -# segment_encoding : str, default "uri" -# After splitting paths into segments, decode the segments. Valid -# values are "uri" (URI-decode segments) and "none" (leave as-is). -# -# Returns -# ------- -# PartitioningFactory -# To be used in the FileSystemFactoryOptions. -# """ -# def __init__( -# self, -# schema: lib.Schema, -# dictionaries: dict[str, lib.Array] | None = None, -# segment_encoding: Literal["uri", "none"] = "uri", -# ) -> None: ... -# -# class HivePartitioning(KeyValuePartitioning): -# """ -# A Partitioning for "/$key=$value/" nested directories as found in -# Apache Hive. -# -# Multi-level, directory based partitioning scheme originating from -# Apache Hive with all data files stored in the leaf directories. Data is -# partitioned by static values of a particular column in the schema. -# Partition keys are represented in the form $key=$value in directory names. -# Field order is ignored, as are missing or unrecognized field names. -# -# For example, given schema, a possible -# path would be "/year=2009/month=11/day=15". -# -# Parameters -# ---------- -# schema : Schema -# The schema that describes the partitions present in the file path. -# dictionaries : dict[str, Array] -# If the type of any field of `schema` is a dictionary type, the -# corresponding entry of `dictionaries` must be an array containing -# every value which may be taken by the corresponding column or an -# error will be raised in parsing. -# null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" -# If any field is None then this fallback will be used as a label -# segment_encoding : str, default "uri" -# After splitting paths into segments, decode the segments. Valid -# values are "uri" (URI-decode segments) and "none" (leave as-is). 
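Illustrative sketch (not part of the stubs) of DirectoryPartitioning.discover(): the returned PartitioningFactory is handed to dataset discovery. A data/<year>/<month>/... layout is assumed.

import pyarrow.dataset as ds

factory = ds.DirectoryPartitioning.discover(field_names=["year", "month"])
dataset = ds.dataset("data/", format="parquet", partitioning=factory)
print(dataset.schema)  # partition fields are appended to the file schema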
-# -# Returns -# ------- -# HivePartitioning -# -# Examples -# -------- -# >>> from pyarrow.dataset import HivePartitioning -# >>> partitioning = HivePartitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())])) -# >>> print(partitioning.parse("/year=2009/month=11/")) -# ((year == 2009) and (month == 11)) -# -# """ -# def __init__( -# self, -# schema: lib.Schema, -# dictionaries: dict[str, lib.Array] | None = None, -# null_fallback: str = "__HIVE_DEFAULT_PARTITION__", -# segment_encoding: Literal["uri", "none"] = "uri", -# ) -> None: ... -# @staticmethod -# def discover( -# infer_dictionary: bool = False, -# max_partition_dictionary_size: int = 0, -# null_fallback="__HIVE_DEFAULT_PARTITION__", -# schema: lib.Schema | None = None, -# segment_encoding: Literal["uri", "none"] = "uri", -# ) -> PartitioningFactory: -# """ -# Discover a HivePartitioning. -# -# Parameters -# ---------- -# infer_dictionary : bool, default False -# When inferring a schema for partition fields, yield dictionary -# encoded types instead of plain. This can be more efficient when -# materializing virtual columns, and Expressions parsed by the -# finished Partitioning will include dictionaries of all unique -# inspected values for each field. -# max_partition_dictionary_size : int, default 0 -# Synonymous with infer_dictionary for backwards compatibility with -# 1.0: setting this to -1 or None is equivalent to passing -# infer_dictionary=True. -# null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" -# When inferring a schema for partition fields this value will be -# replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__ -# for compatibility with Spark -# schema : Schema, default None -# Use this schema instead of inferring a schema from partition -# values. Partition values will be validated against this schema -# before accumulation into the Partitioning's dictionary. -# segment_encoding : str, default "uri" -# After splitting paths into segments, decode the segments. Valid -# values are "uri" (URI-decode segments) and "none" (leave as-is). -# -# Returns -# ------- -# PartitioningFactory -# To be used in the FileSystemFactoryOptions. -# """ -# -# class FilenamePartitioning(KeyValuePartitioning): -# """ -# A Partitioning based on a specified Schema. -# -# The FilenamePartitioning expects one segment in the file name for each -# field in the schema (all fields are required to be present) separated -# by '_'. For example given schema the name -# ``"2009_11_"`` would be parsed to ("year" == 2009 and "month" == 11). -# -# Parameters -# ---------- -# schema : Schema -# The schema that describes the partitions present in the file path. -# dictionaries : dict[str, Array] -# If the type of any field of `schema` is a dictionary type, the -# corresponding entry of `dictionaries` must be an array containing -# every value which may be taken by the corresponding column or an -# error will be raised in parsing. -# segment_encoding : str, default "uri" -# After splitting paths into segments, decode the segments. Valid -# values are "uri" (URI-decode segments) and "none" (leave as-is). -# -# Returns -# ------- -# FilenamePartitioning -# -# Examples -# -------- -# >>> from pyarrow.dataset import FilenamePartitioning -# >>> partitioning = FilenamePartitioning( -# ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) -# ... 
) -# >>> print(partitioning.parse("2009_11_data.parquet")) -# ((year == 2009) and (month == 11)) -# """ -# -# def __init__( -# self, -# schema: lib.Schema, -# dictionaries: dict[str, lib.Array] | None = None, -# segment_encoding: Literal["uri", "none"] = "uri", -# ) -> None: ... -# @staticmethod -# def discover( -# field_names: list[str] | None = None, -# infer_dictionary: bool = False, -# schema: lib.Schema | None = None, -# segment_encoding: Literal["uri", "none"] = "uri", -# ) -> PartitioningFactory: -# """ -# Discover a FilenamePartitioning. -# -# Parameters -# ---------- -# field_names : list of str -# The names to associate with the values from the subdirectory names. -# If schema is given, will be populated from the schema. -# infer_dictionary : bool, default False -# When inferring a schema for partition fields, yield dictionary -# encoded types instead of plain types. This can be more efficient -# when materializing virtual columns, and Expressions parsed by the -# finished Partitioning will include dictionaries of all unique -# inspected values for each field. -# schema : Schema, default None -# Use this schema instead of inferring a schema from partition -# values. Partition values will be validated against this schema -# before accumulation into the Partitioning's dictionary. -# segment_encoding : str, default "uri" -# After splitting paths into segments, decode the segments. Valid -# values are "uri" (URI-decode segments) and "none" (leave as-is). -# -# Returns -# ------- -# PartitioningFactory -# To be used in the FileSystemFactoryOptions. -# """ -# -# class DatasetFactory(lib._Weakrefable): -# """ -# DatasetFactory is used to create a Dataset, inspect the Schema -# of the fragments contained in it, and declare a partitioning. -# """ -# -# root_partition: Expression -# def finish(self, schema: lib.Schema | None = None) -> Dataset: -# """ -# Create a Dataset using the inspected schema or an explicit schema -# (if given). -# -# Parameters -# ---------- -# schema : Schema, default None -# The schema to conform the source to. If None, the inspected -# schema is used. -# -# Returns -# ------- -# Dataset -# """ -# def inspect(self) -> lib.Schema: -# """ -# Inspect all data fragments and return a common Schema. -# -# Returns -# ------- -# Schema -# """ -# def inspect_schemas(self) -> list[lib.Schema]: ... -# -# class FileSystemFactoryOptions(lib._Weakrefable): -# """ -# Influences the discovery of filesystem paths. -# -# Parameters -# ---------- -# partition_base_dir : str, optional -# For the purposes of applying the partitioning, paths will be -# stripped of the partition_base_dir. Files not matching the -# partition_base_dir prefix will be skipped for partitioning discovery. -# The ignored files will still be part of the Dataset, but will not -# have partition information. -# partitioning : Partitioning/PartitioningFactory, optional -# Apply the Partitioning to every discovered Fragment. See Partitioning or -# PartitioningFactory documentation. -# exclude_invalid_files : bool, optional (default True) -# If True, invalid files will be excluded (file format specific check). -# This will incur IO for each files in a serial and single threaded -# fashion. Disabling this feature will skip the IO, but unsupported -# files may be present in the Dataset (resulting in an error at scan -# time). -# selector_ignore_prefixes : list, optional -# When discovering from a Selector (and not from an explicit file list), -# ignore files and directories matching any of these prefixes. 
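A hedged sketch of HivePartitioning.discover() as typed above (illustrative only); data/ is an assumed /year=.../month=... layout.

import pyarrow.dataset as ds

# Infer the partition schema from key=value directory names and dictionary-encode it.
factory = ds.HivePartitioning.discover(infer_dictionary=True)
dataset = ds.dataset("data/", format="parquet", partitioning=factory)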
-# By default this is ['.', '_']. -# """ -# -# partitioning: Partitioning -# partitioning_factory: PartitioningFactory -# partition_base_dir: str -# exclude_invalid_files: bool -# selector_ignore_prefixes: list[str] -# -# def __init__( -# self, -# artition_base_dir: str | None = None, -# partitioning: Partitioning | PartitioningFactory | None = None, -# exclude_invalid_files: bool = True, -# selector_ignore_prefixes: list[str] | None = None, -# ) -> None: ... -# -# class FileSystemDatasetFactory(DatasetFactory): -# """ -# Create a DatasetFactory from a list of paths with schema inspection. -# -# Parameters -# ---------- -# filesystem : pyarrow.fs.FileSystem -# Filesystem to discover. -# paths_or_selector : pyarrow.fs.FileSelector or list of path-likes -# Either a Selector object or a list of path-like objects. -# format : FileFormat -# Currently only ParquetFileFormat and IpcFileFormat are supported. -# options : FileSystemFactoryOptions, optional -# Various flags influencing the discovery of filesystem paths. -# """ -# -# def __init__( -# self, -# filesystem: SupportedFileSystem, -# paths_or_selector: FileSelector, -# format: FileFormat, -# options: FileSystemFactoryOptions | None = None, -# ) -> None: ... -# -# class UnionDatasetFactory(DatasetFactory): -# """ -# Provides a way to inspect/discover a Dataset's expected schema before -# materialization. -# -# Parameters -# ---------- -# factories : list of DatasetFactory -# """ -# def __init__(self, factories: list[DatasetFactory]) -> None: ... -# -# _RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch) -# -# class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]): -# """An iterator over a sequence of record batches.""" -# def __iter__(self) -> Self: ... -# def __next__(self) -> _RecordBatchT: ... -# -# class TaggedRecordBatch(NamedTuple): -# """ -# A combination of a record batch and the fragment it came from. -# -# Parameters -# ---------- -# record_batch : RecordBatch -# The record batch. -# fragment : Fragment -# Fragment of the record batch. -# """ -# -# record_batch: lib.RecordBatch -# fragment: Fragment -# -# class TaggedRecordBatchIterator(lib._Weakrefable): -# """An iterator over a sequence of record batches with fragments.""" -# def __iter__(self) -> Self: ... -# def __next__(self) -> TaggedRecordBatch: ... -# -# class Scanner(lib._Weakrefable): -# """A materialized scan operation with context and options bound. -# -# A scanner is the class that glues the scan tasks, data fragments and data -# sources together. -# """ -# @staticmethod -# def from_dataset( -# dataset: Dataset, -# *, -# columns: list[str] | dict[str, Expression] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> Scanner: -# """ -# Create Scanner from Dataset, -# -# Parameters -# ---------- -# dataset : Dataset -# Dataset to scan. -# columns : list[str] or dict[str, Expression], default None -# The columns to project. This can be a list of column names to -# include (order and duplicates will be preserved), or a dictionary -# with {new_column_name: expression} values for more advanced -# projections. 
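Illustrative sketch (not part of the stubs) of the factory path described above: a FileSystemDatasetFactory discovers files via a FileSelector, inspects a common schema, then materializes the Dataset. All paths are assumptions.

import pyarrow.dataset as ds
from pyarrow.fs import FileSelector, LocalFileSystem

factory = ds.FileSystemDatasetFactory(
    LocalFileSystem(),
    FileSelector("data/", recursive=True),
    ds.ParquetFileFormat(),
    ds.FileSystemFactoryOptions(partition_base_dir="data/"),
)
print(factory.inspect())    # common schema inferred from the discovered files
dataset = factory.finish()  # Dataset built with that schema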
-# -# The list of columns or expressions may use the special fields -# `__batch_index` (the index of the batch within the fragment), -# `__fragment_index` (the index of the fragment within the dataset), -# `__last_in_fragment` (whether the batch is last in fragment), and -# `__filename` (the name of the source file or a description of the -# source fragment). -# -# The columns will be passed down to Datasets and corresponding data -# fragments to avoid loading, copying, and deserializing columns -# that will not be required further down the compute chain. -# By default all of the available columns are projected. Raises -# an exception if any of the referenced column names does not exist -# in the dataset's Schema. -# filter : Expression, default None -# Scan will return only the rows matching the filter. -# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# """ -# @staticmethod -# def from_fragment( -# fragment: Fragment, -# *, -# schema: lib.Schema | None = None, -# columns: list[str] | dict[str, Expression] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> Scanner: -# """ -# Create Scanner from Fragment, -# -# Parameters -# ---------- -# fragment : Fragment -# fragment to scan. -# schema : Schema, optional -# The schema of the fragment. -# columns : list[str] or dict[str, Expression], default None -# The columns to project. This can be a list of column names to -# include (order and duplicates will be preserved), or a dictionary -# with {new_column_name: expression} values for more advanced -# projections. -# -# The list of columns or expressions may use the special fields -# `__batch_index` (the index of the batch within the fragment), -# `__fragment_index` (the index of the fragment within the dataset), -# `__last_in_fragment` (whether the batch is last in fragment), and -# `__filename` (the name of the source file or a description of the -# source fragment). 
-# -# The columns will be passed down to Datasets and corresponding data -# fragments to avoid loading, copying, and deserializing columns -# that will not be required further down the compute chain. -# By default all of the available columns are projected. Raises -# an exception if any of the referenced column names does not exist -# in the dataset's Schema. -# filter : Expression, default None -# Scan will return only the rows matching the filter. -# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# """ -# @overload -# @staticmethod -# def from_batches( -# source: Iterator[lib.RecordBatch], -# *, -# schema: lib.Schema, -# columns: list[str] | dict[str, Expression] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> Scanner: ... -# @overload -# @staticmethod -# def from_batches( -# source: RecordBatchReader, -# *, -# columns: list[str] | dict[str, Expression] | None = None, -# filter: Expression | None = None, -# batch_size: int = ..., -# batch_readahead: int = 16, -# fragment_readahead: int = 4, -# fragment_scan_options: FragmentScanOptions | None = None, -# use_threads: bool = True, -# cache_metadata: bool = True, -# memory_pool: lib.MemoryPool | None = None, -# ) -> Scanner: ... -# @staticmethod -# def from_batches(*args, **kwargs): -# """ -# Create a Scanner from an iterator of batches. -# -# This creates a scanner which can be used only once. It is -# intended to support writing a dataset (which takes a scanner) -# from a source which can be read only once (e.g. a -# RecordBatchReader or generator). -# -# Parameters -# ---------- -# source : Iterator or Arrow-compatible stream object -# The iterator of Batches. This can be a pyarrow RecordBatchReader, -# any object that implements the Arrow PyCapsule Protocol for -# streams, or an actual Python iterator of RecordBatches. -# schema : Schema -# The schema of the batches (required when passing a Python -# iterator). 
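A minimal sketch of Scanner.from_dataset() as typed above (illustrative only), reusing the assumed dataset_scanner.parquet file; the n_legs_doubled projection name is invented for the example.

import pyarrow.dataset as ds
import pyarrow.compute as pc

dataset = ds.dataset("dataset_scanner.parquet", format="parquet")
scanner = ds.Scanner.from_dataset(
    dataset,
    columns={"n_legs_doubled": pc.field("n_legs") * 2},  # projection via expressions
    filter=pc.field("year") > 2020,
    batch_size=64 * 1024,
)
table = scanner.to_table()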
-# columns : list[str] or dict[str, Expression], default None -# The columns to project. This can be a list of column names to -# include (order and duplicates will be preserved), or a dictionary -# with {new_column_name: expression} values for more advanced -# projections. -# -# The list of columns or expressions may use the special fields -# `__batch_index` (the index of the batch within the fragment), -# `__fragment_index` (the index of the fragment within the dataset), -# `__last_in_fragment` (whether the batch is last in fragment), and -# `__filename` (the name of the source file or a description of the -# source fragment). -# -# The columns will be passed down to Datasets and corresponding data -# fragments to avoid loading, copying, and deserializing columns -# that will not be required further down the compute chain. -# By default all of the available columns are projected. Raises -# an exception if any of the referenced column names does not exist -# in the dataset's Schema. -# filter : Expression, default None -# Scan will return only the rows matching the filter. -# If possible the predicate will be pushed down to exploit the -# partition information or internal metadata found in the data -# source, e.g. Parquet statistics. Otherwise filters the loaded -# RecordBatches before yielding them. -# batch_size : int, default 131_072 -# The maximum row count for scanned record batches. If scanned -# record batches are overflowing memory then this method can be -# called to reduce their size. -# batch_readahead : int, default 16 -# The number of batches to read ahead in a file. This might not work -# for all file formats. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_readahead : int, default 4 -# The number of files to read ahead. Increasing this number will increase -# RAM usage but could also improve IO utilization. -# fragment_scan_options : FragmentScanOptions, default None -# Options specific to a particular scan and fragment type, which -# can change between different scans of the same dataset. -# use_threads : bool, default True -# If enabled, then maximum parallelism will be used determined by -# the number of available CPU cores. -# cache_metadata : bool, default True -# If enabled, metadata may be cached when scanning to speed up -# repeated scans. -# memory_pool : MemoryPool, default None -# For memory allocations, if required. If not specified, uses the -# default pool. -# """ -# @property -# def dataset_schema(self) -> lib.Schema: -# """The schema with which batches will be read from fragments.""" -# @property -# def projected_schema(self) -> lib.Schema: -# """ -# The materialized schema of the data, accounting for projections. -# -# This is the schema of any data returned from the scanner. -# """ -# def to_batches(self) -> Iterator[lib.RecordBatch]: -# """ -# Consume a Scanner in record batches. -# -# Returns -# ------- -# record_batches : iterator of RecordBatch -# """ -# def scan_batches(self) -> TaggedRecordBatchIterator: -# """ -# Consume a Scanner in record batches with corresponding fragments. -# -# Returns -# ------- -# record_batches : iterator of TaggedRecordBatch -# """ -# def to_table(self) -> lib.Table: -# """ -# Convert a Scanner into a Table. -# -# Use this convenience utility with care. This will serially materialize -# the Scan result in memory before creating the Table. -# -# Returns -# ------- -# Table -# """ -# def take(self, indices: Indices) -> lib.Table: -# """ -# Select rows of data by index. 
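A hedged sketch of the one-shot Scanner.from_batches() path described above (not part of the stubs), feeding a Python iterator of record batches into ds.write_dataset(); the out/ directory and the x column are assumptions.

import pyarrow as pa
import pyarrow.dataset as ds

schema = pa.schema([("x", pa.int64())])
batches = (pa.record_batch([pa.array([i, i + 1])], schema=schema) for i in range(3))
scanner = ds.Scanner.from_batches(batches, schema=schema)  # consumable only once
ds.write_dataset(scanner, "out/", format="parquet")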
-# -# Will only consume as many batches of the underlying dataset as -# needed. Otherwise, this is equivalent to -# ``to_table().take(indices)``. -# -# Parameters -# ---------- -# indices : Array or array-like -# indices of rows to select in the dataset. -# -# Returns -# ------- -# Table -# """ -# def head(self, num_rows: int) -> lib.Table: -# """ -# Load the first N rows of the dataset. -# -# Parameters -# ---------- -# num_rows : int -# The number of rows to load. -# -# Returns -# ------- -# Table -# """ -# def count_rows(self) -> int: -# """ -# Count rows matching the scanner filter. -# -# Returns -# ------- -# count : int -# """ -# def to_reader(self) -> RecordBatchReader: -# """Consume this scanner as a RecordBatchReader. -# -# Returns -# ------- -# RecordBatchReader -# """ -# -# def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: -# """ -# Extract partition keys (equality constraints between a field and a scalar) -# from an expression as a dict mapping the field's name to its value. -# -# NB: All expressions yielded by a HivePartitioning or DirectoryPartitioning -# will be conjunctions of equality conditions and are accessible through this -# function. Other subexpressions will be ignored. -# -# Parameters -# ---------- -# partition_expression : pyarrow.dataset.Expression -# -# Returns -# ------- -# dict -# -# Examples -# -------- -# -# For example, an expression of -# -# is converted to {'part': 'A', 'year': 2016} -# """ -# -# class WrittenFile(lib._Weakrefable): -# """ -# Metadata information about files written as -# part of a dataset write operation -# -# Parameters -# ---------- -# path : str -# Path to the file. -# metadata : pyarrow.parquet.FileMetaData, optional -# For Parquet files, the Parquet file metadata. -# size : int -# The size of the file in bytes. -# """ -# def __init__(self, path: str, metadata: _parquet.FileMetaData | None, size: int) -> None: ... -# -# def _filesystemdataset_write( -# data: Scanner, -# base_dir: StrPath, -# basename_template: str, -# filesystem: SupportedFileSystem, -# partitioning: Partitioning, -# file_options: FileWriteOptions, -# max_partitions: int, -# file_visitor: Callable[[str], None], -# existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"], -# max_open_files: int, -# max_rows_per_file: int, -# min_rows_per_group: int, -# max_rows_per_group: int, -# create_dir: bool, -# ): ... -# -# class _ScanNodeOptions(ExecNodeOptions): -# def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ... -# -# class ScanNodeOptions(_ScanNodeOptions): -# """ -# A Source node which yields batches from a Dataset scan. -# -# This is the option class for the "scan" node factory. -# -# This node is capable of applying pushdown projections or filters -# to the file readers which reduce the amount of data that needs to -# be read (if supported by the file format). But note that this does not -# construct associated filter or project nodes to perform the final -# filtering or projection. Rather, you may supply the same filter -# expression or projection to the scan node that you also supply -# to the filter or project node. -# -# Yielded batches will be augmented with fragment/batch indices when -# implicit_ordering=True to enable stable ordering for simple ExecPlans. -# -# Parameters -# ---------- -# dataset : pyarrow.dataset.Dataset -# The table which acts as the data source. -# **kwargs : dict, optional -# Scan options. See `Scanner.from_dataset` for possible arguments. 
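Illustrative sketch (not part of the stubs) of get_partition_keys() as described above, applied to the partition expressions of a hive-partitioned dataset; data/ is an assumed /year=.../month=... layout.

import pyarrow.dataset as ds

dataset = ds.dataset("data/", format="parquet", partitioning="hive")
for fragment in dataset.get_fragments():
    keys = ds.get_partition_keys(fragment.partition_expression)
    print(fragment.path, keys)  # e.g. {'year': 2009, 'month': 11}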
-# require_sequenced_output : bool, default False -# Batches are yielded sequentially, like single-threaded -# implicit_ordering : bool, default False -# Preserve implicit ordering of data. -# """ -# -# def __init__( -# self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs -# ) -> None: ... +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import ( + IO, + Any, + Callable, + Generic, + Iterator, + Literal, + NamedTuple, + TypeVar, + overload, +) + +from _typeshed import StrPath + +from . import csv, _json, _parquet, lib +from ._fs import FileSelector, FileSystem, SupportedFileSystem +from ._stubs_typing import Indices, JoinType, Order +from .acero import ExecNodeOptions +from .compute import Expression +from .ipc import IpcWriteOptions, RecordBatchReader + +class Dataset(lib._Weakrefable): + """ + Collection of data fragments and potentially child datasets. + + Arrow Datasets allow you to query against data that has been split across + multiple files. This sharding of data may indicate partitioning, which + can accelerate queries that only touch some partitions (files). + """ + + @property + def partition_expression(self) -> Expression: + """ + An Expression which evaluates to true for all data viewed by this + Dataset. + """ + def replace_schema(self, schema: lib.Schema) -> None: + """ + Return a copy of this Dataset with a different schema. + + The copy will view the same Fragments. If the new schema is not + compatible with the original dataset's schema then an error will + be raised. + + Parameters + ---------- + schema : Schema + The new dataset schema. + """ + def get_fragments(self, filter: Expression | None = None): + """Returns an iterator over the fragments in this dataset. + + Parameters + ---------- + filter : Expression, default None + Return fragments matching the optional filter, either using the + partition_expression or internal information like Parquet's + statistics. + + Returns + ------- + fragments : iterator of Fragment + """ + def scanner( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Build a scan operation against the dataset. + + Data is not loaded immediately. Instead, this produces a Scanner, + which exposes further operations (e.g. loading all data as a + table, counting rows). + + See the :meth:`Scanner.from_dataset` method for further information. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. 
+ By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + scanner : Scanner + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "dataset_scanner.parquet") + + >>> import pyarrow.dataset as ds + >>> dataset = ds.dataset("dataset_scanner.parquet") + + Selecting a subset of the columns: + + >>> dataset.scanner(columns=["year", "n_legs"]).to_table() + pyarrow.Table + year: int64 + n_legs: int64 + ---- + year: [[2020,2022,2021,2022,2019,2021]] + n_legs: [[2,2,4,4,5,100]] + + Projecting selected columns using an expression: + + >>> dataset.scanner( + ... columns={ + ... "n_legs_uint": ds.field("n_legs").cast("uint8"), + ... } + ... ).to_table() + pyarrow.Table + n_legs_uint: uint8 + ---- + n_legs_uint: [[2,2,4,4,5,100]] + + Filtering rows while scanning: + + >>> dataset.scanner(filter=ds.field("year") > 2020).to_table() + pyarrow.Table + year: int64 + n_legs: int64 + animal: string + ---- + year: [[2022,2021,2022,2021]] + n_legs: [[2,4,4,100]] + animal: [["Parrot","Dog","Horse","Centipede"]] + """ + def to_batches( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: + """ + Read the dataset as materialized record batches. + + Parameters + ---------- + columns : list of str, default None + The columns to project. 
This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + record_batches : iterator of RecordBatch + """ + def to_table( + self, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Read the dataset to an Arrow table. + + Note that this method reads all the selected data from the dataset + into memory. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. 
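A hedged sketch of Dataset.to_batches() as typed here (illustrative only, not part of the stubs), streaming instead of materializing; it reuses the dataset_scanner.parquet example from the scanner() docstring above.

import pyarrow.dataset as ds
import pyarrow.compute as pc

dataset = ds.dataset("dataset_scanner.parquet", format="parquet")
total = 0
for batch in dataset.to_batches(columns=["n_legs"], filter=pc.field("year") > 2020):
    total += batch.num_rows  # batches are produced lazily, one at a time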
+ + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Select rows of data by index. + + Parameters + ---------- + indices : Array or array-like + indices of rows to select in the dataset. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. 
+ By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Load the first N rows of the dataset. + + Parameters + ---------- + num_rows : int + The number of rows to load. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. 
If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + def count_rows( + self, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: + """ + Count rows matching the scanner filter. + + Parameters + ---------- + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + count : int + """ + @property + def schema(self) -> lib.Schema: + """The common schema of the full Dataset""" + def filter(self, expression: Expression) -> Self: + """ + Apply a row filter to the dataset. + + Parameters + ---------- + expression : Expression + The filter that should be applied to the dataset. + + Returns + ------- + Dataset + """ + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> InMemoryDataset: + """ + Sort the Dataset by one or multiple columns. 
+
+        Parameters
+        ----------
+        sorting : str or list[tuple(name, order)]
+            Name of the column to use to sort (ascending), or
+            a list of multiple sorting conditions where
+            each entry is a tuple with column name
+            and sorting order ("ascending" or "descending")
+        **kwargs : dict, optional
+            Additional sorting options.
+            As allowed by :class:`SortOptions`
+
+        Returns
+        -------
+        InMemoryDataset
+            A new dataset sorted according to the sort keys.
+        """
+    def join(
+        self,
+        right_dataset: Dataset,
+        keys: str | list[str],
+        right_keys: str | list[str] | None = None,
+        join_type: JoinType = "left outer",
+        left_suffix: str | None = None,
+        right_suffix: str | None = None,
+        coalesce_keys: bool = True,
+        use_threads: bool = True,
+    ) -> InMemoryDataset:
+        """
+        Perform a join between this dataset and another one.
+
+        Result of the join will be a new dataset, where further
+        operations can be applied.
+
+        Parameters
+        ----------
+        right_dataset : dataset
+            The dataset to join to the current one, acting as the right dataset
+            in the join operation.
+        keys : str or list[str]
+            The columns from current dataset that should be used as keys
+            of the join operation left side.
+        right_keys : str or list[str], default None
+            The columns from the right_dataset that should be used as keys
+            on the join operation right side.
+            When ``None`` use the same key names as the left dataset.
+        join_type : str, default "left outer"
+            The kind of join that should be performed, one of
+            ("left semi", "right semi", "left anti", "right anti",
+            "inner", "left outer", "right outer", "full outer")
+        left_suffix : str, default None
+            Which suffix to add to left column names. This prevents confusion
+            when the columns in left and right datasets have colliding names.
+        right_suffix : str, default None
+            Which suffix to add to the right column names. This prevents confusion
+            when the columns in left and right datasets have colliding names.
+        coalesce_keys : bool, default True
+            If the duplicated keys should be omitted from one of the sides
+            in the join result.
+        use_threads : bool, default True
+            Whether to use multithreading or not.
+
+        Returns
+        -------
+        InMemoryDataset
+        """
+    def join_asof(
+        self,
+        right_dataset: Dataset,
+        on: str,
+        by: str | list[str],
+        tolerance: int,
+        right_on: str | list[str] | None = None,
+        right_by: str | list[str] | None = None,
+    ) -> InMemoryDataset:
+        """
+        Perform an asof join between this dataset and another one.
+
+        This is similar to a left-join except that we match on nearest key rather
+        than equal keys. Both datasets must be sorted by the key. This type of join
+        is most useful for time series data that are not perfectly aligned.
+
+        Optionally match on equivalent keys with "by" before searching with "on".
+
+        Result of the join will be a new Dataset, where further
+        operations can be applied.
+
+        Parameters
+        ----------
+        right_dataset : dataset
+            The dataset to join to the current one, acting as the right dataset
+            in the join operation.
+        on : str
+            The column from current dataset that should be used as the "on" key
+            of the join operation left side.
+
+            An inexact match is used on the "on" key, i.e. a row is considered a
+            match if and only if left_on - tolerance <= right_on <= left_on.
+
+            The input table must be sorted by the "on" key. Must be a single
+            field of a common type.
+
+            Currently, the "on" key must be an integer, date, or timestamp type.
+        by : str or list[str]
+            The columns from current dataset that should be used as the keys
+            of the join operation left side.
+            The join operation is then done only for the matches in these
+            columns.
+        tolerance : int
+            The tolerance for inexact "on" key matching. A right row is considered
+            a match with the left row if `right.on - left.on <= tolerance`. The
+            `tolerance` may be:
+
+            - negative, in which case a past-as-of-join occurs;
+            - or positive, in which case a future-as-of-join occurs;
+            - or zero, in which case an exact-as-of-join occurs.
+
+            The tolerance is interpreted in the same units as the "on" key.
+        right_on : str or list[str], default None
+            The columns from the right_dataset that should be used as the on key
+            on the join operation right side.
+            When ``None`` use the same key name as the left dataset.
+        right_by : str or list[str], default None
+            The columns from the right_dataset that should be used as by keys
+            on the join operation right side.
+            When ``None`` use the same key names as the left dataset.
+
+        Returns
+        -------
+        InMemoryDataset
+        """
+
+class InMemoryDataset(Dataset):
+    """
+    A Dataset wrapping in-memory data.
+
+    Parameters
+    ----------
+    source : RecordBatch, Table, list, tuple
+        The data for this dataset. Can be a RecordBatch, Table, list of
+        RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader.
+        If an iterable is provided, the schema must also be provided.
+    schema : Schema, optional
+        Only required if passing an iterable as the source.
+    """
+
+class UnionDataset(Dataset):
+    """
+    A Dataset wrapping child datasets.
+
+    Children's schemas must agree with the provided schema.
+
+    Parameters
+    ----------
+    schema : Schema
+        A known schema to conform to.
+    children : list of Dataset
+        One or more input children
+    """
+
+    @property
+    def children(self) -> list[Dataset]: ...
+
+class FileSystemDataset(Dataset):
+    """
+    A Dataset of file fragments.
+
+    A FileSystemDataset is composed of one or more FileFragment.
+
+    Parameters
+    ----------
+    fragments : list[Fragment]
+        List of fragments to consume.
+    schema : Schema
+        The top-level schema of the Dataset.
+    format : FileFormat
+        File format of the fragments, currently only ParquetFileFormat,
+        IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported.
+    filesystem : FileSystem
+        FileSystem of the fragments.
+    root_partition : Expression, optional
+        The top-level partition of the Dataset.
+    """
+
+    def __init__(
+        self,
+        fragments: list[Fragment],
+        schema: lib.Schema,
+        format: FileFormat,
+        filesystem: SupportedFileSystem | None = None,
+        root_partition: Expression | None = None,
+    ) -> None: ...
+    @classmethod
+    def from_paths(
+        cls,
+        paths: list[str],
+        schema: lib.Schema | None = None,
+        format: FileFormat | None = None,
+        filesystem: SupportedFileSystem | None = None,
+        partitions: list[Expression] | None = None,
+        root_partition: Expression | None = None,
+    ) -> FileSystemDataset:
+        """
+        A Dataset created from a list of paths on a particular filesystem.
+
+        Parameters
+        ----------
+        paths : list of str
+            List of file paths to create the fragments from.
+        schema : Schema
+            The top-level schema of the Dataset.
+        format : FileFormat
+            File format to create fragments from, currently only
+            ParquetFileFormat, IpcFileFormat, CsvFileFormat, and JsonFileFormat
+            are supported.
+        filesystem : FileSystem
+            The filesystem which files are from.
+        partitions : list[Expression], optional
+            Attach additional partition information for the file paths.
+        root_partition : Expression, optional
+            The top-level partition of the Dataset.
+        """
+    @property
+    def filesystem(self) -> FileSystem: ...
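A minimal usage sketch of the InMemoryDataset and Dataset.join APIs documented above; the table contents, column names, and key choice are invented for illustration and are not part of this patch:

import pyarrow as pa
import pyarrow.dataset as ds

# Two small in-memory datasets built from literal tables (illustrative data only).
left = ds.InMemoryDataset(pa.table({"id": [1, 2, 3], "value": [10, 20, 30]}))
right = ds.InMemoryDataset(pa.table({"id": [2, 3, 4], "label": ["b", "c", "d"]}))

# Dataset.join returns a new InMemoryDataset to which further operations can be applied.
joined = left.join(right, keys="id", join_type="inner")
print(joined.to_table())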
+ @property + def partitioning(self) -> Partitioning | None: + """ + The partitioning of the Dataset source, if discovered. + + If the FileSystemDataset is created using the ``dataset()`` factory + function with a partitioning specified, this will return the + finalized Partitioning object from the dataset discovery. In all + other cases, this returns None. + """ + @property + def files(self) -> list[str]: + """List of the files""" + @property + def format(self) -> FileFormat: + """The FileFormat of this source.""" + +class FileWriteOptions(lib._Weakrefable): + @property + def format(self) -> FileFormat: ... + +class FileFormat(lib._Weakrefable): + def inspect( + self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None + ) -> lib.Schema: + """ + Infer the schema of a file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to infer a schema from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + + Returns + ------- + schema : Schema + The schema inferred from the file + """ + def make_fragment( + self, + file: StrPath | IO, + filesystem: SupportedFileSystem | None = None, + partition_expression: Expression | None = None, + *, + file_size: int | None = None, + ) -> Fragment: + """ + Make a FileFragment from a given file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to make a fragment from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + partition_expression : Expression, optional + An expression that is guaranteed true for all rows in the fragment. Allows + fragment to be potentially skipped while scanning with a filter. + file_size : int, optional + The size of the file in bytes. Can improve performance with high-latency filesystems + when file size needs to be known before reading. + + Returns + ------- + fragment : Fragment + The file fragment + """ + def make_write_options(self) -> FileWriteOptions: ... + @property + def default_extname(self) -> str: ... + @property + def default_fragment_scan_options(self) -> FragmentScanOptions: ... + @default_fragment_scan_options.setter + def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... + +class Fragment(lib._Weakrefable): + """Fragment of data from a Dataset.""" + @property + def physical_schema(self) -> lib.Schema: + """Return the physical schema of this Fragment. This schema can be + different from the dataset read schema.""" + @property + def partition_expression(self) -> Expression: + """An Expression which evaluates to true for all data viewed by this + Fragment. + """ + def scanner( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Build a scan operation against the fragment. + + Data is not loaded immediately. Instead, this produces a Scanner, + which exposes further operations (e.g. loading all data as a + table, counting rows). + + Parameters + ---------- + schema : Schema + Schema to use for scanning. 
This is used to unify a Fragment to + its Dataset's schema. If not specified this will use the + Fragment's physical schema which might differ for each Fragment. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + scanner : Scanner + """ + def to_batches( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: + """ + Read the fragment as materialized record batches. + + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. 
+ + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + record_batches : iterator of RecordBatch + """ + def to_table( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Convert this Fragment into a Table. + + Use this convenience utility with care. This will serially materialize + the Scan result in memory before creating the Table. + + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). 
+ + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Select rows of data by index. + + Parameters + ---------- + indices : Array or array-like + The indices of row to select in the dataset. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. 
+ If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + Table + """ + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Load the first N rows of the fragment. + + Parameters + ---------- + num_rows : int + The number of rows to load. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. 
Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + Table + """ + def count_rows( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: + """ + Count rows matching the scanner filter. + + Parameters + ---------- + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + count : int + """ + +class FileFragment(Fragment): + """A Fragment representing a data file.""" + + def open(self) -> lib.NativeFile: + """ + Open a NativeFile of the buffer or file viewed by this fragment. + """ + @property + def path(self) -> str: + """ + The path of the data file viewed by this fragment, if it views a + file. If instead it views a buffer, this will be "". + """ + @property + def filesystem(self) -> FileSystem: + """ + The FileSystem containing the data file viewed by this fragment, if + it views a file. If instead it views a buffer, this will be None. + """ + @property + def buffer(self) -> lib.Buffer: + """ + The buffer viewed by this fragment, if it views a buffer. If + instead it views a file, this will be None. 
+ """ + @property + def format(self) -> FileFormat: + """ + The format of the data file viewed by this fragment. + """ + +class FragmentScanOptions(lib._Weakrefable): + """Scan options specific to a particular fragment and scan operation.""" + + @property + def type_name(self) -> str: ... + +class IpcFileWriteOptions(FileWriteOptions): + @property + def write_options(self) -> IpcWriteOptions: ... + @write_options.setter + def write_options(self, write_options: IpcWriteOptions) -> None: ... + +class IpcFileFormat(FileFormat): + def equals(self, other: IpcFileFormat) -> bool: ... + def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ... + @property + def default_extname(self) -> str: ... + +class FeatherFileFormat(IpcFileFormat): ... + +class CsvFileFormat(FileFormat): + """ + FileFormat for CSV files. + + Parameters + ---------- + parse_options : pyarrow.csv.ParseOptions + Options regarding CSV parsing. + default_fragment_scan_options : CsvFragmentScanOptions + Default options for fragments scan. + convert_options : pyarrow.csv.ConvertOptions + Options regarding value conversion. + read_options : pyarrow.csv.ReadOptions + General read options. + """ + def __init__( + self, + parse_options: csv.ParseOptions | None = None, + default_fragment_scan_options: CsvFragmentScanOptions | None = None, + convert_options: csv.ConvertOptions | None = None, + read_options: csv.ReadOptions | None = None, + ) -> None: ... + def make_write_options(self) -> csv.WriteOptions: ... # type: ignore[override] + @property + def parse_options(self) -> csv.ParseOptions: ... + @parse_options.setter + def parse_options(self, parse_options: csv.ParseOptions) -> None: ... + def equals(self, other: CsvFileFormat) -> bool: ... + +class CsvFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for CSV fragments. + + Parameters + ---------- + convert_options : pyarrow.csv.ConvertOptions + Options regarding value conversion. + read_options : pyarrow.csv.ReadOptions + General read options. + """ + + convert_options: csv.ConvertOptions + read_options: csv.ReadOptions + + def __init__( + self, convert_options: csv.ConvertOptions, read_options: csv.ReadOptions + ) -> None: ... + def equals(self, other: CsvFragmentScanOptions) -> bool: ... + +class CsvFileWriteOptions(FileWriteOptions): + write_options: csv.WriteOptions + +class JsonFileFormat(FileFormat): + """ + FileFormat for JSON files. + + Parameters + ---------- + default_fragment_scan_options : JsonFragmentScanOptions + Default options for fragments scan. + parse_options : pyarrow.json.ParseOptions + Options regarding json parsing. + read_options : pyarrow.json.ReadOptions + General read options. + """ + def __init__( + self, + default_fragment_scan_options: JsonFragmentScanOptions | None = None, + parse_options: _json.ParseOptions | None = None, + read_options: _json.ReadOptions | None = None, + ) -> None: ... + def equals(self, other: JsonFileFormat) -> bool: ... + +class JsonFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for JSON fragments. + + Parameters + ---------- + parse_options : pyarrow.json.ParseOptions + Options regarding JSON parsing. + read_options : pyarrow.json.ReadOptions + General read options. + """ + + parse_options: _json.ParseOptions + read_options: _json.ReadOptions + def __init__( + self, parse_options: _json.ParseOptions, read_options: _json.ReadOptions + ) -> None: ... + def equals(self, other: JsonFragmentScanOptions) -> bool: ... 
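A hedged sketch of how the CSV classes above are typically combined with the dataset API; the "my_csv_data/" directory, the delimiter, and the option values are assumptions for the example, not part of this patch:

import pyarrow.csv as csv
import pyarrow.dataset as ds

# A CSV format with explicit parse behaviour, attached at dataset construction time.
csv_format = ds.CsvFileFormat(parse_options=csv.ParseOptions(delimiter=";"))
dataset = ds.dataset("my_csv_data/", format=csv_format)

# Per-scan options can be supplied separately without rebuilding the format.
scan_options = ds.CsvFragmentScanOptions(
    convert_options=csv.ConvertOptions(strings_can_be_null=True),
    read_options=csv.ReadOptions(block_size=1 << 20),
)
table = dataset.to_table(fragment_scan_options=scan_options)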
+ +class Partitioning(lib._Weakrefable): + def parse(self, path: str) -> Expression: + """ + Parse a path into a partition expression. + + Parameters + ---------- + path : str + + Returns + ------- + pyarrow.dataset.Expression + """ + def format(self, expr: Expression) -> tuple[str, str]: + """ + Convert a filter expression into a tuple of (directory, filename) using + the current partitioning scheme + + Parameters + ---------- + expr : pyarrow.dataset.Expression + + Returns + ------- + tuple[str, str] + + Examples + -------- + + Specify the Schema for paths like "/2009/June": + + >>> import pyarrow as pa + >>> import pyarrow.dataset as ds + >>> import pyarrow.compute as pc + >>> part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())])) + >>> part.format((pc.field("year") == 1862) & (pc.field("month") == "Jan")) + ('1862/Jan', '') + """ + @property + def schema(self) -> lib.Schema: + """The arrow Schema attached to the partitioning.""" + +class PartitioningFactory(lib._Weakrefable): + @property + def type_name(self) -> str: ... + +class KeyValuePartitioning(Partitioning): + @property + def dictionaries(self) -> list[lib.Array | None]: + """ + The unique values for each partition field, if available. + + Those values are only available if the Partitioning object was + created through dataset discovery from a PartitioningFactory, or + if the dictionaries were manually specified in the constructor. + If no dictionary field is available, this returns an empty list. + """ + +class DirectoryPartitioning(KeyValuePartitioning): + """ + A Partitioning based on a specified Schema. + + The DirectoryPartitioning expects one segment in the file path for each + field in the schema (all fields are required to be present). + For example given schema the path "/2009/11" would + be parsed to ("year"_ == 2009 and "month"_ == 11). + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. + dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + DirectoryPartitioning + + Examples + -------- + >>> from pyarrow.dataset import DirectoryPartitioning + >>> partitioning = DirectoryPartitioning( + ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) + ... ) + >>> print(partitioning.parse("/2009/11/")) + ((year == 2009) and (month == 11)) + """ + + @staticmethod + def discover( + field_names: list[str] | None = None, + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: + """ + Discover a DirectoryPartitioning. + + Parameters + ---------- + field_names : list of str + The names to associate with the values from the subdirectory names. + If schema is given, will be populated from the schema. + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain types. 
This can be more efficient + when materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + max_partition_dictionary_size : int, default 0 + Synonymous with infer_dictionary for backwards compatibility with + 1.0: setting this to -1 or None is equivalent to passing + infer_dictionary=True. + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. + """ + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + +class HivePartitioning(KeyValuePartitioning): + """ + A Partitioning for "/$key=$value/" nested directories as found in + Apache Hive. + + Multi-level, directory based partitioning scheme originating from + Apache Hive with all data files stored in the leaf directories. Data is + partitioned by static values of a particular column in the schema. + Partition keys are represented in the form $key=$value in directory names. + Field order is ignored, as are missing or unrecognized field names. + + For example, given schema, a possible + path would be "/year=2009/month=11/day=15". + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. + dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. + null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" + If any field is None then this fallback will be used as a label + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + HivePartitioning + + Examples + -------- + >>> from pyarrow.dataset import HivePartitioning + >>> partitioning = HivePartitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())])) + >>> print(partitioning.parse("/year=2009/month=11/")) + ((year == 2009) and (month == 11)) + + """ + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + null_fallback: str = "__HIVE_DEFAULT_PARTITION__", + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + @staticmethod + def discover( + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + null_fallback="__HIVE_DEFAULT_PARTITION__", + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: + """ + Discover a HivePartitioning. + + Parameters + ---------- + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain. This can be more efficient when + materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. 
+ max_partition_dictionary_size : int, default 0 + Synonymous with infer_dictionary for backwards compatibility with + 1.0: setting this to -1 or None is equivalent to passing + infer_dictionary=True. + null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" + When inferring a schema for partition fields this value will be + replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__ + for compatibility with Spark + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. + """ + +class FilenamePartitioning(KeyValuePartitioning): + """ + A Partitioning based on a specified Schema. + + The FilenamePartitioning expects one segment in the file name for each + field in the schema (all fields are required to be present) separated + by '_'. For example given schema the name + ``"2009_11_"`` would be parsed to ("year" == 2009 and "month" == 11). + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. + dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + FilenamePartitioning + + Examples + -------- + >>> from pyarrow.dataset import FilenamePartitioning + >>> partitioning = FilenamePartitioning( + ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) + ... ) + >>> print(partitioning.parse("2009_11_data.parquet")) + ((year == 2009) and (month == 11)) + """ + + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + @staticmethod + def discover( + field_names: list[str] | None = None, + infer_dictionary: bool = False, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: + """ + Discover a FilenamePartitioning. + + Parameters + ---------- + field_names : list of str + The names to associate with the values from the subdirectory names. + If schema is given, will be populated from the schema. + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain types. This can be more efficient + when materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). 
+
+        Returns
+        -------
+        PartitioningFactory
+            To be used in the FileSystemFactoryOptions.
+        """
+
+class DatasetFactory(lib._Weakrefable):
+    """
+    DatasetFactory is used to create a Dataset, inspect the Schema
+    of the fragments contained in it, and declare a partitioning.
+    """
+
+    root_partition: Expression
+    def finish(self, schema: lib.Schema | None = None) -> Dataset:
+        """
+        Create a Dataset using the inspected schema or an explicit schema
+        (if given).
+
+        Parameters
+        ----------
+        schema : Schema, default None
+            The schema to conform the source to. If None, the inspected
+            schema is used.
+
+        Returns
+        -------
+        Dataset
+        """
+    def inspect(self) -> lib.Schema:
+        """
+        Inspect all data fragments and return a common Schema.
+
+        Returns
+        -------
+        Schema
+        """
+    def inspect_schemas(self) -> list[lib.Schema]: ...
+
+class FileSystemFactoryOptions(lib._Weakrefable):
+    """
+    Influences the discovery of filesystem paths.
+
+    Parameters
+    ----------
+    partition_base_dir : str, optional
+        For the purposes of applying the partitioning, paths will be
+        stripped of the partition_base_dir. Files not matching the
+        partition_base_dir prefix will be skipped for partitioning discovery.
+        The ignored files will still be part of the Dataset, but will not
+        have partition information.
+    partitioning : Partitioning/PartitioningFactory, optional
+        Apply the Partitioning to every discovered Fragment. See Partitioning or
+        PartitioningFactory documentation.
+    exclude_invalid_files : bool, optional (default True)
+        If True, invalid files will be excluded (file format specific check).
+        This will incur IO for each file in a serial and single threaded
+        fashion. Disabling this feature will skip the IO, but unsupported
+        files may be present in the Dataset (resulting in an error at scan
+        time).
+    selector_ignore_prefixes : list, optional
+        When discovering from a Selector (and not from an explicit file list),
+        ignore files and directories matching any of these prefixes.
+        By default this is ['.', '_'].
+    """
+
+    partitioning: Partitioning
+    partitioning_factory: PartitioningFactory
+    partition_base_dir: str
+    exclude_invalid_files: bool
+    selector_ignore_prefixes: list[str]
+
+    def __init__(
+        self,
+        partition_base_dir: str | None = None,
+        partitioning: Partitioning | PartitioningFactory | None = None,
+        exclude_invalid_files: bool = True,
+        selector_ignore_prefixes: list[str] | None = None,
+    ) -> None: ...
+
+class FileSystemDatasetFactory(DatasetFactory):
+    """
+    Create a DatasetFactory from a list of paths with schema inspection.
+
+    Parameters
+    ----------
+    filesystem : pyarrow.fs.FileSystem
+        Filesystem to discover.
+    paths_or_selector : pyarrow.fs.FileSelector or list of path-likes
+        Either a Selector object or a list of path-like objects.
+    format : FileFormat
+        Currently only ParquetFileFormat and IpcFileFormat are supported.
+    options : FileSystemFactoryOptions, optional
+        Various flags influencing the discovery of filesystem paths.
+    """
+
+    def __init__(
+        self,
+        filesystem: SupportedFileSystem,
+        paths_or_selector: FileSelector,
+        format: FileFormat,
+        options: FileSystemFactoryOptions | None = None,
+    ) -> None: ...
+
+class UnionDatasetFactory(DatasetFactory):
+    """
+    Provides a way to inspect/discover a Dataset's expected schema before
+    materialization.
+
+    Parameters
+    ----------
+    factories : list of DatasetFactory
+    """
+    def __init__(self, factories: list[DatasetFactory]) -> None: ...
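To make the factory workflow above concrete, a hedged sketch of explicit discovery with FileSystemDatasetFactory; the local filesystem, the "my_data" directory, and the Hive-style partitioning are assumptions for the example, not part of this patch:

import pyarrow.dataset as ds
from pyarrow import fs

filesystem = fs.LocalFileSystem()
selector = fs.FileSelector("my_data", recursive=True)

# Options controlling discovery, including partitioning inference.
options = ds.FileSystemFactoryOptions(partition_base_dir="my_data")
options.partitioning_factory = ds.HivePartitioning.discover()

factory = ds.FileSystemDatasetFactory(
    filesystem, selector, ds.ParquetFileFormat(), options
)
print(factory.inspect())    # common schema inferred across the discovered fragments
dataset = factory.finish()  # materialize the FileSystemDataset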
+ +_RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch) + +class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]): + """An iterator over a sequence of record batches.""" + def __iter__(self) -> Self: ... + def __next__(self) -> _RecordBatchT: ... + +class TaggedRecordBatch(NamedTuple): + """ + A combination of a record batch and the fragment it came from. + + Parameters + ---------- + record_batch : RecordBatch + The record batch. + fragment : Fragment + Fragment of the record batch. + """ + + record_batch: lib.RecordBatch + fragment: Fragment + +class TaggedRecordBatchIterator(lib._Weakrefable): + """An iterator over a sequence of record batches with fragments.""" + def __iter__(self) -> Self: ... + def __next__(self) -> TaggedRecordBatch: ... + +class Scanner(lib._Weakrefable): + """A materialized scan operation with context and options bound. + + A scanner is the class that glues the scan tasks, data fragments and data + sources together. + """ + @staticmethod + def from_dataset( + dataset: Dataset, + *, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Create Scanner from Dataset, + + Parameters + ---------- + dataset : Dataset + Dataset to scan. + columns : list[str] or dict[str, Expression], default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. 
+ fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + @staticmethod + def from_fragment( + fragment: Fragment, + *, + schema: lib.Schema | None = None, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Create Scanner from Fragment, + + Parameters + ---------- + fragment : Fragment + fragment to scan. + schema : Schema, optional + The schema of the fragment. + columns : list[str] or dict[str, Expression], default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. 
+ memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + @staticmethod + def from_batches( + source: Iterator[lib.RecordBatch] | RecordBatchReader, + *, + schema: lib.Schema | None = None, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Create a Scanner from an iterator of batches. + + This creates a scanner which can be used only once. It is + intended to support writing a dataset (which takes a scanner) + from a source which can be read only once (e.g. a + RecordBatchReader or generator). + + Parameters + ---------- + source : Iterator or Arrow-compatible stream object + The iterator of Batches. This can be a pyarrow RecordBatchReader, + any object that implements the Arrow PyCapsule Protocol for + streams, or an actual Python iterator of RecordBatches. + schema : Schema + The schema of the batches (required when passing a Python + iterator). + columns : list[str] or dict[str, Expression], default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. 
+ cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + @property + def dataset_schema(self) -> lib.Schema: + """The schema with which batches will be read from fragments.""" + @property + def projected_schema(self) -> lib.Schema: + """ + The materialized schema of the data, accounting for projections. + + This is the schema of any data returned from the scanner. + """ + def to_batches(self) -> Iterator[lib.RecordBatch]: + """ + Consume a Scanner in record batches. + + Returns + ------- + record_batches : iterator of RecordBatch + """ + def scan_batches(self) -> TaggedRecordBatchIterator: + """ + Consume a Scanner in record batches with corresponding fragments. + + Returns + ------- + record_batches : iterator of TaggedRecordBatch + """ + def to_table(self) -> lib.Table: + """ + Convert a Scanner into a Table. + + Use this convenience utility with care. This will serially materialize + the Scan result in memory before creating the Table. + + Returns + ------- + Table + """ + def take(self, indices: Indices) -> lib.Table: + """ + Select rows of data by index. + + Will only consume as many batches of the underlying dataset as + needed. Otherwise, this is equivalent to + ``to_table().take(indices)``. + + Parameters + ---------- + indices : Array or array-like + indices of rows to select in the dataset. + + Returns + ------- + Table + """ + def head(self, num_rows: int) -> lib.Table: + """ + Load the first N rows of the dataset. + + Parameters + ---------- + num_rows : int + The number of rows to load. + + Returns + ------- + Table + """ + def count_rows(self) -> int: + """ + Count rows matching the scanner filter. + + Returns + ------- + count : int + """ + def to_reader(self) -> RecordBatchReader: + """Consume this scanner as a RecordBatchReader. + + Returns + ------- + RecordBatchReader + """ + +def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: + """ + Extract partition keys (equality constraints between a field and a scalar) + from an expression as a dict mapping the field's name to its value. + + NB: All expressions yielded by a HivePartitioning or DirectoryPartitioning + will be conjunctions of equality conditions and are accessible through this + function. Other subexpressions will be ignored. + + Parameters + ---------- + partition_expression : pyarrow.dataset.Expression + + Returns + ------- + dict + + Examples + -------- + + For example, an expression of + + is converted to {'part': 'A', 'year': 2016} + """ + +class WrittenFile(lib._Weakrefable): + """ + Metadata information about files written as + part of a dataset write operation + + Parameters + ---------- + path : str + Path to the file. + metadata : pyarrow.parquet.FileMetaData, optional + For Parquet files, the Parquet file metadata. + size : int + The size of the file in bytes. + """ + def __init__(self, path: str, metadata: _parquet.FileMetaData | None, size: int) -> None: ... 
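+
+# A minimal usage sketch (illustrative only, not part of the stubs): it shows the
+# calls the Scanner annotations above are meant to type-check. The "data/" path
+# and the column/field names are hypothetical.
+#
+#   import pyarrow.dataset as ds
+#
+#   dataset = ds.dataset("data/", format="parquet")
+#   scanner = ds.Scanner.from_dataset(
+#       dataset,
+#       columns=["id", "value"],
+#       filter=ds.field("year") == 2016,
+#       batch_size=65_536,
+#   )
+#   table = scanner.to_table()  # serially materializes the scan into a Table
+#
+#   # get_partition_keys() recovers the equality constraints of a fragment's
+#   # partition expression, e.g. {'year': 2016} for Hive-style partitioning.
+#   for fragment in dataset.get_fragments():
+#       keys = ds.get_partition_keys(fragment.partition_expression)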
+ +def _filesystemdataset_write( + data: Scanner, + base_dir: StrPath, + basename_template: str, + filesystem: SupportedFileSystem, + partitioning: Partitioning, + file_options: FileWriteOptions, + max_partitions: int, + file_visitor: Callable[[str], None], + existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"], + max_open_files: int, + max_rows_per_file: int, + min_rows_per_group: int, + max_rows_per_group: int, + create_dir: bool, +): ... + +class _ScanNodeOptions(ExecNodeOptions): + def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ... + +class ScanNodeOptions(_ScanNodeOptions): + """ + A Source node which yields batches from a Dataset scan. + + This is the option class for the "scan" node factory. + + This node is capable of applying pushdown projections or filters + to the file readers which reduce the amount of data that needs to + be read (if supported by the file format). But note that this does not + construct associated filter or project nodes to perform the final + filtering or projection. Rather, you may supply the same filter + expression or projection to the scan node that you also supply + to the filter or project node. + + Yielded batches will be augmented with fragment/batch indices when + implicit_ordering=True to enable stable ordering for simple ExecPlans. + + Parameters + ---------- + dataset : pyarrow.dataset.Dataset + The table which acts as the data source. + **kwargs : dict, optional + Scan options. See `Scanner.from_dataset` for possible arguments. + require_sequenced_output : bool, default False + Batches are yielded sequentially, like single-threaded + implicit_ordering : bool, default False + Preserve implicit ordering of data. + """ + + def __init__( + self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs + ) -> None: ... diff --git a/python/pyarrow-stubs/_dataset_orc.pyi b/python/pyarrow-stubs/_dataset_orc.pyi new file mode 100644 index 00000000000..9c4ac04198f --- /dev/null +++ b/python/pyarrow-stubs/_dataset_orc.pyi @@ -0,0 +1,6 @@ +from ._dataset import FileFormat + +class OrcFileFormat(FileFormat): + def equals(self, other: OrcFileFormat) -> bool: ... + @property + def default_extname(self): ... diff --git a/python/pyarrow-stubs/_dataset_parquet.pyi b/python/pyarrow-stubs/_dataset_parquet.pyi new file mode 100644 index 00000000000..cbcc17235f1 --- /dev/null +++ b/python/pyarrow-stubs/_dataset_parquet.pyi @@ -0,0 +1,314 @@ +from dataclasses import dataclass +from typing import IO, Any, Iterable, TypedDict + +from _typeshed import StrPath + +from ._compute import Expression +from ._dataset import ( + DatasetFactory, + FileFormat, + FileFragment, + FileWriteOptions, + Fragment, + FragmentScanOptions, + Partitioning, + PartitioningFactory, +) +from ._dataset_parquet_encryption import ParquetDecryptionConfig +from ._fs import SupportedFileSystem +from ._parquet import FileDecryptionProperties, FileMetaData +from .lib import CacheOptions, Schema, _Weakrefable + +parquet_encryption_enabled: bool + +class ParquetFileFormat(FileFormat): + """ + FileFormat for Parquet + + Parameters + ---------- + read_options : ParquetReadOptions + Read options for the file. + default_fragment_scan_options : ParquetFragmentScanOptions + Scan Options for the file. + **kwargs : dict + Additional options for read option or scan option + """ + def __init__( + self, + read_options: ParquetReadOptions | None = None, + default_fragment_scan_options: ParquetFragmentScanOptions | None = None, + **kwargs, + ) -> None: ... 
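+    # Illustrative sketch (assuming a hypothetical local "data/" directory of
+    # Parquet files); it shows how this format class is typically constructed
+    # and handed to pyarrow.dataset:
+    #
+    #   import pyarrow.dataset as ds
+    #
+    #   fmt = ds.ParquetFileFormat(
+    #       read_options=ds.ParquetReadOptions(
+    #           dictionary_columns=None, coerce_int96_timestamp_unit="ms"
+    #       ),
+    #       default_fragment_scan_options=ds.ParquetFragmentScanOptions(pre_buffer=True),
+    #   )
+    #   dataset = ds.dataset("data/", format=fmt)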
+ @property + def read_options(self) -> ParquetReadOptions: ... + def make_write_options(self) -> ParquetFileWriteOptions: ... # type: ignore[override] + def equals(self, other: ParquetFileFormat) -> bool: ... + @property + def default_extname(self) -> str: ... + def make_fragment( + self, + file: StrPath | IO, + filesystem: SupportedFileSystem | None = None, + partition_expression: Expression | None = None, + row_groups: Iterable[int] | None = None, + *, + file_size: int | None = None, + ) -> Fragment: + """ + Make a FileFragment from a given file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to make a fragment from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + partition_expression : Expression, optional + An expression that is guaranteed true for all rows in the fragment. Allows + fragment to be potentially skipped while scanning with a filter. + row_groups : Iterable, optional + The indices of the row groups to include + file_size : int, optional + The size of the file in bytes. Can improve performance with high-latency filesystems + when file size needs to be known before reading. + + Returns + ------- + fragment : Fragment + The file fragment + """ + +class _NameStats(TypedDict): + min: Any + max: Any + +class RowGroupInfo: + """ + A wrapper class for RowGroup information + + Parameters + ---------- + id : integer + The group ID. + metadata : FileMetaData + The rowgroup metadata. + schema : Schema + Schema of the rows. + """ + + id: int + metadata: FileMetaData + schema: Schema + + def __init__(self, id: int, metadata: FileMetaData, schema: Schema) -> None: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... + @property + def statistics(self) -> dict[str, _NameStats]: ... + +class ParquetFileFragment(FileFragment): + """A Fragment representing a parquet file.""" + + def ensure_complete_metadata(self) -> None: ... + @property + def row_groups(self) -> list[RowGroupInfo]: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def num_row_groups(self) -> int: + """ + Return the number of row groups viewed by this fragment (not the + number of row groups in the origin file). + """ + def split_by_row_group( + self, filter: Expression | None = None, schema: Schema | None = None + ) -> list[Fragment]: + """ + Split the fragment into multiple fragments. + + Yield a Fragment wrapping each row group in this ParquetFileFragment. + Row groups will be excluded whose metadata contradicts the optional + filter. + + Parameters + ---------- + filter : Expression, default None + Only include the row groups which satisfy this predicate (using + the Parquet RowGroup statistics). + schema : Schema, default None + Schema to use when filtering row groups. Defaults to the + Fragment's physical schema + + Returns + ------- + A list of Fragments + """ + def subset( + self, + filter: Expression | None = None, + schema: Schema | None = None, + row_group_ids: list[int] | None = None, + ) -> ParquetFileFormat: + """ + Create a subset of the fragment (viewing a subset of the row groups). + + Subset can be specified by either a filter predicate (with optional + schema) or by a list of row group IDs. Note that when using a filter, + the resulting fragment can be empty (viewing no row groups). 
+ + Parameters + ---------- + filter : Expression, default None + Only include the row groups which satisfy this predicate (using + the Parquet RowGroup statistics). + schema : Schema, default None + Schema to use when filtering row groups. Defaults to the + Fragment's physical schema + row_group_ids : list of ints + The row group IDs to include in the subset. Can only be specified + if `filter` is None. + + Returns + ------- + ParquetFileFragment + """ + +class ParquetReadOptions(_Weakrefable): + """ + Parquet format specific options for reading. + + Parameters + ---------- + dictionary_columns : list of string, default None + Names of columns which should be dictionary encoded as + they are read + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds + """ + def __init__( + self, dictionary_columns: list[str] | None, coerce_int96_timestamp_unit: str | None = None + ) -> None: ... + @property + def coerce_int96_timestamp_unit(self) -> str: ... + @coerce_int96_timestamp_unit.setter + def coerce_int96_timestamp_unit(self, unit: str) -> None: ... + def equals(self, other: ParquetReadOptions) -> bool: ... + +class ParquetFileWriteOptions(FileWriteOptions): + def update(self, **kwargs) -> None: ... + def _set_properties(self) -> None: ... + def _set_arrow_properties(self) -> None: ... + def _set_encryption_config(self) -> None: ... + +@dataclass(kw_only=True) +class ParquetFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for Parquet fragments. + + Parameters + ---------- + use_buffered_stream : bool, default False + Read files through buffered input streams rather than loading entire + row groups at once. This may be enabled to reduce memory overhead. + Disabled by default. + buffer_size : int, default 8192 + Size of buffered stream, if enabled. Default is 8KB. + pre_buffer : bool, default True + If enabled, pre-buffer the raw Parquet data instead of issuing one + read per column chunk. This can improve performance on high-latency + filesystems (e.g. S3, GCS) by coalescing and issuing file reads in + parallel using a background I/O thread pool. + Set to False if you want to prioritize minimal memory usage + over maximum speed. + cache_options : pyarrow.CacheOptions, default None + Cache options used when pre_buffer is enabled. The default values should + be good for most use cases. You may want to adjust these for example if + you have exceptionally high latency to the file system. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None + If not None, use the provided ParquetDecryptionConfig to decrypt the + Parquet file. + decryption_properties : pyarrow.parquet.FileDecryptionProperties, default None + If not None, use the provided FileDecryptionProperties to decrypt encrypted + Parquet file. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. 
+ """ + + use_buffered_stream: bool = False + buffer_size: int = 8192 + pre_buffer: bool = True + cache_options: CacheOptions | None = None + thrift_string_size_limit: int | None = None + thrift_container_size_limit: int | None = None + decryption_config: ParquetDecryptionConfig | None = None + decryption_properties: FileDecryptionProperties | None = None + page_checksum_verification: bool = False + + def equals(self, other: ParquetFragmentScanOptions) -> bool: ... + +@dataclass +class ParquetFactoryOptions(_Weakrefable): + """ + Influences the discovery of parquet dataset. + + Parameters + ---------- + partition_base_dir : str, optional + For the purposes of applying the partitioning, paths will be + stripped of the partition_base_dir. Files not matching the + partition_base_dir prefix will be skipped for partitioning discovery. + The ignored files will still be part of the Dataset, but will not + have partition information. + partitioning : Partitioning, PartitioningFactory, optional + The partitioning scheme applied to fragments, see ``Partitioning``. + validate_column_chunk_paths : bool, default False + Assert that all ColumnChunk paths are consistent. The parquet spec + allows for ColumnChunk data to be stored in multiple files, but + ParquetDatasetFactory supports only a single file with all ColumnChunk + data. If this flag is set construction of a ParquetDatasetFactory will + raise an error if ColumnChunk data is not resident in a single file. + """ + + partition_base_dir: str | None = None + partitioning: Partitioning | PartitioningFactory | None = None + validate_column_chunk_paths: bool = False + +class ParquetDatasetFactory(DatasetFactory): + """ + Create a ParquetDatasetFactory from a Parquet `_metadata` file. + + Parameters + ---------- + metadata_path : str + Path to the `_metadata` parquet metadata-only file generated with + `pyarrow.parquet.write_metadata`. + filesystem : pyarrow.fs.FileSystem + Filesystem to read the metadata_path from, and subsequent parquet + files. + format : ParquetFileFormat + Parquet format options. + options : ParquetFactoryOptions, optional + Various flags influencing the discovery of filesystem paths. + """ + def __init__( + self, + metadata_path: str, + filesystem: SupportedFileSystem, + format: FileFormat, + options: ParquetFactoryOptions | None = None, + ) -> None: ... diff --git a/python/pyarrow-stubs/_dataset_parquet_encryption.pyi b/python/pyarrow-stubs/_dataset_parquet_encryption.pyi new file mode 100644 index 00000000000..7623275b865 --- /dev/null +++ b/python/pyarrow-stubs/_dataset_parquet_encryption.pyi @@ -0,0 +1,85 @@ +from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions +from ._parquet import FileDecryptionProperties +from ._parquet_encryption import CryptoFactory, EncryptionConfiguration, KmsConnectionConfig +from .lib import _Weakrefable + +class ParquetEncryptionConfig(_Weakrefable): + """ + Core configuration class encapsulating parameters for high-level encryption + within the Parquet framework. + + The ParquetEncryptionConfig class serves as a bridge for passing encryption-related + parameters to the appropriate components within the Parquet library. It maintains references + to objects that define the encryption strategy, Key Management Service (KMS) configuration, + and specific encryption configurations for Parquet data. + + Parameters + ---------- + crypto_factory : pyarrow.parquet.encryption.CryptoFactory + Shared pointer to a `CryptoFactory` object. 
The `CryptoFactory` is responsible for + creating cryptographic components, such as encryptors and decryptors. + kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig + Shared pointer to a `KmsConnectionConfig` object. This object holds the configuration + parameters necessary for connecting to a Key Management Service (KMS). + encryption_config : pyarrow.parquet.encryption.EncryptionConfiguration + Shared pointer to an `EncryptionConfiguration` object. This object defines specific + encryption settings for Parquet data, including the keys assigned to different columns. + + Raises + ------ + ValueError + Raised if `encryption_config` is None. + """ + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> None: ... + +class ParquetDecryptionConfig(_Weakrefable): + """ + Core configuration class encapsulating parameters for high-level decryption + within the Parquet framework. + + ParquetDecryptionConfig is designed to pass decryption-related parameters to + the appropriate decryption components within the Parquet library. It holds references to + objects that define the decryption strategy, Key Management Service (KMS) configuration, + and specific decryption configurations for reading encrypted Parquet data. + + Parameters + ---------- + crypto_factory : pyarrow.parquet.encryption.CryptoFactory + Shared pointer to a `CryptoFactory` object, pivotal in creating cryptographic + components for the decryption process. + kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig + Shared pointer to a `KmsConnectionConfig` object, containing parameters necessary + for connecting to a Key Management Service (KMS) during decryption. + decryption_config : pyarrow.parquet.encryption.DecryptionConfiguration + Shared pointer to a `DecryptionConfiguration` object, specifying decryption settings + for reading encrypted Parquet data. + + Raises + ------ + ValueError + Raised if `decryption_config` is None. + """ + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> None: ... + +def set_encryption_config( + opts: ParquetFileWriteOptions, + config: ParquetEncryptionConfig, +) -> None: ... +def set_decryption_properties( + opts: ParquetFragmentScanOptions, + config: FileDecryptionProperties, +): ... +def set_decryption_config( + opts: ParquetFragmentScanOptions, + config: ParquetDecryptionConfig, +): ... diff --git a/python/pyarrow-stubs/_feather.pyi b/python/pyarrow-stubs/_feather.pyi new file mode 100644 index 00000000000..8bb914ba45d --- /dev/null +++ b/python/pyarrow-stubs/_feather.pyi @@ -0,0 +1,29 @@ +from typing import IO + +from _typeshed import StrPath + +from .lib import Buffer, NativeFile, Table, _Weakrefable + +class FeatherError(Exception): ... + +def write_feather( + table: Table, + dest: StrPath | IO | NativeFile, + compression: str | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: int = 2, +): ... + +class FeatherReader(_Weakrefable): + def __init__( + self, + source: StrPath | IO | NativeFile | Buffer, + use_memory_map: bool, + use_threads: bool, + ) -> None: ... + @property + def version(self) -> str: ... + def read(self) -> Table: ... + def read_indices(self, indices: list[int]) -> Table: ... + def read_names(self, names: list[str]) -> Table: ... 
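+
+# A minimal usage sketch (illustrative; the file name is hypothetical). The public
+# entry points live in pyarrow.feather and are typed against the signatures above:
+#
+#   import pyarrow as pa
+#   import pyarrow.feather as feather
+#
+#   table = pa.table({"a": [1, 2, 3]})
+#   feather.write_feather(table, "example.feather", compression="zstd")
+#   roundtripped = feather.read_table("example.feather")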
diff --git a/python/pyarrow-stubs/_flight.pyi b/python/pyarrow-stubs/_flight.pyi new file mode 100644 index 00000000000..4450c42df49 --- /dev/null +++ b/python/pyarrow-stubs/_flight.pyi @@ -0,0 +1,1380 @@ +import asyncio +import enum +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Generator, Generic, Iterable, Iterator, NamedTuple, TypeVar + +from typing_extensions import deprecated + +from .ipc import _ReadPandasMixin +from .lib import ( + ArrowCancelled, + ArrowException, + ArrowInvalid, + Buffer, + IpcReadOptions, + IpcWriteOptions, + RecordBatch, + RecordBatchReader, + Schema, + Table, + TimestampScalar, + _CRecordBatchWriter, + _Weakrefable, +) + +_T = TypeVar("_T") + +class FlightCallOptions(_Weakrefable): + """RPC-layer options for a Flight call.""" + + def __init__( + self, + timeout: float | None = None, + write_options: IpcWriteOptions | None = None, + headers: list[tuple[str, str]] | None = None, + read_options: IpcReadOptions | None = None, + ) -> None: + """Create call options. + + Parameters + ---------- + timeout : float, None + A timeout for the call, in seconds. None means that the + timeout defaults to an implementation-specific value. + write_options : pyarrow.ipc.IpcWriteOptions, optional + IPC write options. The default options can be controlled + by environment variables (see pyarrow.ipc). + headers : List[Tuple[str, str]], optional + A list of arbitrary headers as key, value tuples + read_options : pyarrow.ipc.IpcReadOptions, optional + Serialization options for reading IPC format. + """ + +class CertKeyPair(NamedTuple): + """A TLS certificate and key for use in Flight.""" + + cert: str + key: str + +class FlightError(Exception): + """ + The base class for Flight-specific errors. + + A server may raise this class or one of its subclasses to provide + a more detailed error to clients. + + Parameters + ---------- + message : str, optional + The error message. + extra_info : bytes, optional + Extra binary error details that were provided by the + server/will be sent to the client. + + Attributes + ---------- + extra_info : bytes + Extra binary error details that were provided by the + server/will be sent to the client. + """ + + extra_info: bytes + +class FlightInternalError(FlightError, ArrowException): + """An error internal to the Flight server occurred.""" + +class FlightTimedOutError(FlightError, ArrowException): + """The Flight RPC call timed out.""" + +class FlightCancelledError(FlightError, ArrowCancelled): + """The operation was cancelled.""" + +class FlightServerError(FlightError, ArrowException): + """A server error occurred.""" + +class FlightUnauthenticatedError(FlightError, ArrowException): + """The client is not authenticated.""" + +class FlightUnauthorizedError(FlightError, ArrowException): + """The client is not authorized to perform the given operation.""" + +class FlightUnavailableError(FlightError, ArrowException): + """The server is not reachable or available.""" + +class FlightWriteSizeExceededError(ArrowInvalid): + """A write operation exceeded the client-configured limit.""" + + limit: int + actual: int + +class Action(_Weakrefable): + """An action executable on a Flight service.""" + + def __init__(self, action_type: bytes | str, buf: Buffer | bytes) -> None: + """Create an action from a type and a buffer. 
+ + Parameters + ---------- + action_type : bytes or str + buf : Buffer or bytes-like object + """ + @property + def type(self) -> str: + """The action type.""" + @property + def body(self) -> Buffer: + """The action body (arguments for the action).""" + def serialize(self) -> bytes: + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + @classmethod + def deserialize(cls, serialized: bytes) -> Self: + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + +class ActionType(NamedTuple): + """A type of action that is executable on a Flight service.""" + + type: str + description: str + + def make_action(self, buf: Buffer | bytes) -> Action: + """Create an Action with this type. + + Parameters + ---------- + buf : obj + An Arrow buffer or Python bytes or bytes-like object. + """ + +class Result(_Weakrefable): + """A result from executing an Action.""" + def __init__(self, buf: Buffer | bytes) -> None: + """Create a new result. + + Parameters + ---------- + buf : Buffer or bytes-like object + """ + @property + def body(self) -> Buffer: + """Get the Buffer containing the result.""" + def serialize(self) -> bytes: + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + @classmethod + def deserialize(cls, serialized: bytes) -> Self: + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + +class BasicAuth(_Weakrefable): + """A container for basic auth.""" + def __init__( + self, username: str | bytes | None = None, password: str | bytes | None = None + ) -> None: + """Create a new basic auth object. + + Parameters + ---------- + username : string + password : string + """ + @property + def username(self) -> bytes: ... + @property + def password(self) -> bytes: ... + def serialize(self) -> str: ... + @staticmethod + def deserialize(serialized: str | bytes) -> BasicAuth: ... + +class DescriptorType(enum.Enum): + """ + The type of a FlightDescriptor. + + Attributes + ---------- + + UNKNOWN + An unknown descriptor type. + + PATH + A Flight stream represented by a path. + + CMD + A Flight stream represented by an application-defined command. 
+ + """ + + UNKNOWN = 0 + PATH = 1 + CMD = 2 + +class FlightMethod(enum.Enum): + """The implemented methods in Flight.""" + + INVALID = 0 + HANDSHAKE = 1 + LIST_FLIGHTS = 2 + GET_FLIGHT_INFO = 3 + GET_SCHEMA = 4 + DO_GET = 5 + DO_PUT = 6 + DO_ACTION = 7 + LIST_ACTIONS = 8 + DO_EXCHANGE = 9 + +class FlightDescriptor(_Weakrefable): + """A description of a data stream available from a Flight service.""" + @staticmethod + def for_path(*path: str | bytes) -> FlightDescriptor: + """Create a FlightDescriptor for a resource path.""" + + @staticmethod + def for_command(command: str | bytes) -> FlightDescriptor: + """Create a FlightDescriptor for an opaque command.""" + @property + def descriptor_type(self) -> DescriptorType: + """Get the type of this descriptor.""" + @property + def path(self) -> list[bytes] | None: + """Get the path for this descriptor.""" + @property + def command(self) -> bytes | None: + """Get the command for this descriptor.""" + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class Ticket(_Weakrefable): + """A ticket for requesting a Flight stream.""" + def __init__(self, ticket: str | bytes) -> None: ... + @property + def ticket(self) -> bytes: ... + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class Location(_Weakrefable): + """The location of a Flight service.""" + def __init__(self, uri: str | bytes) -> None: ... + @property + def uri(self) -> bytes: ... + def equals(self, other: Location) -> bool: ... + @staticmethod + def for_grpc_tcp(host: str | bytes, port: int) -> Location: + """Create a Location for a TCP-based gRPC service.""" + @staticmethod + def for_grpc_tls(host: str | bytes, port: int) -> Location: + """Create a Location for a TLS-based gRPC service.""" + @staticmethod + def for_grpc_unix(path: str | bytes) -> Location: + """Create a Location for a domain socket-based gRPC service.""" + +class FlightEndpoint(_Weakrefable): + """A Flight stream, along with the ticket and locations to access it.""" + def __init__( + self, + ticket: Ticket | str | bytes, + locations: list[str | Location], + expiration_time: TimestampScalar | None = ..., + app_metadata: bytes | str = ..., + ): + """Create a FlightEndpoint from a ticket and list of locations. + + Parameters + ---------- + ticket : Ticket or bytes + the ticket needed to access this flight + locations : list of string URIs + locations where this flight is available + expiration_time : TimestampScalar, default None + Expiration time of this stream. If present, clients may assume + they can retry DoGet requests. Otherwise, clients should avoid + retrying DoGet requests. + app_metadata : bytes or str, default "" + Application-defined opaque metadata. + + Raises + ------ + ArrowException + If one of the location URIs is not a valid URI. + """ + @property + def ticket(self) -> Ticket: + """Get the ticket in this endpoint.""" + @property + def locations(self) -> list[Location]: + """Get locations where this flight is available.""" + def serialize(self) -> bytes: ... + @property + def expiration_time(self) -> TimestampScalar | None: + """Get the expiration time of this stream. + + If present, clients may assume they can retry DoGet requests. + Otherwise, clients should avoid retrying DoGet requests. + + """ + @property + def app_metadata(self) -> bytes | str: + """Get application-defined opaque metadata.""" + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... 
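+
+# A minimal sketch (illustrative only; the path, host, port, and ticket payload
+# are hypothetical) of building the Flight descriptors and endpoints annotated
+# above and round-tripping them through their wire format:
+#
+#   import pyarrow.flight as flight
+#
+#   descriptor = flight.FlightDescriptor.for_path("my_dataset")
+#   ticket = flight.Ticket(b"ticket-bytes")
+#   location = flight.Location.for_grpc_tcp("localhost", 8815)
+#   endpoint = flight.FlightEndpoint(ticket, [location])
+#
+#   payload = descriptor.serialize()                     # -> bytes
+#   same = flight.FlightDescriptor.deserialize(payload)  # round-trip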
+ +class SchemaResult(_Weakrefable): + """The serialized schema returned from a GetSchema request.""" + def __init__(self, schema: Schema) -> None: + """Create a SchemaResult from a schema. + + Parameters + ---------- + schema: Schema + the schema of the data in this flight. + """ + @property + def schema(self) -> Schema: + """The schema of the data in this flight.""" + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class FlightInfo(_Weakrefable): + """A description of a Flight stream.""" + def __init__( + self, + schema: Schema, + descriptor: FlightDescriptor, + endpoints: list[FlightEndpoint], + total_records: int = ..., + total_bytes: int = ..., + ordered: bool = ..., + app_metadata: bytes | str = ..., + ) -> None: + """Create a FlightInfo object from a schema, descriptor, and endpoints. + + Parameters + ---------- + schema : Schema + the schema of the data in this flight. + descriptor : FlightDescriptor + the descriptor for this flight. + endpoints : list of FlightEndpoint + a list of endpoints where this flight is available. + total_records : int, default None + the total records in this flight, -1 or None if unknown. + total_bytes : int, default None + the total bytes in this flight, -1 or None if unknown. + ordered : boolean, default False + Whether endpoints are in the same order as the data. + app_metadata : bytes or str, default "" + Application-defined opaque metadata. + """ + @property + def schema(self) -> Schema: + """The schema of the data in this flight.""" + @property + def descriptor(self) -> FlightDescriptor: + """The descriptor of the data in this flight.""" + @property + def endpoints(self) -> list[FlightEndpoint]: + """The endpoints where this flight is available.""" + @property + def total_records(self) -> int: + """The total record count of this flight, or -1 if unknown.""" + @property + def total_bytes(self) -> int: + """The size in bytes of the data in this flight, or -1 if unknown.""" + @property + def ordered(self) -> bool: + """Whether endpoints are in the same order as the data.""" + @property + def app_metadata(self) -> bytes | str: + """ + Application-defined opaque metadata. + + There is no inherent or required relationship between this and the + app_metadata fields in the FlightEndpoints or resulting FlightData + messages. Since this metadata is application-defined, a given + application could define there to be a relationship, but there is + none required by the spec. + + """ + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class FlightStreamChunk(_Weakrefable): + """A RecordBatch with application metadata on the side.""" + @property + def data(self) -> RecordBatch | None: ... + @property + def app_metadata(self) -> Buffer | None: ... + def __iter__(self): ... + +class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): + """A reader for Flight streams.""" + + # Needs to be separate class so the "real" class can subclass the + # pure-Python mixin class + + def __iter__(self) -> Self: ... + def __next__(self) -> FlightStreamChunk: ... + @property + def schema(self) -> Schema: + """Get the schema for this reader.""" + def read_all(self) -> Table: + """Read the entire contents of the stream as a Table.""" + def read_chunk(self) -> FlightStreamChunk: + """Read the next FlightStreamChunk along with any metadata. + + Returns + ------- + chunk : FlightStreamChunk + The next FlightStreamChunk in the stream. 
+ + Raises + ------ + StopIteration + when the stream is finished + """ + def to_reader(self) -> RecordBatchReader: + """Convert this reader into a regular RecordBatchReader. + + This may fail if the schema cannot be read from the remote end. + + Returns + ------- + RecordBatchReader + """ + +class MetadataRecordBatchReader(_MetadataRecordBatchReader): + """The base class for readers for Flight streams. + + See Also + -------- + FlightStreamReader + """ + +class FlightStreamReader(MetadataRecordBatchReader): + """A reader that can also be canceled.""" + def cancel(self) -> None: + """Cancel the read operation.""" + def read_all(self) -> Table: + """Read the entire contents of the stream as a Table.""" + +class MetadataRecordBatchWriter(_CRecordBatchWriter): + """A RecordBatchWriter that also allows writing application metadata. + + This class is a context manager; on exit, close() will be called. + """ + + def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: + """Prepare to write data to this stream with the given schema.""" + def write_metadata(self, buf: Buffer) -> None: + """Write Flight metadata by itself.""" + def write_batch(self, batch: RecordBatch) -> None: # type: ignore[override] + """ + Write RecordBatch to stream. + + Parameters + ---------- + batch : RecordBatch + """ + def write_table(self, table: Table, max_chunksize: int | None = None, **kwargs) -> None: + """ + Write Table to stream in (contiguous) RecordBatch objects. + + Parameters + ---------- + table : Table + max_chunksize : int, default None + Maximum number of rows for RecordBatch chunks. Individual chunks may + be smaller depending on the chunk layout of individual columns. + """ + def close(self) -> None: + """ + Close stream and write end-of-stream 0 marker. + """ + def write_with_metadata(self, batch: RecordBatch, buf: Buffer) -> None: + """Write a RecordBatch along with Flight metadata. + + Parameters + ---------- + batch : RecordBatch + The next RecordBatch in the stream. + buf : Buffer + Application-specific metadata for the batch as defined by + Flight. + """ + +class FlightStreamWriter(MetadataRecordBatchWriter): + """A writer that also allows closing the write side of a stream.""" + def done_writing(self) -> None: + """Indicate that the client is done writing, but not done reading.""" + +class FlightMetadataReader(_Weakrefable): + """A reader for Flight metadata messages sent during a DoPut.""" + def read(self) -> Buffer | None: + """Read the next metadata message.""" + +class FlightMetadataWriter(_Weakrefable): + """A sender for Flight metadata messages during a DoPut.""" + def write(self, message: Buffer) -> None: + """Write the next metadata message. + + Parameters + ---------- + message : Buffer + """ + +class AsyncioCall(Generic[_T]): + """State for an async RPC using asyncio.""" + + _future: asyncio.Future[_T] + + def as_awaitable(self) -> asyncio.Future[_T]: ... + def wakeup(self, result_or_exception: BaseException | _T) -> None: ... + +class AsyncioFlightClient: + """ + A FlightClient with an asyncio-based async interface. + + This interface is EXPERIMENTAL. + """ + + def __init__(self, client: FlightClient) -> None: ... + async def get_flight_info( + self, + descriptor: FlightDescriptor, + *, + options: FlightCallOptions | None = None, + ): ... + +class FlightClient(_Weakrefable): + """A client to a Flight service. + + Connect to a Flight service on the given host and port. + + Parameters + ---------- + location : str, tuple or Location + Location to connect to. 
Either a gRPC URI like `grpc://localhost:port`, + a tuple of (host, port) pair, or a Location instance. + tls_root_certs : bytes or None + PEM-encoded + cert_chain: bytes or None + Client certificate if using mutual TLS + private_key: bytes or None + Client private key for cert_chain is using mutual TLS + override_hostname : str or None + Override the hostname checked by TLS. Insecure, use with caution. + middleware : list optional, default None + A list of ClientMiddlewareFactory instances. + write_size_limit_bytes : int optional, default None + A soft limit on the size of a data payload sent to the + server. Enabled if positive. If enabled, writing a record + batch that (when serialized) exceeds this limit will raise an + exception; the client can retry the write with a smaller + batch. + disable_server_verification : boolean optional, default False + A flag that indicates that, if the client is connecting + with TLS, that it skips server verification. If this is + enabled, all other TLS settings are overridden. + generic_options : list optional, default None + A list of generic (string, int or string) option tuples passed + to the underlying transport. Effect is implementation + dependent. + """ + def __init__( + self, + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: list[tuple[str, int | str]] | None = None, + ): ... + @property + def supports_async(self) -> bool: ... + def as_async(self) -> AsyncioFlightClient: ... + def wait_for_available(self, timeout: int = 5) -> None: + """Block until the server can be contacted. + + Parameters + ---------- + timeout : int, default 5 + The maximum seconds to wait. + """ + @deprecated( + "Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead." + ) + @classmethod + def connect( + cls, + location: str | tuple[str, int] | Location, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + disable_server_verification: bool = False, + ) -> FlightClient: + """Connect to a Flight server. + + .. deprecated:: 0.15.0 + Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead. + """ + def authenticate( + self, auth_handler: ClientAuthHandler, options: FlightCallOptions | None = None + ) -> None: + """Authenticate to the server. + + Parameters + ---------- + auth_handler : ClientAuthHandler + The authentication mechanism to use. + options : FlightCallOptions + Options for this call. + """ + def authenticate_basic_token( + self, username: str, password: str, options: FlightCallOptions | None = None + ) -> tuple[str, str]: + """Authenticate to the server with HTTP basic authentication. + + Parameters + ---------- + username : string + Username to authenticate with + password : string + Password to authenticate with + options : FlightCallOptions + Options for this call + + Returns + ------- + tuple : Tuple[str, str] + A tuple representing the FlightCallOptions authorization + header entry of a bearer token. 
+ """ + def list_actions(self, options: FlightCallOptions | None = None) -> list[Action]: + """List the actions available on a service.""" + def do_action( + self, action: Action, options: FlightCallOptions | None = None + ) -> Iterator[Result]: + """ + Execute an action on a service. + + Parameters + ---------- + action : str, tuple, or Action + Can be action type name (no body), type and body, or any Action + object + options : FlightCallOptions + RPC options + + Returns + ------- + results : iterator of Result values + """ + def list_flights( + self, criteria: str | None = None, options: FlightCallOptions | None = None + ) -> Generator[FlightInfo, None, None]: + """List the flights available on a service.""" + def get_flight_info( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> FlightInfo: + """Request information about an available flight.""" + def get_schema( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> Schema: + """Request schema for an available flight.""" + def do_get( + self, ticket: Ticket, options: FlightCallOptions | None = None + ) -> FlightStreamReader: + """Request the data for a flight. + + Returns + ------- + reader : FlightStreamReader + """ + def do_put( + self, + descriptor: FlightDescriptor, + schema: Schema, + options: FlightCallOptions | None = None, + ) -> tuple[FlightStreamWriter, FlightStreamReader]: + """Upload data to a flight. + + Returns + ------- + writer : FlightStreamWriter + reader : FlightMetadataReader + """ + def do_exchange( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> tuple[FlightStreamWriter, FlightStreamReader]: + """Start a bidirectional data exchange with a server. + + Parameters + ---------- + descriptor : FlightDescriptor + A descriptor for the flight. + options : FlightCallOptions + RPC options. + + Returns + ------- + writer : FlightStreamWriter + reader : FlightStreamReader + """ + def close(self) -> None: + """Close the client and disconnect.""" + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, traceback) -> None: ... + +class FlightDataStream(_Weakrefable): + """ + Abstract base class for Flight data streams. + + See Also + -------- + RecordBatchStream + GeneratorStream + """ + +class RecordBatchStream(FlightDataStream): + """A Flight data stream backed by RecordBatches. + + The remainder of this DoGet request will be handled in C++, + without having to acquire the GIL. + + """ + def __init__( + self, data_source: RecordBatchReader | Table, options: IpcWriteOptions | None = None + ) -> None: + """Create a RecordBatchStream from a data source. + + Parameters + ---------- + data_source : RecordBatchReader or Table + The data to stream to the client. + options : pyarrow.ipc.IpcWriteOptions, optional + Optional IPC options to control how to write the data. + """ + +class GeneratorStream(FlightDataStream): + """A Flight data stream backed by a Python generator.""" + def __init__( + self, + schema: Schema, + generator: Iterable[FlightDataStream | Table | RecordBatch | RecordBatchReader], + options: IpcWriteOptions | None = None, + ) -> None: + """Create a GeneratorStream from a Python generator. + + Parameters + ---------- + schema : Schema + The schema for the data to be returned. + + generator : iterator or iterable + The generator should yield other FlightDataStream objects, + Tables, RecordBatches, or RecordBatchReaders. 
+ + options : pyarrow.ipc.IpcWriteOptions, optional + """ + +class ServerCallContext(_Weakrefable): + """Per-call state/context.""" + def peer_identity(self) -> bytes: + """Get the identity of the authenticated peer. + + May be the empty string. + """ + def peer(self) -> str: + """Get the address of the peer.""" + # Set safe=True as gRPC on Windows sometimes gives garbage bytes + def is_cancelled(self) -> bool: + """Check if the current RPC call has been canceled by the client.""" + def add_header(self, key: str, value: str) -> None: + """Add a response header.""" + def add_trailer(self, key: str, value: str) -> None: + """Add a response trailer.""" + def get_middleware(self, key: str) -> ServerMiddleware | None: + """ + Get a middleware instance by key. + + Returns None if the middleware was not found. + """ + +class ServerAuthReader(_Weakrefable): + """A reader for messages from the client during an auth handshake.""" + def read(self) -> str: ... + +class ServerAuthSender(_Weakrefable): + """A writer for messages to the client during an auth handshake.""" + def write(self, message: str) -> None: ... + +class ClientAuthReader(_Weakrefable): + """A reader for messages from the server during an auth handshake.""" + def read(self) -> str: ... + +class ClientAuthSender(_Weakrefable): + """A writer for messages to the server during an auth handshake.""" + def write(self, message: str) -> None: ... + +class ServerAuthHandler(_Weakrefable): + """Authentication middleware for a server. + + To implement an authentication mechanism, subclass this class and + override its methods. + + """ + def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): + """Conduct the handshake with the client. + + May raise an error if the client cannot authenticate. + + Parameters + ---------- + outgoing : ServerAuthSender + A channel to send messages to the client. + incoming : ServerAuthReader + A channel to read messages from the client. + """ + def is_valid(self, token: str) -> bool: + """Validate a client token, returning their identity. + + May return an empty string (if the auth mechanism does not + name the peer) or raise an exception (if the token is + invalid). + + Parameters + ---------- + token : bytes + The authentication token from the client. + + """ + +class ClientAuthHandler(_Weakrefable): + """Authentication plugin for a client.""" + def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): + """Conduct the handshake with the server. + + Parameters + ---------- + outgoing : ClientAuthSender + A channel to send messages to the server. + incoming : ClientAuthReader + A channel to read messages from the server. + """ + def get_token(self) -> str: + """Get the auth token for a call.""" + +class CallInfo(NamedTuple): + """Information about a particular RPC for Flight middleware.""" + + method: FlightMethod + +class ClientMiddlewareFactory(_Weakrefable): + """A factory for new middleware instances. + + All middleware methods will be called from the same thread as the + RPC method implementation. That is, thread-locals set in the + client are accessible from the middleware itself. + + """ + def start_call(self, info: CallInfo) -> ClientMiddleware | None: + """Called at the start of an RPC. + + This must be thread-safe and must not raise exceptions. + + Parameters + ---------- + info : CallInfo + Information about the call. 
+ + Returns + ------- + instance : ClientMiddleware + An instance of ClientMiddleware (the instance to use for + the call), or None if this call is not intercepted. + + """ + +class ClientMiddleware(_Weakrefable): + """Client-side middleware for a call, instantiated per RPC. + + Methods here should be fast and must be infallible: they should + not raise exceptions or stall indefinitely. + + """ + + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: + """A callback before headers are sent. + + Returns + ------- + headers : dict + A dictionary of header values to add to the request, or + None if no headers are to be added. The dictionary should + have string keys and string or list-of-string values. + + Bytes values are allowed, but the underlying transport may + not support them or may restrict them. For gRPC, binary + values are only allowed on headers ending in "-bin". + + Header names must be lowercase ASCII. + + """ + + def received_headers(self, headers: dict[str, list[str] | list[bytes]]): + """A callback when headers are received. + + The default implementation does nothing. + + Parameters + ---------- + headers : dict + A dictionary of headers from the server. Keys are strings + and values are lists of strings (for text headers) or + bytes (for binary headers). + + """ + + def call_completed(self, exception: ArrowException): + """A callback when the call finishes. + + The default implementation does nothing. + + Parameters + ---------- + exception : ArrowException + If the call errored, this is the equivalent + exception. Will be None if the call succeeded. + + """ + +class ServerMiddlewareFactory(_Weakrefable): + """A factory for new middleware instances. + + All middleware methods will be called from the same thread as the + RPC method implementation. That is, thread-locals set in the + middleware are accessible from the method itself. + + """ + + def start_call( + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> ServerMiddleware | None: + """Called at the start of an RPC. + + This must be thread-safe. + + Parameters + ---------- + info : CallInfo + Information about the call. + headers : dict + A dictionary of headers from the client. Keys are strings + and values are lists of strings (for text headers) or + bytes (for binary headers). + + Returns + ------- + instance : ServerMiddleware + An instance of ServerMiddleware (the instance to use for + the call), or None if this call is not intercepted. + + Raises + ------ + exception : pyarrow.ArrowException + If an exception is raised, the call will be rejected with + the given error. + + """ + +class TracingServerMiddlewareFactory(ServerMiddlewareFactory): + """A factory for tracing middleware instances. + + This enables OpenTelemetry support in Arrow (if Arrow was compiled + with OpenTelemetry support enabled). A new span will be started on + each RPC call. The TracingServerMiddleware instance can then be + retrieved within an RPC handler to get the propagated context, + which can be used to start a new span on the Python side. + + Because the Python/C++ OpenTelemetry libraries do not + interoperate, spans on the C++ side are not directly visible to + the Python side and vice versa. + + """ + +class ServerMiddleware(_Weakrefable): + """Server-side middleware for a call, instantiated per RPC. + + Methods here should be fast and must be infallible: they should + not raise exceptions or stall indefinitely. 
+ + """ + + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: + """A callback before headers are sent. + + Returns + ------- + headers : dict + A dictionary of header values to add to the response, or + None if no headers are to be added. The dictionary should + have string keys and string or list-of-string values. + + Bytes values are allowed, but the underlying transport may + not support them or may restrict them. For gRPC, binary + values are only allowed on headers ending in "-bin". + + Header names must be lowercase ASCII. + + """ + def call_completed(self, exception: ArrowException): + """A callback when the call finishes. + + Parameters + ---------- + exception : pyarrow.ArrowException + If the call errored, this is the equivalent + exception. Will be None if the call succeeded. + + """ + +class TracingServerMiddleware(ServerMiddleware): + trace_context: dict + def __init__(self, trace_context: dict) -> None: ... + +class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): + """Wrapper to bundle server middleware into a single C++ one.""" + + def __init__(self, factories: dict[str, ServerMiddlewareFactory]) -> None: ... + def start_call( # type: ignore[override] + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> _ServerMiddlewareFactoryWrapper | None: ... + +class _ServerMiddlewareWrapper(ServerMiddleware): + def __init__(self, middleware: dict[str, ServerMiddleware]) -> None: ... + def send_headers(self) -> dict[str, dict[str, list[str] | list[bytes]]]: ... + def call_completed(self, exception: ArrowException) -> None: ... + +class _FlightServerFinalizer(_Weakrefable): + """ + A finalizer that shuts down the server on destruction. + + See ARROW-16597. If the server is still active at interpreter + exit, the process may segfault. + """ + + def finalize(self) -> None: ... + +class FlightServerBase(_Weakrefable): + """A Flight service definition. + + To start the server, create an instance of this class with an + appropriate location. The server will be running as soon as the + instance is created; it is not required to call :meth:`serve`. + + Override methods to define your Flight service. + + Parameters + ---------- + location : str, tuple or Location optional, default None + Location to serve on. Either a gRPC URI like `grpc://localhost:port`, + a tuple of (host, port) pair, or a Location instance. + If None is passed then the server will be started on localhost with a + system provided random port. + auth_handler : ServerAuthHandler optional, default None + An authentication mechanism to use. May be None. + tls_certificates : list optional, default None + A list of (certificate, key) pairs. + verify_client : boolean optional, default False + If True, then enable mutual TLS: require the client to present + a client certificate, and validate the certificate. + root_certificates : bytes optional, default None + If enabling mutual TLS, this specifies the PEM-encoded root + certificate used to validate client certificates. + middleware : dict optional, default None + A dictionary of :class:`ServerMiddlewareFactory` instances. The + string keys can be used to retrieve the middleware instance within + RPC handlers (see :meth:`ServerCallContext.get_middleware`). 
+ + """ + def __init__( + self, + location: str | tuple[str, int] | Location | None = None, + auth_handler: ServerAuthHandler | None = None, + tls_certificates: list[tuple[str, str]] | None = None, + verify_client: bool = False, + root_certificates: str | None = None, + middleware: dict[str, ServerMiddlewareFactory] | None = None, + ): ... + @property + def port(self) -> int: + """ + Get the port that this server is listening on. + + Returns a non-positive value if the operation is invalid + (e.g. init() was not called or server is listening on a domain + socket). + """ + def list_flights(self, context: ServerCallContext, criteria: str) -> Iterator[FlightInfo]: + """List flights available on this service. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + criteria : bytes + Filter criteria provided by the client. + + Returns + ------- + iterator of FlightInfo + + """ + def get_flight_info( + self, context: ServerCallContext, descriptor: FlightDescriptor + ) -> FlightInfo: + """Get information about a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + + Returns + ------- + FlightInfo + + """ + def get_schema(self, context: ServerCallContext, descriptor: FlightDescriptor) -> Schema: + """Get the schema of a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + + Returns + ------- + Schema + + """ + def do_put( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: FlightMetadataWriter, + ) -> None: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + reader : MetadataRecordBatchReader + A reader for data uploaded by the client. + writer : FlightMetadataWriter + A writer to send responses to the client. + + """ + def do_get(self, context: ServerCallContext, ticket: Ticket) -> FlightDataStream: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + ticket : Ticket + The ticket for the flight. + + Returns + ------- + FlightDataStream + A stream of data to send back to the client. + + """ + def do_exchange( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: MetadataRecordBatchWriter, + ) -> None: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. 
+ + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + reader : MetadataRecordBatchReader + A reader for data uploaded by the client. + writer : MetadataRecordBatchWriter + A writer to send responses to the client. + + """ + def list_actions(self, context: ServerCallContext) -> Iterable[Action]: + """List custom actions available on this server. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + + Returns + ------- + iterator of ActionType or tuple + + """ + def do_action(self, context: ServerCallContext, action: Action) -> Iterable[bytes]: + """Execute a custom action. + + This method should return an iterator, or it should be a + generator. Applications should override this method to + implement their own behavior. The default method raises a + NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + action : Action + The action to execute. + + Returns + ------- + iterator of bytes + + """ + def serve(self) -> None: + """Block until the server shuts down. + + This method only returns if shutdown() is called or a signal is + received. + """ + def run(self) -> None: + """Block until the server shuts down. + + .. deprecated:: 0.15.0 + Use the ``FlightServer.serve`` method instead + """ + def shutdown(self) -> None: + """Shut down the server, blocking until current requests finish. + + Do not call this directly from the implementation of a Flight + method, as then the server will block forever waiting for that + request to finish. Instead, call this method from a background + thread. + + This method should only be called once. + """ + def wait(self) -> None: + """Block until server is terminated with shutdown.""" + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, traceback): ... + +def connect( + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: list[tuple[str, int | str]] | None = None, +) -> FlightClient: + """ + Connect to a Flight server. + + Parameters + ---------- + location : str, tuple, or Location + Location to connect to. Either a URI like "grpc://localhost:port", + a tuple of (host, port), or a Location instance. + tls_root_certs : bytes or None + PEM-encoded. + cert_chain: str or None + If provided, enables TLS mutual authentication. + private_key: str or None + If provided, enables TLS mutual authentication. + override_hostname : str or None + Override the hostname checked by TLS. Insecure, use with caution. + middleware : list or None + A list of ClientMiddlewareFactory instances to apply. + write_size_limit_bytes : int or None + A soft limit on the size of a data payload sent to the + server. Enabled if positive. If enabled, writing a record + batch that (when serialized) exceeds this limit will raise an + exception; the client can retry the write with a smaller + batch. + disable_server_verification : boolean or None + Disable verifying the server when using TLS. + Insecure, use with caution. 
+ generic_options : list or None + A list of generic (string, int or string) options to pass to + the underlying transport. + + Returns + ------- + client : FlightClient + """ diff --git a/python/pyarrow-stubs/_fs.pyi b/python/pyarrow-stubs/_fs.pyi new file mode 100644 index 00000000000..9b0f0ceaa20 --- /dev/null +++ b/python/pyarrow-stubs/_fs.pyi @@ -0,0 +1,1001 @@ +import datetime as dt +import enum +import sys + +from abc import ABC, abstractmethod + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from typing import Union, overload + +from fsspec import AbstractFileSystem # type: ignore[import-untyped] + +from .lib import NativeFile, _Weakrefable + +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] + +class FileType(enum.IntFlag): + NotFound = enum.auto() + Unknown = enum.auto() + File = enum.auto() + Directory = enum.auto() + +class FileInfo(_Weakrefable): + """ + FileSystem entry info. + + Parameters + ---------- + path : str + The full path to the filesystem entry. + type : FileType + The type of the filesystem entry. + mtime : datetime or float, default None + If given, the modification time of the filesystem entry. + If a float is given, it is the number of seconds since the + Unix epoch. + mtime_ns : int, default None + If given, the modification time of the filesystem entry, + in nanoseconds since the Unix epoch. + `mtime` and `mtime_ns` are mutually exclusive. + size : int, default None + If given, the filesystem entry size in bytes. This should only + be given if `type` is `FileType.File`. + + Examples + -------- + Generate a file: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> path_fs = local_path + "/pyarrow-fs-example.dat" + >>> with local.open_output_stream(path_fs) as stream: + ... stream.write(b"data") + 4 + + Get FileInfo object using ``get_file_info()``: + + >>> file_info = local.get_file_info(path_fs) + >>> file_info + + + Inspect FileInfo attributes: + + >>> file_info.type + + + >>> file_info.is_file + True + + >>> file_info.path + '/.../pyarrow-fs-example.dat' + + >>> file_info.base_name + 'pyarrow-fs-example.dat' + + >>> file_info.size + 4 + + >>> file_info.extension + 'dat' + + >>> file_info.mtime # doctest: +SKIP + datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) + + >>> file_info.mtime_ns # doctest: +SKIP + 1656489370873922073 + """ + + def __init__( + self, + path: str, + type: FileType = FileType.Unknown, + *, + mtime: dt.datetime | float | None = None, + mtime_ns: int | None = None, + size: int | None = None, + ): ... + @property + def type(self) -> FileType: + """ + Type of the file. + + The returned enum values can be the following: + + - FileType.NotFound: target does not exist + - FileType.Unknown: target exists but its type is unknown (could be a + special file such as a Unix socket or character device, or + Windows NUL / CON / ...) + - FileType.File: target is a regular file + - FileType.Directory: target is a regular directory + + Returns + ------- + type : FileType + """ + @property + def is_file(self) -> bool: ... + @property + def path(self) -> str: + """ + The full file path in the filesystem. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.path + '/.../pyarrow-fs-example.dat' + """ + @property + def base_name(self) -> str: + """ + The file base name. 
+ + Component after the last directory separator. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.base_name + 'pyarrow-fs-example.dat' + """ + @property + def size(self) -> int: + """ + The size in bytes, if available. + + Only regular files are guaranteed to have a size. + + Returns + ------- + size : int or None + """ + @property + def extension(self) -> str: + """ + The file extension. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.extension + 'dat' + """ + @property + def mtime(self) -> dt.datetime | None: + """ + The time of last modification, if available. + + Returns + ------- + mtime : datetime.datetime or None + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.mtime # doctest: +SKIP + datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) + """ + @property + def mtime_ns(self) -> int | None: + """ + The time of last modification, if available, expressed in nanoseconds + since the Unix epoch. + + Returns + ------- + mtime_ns : int or None + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.mtime_ns # doctest: +SKIP + 1656489370873922073 + """ + +class FileSelector(_Weakrefable): + """ + File and directory selector. + + It contains a set of options that describes how to search for files and + directories. + + Parameters + ---------- + base_dir : str + The directory in which to select files. Relative paths also work, use + '.' for the current directory and '..' for the parent. + allow_not_found : bool, default False + The behavior if `base_dir` doesn't exist in the filesystem. + If false, an error is returned. + If true, an empty selection is returned. + recursive : bool, default False + Whether to recurse into subdirectories. + + Examples + -------- + List the contents of a directory and subdirectories: + + >>> selector_1 = fs.FileSelector(local_path, recursive=True) + >>> local.get_file_info(selector_1) # doctest: +SKIP + [, + , + ] + + List only the contents of the base directory: + + >>> selector_2 = fs.FileSelector(local_path) + >>> local.get_file_info(selector_2) # doctest: +SKIP + [, + ] + + Return empty selection if the directory doesn't exist: + + >>> selector_not_found = fs.FileSelector( + ... local_path + "/missing", recursive=True, allow_not_found=True + ... ) + >>> local.get_file_info(selector_not_found) + [] + """ + + base_dir: str + allow_not_found: bool + recursive: bool + def __init__(self, base_dir: str, allow_not_found: bool = False, recursive: bool = False): ... + +class FileSystem(_Weakrefable): + """ + Abstract file system API. + """ + + @classmethod + def from_uri(cls, uri: str) -> tuple[Self, str]: + """ + Create a new FileSystem from URI or Path. + + Recognized URI schemes are "file", "mock", "s3fs", "gs", "gcs", "hdfs" and "viewfs". + In addition, the argument can be a pathlib.Path object, or a string + describing an absolute local path. + + Parameters + ---------- + uri : string + URI-based path, for example: file:///some/local/path. + + Returns + ------- + tuple of (FileSystem, str path) + With (filesystem, path) tuple where path is the abstract path + inside the FileSystem instance. 
+ + Examples + -------- + Create a new FileSystem subclass from a URI: + + >>> uri = "file:///{}/pyarrow-fs-example.dat".format(local_path) + >>> local_new, path_new = fs.FileSystem.from_uri(uri) + >>> local_new + >> path_new + '/.../pyarrow-fs-example.dat' + + Or from a s3 bucket: + + >>> fs.FileSystem.from_uri("s3://usgs-landsat/collection02/") + (, 'usgs-landsat/collection02') + """ + def equals(self, other: FileSystem) -> bool: + """ + Parameters + ---------- + other : pyarrow.fs.FileSystem + + Returns + ------- + bool + """ + @property + def type_name(self) -> str: + """ + The filesystem's type name. + """ + def get_file_info(self, paths_or_selector: str | FileSelector | list[str]) -> FileInfo | list[FileInfo]: + """ + Get info for the given files. + + Any symlink is automatically dereferenced, recursively. A non-existing + or unreachable file returns a FileStat object and has a FileType of + value NotFound. An exception indicates a truly exceptional condition + (low-level I/O error, etc.). + + Parameters + ---------- + paths_or_selector : FileSelector, path-like or list of path-likes + Either a selector object, a path-like object or a list of + path-like objects. The selector's base directory will not be + part of the results, even if it exists. If it doesn't exist, + use `allow_not_found`. + + Returns + ------- + FileInfo or list of FileInfo + Single FileInfo object is returned for a single path, otherwise + a list of FileInfo objects is returned. + + Examples + -------- + >>> local + + >>> local.get_file_info("/{}/pyarrow-fs-example.dat".format(local_path)) + + """ + def create_dir(self, path: str, *, recursive: bool = True) -> None: + """ + Create a directory and subdirectories. + + This function succeeds if the directory already exists. + + Parameters + ---------- + path : str + The path of the new directory. + recursive : bool, default True + Create nested directories as well. + """ + def delete_dir(self, path: str) -> None: + """ + Delete a directory and its contents, recursively. + + Parameters + ---------- + path : str + The path of the directory to be deleted. + """ + def delete_dir_contents( + self, path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False + ) -> None: + """ + Delete a directory's contents, recursively. + + Like delete_dir, but doesn't delete the directory itself. + + Parameters + ---------- + path : str + The path of the directory to be deleted. + accept_root_dir : boolean, default False + Allow deleting the root directory's contents + (if path is empty or "/") + missing_dir_ok : boolean, default False + If False then an error is raised if path does + not exist + """ + def move(self, src: str, dest: str) -> None: + """ + Move / rename a file or directory. + + If the destination exists: + - if it is a non-empty directory, an error is returned + - otherwise, if it has the same type as the source, it is replaced + - otherwise, behavior is unspecified (implementation-dependent). + + Parameters + ---------- + src : str + The path of the file or the directory to be moved. + dest : str + The destination path where the file or directory is moved to. 
+ + Examples + -------- + Create a new folder with a file: + + >>> local.create_dir("/tmp/other_dir") + >>> local.copy_file(path, "/tmp/move_example.dat") + + Move the file: + + >>> local.move("/tmp/move_example.dat", "/tmp/other_dir/move_example_2.dat") + + Inspect the file info: + + >>> local.get_file_info("/tmp/other_dir/move_example_2.dat") + + >>> local.get_file_info("/tmp/move_example.dat") + + + Delete the folder: + >>> local.delete_dir("/tmp/other_dir") + """ + def copy_file(self, src: str, dest: str) -> None: + """ + Copy a file. + + If the destination exists and is a directory, an error is returned. + Otherwise, it is replaced. + + Parameters + ---------- + src : str + The path of the file to be copied from. + dest : str + The destination path where the file is copied to. + + Examples + -------- + >>> local.copy_file(path, local_path + "/pyarrow-fs-example_copy.dat") + + Inspect the file info: + + >>> local.get_file_info(local_path + "/pyarrow-fs-example_copy.dat") + + >>> local.get_file_info(path) + + """ + def delete_file(self, path: str) -> None: + """ + Delete a file. + + Parameters + ---------- + path : str + The path of the file to be deleted. + """ + def open_input_file(self, path: str) -> NativeFile: + """ + Open an input file for random access reading. + + Parameters + ---------- + path : str + The source to open for reading. + + Returns + ------- + stream : NativeFile + + Examples + -------- + Print the data from the file with `open_input_file()`: + + >>> with local.open_input_file(path) as f: + ... print(f.readall()) + b'data' + """ + def open_input_stream( + self, path: str, compression: str | None = "detect", buffer_size: int | None = None + ) -> NativeFile: + """ + Open an input stream for sequential reading. + + Parameters + ---------- + path : str + The source to open for reading. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly decompression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary read buffer. + + Returns + ------- + stream : NativeFile + + Examples + -------- + Print the data from the file with `open_input_stream()`: + + >>> with local.open_input_stream(path) as f: + ... print(f.readall()) + b'data' + """ + def open_output_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, str] | None = None, + ) -> NativeFile: + """ + Open an output stream for sequential writing. + + If the target already exists, existing data is truncated. + + Parameters + ---------- + path : str + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + metadata : dict optional, default None + If not None, a mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). 
+ Unsupported metadata keys will be ignored. + + Returns + ------- + stream : NativeFile + + Examples + -------- + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream(path) as stream: + ... stream.write(b"data") + 4 + """ + def open_append_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, str] | None = None, + ): + """ + Open an output stream for appending. + + If the target doesn't exist, a new empty file is created. + + .. note:: + Some filesystem implementations do not support efficient + appending to an existing file, in which case this method will + raise NotImplementedError. + Consider writing to multiple files (using e.g. the dataset layer) + instead. + + Parameters + ---------- + path : str + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + metadata : dict optional, default None + If not None, a mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + Unsupported metadata keys will be ignored. + + Returns + ------- + stream : NativeFile + + Examples + -------- + Append new data to a FileSystem subclass with nonempty file: + + >>> with local.open_append_stream(path) as f: + ... f.write(b"+newly added") + 12 + + Print out the content to the file: + + >>> with local.open_input_file(path) as f: + ... print(f.readall()) + b'data+newly added' + """ + def normalize_path(self, path: str) -> str: + """ + Normalize filesystem path. + + Parameters + ---------- + path : str + The path to normalize + + Returns + ------- + normalized_path : str + The normalized path + """ + +class LocalFileSystem(FileSystem): + """ + A FileSystem implementation accessing files on the local machine. + + Details such as symlinks are abstracted away (symlinks are always followed, + except when deleting an entry). + + Parameters + ---------- + use_mmap : bool, default False + Whether open_input_stream and open_input_file should return + a mmap'ed file or a regular file. + + Examples + -------- + Create a FileSystem object with LocalFileSystem constructor: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> local + + + and write data on to the file: + + >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: + ... stream.write(b"data") + 4 + >>> with local.open_input_stream("/tmp/local_fs.dat") as stream: + ... print(stream.readall()) + b'data' + + Create a FileSystem object inferred from a URI of the saved file: + + >>> local_new, path = fs.LocalFileSystem().from_uri("/tmp/local_fs.dat") + >>> local_new + >> path + '/tmp/local_fs.dat' + + Check if FileSystems `local` and `local_new` are equal: + + >>> local.equals(local_new) + True + + Compare two different FileSystems: + + >>> local2 = fs.LocalFileSystem(use_mmap=True) + >>> local.equals(local2) + False + + Copy a file and print out the data: + + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/local_fs-copy.dat") + >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as stream: + ... 
print(stream.readall()) + b'data' + + Open an output stream for appending, add text and print the new data: + + >>> with local.open_append_stream("/tmp/local_fs-copy.dat") as f: + ... f.write(b"+newly added") + 12 + + >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as f: + ... print(f.readall()) + b'data+newly added' + + Create a directory, copy a file into it and then delete the whole directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder") + + >>> local.delete_dir("/tmp/new_folder") + >>> local.get_file_info("/tmp/new_folder") + + + Create a directory, copy a file into it and then delete + the content of the directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + >>> local.delete_dir_contents("/tmp/new_folder") + >>> local.get_file_info("/tmp/new_folder") + + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + + Create a directory, copy a file into it and then delete + the file from the directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.delete_file("/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + >>> local.get_file_info("/tmp/new_folder") + + + Move the file: + + >>> local.move("/tmp/local_fs-copy.dat", "/tmp/new_folder/local_fs-copy.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs-copy.dat") + + >>> local.get_file_info("/tmp/local_fs-copy.dat") + + + To finish delete the file left: + >>> local.delete_file("/tmp/local_fs.dat") + """ + + def __init__(self, *, use_mmap: bool = False) -> None: ... + +class SubTreeFileSystem(FileSystem): + """ + Delegates to another implementation after prepending a fixed base path. + + This is useful to expose a logical view of a subtree of a filesystem, + for example a directory in a LocalFileSystem. + + Note, that this makes no security guarantee. For example, symlinks may + allow to "escape" the subtree and access other parts of the underlying + filesystem. + + Parameters + ---------- + base_path : str + The root of the subtree. + base_fs : FileSystem + FileSystem object the operations delegated to. + + Examples + -------- + Create a LocalFileSystem instance: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: + ... stream.write(b"data") + 4 + + Create a directory and a SubTreeFileSystem instance: + + >>> local.create_dir("/tmp/sub_tree") + >>> subtree = fs.SubTreeFileSystem("/tmp/sub_tree", local) + + Write data into the existing file: + + >>> with subtree.open_append_stream("sub_tree_fs.dat") as f: + ... f.write(b"+newly added") + 12 + + Print out the attributes: + + >>> subtree.base_fs + + >>> subtree.base_path + '/tmp/sub_tree/' + + Get info for the given directory or given file: + + >>> subtree.get_file_info("") + + >>> subtree.get_file_info("sub_tree_fs.dat") + + + Delete the file and directory: + + >>> subtree.delete_file("sub_tree_fs.dat") + >>> local.delete_dir("/tmp/sub_tree") + >>> local.delete_file("/tmp/local_fs.dat") + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ + def __init__(self, base_path: str, base_fs: FileSystem): ... + @property + def base_path(self) -> str: ... 
+ @property + def base_fs(self) -> FileSystem: ... + +class _MockFileSystem(FileSystem): + def __init__(self, current_time: dt.datetime | None = None) -> None: ... + +class PyFileSystem(FileSystem): + """ + A FileSystem with behavior implemented in Python. + + Parameters + ---------- + handler : FileSystemHandler + The handler object implementing custom filesystem behavior. + + Examples + -------- + Create an fsspec-based filesystem object for GitHub: + + >>> from fsspec.implementations import github + >>> gfs = github.GithubFileSystem("apache", "arrow") # doctest: +SKIP + + Get a PyArrow FileSystem object: + + >>> from pyarrow.fs import PyFileSystem, FSSpecHandler + >>> pa_fs = PyFileSystem(FSSpecHandler(gfs)) # doctest: +SKIP + + Use :func:`~pyarrow.fs.FileSystem` functionality ``get_file_info()``: + + >>> pa_fs.get_file_info("README.md") # doctest: +SKIP + + """ + def __init__(self, handler: FileSystemHandler) -> None: ... + @property + def handler(self) -> FileSystemHandler: + """ + The filesystem's underlying handler. + + Returns + ------- + handler : FileSystemHandler + """ + +class FileSystemHandler(ABC): + """ + An abstract class exposing methods to implement PyFileSystem's behavior. + """ + @abstractmethod + def get_type_name(self) -> str: + """ + Implement PyFileSystem.type_name. + """ + @abstractmethod + def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: + """ + Implement PyFileSystem.get_file_info(paths). + + Parameters + ---------- + paths : list of str + paths for which we want to retrieve the info. + """ + @abstractmethod + def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: + """ + Implement PyFileSystem.get_file_info(selector). + + Parameters + ---------- + selector : FileSelector + selector for which we want to retrieve the info. + """ + + @abstractmethod + def create_dir(self, path: str, recursive: bool) -> None: + """ + Implement PyFileSystem.create_dir(...). + + Parameters + ---------- + path : str + path of the directory. + recursive : bool + if the parent directories should be created too. + """ + @abstractmethod + def delete_dir(self, path: str) -> None: + """ + Implement PyFileSystem.delete_dir(...). + + Parameters + ---------- + path : str + path of the directory. + """ + @abstractmethod + def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: + """ + Implement PyFileSystem.delete_dir_contents(...). + + Parameters + ---------- + path : str + path of the directory. + missing_dir_ok : bool + if False an error should be raised if path does not exist + """ + @abstractmethod + def delete_root_dir_contents(self) -> None: + """ + Implement PyFileSystem.delete_dir_contents("/", accept_root_dir=True). + """ + @abstractmethod + def delete_file(self, path: str) -> None: + """ + Implement PyFileSystem.delete_file(...). + + Parameters + ---------- + path : str + path of the file. + """ + @abstractmethod + def move(self, src: str, dest: str) -> None: + """ + Implement PyFileSystem.move(...). + + Parameters + ---------- + src : str + path of what should be moved. + dest : str + path of where it should be moved to. + """ + + @abstractmethod + def copy_file(self, src: str, dest: str) -> None: + """ + Implement PyFileSystem.copy_file(...). + + Parameters + ---------- + src : str + path of what should be copied. + dest : str + path of where it should be copied to. + """ + @abstractmethod + def open_input_stream(self, path: str) -> NativeFile: + """ + Implement PyFileSystem.open_input_stream(...). 
+ + Parameters + ---------- + path : str + path of what should be opened. + """ + @abstractmethod + def open_input_file(self, path: str) -> NativeFile: + """ + Implement PyFileSystem.open_input_file(...). + + Parameters + ---------- + path : str + path of what should be opened. + """ + @abstractmethod + def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: + """ + Implement PyFileSystem.open_output_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + """ + + @abstractmethod + def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: + """ + Implement PyFileSystem.open_append_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + """ + @abstractmethod + def normalize_path(self, path: str) -> str: + """ + Implement PyFileSystem.normalize_path(...). + + Parameters + ---------- + path : str + path of what should be normalized. + """ diff --git a/python/pyarrow-stubs/_gcsfs.pyi b/python/pyarrow-stubs/_gcsfs.pyi new file mode 100644 index 00000000000..4fc7ea68e48 --- /dev/null +++ b/python/pyarrow-stubs/_gcsfs.pyi @@ -0,0 +1,83 @@ +import datetime as dt + +from ._fs import FileSystem +from .lib import KeyValueMetadata + +class GcsFileSystem(FileSystem): + """ + Google Cloud Storage (GCS) backed FileSystem implementation + + By default uses the process described in https://google.aip.dev/auth/4110 + to resolve credentials. If not running on Google Cloud Platform (GCP), + this generally requires the environment variable + GOOGLE_APPLICATION_CREDENTIALS to point to a JSON file + containing credentials. + + Note: GCS buckets are special and the operations available on them may be + limited or more expensive than expected compared to local file systems. + + Note: When pickling a GcsFileSystem that uses default credentials, resolution + credentials are not stored in the serialized data. Therefore, when unpickling + it is assumed that the necessary credentials are in place for the target + process. + + Parameters + ---------- + anonymous : boolean, default False + Whether to connect anonymously. + If true, will not attempt to look up credentials using standard GCP + configuration methods. + access_token : str, default None + GCP access token. If provided, temporary credentials will be fetched by + assuming this role; also, a `credential_token_expiration` must be + specified as well. + target_service_account : str, default None + An optional service account to try to impersonate when accessing GCS. This + requires the specified credential user or service account to have the necessary + permissions. + credential_token_expiration : datetime, default None + Expiration for credential generated with an access token. Must be specified + if `access_token` is specified. + default_bucket_location : str, default 'US' + GCP region to create buckets in. + scheme : str, default 'https' + GCS connection transport scheme. + endpoint_override : str, default None + Override endpoint with a connect string such as "localhost:9000" + default_metadata : mapping or pyarrow.KeyValueMetadata, default None + Default metadata for `open_output_stream`. 
This will be ignored if + non-empty metadata is passed to `open_output_stream`. + retry_time_limit : timedelta, default None + Set the maximum amount of time the GCS client will attempt to retry + transient errors. Subsecond granularity is ignored. + project_id : str, default None + The GCP project identifier to use for creating buckets. + If not set, the library uses the GOOGLE_CLOUD_PROJECT environment + variable. Most I/O operations do not need a project id, only applications + that create new buckets need a project id. + """ + + def __init__( + self, + *, + anonymous: bool = False, + access_token: str | None = None, + target_service_account: str | None = None, + credential_token_expiration: dt.datetime | None = None, + default_bucket_location: str = "US", + scheme: str = "https", + endpoint_override: str | None = None, + default_metadata: dict | KeyValueMetadata | None = None, + retry_time_limit: dt.timedelta | None = None, + project_id: str | None = None, + ): ... + @property + def default_bucket_location(self) -> str: + """ + The GCP location this filesystem will write to. + """ + @property + def project_id(self) -> str: + """ + The GCP project id this filesystem will use. + """ diff --git a/python/pyarrow-stubs/_hdfs.pyi b/python/pyarrow-stubs/_hdfs.pyi new file mode 100644 index 00000000000..200f669379b --- /dev/null +++ b/python/pyarrow-stubs/_hdfs.pyi @@ -0,0 +1,75 @@ +from _typeshed import StrPath + +from ._fs import FileSystem + +class HadoopFileSystem(FileSystem): + """ + HDFS backed FileSystem implementation + + Parameters + ---------- + host : str + HDFS host to connect to. Set to "default" for fs.defaultFS from + core-site.xml. + port : int, default 8020 + HDFS port to connect to. Set to 0 for default or logical (HA) nodes. + user : str, default None + Username when connecting to HDFS; None implies login user. + replication : int, default 3 + Number of copies each block will have. + buffer_size : int, default 0 + If 0, no buffering will happen otherwise the size of the temporary read + and write buffer. + default_block_size : int, default None + None means the default configuration for HDFS, a typical block size is + 128 MB. + kerb_ticket : string or path, default None + If not None, the path to the Kerberos ticket cache. + extra_conf : dict, default None + Extra key/value pairs for configuration; will override any + hdfs-site.xml properties. + + Examples + -------- + >>> from pyarrow import fs + >>> hdfs = fs.HadoopFileSystem( + ... host, port, user=user, kerb_ticket=ticket_cache_path + ... ) # doctest: +SKIP + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ + def __init__( + self, + host: str, + port: int = 8020, + *, + user: str | None = None, + replication: int = 3, + buffer_size: int = 0, + default_block_size: int | None = None, + kerb_ticket: StrPath | None = None, + extra_conf: dict | None = None, + ): ... + @staticmethod + def from_uri(uri: str) -> HadoopFileSystem: # type: ignore[override] + """ + Instantiate HadoopFileSystem object from an URI string. + + The following two calls are equivalent + + * ``HadoopFileSystem.from_uri('hdfs://localhost:8020/?user=test\ +&replication=1')`` + * ``HadoopFileSystem('localhost', port=8020, user='test', \ +replication=1)`` + + Parameters + ---------- + uri : str + A string URI describing the connection to HDFS. + In order to change the user, replication, buffer_size or + default_block_size pass the values as query parts. 
+ + Returns + ------- + HadoopFileSystem + """ diff --git a/python/pyarrow-stubs/_json.pyi b/python/pyarrow-stubs/_json.pyi new file mode 100644 index 00000000000..43d2ae83cd8 --- /dev/null +++ b/python/pyarrow-stubs/_json.pyi @@ -0,0 +1,169 @@ +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable + +class ReadOptions(_Weakrefable): + """ + Options for reading JSON files. + + Parameters + ---------- + use_threads : bool, optional (default True) + Whether to use multiple threads to accelerate reading + block_size : int, optional + How much bytes to process at a time from the input stream. + This will determine multi-threading granularity as well as + the size of individual chunks in the Table. + """ + + use_threads: bool + """ + Whether to use multiple threads to accelerate reading. + """ + block_size: int + """ + How much bytes to process at a time from the input stream. + + This will determine multi-threading granularity as well as the size of + individual chunks in the Table. + """ + def __init__(self, use_threads: bool | None = None, block_size: int | None = None): ... + def equals(self, other: ReadOptions) -> bool: + """ + Parameters + ---------- + other : pyarrow.json.ReadOptions + + Returns + ------- + bool + """ + +class ParseOptions(_Weakrefable): + """ + Options for parsing JSON files. + + Parameters + ---------- + explicit_schema : Schema, optional (default None) + Optional explicit schema (no type inference, ignores other fields). + newlines_in_values : bool, optional (default False) + Whether objects may be printed across multiple lines (for example + pretty printed). If false, input must end with an empty line. + unexpected_field_behavior : str, default "infer" + How JSON fields outside of explicit_schema (if given) are treated. + + Possible behaviors: + + - "ignore": unexpected JSON fields are ignored + - "error": error out on unexpected JSON fields + - "infer": unexpected JSON fields are type-inferred and included in + the output + """ + + explicit_schema: Schema + """ + Optional explicit schema (no type inference, ignores other fields) + """ + newlines_in_values: bool + """ + Whether newline characters are allowed in JSON values. + Setting this to True reduces the performance of multi-threaded + JSON reading. + """ + unexpected_field_behavior: Literal["ignore", "error", "infer"] + """ + How JSON fields outside of explicit_schema (if given) are treated. + + Possible behaviors: + + - "ignore": unexpected JSON fields are ignored + - "error": error out on unexpected JSON fields + - "infer": unexpected JSON fields are type-inferred and included in + the output + + Set to "infer" by default. + """ + def __init__( + self, + explicit_schema: Schema | None = None, + newlines_in_values: bool | None = None, + unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer", + ): ... + def equals(self, other: ParseOptions) -> bool: + """ + Parameters + ---------- + other : pyarrow.json.ParseOptions + + Returns + ------- + bool + """ + +class JSONStreamingReader(RecordBatchReader): + """An object that reads record batches incrementally from a JSON file. + + Should not be instantiated directly by user code. + """ + +def read_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> Table: + """ + Read a Table from a stream of JSON data. 
+ + Parameters + ---------- + input_file : str, path or file-like object + The location of JSON data. Currently only the line-delimited JSON + format is supported. + read_options : pyarrow.json.ReadOptions, optional + Options for the JSON reader (see ReadOptions constructor for defaults). + parse_options : pyarrow.json.ParseOptions, optional + Options for the JSON parser + (see ParseOptions constructor for defaults). + memory_pool : MemoryPool, optional + Pool to allocate Table memory from. + + Returns + ------- + :class:`pyarrow.Table` + Contents of the JSON file as a in-memory table. + """ + +def open_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> JSONStreamingReader: + """ + Open a streaming reader of JSON data. + + Reading using this function is always single-threaded. + + Parameters + ---------- + input_file : string, path or file-like object + The location of JSON data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. + read_options : pyarrow.json.ReadOptions, optional + Options for the JSON reader (see pyarrow.json.ReadOptions constructor + for defaults) + parse_options : pyarrow.json.ParseOptions, optional + Options for the JSON parser + (see pyarrow.json.ParseOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate RecordBatch memory from + + Returns + ------- + :class:`pyarrow.json.JSONStreamingReader` + """ diff --git a/python/pyarrow-stubs/_orc.pyi b/python/pyarrow-stubs/_orc.pyi new file mode 100644 index 00000000000..71bf0dde9ba --- /dev/null +++ b/python/pyarrow-stubs/_orc.pyi @@ -0,0 +1,56 @@ +from typing import IO, Literal + +from .lib import ( + Buffer, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) + +class ORCReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open(self, source: str | NativeFile | Buffer, use_memory_map: bool = True): ... + def metadata(self) -> KeyValueMetadata: ... + def schema(self) -> Schema: ... + def nrows(self) -> int: ... + def nstripes(self) -> int: ... + def file_version(self) -> str: ... + def software_version(self) -> str: ... + def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + def compression_size(self) -> int: ... + def row_index_stride(self) -> int: ... + def writer(self) -> str: ... + def writer_version(self) -> str: ... + def nstripe_statistics(self) -> int: ... + def content_length(self) -> int: ... + def stripe_statistics_length(self) -> int: ... + def file_footer_length(self) -> int: ... + def file_postscript_length(self) -> int: ... + def file_length(self) -> int: ... + def serialized_file_tail(self) -> int: ... + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... + def read(self, columns: list[str] | None = None) -> Table: ... 
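
The JSON reading entry points stubbed above (read_json and open_json, together with ReadOptions and ParseOptions) are easiest to follow with a short usage sketch. The file name data.jsonl and the explicit schema below are illustrative assumptions, not part of the patch:

import pyarrow as pa
import pyarrow.json as pj

read_opts = pj.ReadOptions(use_threads=True, block_size=1 << 20)
parse_opts = pj.ParseOptions(
    explicit_schema=pa.schema([("id", pa.int64()), ("name", pa.string())]),
    unexpected_field_behavior="ignore",
)

# One-shot read of line-delimited JSON into a Table.
table = pj.read_json("data.jsonl", read_options=read_opts, parse_options=parse_opts)

# Incremental read via the streaming reader.
reader = pj.open_json("data.jsonl", read_options=read_opts, parse_options=parse_opts)
for batch in reader:  # yields RecordBatch objects
    print(batch.num_rows)

open_json returns a JSONStreamingReader, so batches arrive incrementally instead of materializing a single Table.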
+
+class ORCWriter(_Weakrefable):
+    def open(
+        self,
+        where: str | NativeFile | IO,
+        *,
+        file_version: str | None = None,
+        batch_size: int | None = None,
+        stripe_size: int | None = None,
+        compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] | None = None,
+        compression_block_size: int | None = None,
+        compression_strategy: Literal["COMPRESSION", "SPEED"] | None = None,
+        row_index_stride: int | None = None,
+        padding_tolerance: float | None = None,
+        dictionary_key_size_threshold: float | None = None,
+        bloom_filter_columns: list[int] | None = None,
+        bloom_filter_fpp: float | None = None,
+    ) -> None: ...
+    def write(self, table: Table) -> None: ...
+    def close(self) -> None: ...
diff --git a/python/pyarrow-stubs/_parquet.pyi b/python/pyarrow-stubs/_parquet.pyi
new file mode 100644
index 00000000000..a9187df0428
--- /dev/null
+++ b/python/pyarrow-stubs/_parquet.pyi
@@ -0,0 +1,445 @@
+from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict
+
+from _typeshed import StrPath
+
+from ._stubs_typing import Order
+from .lib import (
+    Buffer,
+    ChunkedArray,
+    KeyValueMetadata,
+    MemoryPool,
+    NativeFile,
+    RecordBatch,
+    Schema,
+    Table,
+    _Weakrefable,
+)
+
+_PhysicalType: TypeAlias = Literal[
+    "BOOLEAN",
+    "INT32",
+    "INT64",
+    "INT96",
+    "FLOAT",
+    "DOUBLE",
+    "BYTE_ARRAY",
+    "FIXED_LEN_BYTE_ARRAY",
+    "UNKNOWN",
+]
+_LogicTypeName: TypeAlias = Literal[
+    "UNDEFINED",
+    "STRING",
+    "MAP",
+    "LIST",
+    "ENUM",
+    "DECIMAL",
+    "DATE",
+    "TIME",
+    "TIMESTAMP",
+    "INT",
+    "FLOAT16",
+    "JSON",
+    "BSON",
+    "UUID",
+    "NONE",
+    "UNKNOWN",
+]
+_ConvertedType: TypeAlias = Literal[
+    "NONE",
+    "UTF8",
+    "MAP",
+    "MAP_KEY_VALUE",
+    "LIST",
+    "ENUM",
+    "DECIMAL",
+    "DATE",
+    "TIME_MILLIS",
+    "TIME_MICROS",
+    "TIMESTAMP_MILLIS",
+    "TIMESTAMP_MICROS",
+    "UINT_8",
+    "UINT_16",
+    "UINT_32",
+    "UINT_64",
+    "INT_8",
+    "INT_16",
+    "INT_32",
+    "INT_64",
+    "JSON",
+    "BSON",
+    "INTERVAL",
+    "UNKNOWN",
+]
+_Encoding: TypeAlias = Literal[
+    "PLAIN",
+    "PLAIN_DICTIONARY",
+    "RLE",
+    "BIT_PACKED",
+    "DELTA_BINARY_PACKED",
+    "DELTA_LENGTH_BYTE_ARRAY",
+    "DELTA_BYTE_ARRAY",
+    "RLE_DICTIONARY",
+    "BYTE_STREAM_SPLIT",
+    "UNKNOWN",
+]
+_Compression: TypeAlias = Literal[
+    "UNCOMPRESSED",
+    "SNAPPY",
+    "GZIP",
+    "LZO",
+    "BROTLI",
+    "LZ4",
+    "ZSTD",
+    "UNKNOWN",
+]
+
+class _Statistics(TypedDict):
+    has_min_max: bool
+    min: Any | None
+    max: Any | None
+    null_count: int | None
+    distinct_count: int | None
+    num_values: int
+    physical_type: _PhysicalType
+
+class Statistics(_Weakrefable):
+    def to_dict(self) -> _Statistics: ...
+    def equals(self, other: Statistics) -> bool: ...
+    @property
+    def has_min_max(self) -> bool: ...
+    @property
+    def has_null_count(self) -> bool: ...
+    @property
+    def has_distinct_count(self) -> bool: ...
+    @property
+    def min_raw(self) -> Any | None: ...
+    @property
+    def max_raw(self) -> Any | None: ...
+    @property
+    def min(self) -> Any | None: ...
+    @property
+    def max(self) -> Any | None: ...
+    @property
+    def null_count(self) -> int | None: ...
+    @property
+    def distinct_count(self) -> int | None: ...
+    @property
+    def num_values(self) -> int: ...
+    @property
+    def physical_type(self) -> _PhysicalType: ...
+    @property
+    def logical_type(self) -> ParquetLogicalType: ...
+    @property
+    def converted_type(self) -> _ConvertedType | None: ...
+
+class ParquetLogicalType(_Weakrefable):
+    def to_json(self) -> str: ...
+    @property
+    def type(self) -> _LogicTypeName: ...
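
The Statistics and ParquetLogicalType stubs above describe objects normally reached through the public pyarrow.parquet metadata accessors. A minimal sketch, assuming an existing file named example.parquet (the file name is an illustration, not part of the patch):

import pyarrow.parquet as pq

meta = pq.read_metadata("example.parquet")   # FileMetaData
col = meta.row_group(0).column(0)            # ColumnChunkMetaData
print(col.physical_type, col.compression)

stats = col.statistics                       # Statistics, or None if not written
if stats is not None and stats.has_min_max:
    print(stats.min, stats.max)
if stats is not None and stats.has_null_count:
    print(stats.null_count)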
+ +class _ColumnChunkMetaData(TypedDict): + file_offset: int + file_path: str | None + physical_type: _PhysicalType + num_values: int + path_in_schema: str + is_stats_set: bool + statistics: Statistics | None + compression: _Compression + encodings: tuple[_Encoding, ...] + has_dictionary_page: bool + dictionary_page_offset: int | None + data_page_offset: int + total_compressed_size: int + total_uncompressed_size: int + +class ColumnChunkMetaData(_Weakrefable): + def to_dict(self) -> _ColumnChunkMetaData: ... + def equals(self, other: ColumnChunkMetaData) -> bool: ... + @property + def file_offset(self) -> int: ... + @property + def file_path(self) -> str | None: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def num_values(self) -> int: ... + @property + def path_in_schema(self) -> str: ... + @property + def is_stats_set(self) -> bool: ... + @property + def statistics(self) -> Statistics | None: ... + @property + def compression(self) -> _Compression: ... + @property + def encodings(self) -> tuple[_Encoding, ...]: ... + @property + def has_dictionary_page(self) -> bool: ... + @property + def dictionary_page_offset(self) -> int | None: ... + @property + def data_page_offset(self) -> int: ... + @property + def has_index_page(self) -> bool: ... + @property + def index_page_offset(self) -> int: ... + @property + def total_compressed_size(self) -> int: ... + @property + def total_uncompressed_size(self) -> int: ... + @property + def has_offset_index(self) -> bool: ... + @property + def has_column_index(self) -> bool: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... + +class _SortingColumn(TypedDict): + column_index: int + descending: bool + nulls_first: bool + +class SortingColumn: + def __init__( + self, column_index: int, descending: bool = False, nulls_first: bool = False + ) -> None: ... + @classmethod + def from_ordering( + cls, + schema: Schema, + sort_keys: Sequence[tuple[str, Order]], + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> tuple[SortingColumn, ...]: ... + @staticmethod + def to_ordering( + schema: Schema, sorting_columns: tuple[SortingColumn, ...] + ) -> tuple[Sequence[tuple[str, Order]], Literal["at_start", "at_end"]]: ... + def __hash__(self) -> int: ... + @property + def column_index(self) -> int: ... + @property + def descending(self) -> bool: ... + @property + def nulls_first(self) -> bool: ... + def to_dict(self) -> _SortingColumn: ... + +class _RowGroupMetaData(TypedDict): + num_columns: int + num_rows: int + total_byte_size: int + columns: list[ColumnChunkMetaData] + sorting_columns: list[SortingColumn] + +class RowGroupMetaData(_Weakrefable): + def __init__(self, parent: FileMetaData, index: int) -> None: ... + def equals(self, other: RowGroupMetaData) -> bool: ... + def column(self, i: int) -> ColumnChunkMetaData: ... + def to_dict(self) -> _RowGroupMetaData: ... + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... + @property + def sorting_columns(self) -> list[SortingColumn]: ... + +class _FileMetaData(TypedDict): + created_by: str + num_columns: int + num_rows: int + num_row_groups: int + format_version: str + serialized_size: int + +class FileMetaData(_Weakrefable): + def __hash__(self) -> int: ... + def to_dict(self) -> _FileMetaData: ... + def equals(self, other: FileMetaData) -> bool: ... + @property + def schema(self) -> ParquetSchema: ... + @property + def serialized_size(self) -> int: ... 
+
+    @property
+    def num_columns(self) -> int: ...
+    @property
+    def num_rows(self) -> int: ...
+    @property
+    def num_row_groups(self) -> int: ...
+    @property
+    def format_version(self) -> str: ...
+    @property
+    def created_by(self) -> str: ...
+    @property
+    def metadata(self) -> dict[bytes, bytes] | None: ...
+    def row_group(self, i: int) -> RowGroupMetaData: ...
+    def set_file_path(self, path: str) -> None: ...
+    def append_row_groups(self, other: FileMetaData) -> None: ...
+    def write_metadata_file(self, where: StrPath | Buffer | NativeFile | IO) -> None: ...
+
+class ParquetSchema(_Weakrefable):
+    def __init__(self, container: FileMetaData) -> None: ...
+    def __getitem__(self, i: int) -> ColumnSchema: ...
+    def __hash__(self) -> int: ...
+    def __len__(self) -> int: ...
+    @property
+    def names(self) -> list[str]: ...
+    def to_arrow_schema(self) -> Schema: ...
+    def equals(self, other: ParquetSchema) -> bool: ...
+    def column(self, i: int) -> ColumnSchema: ...
+
+class ColumnSchema(_Weakrefable):
+    def __init__(self, schema: ParquetSchema, index: int) -> None: ...
+    def equals(self, other: ColumnSchema) -> bool: ...
+    @property
+    def name(self) -> str: ...
+    @property
+    def path(self) -> str: ...
+    @property
+    def max_definition_level(self) -> int: ...
+    @property
+    def max_repetition_level(self) -> int: ...
+    @property
+    def physical_type(self) -> _PhysicalType: ...
+    @property
+    def logical_type(self) -> ParquetLogicalType: ...
+    @property
+    def converted_type(self) -> _ConvertedType | None: ...
+    @property
+    def length(self) -> int | None: ...
+    @property
+    def precision(self) -> int | None: ...
+    @property
+    def scale(self) -> int | None: ...
+
+class ParquetReader(_Weakrefable):
+    def __init__(self, memory_pool: MemoryPool | None = None) -> None: ...
+    def open(
+        self,
+        source: StrPath | NativeFile | IO,
+        *,
+        use_memory_map: bool = False,
+        read_dictionary: Iterable[int] | Iterable[str] | None = None,
+        metadata: FileMetaData | None = None,
+        buffer_size: int = 0,
+        pre_buffer: bool = False,
+        coerce_int96_timestamp_unit: str | None = None,
+        decryption_properties: FileDecryptionProperties | None = None,
+        thrift_string_size_limit: int | None = None,
+        thrift_container_size_limit: int | None = None,
+        page_checksum_verification: bool = False,
+    ): ...
+    @property
+    def column_paths(self) -> list[str]: ...
+    @property
+    def metadata(self) -> FileMetaData: ...
+    @property
+    def schema_arrow(self) -> Schema: ...
+    @property
+    def num_row_groups(self) -> int: ...
+    def set_use_threads(self, use_threads: bool) -> None: ...
+    def set_batch_size(self, batch_size: int) -> None: ...
+    def iter_batches(
+        self,
+        batch_size: int,
+        row_groups: list[int],
+        column_indices: list[int] | None = None,
+        use_threads: bool = True,
+    ) -> Iterator[RecordBatch]: ...
+    def read_row_group(
+        self, i: int, column_indices: list[int] | None = None, use_threads: bool = True
+    ) -> Table: ...
+    def read_row_groups(
+        self,
+        row_groups: list[int],
+        column_indices: list[int] | None = None,
+        use_threads: bool = True,
+    ) -> Table: ...
+    def read_all(
+        self, column_indices: list[int] | None = None, use_threads: bool = True
+    ) -> Table: ...
+    def scan_contents(self, column_indices: list[int] | None = None, batch_size: int = 65536): ...
+    def column_name_idx(self, column_name: str) -> int: ...
+    def read_column(self, column_index: int) -> ChunkedArray: ...
+    def close(self) -> None: ...
+    @property
+    def closed(self) -> bool: ...
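
ParquetReader above is the low-level backing class; user code normally reaches it through pyarrow.parquet.ParquetFile. A minimal sketch of that public path (the file name and column names are illustrative assumptions):

import pyarrow.parquet as pq

pf = pq.ParquetFile("example.parquet")   # uses ParquetReader.open() under the hood
print(pf.schema_arrow)                   # Arrow schema derived from the Parquet schema
print(pf.metadata.num_row_groups)

table = pf.read_row_group(0)             # one row group as a Table
for batch in pf.iter_batches(batch_size=64_000, columns=["id", "name"]):
    print(batch.num_rows)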
+
+class ParquetWriter(_Weakrefable):
+    def __init__(
+        self,
+        where: StrPath | NativeFile | IO,
+        schema: Schema,
+        use_dictionary: bool | list[str] | None = None,
+        compression: _Compression | dict[str, _Compression] | None = None,
+        version: str | None = None,
+        write_statistics: bool | list[str] | None = None,
+        memory_pool: MemoryPool | None = None,
+        use_deprecated_int96_timestamps: bool = False,
+        coerce_timestamps: Literal["ms", "us"] | None = None,
+        data_page_size: int | None = None,
+        allow_truncated_timestamps: bool = False,
+        compression_level: int | dict[str, int] | None = None,
+        use_byte_stream_split: bool | list[str] = False,
+        column_encoding: _Encoding | dict[str, _Encoding] | None = None,
+        writer_engine_version: str | None = None,
+        data_page_version: str | None = None,
+        use_compliant_nested_type: bool = True,
+        encryption_properties: FileEncryptionProperties | None = None,
+        write_batch_size: int | None = None,
+        dictionary_pagesize_limit: int | None = None,
+        store_schema: bool = True,
+        write_page_index: bool = False,
+        write_page_checksum: bool = False,
+        sorting_columns: tuple[SortingColumn, ...] | None = None,
+        store_decimal_as_integer: bool = False,
+    ): ...
+    def close(self) -> None: ...
+    def write_table(self, table: Table, row_group_size: int | None = None) -> None: ...
+    def add_key_value_metadata(self, key_value_metadata: KeyValueMetadata) -> None: ...
+    @property
+    def metadata(self) -> FileMetaData: ...
+    @property
+    def use_dictionary(self) -> bool | list[str] | None: ...
+    @property
+    def use_deprecated_int96_timestamps(self) -> bool: ...
+    @property
+    def use_byte_stream_split(self) -> bool | list[str]: ...
+    @property
+    def column_encoding(self) -> _Encoding | dict[str, _Encoding] | None: ...
+    @property
+    def coerce_timestamps(self) -> Literal["ms", "us"] | None: ...
+    @property
+    def allow_truncated_timestamps(self) -> bool: ...
+    @property
+    def compression(self) -> _Compression | dict[str, _Compression] | None: ...
+    @property
+    def compression_level(self) -> int | dict[str, int] | None: ...
+    @property
+    def data_page_version(self) -> str | None: ...
+    @property
+    def use_compliant_nested_type(self) -> bool: ...
+    @property
+    def version(self) -> str | None: ...
+    @property
+    def write_statistics(self) -> bool | list[str] | None: ...
+    @property
+    def writer_engine_version(self) -> str: ...
+    @property
+    def row_group_size(self) -> int: ...
+    @property
+    def data_page_size(self) -> int: ...
+    @property
+    def encryption_properties(self) -> FileEncryptionProperties: ...
+    @property
+    def write_batch_size(self) -> int: ...
+    @property
+    def dictionary_pagesize_limit(self) -> int: ...
+    @property
+    def store_schema(self) -> bool: ...
+    @property
+    def store_decimal_as_integer(self) -> bool: ...
+
+class FileEncryptionProperties: ...
+class FileDecryptionProperties: ...
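
To round out the _parquet stubs, a small sketch of the writer side through the public pyarrow.parquet.ParquetWriter wrapper (the path, column data and options are illustrative assumptions):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# The writer stays open, so several tables/batches with the same schema
# can be appended as separate row groups before close().
with pq.ParquetWriter("example.parquet", table.schema, compression="ZSTD") as writer:
    writer.write_table(table)
    writer.write_table(table)  # second row group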
diff --git a/python/pyarrow-stubs/_parquet_encryption.pyi b/python/pyarrow-stubs/_parquet_encryption.pyi new file mode 100644 index 00000000000..c707edb844a --- /dev/null +++ b/python/pyarrow-stubs/_parquet_encryption.pyi @@ -0,0 +1,67 @@ +import datetime as dt + +from typing import Callable + +from ._parquet import FileDecryptionProperties, FileEncryptionProperties +from .lib import _Weakrefable + +class EncryptionConfiguration(_Weakrefable): + footer_key: str + column_keys: dict[str, list[str]] + encryption_algorithm: str + plaintext_footer: bool + double_wrapping: bool + cache_lifetime: dt.timedelta + internal_key_material: bool + data_key_length_bits: int + + def __init__( + self, + footer_key: str, + *, + column_keys: dict[str, str | list[str]] | None = None, + encryption_algorithm: str | None = None, + plaintext_footer: bool | None = None, + double_wrapping: bool | None = None, + cache_lifetime: dt.timedelta | None = None, + internal_key_material: bool | None = None, + data_key_length_bits: int | None = None, + ) -> None: ... + +class DecryptionConfiguration(_Weakrefable): + cache_lifetime: dt.timedelta + def __init__(self, *, cache_lifetime: dt.timedelta | None = None): ... + +class KmsConnectionConfig(_Weakrefable): + kms_instance_id: str + kms_instance_url: str + key_access_token: str + custom_kms_conf: dict[str, str] + def __init__( + self, + *, + kms_instance_id: str | None = None, + kms_instance_url: str | None = None, + key_access_token: str | None = None, + custom_kms_conf: dict[str, str] | None = None, + ) -> None: ... + def refresh_key_access_token(self, value: str) -> None: ... + +class KmsClient(_Weakrefable): + def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> str: ... + def unwrap_key(self, wrapped_key: str, master_key_identifier: str) -> str: ... + +class CryptoFactory(_Weakrefable): + def __init__(self, kms_client_factory: Callable[[KmsConnectionConfig], KmsClient]): ... + def file_encryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> FileEncryptionProperties: ... + def file_decryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + decryption_config: DecryptionConfiguration | None = None, + ) -> FileDecryptionProperties: ... + def remove_cache_entries_for_token(self, access_token: str) -> None: ... + def remove_cache_entries_for_all_tokens(self) -> None: ... diff --git a/python/pyarrow-stubs/_s3fs.pyi b/python/pyarrow-stubs/_s3fs.pyi new file mode 100644 index 00000000000..50f63cd7e32 --- /dev/null +++ b/python/pyarrow-stubs/_s3fs.pyi @@ -0,0 +1,75 @@ +import enum + +from typing import Literal, TypedDict +from typing_extensions import Required, NotRequired + +from ._fs import FileSystem +from .lib import KeyValueMetadata + +class _ProxyOptions(TypedDict): + scheme: Required[Literal["http", "https"]] + host: Required[str] + port: Required[int] + username: NotRequired[str] + password: NotRequired[str] + +class S3LogLevel(enum.IntEnum): + Off = enum.auto() + Fatal = enum.auto() + Error = enum.auto() + Warn = enum.auto() + Info = enum.auto() + Debug = enum.auto() + Trace = enum.auto() + +Off = S3LogLevel.Off +Fatal = S3LogLevel.Fatal +Error = S3LogLevel.Error +Warn = S3LogLevel.Warn +Info = S3LogLevel.Info +Debug = S3LogLevel.Debug +Trace = S3LogLevel.Trace + +def initialize_s3( + log_level: S3LogLevel = S3LogLevel.Fatal, num_event_loop_threads: int = 1 +) -> None: ... +def ensure_s3_initialized() -> None: ... +def finalize_s3() -> None: ...
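For reference, a minimal sketch of how the S3 helpers above and the S3FileSystem declared just below are typically used; the region, proxy host, and port are placeholders:

    from pyarrow import fs

    # Optional: S3 support is initialized lazily, but the log level can be set explicitly.
    fs.initialize_s3(fs.S3LogLevel.Error)

    # proxy_options takes the keys described by _ProxyOptions ('scheme', 'host', 'port', ...).
    s3 = fs.S3FileSystem(
        region="us-east-1",
        anonymous=True,
        proxy_options={"scheme": "http", "host": "proxy.internal", "port": 8080},
    )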
+def ensure_s3_finalized() -> None: ... +def resolve_s3_region(bucket: str) -> str: ... + +class S3RetryStrategy: + max_attempts: int + def __init__(self, max_attempts=3) -> None: ... + +class AwsStandardS3RetryStrategy(S3RetryStrategy): ... +class AwsDefaultS3RetryStrategy(S3RetryStrategy): ... + +class S3FileSystem(FileSystem): + def __init__( + self, + *, + access_key: str | None = None, + secret_key: str | None = None, + session_token: str | None = None, + anonymous: bool = False, + region: str | None = None, + request_timeout: float | None = None, + connect_timeout: float | None = None, + scheme: Literal["http", "https"] = "https", + endpoint_override: str | None = None, + background_writes: bool = True, + default_metadata: dict | KeyValueMetadata | None = None, + role_arn: str | None = None, + session_name: str | None = None, + external_id: str | None = None, + load_frequency: int = 900, + proxy_options: _ProxyOptions | str | None = None, + allow_bucket_creation: bool = False, + allow_bucket_deletion: bool = False, + check_directory_existence_before_creation: bool = False, + retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3), + force_virtual_addressing: bool = False, + ): ... + @property + def region(self) -> str: ... diff --git a/python/pyarrow-stubs/_substrait.pyi b/python/pyarrow-stubs/_substrait.pyi new file mode 100644 index 00000000000..ff226e9521b --- /dev/null +++ b/python/pyarrow-stubs/_substrait.pyi @@ -0,0 +1,39 @@ +from typing import Any, Callable + +from ._compute import Expression +from .lib import Buffer, RecordBatchReader, Schema, Table, _Weakrefable + +def run_query( + plan: Buffer | int, + *, + table_provider: Callable[[list[str], Schema], Table] | None = None, + use_threads: bool = True, +) -> RecordBatchReader: ... +def _parse_json_plan(plan: bytes) -> Buffer: ... + +class SubstraitSchema: + schema: Schema + expression: Expression + def __init__(self, schema: Schema, expression: Expression) -> None: ... + def to_pysubstrait(self) -> Any: ... + +def serialize_schema(schema: Schema) -> SubstraitSchema: ... +def deserialize_schema(buf: Buffer | bytes) -> Schema: ... +def serialize_expressions( + exprs: list[Expression], + names: list[str], + schema: Schema, + *, + allow_arrow_extensions: bool = False, +) -> Buffer: ... + +class BoundExpressions(_Weakrefable): + @property + def schema(self) -> Schema: ... + @property + def expressions(self) -> dict[str, Expression]: ... + @classmethod + def from_substrait(cls, message: Buffer | bytes) -> BoundExpressions: ... + +def deserialize_expressions(buf: Buffer | bytes) -> BoundExpressions: ... +def get_supported_functions() -> list[str]: ... diff --git a/python/pyarrow-stubs/acero.pyi b/python/pyarrow-stubs/acero.pyi new file mode 100644 index 00000000000..8a520bdc24a --- /dev/null +++ b/python/pyarrow-stubs/acero.pyi @@ -0,0 +1,85 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Literal + +from . import lib +from .compute import Expression, FunctionOptions + +_StrOrExpr: TypeAlias = str | Expression + +class Declaration(lib._Weakrefable): + def __init__( + self, + factory_name: str, + options: ExecNodeOptions, + inputs: list[Declaration] | None = None, + ) -> None: ... + @classmethod + def from_sequence(cls, decls: list[Declaration]) -> Self: ... 
+ def to_reader(self, use_threads: bool = True) -> lib.RecordBatchReader: ... + def to_table(self, use_threads: bool = True) -> lib.Table: ... + +class ExecNodeOptions(lib._Weakrefable): ... + +class TableSourceNodeOptions(ExecNodeOptions): + def __init__(self, table: lib.Table) -> None: ... + +class FilterNodeOptions(ExecNodeOptions): + def __init__(self, filter_expression: Expression) -> None: ... + +class ProjectNodeOptions(ExecNodeOptions): + def __init__(self, expressions: list[Expression], names: list[str] | None = None) -> None: ... + +class AggregateNodeOptions(ExecNodeOptions): + def __init__( + self, + aggregates: list[tuple[list[str], str, FunctionOptions, str]], + keys: list[_StrOrExpr] | None = None, + ) -> None: ... + +class OrderByNodeOptions(ExecNodeOptions): + def __init__( + self, + sort_keys: tuple[tuple[str, Literal["ascending", "descending"]], ...] = (), + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> None: ... + +class HashJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + join_type: Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", + ], + left_keys: _StrOrExpr | list[_StrOrExpr], + right_keys: _StrOrExpr | list[_StrOrExpr], + left_output: list[_StrOrExpr] | None = None, + right_output: list[_StrOrExpr] | None = None, + output_suffix_for_left: str = "", + output_suffix_for_right: str = "", + ) -> None: ... + +class AsofJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + left_on: _StrOrExpr, + left_by: _StrOrExpr | list[_StrOrExpr], + right_on: _StrOrExpr, + right_by: _StrOrExpr | list[_StrOrExpr], + tolerance: int, + ) -> None: ... diff --git a/python/pyarrow-stubs/builder.pyi b/python/pyarrow-stubs/builder.pyi new file mode 100644 index 00000000000..4a0e9ca4708 --- /dev/null +++ b/python/pyarrow-stubs/builder.pyi @@ -0,0 +1,89 @@ +from typing import Iterable + +from pyarrow.lib import MemoryPool, _Weakrefable + +from .array import StringArray, StringViewArray + +class StringBuilder(_Weakrefable): + """ + Builder class for UTF8 strings. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string'). + """ + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | None): + """ + Append a single value to the builder. + + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + def append_values(self, values: Iterable[str | bytes | None]): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + def finish(self) -> StringArray: + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + +class StringViewBuilder(_Weakrefable): + """ + Builder class for UTF8 string views. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string_view'). + """ + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | None): + """ + Append a single value to the builder. 
+ + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + def append_values(self, values: Iterable[str | bytes | None]): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + def finish(self) -> StringViewArray: + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + +__all__ = ["StringBuilder", "StringViewBuilder"] diff --git a/python/pyarrow-stubs/cffi.pyi b/python/pyarrow-stubs/cffi.pyi new file mode 100644 index 00000000000..2ae945c5974 --- /dev/null +++ b/python/pyarrow-stubs/cffi.pyi @@ -0,0 +1,4 @@ +import cffi + +c_source: str +ffi: cffi.FFI diff --git a/python/pyarrow-stubs/compat.pyi b/python/pyarrow-stubs/compat.pyi new file mode 100644 index 00000000000..2ea013555c0 --- /dev/null +++ b/python/pyarrow-stubs/compat.pyi @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +def encode_file_path(path: str | bytes) -> bytes: ... +def tobytes(o: str | bytes) -> bytes: ... +def frombytes(o: bytes, *, safe: bool = False): ... 
+ +__all__ = ["encode_file_path", "tobytes", "frombytes"] diff --git a/python/pyarrow-stubs/csv.pyi b/python/pyarrow-stubs/csv.pyi new file mode 100644 index 00000000000..510229d7e72 --- /dev/null +++ b/python/pyarrow-stubs/csv.pyi @@ -0,0 +1,27 @@ +from pyarrow._csv import ( + ISO8601, + ConvertOptions, + CSVStreamingReader, + CSVWriter, + InvalidRow, + ParseOptions, + ReadOptions, + WriteOptions, + open_csv, + read_csv, + write_csv, +) + +__all__ = [ + "ISO8601", + "ConvertOptions", + "CSVStreamingReader", + "CSVWriter", + "InvalidRow", + "ParseOptions", + "ReadOptions", + "WriteOptions", + "open_csv", + "read_csv", + "write_csv", +] diff --git a/python/pyarrow-stubs/cuda.pyi b/python/pyarrow-stubs/cuda.pyi new file mode 100644 index 00000000000..e11baf7d4e7 --- /dev/null +++ b/python/pyarrow-stubs/cuda.pyi @@ -0,0 +1,25 @@ +from pyarrow._cuda import ( + BufferReader, + BufferWriter, + Context, + CudaBuffer, + HostBuffer, + IpcMemHandle, + new_host_buffer, + read_message, + read_record_batch, + serialize_record_batch, +) + +__all__ = [ + "BufferReader", + "BufferWriter", + "Context", + "CudaBuffer", + "HostBuffer", + "IpcMemHandle", + "new_host_buffer", + "read_message", + "read_record_batch", + "serialize_record_batch", +] diff --git a/python/pyarrow-stubs/dataset.pyi b/python/pyarrow-stubs/dataset.pyi index a145437bb52..98f1a38aa85 100644 --- a/python/pyarrow-stubs/dataset.pyi +++ b/python/pyarrow-stubs/dataset.pyi @@ -1,229 +1,229 @@ -# from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload -# -# from _typeshed import StrPath -# from pyarrow._dataset import ( -# CsvFileFormat, -# CsvFragmentScanOptions, -# Dataset, -# DatasetFactory, -# DirectoryPartitioning, -# FeatherFileFormat, -# FileFormat, -# FileFragment, -# FilenamePartitioning, -# FileSystemDataset, -# FileSystemDatasetFactory, -# FileSystemFactoryOptions, -# FileWriteOptions, -# Fragment, -# FragmentScanOptions, -# HivePartitioning, -# InMemoryDataset, -# IpcFileFormat, -# IpcFileWriteOptions, -# JsonFileFormat, -# JsonFragmentScanOptions, -# Partitioning, -# PartitioningFactory, -# Scanner, -# TaggedRecordBatch, -# UnionDataset, -# UnionDatasetFactory, -# WrittenFile, -# get_partition_keys, -# ) -# from pyarrow._dataset_orc import OrcFileFormat -# from pyarrow._dataset_parquet import ( -# ParquetDatasetFactory, -# ParquetFactoryOptions, -# ParquetFileFormat, -# ParquetFileFragment, -# ParquetFileWriteOptions, -# ParquetFragmentScanOptions, -# ParquetReadOptions, -# RowGroupInfo, -# ) -# from pyarrow._dataset_parquet_encryption import ( -# ParquetDecryptionConfig, -# ParquetEncryptionConfig, -# ) -# from pyarrow.compute import Expression, field, scalar -# from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table -# -# from ._fs import SupportedFileSystem -# -# _orc_available: bool -# _parquet_available: bool -# -# __all__ = [ -# "CsvFileFormat", -# "CsvFragmentScanOptions", -# "Dataset", -# "DatasetFactory", -# "DirectoryPartitioning", -# "FeatherFileFormat", -# "FileFormat", -# "FileFragment", -# "FilenamePartitioning", -# "FileSystemDataset", -# "FileSystemDatasetFactory", -# "FileSystemFactoryOptions", -# "FileWriteOptions", -# "Fragment", -# "FragmentScanOptions", -# "HivePartitioning", -# "InMemoryDataset", -# "IpcFileFormat", -# "IpcFileWriteOptions", -# "JsonFileFormat", -# "JsonFragmentScanOptions", -# "Partitioning", -# "PartitioningFactory", -# "Scanner", -# "TaggedRecordBatch", -# "UnionDataset", -# "UnionDatasetFactory", -# "WrittenFile", -# "get_partition_keys", -# # Orc 
-# "OrcFileFormat", -# # Parquet -# "ParquetDatasetFactory", -# "ParquetFactoryOptions", -# "ParquetFileFormat", -# "ParquetFileFragment", -# "ParquetFileWriteOptions", -# "ParquetFragmentScanOptions", -# "ParquetReadOptions", -# "RowGroupInfo", -# # Parquet Encryption -# "ParquetDecryptionConfig", -# "ParquetEncryptionConfig", -# # Compute -# "Expression", -# "field", -# "scalar", -# # Dataset -# "partitioning", -# "parquet_dataset", -# "write_dataset", -# ] -# -# _DatasetFormat: TypeAlias = Literal["parquet", "ipc", "arrow", "feather", "csv"] -# -# @overload -# def partitioning( -# schema: Schema, -# ) -> Partitioning: ... -# @overload -# def partitioning( -# schema: Schema, -# *, -# flavor: Literal["filename"], -# dictionaries: dict[str, Array] | None = None, -# ) -> Partitioning: ... -# @overload -# def partitioning( -# schema: Schema, -# *, -# flavor: Literal["filename"], -# dictionaries: Literal["infer"], -# ) -> PartitioningFactory: ... -# @overload -# def partitioning( -# field_names: list[str], -# *, -# flavor: Literal["filename"], -# ) -> PartitioningFactory: ... -# @overload -# def partitioning( -# schema: Schema, -# *, -# flavor: Literal["hive"], -# dictionaries: Literal["infer"], -# ) -> PartitioningFactory: ... -# @overload -# def partitioning( -# *, -# flavor: Literal["hive"], -# ) -> PartitioningFactory: ... -# @overload -# def partitioning( -# schema: Schema, -# *, -# flavor: Literal["hive"], -# dictionaries: dict[str, Array] | None = None, -# ) -> Partitioning: ... -# def parquet_dataset( -# metadata_path: StrPath, -# schema: Schema | None = None, -# filesystem: SupportedFileSystem | None = None, -# format: ParquetFileFormat | None = None, -# partitioning: Partitioning | PartitioningFactory | None = None, -# partition_base_dir: str | None = None, -# ) -> FileSystemDataset: ... -# @overload -# def dataset( -# source: StrPath | Sequence[StrPath], -# schema: Schema | None = None, -# format: FileFormat | _DatasetFormat | None = None, -# filesystem: SupportedFileSystem | str | None = None, -# partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, -# partition_base_dir: str | None = None, -# exclude_invalid_files: bool | None = None, -# ignore_prefixes: list[str] | None = None, -# ) -> FileSystemDataset: ... -# @overload -# def dataset( -# source: list[Dataset], -# schema: Schema | None = None, -# format: FileFormat | _DatasetFormat | None = None, -# filesystem: SupportedFileSystem | str | None = None, -# partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, -# partition_base_dir: str | None = None, -# exclude_invalid_files: bool | None = None, -# ignore_prefixes: list[str] | None = None, -# ) -> UnionDataset: ... -# @overload -# def dataset( -# source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, -# schema: Schema | None = None, -# format: FileFormat | _DatasetFormat | None = None, -# filesystem: SupportedFileSystem | str | None = None, -# partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, -# partition_base_dir: str | None = None, -# exclude_invalid_files: bool | None = None, -# ignore_prefixes: list[str] | None = None, -# ) -> InMemoryDataset: ... 
-# @overload -# def dataset( -# source: RecordBatch | Table, -# schema: Schema | None = None, -# format: FileFormat | _DatasetFormat | None = None, -# filesystem: SupportedFileSystem | str | None = None, -# partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, -# partition_base_dir: str | None = None, -# exclude_invalid_files: bool | None = None, -# ignore_prefixes: list[str] | None = None, -# ) -> InMemoryDataset: ... -# def write_dataset( -# data: Dataset | Table | RecordBatch | RecordBatchReader | list[Table] | Iterable[RecordBatch], -# base_dir: StrPath, -# *, -# basename_template: str | None = None, -# format: FileFormat | _DatasetFormat | None = None, -# partitioning: Partitioning | list[str] | None = None, -# partitioning_flavor: str | None = None, -# schema: Schema | None = None, -# filesystem: SupportedFileSystem | None = None, -# file_options: FileWriteOptions | None = None, -# use_threads: bool = True, -# max_partitions: int = 1024, -# max_open_files: int = 1024, -# max_rows_per_file: int = 0, -# min_rows_per_group: int = 0, -# max_rows_per_group: int = 1024 * 1024, -# file_visitor: Callable[[str], None] | None = None, -# existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"] = "error", -# create_dir: bool = True, -# ): ... +from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload + +from _typeshed import StrPath +from pyarrow._dataset import ( + CsvFileFormat, + CsvFragmentScanOptions, + Dataset, + DatasetFactory, + DirectoryPartitioning, + FeatherFileFormat, + FileFormat, + FileFragment, + FilenamePartitioning, + FileSystemDataset, + FileSystemDatasetFactory, + FileSystemFactoryOptions, + FileWriteOptions, + Fragment, + FragmentScanOptions, + HivePartitioning, + InMemoryDataset, + IpcFileFormat, + IpcFileWriteOptions, + JsonFileFormat, + JsonFragmentScanOptions, + Partitioning, + PartitioningFactory, + Scanner, + TaggedRecordBatch, + UnionDataset, + UnionDatasetFactory, + WrittenFile, + get_partition_keys, +) +from pyarrow._dataset_orc import OrcFileFormat +from pyarrow._dataset_parquet import ( + ParquetDatasetFactory, + ParquetFactoryOptions, + ParquetFileFormat, + ParquetFileFragment, + ParquetFileWriteOptions, + ParquetFragmentScanOptions, + ParquetReadOptions, + RowGroupInfo, +) +from pyarrow._dataset_parquet_encryption import ( + ParquetDecryptionConfig, + ParquetEncryptionConfig, +) +from pyarrow.compute import Expression, field, scalar +from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table + +from ._fs import SupportedFileSystem + +_orc_available: bool +_parquet_available: bool + +__all__ = [ + "CsvFileFormat", + "CsvFragmentScanOptions", + "Dataset", + "DatasetFactory", + "DirectoryPartitioning", + "FeatherFileFormat", + "FileFormat", + "FileFragment", + "FilenamePartitioning", + "FileSystemDataset", + "FileSystemDatasetFactory", + "FileSystemFactoryOptions", + "FileWriteOptions", + "Fragment", + "FragmentScanOptions", + "HivePartitioning", + "InMemoryDataset", + "IpcFileFormat", + "IpcFileWriteOptions", + "JsonFileFormat", + "JsonFragmentScanOptions", + "Partitioning", + "PartitioningFactory", + "Scanner", + "TaggedRecordBatch", + "UnionDataset", + "UnionDatasetFactory", + "WrittenFile", + "get_partition_keys", + # Orc + "OrcFileFormat", + # Parquet + "ParquetDatasetFactory", + "ParquetFactoryOptions", + "ParquetFileFormat", + "ParquetFileFragment", + "ParquetFileWriteOptions", + "ParquetFragmentScanOptions", + "ParquetReadOptions", + "RowGroupInfo", + # Parquet 
Encryption + "ParquetDecryptionConfig", + "ParquetEncryptionConfig", + # Compute + "Expression", + "field", + "scalar", + # Dataset + "partitioning", + "parquet_dataset", + "write_dataset", +] + +_DatasetFormat: TypeAlias = Literal["parquet", "ipc", "arrow", "feather", "csv"] + +@overload +def partitioning( + schema: Schema, +) -> Partitioning: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["filename"], + dictionaries: dict[str, Array] | None = None, +) -> Partitioning: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["filename"], + dictionaries: Literal["infer"], +) -> PartitioningFactory: ... +@overload +def partitioning( + field_names: list[str], + *, + flavor: Literal["filename"], +) -> PartitioningFactory: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["hive"], + dictionaries: Literal["infer"], +) -> PartitioningFactory: ... +@overload +def partitioning( + *, + flavor: Literal["hive"], +) -> PartitioningFactory: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["hive"], + dictionaries: dict[str, Array] | None = None, +) -> Partitioning: ... +def parquet_dataset( + metadata_path: StrPath, + schema: Schema | None = None, + filesystem: SupportedFileSystem | None = None, + format: ParquetFileFormat | None = None, + partitioning: Partitioning | PartitioningFactory | None = None, + partition_base_dir: str | None = None, +) -> FileSystemDataset: ... +@overload +def dataset( + source: StrPath | Sequence[StrPath], + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> FileSystemDataset: ... +@overload +def dataset( + source: list[Dataset], + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> UnionDataset: ... +@overload +def dataset( + source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> InMemoryDataset: ... +@overload +def dataset( + source: RecordBatch | Table, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> InMemoryDataset: ... 
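The overloads above mirror the common entry points; a minimal sketch of the first, path-based form follows (directory path and column name are placeholders):

    import pyarrow.dataset as ds

    # FileSystemDataset discovered from a directory of Parquet files.
    dataset = ds.dataset("data/", format="parquet", partitioning="hive")

    # Projection and filtering go through pyarrow.compute expressions (ds.field).
    table = dataset.to_table(columns=["n_legs"], filter=ds.field("n_legs") > 2)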
+def write_dataset( + data: Dataset | Table | RecordBatch | RecordBatchReader | list[Table] | Iterable[RecordBatch], + base_dir: StrPath, + *, + basename_template: str | None = None, + format: FileFormat | _DatasetFormat | None = None, + partitioning: Partitioning | list[str] | None = None, + partitioning_flavor: str | None = None, + schema: Schema | None = None, + filesystem: SupportedFileSystem | None = None, + file_options: FileWriteOptions | None = None, + use_threads: bool = True, + max_partitions: int = 1024, + max_open_files: int = 1024, + max_rows_per_file: int = 0, + min_rows_per_group: int = 0, + max_rows_per_group: int = 1024 * 1024, + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"] = "error", + create_dir: bool = True, +): ... diff --git a/python/pyarrow-stubs/feather.pyi b/python/pyarrow-stubs/feather.pyi new file mode 100644 index 00000000000..9451ee15763 --- /dev/null +++ b/python/pyarrow-stubs/feather.pyi @@ -0,0 +1,50 @@ +from typing import IO, Literal + +import pandas as pd + +from _typeshed import StrPath +from pyarrow._feather import FeatherError +from pyarrow.lib import Table + +__all__ = [ + "FeatherError", + "FeatherDataset", + "check_chunked_overflow", + "write_feather", + "read_feather", + "read_table", +] + +class FeatherDataset: + path_or_paths: str | list[str] + validate_schema: bool + + def __init__(self, path_or_paths: str | list[str], validate_schema: bool = True) -> None: ... + def read_table(self, columns: list[str] | None = None) -> Table: ... + def validate_schemas(self, piece, table: Table) -> None: ... + def read_pandas( + self, columns: list[str] | None = None, use_threads: bool = True + ) -> pd.DataFrame: ... + +def check_chunked_overflow(name: str, col) -> None: ... +def write_feather( + df: pd.DataFrame | Table, + dest: StrPath | IO, + compression: Literal["zstd", "lz4", "uncompressed"] | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: Literal[1, 2] = 2, +) -> None: ... +def read_feather( + source: StrPath | IO, + columns: list[str] | None = None, + use_threads: bool = True, + memory_map: bool = False, + **kwargs, +) -> pd.DataFrame: ... +def read_table( + source: StrPath | IO, + columns: list[str] | None = None, + memory_map: bool = False, + use_threads: bool = True, +) -> Table: ... 
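A short round-trip sketch of the feather API typed above (the file path is a placeholder):

    import pyarrow as pa
    import pyarrow.feather as feather

    table = pa.table({"x": [1, 2, 3]})
    # write_feather accepts a pyarrow.Table or a pandas DataFrame.
    feather.write_feather(table, "data.feather", compression="zstd")

    # read_table returns a pyarrow.Table; read_feather would return a pandas DataFrame.
    restored = feather.read_table("data.feather", columns=["x"])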
diff --git a/python/pyarrow-stubs/flight.pyi b/python/pyarrow-stubs/flight.pyi new file mode 100644 index 00000000000..9b806ccf305 --- /dev/null +++ b/python/pyarrow-stubs/flight.pyi @@ -0,0 +1,95 @@ +from pyarrow._flight import ( + Action, + ActionType, + BasicAuth, + CallInfo, + CertKeyPair, + ClientAuthHandler, + ClientMiddleware, + ClientMiddlewareFactory, + DescriptorType, + FlightCallOptions, + FlightCancelledError, + FlightClient, + FlightDataStream, + FlightDescriptor, + FlightEndpoint, + FlightError, + FlightInfo, + FlightInternalError, + FlightMetadataReader, + FlightMetadataWriter, + FlightMethod, + FlightServerBase, + FlightServerError, + FlightStreamChunk, + FlightStreamReader, + FlightStreamWriter, + FlightTimedOutError, + FlightUnauthenticatedError, + FlightUnauthorizedError, + FlightUnavailableError, + FlightWriteSizeExceededError, + GeneratorStream, + Location, + MetadataRecordBatchReader, + MetadataRecordBatchWriter, + RecordBatchStream, + Result, + SchemaResult, + ServerAuthHandler, + ServerCallContext, + ServerMiddleware, + ServerMiddlewareFactory, + Ticket, + TracingServerMiddlewareFactory, + connect, +) + +__all__ = [ + "Action", + "ActionType", + "BasicAuth", + "CallInfo", + "CertKeyPair", + "ClientAuthHandler", + "ClientMiddleware", + "ClientMiddlewareFactory", + "DescriptorType", + "FlightCallOptions", + "FlightCancelledError", + "FlightClient", + "FlightDataStream", + "FlightDescriptor", + "FlightEndpoint", + "FlightError", + "FlightInfo", + "FlightInternalError", + "FlightMetadataReader", + "FlightMetadataWriter", + "FlightMethod", + "FlightServerBase", + "FlightServerError", + "FlightStreamChunk", + "FlightStreamReader", + "FlightStreamWriter", + "FlightTimedOutError", + "FlightUnauthenticatedError", + "FlightUnauthorizedError", + "FlightUnavailableError", + "FlightWriteSizeExceededError", + "GeneratorStream", + "Location", + "MetadataRecordBatchReader", + "MetadataRecordBatchWriter", + "RecordBatchStream", + "Result", + "SchemaResult", + "ServerAuthHandler", + "ServerCallContext", + "ServerMiddleware", + "ServerMiddlewareFactory", + "Ticket", + "TracingServerMiddlewareFactory", + "connect", +] diff --git a/python/pyarrow-stubs/fs.pyi b/python/pyarrow-stubs/fs.pyi new file mode 100644 index 00000000000..6bf75616c13 --- /dev/null +++ b/python/pyarrow-stubs/fs.pyi @@ -0,0 +1,77 @@ +from pyarrow._fs import ( # noqa + FileSelector, + FileType, + FileInfo, + FileSystem, + LocalFileSystem, + SubTreeFileSystem, + _MockFileSystem, + FileSystemHandler, + PyFileSystem, + SupportedFileSystem, +) +from pyarrow._azurefs import AzureFileSystem +from pyarrow._hdfs import HadoopFileSystem +from pyarrow._gcsfs import GcsFileSystem +from pyarrow._s3fs import ( # noqa + AwsDefaultS3RetryStrategy, + AwsStandardS3RetryStrategy, + S3FileSystem, + S3LogLevel, + S3RetryStrategy, + ensure_s3_initialized, + finalize_s3, + ensure_s3_finalized, + initialize_s3, + resolve_s3_region, +) + +FileStats = FileInfo + +def copy_files( + source: str, + destination: str, + source_filesystem: SupportedFileSystem | None = None, + destination_filesystem: SupportedFileSystem | None = None, + *, + chunk_size: int = 1024 * 1024, + use_threads: bool = True, +) -> None: ... + +class FSSpecHandler(FileSystemHandler): # type: ignore[misc] + fs: SupportedFileSystem + def __init__(self, fs: SupportedFileSystem) -> None: ... 
+ +__all__ = [ + # _fs + "FileSelector", + "FileType", + "FileInfo", + "FileSystem", + "LocalFileSystem", + "SubTreeFileSystem", + "_MockFileSystem", + "FileSystemHandler", + "PyFileSystem", + # _azurefs + "AzureFileSystem", + # _hdfs + "HadoopFileSystem", + # _gcsfs + "GcsFileSystem", + # _s3fs + "AwsDefaultS3RetryStrategy", + "AwsStandardS3RetryStrategy", + "S3FileSystem", + "S3LogLevel", + "S3RetryStrategy", + "ensure_s3_initialized", + "finalize_s3", + "ensure_s3_finalized", + "initialize_s3", + "resolve_s3_region", + # fs + "FileStats", + "copy_files", + "FSSpecHandler", +] diff --git a/python/pyarrow-stubs/gandiva.pyi b/python/pyarrow-stubs/gandiva.pyi new file mode 100644 index 00000000000..a344f885b29 --- /dev/null +++ b/python/pyarrow-stubs/gandiva.pyi @@ -0,0 +1,65 @@ +from typing import Iterable, Literal + +from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable + +class Node(_Weakrefable): + def return_type(self) -> DataType: ... + +class Expression(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + +class Condition(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + +class SelectionVector(_Weakrefable): + def to_array(self) -> Array: ... + +class Projector(_Weakrefable): + @property + def llvm_ir(self): ... + def evaluate( + self, batch: RecordBatch, selection: SelectionVector | None = None + ) -> list[Array]: ... + +class Filter(_Weakrefable): + @property + def llvm_ir(self): ... + def evaluate( + self, batch: RecordBatch, pool: MemoryPool, dtype: DataType | str = "int32" + ) -> SelectionVector: ... + +class TreeExprBuilder(_Weakrefable): + def make_literal(self, value: float | str | bytes | bool, dtype: DataType) -> Node: ... + def make_expression(self, root_node: Node, return_field: Field) -> Expression: ... + def make_function(self, name: str, children: list[Node], return_type: DataType) -> Node: ... + def make_field(self, field: Field) -> Node: ... + def make_if( + self, condition: Node, this_node: Node, else_node: Node, return_type: DataType + ) -> Node: ... + def make_and(self, children: list[Node]) -> Node: ... + def make_or(self, children: list[Node]) -> Node: ... + def make_in_expression(self, node: Node, values: Iterable, dtype: DataType) -> Node: ... + def make_condition(self, condition: Node) -> Condition: ... + +class Configuration(_Weakrefable): + def __init__(self, optimize: bool = True, dump_ir: bool = False) -> None: ... + +def make_projector( + schema: Schema, + children: list[Expression], + pool: MemoryPool, + selection_mode: Literal["NONE", "UINT16", "UINT32", "UINT64"] = "NONE", + configuration: Configuration | None = None, +) -> Projector: ... +def make_filter( + schema: Schema, condition: Condition, configuration: Configuration | None = None +) -> Filter: ... + +class FunctionSignature(_Weakrefable): + def return_type(self) -> DataType: ... + def param_types(self) -> list[DataType]: ... + def name(self) -> str: ... + +def get_registered_function_signatures() -> list[FunctionSignature]: ... 
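To illustrate the filesystem surface stubbed in fs.pyi above, a minimal sketch (paths are placeholders):

    from pyarrow import fs

    local = fs.LocalFileSystem()

    # get_file_info with a FileSelector returns a list of FileInfo objects.
    for info in local.get_file_info(fs.FileSelector("/tmp", recursive=False)):
        print(info.path, info.type, info.size)

    # copy_files works between any two supported filesystems.
    fs.copy_files("/tmp/src.txt", "/tmp/dst.txt",
                  source_filesystem=local, destination_filesystem=local)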
diff --git a/python/pyarrow-stubs/json.pyi b/python/pyarrow-stubs/json.pyi new file mode 100644 index 00000000000..db1d35e0b8b --- /dev/null +++ b/python/pyarrow-stubs/json.pyi @@ -0,0 +1,3 @@ +from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json + +__all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/pyarrow-stubs/lib.pyi b/python/pyarrow-stubs/lib.pyi index c0a3cd08386..57e23c3eaea 100644 --- a/python/pyarrow-stubs/lib.pyi +++ b/python/pyarrow-stubs/lib.pyi @@ -22,17 +22,16 @@ from typing import NamedTuple, Literal from typing_extensions import TypeVar from .array import * -# TODO -# from .benchmark import * -# from .builder import * -# from .compat import * +from ._benchmark import * +from .builder import * +from .compat import * from .config import * from .device import * from .error import * from .io import * from ._ipc import * from .memory import * -# from .pandas_shim import * +from .pandas_shim import * from .scalar import * from .table import * from .tensor import * diff --git a/python/pyarrow-stubs/orc.pyi b/python/pyarrow-stubs/orc.pyi new file mode 100644 index 00000000000..2eba8d40a11 --- /dev/null +++ b/python/pyarrow-stubs/orc.pyi @@ -0,0 +1,279 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Literal + +from _typeshed import StrPath + +from . import _orc +from ._fs import SupportedFileSystem +from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table + +class ORCFile: + """ + Reader interface for a single ORC file + + Parameters + ---------- + source : str or pyarrow.NativeFile + Readable source. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. + """ + + reader: _orc.ORCReader + def __init__(self, source: StrPath | NativeFile | IO) -> None: ... + @property + def metadata(self) -> KeyValueMetadata: + """The file metadata, as an arrow KeyValueMetadata""" + @property + def schema(self) -> Schema: + """The file schema, as an arrow schema""" + @property + def nrows(self) -> int: + """The number of rows in the file""" + @property + def nstripes(self) -> int: + """The number of stripes in the file""" + @property + def file_version(self) -> str: + """Format version of the ORC file, must be 0.11 or 0.12""" + @property + def software_version(self) -> str: + """Software instance and version that wrote this file""" + @property + def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: + """Compression codec of the file""" + @property + def compression_size(self) -> int: + """Number of bytes to buffer for the compression codec in the file""" + @property + def writer(self) -> str: + """Name of the writer that wrote this file. 
+ If the writer is unknown then its Writer ID + (a number) is returned""" + @property + def writer_version(self) -> str: + """Version of the writer""" + @property + def row_index_stride(self) -> int: + """Number of rows per an entry in the row index or 0 + if there is no row index""" + @property + def nstripe_statistics(self) -> int: + """Number of stripe statistics""" + @property + def content_length(self) -> int: + """Length of the data stripes in the file in bytes""" + @property + def stripe_statistics_length(self) -> int: + """The number of compressed bytes in the file stripe statistics""" + @property + def file_footer_length(self) -> int: + """The number of compressed bytes in the file footer""" + @property + def file_postscript_length(self) -> int: + """The number of bytes in the file postscript""" + @property + def file_length(self) -> int: + """The number of bytes in the file""" + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: + """Read a single stripe from the file. + + Parameters + ---------- + n : int + The stripe index + columns : list + If not None, only these columns will be read from the stripe. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e' + + Returns + ------- + pyarrow.RecordBatch + Content of the stripe as a RecordBatch. + """ + def read(self, columns: list[str] | None = None) -> Table: + """Read the whole file. + + Parameters + ---------- + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. Output always follows the + ordering of the file and not the `columns` list. + + Returns + ------- + pyarrow.Table + Content of the file as a Table. + """ + +class ORCWriter: + """ + Writer interface for a single ORC file + + Parameters + ---------- + where : str or pyarrow.io.NativeFile + Writable target. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream + or pyarrow.io.FixedSizeBufferWriter. + file_version : {"0.11", "0.12"}, default "0.12" + Determine which ORC file version to use. + `Hive 0.11 / ORC v0 `_ + is the older version + while `Hive 0.12 / ORC v1 `_ + is the newer one. + batch_size : int, default 1024 + Number of rows the ORC writer writes at a time. + stripe_size : int, default 64 * 1024 * 1024 + Size of each ORC stripe in bytes. + compression : string, default 'uncompressed' + The compression codec. + Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} + Note that LZ0 is currently not supported. + compression_block_size : int, default 64 * 1024 + Size of each compression block in bytes. + compression_strategy : string, default 'speed' + The compression strategy i.e. speed vs size reduction. + Valid values: {'SPEED', 'COMPRESSION'} + row_index_stride : int, default 10000 + The row index stride i.e. the number of rows per + an entry in the row index. + padding_tolerance : double, default 0.0 + The padding tolerance. + dictionary_key_size_threshold : double, default 0.0 + The dictionary key size threshold. 0 to disable dictionary encoding. + 1 to always enable dictionary encoding. + bloom_filter_columns : None, set-like or list-like, default None + Columns that use the bloom filter. + bloom_filter_fpp : double, default 0.05 + Upper limit of the false-positive rate of the bloom filter. 
+ """ + + writer: _orc.ORCWriter + is_open: bool + def __init__( + self, + where: StrPath | NativeFile | IO, + *, + file_version: str = "0.12", + batch_size: int = 1024, + stripe_size: int = 64 * 1024 * 1024, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression_block_size: int = 65536, + compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", + row_index_stride: int = 10000, + padding_tolerance: float = 0.0, + dictionary_key_size_threshold: float = 0.0, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float = 0.05, + ): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> None: ... + def write(self, table: Table) -> None: + """ + Write the table into an ORC file. The schema of the table must + be equal to the schema used when opening the ORC file. + + Parameters + ---------- + table : pyarrow.Table + The table to be written into the ORC file + """ + def close(self) -> None: + """ + Close the ORC file + """ + +def read_table( + source: StrPath | NativeFile | IO, + columns: list[str] | None = None, + filesystem: SupportedFileSystem | None = None, +) -> Table: + """ + Read a Table from an ORC file. + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name. For file-like objects, + only read a single file. Use pyarrow.BufferReader to read a file + contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. Output always follows the ordering of the file and + not the `columns` list. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + """ + +def write_table( + table: Table, + where: StrPath | NativeFile | IO, + *, + file_version: str = "0.12", + batch_size: int = 1024, + stripe_size: int = 64 * 1024 * 1024, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression_block_size: int = 65536, + compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", + row_index_stride: int = 10000, + padding_tolerance: float = 0.0, + dictionary_key_size_threshold: float = 0.0, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float = 0.05, +) -> None: + """ + Write a table into an ORC file. + + Parameters + ---------- + table : pyarrow.lib.Table + The table to be written into the ORC file + where : str or pyarrow.io.NativeFile + Writable target. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream + or pyarrow.io.FixedSizeBufferWriter. + file_version : {"0.11", "0.12"}, default "0.12" + Determine which ORC file version to use. + `Hive 0.11 / ORC v0 `_ + is the older version + while `Hive 0.12 / ORC v1 `_ + is the newer one. + batch_size : int, default 1024 + Number of rows the ORC writer writes at a time. + stripe_size : int, default 64 * 1024 * 1024 + Size of each ORC stripe in bytes. + compression : string, default 'uncompressed' + The compression codec. 
+ Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} + Note that LZ0 is currently not supported. + compression_block_size : int, default 64 * 1024 + Size of each compression block in bytes. + compression_strategy : string, default 'speed' + The compression strategy i.e. speed vs size reduction. + Valid values: {'SPEED', 'COMPRESSION'} + row_index_stride : int, default 10000 + The row index stride i.e. the number of rows per + an entry in the row index. + padding_tolerance : double, default 0.0 + The padding tolerance. + dictionary_key_size_threshold : double, default 0.0 + The dictionary key size threshold. 0 to disable dictionary encoding. + 1 to always enable dictionary encoding. + bloom_filter_columns : None, set-like or list-like, default None + Columns that use the bloom filter. + bloom_filter_fpp : double, default 0.05 + Upper limit of the false-positive rate of the bloom filter. + """ diff --git a/python/pyarrow-stubs/pandas_compat.pyi b/python/pyarrow-stubs/pandas_compat.pyi new file mode 100644 index 00000000000..efbd05ac2fe --- /dev/null +++ b/python/pyarrow-stubs/pandas_compat.pyi @@ -0,0 +1,54 @@ +from typing import Any, TypedDict, TypeVar + +import numpy as np +import pandas as pd + +from pandas import DatetimeTZDtype + +from .lib import Array, DataType, Schema, Table + +_T = TypeVar("_T") + +def get_logical_type_map() -> dict[int, str]: ... +def get_logical_type(arrow_type: DataType) -> str: ... +def get_numpy_logical_type_map() -> dict[type[np.generic], str]: ... +def get_logical_type_from_numpy(pandas_collection) -> str: ... +def get_extension_dtype_info(column) -> tuple[str, dict[str, Any]]: ... + +class _ColumnMetadata(TypedDict): + name: str + field_name: str + pandas_type: int + numpy_type: str + metadata: dict | None + +def get_column_metadata( + column: pd.Series | pd.Index, name: str, arrow_type: DataType, field_name: str +) -> _ColumnMetadata: ... +def construct_metadata( + columns_to_convert: list[pd.Series], + df: pd.DataFrame, + column_names: list[str], + index_levels: list[pd.Index], + index_descriptors: list[dict], + preserve_index: bool, + types: list[DataType], + column_field_names: list[str] = ..., +) -> dict[bytes, bytes]: ... +def dataframe_to_types( + df: pd.DataFrame, preserve_index: bool | None, columns: list[str] | None = None +) -> tuple[list[str], list[DataType], dict[bytes, bytes]]: ... +def dataframe_to_arrays( + df: pd.DataFrame, + schema: Schema, + preserve_index: bool | None, + nthreads: int = 1, + columns: list[str] | None = None, + safe: bool = True, +) -> tuple[Array, Schema, int]: ... +def get_datetimetz_type(values: _T, dtype, type_) -> tuple[_T, DataType]: ... +def make_datetimetz(unit: str, tz: str) -> DatetimeTZDtype: ... +def table_to_dataframe( + options, table: Table, categories=None, ignore_metadata: bool = False, types_mapper=None +) -> pd.DataFrame: ... +def make_tz_aware(series: pd.Series, tz: str) -> pd.Series: ... diff --git a/python/pyarrow-stubs/pandas_shim.pyi b/python/pyarrow-stubs/pandas_shim.pyi new file mode 100644 index 00000000000..0e80fae4ebf --- /dev/null +++ b/python/pyarrow-stubs/pandas_shim.pyi @@ -0,0 +1,51 @@ +from types import ModuleType +from typing import Any, Iterable, TypeGuard + +import pandas as pd + +from numpy import dtype +from pandas.core.dtypes.base import ExtensionDtype + +class _PandasAPIShim: + has_sparse: bool + + def series(self, *args, **kwargs) -> pd.Series: ... + def data_frame(self, *args, **kwargs) -> pd.DataFrame: ... + @property + def have_pandas(self) -> bool: ... 
+ @property + def compat(self) -> ModuleType: ... + @property + def pd(self) -> ModuleType: ... + def infer_dtype(self, obj: Iterable) -> str: ... + def pandas_dtype(self, dtype: str) -> dtype: ... + @property + def loose_version(self) -> Any: ... + @property + def version(self) -> str: ... + def is_v1(self) -> bool: ... + def is_ge_v21(self) -> bool: ... + def is_ge_v23(self) -> bool: ... + def is_ge_v3(self) -> bool: ... + @property + def categorical_type(self) -> type[pd.Categorical]: ... + @property + def datetimetz_type(self) -> type[pd.DatetimeTZDtype]: ... + @property + def extension_dtype(self) -> type[ExtensionDtype]: ... + def is_array_like( + self, obj: Any + ) -> TypeGuard[pd.Series | pd.Index | pd.Categorical | ExtensionDtype]: ... + def is_categorical(self, obj: Any) -> TypeGuard[pd.Categorical]: ... + def is_datetimetz(self, obj: Any) -> TypeGuard[pd.DatetimeTZDtype]: ... + def is_extension_array_dtype(self, obj: Any) -> TypeGuard[ExtensionDtype]: ... + def is_sparse(self, obj: Any) -> bool: ... + def is_data_frame(self, obj: Any) -> TypeGuard[pd.DataFrame]: ... + def is_series(self, obj: Any) -> TypeGuard[pd.Series]: ... + def is_index(self, obj: Any) -> TypeGuard[pd.Index]: ... + def get_values(self, obj: Any) -> bool: ... + def get_rangeindex_attribute(self, level, name): ... + +_pandas_api: _PandasAPIShim + +__all__ = ["_PandasAPIShim", "_pandas_api"] diff --git a/python/pyarrow-stubs/parquet/__init__.pyi b/python/pyarrow-stubs/parquet/__init__.pyi new file mode 100644 index 00000000000..151ee188f84 --- /dev/null +++ b/python/pyarrow-stubs/parquet/__init__.pyi @@ -0,0 +1 @@ +from .core import * # noqa diff --git a/python/pyarrow-stubs/parquet/core.pyi b/python/pyarrow-stubs/parquet/core.pyi new file mode 100644 index 00000000000..56b2c8447d9 --- /dev/null +++ b/python/pyarrow-stubs/parquet/core.pyi @@ -0,0 +1,2061 @@ +import sys + +from pathlib import Path + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Callable, Iterator, Literal, Sequence + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from pyarrow import _parquet +from pyarrow._compute import Expression +from pyarrow._fs import FileSystem, SupportedFileSystem +from pyarrow._parquet import ( + ColumnChunkMetaData, + ColumnSchema, + FileDecryptionProperties, + FileEncryptionProperties, + FileMetaData, + ParquetLogicalType, + ParquetReader, + ParquetSchema, + RowGroupMetaData, + SortingColumn, + Statistics, +) +from pyarrow._stubs_typing import FilterTuple, SingleOrList +from pyarrow.dataset import ParquetFileFragment, Partitioning +from pyarrow.lib import NativeFile, RecordBatch, Schema, Table +from typing_extensions import deprecated + +__all__ = ( + "ColumnChunkMetaData", + "ColumnSchema", + "FileDecryptionProperties", + "FileEncryptionProperties", + "FileMetaData", + "ParquetDataset", + "ParquetFile", + "ParquetLogicalType", + "ParquetReader", + "ParquetSchema", + "ParquetWriter", + "RowGroupMetaData", + "SortingColumn", + "Statistics", + "read_metadata", + "read_pandas", + "read_schema", + "read_table", + "write_metadata", + "write_table", + "write_to_dataset", + "_filters_to_expression", + "filters_to_expression", +) + +def filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: + """ + Check if filters are well-formed and convert to an ``Expression``. 
+ + Parameters + ---------- + filters : List[Tuple] or List[List[Tuple]] + + Notes + ----- + See internal ``pyarrow._DNF_filter_doc`` attribute for more details. + + Examples + -------- + + >>> filters_to_expression([("foo", "==", "bar")]) + + + Returns + ------- + pyarrow.compute.Expression + An Expression representing the filters + """ + +@deprecated("use filters_to_expression") +def _filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... + +_Compression: TypeAlias = Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"] + +class ParquetFile: + """ + Reader interface for a single Parquet file. + + Parameters + ---------- + source : str, pathlib.Path, pyarrow.NativeFile, or file-like object + Readable source. For passing bytes or buffer-like file containing a + Parquet file, use pyarrow.BufferReader. + metadata : FileMetaData, default None + Use existing metadata object, rather than reading from file. + common_metadata : FileMetaData, default None + Will be used in reads for pandas schema metadata if not found in the + main file's metadata, no other uses at the moment. + read_dictionary : list + List of column names to read directly as DictionaryArray. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + pre_buffer : bool, default False + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3). If True, Arrow will use a + background I/O thread pool. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds. + decryption_properties : FileDecryptionProperties, default None + File decryption properties for Parquet Modular Encryption. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Examples + -------- + + Generate an example PyArrow Table and write it to Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + + Create a ``ParquetFile`` object from the Parquet file: + + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read the data: + + >>> parquet_file.read() + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] + + Create a ParquetFile object with "animal" column as DictionaryArray: + + >>> parquet_file = pq.ParquetFile("example.parquet", read_dictionary=["animal"]) + >>> parquet_file.read() + pyarrow.Table + n_legs: int64 + animal: dictionary + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [ -- dictionary: + ["Flamingo","Parrot",...,"Brittle stars","Centipede"] -- indices: + [0,1,2,3,4,5]] + """ + + reader: ParquetReader + common_metadata: FileMetaData + + def __init__( + self, + source: str | Path | NativeFile | IO, + *, + metadata: FileMetaData | None = None, + common_metadata: FileMetaData | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + filesystem: SupportedFileSystem | None = None, + page_checksum_verification: bool = False, + ): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> None: ... + @property + def metadata(self) -> FileMetaData: + """ + Return the Parquet metadata. + """ + @property + def schema(self) -> ParquetSchema: + """ + Return the Parquet schema, unconverted to Arrow types + """ + @property + def schema_arrow(self) -> Schema: + """ + Return the inferred Arrow schema, converted from the whole Parquet + file's schema + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read the Arrow schema: + + >>> parquet_file.schema_arrow + n_legs: int64 + animal: string + """ + @property + def num_row_groups(self) -> int: + """ + Return the number of row groups of the Parquet file. + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.num_row_groups + 1 + """ + def close(self, force: bool = False) -> None: ... + @property + def closed(self) -> bool: ... + def read_row_group( + self, + i: int, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read a single row group from a Parquet file. + + Parameters + ---------- + i : int + Index of the individual row group that we want to read. + columns : list + If not None, only these columns will be read from the row group. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. 
+ use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the row group as a table (of columns) + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.read_row_group(0) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] + """ + def read_row_groups( + self, + row_groups: list, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read multiple row groups from a Parquet file. + + Parameters + ---------- + row_groups : list + Only these row groups will be read from the file. + columns : list + If not None, only these columns will be read from the row group. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the row groups as a table (of columns). + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.read_row_groups([0, 0]) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,...,2,4,4,5,100]] + animal: [["Flamingo","Parrot","Dog",...,"Brittle stars","Centipede"]] + """ + def iter_batches( + self, + batch_size: int = 65536, + row_groups: list | None = None, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Iterator[RecordBatch]: + """ + Read streaming batches from a Parquet file. + + Parameters + ---------- + batch_size : int, default 64K + Maximum number of records to yield per batch. Batches may be + smaller if there aren't enough rows in the file. + row_groups : list + Only these row groups will be read from the file. + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : boolean, default True + Perform multi-threaded column reads. + use_pandas_metadata : boolean, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Yields + ------ + pyarrow.RecordBatch + Contents of each batch as a record batch + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ...
) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + >>> for i in parquet_file.iter_batches(): + ... print("RecordBatch") + ... print(i.to_pandas()) + RecordBatch + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + """ + def read( + self, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read a Table from Parquet format. + + Parameters + ---------- + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the file as a table (of columns). + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read a Table: + + >>> parquet_file.read(columns=["animal"]) + pyarrow.Table + animal: string + ---- + animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] + """ + def scan_contents(self, columns: list | None = None, batch_size: int = 65536) -> int: + """ + Read contents of file for the given columns and batch size. + + Notes + ----- + This function's primary purpose is benchmarking. + The scan is executed on a single thread. + + Parameters + ---------- + columns : list of integers, default None + Select columns to read, if None scan all columns. + batch_size : int, default 64K + Number of rows to read at a time internally. + + Returns + ------- + num_rows : int + Number of rows in file + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.scan_contents() + 6 + """ + +class ParquetWriter: + """ + Class for incrementally building a Parquet file for Arrow tables. + + Parameters + ---------- + where : path or file-like object + schema : pyarrow.Schema + version : {"1.0", "2.4", "2.6"}, default "2.6" + Determine which Parquet logical types are available for use, whether the + reduced set from the Parquet 1.x.x format or the expanded logical types + added in later format versions. + Files written with version='2.4' or '2.6' may not be readable in all + Parquet implementations, so version='1.0' is likely the choice that + maximizes file compatibility. + UINT32 and some logical types are only available with version '2.4'. + Nanosecond timestamps are only available with version '2.6'. + Other features such as compression algorithms or the new serialized + data page format must be enabled separately (see 'compression' and + 'data_page_version'). 
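As a hedged illustration of how ``ParquetFile.iter_batches`` and the ``ParquetWriter`` class documented here fit together (assuming the runtime API matches these stubs), a file can be re-chunked without materializing the whole table:

    import pyarrow.parquet as pq

    source = pq.ParquetFile("example.parquet")
    # Stream record batches out of one file and append them to another;
    # memory stays bounded by roughly batch_size rows at a time.
    with pq.ParquetWriter("copy.parquet", source.schema_arrow) as writer:
        for batch in source.iter_batches(batch_size=1024):
            writer.write_batch(batch)
    source.close()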
+ use_dictionary : bool or list, default True + Specify if we should use dictionary encoding in general or only for + some columns. + When encoding the column, if the dictionary size is too large, the + column will fall back to ``PLAIN`` encoding. In particular, ``BOOLEAN`` type + doesn't support dictionary encoding. + compression : str or dict, default 'snappy' + Specify the compression codec, either on a general basis or per-column. + Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. + write_statistics : bool or list, default True + Specify if we should write statistics in general (default is True) or only + for some columns. + use_deprecated_int96_timestamps : bool, default None + Write timestamps to INT96 Parquet format. Defaults to False unless enabled + by flavor argument. This takes priority over the coerce_timestamps option. + coerce_timestamps : str, default None + Cast timestamps to a particular resolution. If omitted, defaults are chosen + depending on `version`. For ``version='1.0'`` and ``version='2.4'``, + nanoseconds are cast to microseconds ('us'), while for + ``version='2.6'`` (the default), they are written natively without loss + of resolution. Seconds are always cast to milliseconds ('ms') by default, + as Parquet does not have any temporal type with seconds resolution. + If the casting results in loss of data, it will raise an exception + unless ``allow_truncated_timestamps=True`` is given. + Valid values: {None, 'ms', 'us'} + allow_truncated_timestamps : bool, default False + Allow loss of data when coercing timestamps to a particular + resolution. E.g. if microsecond or nanosecond data is lost when coercing to + 'ms', do not raise an exception. Passing ``allow_truncated_timestamps=True`` + will NOT result in the truncation exception being ignored unless + ``coerce_timestamps`` is not None. + data_page_size : int, default None + Set a target threshold for the approximate encoded size of data + pages within a column chunk (in bytes). If None, use the default data page + size of 1MByte. + flavor : {'spark'}, default None + Sanitize schema or set other compatibility options to work with + various target systems. + filesystem : FileSystem, default None + If nothing passed, will be inferred from `where` if path-like, else + `where` is already a file-like object so no filesystem is needed. + compression_level : int or dict, default None + Specify the compression level for a codec, either on a general basis or + per-column. If None is passed, arrow selects the compression level for + the compression codec in use. The compression level has a different + meaning for each codec, so you have to read the documentation of the + codec you are using. + An exception is thrown if the compression codec does not allow specifying + a compression level. + use_byte_stream_split : bool or list, default False + Specify if the byte_stream_split encoding should be used in general or + only for some columns. If both dictionary and byte_stream_split are + enabled, then dictionary is preferred. + The byte_stream_split encoding is valid for integer, floating-point + and fixed-size binary data types (including decimals); it should be + combined with a compression codec so as to achieve size reduction. + column_encoding : string or dict, default None + Specify the encoding scheme on a per column basis. + Can only be used when ``use_dictionary`` is set to False, and + cannot be used in combination with ``use_byte_stream_split``.
+ Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', + 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. + Certain encodings are only compatible with certain data types. + Please refer to the encodings section of `Reading and writing Parquet + files <https://arrow.apache.org/docs/cpp/parquet.html#encodings>`_. + data_page_version : {"1.0", "2.0"}, default "1.0" + The serialized Parquet data page format version to write, defaults to + 1.0. This does not impact the file schema logical types and Arrow to + Parquet type casting behavior; for that use the "version" option. + use_compliant_nested_type : bool, default True + Whether to write compliant Parquet nested type (lists) as defined + `here <https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types>`_, defaults to ``True``. + For ``use_compliant_nested_type=True``, this will write into a list + with 3-level structure where the middle level, named ``list``, + is a repeated group with a single field named ``element``:: + + <list-repetition> group <name> (LIST) { + repeated group list { + <element-repetition> <element-type> element; + } + } + + For ``use_compliant_nested_type=False``, this will also write into a list + with 3-level structure, where the name of the single field of the middle + level ``list`` is taken from the element name for nested columns in Arrow, + which defaults to ``item``:: + + <list-repetition> group <name> (LIST) { + repeated group list { + <element-repetition> <element-type> item; + } + } + encryption_properties : FileEncryptionProperties, default None + File encryption properties for Parquet Modular Encryption. + If None, no encryption will be done. + The encryption properties can be created using: + ``CryptoFactory.file_encryption_properties()``. + write_batch_size : int, default None + Number of values to write to a page at a time. If None, use the default of + 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages + are exceeding the ``data_page_size`` due to large column values, lowering + the batch size can help keep page sizes closer to the intended size. + dictionary_pagesize_limit : int, default None + Specify the dictionary page size limit per row group. If None, use the + default 1MB. + store_schema : bool, default True + By default, the Arrow schema is serialized and stored in the Parquet + file metadata (in the "ARROW:schema" key). When reading the file, + if this key is available, it will be used to more faithfully recreate + the original Arrow data. For example, for tz-aware timestamp columns + it will restore the timezone (Parquet only stores the UTC values without + timezone), or columns with duration type will be restored from the int64 + Parquet column. + write_page_index : bool, default False + Whether to write a page index in general for all columns. + Writing statistics to the page index disables the old method of writing + statistics to each data page header. The page index makes statistics-based + filtering more efficient than the page header, as it gathers all the + statistics for a Parquet file in a single place, avoiding scattered I/O. + Note that the page index is not yet used on the read side by PyArrow. + write_page_checksum : bool, default False + Whether to write page checksums in general for all columns. + Page checksums enable detection of data corruption, which might occur during + transmission or in the storage. + sorting_columns : Sequence of SortingColumn, default None + Specify the sort order of the data being written. The writer does not sort + the data nor does it verify that the data is sorted. The sort order is + written to the row group metadata, which can then be used by readers.
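A small sketch of the ``sorting_columns`` option described above (illustrative only: the writer merely records the order in the row-group metadata, so the table is assumed to be pre-sorted):

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({"n_legs": [2, 2, 4, 100], "animal": ["Flamingo", "Parrot", "Dog", "Centipede"]})
    # Declare that rows are sorted by the first leaf column ("n_legs").
    sorting = [pq.SortingColumn(column_index=0)]
    with pq.ParquetWriter("sorted.parquet", table.schema, sorting_columns=sorting) as writer:
        writer.write_table(table)
    # Readers can recover the declared order from the row-group metadata.
    print(pq.ParquetFile("sorted.parquet").metadata.row_group(0).sorting_columns)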
+ store_decimal_as_integer : bool, default False + Allow decimals with 1 <= precision <= 18 to be stored as integers. + In Parquet, DECIMAL can be stored in any of the following physical types: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: precision is limited by the array size. + Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. + - binary: precision is unlimited. The minimum number of bytes to store the + unscaled value is used. + + By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. + When enabled, the writer will use the following physical types to store decimals: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: for precision > 18. + + As a consequence, decimal columns stored in integer types are more compact. + writer_engine_version : unused + **options : dict + If options contains a key `metadata_collector` then the + corresponding value is assumed to be a list (or any object with + `.append` method) that will be filled with the file metadata instance + of the written file. + + Examples + -------- + Generate an example PyArrow Table and RecordBatch: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.record_batch( + ... [ + ... [2, 2, 4, 4, 5, 100], + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... ], + ... names=["n_legs", "animal"], + ... ) + + create a ParquetWriter object: + + >>> import pyarrow.parquet as pq + >>> writer = pq.ParquetWriter("example.parquet", table.schema) + + and write the Table into the Parquet file: + + >>> writer.write_table(table) + >>> writer.close() + + >>> pq.read_table("example.parquet").to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + create a ParquetWriter object for the RecordBatch: + + >>> writer2 = pq.ParquetWriter("example2.parquet", batch.schema) + + and write the RecordBatch into the Parquet file: + + >>> writer2.write_batch(batch) + >>> writer2.close() + + >>> pq.read_table("example2.parquet").to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + """ + + flavor: str + schema_changed: bool + schema: ParquetSchema + where: str | Path | IO + file_handler: NativeFile | None + writer: _parquet.ParquetWriter + is_open: bool + + def __init__( + self, + where: str | Path | IO | NativeFile, + schema: Schema, + filesystem: SupportedFileSystem | None = None, + flavor: str | None = None, + version: Literal["1.0", "2.4", "2.6"] = ..., + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool | list = False, + column_encoding: str | dict | None = None, + writer_engine_version=None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = 
None, + store_decimal_as_integer: bool = False, + **options, + ) -> None: ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> Literal[False]: ... + def write( + self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None + ) -> None: + """ + Write RecordBatch or Table to the Parquet file. + + Parameters + ---------- + table_or_batch : {RecordBatch, Table} + row_group_size : int, default None + Maximum number of rows in each written row group. If None, + the row group size will be the minimum of the input + table or batch length and 1024 * 1024. + """ + def write_batch(self, batch: RecordBatch, row_group_size: int | None = None) -> None: + """ + Write RecordBatch to the Parquet file. + + Parameters + ---------- + batch : RecordBatch + row_group_size : int, default None + Maximum number of rows in written row group. If None, the + row group size will be the minimum of the RecordBatch + size and 1024 * 1024. If set larger than 64Mi then 64Mi + will be used instead. + """ + def write_table(self, table: Table, row_group_size: int | None = None) -> None: + """ + Write Table to the Parquet file. + + Parameters + ---------- + table : Table + row_group_size : int, default None + Maximum number of rows in each written row group. If None, + the row group size will be the minimum of the Table size + and 1024 * 1024. If set larger than 64Mi then 64Mi will + be used instead. + + """ + def close(self) -> None: + """ + Close the connection to the Parquet file. + """ + def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: + """ + Add key-value metadata to the file. + This will overwrite any existing metadata with the same key. + + Parameters + ---------- + key_value_metadata : dict + Keys and values must be string-like / coercible to bytes. + """ + +class ParquetDataset: + """ + Encapsulates details of reading a complete Parquet dataset possibly + consisting of multiple files and partitions in subdirectories. + + Parameters + ---------- + path_or_paths : str or List[str] + A directory name, single file name, or list of file names. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + schema : pyarrow.parquet.Schema + Optionally provide the Schema for the Dataset, in which case it will + not be inferred from the source. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. + Within-file level filtering and different partitioning schemes are supported. + + Predicates are expressed using an ``Expression`` or using + the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. + DNF allows arbitrary boolean logical combinations of single column predicates. + The innermost tuples each describe a single column predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these filters + as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. 
+ + Each tuple has format: (``key``, ``op``, ``value``) and compares the + ``key`` with the ``value``. + The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, + ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the + ``value`` must be a collection such as a ``list``, a ``set`` or a + ``tuple``. + + Examples: + + Using the ``Expression`` API: + + .. code-block:: python + + import pyarrow.compute as pc + pc.field('x') == 0 + pc.field('y').isin(['a', 'b', 'c']) + ~pc.field('y').isin({'a', 'b'}) + + Using the DNF format: + + .. code-block:: python + + ("x", "=", 0) + ("y", "in", ["a", "b", "c"]) + ("z", "not in", {"a", "b"}) + + + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + pre_buffer : bool, default True + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a + background I/O thread pool. If using a filesystem layer that itself + performs readahead (e.g. fsspec's S3FS), disable readahead for best + results. Set to False if you want to prioritize minimal memory usage + over maximum speed. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular resolution + (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 + timestamps will be inferred as timestamps in nanoseconds. + decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file.
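To make the filter forms described above concrete, a brief sketch that reuses the partitioned ``dataset_v2`` layout from the examples below; the two spellings are intended to select the same rows (illustration only, not part of the upstream docstring):

    import pyarrow.compute as pc
    import pyarrow.parquet as pq

    # DNF form: (year == 2021 AND n_legs >= 4) OR animal == "Parrot"
    dnf = [[("year", "=", 2021), ("n_legs", ">=", 4)], [("animal", "=", "Parrot")]]
    expr = ((pc.field("year") == 2021) & (pc.field("n_legs") >= 4)) | (pc.field("animal") == "Parrot")

    pq.ParquetDataset("dataset_v2/", filters=dnf).read()
    pq.ParquetDataset("dataset_v2/", filters=expr).read()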
+ + Examples + -------- + Generate an example PyArrow Table and write it to a partitioned dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2", partition_cols=["year"]) + + create a ParquetDataset object from the dataset source: + + >>> dataset = pq.ParquetDataset("dataset_v2/") + + and read the data: + + >>> dataset.read().to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + + create a ParquetDataset object with filter: + + >>> dataset = pq.ParquetDataset("dataset_v2/", filters=[("n_legs", "=", 4)]) + >>> dataset.read().to_pandas() + n_legs animal year + 0 4 Dog 2021 + 1 4 Horse 2022 + """ + def __init__( + self, + path_or_paths: SingleOrList[str] + | SingleOrList[Path] + | SingleOrList[NativeFile] + | SingleOrList[IO], + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + *, + filters: Expression | FilterTuple | list[FilterTuple] | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning | None = "hive", + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + ): ... + def equals(self, other: ParquetDataset) -> bool: ... + @property + def schema(self) -> Schema: + """ + Schema of the Dataset. + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_schema", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_schema/") + + Read the schema: + + >>> dataset.schema + n_legs: int64 + animal: string + year: dictionary + """ + def read( + self, + columns: list[str] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read (multiple) Parquet files as a single pyarrow.Table. + + Parameters + ---------- + columns : List[str] + Names of columns to read from the dataset. The partition fields + are not automatically included. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.Table + Content of the file as a table (of columns). + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_read", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_read/") + + Read the dataset: + + >>> dataset.read(columns=["n_legs"]) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[5],[2],[4,100],[2,4]] + """ + def read_pandas(self, **kwargs) -> Table: + """ + Read dataset including pandas metadata, if any. Other arguments passed + through to :func:`read`, see docstring for further details. + + Parameters + ---------- + **kwargs : optional + Additional options for :func:`read` + + Examples + -------- + Generate an example parquet file: + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "table_V2.parquet") + >>> dataset = pq.ParquetDataset("table_V2.parquet") + + Read the dataset with pandas metadata: + + >>> dataset.read_pandas(columns=["n_legs"]) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,2,4,4,5,100]] + + >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata + {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, ...} + """ + @property + def fragments(self) -> list[ParquetFileFragment]: + """ + A list of the Dataset source fragments or pieces with absolute + file paths. + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_fragments", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_fragments/") + + List the fragments: + + >>> dataset.fragments + [<pyarrow.dataset.ParquetFileFragment path=dataset_v2_fragments/... + """ + @property + def files(self) -> list[str]: + """ + A list of absolute Parquet file paths in the Dataset source. + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_files", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_files/") + + List the files: + + >>> dataset.files + ['dataset_v2_files/year=2019/...-0.parquet', ... + """ + @property + def filesystem(self) -> FileSystem: + """ + The filesystem type of the Dataset source. + """ + @property + def partitioning(self) -> Partitioning: + """ + The partitioning of the Dataset source, if discovered.
+ """ + +def read_table( + source: SingleOrList[str] | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO], + *, + columns: list | None = None, + use_threads: bool = True, + schema: Schema | None = None, + use_pandas_metadata: bool = False, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning | None = "hive", + filesystem: SupportedFileSystem | None = None, + filters: Expression | FilterTuple | list[FilterTuple] | None = None, + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, +) -> Table: + """ + Read a Table from Parquet format + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + use_threads : bool, default True + Perform multi-threaded column reads. + schema : Schema, optional + Optionally provide the Schema for the parquet dataset, in which case it + will not be inferred from the source. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. 
+ Within-file level filtering and different partitioning schemes are supported. + + Predicates are expressed using an ``Expression`` or using + the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. + DNF allows arbitrary boolean logical combinations of single column predicates. + The innermost tuples each describe a single column predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these filters + as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. + + Each tuple has format: (``key``, ``op``, ``value``) and compares the + ``key`` with the ``value``. + The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, + ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the + ``value`` must be a collection such as a ``list``, a ``set`` or a + ``tuple``. + + Examples: + + Using the ``Expression`` API: + + .. code-block:: python + + import pyarrow.compute as pc + pc.field('x') == 0 + pc.field('y').isin(['a', 'b', 'c']) + ~pc.field('y').isin({'a', 'b'}) + + Using the DNF format: + + .. code-block:: python + + ("x", "=", 0) + ("y", "in", ["a", "b", "c"]) + ("z", "not in", {"a", "b"}) + + + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + pre_buffer : bool, default True + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3). If True, Arrow will use a + background I/O thread pool. If using a filesystem layer that itself + performs readahead (e.g. fsspec's S3FS), disable readahead for best + results. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds. + decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Returns + ------- + pyarrow.Table + Content of the file as a table (of columns) + + + Examples + -------- + + Generate an example PyArrow Table and write it to a partitioned dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ...
) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_name_2", partition_cols=["year"]) + + Read the data: + + >>> pq.read_table("dataset_name_2").to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + + + Read only a subset of columns: + + >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"]) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[5],[2],[4,100],[2,4]] + animal: [["Brittle stars"],["Flamingo"],["Dog","Centipede"],["Parrot","Horse"]] + + Read a subset of columns and read one column as DictionaryArray: + + >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"], read_dictionary=["animal"]) + pyarrow.Table + n_legs: int64 + animal: dictionary + ---- + n_legs: [[5],[2],[4,100],[2,4]] + animal: [ -- dictionary: + ["Brittle stars"] -- indices: + [0], -- dictionary: + ["Flamingo"] -- indices: + [0], -- dictionary: + ["Dog","Centipede"] -- indices: + [0,1], -- dictionary: + ["Parrot","Horse"] -- indices: + [0,1]] + + Read the table with filter: + + >>> pq.read_table( + ... "dataset_name_2", columns=["n_legs", "animal"], filters=[("n_legs", "<", 4)] + ... ).to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + + Read data from a single Parquet file: + + >>> pq.write_table(table, "example.parquet") + >>> pq.read_table("dataset_name_2").to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + """ + +def read_pandas( + source: str | Path | NativeFile | IO, columns: list | None = None, **kwargs +) -> Table: + """ + + Read a Table from Parquet format, also reading DataFrame + index values if known in the file metadata + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + use_threads : bool, default True + Perform multi-threaded column reads. + schema : Schema, optional + Optionally provide the Schema for the parquet dataset, in which case it + will not be inferred from the source. + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. 
The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + **kwargs + additional options for :func:`read_table` + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. + Within-file level filtering and different partitioning schemes are supported. + + Predicates are expressed using an ``Expression`` or using + the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. + DNF allows arbitrary boolean logical combinations of single column predicates. + The innermost tuples each describe a single column predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these filters + as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. + + Each tuple has format: (``key``, ``op``, ``value``) and compares the + ``key`` with the ``value``. + The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, + ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the + ``value`` must be a collection such as a ``list``, a ``set`` or a + ``tuple``. + + Examples: + + Using the ``Expression`` API: + + .. code-block:: python + + import pyarrow.compute as pc + pc.field('x') = 0 + pc.field('y').isin(['a', 'b', 'c']) + ~pc.field('y').isin({'a', 'b'}) + + Using the DNF format: + + .. code-block:: python + + ("x", "=", 0) + ("y", "in", ["a", "b", "c"]) + ("z", "not in", {"a", "b"}) + + + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + pre_buffer : bool, default True + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3). If True, Arrow will use a + background I/O thread pool. If using a filesystem layer that itself + performs readahead (e.g. fsspec's S3FS), disable readahead for best + results. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds. + decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. 
The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Returns + ------- + pyarrow.Table + Content of the file as a Table of Columns, including DataFrame + indexes as columns + """ + +def write_table( + table: Table, + where: str | Path | NativeFile | IO, + row_group_size: int | None = None, + version: Literal["1.0", "2.4", "2.6"] = "2.6", + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + coerce_timestamps: str | None = None, + allow_truncated_timestamps: bool = False, + data_page_size: int | None = None, + flavor: str | None = None, + filesystem: SupportedFileSystem | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool = False, + column_encoding: str | dict | None = None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, + **kwargs, +) -> None: + """ + + Write a Table to Parquet format. + + Parameters + ---------- + table : pyarrow.Table + where : string or pyarrow.NativeFile + row_group_size : int + Maximum number of rows in each written row group. If None, the + row group size will be the minimum of the Table size and + 1024 * 1024. + version : {"1.0", "2.4", "2.6"}, default "2.6" + Determine which Parquet logical types are available for use, whether the + reduced set from the Parquet 1.x.x format or the expanded logical types + added in later format versions. + Files written with version='2.4' or '2.6' may not be readable in all + Parquet implementations, so version='1.0' is likely the choice that + maximizes file compatibility. + UINT32 and some logical types are only available with version '2.4'. + Nanosecond timestamps are only available with version '2.6'. + Other features such as compression algorithms or the new serialized + data page format must be enabled separately (see 'compression' and + 'data_page_version'). + use_dictionary : bool or list, default True + Specify if we should use dictionary encoding in general or only for + some columns. + When encoding the column, if the dictionary size is too large, the + column will fall back to ``PLAIN`` encoding. In particular, ``BOOLEAN`` type + doesn't support dictionary encoding. + compression : str or dict, default 'snappy' + Specify the compression codec, either on a general basis or per-column. + Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. + write_statistics : bool or list, default True + Specify if we should write statistics in general (default is True) or only + for some columns. + use_deprecated_int96_timestamps : bool, default None + Write timestamps to INT96 Parquet format. Defaults to False unless enabled + by flavor argument. This takes priority over the coerce_timestamps option.
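A brief sketch of the ``version``/timestamp interplay noted above and in the ``coerce_timestamps`` entry that follows (illustrative only; file names are arbitrary):

    import datetime
    import pyarrow as pa
    import pyarrow.parquet as pq

    t = pa.table({"ts": pa.array([datetime.datetime(2023, 1, 1, 12, 30)], type=pa.timestamp("ns"))})
    # The default version="2.6" keeps nanosecond timestamps as-is.
    pq.write_table(t, "ts_26.parquet")
    # version="2.4" has no nanosecond type, so coerce to microseconds explicitly.
    pq.write_table(t, "ts_24.parquet", version="2.4", coerce_timestamps="us", allow_truncated_timestamps=True)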
+ coerce_timestamps : str, default None + Cast timestamps to a particular resolution. If omitted, defaults are chosen + depending on `version`. For ``version='1.0'`` and ``version='2.4'``, + nanoseconds are cast to microseconds ('us'), while for + ``version='2.6'`` (the default), they are written natively without loss + of resolution. Seconds are always cast to milliseconds ('ms') by default, + as Parquet does not have any temporal type with seconds resolution. + If the casting results in loss of data, it will raise an exception + unless ``allow_truncated_timestamps=True`` is given. + Valid values: {None, 'ms', 'us'} + allow_truncated_timestamps : bool, default False + Allow loss of data when coercing timestamps to a particular + resolution. E.g. if microsecond or nanosecond data is lost when coercing to + 'ms', do not raise an exception. Passing ``allow_truncated_timestamps=True`` + will NOT result in the truncation exception being ignored unless + ``coerce_timestamps`` is not None. + data_page_size : int, default None + Set a target threshold for the approximate encoded size of data + pages within a column chunk (in bytes). If None, use the default data page + size of 1MByte. + flavor : {'spark'}, default None + Sanitize schema or set other compatibility options to work with + various target systems. + filesystem : FileSystem, default None + If nothing passed, will be inferred from `where` if path-like, else + `where` is already a file-like object so no filesystem is needed. + compression_level : int or dict, default None + Specify the compression level for a codec, either on a general basis or + per-column. If None is passed, arrow selects the compression level for + the compression codec in use. The compression level has a different + meaning for each codec, so you have to read the documentation of the + codec you are using. + An exception is thrown if the compression codec does not allow specifying + a compression level. + use_byte_stream_split : bool or list, default False + Specify if the byte_stream_split encoding should be used in general or + only for some columns. If both dictionary and byte_stream_split are + enabled, then dictionary is preferred. + The byte_stream_split encoding is valid for integer, floating-point + and fixed-size binary data types (including decimals); it should be + combined with a compression codec so as to achieve size reduction. + column_encoding : string or dict, default None + Specify the encoding scheme on a per column basis. + Can only be used when ``use_dictionary`` is set to False, and + cannot be used in combination with ``use_byte_stream_split``. + Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', + 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. + Certain encodings are only compatible with certain data types. + Please refer to the encodings section of `Reading and writing Parquet + files <https://arrow.apache.org/docs/cpp/parquet.html#encodings>`_. + data_page_version : {"1.0", "2.0"}, default "1.0" + The serialized Parquet data page format version to write, defaults to + 1.0. This does not impact the file schema logical types and Arrow to + Parquet type casting behavior; for that use the "version" option. + use_compliant_nested_type : bool, default True + Whether to write compliant Parquet nested type (lists) as defined + `here <https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types>`_, defaults to ``True``.
+ For ``use_compliant_nested_type=True``, this will write into a list + with 3-level structure where the middle level, named ``list``, + is a repeated group with a single field named ``element``:: + + <list-repetition> group <name> (LIST) { + repeated group list { + <element-repetition> <element-type> element; + } + } + + For ``use_compliant_nested_type=False``, this will also write into a list + with 3-level structure, where the name of the single field of the middle + level ``list`` is taken from the element name for nested columns in Arrow, + which defaults to ``item``:: + + <list-repetition> group <name> (LIST) { + repeated group list { + <element-repetition> <element-type> item; + } + } + encryption_properties : FileEncryptionProperties, default None + File encryption properties for Parquet Modular Encryption. + If None, no encryption will be done. + The encryption properties can be created using: + ``CryptoFactory.file_encryption_properties()``. + write_batch_size : int, default None + Number of values to write to a page at a time. If None, use the default of + 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages + are exceeding the ``data_page_size`` due to large column values, lowering + the batch size can help keep page sizes closer to the intended size. + dictionary_pagesize_limit : int, default None + Specify the dictionary page size limit per row group. If None, use the + default 1MB. + store_schema : bool, default True + By default, the Arrow schema is serialized and stored in the Parquet + file metadata (in the "ARROW:schema" key). When reading the file, + if this key is available, it will be used to more faithfully recreate + the original Arrow data. For example, for tz-aware timestamp columns + it will restore the timezone (Parquet only stores the UTC values without + timezone), or columns with duration type will be restored from the int64 + Parquet column. + write_page_index : bool, default False + Whether to write a page index in general for all columns. + Writing statistics to the page index disables the old method of writing + statistics to each data page header. The page index makes statistics-based + filtering more efficient than the page header, as it gathers all the + statistics for a Parquet file in a single place, avoiding scattered I/O. + Note that the page index is not yet used on the read side by PyArrow. + write_page_checksum : bool, default False + Whether to write page checksums in general for all columns. + Page checksums enable detection of data corruption, which might occur during + transmission or in the storage. + sorting_columns : Sequence of SortingColumn, default None + Specify the sort order of the data being written. The writer does not sort + the data nor does it verify that the data is sorted. The sort order is + written to the row group metadata, which can then be used by readers. + store_decimal_as_integer : bool, default False + Allow decimals with 1 <= precision <= 18 to be stored as integers. + In Parquet, DECIMAL can be stored in any of the following physical types: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18.
+ - fixed_len_byte_array: for precision > 18. + + As a consequence, decimal columns stored in integer types are more compact. + + **kwargs : optional + Additional options for ParquetWriter + + Examples + -------- + Generate an example PyArrow Table: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + and write the Table into a Parquet file: + + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + + Defining row group size for the Parquet file: + + >>> pq.write_table(table, "example.parquet", row_group_size=3) + + Defining row group compression (default is Snappy): + + >>> pq.write_table(table, "example.parquet", compression="none") + + Defining row group compression and encoding per-column: + + >>> pq.write_table( + ... table, + ... "example.parquet", + ... compression={"n_legs": "snappy", "animal": "gzip"}, + ... use_dictionary=["n_legs", "animal"], + ... ) + + Defining column encoding per-column: + + >>> pq.write_table( + ... table, "example.parquet", column_encoding={"animal": "PLAIN"}, use_dictionary=False + ... ) + """ + +def write_to_dataset( + table: Table, + root_path: str | Path, + partition_cols: list[str] | None = None, + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + partitioning: Partitioning | list[str] | None = None, + basename_template: str | None = None, + use_threads: bool | None = None, + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] + | None = None, + **kwargs, +) -> None: + """ + Wrapper around dataset.write_dataset for writing a Table to + Parquet format by partitions. + For each combination of partition columns and values, + subdirectories are created in the following + manner: + + root_dir/ + group1=value1 + group2=value1 + <uuid>.parquet + group2=value2 + <uuid>.parquet + group1=valueN + group2=value1 + <uuid>.parquet + group2=valueN + <uuid>.parquet + + Parameters + ---------- + table : pyarrow.Table + root_path : str, pathlib.Path + The root directory of the dataset. + partition_cols : list, + Column names by which to partition the dataset. + Columns are partitioned in the order they are given. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + schema : Schema, optional + The Schema of the dataset. + partitioning : Partitioning or list[str], optional + The partitioning scheme specified with the + ``pyarrow.dataset.partitioning()`` function or a list of field names. + When providing a list of field names, you can use + ``partitioning_flavor`` to drive which partitioning type should be + used. + basename_template : str, optional + A template string used to generate basenames of written data files. + The token '{i}' will be replaced with an automatically incremented + integer. If not specified, it defaults to "guid-{i}.parquet". + use_threads : bool, default True + Write files in parallel. If enabled, then maximum parallelism will be + used, determined by the number of available CPU cores. + file_visitor : function + If set, this function will be called with a WrittenFile instance + for each file created during the call. This object will have both + a path attribute and a metadata attribute.
+
+        The path attribute will be a string containing the path to
+        the created file.
+
+        The metadata attribute will be the parquet metadata of the file.
+        This metadata will have the file path attribute set and can be used
+        to build a _metadata file. The metadata attribute will be None if
+        the format is not parquet.
+
+        Example visitor which simply collects the filenames created::
+
+            visited_paths = []
+
+            def file_visitor(written_file):
+                visited_paths.append(written_file.path)
+
+    existing_data_behavior : 'overwrite_or_ignore' | 'error' | 'delete_matching'
+        Controls how the dataset will handle data that already exists in
+        the destination. The default behaviour is 'overwrite_or_ignore'.
+
+        'overwrite_or_ignore' will ignore any existing data and will
+        overwrite files with the same name as an output file. Other
+        existing files will be ignored. This behavior, in combination
+        with a unique basename_template for each write, will allow for
+        an append workflow.
+
+        'error' will raise an error if any data exists in the destination.
+
+        'delete_matching' is useful when you are writing a partitioned
+        dataset. The first time each partition directory is encountered
+        the entire directory will be deleted. This allows you to overwrite
+        old partitions completely.
+    **kwargs : dict,
+        Used as additional kwargs for :func:`pyarrow.dataset.write_dataset`
+        function for matching kwargs, and remainder to
+        :func:`pyarrow.dataset.ParquetFileFormat.make_write_options`.
+        See the docstring of :func:`write_table` and
+        :func:`pyarrow.dataset.write_dataset` for the available options.
+        Using `metadata_collector` in kwargs allows one to collect the
+        file metadata instances of dataset pieces. The file paths in the
+        ColumnChunkMetaData will be set relative to `root_path`.
+
+    Examples
+    --------
+    Generate an example PyArrow Table:
+
+    >>> import pyarrow as pa
+    >>> table = pa.table(
+    ...     {
+    ...         "year": [2020, 2022, 2021, 2022, 2019, 2021],
+    ...         "n_legs": [2, 2, 4, 4, 5, 100],
+    ...         "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"],
+    ...     }
+    ... )
+
+    and write it to a partitioned dataset:
+
+    >>> import pyarrow.parquet as pq
+    >>> pq.write_to_dataset(table, root_path="dataset_name_3", partition_cols=["year"])
+    >>> pq.ParquetDataset("dataset_name_3").files
+    ['dataset_name_3/year=2019/...-0.parquet', ...
+
+    Write a single Parquet file into the root folder:
+
+    >>> pq.write_to_dataset(table, root_path="dataset_name_4")
+    >>> pq.ParquetDataset("dataset_name_4/").files
+    ['dataset_name_4/...-0.parquet']
+    """
+
+def write_metadata(
+    schema: Schema,
+    where: str | NativeFile,
+    metadata_collector: list[FileMetaData] | None = None,
+    filesystem: SupportedFileSystem | None = None,
+    **kwargs,
+) -> None:
+    """
+    Write metadata-only Parquet file from schema. This can be used with
+    `write_to_dataset` to generate `_common_metadata` and `_metadata` sidecar
+    files.
+
+    Parameters
+    ----------
+    schema : pyarrow.Schema
+    where : string or pyarrow.NativeFile
+    metadata_collector : list
+        Where to collect metadata information.
+    filesystem : FileSystem, default None
+        If nothing passed, will be inferred from `where` if path-like, else
+        `where` is already a file-like object so no filesystem is needed.
+    **kwargs : dict,
+        Additional kwargs for ParquetWriter class. See docstring for
+        `ParquetWriter` for more information.
+
+    Examples
+    --------
+    Generate example data:
+
+    >>> import pyarrow as pa
+    >>> table = pa.table(
+    ...     {
+    ...         "n_legs": [2, 2, 4, 4, 5, 100],
+    ...
"animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Write a dataset and collect metadata information. + + >>> metadata_collector = [] + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, "dataset_metadata", metadata_collector=metadata_collector) + + Write the `_common_metadata` parquet file without row groups statistics. + + >>> pq.write_metadata(table.schema, "dataset_metadata/_common_metadata") + + Write the `_metadata` parquet file with row groups statistics. + + >>> pq.write_metadata( + ... table.schema, "dataset_metadata/_metadata", metadata_collector=metadata_collector + ... ) + """ + +def read_metadata( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | None = None, +) -> FileMetaData: + """ + Read FileMetaData from footer of a single Parquet file. + + Parameters + ---------- + where : str (file path) or file-like object + memory_map : bool, default False + Create memory map when the source is a file path. + decryption_properties : FileDecryptionProperties, default None + Decryption properties for reading encrypted Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + + Returns + ------- + metadata : FileMetaData + The metadata of the Parquet file + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) + >>> pq.write_table(table, "example.parquet") + + >>> pq.read_metadata("example.parquet") + + created_by: parquet-cpp-arrow version ... + num_columns: 2 + num_rows: 3 + num_row_groups: 1 + format_version: 2.6 + serialized_size: ... + """ + +def read_schema( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | None = None, +) -> Schema: + """ + Read effective Arrow schema from Parquet file metadata. + + Parameters + ---------- + where : str (file path) or file-like object + memory_map : bool, default False + Create memory map when the source is a file path. + decryption_properties : FileDecryptionProperties, default None + Decryption properties for reading encrypted Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. 
+ + Returns + ------- + schema : pyarrow.Schema + The schema of the Parquet file + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) + >>> pq.write_table(table, "example.parquet") + + >>> pq.read_schema("example.parquet") + n_legs: int64 + animal: string + """ diff --git a/python/pyarrow-stubs/parquet/encryption.pyi b/python/pyarrow-stubs/parquet/encryption.pyi new file mode 100644 index 00000000000..5a77dae7ef7 --- /dev/null +++ b/python/pyarrow-stubs/parquet/encryption.pyi @@ -0,0 +1,15 @@ +from pyarrow._parquet_encryption import ( + CryptoFactory, + DecryptionConfiguration, + EncryptionConfiguration, + KmsClient, + KmsConnectionConfig, +) + +__all__ = [ + "CryptoFactory", + "DecryptionConfiguration", + "EncryptionConfiguration", + "KmsClient", + "KmsConnectionConfig", +] diff --git a/python/pyarrow-stubs/substrait.pyi b/python/pyarrow-stubs/substrait.pyi new file mode 100644 index 00000000000..a56a8a5b40f --- /dev/null +++ b/python/pyarrow-stubs/substrait.pyi @@ -0,0 +1,21 @@ +from pyarrow._substrait import ( + BoundExpressions, + SubstraitSchema, + deserialize_expressions, + deserialize_schema, + get_supported_functions, + run_query, + serialize_expressions, + serialize_schema, +) + +__all__ = [ + "BoundExpressions", + "get_supported_functions", + "run_query", + "deserialize_expressions", + "serialize_expressions", + "deserialize_schema", + "serialize_schema", + "SubstraitSchema", +] From 45f301c533346d1183504f6aa60973dfeaddbf31 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 16 Sep 2025 03:00:02 +0200 Subject: [PATCH 15/26] workflow --- .github/workflows/python.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 8630dab7e93..443f336a975 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -142,11 +142,11 @@ jobs: run: |- python -m pip install mypy pyright scipy-stubs pandas-stubs types-python-dateutil types-requests griffe libcst pushd python; - pip install -e . - python -m mypy pyarrow/*.pyi pyarrow/__lib_pxi/*.pyi pyarrow/tests/test_array.py pyarrow/tests/test_io.py - python -m pyright pyarrow/*.pyi pyarrow/__lib_pxi/*.pyi - python ../dev/update_stub_docstrings.py -f ./pyarrow - git status --porcelain=1 + # pip install -e . 
+ python -m mypy pyarrow-stubs/ pyarrow/tests/test_array.py pyarrow/tests/test_io.py + python -m pyright pyarrow-stubs/ + # python ../dev/update_stub_docstrings.py -f ./pyarrow + # git status --porcelain=1 macos: name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} Python 3 From 3e516e2c560b95ca619689b9ccb5ad257eb432ac Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 16 Sep 2025 17:56:10 +0200 Subject: [PATCH 16/26] work --- .github/workflows/python.yml | 6 +- python/pyarrow-stubs/__init__.pyi | 52 +- python/pyarrow-stubs/_benchmark.pyi | 3 - python/pyarrow-stubs/_csv.pyi | 1101 ++++++++++++++------------ python/pyarrow-stubs/_cuda.pyi | 53 +- python/pyarrow-stubs/_fs.pyi | 4 +- python/pyarrow-stubs/compute.pyi | 2 +- python/pyarrow-stubs/lib.pyi | 52 +- python/pyarrow-stubs/pandas_shim.pyi | 22 +- 9 files changed, 649 insertions(+), 646 deletions(-) delete mode 100644 python/pyarrow-stubs/_benchmark.pyi diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 443f336a975..700218024a5 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -140,11 +140,11 @@ jobs: - name: Type check with mypy and pyright run: |- - python -m pip install mypy pyright scipy-stubs pandas-stubs types-python-dateutil types-requests griffe libcst + python -m pip install mypy pyright scipy-stubs pandas-stubs types-python-dateutil types-psutil types-requests griffe libcst pushd python; # pip install -e . - python -m mypy pyarrow-stubs/ pyarrow/tests/test_array.py pyarrow/tests/test_io.py - python -m pyright pyarrow-stubs/ + mypy pyarrow-stubs pyarrow/tests/test_array.py pyarrow/tests/test_io.py + pyright pyarrow-stubs # python ../dev/update_stub_docstrings.py -f ./pyarrow # git status --porcelain=1 diff --git a/python/pyarrow-stubs/__init__.pyi b/python/pyarrow-stubs/__init__.pyi index d74b486fd55..6df38801de1 100644 --- a/python/pyarrow-stubs/__init__.pyi +++ b/python/pyarrow-stubs/__init__.pyi @@ -39,14 +39,8 @@ from pyarrow.lib import ( set_io_thread_count, ) -def show_versions() -> None: - """ - Print various version information, to help with error reporting. - """ -def show_info() -> None: - """ - Print detailed version and platform information, for error reporting - """ +def show_versions() -> None: ... +def show_info() -> None: ... def _module_is_available(module: str) -> bool: ... def _filesystem_is_available(fs: str) -> bool: ... @@ -335,14 +329,15 @@ from pyarrow.lib import ( ArrowSerializationError, ) -from .ipc import serialize_pandas, deserialize_pandas +from pyarrow.ipc import serialize_pandas, deserialize_pandas +import pyarrow.ipc as ipc -import types as types +import pyarrow.types as types # ---------------------------------------------------------------------- # Deprecations -from .util import _deprecate_api, _deprecate_class +from pyarrow.util import _deprecate_api, _deprecate_class from pyarrow.ipc import ( Message, @@ -357,39 +352,13 @@ from pyarrow.ipc import ( # ---------------------------------------------------------------------- # Returning absolute path to the pyarrow include directory (if bundled, e.g. in # wheels) -def get_include() -> str: - """ - Return absolute path to directory containing Arrow C++ include - headers. Similar to numpy.get_include - """ +def get_include() -> str: ... def _get_pkg_config_executable() -> str: ... def _has_pkg_config(pkgname: str) -> bool: ... def _read_pkg_config_variable(pkgname: str, cli_args: list[str]) -> str: ... 
-def get_libraries() -> list[str]: - """ - Return list of library names to include in the `libraries` argument for C - or Cython extensions using pyarrow - """ -def create_library_symlinks() -> None: - """ - With Linux and macOS wheels, the bundled shared libraries have an embedded - ABI version like libarrow.so.17 or libarrow.17.dylib and so linking to them - with -larrow won't work unless we create symlinks at locations like - site-packages/pyarrow/libarrow.so. This unfortunate workaround addresses - prior problems we had with shipping two copies of the shared libraries to - permit third party projects like turbodbc to build their C++ extensions - against the pyarrow wheels. - - This function must only be invoked once and only when the shared libraries - are bundled with the Python package, which should only apply to wheel-based - installs. It requires write access to the site-packages/pyarrow directory - and so depending on your system may need to be run with root. - """ -def get_library_dirs() -> list[str]: - """ - Return lists of directories likely to contain Arrow C++ libraries for - linking C or Cython extensions using pyarrow - """ +def get_libraries() -> list[str]: ... +def create_library_symlinks() -> None: ... +def get_library_dirs() -> list[str]: ... __all__ = [ "__version__", @@ -681,6 +650,7 @@ __all__ = [ "ArrowSerializationError", "serialize_pandas", "deserialize_pandas", + "ipc", "types", "_deprecate_api", "_deprecate_class", diff --git a/python/pyarrow-stubs/_benchmark.pyi b/python/pyarrow-stubs/_benchmark.pyi deleted file mode 100644 index 048973301dc..00000000000 --- a/python/pyarrow-stubs/_benchmark.pyi +++ /dev/null @@ -1,3 +0,0 @@ -from pyarrow.lib import benchmark_PandasObjectIsNull - -__all__ = ["benchmark_PandasObjectIsNull"] diff --git a/python/pyarrow-stubs/_csv.pyi b/python/pyarrow-stubs/_csv.pyi index ad52b2f380f..2f49f8c9a6c 100644 --- a/python/pyarrow-stubs/_csv.pyi +++ b/python/pyarrow-stubs/_csv.pyi @@ -1,556 +1,641 @@ -from typing import Any +from dataclasses import dataclass, field +from typing import IO, Any, Callable, Literal -import cuda # type: ignore[import-not-found] - -from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-not-found] +from _typeshed import StrPath from . import lib -from ._stubs_typing import ArrayLike -class Context(lib._Weakrefable): +@dataclass(kw_only=True) +class ReadOptions(lib._Weakrefable): """ - CUDA driver context. + Options for reading CSV files. + + Parameters + ---------- + use_threads : bool, optional (default True) + Whether to use multiple threads to accelerate reading + block_size : int, optional + How much bytes to process at a time from the input stream. + This will determine multi-threading granularity as well as + the size of individual record batches or table chunks. + Minimum valid value for block size is 1 + skip_rows : int, optional (default 0) + The number of rows to skip before the column names (if any) + and the CSV data. + skip_rows_after_names : int, optional (default 0) + The number of rows to skip after the column names. + This number can be larger than the number of rows in one + block, and empty rows are counted. + The order of application is as follows: + - `skip_rows` is applied (if non-zero); + - column names are read (unless `column_names` is set); + - `skip_rows_after_names` is applied (if non-zero). + column_names : list, optional + The column names of the target table. If empty, fall back on + `autogenerate_column_names`. 
+ autogenerate_column_names : bool, optional (default False) + Whether to autogenerate column names if `column_names` is empty. + If true, column names will be of the form "f0", "f1"... + If false, column names will be read from the first CSV row + after `skip_rows`. + encoding : str, optional (default 'utf8') + The character encoding of the CSV data. Columns that cannot + decode using this encoding can still be read as Binary. + + Examples + -------- + + Defining an example data: + + >>> import io + >>> s = "1,2,3\\nFlamingo,2,2022-03-01\\nHorse,4,2022-03-02\\nBrittle stars,5,2022-03-03\\nCentipede,100,2022-03-04" + >>> print(s) + 1,2,3 + Flamingo,2,2022-03-01 + Horse,4,2022-03-02 + Brittle stars,5,2022-03-03 + Centipede,100,2022-03-04 + + Ignore the first numbered row and substitute it with defined + or autogenerated column names: + + >>> from pyarrow import csv + >>> read_options = csv.ReadOptions(column_names=["animals", "n_legs", "entry"], skip_rows=1) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + + >>> read_options = csv.ReadOptions(autogenerate_column_names=True, skip_rows=1) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + f0: string + f1: int64 + f2: date32[day] + ---- + f0: [["Flamingo","Horse","Brittle stars","Centipede"]] + f1: [[2,4,5,100]] + f2: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + + Remove the first 2 rows of the data: + + >>> read_options = csv.ReadOptions(skip_rows_after_names=2) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + 1: string + 2: int64 + 3: date32[day] + ---- + 1: [["Brittle stars","Centipede"]] + 2: [[5,100]] + 3: [[2022-03-03,2022-03-04]] """ - def __init__(self, device_number: int = 0, handle: int | None = None) -> None: - """ - Create a CUDA driver context for a particular device. - - If a CUDA context handle is passed, it is wrapped, otherwise - a default CUDA context for the given device is requested. - - Parameters - ---------- - device_number : int (default 0) - Specify the GPU device for which the CUDA driver context is - requested. - handle : int, optional - Specify CUDA handle for a shared context that has been created - by another library. - """ - @staticmethod - def from_numba(context: _numba_driver.Context | None = None) -> Context: - """ - Create a Context instance from a Numba CUDA context. - - Parameters - ---------- - context : {numba.cuda.cudadrv.driver.Context, None} - A Numba CUDA context instance. - If None, the current Numba context is used. - - Returns - ------- - shared_context : pyarrow.cuda.Context - Context instance. - """ - def to_numba(self) -> _numba_driver.Context: - """ - Convert Context to a Numba CUDA context. - - Returns - ------- - context : numba.cuda.cudadrv.driver.Context - Numba CUDA context instance. - """ - @staticmethod - def get_num_devices() -> int: - """Return the number of GPU devices.""" - @property - def device_number(self) -> int: - """Return context device number.""" - @property - def handle(self) -> int: - """Return pointer to context handle.""" - def synchronize(self) -> None: - """Blocks until the device has completed all preceding requested - tasks. 
- """ - @property - def bytes_allocated(self) -> int: - """Return the number of allocated bytes.""" - def get_device_address(self, address: int) -> int: - """Return the device address that is reachable from kernels running in - the context - - Parameters - ---------- - address : int - Specify memory address value - - Returns - ------- - device_address : int - Device address accessible from device context - - Notes - ----- - The device address is defined as a memory address accessible - by device. While it is often a device memory address but it - can be also a host memory address, for instance, when the - memory is allocated as host memory (using cudaMallocHost or - cudaHostAlloc) or as managed memory (using cudaMallocManaged) - or the host memory is page-locked (using cudaHostRegister). - """ - def new_buffer(self, nbytes: int) -> CudaBuffer: - """Return new device buffer. - - Parameters - ---------- - nbytes : int - Specify the number of bytes to be allocated. - - Returns - ------- - buf : CudaBuffer - Allocated buffer. - """ - @property - def memory_manager(self) -> lib.MemoryManager: - """ - The default memory manager tied to this context's device. - - Returns - ------- - MemoryManager - """ - @property - def device(self) -> lib.Device: - """ - The device instance associated with this context. - - Returns - ------- - Device - """ - def foreign_buffer(self, address: int, size: int, base: Any | None = None) -> CudaBuffer: - """ - Create device buffer from address and size as a view. - - The caller is responsible for allocating and freeing the - memory. When `address==size==0` then a new zero-sized buffer - is returned. - - Parameters - ---------- - address : int - Specify the starting address of the buffer. The address can - refer to both device or host memory but it must be - accessible from device after mapping it with - `get_device_address` method. - size : int - Specify the size of device buffer in bytes. - base : {None, object} - Specify object that owns the referenced memory. - - Returns - ------- - cbuf : CudaBuffer - Device buffer as a view of device reachable memory. - - """ - def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: - """Open existing CUDA IPC memory handle - - Parameters - ---------- - ipc_handle : IpcMemHandle - Specify opaque pointer to CUipcMemHandle (driver API). - - Returns - ------- - buf : CudaBuffer - referencing device buffer - """ - def buffer_from_data( - self, - data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, - offset: int = 0, - size: int = -1, - ) -> CudaBuffer: - """Create device buffer and initialize with data. - - Parameters - ---------- - data : {CudaBuffer, HostBuffer, Buffer, array-like} - Specify data to be copied to device buffer. - offset : int - Specify the offset of input buffer for device data - buffering. Default: 0. - size : int - Specify the size of device buffer in bytes. Default: all - (starting from input offset) - - Returns - ------- - cbuf : CudaBuffer - Device buffer with copied data. - """ - def buffer_from_object(self, obj: Any) -> CudaBuffer: - """Create device buffer view of arbitrary object that references - device accessible memory. - - When the object contains a non-contiguous view of device - accessible memory then the returned device buffer will contain - contiguous view of the memory, that is, including the - intermediate data that is otherwise invisible to the input - object. 
- - Parameters - ---------- - obj : {object, Buffer, HostBuffer, CudaBuffer, ...} - Specify an object that holds (device or host) address that - can be accessed from device. This includes objects with - types defined in pyarrow.cuda as well as arbitrary objects - that implement the CUDA array interface as defined by numba. - - Returns - ------- - cbuf : CudaBuffer - Device buffer as a view of device accessible memory. - - """ - -class IpcMemHandle(lib._Weakrefable): - """A serializable container for a CUDA IPC handle.""" - @staticmethod - def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: - """Create IpcMemHandle from opaque buffer (e.g. from another - process) - - Parameters - ---------- - opaque_handle : - a CUipcMemHandle as a const void* - - Returns - ------- - ipc_handle : IpcMemHandle - """ - def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: - """Write IpcMemHandle to a Buffer - - Parameters - ---------- - pool : {MemoryPool, None} - Specify a pool to allocate memory from - - Returns - ------- - buf : Buffer - The serialized buffer. - """ - -class CudaBuffer(lib.Buffer): - """An Arrow buffer with data located in a GPU device. - - To create a CudaBuffer instance, use Context.device_buffer(). - - The memory allocated in a CudaBuffer is freed when the buffer object - is deleted. + use_threads: bool = field(default=True, kw_only=False) + block_size: int | None = None + skip_rows: int = 0 + skip_rows_after_names: int = 0 + column_names: list[str] | None = None + autogenerate_column_names: bool = False + encoding: str = "utf8" + + def validate(self) -> None: ... + +@dataclass(kw_only=True) +class ParseOptions(lib._Weakrefable): """ + Options for parsing CSV files. - @staticmethod - def from_buffer(buf: lib.Buffer) -> CudaBuffer: - """Convert back generic buffer into CudaBuffer - - Parameters - ---------- - buf : Buffer - Specify buffer containing CudaBuffer - - Returns - ------- - dbuf : CudaBuffer - Resulting device buffer. - """ - @staticmethod - def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: - """Create a CudaBuffer view from numba MemoryPointer instance. - - Parameters - ---------- - mem : numba.cuda.cudadrv.driver.MemoryPointer - - Returns - ------- - cbuf : CudaBuffer - Device buffer as a view of numba MemoryPointer. - """ - def to_numba(self) -> _numba_driver.MemoryPointer: - """Return numba memory pointer of CudaBuffer instance.""" - def copy_to_host( - self, - position: int = 0, - nbytes: int = -1, - buf: lib.Buffer | None = None, - memory_pool: lib.MemoryPool | None = None, - resizable: bool = False, - ) -> lib.Buffer: - """Copy memory from GPU device to CPU host - - Caller is responsible for ensuring that all tasks affecting - the memory are finished. Use - - `.context.synchronize()` - - when needed. - - Parameters - ---------- - position : int - Specify the starting position of the source data in GPU - device buffer. Default: 0. - nbytes : int - Specify the number of bytes to copy. Default: -1 (all from - the position until host buffer is full). - buf : Buffer - Specify a pre-allocated output buffer in host. Default: None - (allocate new output buffer). - memory_pool : MemoryPool - resizable : bool - Specify extra arguments to allocate_buffer. Used only when - buf is None. - - Returns - ------- - buf : Buffer - Output buffer in host. - - """ - def copy_from_host( - self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 - ) -> int: - """Copy data from host to device. - - The device buffer must be pre-allocated. 
- - Parameters - ---------- - data : {Buffer, array-like} - Specify data in host. It can be array-like that is valid - argument to py_buffer - position : int - Specify the starting position of the copy in device buffer. - Default: 0. - nbytes : int - Specify the number of bytes to copy. Default: -1 (all from - source until device buffer, starting from position, is full) - - Returns - ------- - nbytes : int - Number of bytes copied. - """ - def copy_from_device(self, buf: CudaBuffer, position: int = 0, nbytes: int = -1) -> int: - """Copy data from device to device. - - Parameters - ---------- - buf : CudaBuffer - Specify source device buffer. - position : int - Specify the starting position of the copy in device buffer. - Default: 0. - nbytes : int - Specify the number of bytes to copy. Default: -1 (all from - source until device buffer, starting from position, is full) - - Returns - ------- - nbytes : int - Number of bytes copied. - - """ - def export_for_ipc(self) -> IpcMemHandle: - """ - Expose this device buffer as IPC memory which can be used in other - processes. - - After calling this function, this device memory will not be - freed when the CudaBuffer is destructed. - - Returns - ------- - ipc_handle : IpcMemHandle - The exported IPC handle - - """ - @property - def context(self) -> Context: - """Returns the CUDA driver context of this buffer.""" - def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: - """Return slice of device buffer - - Parameters - ---------- - offset : int, default 0 - Specify offset from the start of device buffer to slice - length : int, default None - Specify the length of slice (default is until end of device - buffer starting from offset). If the length is larger than - the data available, the returned slice will have a size of - the available data starting from the offset. - - Returns - ------- - sliced : CudaBuffer - Zero-copy slice of device buffer. - - """ - def to_pybytes(self) -> bytes: - """Return device buffer content as Python bytes.""" - -class HostBuffer(lib.Buffer): - """Device-accessible CPU memory created using cudaHostAlloc. - - To create a HostBuffer instance, use - - cuda.new_host_buffer() + Parameters + ---------- + delimiter : 1-character string, optional (default ',') + The character delimiting individual cells in the CSV data. + quote_char : 1-character string or False, optional (default '"') + The character used optionally for quoting CSV values + (False if quoting is not allowed). + double_quote : bool, optional (default True) + Whether two quotes in a quoted CSV value denote a single quote + in the data. + escape_char : 1-character string or False, optional (default False) + The character used optionally for escaping special characters + (False if escaping is not allowed). + newlines_in_values : bool, optional (default False) + Whether newline characters are allowed in CSV values. + Setting this to True reduces the performance of multi-threaded + CSV reading. + ignore_empty_lines : bool, optional (default True) + Whether empty lines are ignored in CSV input. + If False, an empty line is interpreted as containing a single empty + value (assuming a one-column CSV file). + invalid_row_handler : callable, optional (default None) + If not None, this object is called for each CSV row that fails + parsing (because of a mismatching number of columns). + It should accept a single InvalidRow argument and return either + "skip" or "error" depending on the desired outcome. 
+ + Examples + -------- + + Defining an example file from bytes object: + + >>> import io + >>> s = ( + ... "animals;n_legs;entry\\n" + ... "Flamingo;2;2022-03-01\\n" + ... "# Comment here:\\n" + ... "Horse;4;2022-03-02\\n" + ... "Brittle stars;5;2022-03-03\\n" + ... "Centipede;100;2022-03-04" + ... ) + >>> print(s) + animals;n_legs;entry + Flamingo;2;2022-03-01 + # Comment here: + Horse;4;2022-03-02 + Brittle stars;5;2022-03-03 + Centipede;100;2022-03-04 + >>> source = io.BytesIO(s.encode()) + + Read the data from a file skipping rows with comments + and defining the delimiter: + + >>> from pyarrow import csv + >>> def skip_comment(row): + ... if row.text.startswith("# "): + ... return "skip" + ... else: + ... return "error" + >>> parse_options = csv.ParseOptions(delimiter=";", invalid_row_handler=skip_comment) + >>> csv.read_csv(source, parse_options=parse_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] """ - @property - def size(self) -> int: ... -class BufferReader(lib.NativeFile): - """File interface for zero-copy read from CUDA buffers. + delimiter: str = field(default=",", kw_only=False) + quote_char: str | Literal[False] = '"' + double_quote: bool = True + escape_char: str | Literal[False] = False + newlines_in_values: bool = False + ignore_empty_lines: bool = True + invalid_row_handler: Callable[[InvalidRow], Literal["skip", "error"]] | None = None - Note: Read methods return pointers to device memory. This means - you must be careful using this interface with any Arrow code which - may expect to be able to do anything other than pointer arithmetic - on the returned buffers. + def validate(self) -> None: ... + +@dataclass(kw_only=True) +class ConvertOptions(lib._Weakrefable): """ - def __init__(self, obj: CudaBuffer) -> None: ... - def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: - """Return a slice view of the underlying device buffer. + Options for converting CSV data. - The slice will start at the current reader position and will - have specified size in bytes. + Parameters + ---------- + check_utf8 : bool, optional (default True) + Whether to check UTF8 validity of string columns. + column_types : pyarrow.Schema or dict, optional + Explicitly map column names to column types. Passing this argument + disables type inference on the defined columns. + null_values : list, optional + A sequence of strings that denote nulls in the data + (defaults are appropriate in most cases). Note that by default, + string columns are not checked for null values. To enable + null checking for those, specify ``strings_can_be_null=True``. + true_values : list, optional + A sequence of strings that denote true booleans in the data + (defaults are appropriate in most cases). + false_values : list, optional + A sequence of strings that denote false booleans in the data + (defaults are appropriate in most cases). + decimal_point : 1-character string, optional (default '.') + The character used as decimal point in floating-point and decimal + data. + strings_can_be_null : bool, optional (default False) + Whether string / binary columns can have null values. + If true, then strings in null_values are considered null for + string columns. + If false, then all strings are valid string values. + quoted_strings_can_be_null : bool, optional (default True) + Whether quoted values can be null. 
+ If true, then strings in "null_values" are also considered null + when they appear quoted in the CSV file. Otherwise, quoted values + are never considered null. + include_columns : list, optional + The names of columns to include in the Table. + If empty, the Table will include all columns from the CSV file. + If not empty, only these columns will be included, in this order. + include_missing_columns : bool, optional (default False) + If false, columns in `include_columns` but not in the CSV file will + error out. + If true, columns in `include_columns` but not in the CSV file will + produce a column of nulls (whose type is selected using + `column_types`, or null by default). + This option is ignored if `include_columns` is empty. + auto_dict_encode : bool, optional (default False) + Whether to try to automatically dict-encode string / binary data. + If true, then when type inference detects a string or binary column, + it it dict-encoded up to `auto_dict_max_cardinality` distinct values + (per chunk), after which it switches to regular encoding. + This setting is ignored for non-inferred columns (those in + `column_types`). + auto_dict_max_cardinality : int, optional + The maximum dictionary cardinality for `auto_dict_encode`. + This value is per chunk. + timestamp_parsers : list, optional + A sequence of strptime()-compatible format strings, tried in order + when attempting to infer or convert timestamp values (the special + value ISO8601() can also be given). By default, a fast built-in + ISO-8601 parser is used. + + Examples + -------- + + Defining an example data: + + >>> import io + >>> s = ( + ... "animals,n_legs,entry,fast\\n" + ... "Flamingo,2,01/03/2022,Yes\\n" + ... "Horse,4,02/03/2022,Yes\\n" + ... "Brittle stars,5,03/03/2022,No\\n" + ... "Centipede,100,04/03/2022,No\\n" + ... ",6,05/03/2022," + ... 
) + >>> print(s) + animals,n_legs,entry,fast + Flamingo,2,01/03/2022,Yes + Horse,4,02/03/2022,Yes + Brittle stars,5,03/03/2022,No + Centipede,100,04/03/2022,No + ,6,05/03/2022, + + Change the type of a column: + + >>> import pyarrow as pa + >>> from pyarrow import csv + >>> convert_options = csv.ConvertOptions(column_types={"n_legs": pa.float64()}) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: double + entry: string + fast: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + entry: [["01/03/2022","02/03/2022","03/03/2022","04/03/2022","05/03/2022"]] + fast: [["Yes","Yes","No","No",""]] + + Define a date parsing format to get a timestamp type column + (in case dates are not in ISO format and not converted by default): + + >>> convert_options = csv.ConvertOptions(timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"]) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: timestamp[s] + fast: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] + fast: [["Yes","Yes","No","No",""]] + + Specify a subset of columns to be read: + + >>> convert_options = csv.ConvertOptions(include_columns=["animals", "n_legs"]) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + + List additional column to be included as a null typed column: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals", "n_legs", "location"], include_missing_columns=True + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + location: null + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + location: [5 nulls] + + Define columns as dictionary type (by default only the + string/binary columns are dictionary encoded): + + >>> convert_options = csv.ConvertOptions( + ... timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"], auto_dict_encode=True + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: dictionary + n_legs: int64 + entry: timestamp[s] + fast: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Horse","Brittle stars","Centipede",""] -- indices: + [0,1,2,3,4]] + n_legs: [[2,4,5,100,6]] + entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] + fast: [ -- dictionary: + ["Yes","No",""] -- indices: + [0,0,1,1,2]] + + Set upper limit for the number of categories. If the categories + is more than the limit, the conversion to dictionary will not + happen: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals"], auto_dict_encode=True, auto_dict_max_cardinality=2 + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + + Set empty strings to missing values: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals", "n_legs"], strings_can_be_null=True + ... 
) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",null]] + n_legs: [[2,4,5,100,6]] + + Define values to be True and False when converting a column + into a bool type: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["fast"], false_values=["No"], true_values=["Yes"] + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + fast: bool + ---- + fast: [[true,true,false,false,null]] + """ - Parameters - ---------- - nbytes : int, default None - Specify the number of bytes to read. Default: None (read all - remaining bytes). + check_utf8: bool = field(default=True, kw_only=False) + column_types: lib.Schema | dict | None = None + null_values: list[str] | None = None + true_values: list[str] | None = None + false_values: list[str] | None = None + decimal_point: str = "." + strings_can_be_null: bool = False + quoted_strings_can_be_null: bool = True + include_columns: list[str] | None = None + include_missing_columns: bool = False + auto_dict_encode: bool = False + auto_dict_max_cardinality: int | None = None + timestamp_parsers: list[str] | None = None + + def validate(self) -> None: ... + +@dataclass(kw_only=True) +class WriteOptions(lib._Weakrefable): + """ + Options for writing CSV files. - Returns - ------- - cbuf : CudaBuffer - New device buffer. + Parameters + ---------- + include_header : bool, optional (default True) + Whether to write an initial header line with column names + batch_size : int, optional (default 1024) + How many rows to process together when converting and writing + CSV data + delimiter : 1-character string, optional (default ",") + The character delimiting individual cells in the CSV data. + quoting_style : str, optional (default "needed") + Whether to quote values, and if so, which quoting style to use. + The following values are accepted: + + - "needed" (default): only enclose values in quotes when needed. + - "all_valid": enclose all valid values in quotes; nulls are not quoted. + - "none": do not enclose any values in quotes; values containing + special characters (such as quotes, cell delimiters or line endings) + will raise an error. + """ - """ + include_header: bool = field(default=True, kw_only=False) + batch_size: int = 1024 + delimiter: str = "," + quoting_style: Literal["needed", "all_valid", "none"] = "needed" -class BufferWriter(lib.NativeFile): - """File interface for writing to CUDA buffers. + def validate(self) -> None: ... - By default writes are unbuffered. Use set_buffer_size to enable - buffering. +@dataclass +class InvalidRow(lib._Weakrefable): """ - def __init__(self, obj: CudaBuffer) -> None: ... - def writeat(self, position: int, data: ArrayLike) -> None: - """Write data to buffer starting from position. - - Parameters - ---------- - position : int - Specify device buffer position where the data will be - written. - data : array-like - Specify data, the data instance must implement buffer - protocol. - """ - @property - def buffer_size(self) -> int: - """Returns size of host (CPU) buffer, 0 for unbuffered""" - @buffer_size.setter - def buffer_size(self, buffer_size: int): - """Set CPU buffer size to limit calls to cudaMemcpy - - Parameters - ---------- - buffer_size : int - Specify the size of CPU buffer to allocate in bytes. 
- """ - @property - def num_bytes_buffered(self) -> int: - """Returns number of bytes buffered on host""" - -def new_host_buffer(size: int, device: int = 0) -> HostBuffer: - """Return buffer with CUDA-accessible memory on CPU host + Description of an invalid row in a CSV file. Parameters ---------- - size : int - Specify the number of bytes to be allocated. - device : int - Specify GPU device number. + expected_columns : int + The expected number of columns in the row. + actual_columns : int + The actual number of columns in the row. + number : int or None + The physical row number if known, otherwise None. + text : str + The contents of the row. + """ - Returns - ------- - dbuf : HostBuffer - Allocated host buffer + expected_columns: int + actual_columns: int + number: int | None + text: str + +class CSVWriter(lib._CRecordBatchWriter): + """ + Writer to create a CSV file. + + Parameters + ---------- + sink : str, path, pyarrow.OutputStream or file-like object + The location where to write the CSV data. + schema : pyarrow.Schema + The schema of the data to be written. + write_options : pyarrow.csv.WriteOptions + Options to configure writing the CSV data. + memory_pool : MemoryPool, optional + Pool for temporary allocations. """ -def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: - """Write record batch message to GPU device memory + def __init__( + self, + # TODO: OutputStream + sink: StrPath | IO[Any], + schema: lib.Schema, + write_options: WriteOptions | None = None, + *, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + +class CSVStreamingReader(lib.RecordBatchReader): ... + +ISO8601: lib._Weakrefable + +def open_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> CSVStreamingReader: + """ + Open a streaming reader of CSV data. + + Reading using this function is always single-threaded. Parameters ---------- - batch : RecordBatch - Record batch to write - ctx : Context - CUDA Context to allocate device memory from + input_file : string, path or file-like object + The location of CSV data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. + read_options : pyarrow.csv.ReadOptions, optional + Options for the CSV reader (see pyarrow.csv.ReadOptions constructor + for defaults) + parse_options : pyarrow.csv.ParseOptions, optional + Options for the CSV parser + (see pyarrow.csv.ParseOptions constructor for defaults) + convert_options : pyarrow.csv.ConvertOptions, optional + Options for converting CSV data + (see pyarrow.csv.ConvertOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate RecordBatch memory from Returns ------- - dbuf : CudaBuffer - device buffer which contains the record batch message + :class:`pyarrow.csv.CSVStreamingReader` """ -def read_message( - source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryManager | None = None -) -> lib.Message: - """Read Arrow IPC message located on GPU device +def read_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Table: + """ + Read a Table from a stream of CSV data. 
Parameters
+    ----------
+    input_file : string, path or file-like object
+        The location of CSV data. If a string or path, and if it ends
+        with a recognized compressed file extension (e.g. ".gz" or ".bz2"),
+        the data is automatically decompressed when reading.
+    read_options : pyarrow.csv.ReadOptions, optional
+        Options for the CSV reader (see pyarrow.csv.ReadOptions constructor
+        for defaults)
+    parse_options : pyarrow.csv.ParseOptions, optional
+        Options for the CSV parser
+        (see pyarrow.csv.ParseOptions constructor for defaults)
+    convert_options : pyarrow.csv.ConvertOptions, optional
+        Options for converting CSV data
+        (see pyarrow.csv.ConvertOptions constructor for defaults)
+    memory_pool : MemoryPool, optional
+        Pool to allocate Table memory from
+
+    Returns
+    -------
+    :class:`pyarrow.Table`
+        Contents of the CSV file as an in-memory table.
+
+    Examples
+    --------
+
+    Defining an example file from bytes object:
+
+    >>> import io
+    >>> s = (
+    ...     "animals,n_legs,entry\\n"
+    ...     "Flamingo,2,2022-03-01\\n"
+    ...     "Horse,4,2022-03-02\\n"
+    ...     "Brittle stars,5,2022-03-03\\n"
+    ...     "Centipede,100,2022-03-04"
+    ... )
+    >>> print(s)
+    animals,n_legs,entry
+    Flamingo,2,2022-03-01
+    Horse,4,2022-03-02
+    Brittle stars,5,2022-03-03
+    Centipede,100,2022-03-04
+    >>> source = io.BytesIO(s.encode())
+
+    Reading from the file:
+
+    >>> from pyarrow import csv
+    >>> csv.read_csv(source)
+    pyarrow.Table
+    animals: string
+    n_legs: int64
+    entry: date32[day]
+    ----
+    animals: [["Flamingo","Horse","Brittle stars","Centipede"]]
+    n_legs: [[2,4,5,100]]
+    entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]]
+    """
+
+def write_csv(
+    data: lib.RecordBatch | lib.Table,
+    output_file: StrPath | lib.NativeFile | IO[Any],
+    write_options: WriteOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> None:
+    """
+    Write record batch or table to a CSV file.
+
+    Parameters
+    ----------
+    data : pyarrow.RecordBatch or pyarrow.Table
+        The data to write.
+    output_file : string, path, pyarrow.NativeFile, or file-like object
+        The location where to write the CSV data.
+    write_options : pyarrow.csv.WriteOptions
+        Options to configure writing the CSV data.
+    memory_pool : MemoryPool, optional
+        Pool for temporary allocations.
- Returns - ------- - batch : RecordBatch - Reconstructed record batch, with device pointers + Examples + -------- + + >>> import pyarrow as pa + >>> from pyarrow import csv + + >>> legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> entry_date = pa.array(["01/03/2022", "02/03/2022", "03/03/2022", "04/03/2022"]) + >>> table = pa.table([animals, legs, entry_date], names=["animals", "n_legs", "entry"]) + + >>> csv.write_csv(table, "animals.csv") + + >>> write_options = csv.WriteOptions(include_header=False) + >>> csv.write_csv(table, "animals.csv", write_options=write_options) + >>> write_options = csv.WriteOptions(delimiter=";") + >>> csv.write_csv(table, "animals.csv", write_options=write_options) """ diff --git a/python/pyarrow-stubs/_cuda.pyi b/python/pyarrow-stubs/_cuda.pyi index 94f1b33e2e0..da769f1713f 100644 --- a/python/pyarrow-stubs/_cuda.pyi +++ b/python/pyarrow-stubs/_cuda.pyi @@ -2,13 +2,12 @@ from typing import Any import cuda # type: ignore[import-not-found] -from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-not-found] +from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-untyped] -# from . import lib -from .lib import _Weakrefable, Buffer, MemoryPool, NativeFile, RecordBatch, Schema, DictionaryMemo, Message, MemoryManager, Device +from . import lib from ._stubs_typing import ArrayLike -class Context(_Weakrefable): +class Context(lib._Weakrefable): """ CUDA driver context. """ @@ -107,7 +106,7 @@ class Context(_Weakrefable): Allocated buffer. """ @property - def memory_manager(self) -> MemoryManager: + def memory_manager(self) -> lib.MemoryManager: """ The default memory manager tied to this context's device. @@ -116,7 +115,7 @@ class Context(_Weakrefable): MemoryManager """ @property - def device(self) -> Device: + def device(self) -> lib.Device: """ The device instance associated with this context. @@ -165,7 +164,7 @@ class Context(_Weakrefable): """ def buffer_from_data( self, - data: CudaBuffer | HostBuffer | Buffer | ArrayLike, + data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, offset: int = 0, size: int = -1, ) -> CudaBuffer: @@ -212,10 +211,10 @@ class Context(_Weakrefable): """ -class IpcMemHandle(_Weakrefable): +class IpcMemHandle(lib._Weakrefable): """A serializable container for a CUDA IPC handle.""" @staticmethod - def from_buffer(opaque_handle: Buffer) -> IpcMemHandle: + def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: """Create IpcMemHandle from opaque buffer (e.g. from another process) @@ -228,7 +227,7 @@ class IpcMemHandle(_Weakrefable): ------- ipc_handle : IpcMemHandle """ - def serialize(self, pool: MemoryPool | None = None) -> Buffer: + def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: """Write IpcMemHandle to a Buffer Parameters @@ -242,7 +241,7 @@ class IpcMemHandle(_Weakrefable): The serialized buffer. """ -class CudaBuffer(Buffer): +class CudaBuffer(lib.Buffer): """An Arrow buffer with data located in a GPU device. To create a CudaBuffer instance, use Context.device_buffer(). 
@@ -252,7 +251,7 @@ class CudaBuffer(Buffer): """ @staticmethod - def from_buffer(buf: Buffer) -> CudaBuffer: + def from_buffer(buf: lib.Buffer) -> CudaBuffer: """Convert back generic buffer into CudaBuffer Parameters @@ -284,10 +283,10 @@ class CudaBuffer(Buffer): self, position: int = 0, nbytes: int = -1, - buf: Buffer | None = None, - memory_pool: MemoryPool | None = None, + buf: lib.Buffer | None = None, + memory_pool: lib.MemoryPool | None = None, resizable: bool = False, - ) -> Buffer: + ) -> lib.Buffer: """Copy memory from GPU device to CPU host Caller is responsible for ensuring that all tasks affecting @@ -320,7 +319,7 @@ class CudaBuffer(Buffer): """ def copy_from_host( - self, data: Buffer | ArrayLike, position: int = 0, nbytes: int = -1 + self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 ) -> int: """Copy data from host to device. @@ -402,7 +401,7 @@ class CudaBuffer(Buffer): def to_pybytes(self) -> bytes: """Return device buffer content as Python bytes.""" -class HostBuffer(Buffer): +class HostBuffer(lib.Buffer): """Device-accessible CPU memory created using cudaHostAlloc. To create a HostBuffer instance, use @@ -412,7 +411,7 @@ class HostBuffer(Buffer): @property def size(self) -> int: ... -class BufferReader(NativeFile): +class BufferReader(lib.NativeFile): """File interface for zero-copy read from CUDA buffers. Note: Read methods return pointers to device memory. This means @@ -440,7 +439,7 @@ class BufferReader(NativeFile): """ -class BufferWriter(NativeFile): +class BufferWriter(lib.NativeFile): """File interface for writing to CUDA buffers. By default writes are unbuffered. Use set_buffer_size to enable @@ -491,7 +490,7 @@ def new_host_buffer(size: int, device: int = 0) -> HostBuffer: Allocated host buffer """ -def serialize_record_batch(batch: RecordBatch, ctx: Context) -> CudaBuffer: +def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: """Write record batch message to GPU device memory Parameters @@ -508,8 +507,8 @@ def serialize_record_batch(batch: RecordBatch, ctx: Context) -> CudaBuffer: """ def read_message( - source: CudaBuffer | cuda.BufferReader, pool: MemoryManager | None = None -) -> Message: + source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryManager | None = None +) -> lib.Message: """Read Arrow IPC message located on GPU device Parameters @@ -526,12 +525,12 @@ def read_message( """ def read_record_batch( - buffer: Buffer, - object: Schema, + buffer: lib.Buffer, + object: lib.Schema, *, - dictionary_memo: DictionaryMemo | None = None, - pool: MemoryPool | None = None, -) -> RecordBatch: + dictionary_memo: lib.DictionaryMemo | None = None, + pool: lib.MemoryPool | None = None, +) -> lib.RecordBatch: """Construct RecordBatch referencing IPC message located on CUDA device. While the metadata is copied to host memory for deserialization, diff --git a/python/pyarrow-stubs/_fs.pyi b/python/pyarrow-stubs/_fs.pyi index 9b0f0ceaa20..df6f30ab509 100644 --- a/python/pyarrow-stubs/_fs.pyi +++ b/python/pyarrow-stubs/_fs.pyi @@ -19,8 +19,6 @@ from fsspec import AbstractFileSystem # type: ignore[import-untyped] from .lib import NativeFile, _Weakrefable -SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] - class FileType(enum.IntFlag): NotFound = enum.auto() Unknown = enum.auto() @@ -999,3 +997,5 @@ class FileSystemHandler(ABC): path : str path of what should be normalized. 
""" + +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] diff --git a/python/pyarrow-stubs/compute.pyi b/python/pyarrow-stubs/compute.pyi index 5c816773c62..8ee0a929ffd 100644 --- a/python/pyarrow-stubs/compute.pyi +++ b/python/pyarrow-stubs/compute.pyi @@ -216,9 +216,9 @@ NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar _NumericOrTemporalScalarT = TypeVar("_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) +_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] _NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) -_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) _NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] _NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) diff --git a/python/pyarrow-stubs/lib.pyi b/python/pyarrow-stubs/lib.pyi index 57e23c3eaea..565feb4b3db 100644 --- a/python/pyarrow-stubs/lib.pyi +++ b/python/pyarrow-stubs/lib.pyi @@ -16,13 +16,10 @@ # under the License. # ruff: noqa: F403 -from collections.abc import Mapping -import datetime as dt -from typing import NamedTuple, Literal -from typing_extensions import TypeVar +from typing import NamedTuple from .array import * -from ._benchmark import * +# from .benchmark import * from .builder import * from .compat import * from .config import * @@ -83,51 +80,6 @@ def is_threading_enabled() -> bool: threading doesn't work (e.g. Emscripten). """ -def ensure_metadata( - meta: Mapping[bytes | str, bytes | str] | KeyValueMetadata | None, allow_none: bool = False -) -> KeyValueMetadata | None: ... - -def tzinfo_to_string(tz: dt.tzinfo) -> str: - """ - Converts a time zone object into a string indicating the name of a time - zone, one of: - * As used in the Olson time zone database (the "tz database" or - "tzdata"), such as "America/New_York" - * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 - - Parameters - ---------- - tz : datetime.tzinfo - Time zone object - - Returns - ------- - name : str - Time zone name - """ - -def string_to_tzinfo(name: str) -> dt.tzinfo: - """ - Convert a time zone name into a time zone object. - - Supported input strings are: - * As used in the Olson time zone database (the "tz database" or - "tzdata"), such as "America/New_York" - * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 - - Parameters - ---------- - name: str - Time zone name. - - Returns - ------- - tz : datetime.tzinfo - Time zone object - """ - -def ensure_type(ty: _DataTypeT | None, allow_none: Literal[True] | Literal[False] | None = None) -> _DataTypeT | None: ... 
- Type_NA: int Type_BOOL: int Type_UINT8: int diff --git a/python/pyarrow-stubs/pandas_shim.pyi b/python/pyarrow-stubs/pandas_shim.pyi index 0e80fae4ebf..2e5f1502fb6 100644 --- a/python/pyarrow-stubs/pandas_shim.pyi +++ b/python/pyarrow-stubs/pandas_shim.pyi @@ -1,7 +1,7 @@ from types import ModuleType from typing import Any, Iterable, TypeGuard -import pandas as pd +from pandas import Categorical, DatetimeTZDtype, Index, Series, DataFrame from numpy import dtype from pandas.core.dtypes.base import ExtensionDtype @@ -9,8 +9,8 @@ from pandas.core.dtypes.base import ExtensionDtype class _PandasAPIShim: has_sparse: bool - def series(self, *args, **kwargs) -> pd.Series: ... - def data_frame(self, *args, **kwargs) -> pd.DataFrame: ... + def series(self, *args, **kwargs) -> Series: ... + def data_frame(self, *args, **kwargs) -> DataFrame: ... @property def have_pandas(self) -> bool: ... @property @@ -28,21 +28,21 @@ class _PandasAPIShim: def is_ge_v23(self) -> bool: ... def is_ge_v3(self) -> bool: ... @property - def categorical_type(self) -> type[pd.Categorical]: ... + def categorical_type(self) -> type[Categorical]: ... @property - def datetimetz_type(self) -> type[pd.DatetimeTZDtype]: ... + def datetimetz_type(self) -> type[DatetimeTZDtype]: ... @property def extension_dtype(self) -> type[ExtensionDtype]: ... def is_array_like( self, obj: Any - ) -> TypeGuard[pd.Series | pd.Index | pd.Categorical | ExtensionDtype]: ... - def is_categorical(self, obj: Any) -> TypeGuard[pd.Categorical]: ... - def is_datetimetz(self, obj: Any) -> TypeGuard[pd.DatetimeTZDtype]: ... + ) -> TypeGuard[Series | Index | Categorical | ExtensionDtype]: ... + def is_categorical(self, obj: Any) -> TypeGuard[Categorical]: ... + def is_datetimetz(self, obj: Any) -> TypeGuard[DatetimeTZDtype]: ... def is_extension_array_dtype(self, obj: Any) -> TypeGuard[ExtensionDtype]: ... def is_sparse(self, obj: Any) -> bool: ... - def is_data_frame(self, obj: Any) -> TypeGuard[pd.DataFrame]: ... - def is_series(self, obj: Any) -> TypeGuard[pd.Series]: ... - def is_index(self, obj: Any) -> TypeGuard[pd.Index]: ... + def is_data_frame(self, obj: Any) -> TypeGuard[DataFrame]: ... + def is_series(self, obj: Any) -> TypeGuard[Series]: ... + def is_index(self, obj: Any) -> TypeGuard[Index]: ... def get_values(self, obj: Any) -> bool: ... def get_rangeindex_attribute(self, level, name): ... 
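Note on the `_cuda.pyi` hunks above: they only retarget the stub annotations at `lib.Buffer`, `lib.RecordBatch`, `lib.MemoryPool` and friends; the runtime API is unchanged. As a quick sanity check of the annotated signatures, a minimal round-trip sketch (an illustration only, assuming a CUDA-capable device and a pyarrow build with CUDA support) could look like:

    import pyarrow as pa
    from pyarrow import cuda

    ctx = cuda.Context(0)                                  # first GPU device
    batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3]})

    # serialize_record_batch(batch, ctx) -> CudaBuffer, per the stub above
    dev_buf = cuda.serialize_record_batch(batch, ctx)

    # copy_to_host() -> lib.Buffer: device memory copied back to the CPU
    host_buf = dev_buf.copy_to_host()

    # read_record_batch(buffer, schema) -> lib.RecordBatch; per the docstring,
    # the batch data stays on the device, only metadata is copied to host
    restored = cuda.read_record_batch(dev_buf, batch.schema)
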
From 95769edf85945ce86ac13500a21900804c2c243c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 16 Sep 2025 18:06:02 +0200 Subject: [PATCH 17/26] add license --- python/pyarrow-stubs/_compute.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_csv.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_cuda.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_dataset.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_dataset_orc.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_dataset_parquet.pyi | 17 +++++++++++++++++ .../_dataset_parquet_encryption.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_feather.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_flight.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_fs.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_gcsfs.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_hdfs.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_ipc.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_json.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_orc.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_parquet.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_parquet_encryption.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_s3fs.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/_substrait.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/acero.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/builder.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/cffi.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/compute.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/config.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/csv.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/cuda.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/dataset.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/device.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/error.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/feather.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/flight.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/fs.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/gandiva.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/interchange/__init__.pyi | 16 ++++++++++++++++ python/pyarrow-stubs/interchange/buffer.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/interchange/column.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/interchange/dataframe.pyi | 17 +++++++++++++++++ .../interchange/from_dataframe.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/ipc.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/json.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/orc.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/pandas_compat.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/pandas_shim.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/parquet/__init__.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/parquet/core.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/parquet/encryption.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/substrait.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/table.pyi | 17 +++++++++++++++++ python/pyarrow-stubs/util.pyi | 17 +++++++++++++++++ 49 files changed, 832 insertions(+) diff --git a/python/pyarrow-stubs/_compute.pyi b/python/pyarrow-stubs/_compute.pyi index 3d61ae42787..e8360b48edc 100644 --- a/python/pyarrow-stubs/_compute.pyi +++ b/python/pyarrow-stubs/_compute.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import ( Any, Callable, diff --git a/python/pyarrow-stubs/_csv.pyi b/python/pyarrow-stubs/_csv.pyi index 2f49f8c9a6c..c490d6be93a 100644 --- a/python/pyarrow-stubs/_csv.pyi +++ b/python/pyarrow-stubs/_csv.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from dataclasses import dataclass, field from typing import IO, Any, Callable, Literal diff --git a/python/pyarrow-stubs/_cuda.pyi b/python/pyarrow-stubs/_cuda.pyi index da769f1713f..c96951b863c 100644 --- a/python/pyarrow-stubs/_cuda.pyi +++ b/python/pyarrow-stubs/_cuda.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import Any import cuda # type: ignore[import-not-found] diff --git a/python/pyarrow-stubs/_dataset.pyi b/python/pyarrow-stubs/_dataset.pyi index e0f38d54eff..3665bdba00b 100644 --- a/python/pyarrow-stubs/_dataset.pyi +++ b/python/pyarrow-stubs/_dataset.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import sys if sys.version_info >= (3, 11): diff --git a/python/pyarrow-stubs/_dataset_orc.pyi b/python/pyarrow-stubs/_dataset_orc.pyi index 9c4ac04198f..d4e5784750f 100644 --- a/python/pyarrow-stubs/_dataset_orc.pyi +++ b/python/pyarrow-stubs/_dataset_orc.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from ._dataset import FileFormat class OrcFileFormat(FileFormat): diff --git a/python/pyarrow-stubs/_dataset_parquet.pyi b/python/pyarrow-stubs/_dataset_parquet.pyi index cbcc17235f1..007d3404a18 100644 --- a/python/pyarrow-stubs/_dataset_parquet.pyi +++ b/python/pyarrow-stubs/_dataset_parquet.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from dataclasses import dataclass from typing import IO, Any, Iterable, TypedDict diff --git a/python/pyarrow-stubs/_dataset_parquet_encryption.pyi b/python/pyarrow-stubs/_dataset_parquet_encryption.pyi index 7623275b865..be40c0b39b3 100644 --- a/python/pyarrow-stubs/_dataset_parquet_encryption.pyi +++ b/python/pyarrow-stubs/_dataset_parquet_encryption.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions from ._parquet import FileDecryptionProperties from ._parquet_encryption import CryptoFactory, EncryptionConfiguration, KmsConnectionConfig diff --git a/python/pyarrow-stubs/_feather.pyi b/python/pyarrow-stubs/_feather.pyi index 8bb914ba45d..373fe38cdce 100644 --- a/python/pyarrow-stubs/_feather.pyi +++ b/python/pyarrow-stubs/_feather.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import IO from _typeshed import StrPath diff --git a/python/pyarrow-stubs/_flight.pyi b/python/pyarrow-stubs/_flight.pyi index 4450c42df49..a79475a8796 100644 --- a/python/pyarrow-stubs/_flight.pyi +++ b/python/pyarrow-stubs/_flight.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import asyncio import enum import sys diff --git a/python/pyarrow-stubs/_fs.pyi b/python/pyarrow-stubs/_fs.pyi index df6f30ab509..1f3667ef413 100644 --- a/python/pyarrow-stubs/_fs.pyi +++ b/python/pyarrow-stubs/_fs.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import datetime as dt import enum import sys diff --git a/python/pyarrow-stubs/_gcsfs.pyi b/python/pyarrow-stubs/_gcsfs.pyi index 4fc7ea68e48..0ced106615a 100644 --- a/python/pyarrow-stubs/_gcsfs.pyi +++ b/python/pyarrow-stubs/_gcsfs.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import datetime as dt from ._fs import FileSystem diff --git a/python/pyarrow-stubs/_hdfs.pyi b/python/pyarrow-stubs/_hdfs.pyi index 200f669379b..ed367379171 100644 --- a/python/pyarrow-stubs/_hdfs.pyi +++ b/python/pyarrow-stubs/_hdfs.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from _typeshed import StrPath from ._fs import FileSystem diff --git a/python/pyarrow-stubs/_ipc.pyi b/python/pyarrow-stubs/_ipc.pyi index fc48cae3c04..1676e49e962 100644 --- a/python/pyarrow-stubs/_ipc.pyi +++ b/python/pyarrow-stubs/_ipc.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + import enum import sys diff --git a/python/pyarrow-stubs/_json.pyi b/python/pyarrow-stubs/_json.pyi index 43d2ae83cd8..f416b4b29c6 100644 --- a/python/pyarrow-stubs/_json.pyi +++ b/python/pyarrow-stubs/_json.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import IO, Any, Literal from _typeshed import StrPath diff --git a/python/pyarrow-stubs/_orc.pyi b/python/pyarrow-stubs/_orc.pyi index 71bf0dde9ba..7587cc121c3 100644 --- a/python/pyarrow-stubs/_orc.pyi +++ b/python/pyarrow-stubs/_orc.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import IO, Literal from .lib import ( diff --git a/python/pyarrow-stubs/_parquet.pyi b/python/pyarrow-stubs/_parquet.pyi index a9187df0428..c75337cbf3b 100644 --- a/python/pyarrow-stubs/_parquet.pyi +++ b/python/pyarrow-stubs/_parquet.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict from _typeshed import StrPath diff --git a/python/pyarrow-stubs/_parquet_encryption.pyi b/python/pyarrow-stubs/_parquet_encryption.pyi index c707edb844a..e1228cbdb5a 100644 --- a/python/pyarrow-stubs/_parquet_encryption.pyi +++ b/python/pyarrow-stubs/_parquet_encryption.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import datetime as dt from typing import Callable diff --git a/python/pyarrow-stubs/_s3fs.pyi b/python/pyarrow-stubs/_s3fs.pyi index 50f63cd7e32..f1399bc4b1e 100644 --- a/python/pyarrow-stubs/_s3fs.pyi +++ b/python/pyarrow-stubs/_s3fs.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import enum from typing import Literal, TypedDict diff --git a/python/pyarrow-stubs/_substrait.pyi b/python/pyarrow-stubs/_substrait.pyi index ff226e9521b..ee78e9720fe 100644 --- a/python/pyarrow-stubs/_substrait.pyi +++ b/python/pyarrow-stubs/_substrait.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from typing import Any, Callable from ._compute import Expression diff --git a/python/pyarrow-stubs/acero.pyi b/python/pyarrow-stubs/acero.pyi index 8a520bdc24a..2abb608b32c 100644 --- a/python/pyarrow-stubs/acero.pyi +++ b/python/pyarrow-stubs/acero.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import sys if sys.version_info >= (3, 11): diff --git a/python/pyarrow-stubs/builder.pyi b/python/pyarrow-stubs/builder.pyi index 4a0e9ca4708..39372f8e512 100644 --- a/python/pyarrow-stubs/builder.pyi +++ b/python/pyarrow-stubs/builder.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import Iterable from pyarrow.lib import MemoryPool, _Weakrefable diff --git a/python/pyarrow-stubs/cffi.pyi b/python/pyarrow-stubs/cffi.pyi index 2ae945c5974..e4f077d7155 100644 --- a/python/pyarrow-stubs/cffi.pyi +++ b/python/pyarrow-stubs/cffi.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import cffi c_source: str diff --git a/python/pyarrow-stubs/compute.pyi b/python/pyarrow-stubs/compute.pyi index 8ee0a929ffd..dcedb34b14a 100644 --- a/python/pyarrow-stubs/compute.pyi +++ b/python/pyarrow-stubs/compute.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + # ruff: noqa: I001 from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence from collections.abc import Callable diff --git a/python/pyarrow-stubs/config.pyi b/python/pyarrow-stubs/config.pyi index 166e10c9734..7c2eb8a9c98 100644 --- a/python/pyarrow-stubs/config.pyi +++ b/python/pyarrow-stubs/config.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import NamedTuple class VersionInfo(NamedTuple): diff --git a/python/pyarrow-stubs/csv.pyi b/python/pyarrow-stubs/csv.pyi index 510229d7e72..a7abd413aab 100644 --- a/python/pyarrow-stubs/csv.pyi +++ b/python/pyarrow-stubs/csv.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from pyarrow._csv import ( ISO8601, ConvertOptions, diff --git a/python/pyarrow-stubs/cuda.pyi b/python/pyarrow-stubs/cuda.pyi index e11baf7d4e7..0394965bb73 100644 --- a/python/pyarrow-stubs/cuda.pyi +++ b/python/pyarrow-stubs/cuda.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from pyarrow._cuda import ( BufferReader, BufferWriter, diff --git a/python/pyarrow-stubs/dataset.pyi b/python/pyarrow-stubs/dataset.pyi index 98f1a38aa85..6cb7fed43e6 100644 --- a/python/pyarrow-stubs/dataset.pyi +++ b/python/pyarrow-stubs/dataset.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload from _typeshed import StrPath diff --git a/python/pyarrow-stubs/device.pyi b/python/pyarrow-stubs/device.pyi index d1b9f39eedd..6c4f1fdeeea 100644 --- a/python/pyarrow-stubs/device.pyi +++ b/python/pyarrow-stubs/device.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import enum from pyarrow.lib import _Weakrefable diff --git a/python/pyarrow-stubs/error.pyi b/python/pyarrow-stubs/error.pyi index 981ed51e680..c1e1a04ee40 100644 --- a/python/pyarrow-stubs/error.pyi +++ b/python/pyarrow-stubs/error.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + import sys if sys.version_info >= (3, 11): diff --git a/python/pyarrow-stubs/feather.pyi b/python/pyarrow-stubs/feather.pyi index 9451ee15763..ce8d83dbcd9 100644 --- a/python/pyarrow-stubs/feather.pyi +++ b/python/pyarrow-stubs/feather.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import IO, Literal import pandas as pd diff --git a/python/pyarrow-stubs/flight.pyi b/python/pyarrow-stubs/flight.pyi index 9b806ccf305..dcc6ee2244b 100644 --- a/python/pyarrow-stubs/flight.pyi +++ b/python/pyarrow-stubs/flight.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from pyarrow._flight import ( Action, ActionType, diff --git a/python/pyarrow-stubs/fs.pyi b/python/pyarrow-stubs/fs.pyi index 6bf75616c13..6c5a0af8d19 100644 --- a/python/pyarrow-stubs/fs.pyi +++ b/python/pyarrow-stubs/fs.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from pyarrow._fs import ( # noqa FileSelector, FileType, diff --git a/python/pyarrow-stubs/gandiva.pyi b/python/pyarrow-stubs/gandiva.pyi index a344f885b29..bc07e15c4a6 100644 --- a/python/pyarrow-stubs/gandiva.pyi +++ b/python/pyarrow-stubs/gandiva.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import Iterable, Literal from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable diff --git a/python/pyarrow-stubs/interchange/__init__.pyi b/python/pyarrow-stubs/interchange/__init__.pyi index e69de29bb2d..13a83393a91 100644 --- a/python/pyarrow-stubs/interchange/__init__.pyi +++ b/python/pyarrow-stubs/interchange/__init__.pyi @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/pyarrow-stubs/interchange/buffer.pyi b/python/pyarrow-stubs/interchange/buffer.pyi index 46673961a75..78d1dabb8b7 100644 --- a/python/pyarrow-stubs/interchange/buffer.pyi +++ b/python/pyarrow-stubs/interchange/buffer.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ import enum from pyarrow.lib import Buffer diff --git a/python/pyarrow-stubs/interchange/column.pyi b/python/pyarrow-stubs/interchange/column.pyi index e6662867b6b..ce7e169bfb5 100644 --- a/python/pyarrow-stubs/interchange/column.pyi +++ b/python/pyarrow-stubs/interchange/column.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import enum from typing import Any, Iterable, TypeAlias, TypedDict diff --git a/python/pyarrow-stubs/interchange/dataframe.pyi b/python/pyarrow-stubs/interchange/dataframe.pyi index 526a58926a9..a7ea6aeac74 100644 --- a/python/pyarrow-stubs/interchange/dataframe.pyi +++ b/python/pyarrow-stubs/interchange/dataframe.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import sys if sys.version_info >= (3, 11): diff --git a/python/pyarrow-stubs/interchange/from_dataframe.pyi b/python/pyarrow-stubs/interchange/from_dataframe.pyi index b04b6268975..aa6217b6181 100644 --- a/python/pyarrow-stubs/interchange/from_dataframe.pyi +++ b/python/pyarrow-stubs/interchange/from_dataframe.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from typing import Any, Protocol, TypeAlias from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table diff --git a/python/pyarrow-stubs/ipc.pyi b/python/pyarrow-stubs/ipc.pyi index c7f2af004d4..985cf0678f9 100644 --- a/python/pyarrow-stubs/ipc.pyi +++ b/python/pyarrow-stubs/ipc.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from io import IOBase import pandas as pd diff --git a/python/pyarrow-stubs/json.pyi b/python/pyarrow-stubs/json.pyi index db1d35e0b8b..67768db42e4 100644 --- a/python/pyarrow-stubs/json.pyi +++ b/python/pyarrow-stubs/json.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json __all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/pyarrow-stubs/orc.pyi b/python/pyarrow-stubs/orc.pyi index 2eba8d40a11..557f38a2b9e 100644 --- a/python/pyarrow-stubs/orc.pyi +++ b/python/pyarrow-stubs/orc.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ import sys if sys.version_info >= (3, 11): diff --git a/python/pyarrow-stubs/pandas_compat.pyi b/python/pyarrow-stubs/pandas_compat.pyi index efbd05ac2fe..82fcb19ad97 100644 --- a/python/pyarrow-stubs/pandas_compat.pyi +++ b/python/pyarrow-stubs/pandas_compat.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import Any, TypedDict, TypeVar import numpy as np diff --git a/python/pyarrow-stubs/pandas_shim.pyi b/python/pyarrow-stubs/pandas_shim.pyi index 2e5f1502fb6..e62767b1591 100644 --- a/python/pyarrow-stubs/pandas_shim.pyi +++ b/python/pyarrow-stubs/pandas_shim.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from types import ModuleType from typing import Any, Iterable, TypeGuard diff --git a/python/pyarrow-stubs/parquet/__init__.pyi b/python/pyarrow-stubs/parquet/__init__.pyi index 151ee188f84..8d0b5374ea0 100644 --- a/python/pyarrow-stubs/parquet/__init__.pyi +++ b/python/pyarrow-stubs/parquet/__init__.pyi @@ -1 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from .core import * # noqa diff --git a/python/pyarrow-stubs/parquet/core.pyi b/python/pyarrow-stubs/parquet/core.pyi index 56b2c8447d9..f5ac0510ffc 100644 --- a/python/pyarrow-stubs/parquet/core.pyi +++ b/python/pyarrow-stubs/parquet/core.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import sys from pathlib import Path diff --git a/python/pyarrow-stubs/parquet/encryption.pyi b/python/pyarrow-stubs/parquet/encryption.pyi index 5a77dae7ef7..fe9a454e593 100644 --- a/python/pyarrow-stubs/parquet/encryption.pyi +++ b/python/pyarrow-stubs/parquet/encryption.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from pyarrow._parquet_encryption import ( CryptoFactory, DecryptionConfiguration, diff --git a/python/pyarrow-stubs/substrait.pyi b/python/pyarrow-stubs/substrait.pyi index a56a8a5b40f..b78bbd8aebd 100644 --- a/python/pyarrow-stubs/substrait.pyi +++ b/python/pyarrow-stubs/substrait.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from pyarrow._substrait import ( BoundExpressions, SubstraitSchema, diff --git a/python/pyarrow-stubs/table.pyi b/python/pyarrow-stubs/table.pyi index 685ae725d4b..a9b861e2b78 100644 --- a/python/pyarrow-stubs/table.pyi +++ b/python/pyarrow-stubs/table.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import datetime as dt import sys diff --git a/python/pyarrow-stubs/util.pyi b/python/pyarrow-stubs/util.pyi index c2ecf7d6b61..5c9687bb83f 100644 --- a/python/pyarrow-stubs/util.pyi +++ b/python/pyarrow-stubs/util.pyi @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from collections.abc import Callable from os import PathLike from typing import Any, Protocol, Sequence, TypeVar From 9924db05e61a87c6a4ab43f9e3bd5012f7bbdfb4 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 16 Sep 2025 18:16:53 +0200 Subject: [PATCH 18/26] remove docs --- python/pyarrow-stubs/_azurefs.pyi | 59 - python/pyarrow-stubs/interchange/buffer.pyi | 44 +- python/pyarrow-stubs/interchange/column.pyi | 212 +- .../pyarrow-stubs/interchange/dataframe.pyi | 96 +- .../interchange/from_dataframe.pyi | 206 +- python/pyarrow-stubs/parquet/core.pyi | 1806 +---------------- 6 files changed, 66 insertions(+), 2357 deletions(-) diff --git a/python/pyarrow-stubs/_azurefs.pyi b/python/pyarrow-stubs/_azurefs.pyi index b9a83f01c56..37fcec2c9bd 100644 --- a/python/pyarrow-stubs/_azurefs.pyi +++ b/python/pyarrow-stubs/_azurefs.pyi @@ -20,65 +20,6 @@ from typing import Literal from ._fs import FileSystem class AzureFileSystem(FileSystem): - """ - Azure Blob Storage backed FileSystem implementation - - This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a. - Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific - features will be used when they provide a performance advantage. Azurite emulator is - also supported. Note: `/` is the only supported delimiter. - - The storage account is considered the root of the filesystem. 
When enabled, containers - will be created or deleted during relevant directory operations. Obviously, this also - requires authentication with the additional permissions. - - By default `DefaultAzureCredential `__ - is used for authentication. This means it will try several types of authentication - and go with the first one that works. If any authentication parameters are provided when - initialising the FileSystem, they will be used instead of the default credential. - - Parameters - ---------- - account_name : str - Azure Blob Storage account name. This is the globally unique identifier for the - storage account. - account_key : str, default None - Account key of the storage account. If sas_token and account_key are None the - default credential will be used. The parameters account_key and sas_token are - mutually exclusive. - blob_storage_authority : str, default None - hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful - for connecting to a local emulator, like Azurite. - dfs_storage_authority : str, default None - hostname[:port] of the Data Lake Gen 2 Service. Defaults to - `.dfs.core.windows.net`. Useful for connecting to a local emulator, like Azurite. - blob_storage_scheme : str, default None - Either `http` or `https`. Defaults to `https`. Useful for connecting to a local - emulator, like Azurite. - dfs_storage_scheme : str, default None - Either `http` or `https`. Defaults to `https`. Useful for connecting to a local - emulator, like Azurite. - sas_token : str, default None - SAS token for the storage account, used as an alternative to account_key. If sas_token - and account_key are None the default credential will be used. The parameters - account_key and sas_token are mutually exclusive. - - Examples - -------- - >>> from pyarrow import fs - >>> azure_fs = fs.AzureFileSystem(account_name="myaccount") - >>> azurite_fs = fs.AzureFileSystem( - ... account_name="devstoreaccount1", - ... account_key="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==", - ... blob_storage_authority="127.0.0.1:10000", - ... dfs_storage_authority="127.0.0.1:10000", - ... blob_storage_scheme="http", - ... dfs_storage_scheme="http", - ... ) - - For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. - """ - def __init__( self, account_name: str, diff --git a/python/pyarrow-stubs/interchange/buffer.pyi b/python/pyarrow-stubs/interchange/buffer.pyi index 78d1dabb8b7..6890a24030c 100644 --- a/python/pyarrow-stubs/interchange/buffer.pyi +++ b/python/pyarrow-stubs/interchange/buffer.pyi @@ -20,8 +20,6 @@ import enum from pyarrow.lib import Buffer class DlpackDeviceType(enum.IntEnum): - """Integer enum for device type codes matching DLPack.""" - CPU = 1 CUDA = 2 CPU_PINNED = 3 @@ -32,44 +30,10 @@ class DlpackDeviceType(enum.IntEnum): ROCM = 10 class _PyArrowBuffer: - """ - Data in the buffer is guaranteed to be contiguous in memory. - - Note that there is no dtype attribute present, a buffer can be thought of - as simply a block of memory. However, if the column that the buffer is - attached to has a dtype that's supported by DLPack and ``__dlpack__`` is - implemented, then that dtype information will be contained in the return - value from ``__dlpack__``. - - This distinction is useful to support both data exchange via DLPack on a - buffer and (b) dtypes like variable-length strings which do not have a - fixed number of bytes per element. - """ def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ... 
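As a quick orientation for the buffer adapter stubbed here, the following is a minimal sketch, not taken from the patch, of exercising _PyArrowBuffer directly; it assumes the private class and DlpackDeviceType are importable from pyarrow.interchange.buffer, matching the stub path above.

import pyarrow as pa
from pyarrow.interchange.buffer import DlpackDeviceType, _PyArrowBuffer

# Wrap a plain in-memory pyarrow Buffer in the interchange adapter.
raw = pa.py_buffer(b"hello world")
buf = _PyArrowBuffer(raw)

print(buf.bufsize)                      # buffer size in bytes (11 here)
print(hex(buf.ptr))                     # pointer to the start of the buffer
device, device_id = buf.__dlpack_device__()
assert device == DlpackDeviceType.CPU   # CPU-resident data in this sketch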
@property - def bufsize(self) -> int: - """ - Buffer size in bytes. - """ + def bufsize(self) -> int: ... @property - def ptr(self) -> int: - """ - Pointer to start of the buffer as an integer. - """ - def __dlpack__(self): - """ - Produce DLPack capsule (see array API standard). - - Raises: - - TypeError : if the buffer contains unsupported dtypes. - - NotImplementedError : if DLPack support is not implemented - - Useful to have to connect to array libraries. Support optional because - it's not completely trivial to implement for a Python-only library. - """ - def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: - """ - Device type and device ID for where the data in the buffer resides. - Uses device type codes matching DLPack. - Note: must be implemented even if ``__dlpack__`` is not. - """ + def ptr(self) -> int: ... + def __dlpack__(self): ... + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: ... diff --git a/python/pyarrow-stubs/interchange/column.pyi b/python/pyarrow-stubs/interchange/column.pyi index ce7e169bfb5..970ad3e07be 100644 --- a/python/pyarrow-stubs/interchange/column.pyi +++ b/python/pyarrow-stubs/interchange/column.pyi @@ -24,27 +24,6 @@ from pyarrow.lib import Array, ChunkedArray from .buffer import _PyArrowBuffer class DtypeKind(enum.IntEnum): - """ - Integer enum for data types. - - Attributes - ---------- - INT : int - Matches to signed integer data type. - UINT : int - Matches to unsigned integer data type. - FLOAT : int - Matches to floating point data type. - BOOL : int - Matches to boolean data type. - STRING : int - Matches to string data type (UTF-8 encoded). - DATETIME : int - Matches to datetime data type. - CATEGORICAL : int - Matches to categorical data type. - """ - INT = 0 UINT = 1 FLOAT = 2 @@ -56,23 +35,6 @@ class DtypeKind(enum.IntEnum): Dtype: TypeAlias = tuple[DtypeKind, int, str, str] class ColumnNullType(enum.IntEnum): - """ - Integer enum for null type representation. - - Attributes - ---------- - NON_NULLABLE : int - Non-nullable column. - USE_NAN : int - Use explicit float NaN value. - USE_SENTINEL : int - Sentinel value besides NaN. - USE_BITMASK : int - The bit is set/unset representing a null on a certain position. - USE_BYTEMASK : int - The byte is set/unset representing a null on a certain position. - """ - NON_NULLABLE = 0 USE_NAN = 1 USE_SENTINEL = 2 @@ -95,175 +57,23 @@ class Endianness(enum.Enum): NATIVE = "=" NA = "|" -class NoBufferPresent(Exception): - """Exception to signal that there is no requested buffer.""" +class NoBufferPresent(Exception): ... class _PyArrowColumn: - """ - A column object, with only the methods and properties required by the - interchange protocol defined. - - A column can contain one or more chunks. Each chunk can contain up to three - buffers - a data buffer, a mask buffer (depending on null representation), - and an offsets buffer (if variable-size binary; e.g., variable-length - strings). - - TBD: Arrow has a separate "null" dtype, and has no separate mask concept. - Instead, it seems to use "children" for both columns with a bit mask, - and for nested dtypes. Unclear whether this is elegant or confusing. - This design requires checking the null representation explicitly. - - The Arrow design requires checking: - 1. the ARROW_FLAG_NULLABLE (for sentinel values) - 2. if a column has two children, combined with one of those children - having a null dtype. - - Making the mask concept explicit seems useful. 
One null dtype would - not be enough to cover both bit and byte masks, so that would mean - even more checking if we did it the Arrow way. - - TBD: there's also the "chunk" concept here, which is implicit in Arrow as - multiple buffers per array (= column here). Semantically it may make - sense to have both: chunks were meant for example for lazy evaluation - of data which doesn't fit in memory, while multiple buffers per column - could also come from doing a selection operation on a single - contiguous buffer. - - Given these concepts, one would expect chunks to be all of the same - size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), - while multiple buffers could have data-dependent lengths. Not an issue - in pandas if one column is backed by a single NumPy array, but in - Arrow it seems possible. - Are multiple chunks *and* multiple buffers per column necessary for - the purposes of this interchange protocol, or must producers either - reuse the chunk concept for this or copy the data? - - Note: this Column object can only be produced by ``__dataframe__``, so - doesn't need its own version or ``__column__`` protocol. - """ def __init__(self, column: Array | ChunkedArray, allow_copy: bool = True) -> None: ... - def size(self) -> int: - """ - Size of the column, in elements. - - Corresponds to DataFrame.num_rows() if column is a single chunk; - equal to size of this current chunk otherwise. - - Is a method rather than a property because it may cause a (potentially - expensive) computation for some dataframe implementations. - """ + def size(self) -> int: ... @property - def offset(self) -> int: - """ - Offset of first element. - - May be > 0 if using chunks; for example for a column with N chunks of - equal size M (only the last chunk may be shorter), - ``offset = n * M``, ``n = 0 .. N-1``. - """ + def offset(self) -> int: ... @property - def dtype(self) -> tuple[DtypeKind, int, str, str]: - """ - Dtype description as a tuple ``(kind, bit-width, format string, - endianness)``. - - Bit-width : the number of bits as an integer - Format string : data type description format string in Apache Arrow C - Data Interface format. - Endianness : current only native endianness (``=``) is supported - - Notes: - - Kind specifiers are aligned with DLPack where possible (hence the - jump to 20, leave enough room for future extension) - - Masks must be specified as boolean with either bit width 1 (for - bit masks) or 8 (for byte masks). - - Dtype width in bits was preferred over bytes - - Endianness isn't too useful, but included now in case in the - future we need to support non-native endianness - - Went with Apache Arrow format strings over NumPy format strings - because they're more complete from a dataframe perspective - - Format strings are mostly useful for datetime specification, and - for categoricals. - - For categoricals, the format string describes the type of the - categorical in the data buffer. In case of a separate encoding of - the categorical (e.g. an integer to string mapping), this can - be derived from ``self.describe_categorical``. - - Data types not included: complex, Arrow-style null, binary, - decimal, and nested (list, struct, map, union) dtypes. - """ + def dtype(self) -> tuple[DtypeKind, int, str, str]: ... @property - def describe_categorical(self) -> CategoricalDescription: - """ - If the dtype is categorical, there are two options: - - There are only values in the data buffer. - - There is a separate non-categorical Column encoding categorical - values. 
- - Raises TypeError if the dtype is not categorical - - Returns the dictionary with description on how to interpret the - data buffer: - - "is_ordered" : bool, whether the ordering of dictionary indices - is semantically meaningful. - - "is_dictionary" : bool, whether a mapping of - categorical values to other objects exists - - "categories" : Column representing the (implicit) mapping of - indices to category values (e.g. an array of - cat1, cat2, ...). None if not a dictionary-style - categorical. - - TBD: are there any other in-memory representations that are needed? - """ + def describe_categorical(self) -> CategoricalDescription: ... @property - def describe_null(self) -> tuple[ColumnNullType, Any]: - """ - Return the missing value (or "null") representation the column dtype - uses, as a tuple ``(kind, value)``. - - Value : if kind is "sentinel value", the actual value. If kind is a bit - mask or a byte mask, the value (0 or 1) indicating a missing value. - None otherwise. - """ + def describe_null(self) -> tuple[ColumnNullType, Any]: ... @property - def null_count(self) -> int: - """ - Number of null elements, if known. - - Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. - """ + def null_count(self) -> int: ... @property - def metadata(self) -> dict[str, Any]: - """ - The metadata for the column. See `DataFrame.metadata` for more details. - """ - def num_chunks(self) -> int: - """ - Return the number of chunks the column consists of. - """ - def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: - """ - Return an iterator yielding the chunks. - - See `DataFrame.get_chunks` for details on ``n_chunks``. - """ - def get_buffers(self) -> ColumnBuffers: - """ - Return a dictionary containing the underlying buffers. - - The returned dictionary has the following contents: - - - "data": a two-element tuple whose first element is a buffer - containing the data and whose second element is the data - buffer's associated dtype. - - "validity": a two-element tuple whose first element is a buffer - containing mask values indicating missing data and - whose second element is the mask value buffer's - associated dtype. None if the null representation is - not a bit or byte mask. - - "offsets": a two-element tuple whose first element is a buffer - containing the offset values for variable-size binary - data (e.g., variable-length strings) and whose second - element is the offsets buffer's associated dtype. None - if the data buffer does not have an associated offsets - buffer. - """ + def metadata(self) -> dict[str, Any]: ... + def num_chunks(self) -> int: ... + def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: ... + def get_buffers(self) -> ColumnBuffers: ... diff --git a/python/pyarrow-stubs/interchange/dataframe.pyi b/python/pyarrow-stubs/interchange/dataframe.pyi index a7ea6aeac74..fb97e9a414f 100644 --- a/python/pyarrow-stubs/interchange/dataframe.pyi +++ b/python/pyarrow-stubs/interchange/dataframe.pyi @@ -27,93 +27,21 @@ from pyarrow.interchange.column import _PyArrowColumn from pyarrow.lib import RecordBatch, Table class _PyArrowDataFrame: - """ - A data frame class, with only the methods required by the interchange - protocol defined. - - A "data frame" represents an ordered collection of named columns. - A column's "name" must be a unique string. - Columns may be accessed by name or by position. 
- - This could be a public data frame class, or an object with the methods and - attributes defined on this DataFrame class could be returned from the - ``__dataframe__`` method of a public data frame class in a library adhering - to the dataframe interchange protocol specification. - """ - def __init__( self, df: Table | RecordBatch, nan_as_null: bool = False, allow_copy: bool = True ) -> None: ... def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True - ) -> _PyArrowDataFrame: - """ - Construct a new exchange object, potentially changing the parameters. - ``nan_as_null`` is a keyword intended for the consumer to tell the - producer to overwrite null values in the data with ``NaN``. - It is intended for cases where the consumer does not support the bit - mask or byte mask that is the producer's native representation. - ``allow_copy`` is a keyword that defines whether or not the library is - allowed to make a copy of the data. For example, copying data would be - necessary if a library supports strided buffers, given that this - protocol specifies contiguous buffers. - """ + ) -> _PyArrowDataFrame: ... @property - def metadata(self) -> dict[str, Any]: - """ - The metadata for the data frame, as a dictionary with string keys. The - contents of `metadata` may be anything, they are meant for a library - to store information that it needs to, e.g., roundtrip losslessly or - for two implementations to share data that is not (yet) part of the - interchange protocol specification. For avoiding collisions with other - entries, please add name the keys with the name of the library - followed by a period and the desired name, e.g, ``pandas.indexcol``. - """ - def num_columns(self) -> int: - """ - Return the number of columns in the DataFrame. - """ - def num_rows(self) -> int: - """ - Return the number of rows in the DataFrame, if available. - """ - def num_chunks(self) -> int: - """ - Return the number of chunks the DataFrame consists of. - """ - def column_names(self) -> Iterable[str]: - """ - Return an iterator yielding the column names. - """ - def get_column(self, i: int) -> _PyArrowColumn: - """ - Return the column at the indicated position. - """ - def get_column_by_name(self, name: str) -> _PyArrowColumn: - """ - Return the column whose name is the indicated name. - """ - def get_columns(self) -> Iterable[_PyArrowColumn]: - """ - Return an iterator yielding the columns. - """ - def select_columns(self, indices: Sequence[int]) -> Self: - """ - Create a new DataFrame by selecting a subset of columns by index. - """ - def select_columns_by_name(self, names: Sequence[str]) -> Self: - """ - Create a new DataFrame by selecting a subset of columns by name. - """ - def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: - """ - Return an iterator yielding the chunks. - - By default (None), yields the chunks that the data is stored as by the - producer. If given, ``n_chunks`` must be a multiple of - ``self.num_chunks()``, meaning the producer must subdivide each chunk - before yielding it. - - Note that the producer must ensure that all columns are chunked the - same way. - """ + def metadata(self) -> dict[str, Any]: ... + def num_columns(self) -> int: ... + def num_rows(self) -> int: ... + def num_chunks(self) -> int: ... + def column_names(self) -> Iterable[str]: ... + def get_column(self, i: int) -> _PyArrowColumn: ... + def get_column_by_name(self, name: str) -> _PyArrowColumn: ... + def get_columns(self) -> Iterable[_PyArrowColumn]: ... 
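To illustrate the column and dataframe adapters whose stubs appear above, here is a minimal sketch, not part of the patch, that wraps a Table directly in _PyArrowDataFrame; it assumes the private classes are importable from pyarrow.interchange.dataframe as the stub layout suggests.

import pyarrow as pa
from pyarrow.interchange.dataframe import _PyArrowDataFrame

tbl = pa.table({"n_legs": [2, 4, None], "animal": ["Flamingo", "Dog", "Centipede"]})
xdf = _PyArrowDataFrame(tbl)                 # wrap a Table in the interchange adapter

print(xdf.num_columns(), xdf.num_rows())     # 2 3
print(list(xdf.column_names()))              # ['n_legs', 'animal']

col = xdf.get_column_by_name("n_legs")
print(col.dtype)        # (kind, bit-width, format string, endianness) tuple
print(col.null_count)   # 1, one null value in this column
buffers = col.get_buffers()
print(buffers["data"])  # (buffer, dtype) pair; "validity" and "offsets" may be None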
+ def select_columns(self, indices: Sequence[int]) -> Self: ... + def select_columns_by_name(self, names: Sequence[str]) -> Self: ... + def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: ... diff --git a/python/pyarrow-stubs/interchange/from_dataframe.pyi b/python/pyarrow-stubs/interchange/from_dataframe.pyi index aa6217b6181..b13d5976337 100644 --- a/python/pyarrow-stubs/interchange/from_dataframe.pyi +++ b/python/pyarrow-stubs/interchange/from_dataframe.pyi @@ -31,125 +31,21 @@ class DataFrameObject(Protocol): ColumnObject: TypeAlias = Any -def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: - """ - Build a ``pa.Table`` from any DataFrame supporting the interchange protocol. +def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: ... - Parameters - ---------- - df : DataFrameObject - Object supporting the interchange protocol, i.e. `__dataframe__` - method. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). +def protocol_df_chunk_to_pyarrow(df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: ... - Returns - ------- - pa.Table +def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... - Examples - -------- - >>> import pyarrow - >>> from pyarrow.interchange import from_dataframe - - Convert a pandas dataframe to a pyarrow table: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_attendees": [100, 10, 1], - ... "country": ["Italy", "Spain", "Slovenia"], - ... } - ... ) - >>> df - n_attendees country - 0 100 Italy - 1 10 Spain - 2 1 Slovenia - >>> from_dataframe(df) - pyarrow.Table - n_attendees: int64 - country: large_string - ---- - n_attendees: [[100,10,1]] - country: [["Italy","Spain","Slovenia"]] - """ - -def protocol_df_chunk_to_pyarrow(df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: - """ - Convert interchange protocol chunk to ``pa.RecordBatch``. - - Parameters - ---------- - df : DataFrameObject - Object supporting the interchange protocol, i.e. `__dataframe__` - method. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.RecordBatch - """ - -def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: - """ - Convert a column holding one of the primitive dtypes to a PyArrow array. - A primitive type is one of: int, uint, float, bool (1 bit). - - Parameters - ---------- - col : ColumnObject - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Array - """ - -def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: - """ - Convert a column holding boolean dtype to a PyArrow array. - - Parameters - ---------- - col : ColumnObject - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Array - """ +def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... def categorical_column_to_dictionary( col: ColumnObject, allow_copy: bool = True -) -> DictionaryArray: - """ - Convert a column holding categorical data to a pa.DictionaryArray. 
- - Parameters - ---------- - col : ColumnObject - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.DictionaryArray - """ +) -> DictionaryArray: ... -def parse_datetime_format_str(format_str: str) -> tuple[str, str]: - """Parse datetime `format_str` to interpret the `data`.""" +def parse_datetime_format_str(format_str: str) -> tuple[str, str]: ... -def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: - """Map column date type to pyarrow date type.""" +def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: ... def buffers_to_array( buffers: ColumnBuffers, @@ -158,39 +54,7 @@ def buffers_to_array( describe_null: ColumnNullType, offset: int = 0, allow_copy: bool = True, -) -> Array: - """ - Build a PyArrow array from the passed buffer. - - Parameters - ---------- - buffer : ColumnBuffers - Dictionary containing tuples of underlying buffers and - their associated dtype. - data_type : Tuple[DtypeKind, int, str, str], - Dtype description of the column as a tuple ``(kind, bit-width, format string, - endianness)``. - length : int - The number of values in the array. - describe_null: ColumnNullType - Null representation the column dtype uses, - as a tuple ``(kind, value)`` - offset : int, default: 0 - Number of elements to offset from the start of the buffer. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Array - - Notes - ----- - The returned array doesn't own the memory. The caller of this function - is responsible for keeping the memory owner object alive as long as - the returned PyArrow array is being used. - """ +) -> Array: ... def validity_buffer_from_mask( validity_buff: Buffer, @@ -199,32 +63,7 @@ def validity_buffer_from_mask( length: int, offset: int = 0, allow_copy: bool = True, -) -> Buffer: - """ - Build a PyArrow buffer from the passed mask buffer. - - Parameters - ---------- - validity_buff : BufferObject - Tuple of underlying validity buffer and associated dtype. - validity_dtype : Dtype - Dtype description as a tuple ``(kind, bit-width, format string, - endianness)``. - describe_null : ColumnNullType - Null representation the column dtype uses, - as a tuple ``(kind, value)`` - length : int - The number of values in the array. - offset : int, default: 0 - Number of elements to offset from the start of the buffer. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Buffer - """ +) -> Buffer: ... def validity_buffer_nan_sentinel( data_pa_buffer: Buffer, @@ -233,29 +72,4 @@ def validity_buffer_nan_sentinel( length: int, offset: int = 0, allow_copy: bool = True, -) -> Buffer: - """ - Build a PyArrow buffer from NaN or sentinel values. - - Parameters - ---------- - data_pa_buffer : pa.Buffer - PyArrow buffer for the column data. - data_type : Dtype - Dtype description as a tuple ``(kind, bit-width, format string, - endianness)``. - describe_null : ColumnNullType - Null representation the column dtype uses, - as a tuple ``(kind, value)`` - length : int - The number of values in the array. - offset : int, default: 0 - Number of elements to offset from the start of the buffer. 
- allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Buffer - """ +) -> Buffer: ... diff --git a/python/pyarrow-stubs/parquet/core.pyi b/python/pyarrow-stubs/parquet/core.pyi index f5ac0510ffc..67882f3a747 100644 --- a/python/pyarrow-stubs/parquet/core.pyi +++ b/python/pyarrow-stubs/parquet/core.pyi @@ -77,29 +77,7 @@ __all__ = ( "filters_to_expression", ) -def filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: - """ - Check if filters are well-formed and convert to an ``Expression``. - - Parameters - ---------- - filters : List[Tuple] or List[List[Tuple]] - - Notes - ----- - See internal ``pyarrow._DNF_filter_doc`` attribute for more details. - - Examples - -------- - - >>> filters_to_expression([("foo", "==", "bar")]) - - - Returns - ------- - pyarrow.compute.Expression - An Expression representing the filters - """ +def filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... @deprecated("use filters_to_expression") def _filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... @@ -107,97 +85,6 @@ def _filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Ex _Compression: TypeAlias = Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"] class ParquetFile: - """ - Reader interface for a single Parquet file. - - Parameters - ---------- - source : str, pathlib.Path, pyarrow.NativeFile, or file-like object - Readable source. For passing bytes or buffer-like file containing a - Parquet file, use pyarrow.BufferReader. - metadata : FileMetaData, default None - Use existing metadata object, rather than reading from file. - common_metadata : FileMetaData, default None - Will be used in reads for pandas schema metadata if not found in the - main file's metadata, no other uses at the moment. - read_dictionary : list - List of column names to read directly as DictionaryArray. - memory_map : bool, default False - If the source is a file path, use a memory map to read file, which can - improve performance in some environments. - buffer_size : int, default 0 - If positive, perform read buffering when deserializing individual - column chunks. Otherwise IO calls are unbuffered. - pre_buffer : bool, default False - Coalesce and issue file reads in parallel to improve performance on - high-latency filesystems (e.g. S3). If True, Arrow will use a - background I/O thread pool. - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular - resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' - and therefore INT96 timestamps will be inferred as timestamps - in nanoseconds. - decryption_properties : FileDecryptionProperties, default None - File decryption properties for Parquet Modular Encryption. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. 
- Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - page_checksum_verification : bool, default False - If True, verify the checksum for each page read from the file. - - Examples - -------- - - Generate an example PyArrow Table and write it to Parquet file: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - - Create a ``ParquetFile`` object from the Parquet file: - - >>> parquet_file = pq.ParquetFile("example.parquet") - - Read the data: - - >>> parquet_file.read() - pyarrow.Table - n_legs: int64 - animal: string - ---- - n_legs: [[2,2,4,4,5,100]] - animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] - - Create a ParquetFile object with "animal" column as DictionaryArray: - - >>> parquet_file = pq.ParquetFile("example.parquet", read_dictionary=["animal"]) - >>> parquet_file.read() - pyarrow.Table - n_legs: int64 - animal: dictionary - ---- - n_legs: [[2,2,4,4,5,100]] - animal: [ -- dictionary: - ["Flamingo","Parrot",...,"Brittle stars","Centipede"] -- indices: - [0,1,2,3,4,5]] - """ - reader: ParquetReader common_metadata: FileMetaData @@ -221,63 +108,13 @@ class ParquetFile: def __enter__(self) -> Self: ... def __exit__(self, *args, **kwargs) -> None: ... @property - def metadata(self) -> FileMetaData: - """ - Return the Parquet metadata. - """ + def metadata(self) -> FileMetaData: ... @property - def schema(self) -> ParquetSchema: - """ - Return the Parquet schema, unconverted to Arrow types - """ + def schema(self) -> ParquetSchema: ... @property - def schema_arrow(self) -> Schema: - """ - Return the inferred Arrow schema, converted from the whole Parquet - file's schema - - Examples - -------- - Generate an example Parquet file: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - Read the Arrow schema: - - >>> parquet_file.schema_arrow - n_legs: int64 - animal: string - """ + def schema_arrow(self) -> Schema: ... @property - def num_row_groups(self) -> int: - """ - Return the number of row groups of the Parquet file. - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - >>> parquet_file.num_row_groups - 1 - """ + def num_row_groups(self) -> int: ... def close(self, force: bool = False) -> None: ... @property def closed(self) -> bool: ... @@ -287,100 +124,14 @@ class ParquetFile: columns: list | None = None, use_threads: bool = True, use_pandas_metadata: bool = False, - ) -> Table: - """ - Read a single row group from a Parquet file. - - Parameters - ---------- - i : int - Index of the individual row group that we want to read. - columns : list - If not None, only these columns will be read from the row group. A - column name may be a prefix of a nested field, e.g. 
'a' will select - 'a.b', 'a.c', and 'a.d.e'. - use_threads : bool, default True - Perform multi-threaded column reads. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Returns - ------- - pyarrow.table.Table - Content of the row group as a table (of columns) - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - >>> parquet_file.read_row_group(0) - pyarrow.Table - n_legs: int64 - animal: string - ---- - n_legs: [[2,2,4,4,5,100]] - animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] - """ + ) -> Table: ... def read_row_groups( self, row_groups: list, columns: list | None = None, use_threads: bool = True, use_pandas_metadata: bool = False, - ) -> Table: - """ - Read a multiple row groups from a Parquet file. - - Parameters - ---------- - row_groups : list - Only these row groups will be read from the file. - columns : list - If not None, only these columns will be read from the row group. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. - use_threads : bool, default True - Perform multi-threaded column reads. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Returns - ------- - pyarrow.table.Table - Content of the row groups as a table (of columns). - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - >>> parquet_file.read_row_groups([0, 0]) - pyarrow.Table - n_legs: int64 - animal: string - ---- - n_legs: [[2,2,4,4,5,...,2,4,4,5,100]] - animal: [["Flamingo","Parrot","Dog",...,"Brittle stars","Centipede"]] - """ + ) -> Table: ... def iter_batches( self, batch_size: int = 65536, @@ -388,375 +139,16 @@ class ParquetFile: columns: list | None = None, use_threads: bool = True, use_pandas_metadata: bool = False, - ) -> Iterator[RecordBatch]: - """ - Read streaming batches from a Parquet file. - - Parameters - ---------- - batch_size : int, default 64K - Maximum number of records to yield per batch. Batches may be - smaller if there aren't enough rows in the file. - row_groups : list - Only these row groups will be read from the file. - columns : list - If not None, only these columns will be read from the file. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. - use_threads : boolean, default True - Perform multi-threaded column reads. - use_pandas_metadata : boolean, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Yields - ------ - pyarrow.RecordBatch - Contents of each batch as a record batch - - Examples - -------- - Generate an example Parquet file: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... 
"animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - >>> for i in parquet_file.iter_batches(): - ... print("RecordBatch") - ... print(i.to_pandas()) - RecordBatch - n_legs animal - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - """ + ) -> Iterator[RecordBatch]: ... def read( self, columns: list | None = None, use_threads: bool = True, use_pandas_metadata: bool = False, - ) -> Table: - """ - Read a Table from Parquet format. - - Parameters - ---------- - columns : list - If not None, only these columns will be read from the file. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. - use_threads : bool, default True - Perform multi-threaded column reads. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Returns - ------- - pyarrow.table.Table - Content of the file as a table (of columns). - - Examples - -------- - Generate an example Parquet file: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - Read a Table: - - >>> parquet_file.read(columns=["animal"]) - pyarrow.Table - animal: string - ---- - animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] - """ - def scan_contents(self, columns: list | None = None, batch_size: int = 65536) -> int: - """ - Read contents of file for the given columns and batch size. - - Notes - ----- - This function's primary purpose is benchmarking. - The scan is executed on a single thread. - - Parameters - ---------- - columns : list of integers, default None - Select columns to read, if None scan all columns. - batch_size : int, default 64K - Number of rows to read at a time internally. - - Returns - ------- - num_rows : int - Number of rows in file - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - >>> parquet_file.scan_contents() - 6 - """ + ) -> Table: ... + def scan_contents(self, columns: list | None = None, batch_size: int = 65536) -> int: ... class ParquetWriter: - """ - Class for incrementally building a Parquet file for Arrow tables. - - Parameters - ---------- - where : path or file-like object - schema : pyarrow.Schema - version : {"1.0", "2.4", "2.6"}, default "2.6" - Determine which Parquet logical types are available for use, whether the - reduced set from the Parquet 1.x.x format or the expanded logical types - added in later format versions. - Files written with version='2.4' or '2.6' may not be readable in all - Parquet implementations, so version='1.0' is likely the choice that - maximizes file compatibility. - UINT32 and some logical types are only available with version '2.4'. - Nanosecond timestamps are only available with version '2.6'. 
- Other features such as compression algorithms or the new serialized - data page format must be enabled separately (see 'compression' and - 'data_page_version'). - use_dictionary : bool or list, default True - Specify if we should use dictionary encoding in general or only for - some columns. - When encoding the column, if the dictionary size is too large, the - column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type - doesn't support dictionary encoding. - compression : str or dict, default 'snappy' - Specify the compression codec, either on a general basis or per-column. - Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. - write_statistics : bool or list, default True - Specify if we should write statistics in general (default is True) or only - for some columns. - use_deprecated_int96_timestamps : bool, default None - Write timestamps to INT96 Parquet format. Defaults to False unless enabled - by flavor argument. This take priority over the coerce_timestamps option. - coerce_timestamps : str, default None - Cast timestamps to a particular resolution. If omitted, defaults are chosen - depending on `version`. For ``version='1.0'`` and ``version='2.4'``, - nanoseconds are cast to microseconds ('us'), while for - ``version='2.6'`` (the default), they are written natively without loss - of resolution. Seconds are always cast to milliseconds ('ms') by default, - as Parquet does not have any temporal type with seconds resolution. - If the casting results in loss of data, it will raise an exception - unless ``allow_truncated_timestamps=True`` is given. - Valid values: {None, 'ms', 'us'} - allow_truncated_timestamps : bool, default False - Allow loss of data when coercing timestamps to a particular - resolution. E.g. if microsecond or nanosecond data is lost when coercing to - 'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True`` - will NOT result in the truncation exception being ignored unless - ``coerce_timestamps`` is not None. - data_page_size : int, default None - Set a target threshold for the approximate encoded size of data - pages within a column chunk (in bytes). If None, use the default data page - size of 1MByte. - flavor : {'spark'}, default None - Sanitize schema or set other compatibility options to work with - various target systems. - filesystem : FileSystem, default None - If nothing passed, will be inferred from `where` if path-like, else - `where` is already a file-like object so no filesystem is needed. - compression_level : int or dict, default None - Specify the compression level for a codec, either on a general basis or - per-column. If None is passed, arrow selects the compression level for - the compression codec in use. The compression level has a different - meaning for each codec, so you have to read the documentation of the - codec you are using. - An exception is thrown if the compression codec does not allow specifying - a compression level. - use_byte_stream_split : bool or list, default False - Specify if the byte_stream_split encoding should be used in general or - only for some columns. If both dictionary and byte_stream_stream are - enabled, then dictionary is preferred. - The byte_stream_split encoding is valid for integer, floating-point - and fixed-size binary data types (including decimals); it should be - combined with a compression codec so as to achieve size reduction. - column_encoding : string or dict, default None - Specify the encoding scheme on a per column basis. 
- Can only be used when ``use_dictionary`` is set to False, and - cannot be used in combination with ``use_byte_stream_split``. - Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', - 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. - Certain encodings are only compatible with certain data types. - Please refer to the encodings section of `Reading and writing Parquet - files `_. - data_page_version : {"1.0", "2.0"}, default "1.0" - The serialized Parquet data page format version to write, defaults to - 1.0. This does not impact the file schema logical types and Arrow to - Parquet type casting behavior; for that use the "version" option. - use_compliant_nested_type : bool, default True - Whether to write compliant Parquet nested type (lists) as defined - `here `_, defaults to ``True``. - For ``use_compliant_nested_type=True``, this will write into a list - with 3-level structure where the middle level, named ``list``, - is a repeated group with a single field named ``element``:: - - group (LIST) { - repeated group list { - element; - } - } - - For ``use_compliant_nested_type=False``, this will also write into a list - with 3-level structure, where the name of the single field of the middle - level ``list`` is taken from the element name for nested columns in Arrow, - which defaults to ``item``:: - - group (LIST) { - repeated group list { - item; - } - } - encryption_properties : FileEncryptionProperties, default None - File encryption properties for Parquet Modular Encryption. - If None, no encryption will be done. - The encryption properties can be created using: - ``CryptoFactory.file_encryption_properties()``. - write_batch_size : int, default None - Number of values to write to a page at a time. If None, use the default of - 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages - are exceeding the ``data_page_size`` due to large column values, lowering - the batch size can help keep page sizes closer to the intended size. - dictionary_pagesize_limit : int, default None - Specify the dictionary page size limit per row group. If None, use the - default 1MB. - store_schema : bool, default True - By default, the Arrow schema is serialized and stored in the Parquet - file metadata (in the "ARROW:schema" key). When reading the file, - if this key is available, it will be used to more faithfully recreate - the original Arrow data. For example, for tz-aware timestamp columns - it will restore the timezone (Parquet only stores the UTC values without - timezone), or columns with duration type will be restored from the int64 - Parquet column. - write_page_index : bool, default False - Whether to write a page index in general for all columns. - Writing statistics to the page index disables the old method of writing - statistics to each data page header. The page index makes statistics-based - filtering more efficient than the page header, as it gathers all the - statistics for a Parquet file in a single place, avoiding scattered I/O. - Note that the page index is not yet used on the read size by PyArrow. - write_page_checksum : bool, default False - Whether to write page checksums in general for all columns. - Page checksums enable detection of data corruption, which might occur during - transmission or in the storage. - sorting_columns : Sequence of SortingColumn, default None - Specify the sort order of the data being written. The writer does not sort - the data nor does it verify that the data is sorted. 
The sort order is - written to the row group metadata, which can then be used by readers. - store_decimal_as_integer : bool, default False - Allow decimals with 1 <= precision <= 18 to be stored as integers. - In Parquet, DECIMAL can be stored in any of the following physical types: - - int32: for 1 <= precision <= 9. - - int64: for 10 <= precision <= 18. - - fixed_len_byte_array: precision is limited by the array size. - Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. - - binary: precision is unlimited. The minimum number of bytes to store the - unscaled value is used. - - By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. - When enabled, the writer will use the following physical types to store decimals: - - int32: for 1 <= precision <= 9. - - int64: for 10 <= precision <= 18. - - fixed_len_byte_array: for precision > 18. - - As a consequence, decimal columns stored in integer types are more compact. - writer_engine_version : unused - **options : dict - If options contains a key `metadata_collector` then the - corresponding value is assumed to be a list (or any object with - `.append` method) that will be filled with the file metadata instance - of the written file. - - Examples - -------- - Generate an example PyArrow Table and RecordBatch: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.record_batch( - ... [ - ... [2, 2, 4, 4, 5, 100], - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... ], - ... names=["n_legs", "animal"], - ... ) - - create a ParquetWriter object: - - >>> import pyarrow.parquet as pq - >>> writer = pq.ParquetWriter("example.parquet", table.schema) - - and write the Table into the Parquet file: - - >>> writer.write_table(table) - >>> writer.close() - - >>> pq.read_table("example.parquet").to_pandas() - n_legs animal - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - create a ParquetWriter object for the RecordBatch: - - >>> writer2 = pq.ParquetWriter("example2.parquet", batch.schema) - - and write the RecordBatch into the Parquet file: - - >>> writer2.write_batch(batch) - >>> writer2.close() - - >>> pq.read_table("example2.parquet").to_pandas() - n_legs animal - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - """ - flavor: str schema_changed: bool schema: ParquetSchema @@ -796,210 +188,13 @@ class ParquetWriter: def __exit__(self, *args, **kwargs) -> Literal[False]: ... def write( self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None - ) -> None: - """ - Write RecordBatch or Table to the Parquet file. - - Parameters - ---------- - table_or_batch : {RecordBatch, Table} - row_group_size : int, default None - Maximum number of rows in each written row group. If None, - the row group size will be the minimum of the input - table or batch length and 1024 * 1024. - """ - def write_batch(self, batch: RecordBatch, row_group_size: int | None = None) -> None: - """ - Write RecordBatch to the Parquet file. - - Parameters - ---------- - batch : RecordBatch - row_group_size : int, default None - Maximum number of rows in written row group. If None, the - row group size will be the minimum of the RecordBatch - size and 1024 * 1024. If set larger than 64Mi then 64Mi - will be used instead. 
- """ - def write_table(self, table: Table, row_group_size: int | None = None) -> None: - """ - Write Table to the Parquet file. - - Parameters - ---------- - table : Table - row_group_size : int, default None - Maximum number of rows in each written row group. If None, - the row group size will be the minimum of the Table size - and 1024 * 1024. If set larger than 64Mi then 64Mi will - be used instead. - - """ - def close(self) -> None: - """ - Close the connection to the Parquet file. - """ - def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: - """ - Add key-value metadata to the file. - This will overwrite any existing metadata with the same key. - - Parameters - ---------- - key_value_metadata : dict - Keys and values must be string-like / coercible to bytes. - """ + ) -> None: ... + def write_batch(self, batch: RecordBatch, row_group_size: int | None = None) -> None: ... + def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... + def close(self) -> None: ... + def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: ... class ParquetDataset: - """ - Encapsulates details of reading a complete Parquet dataset possibly - consisting of multiple files and partitions in subdirectories. - - Parameters - ---------- - path_or_paths : str or List[str] - A directory name, single file name, or list of file names. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - schema : pyarrow.parquet.Schema - Optionally provide the Schema for the Dataset, in which case it will - not be inferred from the source. - filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None - Rows which do not match the filter predicate will be removed from scanned - data. Partition keys embedded in a nested directory structure will be - exploited to avoid loading files at all if they contain no matching rows. - Within-file level filtering and different partitioning schemes are supported. - - Predicates are expressed using an ``Expression`` or using - the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. - DNF allows arbitrary boolean logical combinations of single column predicates. - The innermost tuples each describe a single column predicate. The list of inner - predicates is interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the most outer list combines these filters - as a disjunction (OR). - - Predicates may also be passed as List[Tuple]. This form is interpreted - as a single conjunction. To express OR in predicates, one must - use the (preferred) List[List[Tuple]] notation. - - Each tuple has format: (``key``, ``op``, ``value``) and compares the - ``key`` with the ``value``. - The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, - ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the - ``value`` must be a collection such as a ``list``, a ``set`` or a - ``tuple``. - - Examples: - - Using the ``Expression`` API: - - .. code-block:: python - - import pyarrow.compute as pc - pc.field('x') = 0 - pc.field('y').isin(['a', 'b', 'c']) - ~pc.field('y').isin({'a', 'b'}) - - Using the DNF format: - - .. 
code-block:: python - - ("x", "=", 0) - ("y", "in", ["a", "b", "c"]) - ("z", "not in", {"a", "b"}) - - - read_dictionary : list, default None - List of names or column paths (for nested types) to read directly - as DictionaryArray. Only supported for BYTE_ARRAY storage. To read - a flat column as dictionary-encoded pass the column name. For - nested types, you must pass the full column "path", which could be - something like level1.level2.list.item. Refer to the Parquet - file's schema to obtain the paths. - memory_map : bool, default False - If the source is a file path, use a memory map to read file, which can - improve performance in some environments. - buffer_size : int, default 0 - If positive, perform read buffering when deserializing individual - column chunks. Otherwise IO calls are unbuffered. - partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" - The partitioning scheme for a partitioned dataset. The default of "hive" - assumes directory names with key=value pairs like "/year=2009/month=11". - In addition, a scheme like "/2009/11" is also supported, in which case - you need to specify the field names or a full schema. See the - ``pyarrow.dataset.partitioning()`` function for more details. - ignore_prefixes : list, optional - Files matching any of these prefixes will be ignored by the - discovery process. - This is matched to the basename of a path. - By default this is ['.', '_']. - Note that discovery happens only if a directory is passed as source. - pre_buffer : bool, default True - Coalesce and issue file reads in parallel to improve performance on - high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a - background I/O thread pool. If using a filesystem layer that itself - performs readahead (e.g. fsspec's S3FS), disable readahead for best - results. Set to False if you want to prioritize minimal memory usage - over maximum speed. - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular resolution - (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 - timestamps will be inferred as timestamps in nanoseconds. - decryption_properties : FileDecryptionProperties or None - File-level decryption properties. - The decryption properties can be created using - ``CryptoFactory.file_decryption_properties()``. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - page_checksum_verification : bool, default False - If True, verify the page checksum for each page read from the file. - - Examples - -------- - Generate an example PyArrow Table and write it to a partitioned dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2", partition_cols=["year"]) - - create a ParquetDataset object from the dataset source: - - >>> dataset = pq.ParquetDataset("dataset_v2/") - - and read the data: - - >>> dataset.read().to_pandas() - n_legs animal year - 0 5 Brittle stars 2019 - 1 2 Flamingo 2020 - 2 4 Dog 2021 - 3 100 Centipede 2021 - 4 2 Parrot 2022 - 5 4 Horse 2022 - - create a ParquetDataset object with filter: - - >>> dataset = pq.ParquetDataset("dataset_v2/", filters=[("n_legs", "=", 4)]) - >>> dataset.read().to_pandas() - n_legs animal year - 0 4 Dog 2021 - 1 4 Horse 2022 - """ def __init__( self, path_or_paths: SingleOrList[str] @@ -1024,184 +219,22 @@ class ParquetDataset: ): ... def equals(self, other: ParquetDataset) -> bool: ... @property - def schema(self) -> Schema: - """ - Schema of the Dataset. - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2_schema", partition_cols=["year"]) - >>> dataset = pq.ParquetDataset("dataset_v2_schema/") - - Read the schema: - - >>> dataset.schema - n_legs: int64 - animal: string - year: dictionary - """ + def schema(self) -> Schema: ... def read( self, columns: list[str] | None = None, use_threads: bool = True, use_pandas_metadata: bool = False, - ) -> Table: - """ - Read (multiple) Parquet files as a single pyarrow.Table. - - Parameters - ---------- - columns : List[str] - Names of columns to read from the dataset. The partition fields - are not automatically included. - use_threads : bool, default True - Perform multi-threaded column reads. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns). - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2_read", partition_cols=["year"]) - >>> dataset = pq.ParquetDataset("dataset_v2_read/") - - Read the dataset: - - >>> dataset.read(columns=["n_legs"]) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[5],[2],[4,100],[2,4]] - """ - def read_pandas(self, **kwargs) -> Table: - """ - Read dataset including pandas metadata, if any. Other arguments passed - through to :func:`read`, see docstring for further details. - - Parameters - ---------- - **kwargs : optional - Additional options for :func:`read` - - Examples - -------- - Generate an example parquet file: - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> table = pa.Table.from_pandas(df) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "table_V2.parquet") - >>> dataset = pq.ParquetDataset("table_V2.parquet") - - Read the dataset with pandas metadata: - - >>> dataset.read_pandas(columns=["n_legs"]) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[2,2,4,4,5,100]] - - >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata - {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, ...} - """ + ) -> Table: ... + def read_pandas(self, **kwargs) -> Table: ... @property - def fragments(self) -> list[ParquetFileFragment]: - """ - A list of the Dataset source fragments or pieces with absolute - file paths. - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2_fragments", partition_cols=["year"]) - >>> dataset = pq.ParquetDataset("dataset_v2_fragments/") - - List the fragments: - - >>> dataset.fragments - [ list[ParquetFileFragment]: ... @property - def files(self) -> list[str]: - """ - A list of absolute Parquet file paths in the Dataset source. - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2_files", partition_cols=["year"]) - >>> dataset = pq.ParquetDataset("dataset_v2_files/") - - List the files: - - >>> dataset.files - ['dataset_v2_files/year=2019/...-0.parquet', ... - """ + def files(self) -> list[str]: ... @property - def filesystem(self) -> FileSystem: - """ - The filesystem type of the Dataset source. - """ + def filesystem(self) -> FileSystem: ... @property - def partitioning(self) -> Partitioning: - """ - The partitioning of the Dataset source, if discovered. - """ + def partitioning(self) -> Partitioning: ... def read_table( source: SingleOrList[str] | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO], @@ -1223,347 +256,11 @@ def read_table( thrift_string_size_limit: int | None = None, thrift_container_size_limit: int | None = None, page_checksum_verification: bool = False, -) -> Table: - """ - Read a Table from Parquet format - - Parameters - ---------- - source : str, pyarrow.NativeFile, or file-like object - If a string passed, can be a single file name or directory name. For - file-like objects, only read a single file. Use pyarrow.BufferReader to - read a file contained in a bytes or buffer-like object. - columns : list - If not None, only these columns will be read from the file. A column - name may be a prefix of a nested field, e.g. 'a' will select 'a.b', - 'a.c', and 'a.d.e'. If empty, no columns will be read. Note - that the table will still have the correct num_rows set despite having - no columns. - use_threads : bool, default True - Perform multi-threaded column reads. - schema : Schema, optional - Optionally provide the Schema for the parquet dataset, in which case it - will not be inferred from the source. 
- use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - read_dictionary : list, default None - List of names or column paths (for nested types) to read directly - as DictionaryArray. Only supported for BYTE_ARRAY storage. To read - a flat column as dictionary-encoded pass the column name. For - nested types, you must pass the full column "path", which could be - something like level1.level2.list.item. Refer to the Parquet - file's schema to obtain the paths. - memory_map : bool, default False - If the source is a file path, use a memory map to read file, which can - improve performance in some environments. - buffer_size : int, default 0 - If positive, perform read buffering when deserializing individual - column chunks. Otherwise IO calls are unbuffered. - partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" - The partitioning scheme for a partitioned dataset. The default of "hive" - assumes directory names with key=value pairs like "/year=2009/month=11". - In addition, a scheme like "/2009/11" is also supported, in which case - you need to specify the field names or a full schema. See the - ``pyarrow.dataset.partitioning()`` function for more details. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None - Rows which do not match the filter predicate will be removed from scanned - data. Partition keys embedded in a nested directory structure will be - exploited to avoid loading files at all if they contain no matching rows. - Within-file level filtering and different partitioning schemes are supported. - - Predicates are expressed using an ``Expression`` or using - the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. - DNF allows arbitrary boolean logical combinations of single column predicates. - The innermost tuples each describe a single column predicate. The list of inner - predicates is interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the most outer list combines these filters - as a disjunction (OR). - - Predicates may also be passed as List[Tuple]. This form is interpreted - as a single conjunction. To express OR in predicates, one must - use the (preferred) List[List[Tuple]] notation. - - Each tuple has format: (``key``, ``op``, ``value``) and compares the - ``key`` with the ``value``. - The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, - ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the - ``value`` must be a collection such as a ``list``, a ``set`` or a - ``tuple``. - - Examples: - - Using the ``Expression`` API: - - .. code-block:: python - - import pyarrow.compute as pc - pc.field('x') = 0 - pc.field('y').isin(['a', 'b', 'c']) - ~pc.field('y').isin({'a', 'b'}) - - Using the DNF format: - - .. code-block:: python - - ("x", "=", 0) - ("y", "in", ["a", "b", "c"]) - ("z", "not in", {"a", "b"}) - - - ignore_prefixes : list, optional - Files matching any of these prefixes will be ignored by the - discovery process. - This is matched to the basename of a path. - By default this is ['.', '_']. - Note that discovery happens only if a directory is passed as source. 
- pre_buffer : bool, default True - Coalesce and issue file reads in parallel to improve performance on - high-latency filesystems (e.g. S3). If True, Arrow will use a - background I/O thread pool. If using a filesystem layer that itself - performs readahead (e.g. fsspec's S3FS), disable readahead for best - results. - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular - resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' - and therefore INT96 timestamps will be inferred as timestamps - in nanoseconds. - decryption_properties : FileDecryptionProperties or None - File-level decryption properties. - The decryption properties can be created using - ``CryptoFactory.file_decryption_properties()``. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - page_checksum_verification : bool, default False - If True, verify the checksum for each page read from the file. - - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns) - - - Examples - -------- - - Generate an example PyArrow Table and write it to a partitioned dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_name_2", partition_cols=["year"]) - - Read the data: - - >>> pq.read_table("dataset_name_2").to_pandas() - n_legs animal year - 0 5 Brittle stars 2019 - 1 2 Flamingo 2020 - 2 4 Dog 2021 - 3 100 Centipede 2021 - 4 2 Parrot 2022 - 5 4 Horse 2022 - - - Read only a subset of columns: - - >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"]) - pyarrow.Table - n_legs: int64 - animal: string - ---- - n_legs: [[5],[2],[4,100],[2,4]] - animal: [["Brittle stars"],["Flamingo"],["Dog","Centipede"],["Parrot","Horse"]] - - Read a subset of columns and read one column as DictionaryArray: - - >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"], read_dictionary=["animal"]) - pyarrow.Table - n_legs: int64 - animal: dictionary - ---- - n_legs: [[5],[2],[4,100],[2,4]] - animal: [ -- dictionary: - ["Brittle stars"] -- indices: - [0], -- dictionary: - ["Flamingo"] -- indices: - [0], -- dictionary: - ["Dog","Centipede"] -- indices: - [0,1], -- dictionary: - ["Parrot","Horse"] -- indices: - [0,1]] - - Read the table with filter: - - >>> pq.read_table( - ... "dataset_name_2", columns=["n_legs", "animal"], filters=[("n_legs", "<", 4)] - ... ).to_pandas() - n_legs animal - 0 2 Flamingo - 1 2 Parrot - - Read data from a single Parquet file: - - >>> pq.write_table(table, "example.parquet") - >>> pq.read_table("dataset_name_2").to_pandas() - n_legs animal year - 0 5 Brittle stars 2019 - 1 2 Flamingo 2020 - 2 4 Dog 2021 - 3 100 Centipede 2021 - 4 2 Parrot 2022 - 5 4 Horse 2022 - """ +) -> Table: ... 
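For quick reference alongside the ``read_table`` parameters documented above, here is a minimal usage sketch of the two ``filters`` forms the docstring describes, an ``Expression`` and the DNF list-of-tuples notation. It assumes the ``dataset_name_2`` partitioned dataset and column names from the docstring's own example; the variable names are illustrative only:

.. code-block:: python

    import pyarrow.compute as pc
    import pyarrow.parquet as pq

    # Expression form: build the predicate with pyarrow.compute
    expr = (pc.field("n_legs") >= 4) & pc.field("animal").isin(["Dog", "Horse"])
    filtered = pq.read_table("dataset_name_2", columns=["n_legs", "animal"], filters=expr)

    # DNF form: the outer list is an OR of inner AND-ed (key, op, value) tuples
    dnf = [[("n_legs", ">=", 4), ("animal", "in", ["Dog", "Horse"])]]
    filtered_dnf = pq.read_table("dataset_name_2", columns=["n_legs", "animal"], filters=dnf)

Both calls are intended to select the same rows; the Expression form composes naturally with the ``pyarrow.dataset`` API, while the DNF form avoids importing ``pyarrow.compute``.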
def read_pandas( source: str | Path | NativeFile | IO, columns: list | None = None, **kwargs -) -> Table: - """ - - Read a Table from Parquet format, also reading DataFrame - index values if known in the file metadata - - Parameters - ---------- - source : str, pyarrow.NativeFile, or file-like object - If a string passed, can be a single file name or directory name. For - file-like objects, only read a single file. Use pyarrow.BufferReader to - read a file contained in a bytes or buffer-like object. - columns : list - If not None, only these columns will be read from the file. A column - name may be a prefix of a nested field, e.g. 'a' will select 'a.b', - 'a.c', and 'a.d.e'. If empty, no columns will be read. Note - that the table will still have the correct num_rows set despite having - no columns. - use_threads : bool, default True - Perform multi-threaded column reads. - schema : Schema, optional - Optionally provide the Schema for the parquet dataset, in which case it - will not be inferred from the source. - read_dictionary : list, default None - List of names or column paths (for nested types) to read directly - as DictionaryArray. Only supported for BYTE_ARRAY storage. To read - a flat column as dictionary-encoded pass the column name. For - nested types, you must pass the full column "path", which could be - something like level1.level2.list.item. Refer to the Parquet - file's schema to obtain the paths. - memory_map : bool, default False - If the source is a file path, use a memory map to read file, which can - improve performance in some environments. - buffer_size : int, default 0 - If positive, perform read buffering when deserializing individual - column chunks. Otherwise IO calls are unbuffered. - partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" - The partitioning scheme for a partitioned dataset. The default of "hive" - assumes directory names with key=value pairs like "/year=2009/month=11". - In addition, a scheme like "/2009/11" is also supported, in which case - you need to specify the field names or a full schema. See the - ``pyarrow.dataset.partitioning()`` function for more details. - **kwargs - additional options for :func:`read_table` - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None - Rows which do not match the filter predicate will be removed from scanned - data. Partition keys embedded in a nested directory structure will be - exploited to avoid loading files at all if they contain no matching rows. - Within-file level filtering and different partitioning schemes are supported. - - Predicates are expressed using an ``Expression`` or using - the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. - DNF allows arbitrary boolean logical combinations of single column predicates. - The innermost tuples each describe a single column predicate. The list of inner - predicates is interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the most outer list combines these filters - as a disjunction (OR). - - Predicates may also be passed as List[Tuple]. This form is interpreted - as a single conjunction. To express OR in predicates, one must - use the (preferred) List[List[Tuple]] notation. 
- - Each tuple has format: (``key``, ``op``, ``value``) and compares the - ``key`` with the ``value``. - The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, - ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the - ``value`` must be a collection such as a ``list``, a ``set`` or a - ``tuple``. - - Examples: - - Using the ``Expression`` API: - - .. code-block:: python - - import pyarrow.compute as pc - pc.field('x') = 0 - pc.field('y').isin(['a', 'b', 'c']) - ~pc.field('y').isin({'a', 'b'}) - - Using the DNF format: - - .. code-block:: python - - ("x", "=", 0) - ("y", "in", ["a", "b", "c"]) - ("z", "not in", {"a", "b"}) - - - ignore_prefixes : list, optional - Files matching any of these prefixes will be ignored by the - discovery process. - This is matched to the basename of a path. - By default this is ['.', '_']. - Note that discovery happens only if a directory is passed as source. - pre_buffer : bool, default True - Coalesce and issue file reads in parallel to improve performance on - high-latency filesystems (e.g. S3). If True, Arrow will use a - background I/O thread pool. If using a filesystem layer that itself - performs readahead (e.g. fsspec's S3FS), disable readahead for best - results. - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular - resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' - and therefore INT96 timestamps will be inferred as timestamps - in nanoseconds. - decryption_properties : FileDecryptionProperties or None - File-level decryption properties. - The decryption properties can be created using - ``CryptoFactory.file_decryption_properties()``. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - page_checksum_verification : bool, default False - If True, verify the checksum for each page read from the file. - - Returns - ------- - pyarrow.Table - Content of the file as a Table of Columns, including DataFrame - indexes as columns - """ +) -> Table: ... def write_table( table: Table, @@ -1593,221 +290,7 @@ def write_table( sorting_columns: Sequence[SortingColumn] | None = None, store_decimal_as_integer: bool = False, **kwargs, -) -> None: - """ - - Write a Table to Parquet format. - - Parameters - ---------- - table : pyarrow.Table - where : string or pyarrow.NativeFile - row_group_size : int - Maximum number of rows in each written row group. If None, the - row group size will be the minimum of the Table size and - 1024 * 1024. - version : {"1.0", "2.4", "2.6"}, default "2.6" - Determine which Parquet logical types are available for use, whether the - reduced set from the Parquet 1.x.x format or the expanded logical types - added in later format versions. - Files written with version='2.4' or '2.6' may not be readable in all - Parquet implementations, so version='1.0' is likely the choice that - maximizes file compatibility. - UINT32 and some logical types are only available with version '2.4'. - Nanosecond timestamps are only available with version '2.6'. 
- Other features such as compression algorithms or the new serialized - data page format must be enabled separately (see 'compression' and - 'data_page_version'). - use_dictionary : bool or list, default True - Specify if we should use dictionary encoding in general or only for - some columns. - When encoding the column, if the dictionary size is too large, the - column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type - doesn't support dictionary encoding. - compression : str or dict, default 'snappy' - Specify the compression codec, either on a general basis or per-column. - Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. - write_statistics : bool or list, default True - Specify if we should write statistics in general (default is True) or only - for some columns. - use_deprecated_int96_timestamps : bool, default None - Write timestamps to INT96 Parquet format. Defaults to False unless enabled - by flavor argument. This take priority over the coerce_timestamps option. - coerce_timestamps : str, default None - Cast timestamps to a particular resolution. If omitted, defaults are chosen - depending on `version`. For ``version='1.0'`` and ``version='2.4'``, - nanoseconds are cast to microseconds ('us'), while for - ``version='2.6'`` (the default), they are written natively without loss - of resolution. Seconds are always cast to milliseconds ('ms') by default, - as Parquet does not have any temporal type with seconds resolution. - If the casting results in loss of data, it will raise an exception - unless ``allow_truncated_timestamps=True`` is given. - Valid values: {None, 'ms', 'us'} - allow_truncated_timestamps : bool, default False - Allow loss of data when coercing timestamps to a particular - resolution. E.g. if microsecond or nanosecond data is lost when coercing to - 'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True`` - will NOT result in the truncation exception being ignored unless - ``coerce_timestamps`` is not None. - data_page_size : int, default None - Set a target threshold for the approximate encoded size of data - pages within a column chunk (in bytes). If None, use the default data page - size of 1MByte. - flavor : {'spark'}, default None - Sanitize schema or set other compatibility options to work with - various target systems. - filesystem : FileSystem, default None - If nothing passed, will be inferred from `where` if path-like, else - `where` is already a file-like object so no filesystem is needed. - compression_level : int or dict, default None - Specify the compression level for a codec, either on a general basis or - per-column. If None is passed, arrow selects the compression level for - the compression codec in use. The compression level has a different - meaning for each codec, so you have to read the documentation of the - codec you are using. - An exception is thrown if the compression codec does not allow specifying - a compression level. - use_byte_stream_split : bool or list, default False - Specify if the byte_stream_split encoding should be used in general or - only for some columns. If both dictionary and byte_stream_stream are - enabled, then dictionary is preferred. - The byte_stream_split encoding is valid for integer, floating-point - and fixed-size binary data types (including decimals); it should be - combined with a compression codec so as to achieve size reduction. - column_encoding : string or dict, default None - Specify the encoding scheme on a per column basis. 
- Can only be used when ``use_dictionary`` is set to False, and - cannot be used in combination with ``use_byte_stream_split``. - Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', - 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. - Certain encodings are only compatible with certain data types. - Please refer to the encodings section of `Reading and writing Parquet - files `_. - data_page_version : {"1.0", "2.0"}, default "1.0" - The serialized Parquet data page format version to write, defaults to - 1.0. This does not impact the file schema logical types and Arrow to - Parquet type casting behavior; for that use the "version" option. - use_compliant_nested_type : bool, default True - Whether to write compliant Parquet nested type (lists) as defined - `here `_, defaults to ``True``. - For ``use_compliant_nested_type=True``, this will write into a list - with 3-level structure where the middle level, named ``list``, - is a repeated group with a single field named ``element``:: - - group (LIST) { - repeated group list { - element; - } - } - - For ``use_compliant_nested_type=False``, this will also write into a list - with 3-level structure, where the name of the single field of the middle - level ``list`` is taken from the element name for nested columns in Arrow, - which defaults to ``item``:: - - group (LIST) { - repeated group list { - item; - } - } - encryption_properties : FileEncryptionProperties, default None - File encryption properties for Parquet Modular Encryption. - If None, no encryption will be done. - The encryption properties can be created using: - ``CryptoFactory.file_encryption_properties()``. - write_batch_size : int, default None - Number of values to write to a page at a time. If None, use the default of - 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages - are exceeding the ``data_page_size`` due to large column values, lowering - the batch size can help keep page sizes closer to the intended size. - dictionary_pagesize_limit : int, default None - Specify the dictionary page size limit per row group. If None, use the - default 1MB. - store_schema : bool, default True - By default, the Arrow schema is serialized and stored in the Parquet - file metadata (in the "ARROW:schema" key). When reading the file, - if this key is available, it will be used to more faithfully recreate - the original Arrow data. For example, for tz-aware timestamp columns - it will restore the timezone (Parquet only stores the UTC values without - timezone), or columns with duration type will be restored from the int64 - Parquet column. - write_page_index : bool, default False - Whether to write a page index in general for all columns. - Writing statistics to the page index disables the old method of writing - statistics to each data page header. The page index makes statistics-based - filtering more efficient than the page header, as it gathers all the - statistics for a Parquet file in a single place, avoiding scattered I/O. - Note that the page index is not yet used on the read size by PyArrow. - write_page_checksum : bool, default False - Whether to write page checksums in general for all columns. - Page checksums enable detection of data corruption, which might occur during - transmission or in the storage. - sorting_columns : Sequence of SortingColumn, default None - Specify the sort order of the data being written. The writer does not sort - the data nor does it verify that the data is sorted. 
The sort order is - written to the row group metadata, which can then be used by readers. - store_decimal_as_integer : bool, default False - Allow decimals with 1 <= precision <= 18 to be stored as integers. - In Parquet, DECIMAL can be stored in any of the following physical types: - - int32: for 1 <= precision <= 9. - - int64: for 10 <= precision <= 18. - - fixed_len_byte_array: precision is limited by the array size. - Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. - - binary: precision is unlimited. The minimum number of bytes to store the - unscaled value is used. - - By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. - When enabled, the writer will use the following physical types to store decimals: - - int32: for 1 <= precision <= 9. - - int64: for 10 <= precision <= 18. - - fixed_len_byte_array: for precision > 18. - - As a consequence, decimal columns stored in integer types are more compact. - - **kwargs : optional - Additional options for ParquetWriter - - Examples - -------- - Generate an example PyArrow Table: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - and write the Table into Parquet file: - - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - - Defining row group size for the Parquet file: - - >>> pq.write_table(table, "example.parquet", row_group_size=3) - - Defining row group compression (default is Snappy): - - >>> pq.write_table(table, "example.parquet", compression="none") - - Defining row group compression and encoding per-column: - - >>> pq.write_table( - ... table, - ... "example.parquet", - ... compression={"n_legs": "snappy", "animal": "gzip"}, - ... use_dictionary=["n_legs", "animal"], - ... ) - - Defining column encoding per-column: - - >>> pq.write_table( - ... table, "example.parquet", column_encoding={"animal": "PLAIN"}, use_dictionary=False - ... ) - """ +) -> None: ... def write_to_dataset( table: Table, @@ -1822,125 +305,7 @@ def write_to_dataset( existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] | None = None, **kwargs, -) -> None: - """ - Wrapper around dataset.write_dataset for writing a Table to - Parquet format by partitions. - For each combination of partition columns and values, - a subdirectories are created in the following - manner: - - root_dir/ - group1=value1 - group2=value1 - .parquet - group2=value2 - .parquet - group1=valueN - group2=value1 - .parquet - group2=valueN - .parquet - - Parameters - ---------- - table : pyarrow.Table - root_path : str, pathlib.Path - The root directory of the dataset. - partition_cols : list, - Column names by which to partition the dataset. - Columns are partitioned in the order they are given. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - schema : Schema, optional - This Schema of the dataset. - partitioning : Partitioning or list[str], optional - The partitioning scheme specified with the - ``pyarrow.dataset.partitioning()`` function or a list of field names. - When providing a list of field names, you can use - ``partitioning_flavor`` to drive which partitioning type should be - used. 
- basename_template : str, optional - A template string used to generate basenames of written data files. - The token '{i}' will be replaced with an automatically incremented - integer. If not specified, it defaults to "guid-{i}.parquet". - use_threads : bool, default True - Write files in parallel. If enabled, then maximum parallelism will be - used determined by the number of available CPU cores. - file_visitor : function - If set, this function will be called with a WrittenFile instance - for each file created during the call. This object will have both - a path attribute and a metadata attribute. - - The path attribute will be a string containing the path to - the created file. - - The metadata attribute will be the parquet metadata of the file. - This metadata will have the file path attribute set and can be used - to build a _metadata file. The metadata attribute will be None if - the format is not parquet. - - Example visitor which simple collects the filenames created:: - - visited_paths = [] - - def file_visitor(written_file): - visited_paths.append(written_file.path) - - existing_data_behavior : 'overwrite_or_ignore' | 'error' | 'delete_matching' - Controls how the dataset will handle data that already exists in - the destination. The default behaviour is 'overwrite_or_ignore'. - - 'overwrite_or_ignore' will ignore any existing data and will - overwrite files with the same name as an output file. Other - existing files will be ignored. This behavior, in combination - with a unique basename_template for each write, will allow for - an append workflow. - - 'error' will raise an error if any data exists in the destination. - - 'delete_matching' is useful when you are writing a partitioned - dataset. The first time each partition directory is encountered - the entire directory will be deleted. This allows you to overwrite - old partitions completely. - **kwargs : dict, - Used as additional kwargs for :func:`pyarrow.dataset.write_dataset` - function for matching kwargs, and remainder to - :func:`pyarrow.dataset.ParquetFileFormat.make_write_options`. - See the docstring of :func:`write_table` and - :func:`pyarrow.dataset.write_dataset` for the available options. - Using `metadata_collector` in kwargs allows one to collect the - file metadata instances of dataset pieces. The file paths in the - ColumnChunkMetaData will be set relative to `root_path`. - - Examples - -------- - Generate an example PyArrow Table: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - and write it to a partitioned dataset: - - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_name_3", partition_cols=["year"]) - >>> pq.ParquetDataset("dataset_name_3").files - ['dataset_name_3/year=2019/...-0.parquet', ... - - Write a single Parquet file into the root folder: - - >>> pq.write_to_dataset(table, root_path="dataset_name_4") - >>> pq.ParquetDataset("dataset_name_4/").files - ['dataset_name_4/...-0.parquet'] - """ +) -> None: ... def write_metadata( schema: Schema, @@ -1948,131 +313,18 @@ def write_metadata( metadata_collector: list[FileMetaData] | None = None, filesystem: SupportedFileSystem | None = None, **kwargs, -) -> None: - """ - Write metadata-only Parquet file from schema. 
This can be used with - `write_to_dataset` to generate `_common_metadata` and `_metadata` sidecar - files. - - Parameters - ---------- - schema : pyarrow.Schema - where : string or pyarrow.NativeFile - metadata_collector : list - where to collect metadata information. - filesystem : FileSystem, default None - If nothing passed, will be inferred from `where` if path-like, else - `where` is already a file-like object so no filesystem is needed. - **kwargs : dict, - Additional kwargs for ParquetWriter class. See docstring for - `ParquetWriter` for more information. - - Examples - -------- - Generate example data: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - Write a dataset and collect metadata information. - - >>> metadata_collector = [] - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, "dataset_metadata", metadata_collector=metadata_collector) - - Write the `_common_metadata` parquet file without row groups statistics. - - >>> pq.write_metadata(table.schema, "dataset_metadata/_common_metadata") - - Write the `_metadata` parquet file with row groups statistics. - - >>> pq.write_metadata( - ... table.schema, "dataset_metadata/_metadata", metadata_collector=metadata_collector - ... ) - """ +) -> None: ... def read_metadata( where: str | Path | IO | NativeFile, memory_map: bool = False, decryption_properties: FileDecryptionProperties | None = None, filesystem: SupportedFileSystem | None = None, -) -> FileMetaData: - """ - Read FileMetaData from footer of a single Parquet file. - - Parameters - ---------- - where : str (file path) or file-like object - memory_map : bool, default False - Create memory map when the source is a file path. - decryption_properties : FileDecryptionProperties, default None - Decryption properties for reading encrypted Parquet files. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - - Returns - ------- - metadata : FileMetaData - The metadata of the Parquet file - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.parquet as pq - >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) - >>> pq.write_table(table, "example.parquet") - - >>> pq.read_metadata("example.parquet") - - created_by: parquet-cpp-arrow version ... - num_columns: 2 - num_rows: 3 - num_row_groups: 1 - format_version: 2.6 - serialized_size: ... - """ +) -> FileMetaData: ... def read_schema( where: str | Path | IO | NativeFile, memory_map: bool = False, decryption_properties: FileDecryptionProperties | None = None, filesystem: SupportedFileSystem | None = None, -) -> Schema: - """ - Read effective Arrow schema from Parquet file metadata. - - Parameters - ---------- - where : str (file path) or file-like object - memory_map : bool, default False - Create memory map when the source is a file path. - decryption_properties : FileDecryptionProperties, default None - Decryption properties for reading encrypted Parquet files. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. 
- - Returns - ------- - schema : pyarrow.Schema - The schema of the Parquet file - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.parquet as pq - >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) - >>> pq.write_table(table, "example.parquet") - - >>> pq.read_schema("example.parquet") - n_legs: int64 - animal: string - """ +) -> Schema: ... From e23d97fb383145481e7708ba05ac91f449edfaca Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 16 Sep 2025 19:18:52 +0200 Subject: [PATCH 19/26] Remove docsting --- python/pyarrow-stubs/__init__.pyi | 5 + python/pyarrow-stubs/_azurefs.pyi | 1 + python/pyarrow-stubs/_compute.pyi | 1464 +---- python/pyarrow-stubs/_csv.pyi | 555 +- python/pyarrow-stubs/_cuda.pyi | 567 +- python/pyarrow-stubs/_dataset.pyi | 1940 +------ python/pyarrow-stubs/_dataset_orc.pyi | 1 + python/pyarrow-stubs/_dataset_parquet.pyi | 212 +- .../_dataset_parquet_encryption.pyi | 57 +- python/pyarrow-stubs/_feather.pyi | 6 +- python/pyarrow-stubs/_flight.pyi | 1177 +--- python/pyarrow-stubs/_fs.pyi | 962 +--- python/pyarrow-stubs/_gcsfs.pyi | 64 +- python/pyarrow-stubs/_hdfs.pyi | 58 +- python/pyarrow-stubs/_ipc.pyi | 632 +- python/pyarrow-stubs/_json.pyi | 140 +- python/pyarrow-stubs/_parquet.pyi | 38 +- python/pyarrow-stubs/_parquet_encryption.pyi | 11 +- python/pyarrow-stubs/_s3fs.pyi | 15 +- python/pyarrow-stubs/_stubs_typing.pyi | 19 +- python/pyarrow-stubs/_substrait.pyi | 7 + python/pyarrow-stubs/_types.pyi | 4280 ++------------ python/pyarrow-stubs/acero.pyi | 15 +- python/pyarrow-stubs/array.pyi | 3122 ++-------- python/pyarrow-stubs/builder.pyi | 83 +- python/pyarrow-stubs/compute.pyi | 5085 +---------------- python/pyarrow-stubs/config.pyi | 6 + python/pyarrow-stubs/dataset.pyi | 28 +- python/pyarrow-stubs/device.pyi | 65 +- python/pyarrow-stubs/feather.pyi | 13 +- python/pyarrow-stubs/fs.pyi | 3 + python/pyarrow-stubs/interchange/buffer.pyi | 2 + python/pyarrow-stubs/interchange/column.pyi | 15 +- .../pyarrow-stubs/interchange/dataframe.pyi | 2 + .../interchange/from_dataframe.pyi | 18 +- python/pyarrow-stubs/io.pyi | 1402 +---- python/pyarrow-stubs/ipc.pyi | 19 +- python/pyarrow-stubs/lib.pyi | 36 +- python/pyarrow-stubs/memory.pyi | 207 +- python/pyarrow-stubs/orc.pyi | 254 +- python/pyarrow-stubs/pandas_compat.pyi | 11 + python/pyarrow-stubs/parquet/core.pyi | 35 +- python/pyarrow-stubs/scalar.pyi | 627 +- python/pyarrow-stubs/table.pyi | 5026 +--------------- python/pyarrow-stubs/tensor.pyi | 660 +-- python/pyarrow-stubs/types.pyi | 5 +- python/pyarrow-stubs/util.pyi | 4 + 47 files changed, 2946 insertions(+), 26008 deletions(-) diff --git a/python/pyarrow-stubs/__init__.pyi b/python/pyarrow-stubs/__init__.pyi index 6df38801de1..1a188eccd45 100644 --- a/python/pyarrow-stubs/__init__.pyi +++ b/python/pyarrow-stubs/__init__.pyi @@ -39,11 +39,13 @@ from pyarrow.lib import ( set_io_thread_count, ) + def show_versions() -> None: ... def show_info() -> None: ... def _module_is_available(module: str) -> bool: ... def _filesystem_is_available(fs: str) -> bool: ... + from pyarrow.lib import ( null, bool_, @@ -352,6 +354,8 @@ from pyarrow.ipc import ( # ---------------------------------------------------------------------- # Returning absolute path to the pyarrow include directory (if bundled, e.g. in # wheels) + + def get_include() -> str: ... def _get_pkg_config_executable() -> str: ... def _has_pkg_config(pkgname: str) -> bool: ... @@ -360,6 +364,7 @@ def get_libraries() -> list[str]: ... 
def create_library_symlinks() -> None: ... def get_library_dirs() -> list[str]: ... + __all__ = [ "__version__", "_lib", diff --git a/python/pyarrow-stubs/_azurefs.pyi b/python/pyarrow-stubs/_azurefs.pyi index 37fcec2c9bd..2d866f34dbd 100644 --- a/python/pyarrow-stubs/_azurefs.pyi +++ b/python/pyarrow-stubs/_azurefs.pyi @@ -19,6 +19,7 @@ from typing import Literal from ._fs import FileSystem + class AzureFileSystem(FileSystem): def __init__( self, diff --git a/python/pyarrow-stubs/_compute.pyi b/python/pyarrow-stubs/_compute.pyi index e8360b48edc..7742dbda539 100644 --- a/python/pyarrow-stubs/_compute.pyi +++ b/python/pyarrow-stubs/_compute.pyi @@ -31,158 +31,87 @@ from . import lib _Order: TypeAlias = Literal["ascending", "descending"] _Placement: TypeAlias = Literal["at_start", "at_end"] + class Kernel(lib._Weakrefable): - """ - A kernel object. + ... - Kernels handle the execution of a Function for a certain signature. - """ class Function(lib._Weakrefable): - """ - A compute function. - - A function implements a certain logical computation over a range of - possible input signatures. Each signature accepts a range of input - types and is implemented by a given Kernel. - - Functions can be of different kinds: - - * "scalar" functions apply an item-wise computation over all items - of their inputs. Each item in the output only depends on the values - of the inputs at the same position. Examples: addition, comparisons, - string predicates... - - * "vector" functions apply a collection-wise computation, such that - each item in the output may depend on the values of several items - in each input. Examples: dictionary encoding, sorting, extracting - unique values... - - * "scalar_aggregate" functions reduce the dimensionality of the inputs by - applying a reduction function. Examples: sum, min_max, mode... - - * "hash_aggregate" functions apply a reduction function to an input - subdivided by grouping criteria. They may not be directly called. - Examples: hash_sum, hash_min_max... - - * "meta" functions dispatch to other functions. - """ @property - def arity(self) -> int: - """ - The function arity. + def arity(self) -> int: ... - If Ellipsis (i.e. `...`) is returned, the function takes a variable - number of arguments. - """ @property def kind( self, - ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: - """ - The function kind. - """ + ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: ... @property - def name(self) -> str: - """ - The function name. - """ + def name(self) -> str: ... @property - def num_kernels(self) -> int: - """ - The number of kernels implementing this function. - """ + def num_kernels(self) -> int: ... + def call( self, args: Iterable, options: FunctionOptions | None = None, memory_pool: lib.MemoryPool | None = None, length: int | None = None, - ) -> Any: - """ - Call the function on the given arguments. - - Parameters - ---------- - args : iterable - The arguments to pass to the function. Accepted types depend - on the specific function. - options : FunctionOptions, optional - Options instance for executing this function. This should have - the right concrete options type. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - length : int, optional - Batch size for execution, for nullary (no argument) functions. If - not passed, will be inferred from passed data. - """ + ) -> Any: ... 
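The ``Function`` and ``FunctionRegistry`` stubs above are normally reached through ``pyarrow.compute``; a minimal sketch, assuming the built-in ``min_max`` aggregate and an illustrative input array:

.. code-block:: python

    import pyarrow as pa
    import pyarrow.compute as pc

    # Look up a compute function by name in the global registry
    registry = pc.function_registry()
    func = registry.get_function("min_max")
    print(func.name, func.kind, func.arity, func.num_kernels)

    # Call it directly, passing the matching FunctionOptions subclass
    result = func.call(
        [pa.array([1, 2, None, 4])],
        options=pc.ScalarAggregateOptions(skip_nulls=True),
    )
    print(result)  # StructScalar with "min" and "max" fields

Equivalently, ``pc.min_max(pa.array([1, 2, None, 4]), skip_nulls=True)`` dispatches through the same registry; calling ``Function.call`` directly is mainly useful when the function name or options are determined at runtime.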
+ class FunctionOptions(lib._Weakrefable): def serialize(self) -> lib.Buffer: ... @classmethod def deserialize(cls, buf: lib.Buffer) -> FunctionOptions: ... + class FunctionRegistry(lib._Weakrefable): - def get_function(self, name: str) -> Function: - """ - Look up a function by name in the registry. - - Parameters - ---------- - name : str - The name of the function to lookup - """ - - def list_functions(self) -> list[str]: - """ - Return all function names in the registry. - """ - -class HashAggregateFunction(Function): ... -class HashAggregateKernel(Kernel): ... -class ScalarAggregateFunction(Function): ... -class ScalarAggregateKernel(Kernel): ... -class ScalarFunction(Function): ... -class ScalarKernel(Kernel): ... -class VectorFunction(Function): ... -class VectorKernel(Kernel): ... + def get_function(self, name: str) -> Function: ... + def list_functions(self) -> list[str]: ... + + +class HashAggregateFunction(Function): + ... + + +class HashAggregateKernel(Kernel): + ... + + +class ScalarAggregateFunction(Function): + ... + + +class ScalarAggregateKernel(Kernel): + ... + + +class ScalarFunction(Function): + ... + + +class ScalarKernel(Kernel): + ... + + +class VectorFunction(Function): + ... + + +class VectorKernel(Kernel): + ... # ==================== _compute.pyx Option classes ==================== + + class ArraySortOptions(FunctionOptions): - """ - Options for the `array_sort_indices` function. - - Parameters - ---------- - order : str, default "ascending" - Which order to sort values in. - Accepted values are "ascending", "descending". - null_placement : str, default "at_end" - Where nulls in the input should be sorted. - Accepted values are "at_start", "at_end". - """ def __init__( self, order: _Order = "ascending", null_placement: _Placement = "at_end", ) -> None: ... -class AssumeTimezoneOptions(FunctionOptions): - """ - Options for the `assume_timezone` function. - - Parameters - ---------- - timezone : str - Timezone to assume for the input. - ambiguous : str, default "raise" - How to handle timestamps that are ambiguous in the assumed timezone. - Accepted values are "raise", "earliest", "latest". - nonexistent : str, default "raise" - How to handle timestamps that don't exist in the assumed timezone. - Accepted values are "raise", "earliest", "latest". - """ +class AssumeTimezoneOptions(FunctionOptions): def __init__( self, timezone: str, @@ -191,28 +120,8 @@ class AssumeTimezoneOptions(FunctionOptions): nonexistent: Literal["raise", "earliest", "latest"] = "raise", ) -> None: ... -class CastOptions(FunctionOptions): - """ - Options for the `cast` function. - - Parameters - ---------- - target_type : DataType, optional - The PyArrow type to cast to. - allow_int_overflow : bool, default False - Whether integer overflow is allowed when casting. - allow_time_truncate : bool, default False - Whether time precision truncation is allowed when casting. - allow_time_overflow : bool, default False - Whether date/time range overflow is allowed when casting. - allow_decimal_truncate : bool, default False - Whether decimal precision truncation is allowed when casting. - allow_float_truncate : bool, default False - Whether floating-point precision truncation is allowed when casting. - allow_invalid_utf8 : bool, default False - Whether producing invalid utf8 data is allowed when casting. 
- """ +class CastOptions(FunctionOptions): allow_int_overflow: bool allow_time_truncate: bool allow_time_overflow: bool @@ -237,190 +146,68 @@ class CastOptions(FunctionOptions): def unsafe(target_type: lib.DataType | None = None) -> CastOptions: ... def is_safe(self) -> bool: ... + class CountOptions(FunctionOptions): - """ - Options for the `count` function. + def __init__(self, mode: Literal["only_valid", + "only_null", "all"] = "only_valid") -> None: ... - Parameters - ---------- - mode : str, default "only_valid" - Which values to count in the input. - Accepted values are "only_valid", "only_null", "all". - """ - def __init__(self, mode: Literal["only_valid", "only_null", "all"] = "only_valid") -> None: ... class CumulativeOptions(FunctionOptions): - """ - Options for `cumulative_*` functions. - - - cumulative_sum - - cumulative_sum_checked - - cumulative_prod - - cumulative_prod_checked - - cumulative_max - - cumulative_min - - Parameters - ---------- - start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. - skip_nulls : bool, default False - When false, the first encountered null is propagated. - """ - def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... + def __init__(self, start: lib.Scalar | None = None, + *, skip_nulls: bool = False) -> None: ... + class CumulativeSumOptions(FunctionOptions): - """ - Options for `cumulative_sum` function. - - Parameters - ---------- - start : Scalar, default None - Starting value for sum computation - skip_nulls : bool, default False - When false, the first encountered null is propagated. - """ - def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... + def __init__(self, start: lib.Scalar | None = None, + *, skip_nulls: bool = False) -> None: ... -class DayOfWeekOptions(FunctionOptions): - """ - Options for the `day_of_week` function. - Parameters - ---------- - count_from_zero : bool, default True - If True, number days from 0, otherwise from 1. - week_start : int, default 1 - Which day does the week start with (Monday=1, Sunday=7). - How this value is numbered is unaffected by `count_from_zero`. - """ +class DayOfWeekOptions(FunctionOptions): + def __init__(self, *, count_from_zero: bool = True, + week_start: int = 1) -> None: ... - def __init__(self, *, count_from_zero: bool = True, week_start: int = 1) -> None: ... class DictionaryEncodeOptions(FunctionOptions): - """ - Options for dictionary encoding. - - Parameters - ---------- - null_encoding : str, default "mask" - How to encode nulls in the input. - Accepted values are "mask" (null inputs emit a null in the indices - array), "encode" (null inputs emit a non-null index pointing to - a null value in the dictionary array). - """ def __init__(self, null_encoding: Literal["mask", "encode"] = "mask") -> None: ... -class RunEndEncodeOptions(FunctionOptions): - """ - Options for run-end encoding. - Parameters - ---------- - run_end_type : DataType, default pyarrow.int32() - The data type of the run_ends array. - - Accepted values are pyarrow.{int16(), int32(), int64()}. - """ +class RunEndEncodeOptions(FunctionOptions): # TODO: default is DataType(int32) def __init__(self, run_end_type: lib.DataType = ...) -> None: ... + class ElementWiseAggregateOptions(FunctionOptions): - """ - Options for element-wise aggregate functions. 
- - Parameters - ---------- - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - """ def __init__(self, *, skip_nulls: bool = True) -> None: ... + class ExtractRegexOptions(FunctionOptions): - """ - Options for the `extract_regex` function. - - Parameters - ---------- - pattern : str - Regular expression with named capture fields. - """ def __init__(self, pattern: str) -> None: ... + class ExtractRegexSpanOptions(FunctionOptions): - """ - Options for the `extract_regex_span` function. - - Parameters - ---------- - pattern : str - Regular expression with named capture fields. - """ def __init__(self, pattern: str) -> None: ... -class FilterOptions(FunctionOptions): - """ - Options for selecting with a boolean filter. - Parameters - ---------- - null_selection_behavior : str, default "drop" - How to handle nulls in the selection filter. - Accepted values are "drop", "emit_null". - """ +class FilterOptions(FunctionOptions): + def __init__( + self, null_selection_behavior: Literal["drop", "emit_null"] = "drop") -> None: ... - def __init__(self, null_selection_behavior: Literal["drop", "emit_null"] = "drop") -> None: ... class IndexOptions(FunctionOptions): - """ - Options for the `index` function. - - Parameters - ---------- - value : Scalar - The value to search for. - """ def __init__(self, value: lib.Scalar) -> None: ... + class JoinOptions(FunctionOptions): - """ - Options for the `binary_join_element_wise` function. - - Parameters - ---------- - null_handling : str, default "emit_null" - How to handle null values in the inputs. - Accepted values are "emit_null", "skip", "replace". - null_replacement : str, default "" - Replacement string to emit for null inputs if `null_handling` - is "replace". - """ @overload - def __init__(self, null_handling: Literal["emit_null", "skip"] = "emit_null") -> None: ... + def __init__( + self, null_handling: Literal["emit_null", "skip"] = "emit_null") -> None: ... + @overload - def __init__(self, null_handling: Literal["replace"], null_replacement: str = "") -> None: ... + def __init__(self, null_handling: Literal["replace"], + null_replacement: str = "") -> None: ... + class ListSliceOptions(FunctionOptions): - """ - Options for list array slicing. - - Parameters - ---------- - start : int - Index to start slicing inner list elements (inclusive). - stop : Optional[int], default None - If given, index to stop slicing at (exclusive). - If not given, slicing will stop at the end. (NotImplemented) - step : int, default 1 - Slice step. - return_fixed_size_list : Optional[bool], default None - Whether to return a FixedSizeListArray. If true _and_ stop is after - a list element's length, nulls will be appended to create the - requested slice size. The default of `None` will return the same - type which was passed in. - """ def __init__( self, start: int, @@ -429,32 +216,12 @@ class ListSliceOptions(FunctionOptions): return_fixed_size_list: bool | None = None, ) -> None: ... + class ListFlattenOptions(FunctionOptions): - """ - Options for `list_flatten` function - - Parameters - ---------- - recursive : bool, default False - When True, the list array is flattened recursively until an array - of non-list values is formed. - """ def __init__(self, recursive: bool = False) -> None: ... + class MakeStructOptions(FunctionOptions): - """ - Options for the `make_struct` function. - - Parameters - ---------- - field_names : sequence of str - Names of the struct fields to create. 
- field_nullability : sequence of bool, optional - Nullability information for each struct field. - If omitted, all fields are nullable. - field_metadata : sequence of KeyValueMetadata, optional - Metadata for each struct field. - """ def __init__( self, field_names: Sequence[str] = (), @@ -463,199 +230,63 @@ class MakeStructOptions(FunctionOptions): field_metadata: Sequence[lib.KeyValueMetadata] | None = None, ) -> None: ... + class MapLookupOptions(FunctionOptions): - """ - Options for the `map_lookup` function. - - Parameters - ---------- - query_key : Scalar or Object can be converted to Scalar - The key to search for. - occurrence : str - The occurrence(s) to return from the Map - Accepted values are "first", "last", or "all". - """ # TODO: query_key: Scalar or Object can be converted to Scalar def __init__( self, query_key: lib.Scalar, occurrence: Literal["first", "last", "all"] ) -> None: ... -class MatchSubstringOptions(FunctionOptions): - """ - Options for looking for a substring. - - Parameters - ---------- - pattern : str - Substring pattern to look for inside input values. - ignore_case : bool, default False - Whether to perform a case-insensitive match. - """ +class MatchSubstringOptions(FunctionOptions): def __init__(self, pattern: str, *, ignore_case: bool = False) -> None: ... + class ModeOptions(FunctionOptions): - """ - Options for the `mode` function. - - Parameters - ---------- - n : int, default 1 - Number of distinct most-common values to return. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - """ - def __init__(self, n: int = 1, *, skip_nulls: bool = True, min_count: int = 0) -> None: ... + def __init__(self, n: int = 1, *, skip_nulls: bool = True, + min_count: int = 0) -> None: ... + class NullOptions(FunctionOptions): - """ - Options for the `is_null` function. - - Parameters - ---------- - nan_is_null : bool, default False - Whether floating-point NaN values are considered null. - """ def __init__(self, *, nan_is_null: bool = False) -> None: ... + class PadOptions(FunctionOptions): - """ - Options for padding strings. - - Parameters - ---------- - width : int - Desired string length. - padding : str, default " " - What to pad the string with. Should be one byte or codepoint. - lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). - """ def __init__( self, width: int, padding: str = " ", lean_left_on_odd_padding: bool = True ) -> None: ... + class PairwiseOptions(FunctionOptions): - """ - Options for `pairwise` functions. - - Parameters - ---------- - period : int, default 1 - Period for applying the period function. - """ def __init__(self, period: int = 1) -> None: ... + class PartitionNthOptions(FunctionOptions): - """ - Options for the `partition_nth_indices` function. - - Parameters - ---------- - pivot : int - Index into the equivalent sorted array of the pivot element. - null_placement : str, default "at_end" - Where nulls in the input should be partitioned. - Accepted values are "at_start", "at_end". - """ - def __init__(self, pivot: int, *, null_placement: _Placement = "at_end") -> None: ... 
+ def __init__(self, pivot: int, *, + null_placement: _Placement = "at_end") -> None: ... + class WinsorizeOptions(FunctionOptions): - """ - Options for the `winsorize` function. - - Parameters - ---------- - lower_limit : float, between 0 and 1 - The quantile below which all values are replaced with the quantile's value. - upper_limit : float, between 0 and 1 - The quantile above which all values are replaced with the quantile's value. - """ def __init__(self, lower_limit: float, upper_limit: float) -> None: ... + class QuantileOptions(FunctionOptions): - """ - Options for the `quantile` function. - - Parameters - ---------- - q : double or sequence of double, default 0.5 - Probability levels of the quantiles to compute. All values must be in - [0, 1]. - interpolation : str, default "linear" - How to break ties between competing data points for a given quantile. - Accepted values are: - - - "linear": compute an interpolation - - "lower": always use the smallest of the two data points - - "higher": always use the largest of the two data points - - "nearest": select the data point that is closest to the quantile - - "midpoint": compute the (unweighted) mean of the two data points - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - """ def __init__( self, q: float | Sequence[float], *, - interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", + interpolation: Literal["linear", "lower", + "higher", "nearest", "midpoint"] = "linear", skip_nulls: bool = True, min_count: int = 0, ) -> None: ... + class RandomOptions(FunctionOptions): - """ - Options for random generation. - - Parameters - ---------- - initializer : int or str - How to initialize the underlying random generator. - If an integer is given, it is used as a seed. - If "system" is given, the random generator is initialized with - a system-specific source of (hopefully true) randomness. - Other values are invalid. - """ def __init__(self, *, initializer: int | Literal["system"] = "system") -> None: ... + class RankOptions(FunctionOptions): - """ - Options for the `rank` function. - - Parameters - ---------- - sort_keys : sequence of (name, order) tuples or str, default "ascending" - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - Alternatively, one can simply pass "ascending" or "descending" as a string - if the input is array-like. - null_placement : str, default "at_end" - Where nulls in input should be sorted. - Accepted values are "at_start", "at_end". - tiebreaker : str, default "first" - Configure how ties between equal values are handled. - Accepted values are: - - - "min": Ties get the smallest possible rank in sorted order. - - "max": Ties get the largest possible rank in sorted order. - - "first": Ranks are assigned in order of when ties appear in the - input. This ensures the ranks are a stable permutation - of the input. - - "dense": The ranks span a dense [1, M] interval where M is the - number of distinct values in the input. 
- """ def __init__( self, sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", @@ -664,24 +295,8 @@ class RankOptions(FunctionOptions): tiebreaker: Literal["min", "max", "first", "dense"] = "first", ) -> None: ... -class RankQuantileOptions(FunctionOptions): - """ - Options for the `rank_quantile` function. - - Parameters - ---------- - sort_keys : sequence of (name, order) tuples or str, default "ascending" - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - Alternatively, one can simply pass "ascending" or "descending" as a string - if the input is array-like. - null_placement : str, default "at_end" - Where nulls in input should be sorted. - Accepted values are "at_start", "at_end". - """ +class RankQuantileOptions(FunctionOptions): def __init__( self, sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", @@ -689,22 +304,8 @@ class RankQuantileOptions(FunctionOptions): null_placement: _Placement = "at_end", ) -> None: ... + class PivotWiderOptions(FunctionOptions): - """ - Options for the `pivot_wider` function. - - Parameters - ---------- - key_names : sequence of str - The pivot key names expected in the pivot key column. - For each entry in `key_names`, a column with the same name is emitted - in the struct output. - unexpected_key_behavior : str, default "ignore" - The behavior when pivot keys not in `key_names` are encountered. - Accepted values are "ignore", "raise". - If "ignore", unexpected keys are silently ignored. - If "raise", unexpected keys raise a KeyError. - """ def __init__( self, key_names: Sequence[str], @@ -712,39 +313,17 @@ class PivotWiderOptions(FunctionOptions): unexpected_key_behavior: Literal["ignore", "raise"] = "ignore", ) -> None: ... + class ReplaceSliceOptions(FunctionOptions): - """ - Options for replacing slices. - - Parameters - ---------- - start : int - Index to start slicing at (inclusive). - stop : int - Index to stop slicing at (exclusive). - replacement : str - What to replace the slice with. - """ def __init__(self, start: int, stop: int, replacement: str) -> None: ... + class ReplaceSubstringOptions(FunctionOptions): - """ - Options for replacing matched substrings. - - Parameters - ---------- - pattern : str - Substring pattern to look for inside input values. - replacement : str - What to replace the pattern with. - max_replacements : int or None, default None - The maximum number of strings to replace in each - input value (unlimited if None). - """ def __init__( self, pattern: str, replacement: str, *, max_replacements: int | None = None ) -> None: ... + _RoundMode: TypeAlias = Literal[ "down", "up", @@ -758,43 +337,22 @@ _RoundMode: TypeAlias = Literal[ "half_to_odd", ] + class RoundBinaryOptions(FunctionOptions): - """ - Options for rounding numbers when ndigits is provided by a second array - - Parameters - ---------- - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - """ def __init__( self, round_mode: _RoundMode = "half_to_even", ) -> None: ... + class RoundOptions(FunctionOptions): - """ - Options for rounding numbers. - - Parameters - ---------- - ndigits : int, default 0 - Number of fractional digits to round to. 
- round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - """ def __init__( self, ndigits: int = 0, round_mode: _RoundMode = "half_to_even", ) -> None: ... + _DateTimeUint: TypeAlias = Literal[ "year", "quarter", @@ -809,48 +367,8 @@ _DateTimeUint: TypeAlias = Literal[ "nanosecond", ] + class RoundTemporalOptions(FunctionOptions): - """ - Options for rounding temporal values. - - Parameters - ---------- - multiple : int, default 1 - Number of units to round to. - unit : str, default "day" - The unit in which `multiple` is expressed. - Accepted values are "year", "quarter", "month", "week", "day", - "hour", "minute", "second", "millisecond", "microsecond", - "nanosecond". - week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. - ceil_is_strictly_greater : bool, default False - If True, ceil returns a rounded value that is strictly greater than the - input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would - yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 - if set to False. - This applies to the ceil_temporal function only. - calendar_based_origin : bool, default False - By default, the origin is 1970-01-01T00:00:00. By setting this to True, - rounding origin will be beginning of one less precise calendar unit. - E.g.: rounding to hours will use beginning of day as origin. - - By default time is rounded to a multiple of units since - 1970-01-01T00:00:00. By setting calendar_based_origin to true, - time will be rounded to number of units since the last greater - calendar unit. - For example: rounding to multiple of days since the beginning of the - month or to hours since the beginning of the day. - Exceptions: week and quarter are not used as greater units, - therefore days will be rounded to the beginning of the month not - week. Greater unit of week is a year. - Note that ceiling and rounding might change sorting order of an array - near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to - 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to - YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will - ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the - order of an already ordered array. - """ def __init__( self, multiple: int = 1, @@ -861,223 +379,67 @@ class RoundTemporalOptions(FunctionOptions): calendar_based_origin: bool = False, ) -> None: ... + class RoundToMultipleOptions(FunctionOptions): - """ - Options for rounding numbers to a multiple. - - Parameters - ---------- - multiple : numeric scalar, default 1.0 - Multiple to round to. Should be a scalar of a type compatible - with the argument to be rounded. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - """ - def __init__(self, multiple: float = 1.0, round_mode: _RoundMode = "half_to_even") -> None: ... + def __init__(self, multiple: float = 1.0, + round_mode: _RoundMode = "half_to_even") -> None: ... + class ScalarAggregateOptions(FunctionOptions): - """ - Options for scalar aggregations. - - Parameters - ---------- - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. 
- If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - """ def __init__(self, *, skip_nulls: bool = True, min_count: int = 1) -> None: ... -class SelectKOptions(FunctionOptions): - """ - Options for top/bottom k-selection. - - Parameters - ---------- - k : int - Number of leading values to select in sorted order - (i.e. the largest values if sort order is "descending", - the smallest otherwise). - sort_keys : sequence of (name, order) tuples - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - """ +class SelectKOptions(FunctionOptions): def __init__(self, k: int, sort_keys: Sequence[tuple[str, _Order]]) -> None: ... + class SetLookupOptions(FunctionOptions): - """ - Options for the `is_in` and `index_in` functions. - - Parameters - ---------- - value_set : Array - Set of values to look for in the input. - skip_nulls : bool, default False - If False, nulls in the input are matched in the value_set just - like regular values. - If True, nulls in the input always fail matching. - """ def __init__(self, value_set: lib.Array, *, skip_nulls: bool = True) -> None: ... -class SliceOptions(FunctionOptions): - """ - Options for slicing. - - Parameters - ---------- - start : int - Index to start slicing at (inclusive). - stop : int or None, default None - If given, index to stop slicing at (exclusive). - If not given, slicing will stop at the end. - step : int, default 1 - Slice step. - """ +class SliceOptions(FunctionOptions): def __init__(self, start: int, stop: int | None = None, step: int = 1) -> None: ... + class SortOptions(FunctionOptions): - """ - Options for the `sort_indices` function. - - Parameters - ---------- - sort_keys : sequence of (name, order) tuples - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - null_placement : str, default "at_end" - Where nulls in input should be sorted, only applying to - columns/fields mentioned in `sort_keys`. - Accepted values are "at_start", "at_end". - """ def __init__( self, sort_keys: Sequence[tuple[str, _Order]], *, null_placement: _Placement = "at_end" ) -> None: ... -class SplitOptions(FunctionOptions): - """ - Options for splitting on whitespace. - Parameters - ---------- - max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). - reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. - """ +class SplitOptions(FunctionOptions): + def __init__(self, *, max_splits: int | None = None, + reverse: bool = False) -> None: ... - def __init__(self, *, max_splits: int | None = None, reverse: bool = False) -> None: ... class SplitPatternOptions(FunctionOptions): - """ - Options for splitting on a string pattern. - - Parameters - ---------- - pattern : str - String pattern to split on. - max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). - reverse : bool, default False - Whether to start splitting from the end of each input value. 
- This only has an effect if `max_splits` is not None. - """ def __init__( self, pattern: str, *, max_splits: int | None = None, reverse: bool = False ) -> None: ... + class StrftimeOptions(FunctionOptions): - """ - Options for the `strftime` function. - - Parameters - ---------- - format : str, default "%Y-%m-%dT%H:%M:%S" - Pattern for formatting input values. - locale : str, default "C" - Locale to use for locale-specific format specifiers. - """ - def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S", locale: str = "C") -> None: ... + def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C") -> None: ... + class StrptimeOptions(FunctionOptions): - """ - Options for the `strptime` function. - - Parameters - ---------- - format : str - Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". - Note that the semantics of the format follow the C/C++ strptime, not the Python one. - There are differences in behavior, for example how the "%y" placeholder - handles years with less than four digits. - unit : str - Timestamp unit of the output. - Accepted values are "s", "ms", "us", "ns". - error_is_null : boolean, default False - Return null on parsing errors if true or raise if false. - """ def __init__( self, format: str, unit: Literal["s", "ms", "us", "ns"], error_is_null: bool = False ) -> None: ... + class StructFieldOptions(FunctionOptions): - """ - Options for the `struct_field` function. - - Parameters - ---------- - indices : List[str], List[bytes], List[int], Expression, bytes, str, or int - List of indices for chained field lookup, for example `[4, 1]` - will look up the second nested field in the fifth outer field. - """ def __init__( self, indices: list[str] | list[bytes] | list[int] | Expression | bytes | str | int ) -> None: ... + class TakeOptions(FunctionOptions): - """ - Options for the `take` and `array_take` functions. - - Parameters - ---------- - boundscheck : boolean, default True - Whether to check indices are within bounds. If False and an - index is out of bounds, behavior is undefined (the process - may crash). - """ def __init__(self, boundscheck: bool = True) -> None: ... + class TDigestOptions(FunctionOptions): - """ - Options for the `tdigest` function. - - Parameters - ---------- - q : double or sequence of double, default 0.5 - Probability levels of the quantiles to approximate. All values must be - in [0, 1]. - delta : int, default 100 - Compression parameter for the T-digest algorithm. - buffer_size : int, default 500 - Buffer size for the T-digest algorithm. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - """ def __init__( self, q: float | Sequence[float] = 0.5, @@ -1088,85 +450,27 @@ class TDigestOptions(FunctionOptions): min_count: int = 0, ) -> None: ... + class TrimOptions(FunctionOptions): - """ - Options for trimming characters from strings. - - Parameters - ---------- - characters : str - Individual characters to be trimmed from the string. - """ def __init__(self, characters: str) -> None: ... -class Utf8NormalizeOptions(FunctionOptions): - """ - Options for the `utf8_normalize` function. - - Parameters - ---------- - form : str - Unicode normalization form. - Accepted values are "NFC", "NFKC", "NFD", NFKD". 
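A minimal usage sketch for the option classes stubbed here, assuming the pyarrow.compute runtime exports them under the same names; an options object can be passed to call_function, while most kernels also accept the same parameters as keyword arguments.

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.array([1.0, 2.0, 3.0, 4.0, None])

    # Configure the "tdigest" kernel through TDigestOptions and invoke it by name.
    opts = pc.TDigestOptions(q=[0.25, 0.75], skip_nulls=True)
    print(pc.call_function("tdigest", [arr], options=opts))

    # Equivalent convenience wrapper with the parameters spelled out as keywords.
    print(pc.quantile(arr, q=0.5, interpolation="linear"))
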
- """ +class Utf8NormalizeOptions(FunctionOptions): def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ... + class VarianceOptions(FunctionOptions): - """ - Options for the `variance` and `stddev` functions. - - Parameters - ---------- - ddof : int, default 0 - Number of degrees of freedom. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - """ - def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, min_count: int = 0) -> None: ... + def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, + min_count: int = 0) -> None: ... + class SkewOptions(FunctionOptions): - """ - Options for the `skew` and `kurtosis` functions. - - Parameters - ---------- - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - biased : bool, default True - Whether the calculated value is biased. - If False, the value computed includes a correction factor to reduce bias. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - """ def __init__( self, *, skip_nulls: bool = True, biased: bool = True, min_count: int = 0 ) -> None: ... + class WeekOptions(FunctionOptions): - """ - Options for the `week` function. - - Parameters - ---------- - week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. - count_from_zero : bool, default False - If True, dates at the start of a year that fall into the last week - of the previous year emit 0. - If False, they emit 52 or 53 (the week number of the last week - of the previous year). - first_week_is_fully_in_year : bool, default False - If True, week number 0 is fully in January. - If False, a week that begins on December 29, 30 or 31 is considered - to be week number 0 of the following year. - """ def __init__( self, *, @@ -1177,76 +481,31 @@ class WeekOptions(FunctionOptions): # ==================== _compute.pyx Functions ==================== + def call_function( name: str, args: list, options: FunctionOptions | None = None, memory_pool: lib.MemoryPool | None = None, length: int | None = None, -) -> Any: - """ - Call a named function. - - The function is looked up in the global registry - (as returned by `function_registry()`). - - Parameters - ---------- - name : str - The name of the function to call. - args : list - The arguments to the function. - options : optional - options provided to the function. - memory_pool : MemoryPool, optional - memory pool to use for allocations during function execution. - length : int, optional - Batch size for execution, for nullary (no argument) functions. If not - passed, inferred from data. - """ - +) -> Any: ... def function_registry() -> FunctionRegistry: ... -def get_function(name: str) -> Function: - """ - Get a function by name. - - The function is looked up in the global registry - (as returned by `function_registry()`). - - Parameters - ---------- - name : str - The name of the function to lookup - """ - -def list_functions() -> list[str]: - """ - Return all function names in the global registry. - """ +def get_function(name: str) -> Function: ... +def list_functions() -> list[str]: ... 
# ==================== _compute.pyx Udf ==================== + def call_tabular_function( function_name: str, args: Iterable | None = None, func_registry: FunctionRegistry | None = None -) -> lib.RecordBatchReader: - """ - Get a record batch iterator from a tabular function. - - Parameters - ---------- - function_name : str - Name of the function. - args : iterable - The arguments to pass to the function. Accepted types depend - on the specific function. Currently, only an empty args is supported. - func_registry : FunctionRegistry - Optional function registry to use instead of the default global one. - """ +) -> lib.RecordBatchReader: ... + class _FunctionDoc(TypedDict): summary: str description: str + def register_scalar_function( func: Callable, function_name: str, @@ -1254,80 +513,8 @@ def register_scalar_function( in_types: dict[str, lib.DataType], out_type: lib.DataType, func_registry: FunctionRegistry | None = None, -) -> None: - """ - Register a user-defined scalar function. - - This API is EXPERIMENTAL. - - A scalar function is a function that executes elementwise - operations on arrays or scalars, i.e. a scalar function must - be computed row-by-row with no state where each output row - is computed only from its corresponding input row. - In other words, all argument arrays have the same length, - and the output array is of the same length as the arguments. - Scalar functions are the only functions allowed in query engine - expressions. - - Parameters - ---------- - func : callable - A callable implementing the user-defined function. - The first argument is the context argument of type - UdfContext. - Then, it must take arguments equal to the number of - in_types defined. It must return an Array or Scalar - matching the out_type. It must return a Scalar if - all arguments are scalar, else it must return an Array. - - To define a varargs function, pass a callable that takes - *args. The last in_type will be the type of all varargs - arguments. - function_name : str - Name of the function. There should only be one function - registered with this name in the function registry. - function_doc : dict - A dictionary object with keys "summary" (str), - and "description" (str). - in_types : Dict[str, DataType] - A dictionary mapping function argument names to - their respective DataType. - The argument names will be used to generate - documentation for the function. The number of - arguments specified here determines the function - arity. - out_type : DataType - Output type of the function. - func_registry : FunctionRegistry - Optional function registry to use instead of the default global one. - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> - >>> func_doc = {} - >>> func_doc["summary"] = "simple udf" - >>> func_doc["description"] = "add a constant to a scalar" - >>> - >>> def add_constant(ctx, array): - ... return pc.add(array, 1, memory_pool=ctx.memory_pool) - >>> - >>> func_name = "py_add_func" - >>> in_types = {"array": pa.int64()} - >>> out_type = pa.int64() - >>> pc.register_scalar_function(add_constant, func_name, func_doc, in_types, out_type) - >>> - >>> func = pc.get_function(func_name) - >>> func.name - 'py_add_func' - >>> answer = pc.call_function(func_name, [pa.array([20])]) - >>> answer - - [ - 21 - ] - """ +) -> None: ... 
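The doctest removed above shows the intended registration pattern; condensed into a standalone sketch:

    import pyarrow as pa
    import pyarrow.compute as pc

    def add_constant(ctx, array):
        # ctx is a UdfContext; reuse its memory pool for any allocations.
        return pc.add(array, 1, memory_pool=ctx.memory_pool)

    func_doc = {"summary": "simple udf", "description": "add a constant to a scalar"}
    pc.register_scalar_function(
        add_constant, "py_add_func", func_doc, {"array": pa.int64()}, pa.int64()
    )

    print(pc.call_function("py_add_func", [pa.array([20])]))  # -> [21]
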
+ def register_tabular_function( func: Callable, @@ -1336,39 +523,8 @@ def register_tabular_function( in_types: dict[str, lib.DataType], out_type: lib.DataType, func_registry: FunctionRegistry | None = None, -) -> None: - """ - Register a user-defined tabular function. - - This API is EXPERIMENTAL. - - A tabular function is one accepting a context argument of type - UdfContext and returning a generator of struct arrays. - The in_types argument must be empty and the out_type argument - specifies a schema. Each struct array must have field types - corresponding to the schema. - - Parameters - ---------- - func : callable - A callable implementing the user-defined function. - The only argument is the context argument of type - UdfContext. It must return a callable that - returns on each invocation a StructArray matching - the out_type, where an empty array indicates end. - function_name : str - Name of the function. There should only be one function - registered with this name in the function registry. - function_doc : dict - A dictionary object with keys "summary" (str), - and "description" (str). - in_types : Dict[str, DataType] - Must be an empty dictionary (reserved for future use). - out_type : Union[Schema, DataType] - Schema of the function's output, or a corresponding flat struct type. - func_registry : FunctionRegistry - Optional function registry to use instead of the default global one. - """ +) -> None: ... + def register_aggregate_function( func: Callable, @@ -1377,88 +533,8 @@ def register_aggregate_function( in_types: dict[str, lib.DataType], out_type: lib.DataType, func_registry: FunctionRegistry | None = None, -) -> None: - """ - Register a user-defined non-decomposable aggregate function. - - This API is EXPERIMENTAL. - - A non-decomposable aggregation function is a function that executes - aggregate operations on the whole data that it is aggregating. - In other words, non-decomposable aggregate function cannot be - split into consume/merge/finalize steps. - - This is often used with ordered or segmented aggregation where groups - can be emit before accumulating all of the input data. - - Note that currently the size of any input column cannot exceed 2 GB - for a single segment (all groups combined). - - Parameters - ---------- - func : callable - A callable implementing the user-defined function. - The first argument is the context argument of type - UdfContext. - Then, it must take arguments equal to the number of - in_types defined. It must return a Scalar matching the - out_type. - To define a varargs function, pass a callable that takes - *args. The in_type needs to match in type of inputs when - the function gets called. - function_name : str - Name of the function. This name must be unique, i.e., - there should only be one function registered with - this name in the function registry. - function_doc : dict - A dictionary object with keys "summary" (str), - and "description" (str). - in_types : Dict[str, DataType] - A dictionary mapping function argument names to - their respective DataType. - The argument names will be used to generate - documentation for the function. The number of - arguments specified here determines the function - arity. - out_type : DataType - Output type of the function. - func_registry : FunctionRegistry - Optional function registry to use instead of the default global one. 
- - Examples - -------- - >>> import numpy as np - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> - >>> func_doc = {} - >>> func_doc["summary"] = "simple median udf" - >>> func_doc["description"] = "compute median" - >>> - >>> def compute_median(ctx, array): - ... return pa.scalar(np.median(array)) - >>> - >>> func_name = "py_compute_median" - >>> in_types = {"array": pa.int64()} - >>> out_type = pa.float64() - >>> pc.register_aggregate_function(compute_median, func_name, func_doc, in_types, out_type) - >>> - >>> func = pc.get_function(func_name) - >>> func.name - 'py_compute_median' - >>> answer = pc.call_function(func_name, [pa.array([20, 40])]) - >>> answer - - >>> table = pa.table([pa.array([1, 1, 2, 2]), pa.array([10, 20, 30, 40])], names=["k", "v"]) - >>> result = table.group_by("k").aggregate([("v", "py_compute_median")]) - >>> result - pyarrow.Table - k: int64 - v_py_compute_median: double - ---- - k: [[1,2]] - v_py_compute_median: [[15,35]] - """ +) -> None: ... + def register_vector_function( func: Callable, @@ -1467,182 +543,24 @@ def register_vector_function( in_types: dict[str, lib.DataType], out_type: lib.DataType, func_registry: FunctionRegistry | None = None, -) -> None: - """ - Register a user-defined vector function. - - This API is EXPERIMENTAL. - - A vector function is a function that executes vector - operations on arrays. Vector function is often used - when compute doesn't fit other more specific types of - functions (e.g., scalar and aggregate). - - Parameters - ---------- - func : callable - A callable implementing the user-defined function. - The first argument is the context argument of type - UdfContext. - Then, it must take arguments equal to the number of - in_types defined. It must return an Array or Scalar - matching the out_type. It must return a Scalar if - all arguments are scalar, else it must return an Array. - - To define a varargs function, pass a callable that takes - *args. The last in_type will be the type of all varargs - arguments. - function_name : str - Name of the function. There should only be one function - registered with this name in the function registry. - function_doc : dict - A dictionary object with keys "summary" (str), - and "description" (str). - in_types : Dict[str, DataType] - A dictionary mapping function argument names to - their respective DataType. - The argument names will be used to generate - documentation for the function. The number of - arguments specified here determines the function - arity. - out_type : DataType - Output type of the function. - func_registry : FunctionRegistry - Optional function registry to use instead of the default global one. - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> - >>> func_doc = {} - >>> func_doc["summary"] = "percent rank" - >>> func_doc["description"] = "compute percent rank" - >>> - >>> def list_flatten_udf(ctx, x): - ... return pc.list_flatten(x) - >>> - >>> func_name = "list_flatten_udf" - >>> in_types = {"array": pa.list_(pa.int64())} - >>> out_type = pa.int64() - >>> pc.register_vector_function(list_flatten_udf, func_name, func_doc, in_types, out_type) - >>> - >>> answer = pc.call_function(func_name, [pa.array([[1, 2], [3, 4]])]) - >>> answer - - [ - 1, - 2, - 3, - 4 - ] - """ - -class UdfContext: - """ - Per-invocation function context/state. +) -> None: ... - This object will always be the first argument to a user-defined - function. It should not be used outside of a call to the function. 
- """ +class UdfContext: @property - def batch_length(self) -> int: - """ - The common length of all input arguments (int). - - In the case that all arguments are scalars, this value - is used to pass the "actual length" of the arguments, - e.g. because the scalar values are encoding a column - with a constant value. - """ + def batch_length(self) -> int: ... @property - def memory_pool(self) -> lib.MemoryPool: - """ - A memory pool for allocations (:class:`MemoryPool`). - - This is the memory pool supplied by the user when they invoked - the function and it should be used in any calls to arrow that the - UDF makes if that call accepts a memory_pool. - """ + def memory_pool(self) -> lib.MemoryPool: ... # ==================== _compute.pyx Expression ==================== -class Expression(lib._Weakrefable): - """ - A logical expression to be evaluated against some input. - - To create an expression: - - - Use the factory function ``pyarrow.compute.scalar()`` to create a - scalar (not necessary when combined, see example below). - - Use the factory function ``pyarrow.compute.field()`` to reference - a field (column in table). - - Compare fields and scalars with ``<``, ``<=``, ``==``, ``>=``, ``>``. - - Combine expressions using python operators ``&`` (logical and), - ``|`` (logical or) and ``~`` (logical not). - Note: python keywords ``and``, ``or`` and ``not`` cannot be used - to combine expressions. - - Create expression predicates using Expression methods such as - ``pyarrow.compute.Expression.isin()``. - - Examples - -------- - - >>> import pyarrow.compute as pc - >>> (pc.field("a") < pc.scalar(3)) | (pc.field("b") > 7) - 7))> - >>> pc.field("a") != 3 - - >>> pc.field("a").isin([1, 2, 3]) - - """ + +class Expression(lib._Weakrefable): @staticmethod - def from_substrait(buffer: bytes | lib.Buffer) -> Expression: - """ - Deserialize an expression from Substrait - - The serialized message must be an ExtendedExpression message that has - only a single expression. The name of the expression and the schema - the expression was bound to will be ignored. Use - pyarrow.substrait.deserialize_expressions if this information is needed - or if the message might contain multiple expressions. - - Parameters - ---------- - message : bytes or Buffer or a protobuf Message - The Substrait message to deserialize - - Returns - ------- - Expression - The deserialized expression - """ - def to_substrait(self, schema: lib.Schema, allow_arrow_extensions: bool = False) -> lib.Buffer: - """ - Serialize the expression using Substrait - - The expression will be serialized as an ExtendedExpression message that has a - single expression named "expression" - - Parameters - ---------- - schema : Schema - The input schema the expression will be bound to - allow_arrow_extensions : bool, default False - If False then only functions that are part of the core Substrait function - definitions will be allowed. Set this to True to allow pyarrow-specific functions - but the result may not be accepted by other compute libraries. - - Returns - ------- - Buffer - A buffer containing the serialized Protobuf plan. - """ + def from_substrait(buffer: bytes | lib.Buffer) -> Expression: ... + def to_substrait(self, schema: lib.Schema, + allow_arrow_extensions: bool = False) -> lib.Buffer: ... + def __invert__(self) -> Expression: ... def __and__(self, other) -> Expression: ... def __or__(self, other) -> Expression: ... @@ -1656,83 +574,13 @@ class Expression(lib._Weakrefable): def __ge__(self, value: object) -> Expression: ... 
# type: ignore[override] def __le__(self, value: object) -> Expression: ... # type: ignore[override] def __truediv__(self, other) -> Expression: ... - def is_valid(self) -> bool: - """ - Check whether the expression is not-null (valid). - - This creates a new expression equivalent to calling the - `is_valid` compute function on this expression. - - Returns - ------- - is_valid : Expression - """ - def is_null(self, nan_is_null: bool = False) -> Expression: - """ - Check whether the expression is null. - - This creates a new expression equivalent to calling the - `is_null` compute function on this expression. - - Parameters - ---------- - nan_is_null : boolean, default False - Whether floating-point NaNs are considered null. - - Returns - ------- - is_null : Expression - """ - def is_nan(self) -> Expression: - """ - Check whether the expression is NaN. - - This creates a new expression equivalent to calling the - `is_nan` compute function on this expression. - - Returns - ------- - is_nan : Expression - """ + def is_valid(self) -> bool: ... + def is_null(self, nan_is_null: bool = False) -> Expression: ... + def is_nan(self) -> Expression: ... + def cast( self, type: lib.DataType, safe: bool = True, options: CastOptions | None = None - ) -> Expression: - """ - Explicitly set or change the expression's data type. - - This creates a new expression equivalent to calling the - `cast` compute function on this expression. - - Parameters - ---------- - type : DataType, default None - Type to cast array to. - safe : boolean, default True - Whether to check for conversion errors such as overflow. - options : CastOptions, default None - Additional checks pass by CastOptions - - Returns - ------- - cast : Expression - """ - def isin(self, values: lib.Array | Iterable) -> Expression: - """ - Check whether the expression is contained in values. - - This creates a new expression equivalent to calling the - `is_in` compute function on this expression. - - Parameters - ---------- - values : Array or iterable - The values to check for. - - Returns - ------- - isin : Expression - A new expression that, when evaluated, checks whether - this expression's value is contained in `values`. - """ + ) -> Expression: ... + def isin(self, values: lib.Array | Iterable) -> Expression: ... # ==================== _compute.py ==================== diff --git a/python/pyarrow-stubs/_csv.pyi b/python/pyarrow-stubs/_csv.pyi index c490d6be93a..c62ae725ec1 100644 --- a/python/pyarrow-stubs/_csv.pyi +++ b/python/pyarrow-stubs/_csv.pyi @@ -22,97 +22,9 @@ from _typeshed import StrPath from . import lib + @dataclass(kw_only=True) class ReadOptions(lib._Weakrefable): - """ - Options for reading CSV files. - - Parameters - ---------- - use_threads : bool, optional (default True) - Whether to use multiple threads to accelerate reading - block_size : int, optional - How much bytes to process at a time from the input stream. - This will determine multi-threading granularity as well as - the size of individual record batches or table chunks. - Minimum valid value for block size is 1 - skip_rows : int, optional (default 0) - The number of rows to skip before the column names (if any) - and the CSV data. - skip_rows_after_names : int, optional (default 0) - The number of rows to skip after the column names. - This number can be larger than the number of rows in one - block, and empty rows are counted. 
- The order of application is as follows: - - `skip_rows` is applied (if non-zero); - - column names are read (unless `column_names` is set); - - `skip_rows_after_names` is applied (if non-zero). - column_names : list, optional - The column names of the target table. If empty, fall back on - `autogenerate_column_names`. - autogenerate_column_names : bool, optional (default False) - Whether to autogenerate column names if `column_names` is empty. - If true, column names will be of the form "f0", "f1"... - If false, column names will be read from the first CSV row - after `skip_rows`. - encoding : str, optional (default 'utf8') - The character encoding of the CSV data. Columns that cannot - decode using this encoding can still be read as Binary. - - Examples - -------- - - Defining an example data: - - >>> import io - >>> s = "1,2,3\\nFlamingo,2,2022-03-01\\nHorse,4,2022-03-02\\nBrittle stars,5,2022-03-03\\nCentipede,100,2022-03-04" - >>> print(s) - 1,2,3 - Flamingo,2,2022-03-01 - Horse,4,2022-03-02 - Brittle stars,5,2022-03-03 - Centipede,100,2022-03-04 - - Ignore the first numbered row and substitute it with defined - or autogenerated column names: - - >>> from pyarrow import csv - >>> read_options = csv.ReadOptions(column_names=["animals", "n_legs", "entry"], skip_rows=1) - >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) - pyarrow.Table - animals: string - n_legs: int64 - entry: date32[day] - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - n_legs: [[2,4,5,100]] - entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] - - >>> read_options = csv.ReadOptions(autogenerate_column_names=True, skip_rows=1) - >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) - pyarrow.Table - f0: string - f1: int64 - f2: date32[day] - ---- - f0: [["Flamingo","Horse","Brittle stars","Centipede"]] - f1: [[2,4,5,100]] - f2: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] - - Remove the first 2 rows of the data: - - >>> read_options = csv.ReadOptions(skip_rows_after_names=2) - >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) - pyarrow.Table - 1: string - 2: int64 - 3: date32[day] - ---- - 1: [["Brittle stars","Centipede"]] - 2: [[5,100]] - 3: [[2022-03-03,2022-03-04]] - """ - use_threads: bool = field(default=True, kw_only=False) block_size: int | None = None skip_rows: int = 0 @@ -120,84 +32,11 @@ class ReadOptions(lib._Weakrefable): column_names: list[str] | None = None autogenerate_column_names: bool = False encoding: str = "utf8" - def validate(self) -> None: ... + @dataclass(kw_only=True) class ParseOptions(lib._Weakrefable): - """ - Options for parsing CSV files. - - Parameters - ---------- - delimiter : 1-character string, optional (default ',') - The character delimiting individual cells in the CSV data. - quote_char : 1-character string or False, optional (default '"') - The character used optionally for quoting CSV values - (False if quoting is not allowed). - double_quote : bool, optional (default True) - Whether two quotes in a quoted CSV value denote a single quote - in the data. - escape_char : 1-character string or False, optional (default False) - The character used optionally for escaping special characters - (False if escaping is not allowed). - newlines_in_values : bool, optional (default False) - Whether newline characters are allowed in CSV values. - Setting this to True reduces the performance of multi-threaded - CSV reading. 
- ignore_empty_lines : bool, optional (default True) - Whether empty lines are ignored in CSV input. - If False, an empty line is interpreted as containing a single empty - value (assuming a one-column CSV file). - invalid_row_handler : callable, optional (default None) - If not None, this object is called for each CSV row that fails - parsing (because of a mismatching number of columns). - It should accept a single InvalidRow argument and return either - "skip" or "error" depending on the desired outcome. - - Examples - -------- - - Defining an example file from bytes object: - - >>> import io - >>> s = ( - ... "animals;n_legs;entry\\n" - ... "Flamingo;2;2022-03-01\\n" - ... "# Comment here:\\n" - ... "Horse;4;2022-03-02\\n" - ... "Brittle stars;5;2022-03-03\\n" - ... "Centipede;100;2022-03-04" - ... ) - >>> print(s) - animals;n_legs;entry - Flamingo;2;2022-03-01 - # Comment here: - Horse;4;2022-03-02 - Brittle stars;5;2022-03-03 - Centipede;100;2022-03-04 - >>> source = io.BytesIO(s.encode()) - - Read the data from a file skipping rows with comments - and defining the delimiter: - - >>> from pyarrow import csv - >>> def skip_comment(row): - ... if row.text.startswith("# "): - ... return "skip" - ... else: - ... return "error" - >>> parse_options = csv.ParseOptions(delimiter=";", invalid_row_handler=skip_comment) - >>> csv.read_csv(source, parse_options=parse_options) - pyarrow.Table - animals: string - n_legs: int64 - entry: date32[day] - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - n_legs: [[2,4,5,100]] - entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] - """ delimiter: str = field(default=",", kw_only=False) quote_char: str | Literal[False] = '"' @@ -209,210 +48,9 @@ class ParseOptions(lib._Weakrefable): def validate(self) -> None: ... + @dataclass(kw_only=True) class ConvertOptions(lib._Weakrefable): - """ - Options for converting CSV data. - - Parameters - ---------- - check_utf8 : bool, optional (default True) - Whether to check UTF8 validity of string columns. - column_types : pyarrow.Schema or dict, optional - Explicitly map column names to column types. Passing this argument - disables type inference on the defined columns. - null_values : list, optional - A sequence of strings that denote nulls in the data - (defaults are appropriate in most cases). Note that by default, - string columns are not checked for null values. To enable - null checking for those, specify ``strings_can_be_null=True``. - true_values : list, optional - A sequence of strings that denote true booleans in the data - (defaults are appropriate in most cases). - false_values : list, optional - A sequence of strings that denote false booleans in the data - (defaults are appropriate in most cases). - decimal_point : 1-character string, optional (default '.') - The character used as decimal point in floating-point and decimal - data. - strings_can_be_null : bool, optional (default False) - Whether string / binary columns can have null values. - If true, then strings in null_values are considered null for - string columns. - If false, then all strings are valid string values. - quoted_strings_can_be_null : bool, optional (default True) - Whether quoted values can be null. - If true, then strings in "null_values" are also considered null - when they appear quoted in the CSV file. Otherwise, quoted values - are never considered null. - include_columns : list, optional - The names of columns to include in the Table. - If empty, the Table will include all columns from the CSV file. 
- If not empty, only these columns will be included, in this order. - include_missing_columns : bool, optional (default False) - If false, columns in `include_columns` but not in the CSV file will - error out. - If true, columns in `include_columns` but not in the CSV file will - produce a column of nulls (whose type is selected using - `column_types`, or null by default). - This option is ignored if `include_columns` is empty. - auto_dict_encode : bool, optional (default False) - Whether to try to automatically dict-encode string / binary data. - If true, then when type inference detects a string or binary column, - it it dict-encoded up to `auto_dict_max_cardinality` distinct values - (per chunk), after which it switches to regular encoding. - This setting is ignored for non-inferred columns (those in - `column_types`). - auto_dict_max_cardinality : int, optional - The maximum dictionary cardinality for `auto_dict_encode`. - This value is per chunk. - timestamp_parsers : list, optional - A sequence of strptime()-compatible format strings, tried in order - when attempting to infer or convert timestamp values (the special - value ISO8601() can also be given). By default, a fast built-in - ISO-8601 parser is used. - - Examples - -------- - - Defining an example data: - - >>> import io - >>> s = ( - ... "animals,n_legs,entry,fast\\n" - ... "Flamingo,2,01/03/2022,Yes\\n" - ... "Horse,4,02/03/2022,Yes\\n" - ... "Brittle stars,5,03/03/2022,No\\n" - ... "Centipede,100,04/03/2022,No\\n" - ... ",6,05/03/2022," - ... ) - >>> print(s) - animals,n_legs,entry,fast - Flamingo,2,01/03/2022,Yes - Horse,4,02/03/2022,Yes - Brittle stars,5,03/03/2022,No - Centipede,100,04/03/2022,No - ,6,05/03/2022, - - Change the type of a column: - - >>> import pyarrow as pa - >>> from pyarrow import csv - >>> convert_options = csv.ConvertOptions(column_types={"n_legs": pa.float64()}) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - n_legs: double - entry: string - fast: string - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] - n_legs: [[2,4,5,100,6]] - entry: [["01/03/2022","02/03/2022","03/03/2022","04/03/2022","05/03/2022"]] - fast: [["Yes","Yes","No","No",""]] - - Define a date parsing format to get a timestamp type column - (in case dates are not in ISO format and not converted by default): - - >>> convert_options = csv.ConvertOptions(timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"]) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - n_legs: int64 - entry: timestamp[s] - fast: string - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] - n_legs: [[2,4,5,100,6]] - entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] - fast: [["Yes","Yes","No","No",""]] - - Specify a subset of columns to be read: - - >>> convert_options = csv.ConvertOptions(include_columns=["animals", "n_legs"]) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - n_legs: int64 - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] - n_legs: [[2,4,5,100,6]] - - List additional column to be included as a null typed column: - - >>> convert_options = csv.ConvertOptions( - ... include_columns=["animals", "n_legs", "location"], include_missing_columns=True - ... 
) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - n_legs: int64 - location: null - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] - n_legs: [[2,4,5,100,6]] - location: [5 nulls] - - Define columns as dictionary type (by default only the - string/binary columns are dictionary encoded): - - >>> convert_options = csv.ConvertOptions( - ... timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"], auto_dict_encode=True - ... ) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: dictionary - n_legs: int64 - entry: timestamp[s] - fast: dictionary - ---- - animals: [ -- dictionary: - ["Flamingo","Horse","Brittle stars","Centipede",""] -- indices: - [0,1,2,3,4]] - n_legs: [[2,4,5,100,6]] - entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] - fast: [ -- dictionary: - ["Yes","No",""] -- indices: - [0,0,1,1,2]] - - Set upper limit for the number of categories. If the categories - is more than the limit, the conversion to dictionary will not - happen: - - >>> convert_options = csv.ConvertOptions( - ... include_columns=["animals"], auto_dict_encode=True, auto_dict_max_cardinality=2 - ... ) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] - - Set empty strings to missing values: - - >>> convert_options = csv.ConvertOptions( - ... include_columns=["animals", "n_legs"], strings_can_be_null=True - ... ) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - n_legs: int64 - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",null]] - n_legs: [[2,4,5,100,6]] - - Define values to be True and False when converting a column - into a bool type: - - >>> convert_options = csv.ConvertOptions( - ... include_columns=["fast"], false_values=["No"], true_values=["Yes"] - ... ) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - fast: bool - ---- - fast: [[true,true,false,false,null]] - """ check_utf8: bool = field(default=True, kw_only=False) column_types: lib.Schema | dict | None = None @@ -430,30 +68,9 @@ class ConvertOptions(lib._Weakrefable): def validate(self) -> None: ... + @dataclass(kw_only=True) class WriteOptions(lib._Weakrefable): - """ - Options for writing CSV files. - - Parameters - ---------- - include_header : bool, optional (default True) - Whether to write an initial header line with column names - batch_size : int, optional (default 1024) - How many rows to process together when converting and writing - CSV data - delimiter : 1-character string, optional (default ",") - The character delimiting individual cells in the CSV data. - quoting_style : str, optional (default "needed") - Whether to quote values, and if so, which quoting style to use. - The following values are accepted: - - - "needed" (default): only enclose values in quotes when needed. - - "all_valid": enclose all valid values in quotes; nulls are not quoted. - - "none": do not enclose any values in quotes; values containing - special characters (such as quotes, cell delimiters or line endings) - will raise an error. - """ include_header: bool = field(default=True, kw_only=False) batch_size: int = 1024 @@ -462,43 +79,17 @@ class WriteOptions(lib._Weakrefable): def validate(self) -> None: ... 
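A condensed usage sketch for the CSV option classes stubbed here, drawn from the doctests removed elsewhere in this module:

    import io
    import pyarrow as pa
    from pyarrow import csv

    data = b"animals,n_legs\nFlamingo,2\nCentipede,100"

    # Skip the header row and supply explicit column names via ReadOptions.
    read_options = csv.ReadOptions(column_names=["animals", "n_legs"], skip_rows=1)
    table = csv.read_csv(io.BytesIO(data), read_options=read_options)

    # Write it back out without a header line.
    csv.write_csv(
        table, "animals.csv", write_options=csv.WriteOptions(include_header=False)
    )
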
+ @dataclass class InvalidRow(lib._Weakrefable): - """ - Description of an invalid row in a CSV file. - - Parameters - ---------- - expected_columns : int - The expected number of columns in the row. - actual_columns : int - The actual number of columns in the row. - number : int or None - The physical row number if known, otherwise None. - text : str - The contents of the row. - """ expected_columns: int actual_columns: int number: int | None text: str + class CSVWriter(lib._CRecordBatchWriter): - """ - Writer to create a CSV file. - - Parameters - ---------- - sink : str, path, pyarrow.OutputStream or file-like object - The location where to write the CSV data. - schema : pyarrow.Schema - The schema of the data to be written. - write_options : pyarrow.csv.WriteOptions - Options to configure writing the CSV data. - memory_pool : MemoryPool, optional - Pool for temporary allocations. - """ def __init__( self, @@ -510,44 +101,22 @@ class CSVWriter(lib._CRecordBatchWriter): memory_pool: lib.MemoryPool | None = None, ) -> None: ... -class CSVStreamingReader(lib.RecordBatchReader): ... + +class CSVStreamingReader(lib.RecordBatchReader): + ... + ISO8601: lib._Weakrefable + def open_csv( input_file: StrPath | IO[Any], read_options: ReadOptions | None = None, parse_options: ParseOptions | None = None, convert_options: ConvertOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> CSVStreamingReader: - """ - Open a streaming reader of CSV data. - - Reading using this function is always single-threaded. - - Parameters - ---------- - input_file : string, path or file-like object - The location of CSV data. If a string or path, and if it ends - with a recognized compressed file extension (e.g. ".gz" or ".bz2"), - the data is automatically decompressed when reading. - read_options : pyarrow.csv.ReadOptions, optional - Options for the CSV reader (see pyarrow.csv.ReadOptions constructor - for defaults) - parse_options : pyarrow.csv.ParseOptions, optional - Options for the CSV parser - (see pyarrow.csv.ParseOptions constructor for defaults) - convert_options : pyarrow.csv.ConvertOptions, optional - Options for converting CSV data - (see pyarrow.csv.ConvertOptions constructor for defaults) - memory_pool : MemoryPool, optional - Pool to allocate RecordBatch memory from - - Returns - ------- - :class:`pyarrow.csv.CSVStreamingReader` - """ +) -> CSVStreamingReader: ... + def read_csv( input_file: StrPath | IO[Any], @@ -555,104 +124,12 @@ def read_csv( parse_options: ParseOptions | None = None, convert_options: ConvertOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Table: - """ - Read a Table from a stream of CSV data. - - Parameters - ---------- - input_file : string, path or file-like object - The location of CSV data. If a string or path, and if it ends - with a recognized compressed file extension (e.g. ".gz" or ".bz2"), - the data is automatically decompressed when reading. - read_options : pyarrow.csv.ReadOptions, optional - Options for the CSV reader (see pyarrow.csv.ReadOptions constructor - for defaults) - parse_options : pyarrow.csv.ParseOptions, optional - Options for the CSV parser - (see pyarrow.csv.ParseOptions constructor for defaults) - convert_options : pyarrow.csv.ConvertOptions, optional - Options for converting CSV data - (see pyarrow.csv.ConvertOptions constructor for defaults) - memory_pool : MemoryPool, optional - Pool to allocate Table memory from - - Returns - ------- - :class:`pyarrow.Table` - Contents of the CSV file as a in-memory table. 
- - Examples - -------- - - Defining an example file from bytes object: - - >>> import io - >>> s = ( - ... "animals,n_legs,entry\\n" - ... "Flamingo,2,2022-03-01\\n" - ... "Horse,4,2022-03-02\\n" - ... "Brittle stars,5,2022-03-03\\n" - ... "Centipede,100,2022-03-04" - ... ) - >>> print(s) - animals,n_legs,entry - Flamingo,2,2022-03-01 - Horse,4,2022-03-02 - Brittle stars,5,2022-03-03 - Centipede,100,2022-03-04 - >>> source = io.BytesIO(s.encode()) - - Reading from the file - - >>> from pyarrow import csv - >>> csv.read_csv(source) - pyarrow.Table - animals: string - n_legs: int64 - entry: date32[day] - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - n_legs: [[2,4,5,100]] - entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] - """ +) -> lib.Table: ... + def write_csv( data: lib.RecordBatch | lib.Table, output_file: StrPath | lib.NativeFile | IO[Any], write_options: WriteOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> None: - """ - Write record batch or table to a CSV file. - - Parameters - ---------- - data : pyarrow.RecordBatch or pyarrow.Table - The data to write. - output_file : string, path, pyarrow.NativeFile, or file-like object - The location where to write the CSV data. - write_options : pyarrow.csv.WriteOptions - Options to configure writing the CSV data. - memory_pool : MemoryPool, optional - Pool for temporary allocations. - - Examples - -------- - - >>> import pyarrow as pa - >>> from pyarrow import csv - - >>> legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> entry_date = pa.array(["01/03/2022", "02/03/2022", "03/03/2022", "04/03/2022"]) - >>> table = pa.table([animals, legs, entry_date], names=["animals", "n_legs", "entry"]) - - >>> csv.write_csv(table, "animals.csv") - - >>> write_options = csv.WriteOptions(include_header=False) - >>> csv.write_csv(table, "animals.csv", write_options=write_options) - - >>> write_options = csv.WriteOptions(delimiter=";") - >>> csv.write_csv(table, "animals.csv", write_options=write_options) - """ +) -> None: ... diff --git a/python/pyarrow-stubs/_cuda.pyi b/python/pyarrow-stubs/_cuda.pyi index c96951b863c..3ec866ad668 100644 --- a/python/pyarrow-stubs/_cuda.pyi +++ b/python/pyarrow-stubs/_cuda.pyi @@ -24,278 +24,73 @@ from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-un from . import lib from ._stubs_typing import ArrayLike + class Context(lib._Weakrefable): - """ - CUDA driver context. - """ - - def __init__(self, device_number: int = 0, handle: int | None = None) -> None: - """ - Create a CUDA driver context for a particular device. - - If a CUDA context handle is passed, it is wrapped, otherwise - a default CUDA context for the given device is requested. - - Parameters - ---------- - device_number : int (default 0) - Specify the GPU device for which the CUDA driver context is - requested. - handle : int, optional - Specify CUDA handle for a shared context that has been created - by another library. - """ + + def __init__(self, device_number: int = 0, handle: int | None = None) -> None: ... + @staticmethod - def from_numba(context: _numba_driver.Context | None = None) -> Context: - """ - Create a Context instance from a Numba CUDA context. - - Parameters - ---------- - context : {numba.cuda.cudadrv.driver.Context, None} - A Numba CUDA context instance. - If None, the current Numba context is used. - - Returns - ------- - shared_context : pyarrow.cuda.Context - Context instance. 
- """ - def to_numba(self) -> _numba_driver.Context: - """ - Convert Context to a Numba CUDA context. - - Returns - ------- - context : numba.cuda.cudadrv.driver.Context - Numba CUDA context instance. - """ + def from_numba(context: _numba_driver.Context | None = None) -> Context: ... + + def to_numba(self) -> _numba_driver.Context: ... + @staticmethod - def get_num_devices() -> int: - """Return the number of GPU devices.""" + def get_num_devices() -> int: ... + @property - def device_number(self) -> int: - """Return context device number.""" + def device_number(self) -> int: ... + @property - def handle(self) -> int: - """Return pointer to context handle.""" - def synchronize(self) -> None: - """Blocks until the device has completed all preceding requested - tasks. - """ + def handle(self) -> int: ... + + def synchronize(self) -> None: ... + @property - def bytes_allocated(self) -> int: - """Return the number of allocated bytes.""" - def get_device_address(self, address: int) -> int: - """Return the device address that is reachable from kernels running in - the context - - Parameters - ---------- - address : int - Specify memory address value - - Returns - ------- - device_address : int - Device address accessible from device context - - Notes - ----- - The device address is defined as a memory address accessible - by device. While it is often a device memory address but it - can be also a host memory address, for instance, when the - memory is allocated as host memory (using cudaMallocHost or - cudaHostAlloc) or as managed memory (using cudaMallocManaged) - or the host memory is page-locked (using cudaHostRegister). - """ - def new_buffer(self, nbytes: int) -> CudaBuffer: - """Return new device buffer. - - Parameters - ---------- - nbytes : int - Specify the number of bytes to be allocated. - - Returns - ------- - buf : CudaBuffer - Allocated buffer. - """ + def bytes_allocated(self) -> int: ... + + def get_device_address(self, address: int) -> int: ... + + def new_buffer(self, nbytes: int) -> CudaBuffer: ... + @property - def memory_manager(self) -> lib.MemoryManager: - """ - The default memory manager tied to this context's device. - - Returns - ------- - MemoryManager - """ + def memory_manager(self) -> lib.MemoryManager: ... + @property - def device(self) -> lib.Device: - """ - The device instance associated with this context. - - Returns - ------- - Device - """ - def foreign_buffer(self, address: int, size: int, base: Any | None = None) -> CudaBuffer: - """ - Create device buffer from address and size as a view. - - The caller is responsible for allocating and freeing the - memory. When `address==size==0` then a new zero-sized buffer - is returned. - - Parameters - ---------- - address : int - Specify the starting address of the buffer. The address can - refer to both device or host memory but it must be - accessible from device after mapping it with - `get_device_address` method. - size : int - Specify the size of device buffer in bytes. - base : {None, object} - Specify object that owns the referenced memory. - - Returns - ------- - cbuf : CudaBuffer - Device buffer as a view of device reachable memory. - - """ - def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: - """Open existing CUDA IPC memory handle - - Parameters - ---------- - ipc_handle : IpcMemHandle - Specify opaque pointer to CUipcMemHandle (driver API). - - Returns - ------- - buf : CudaBuffer - referencing device buffer - """ + def device(self) -> lib.Device: ... 
+ + def foreign_buffer(self, address: int, size: int, base: Any | + None = None) -> CudaBuffer: ... + + def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: ... + def buffer_from_data( self, data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, offset: int = 0, size: int = -1, - ) -> CudaBuffer: - """Create device buffer and initialize with data. - - Parameters - ---------- - data : {CudaBuffer, HostBuffer, Buffer, array-like} - Specify data to be copied to device buffer. - offset : int - Specify the offset of input buffer for device data - buffering. Default: 0. - size : int - Specify the size of device buffer in bytes. Default: all - (starting from input offset) - - Returns - ------- - cbuf : CudaBuffer - Device buffer with copied data. - """ - def buffer_from_object(self, obj: Any) -> CudaBuffer: - """Create device buffer view of arbitrary object that references - device accessible memory. - - When the object contains a non-contiguous view of device - accessible memory then the returned device buffer will contain - contiguous view of the memory, that is, including the - intermediate data that is otherwise invisible to the input - object. - - Parameters - ---------- - obj : {object, Buffer, HostBuffer, CudaBuffer, ...} - Specify an object that holds (device or host) address that - can be accessed from device. This includes objects with - types defined in pyarrow.cuda as well as arbitrary objects - that implement the CUDA array interface as defined by numba. - - Returns - ------- - cbuf : CudaBuffer - Device buffer as a view of device accessible memory. - - """ + ) -> CudaBuffer: ... + + def buffer_from_object(self, obj: Any) -> CudaBuffer: ... + class IpcMemHandle(lib._Weakrefable): - """A serializable container for a CUDA IPC handle.""" + @staticmethod - def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: - """Create IpcMemHandle from opaque buffer (e.g. from another - process) - - Parameters - ---------- - opaque_handle : - a CUipcMemHandle as a const void* - - Returns - ------- - ipc_handle : IpcMemHandle - """ - def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: - """Write IpcMemHandle to a Buffer - - Parameters - ---------- - pool : {MemoryPool, None} - Specify a pool to allocate memory from - - Returns - ------- - buf : Buffer - The serialized buffer. - """ + def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: ... -class CudaBuffer(lib.Buffer): - """An Arrow buffer with data located in a GPU device. + def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: ... - To create a CudaBuffer instance, use Context.device_buffer(). - The memory allocated in a CudaBuffer is freed when the buffer object - is deleted. - """ +class CudaBuffer(lib.Buffer): @staticmethod - def from_buffer(buf: lib.Buffer) -> CudaBuffer: - """Convert back generic buffer into CudaBuffer - - Parameters - ---------- - buf : Buffer - Specify buffer containing CudaBuffer - - Returns - ------- - dbuf : CudaBuffer - Resulting device buffer. - """ + def from_buffer(buf: lib.Buffer) -> CudaBuffer: ... + @staticmethod - def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: - """Create a CudaBuffer view from numba MemoryPointer instance. - - Parameters - ---------- - mem : numba.cuda.cudadrv.driver.MemoryPointer - - Returns - ------- - cbuf : CudaBuffer - Device buffer as a view of numba MemoryPointer. 
- """ - def to_numba(self) -> _numba_driver.MemoryPointer: - """Return numba memory pointer of CudaBuffer instance.""" + def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: ... + + def to_numba(self) -> _numba_driver.MemoryPointer: ... + def copy_to_host( self, position: int = 0, @@ -303,243 +98,62 @@ class CudaBuffer(lib.Buffer): buf: lib.Buffer | None = None, memory_pool: lib.MemoryPool | None = None, resizable: bool = False, - ) -> lib.Buffer: - """Copy memory from GPU device to CPU host - - Caller is responsible for ensuring that all tasks affecting - the memory are finished. Use - - `.context.synchronize()` - - when needed. - - Parameters - ---------- - position : int - Specify the starting position of the source data in GPU - device buffer. Default: 0. - nbytes : int - Specify the number of bytes to copy. Default: -1 (all from - the position until host buffer is full). - buf : Buffer - Specify a pre-allocated output buffer in host. Default: None - (allocate new output buffer). - memory_pool : MemoryPool - resizable : bool - Specify extra arguments to allocate_buffer. Used only when - buf is None. - - Returns - ------- - buf : Buffer - Output buffer in host. - - """ + ) -> lib.Buffer: ... + def copy_from_host( self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 - ) -> int: - """Copy data from host to device. - - The device buffer must be pre-allocated. - - Parameters - ---------- - data : {Buffer, array-like} - Specify data in host. It can be array-like that is valid - argument to py_buffer - position : int - Specify the starting position of the copy in device buffer. - Default: 0. - nbytes : int - Specify the number of bytes to copy. Default: -1 (all from - source until device buffer, starting from position, is full) - - Returns - ------- - nbytes : int - Number of bytes copied. - """ - def copy_from_device(self, buf: CudaBuffer, position: int = 0, nbytes: int = -1) -> int: - """Copy data from device to device. - - Parameters - ---------- - buf : CudaBuffer - Specify source device buffer. - position : int - Specify the starting position of the copy in device buffer. - Default: 0. - nbytes : int - Specify the number of bytes to copy. Default: -1 (all from - source until device buffer, starting from position, is full) - - Returns - ------- - nbytes : int - Number of bytes copied. - - """ - def export_for_ipc(self) -> IpcMemHandle: - """ - Expose this device buffer as IPC memory which can be used in other - processes. - - After calling this function, this device memory will not be - freed when the CudaBuffer is destructed. - - Returns - ------- - ipc_handle : IpcMemHandle - The exported IPC handle - - """ + ) -> int: ... + + def copy_from_device(self, buf: CudaBuffer, position: int = 0, + nbytes: int = -1) -> int: ... + + def export_for_ipc(self) -> IpcMemHandle: ... + @property - def context(self) -> Context: - """Returns the CUDA driver context of this buffer.""" - def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: - """Return slice of device buffer - - Parameters - ---------- - offset : int, default 0 - Specify offset from the start of device buffer to slice - length : int, default None - Specify the length of slice (default is until end of device - buffer starting from offset). If the length is larger than - the data available, the returned slice will have a size of - the available data starting from the offset. - - Returns - ------- - sliced : CudaBuffer - Zero-copy slice of device buffer. 
- - """ - def to_pybytes(self) -> bytes: - """Return device buffer content as Python bytes.""" + def context(self) -> Context: ... -class HostBuffer(lib.Buffer): - """Device-accessible CPU memory created using cudaHostAlloc. + def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: ... - To create a HostBuffer instance, use + def to_pybytes(self) -> bytes: ... + + +class HostBuffer(lib.Buffer): - cuda.new_host_buffer() - """ @property def size(self) -> int: ... + class BufferReader(lib.NativeFile): - """File interface for zero-copy read from CUDA buffers. - Note: Read methods return pointers to device memory. This means - you must be careful using this interface with any Arrow code which - may expect to be able to do anything other than pointer arithmetic - on the returned buffers. - """ def __init__(self, obj: CudaBuffer) -> None: ... - def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: - """Return a slice view of the underlying device buffer. - - The slice will start at the current reader position and will - have specified size in bytes. + def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: ... - Parameters - ---------- - nbytes : int, default None - Specify the number of bytes to read. Default: None (read all - remaining bytes). - - Returns - ------- - cbuf : CudaBuffer - New device buffer. - - """ class BufferWriter(lib.NativeFile): - """File interface for writing to CUDA buffers. - By default writes are unbuffered. Use set_buffer_size to enable - buffering. - """ def __init__(self, obj: CudaBuffer) -> None: ... - def writeat(self, position: int, data: ArrayLike) -> None: - """Write data to buffer starting from position. - - Parameters - ---------- - position : int - Specify device buffer position where the data will be - written. - data : array-like - Specify data, the data instance must implement buffer - protocol. - """ + def writeat(self, position: int, data: ArrayLike) -> None: ... + @property - def buffer_size(self) -> int: - """Returns size of host (CPU) buffer, 0 for unbuffered""" + def buffer_size(self) -> int: ... + @buffer_size.setter - def buffer_size(self, buffer_size: int): - """Set CPU buffer size to limit calls to cudaMemcpy - - Parameters - ---------- - buffer_size : int - Specify the size of CPU buffer to allocate in bytes. - """ + def buffer_size(self, buffer_size: int): ... + @property - def num_bytes_buffered(self) -> int: - """Returns number of bytes buffered on host""" - -def new_host_buffer(size: int, device: int = 0) -> HostBuffer: - """Return buffer with CUDA-accessible memory on CPU host - - Parameters - ---------- - size : int - Specify the number of bytes to be allocated. - device : int - Specify GPU device number. - - Returns - ------- - dbuf : HostBuffer - Allocated host buffer - """ - -def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: - """Write record batch message to GPU device memory - - Parameters - ---------- - batch : RecordBatch - Record batch to write - ctx : Context - CUDA Context to allocate device memory from - - Returns - ------- - dbuf : CudaBuffer - device buffer which contains the record batch message - """ + def num_bytes_buffered(self) -> int: ... + + +def new_host_buffer(size: int, device: int = 0) -> HostBuffer: ... + + +def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: ... 
+ def read_message( source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryManager | None = None -) -> lib.Message: - """Read Arrow IPC message located on GPU device - - Parameters - ---------- - source : {CudaBuffer, cuda.BufferReader} - Device buffer or reader of device buffer. - pool : MemoryPool (optional) - Pool to allocate CPU memory for the metadata - - Returns - ------- - message : Message - The deserialized message, body still on device - """ +) -> lib.Message: ... + def read_record_batch( buffer: lib.Buffer, @@ -547,27 +161,4 @@ def read_record_batch( *, dictionary_memo: lib.DictionaryMemo | None = None, pool: lib.MemoryPool | None = None, -) -> lib.RecordBatch: - """Construct RecordBatch referencing IPC message located on CUDA device. - - While the metadata is copied to host memory for deserialization, - the record batch data remains on the device. - - Parameters - ---------- - buffer : - Device buffer containing the complete IPC message - schema : Schema - The schema for the record batch - dictionary_memo : DictionaryMemo, optional - If message contains dictionaries, must pass a populated - DictionaryMemo - pool : MemoryPool (optional) - Pool to allocate metadata from - - Returns - ------- - batch : RecordBatch - Reconstructed record batch, with device pointers - - """ +) -> lib.RecordBatch: ... diff --git a/python/pyarrow-stubs/_dataset.pyi b/python/pyarrow-stubs/_dataset.pyi index 3665bdba00b..c3b3c4d9bec 100644 --- a/python/pyarrow-stubs/_dataset.pyi +++ b/python/pyarrow-stubs/_dataset.pyi @@ -42,48 +42,16 @@ from .acero import ExecNodeOptions from .compute import Expression from .ipc import IpcWriteOptions, RecordBatchReader -class Dataset(lib._Weakrefable): - """ - Collection of data fragments and potentially child datasets. - Arrow Datasets allow you to query against data that has been split across - multiple files. This sharding of data may indicate partitioning, which - can accelerate queries that only touch some partitions (files). - """ +class Dataset(lib._Weakrefable): @property - def partition_expression(self) -> Expression: - """ - An Expression which evaluates to true for all data viewed by this - Dataset. - """ - def replace_schema(self, schema: lib.Schema) -> None: - """ - Return a copy of this Dataset with a different schema. - - The copy will view the same Fragments. If the new schema is not - compatible with the original dataset's schema then an error will - be raised. - - Parameters - ---------- - schema : Schema - The new dataset schema. - """ - def get_fragments(self, filter: Expression | None = None): - """Returns an iterator over the fragments in this dataset. - - Parameters - ---------- - filter : Expression, default None - Return fragments matching the optional filter, either using the - partition_expression or internal information like Parquet's - statistics. - - Returns - ------- - fragments : iterator of Fragment - """ + def partition_expression(self) -> Expression: ... + + def replace_schema(self, schema: lib.Schema) -> None: ... + + def get_fragments(self, filter: Expression | None = None): ... + def scanner( self, columns: list[str] | None = None, @@ -95,122 +63,8 @@ class Dataset(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: - """ - Build a scan operation against the dataset. - - Data is not loaded immediately. Instead, this produces a Scanner, - which exposes further operations (e.g. loading all data as a - table, counting rows). 
- - See the :meth:`Scanner.from_dataset` method for further information. - - Parameters - ---------- - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - scanner : Scanner - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "dataset_scanner.parquet") - - >>> import pyarrow.dataset as ds - >>> dataset = ds.dataset("dataset_scanner.parquet") - - Selecting a subset of the columns: - - >>> dataset.scanner(columns=["year", "n_legs"]).to_table() - pyarrow.Table - year: int64 - n_legs: int64 - ---- - year: [[2020,2022,2021,2022,2019,2021]] - n_legs: [[2,2,4,4,5,100]] - - Projecting selected columns using an expression: - - >>> dataset.scanner( - ... columns={ - ... "n_legs_uint": ds.field("n_legs").cast("uint8"), - ... } - ... 
).to_table() - pyarrow.Table - n_legs_uint: uint8 - ---- - n_legs_uint: [[2,2,4,4,5,100]] - - Filtering rows while scanning: - - >>> dataset.scanner(filter=ds.field("year") > 2020).to_table() - pyarrow.Table - year: int64 - n_legs: int64 - animal: string - ---- - year: [[2022,2021,2022,2021]] - n_legs: [[2,4,4,100]] - animal: [["Parrot","Dog","Horse","Centipede"]] - """ + ) -> Scanner: ... + def to_batches( self, columns: list[str] | None = None, @@ -222,65 +76,8 @@ class Dataset(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> Iterator[lib.RecordBatch]: - """ - Read the dataset as materialized record batches. - - Parameters - ---------- - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - record_batches : iterator of RecordBatch - """ + ) -> Iterator[lib.RecordBatch]: ... + def to_table( self, columns: list[str] | dict[str, Expression] | None = None, @@ -292,68 +89,8 @@ class Dataset(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Read the dataset to an Arrow table. 
- - Note that this method reads all the selected data from the dataset - into memory. - - Parameters - ---------- - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - table : Table - """ + ) -> lib.Table: ... + def take( self, indices: Indices, @@ -366,67 +103,8 @@ class Dataset(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Select rows of data by index. - - Parameters - ---------- - indices : Array or array-like - indices of rows to select in the dataset. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). 
- - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - table : Table - """ + ) -> lib.Table: ... + def head( self, num_rows: int, @@ -439,67 +117,8 @@ class Dataset(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Load the first N rows of the dataset. - - Parameters - ---------- - num_rows : int - The number of rows to load. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. 
- batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - table : Table - """ + ) -> lib.Table: ... + def count_rows( self, filter: Expression | None = None, @@ -510,82 +129,16 @@ class Dataset(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> int: - """ - Count rows matching the scanner filter. - - Parameters - ---------- - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - count : int - """ + ) -> int: ... + @property - def schema(self) -> lib.Schema: - """The common schema of the full Dataset""" - def filter(self, expression: Expression) -> Self: - """ - Apply a row filter to the dataset. - - Parameters - ---------- - expression : Expression - The filter that should be applied to the dataset. - - Returns - ------- - Dataset - """ - def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> InMemoryDataset: - """ - Sort the Dataset by one or multiple columns. 
- - Parameters - ---------- - sorting : str or list[tuple(name, order)] - Name of the column to use to sort (ascending), or - a list of multiple sorting conditions where - each entry is a tuple with column name - and sorting order ("ascending" or "descending") - **kwargs : dict, optional - Additional sorting options. - As allowed by :class:`SortOptions` - - Returns - ------- - InMemoryDataset - A new dataset sorted according to the sort keys. - """ + def schema(self) -> lib.Schema: ... + + def filter(self, expression: Expression) -> Self: ... + + def sort_by(self, sorting: str | + list[tuple[str, Order]], **kwargs) -> InMemoryDataset: ... + def join( self, right_dataset: Dataset, @@ -596,45 +149,8 @@ class Dataset(lib._Weakrefable): right_suffix: str | None = None, coalesce_keys: bool = True, use_threads: bool = True, - ) -> InMemoryDataset: - """ - Perform a join between this dataset and another one. - - Result of the join will be a new dataset, where further - operations can be applied. - - Parameters - ---------- - right_dataset : dataset - The dataset to join to the current one, acting as the right dataset - in the join operation. - keys : str or list[str] - The columns from current dataset that should be used as keys - of the join operation left side. - right_keys : str or list[str], default None - The columns from the right_dataset that should be used as keys - on the join operation right side. - When ``None`` use the same key names as the left dataset. - join_type : str, default "left outer" - The kind of join that should be performed, one of - ("left semi", "right semi", "left anti", "right anti", - "inner", "left outer", "right outer", "full outer") - left_suffix : str, default None - Which suffix to add to right column names. This prevents confusion - when the columns in left and right datasets have colliding names. - right_suffix : str, default None - Which suffix to add to the left column names. This prevents confusion - when the columns in left and right datasets have colliding names. - coalesce_keys : bool, default True - If the duplicated keys should be omitted from one of the sides - in the join result. - use_threads : bool, default True - Whenever to use multithreading or not. - - Returns - ------- - InMemoryDataset - """ + ) -> InMemoryDataset: ... + def join_asof( self, right_dataset: Dataset, @@ -643,114 +159,20 @@ class Dataset(lib._Weakrefable): tolerance: int, right_on: str | list[str] | None = None, right_by: str | list[str] | None = None, - ) -> InMemoryDataset: - """ - Perform an asof join between this dataset and another one. - - This is similar to a left-join except that we match on nearest key rather - than equal keys. Both datasets must be sorted by the key. This type of join - is most useful for time series data that are not perfectly aligned. - - Optionally match on equivalent keys with "by" before searching with "on". - - Result of the join will be a new Dataset, where further - operations can be applied. - - Parameters - ---------- - right_dataset : dataset - The dataset to join to the current one, acting as the right dataset - in the join operation. - on : str - The column from current dataset that should be used as the "on" key - of the join operation left side. - - An inexact match is used on the "on" key, i.e. a row is considered a - match if and only if left_on - tolerance <= right_on <= left_on. - - The input table must be sorted by the "on" key. Must be a single - field of a common type. 
- - Currently, the "on" key must be an integer, date, or timestamp type. - by : str or list[str] - The columns from current dataset that should be used as the keys - of the join operation left side. The join operation is then done - only for the matches in these columns. - tolerance : int - The tolerance for inexact "on" key matching. A right row is considered - a match with the left row `right.on - left.on <= tolerance`. The - `tolerance` may be: - - - negative, in which case a past-as-of-join occurs; - - or positive, in which case a future-as-of-join occurs; - - or zero, in which case an exact-as-of-join occurs. - - The tolerance is interpreted in the same units as the "on" key. - right_on : str or list[str], default None - The columns from the right_dataset that should be used as the on key - on the join operation right side. - When ``None`` use the same key name as the left dataset. - right_by : str or list[str], default None - The columns from the right_dataset that should be used as by keys - on the join operation right side. - When ``None`` use the same key names as the left dataset. - - Returns - ------- - InMemoryDataset - """ + ) -> InMemoryDataset: ... -class InMemoryDataset(Dataset): - """ - A Dataset wrapping in-memory data. - - Parameters - ---------- - source : RecordBatch, Table, list, tuple - The data for this dataset. Can be a RecordBatch, Table, list of - RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader - If an iterable is provided, the schema must also be provided. - schema : Schema, optional - Only required if passing an iterable as the source - """ -class UnionDataset(Dataset): - """ - A Dataset wrapping child datasets. +class InMemoryDataset(Dataset): + ... - Children's schemas must agree with the provided schema. - Parameters - ---------- - schema : Schema - A known schema to conform to. - children : list of Dataset - One or more input children - """ +class UnionDataset(Dataset): @property def children(self) -> list[Dataset]: ... + class FileSystemDataset(Dataset): - """ - A Dataset of file fragments. - - A FileSystemDataset is composed of one or more FileFragment. - - Parameters - ---------- - fragments : list[Fragments] - List of fragments to consume. - schema : Schema - The top-level schema of the Dataset. - format : FileFormat - File format of the fragments, currently only ParquetFileFormat, - IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. - filesystem : FileSystem - FileSystem of the fragments. - root_partition : Expression, optional - The top-level partition of the DataDataset. - """ def __init__( self, @@ -760,6 +182,7 @@ class FileSystemDataset(Dataset): filesystem: SupportedFileSystem | None = None, root_partition: Expression | None = None, ) -> None: ... + @classmethod def from_paths( cls, @@ -769,69 +192,30 @@ class FileSystemDataset(Dataset): filesystem: SupportedFileSystem | None = None, partitions: list[Expression] | None = None, root_partition: Expression | None = None, - ) -> FileSystemDataset: - """ - A Dataset created from a list of paths on a particular filesystem. - - Parameters - ---------- - paths : list of str - List of file paths to create the fragments from. - schema : Schema - The top-level schema of the DataDataset. - format : FileFormat - File format to create fragments from, currently only - ParquetFileFormat, IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. - filesystem : FileSystem - The filesystem which files are from. 
- partitions : list[Expression], optional - Attach additional partition information for the file paths. - root_partition : Expression, optional - The top-level partition of the DataDataset. - """ + ) -> FileSystemDataset: ... + @property def filesystem(self) -> FileSystem: ... @property - def partitioning(self) -> Partitioning | None: - """ - The partitioning of the Dataset source, if discovered. - - If the FileSystemDataset is created using the ``dataset()`` factory - function with a partitioning specified, this will return the - finalized Partitioning object from the dataset discovery. In all - other cases, this returns None. - """ + def partitioning(self) -> Partitioning | None: ... + @property - def files(self) -> list[str]: - """List of the files""" + def files(self) -> list[str]: ... + @property - def format(self) -> FileFormat: - """The FileFormat of this source.""" + def format(self) -> FileFormat: ... + class FileWriteOptions(lib._Weakrefable): @property def format(self) -> FileFormat: ... + class FileFormat(lib._Weakrefable): def inspect( self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None - ) -> lib.Schema: - """ - Infer the schema of a file. - - Parameters - ---------- - file : file-like object, path-like or str - The file or file path to infer a schema from. - filesystem : Filesystem, optional - If `filesystem` is given, `file` must be a string and specifies - the path of the file to read from the filesystem. - - Returns - ------- - schema : Schema - The schema inferred from the file - """ + ) -> lib.Schema: ... + def make_fragment( self, file: StrPath | IO, @@ -839,29 +223,8 @@ class FileFormat(lib._Weakrefable): partition_expression: Expression | None = None, *, file_size: int | None = None, - ) -> Fragment: - """ - Make a FileFragment from a given file. - - Parameters - ---------- - file : file-like object, path-like or str - The file or file path to make a fragment from. - filesystem : Filesystem, optional - If `filesystem` is given, `file` must be a string and specifies - the path of the file to read from the filesystem. - partition_expression : Expression, optional - An expression that is guaranteed true for all rows in the fragment. Allows - fragment to be potentially skipped while scanning with a filter. - file_size : int, optional - The size of the file in bytes. Can improve performance with high-latency filesystems - when file size needs to be known before reading. - - Returns - ------- - fragment : Fragment - The file fragment - """ + ) -> Fragment: ... + def make_write_options(self) -> FileWriteOptions: ... @property def default_extname(self) -> str: ... @@ -870,17 +233,15 @@ class FileFormat(lib._Weakrefable): @default_fragment_scan_options.setter def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... + class Fragment(lib._Weakrefable): - """Fragment of data from a Dataset.""" + @property - def physical_schema(self) -> lib.Schema: - """Return the physical schema of this Fragment. This schema can be - different from the dataset read schema.""" + def physical_schema(self) -> lib.Schema: ... + @property - def partition_expression(self) -> Expression: - """An Expression which evaluates to true for all data viewed by this - Fragment. - """ + def partition_expression(self) -> Expression: ... 
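A minimal sketch of FileFormat.inspect()/make_fragment() together with the Fragment properties stubbed here; the path "animals.csv" is a placeholder (it matches the file written in the csv.write_csv docstring example earlier):

import pyarrow.dataset as ds

fmt = ds.CsvFileFormat()
schema = fmt.inspect("animals.csv")        # infer the schema without a full read
frag = fmt.make_fragment("animals.csv")    # wrap the file as a scannable Fragment
print(frag.physical_schema)                # may differ from a dataset's unified schema
print(frag.partition_expression)           # trivially true for a standalone fragment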
+ def scanner( self, schema: lib.Schema | None = None, @@ -893,73 +254,8 @@ class Fragment(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: - """ - Build a scan operation against the fragment. - - Data is not loaded immediately. Instead, this produces a Scanner, - which exposes further operations (e.g. loading all data as a - table, counting rows). - - Parameters - ---------- - schema : Schema - Schema to use for scanning. This is used to unify a Fragment to - its Dataset's schema. If not specified this will use the - Fragment's physical schema which might differ for each Fragment. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - scanner : Scanner - """ + ) -> Scanner: ... + def to_batches( self, schema: lib.Schema | None = None, @@ -972,67 +268,8 @@ class Fragment(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> Iterator[lib.RecordBatch]: - """ - Read the fragment as materialized record batches. - - Parameters - ---------- - schema : Schema, optional - Concrete schema to use for scanning. 
- columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - record_batches : iterator of RecordBatch - """ + ) -> Iterator[lib.RecordBatch]: ... + def to_table( self, schema: lib.Schema | None = None, @@ -1045,70 +282,8 @@ class Fragment(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Convert this Fragment into a Table. - - Use this convenience utility with care. This will serially materialize - the Scan result in memory before creating the Table. - - Parameters - ---------- - schema : Schema, optional - Concrete schema to use for scanning. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. 
- - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - table : Table - """ + ) -> lib.Table: ... + def take( self, indices: Indices, @@ -1121,67 +296,8 @@ class Fragment(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Select rows of data by index. - - Parameters - ---------- - indices : Array or array-like - The indices of row to select in the dataset. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. 
- filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - Table - """ + ) -> lib.Table: ... + def head( self, num_rows: int, @@ -1194,67 +310,8 @@ class Fragment(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Load the first N rows of the fragment. - - Parameters - ---------- - num_rows : int - The number of rows to load. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. 
- fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - Table - """ + ) -> lib.Table: ... + def count_rows( self, columns: list[str] | None = None, @@ -1266,113 +323,52 @@ class Fragment(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> int: - """ - Count rows matching the scanner filter. - - Parameters - ---------- - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - count : int - """ + ) -> int: ... + class FileFragment(Fragment): - """A Fragment representing a data file.""" - def open(self) -> lib.NativeFile: - """ - Open a NativeFile of the buffer or file viewed by this fragment. - """ + def open(self) -> lib.NativeFile: ... + @property - def path(self) -> str: - """ - The path of the data file viewed by this fragment, if it views a - file. If instead it views a buffer, this will be "". - """ + def path(self) -> str: ... + @property - def filesystem(self) -> FileSystem: - """ - The FileSystem containing the data file viewed by this fragment, if - it views a file. If instead it views a buffer, this will be None. - """ + def filesystem(self) -> FileSystem: ... + @property - def buffer(self) -> lib.Buffer: - """ - The buffer viewed by this fragment, if it views a buffer. If - instead it views a file, this will be None. - """ + def buffer(self) -> lib.Buffer: ... 
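A minimal sketch of enumerating a dataset's FileFragments and reading the per-file properties stubbed here; "my_dataset/" is a hypothetical directory of CSV files:

import pyarrow.dataset as ds

dataset = ds.dataset("my_dataset/", format="csv")
for frag in dataset.get_fragments():       # one FileFragment per input file
    print(frag.path, frag.count_rows())    # source path and row count for each fragment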
+ @property - def format(self) -> FileFormat: - """ - The format of the data file viewed by this fragment. - """ + def format(self) -> FileFormat: ... + class FragmentScanOptions(lib._Weakrefable): - """Scan options specific to a particular fragment and scan operation.""" @property def type_name(self) -> str: ... + class IpcFileWriteOptions(FileWriteOptions): @property def write_options(self) -> IpcWriteOptions: ... @write_options.setter def write_options(self, write_options: IpcWriteOptions) -> None: ... + class IpcFileFormat(FileFormat): def equals(self, other: IpcFileFormat) -> bool: ... def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ... @property def default_extname(self) -> str: ... -class FeatherFileFormat(IpcFileFormat): ... + +class FeatherFileFormat(IpcFileFormat): + ... + class CsvFileFormat(FileFormat): - """ - FileFormat for CSV files. - - Parameters - ---------- - parse_options : pyarrow.csv.ParseOptions - Options regarding CSV parsing. - default_fragment_scan_options : CsvFragmentScanOptions - Default options for fragments scan. - convert_options : pyarrow.csv.ConvertOptions - Options regarding value conversion. - read_options : pyarrow.csv.ReadOptions - General read options. - """ + def __init__( self, parse_options: csv.ParseOptions | None = None, @@ -1387,17 +383,8 @@ class CsvFileFormat(FileFormat): def parse_options(self, parse_options: csv.ParseOptions) -> None: ... def equals(self, other: CsvFileFormat) -> bool: ... -class CsvFragmentScanOptions(FragmentScanOptions): - """ - Scan-specific options for CSV fragments. - Parameters - ---------- - convert_options : pyarrow.csv.ConvertOptions - Options regarding value conversion. - read_options : pyarrow.csv.ReadOptions - General read options. - """ +class CsvFragmentScanOptions(FragmentScanOptions): convert_options: csv.ConvertOptions read_options: csv.ReadOptions @@ -1407,22 +394,13 @@ class CsvFragmentScanOptions(FragmentScanOptions): ) -> None: ... def equals(self, other: CsvFragmentScanOptions) -> bool: ... + class CsvFileWriteOptions(FileWriteOptions): write_options: csv.WriteOptions + class JsonFileFormat(FileFormat): - """ - FileFormat for JSON files. - - Parameters - ---------- - default_fragment_scan_options : JsonFragmentScanOptions - Default options for fragments scan. - parse_options : pyarrow.json.ParseOptions - Options regarding json parsing. - read_options : pyarrow.json.ReadOptions - General read options. - """ + def __init__( self, default_fragment_scan_options: JsonFragmentScanOptions | None = None, @@ -1431,118 +409,38 @@ class JsonFileFormat(FileFormat): ) -> None: ... def equals(self, other: JsonFileFormat) -> bool: ... -class JsonFragmentScanOptions(FragmentScanOptions): - """ - Scan-specific options for JSON fragments. - Parameters - ---------- - parse_options : pyarrow.json.ParseOptions - Options regarding JSON parsing. - read_options : pyarrow.json.ReadOptions - General read options. - """ +class JsonFragmentScanOptions(FragmentScanOptions): parse_options: _json.ParseOptions read_options: _json.ReadOptions + def __init__( self, parse_options: _json.ParseOptions, read_options: _json.ReadOptions ) -> None: ... def equals(self, other: JsonFragmentScanOptions) -> bool: ... + class Partitioning(lib._Weakrefable): - def parse(self, path: str) -> Expression: - """ - Parse a path into a partition expression. 
- - Parameters - ---------- - path : str - - Returns - ------- - pyarrow.dataset.Expression - """ - def format(self, expr: Expression) -> tuple[str, str]: - """ - Convert a filter expression into a tuple of (directory, filename) using - the current partitioning scheme - - Parameters - ---------- - expr : pyarrow.dataset.Expression - - Returns - ------- - tuple[str, str] - - Examples - -------- - - Specify the Schema for paths like "/2009/June": - - >>> import pyarrow as pa - >>> import pyarrow.dataset as ds - >>> import pyarrow.compute as pc - >>> part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())])) - >>> part.format((pc.field("year") == 1862) & (pc.field("month") == "Jan")) - ('1862/Jan', '') - """ + def parse(self, path: str) -> Expression: ... + + def format(self, expr: Expression) -> tuple[str, str]: ... + @property - def schema(self) -> lib.Schema: - """The arrow Schema attached to the partitioning.""" + def schema(self) -> lib.Schema: ... + class PartitioningFactory(lib._Weakrefable): @property def type_name(self) -> str: ... + class KeyValuePartitioning(Partitioning): @property - def dictionaries(self) -> list[lib.Array | None]: - """ - The unique values for each partition field, if available. + def dictionaries(self) -> list[lib.Array | None]: ... - Those values are only available if the Partitioning object was - created through dataset discovery from a PartitioningFactory, or - if the dictionaries were manually specified in the constructor. - If no dictionary field is available, this returns an empty list. - """ class DirectoryPartitioning(KeyValuePartitioning): - """ - A Partitioning based on a specified Schema. - - The DirectoryPartitioning expects one segment in the file path for each - field in the schema (all fields are required to be present). - For example given schema the path "/2009/11" would - be parsed to ("year"_ == 2009 and "month"_ == 11). - - Parameters - ---------- - schema : Schema - The schema that describes the partitions present in the file path. - dictionaries : dict[str, Array] - If the type of any field of `schema` is a dictionary type, the - corresponding entry of `dictionaries` must be an array containing - every value which may be taken by the corresponding column or an - error will be raised in parsing. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - DirectoryPartitioning - - Examples - -------- - >>> from pyarrow.dataset import DirectoryPartitioning - >>> partitioning = DirectoryPartitioning( - ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) - ... ) - >>> print(partitioning.parse("/2009/11/")) - ((year == 2009) and (month == 11)) - """ @staticmethod def discover( @@ -1551,38 +449,8 @@ class DirectoryPartitioning(KeyValuePartitioning): max_partition_dictionary_size: int = 0, schema: lib.Schema | None = None, segment_encoding: Literal["uri", "none"] = "uri", - ) -> PartitioningFactory: - """ - Discover a DirectoryPartitioning. - - Parameters - ---------- - field_names : list of str - The names to associate with the values from the subdirectory names. - If schema is given, will be populated from the schema. - infer_dictionary : bool, default False - When inferring a schema for partition fields, yield dictionary - encoded types instead of plain types. 
This can be more efficient - when materializing virtual columns, and Expressions parsed by the - finished Partitioning will include dictionaries of all unique - inspected values for each field. - max_partition_dictionary_size : int, default 0 - Synonymous with infer_dictionary for backwards compatibility with - 1.0: setting this to -1 or None is equivalent to passing - infer_dictionary=True. - schema : Schema, default None - Use this schema instead of inferring a schema from partition - values. Partition values will be validated against this schema - before accumulation into the Partitioning's dictionary. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - PartitioningFactory - To be used in the FileSystemFactoryOptions. - """ + ) -> PartitioningFactory: ... + def __init__( self, schema: lib.Schema, @@ -1590,47 +458,9 @@ class DirectoryPartitioning(KeyValuePartitioning): segment_encoding: Literal["uri", "none"] = "uri", ) -> None: ... + class HivePartitioning(KeyValuePartitioning): - """ - A Partitioning for "/$key=$value/" nested directories as found in - Apache Hive. - - Multi-level, directory based partitioning scheme originating from - Apache Hive with all data files stored in the leaf directories. Data is - partitioned by static values of a particular column in the schema. - Partition keys are represented in the form $key=$value in directory names. - Field order is ignored, as are missing or unrecognized field names. - - For example, given schema, a possible - path would be "/year=2009/month=11/day=15". - - Parameters - ---------- - schema : Schema - The schema that describes the partitions present in the file path. - dictionaries : dict[str, Array] - If the type of any field of `schema` is a dictionary type, the - corresponding entry of `dictionaries` must be an array containing - every value which may be taken by the corresponding column or an - error will be raised in parsing. - null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" - If any field is None then this fallback will be used as a label - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - HivePartitioning - - Examples - -------- - >>> from pyarrow.dataset import HivePartitioning - >>> partitioning = HivePartitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())])) - >>> print(partitioning.parse("/year=2009/month=11/")) - ((year == 2009) and (month == 11)) - - """ + def __init__( self, schema: lib.Schema, @@ -1638,6 +468,7 @@ class HivePartitioning(KeyValuePartitioning): null_fallback: str = "__HIVE_DEFAULT_PARTITION__", segment_encoding: Literal["uri", "none"] = "uri", ) -> None: ... + @staticmethod def discover( infer_dictionary: bool = False, @@ -1645,75 +476,10 @@ class HivePartitioning(KeyValuePartitioning): null_fallback="__HIVE_DEFAULT_PARTITION__", schema: lib.Schema | None = None, segment_encoding: Literal["uri", "none"] = "uri", - ) -> PartitioningFactory: - """ - Discover a HivePartitioning. - - Parameters - ---------- - infer_dictionary : bool, default False - When inferring a schema for partition fields, yield dictionary - encoded types instead of plain. 
This can be more efficient when - materializing virtual columns, and Expressions parsed by the - finished Partitioning will include dictionaries of all unique - inspected values for each field. - max_partition_dictionary_size : int, default 0 - Synonymous with infer_dictionary for backwards compatibility with - 1.0: setting this to -1 or None is equivalent to passing - infer_dictionary=True. - null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" - When inferring a schema for partition fields this value will be - replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__ - for compatibility with Spark - schema : Schema, default None - Use this schema instead of inferring a schema from partition - values. Partition values will be validated against this schema - before accumulation into the Partitioning's dictionary. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - PartitioningFactory - To be used in the FileSystemFactoryOptions. - """ + ) -> PartitioningFactory: ... + class FilenamePartitioning(KeyValuePartitioning): - """ - A Partitioning based on a specified Schema. - - The FilenamePartitioning expects one segment in the file name for each - field in the schema (all fields are required to be present) separated - by '_'. For example given schema the name - ``"2009_11_"`` would be parsed to ("year" == 2009 and "month" == 11). - - Parameters - ---------- - schema : Schema - The schema that describes the partitions present in the file path. - dictionaries : dict[str, Array] - If the type of any field of `schema` is a dictionary type, the - corresponding entry of `dictionaries` must be an array containing - every value which may be taken by the corresponding column or an - error will be raised in parsing. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - FilenamePartitioning - - Examples - -------- - >>> from pyarrow.dataset import FilenamePartitioning - >>> partitioning = FilenamePartitioning( - ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) - ... ) - >>> print(partitioning.parse("2009_11_data.parquet")) - ((year == 2009) and (month == 11)) - """ def __init__( self, @@ -1721,99 +487,27 @@ class FilenamePartitioning(KeyValuePartitioning): dictionaries: dict[str, lib.Array] | None = None, segment_encoding: Literal["uri", "none"] = "uri", ) -> None: ... + @staticmethod def discover( field_names: list[str] | None = None, infer_dictionary: bool = False, schema: lib.Schema | None = None, segment_encoding: Literal["uri", "none"] = "uri", - ) -> PartitioningFactory: - """ - Discover a FilenamePartitioning. - - Parameters - ---------- - field_names : list of str - The names to associate with the values from the subdirectory names. - If schema is given, will be populated from the schema. - infer_dictionary : bool, default False - When inferring a schema for partition fields, yield dictionary - encoded types instead of plain types. This can be more efficient - when materializing virtual columns, and Expressions parsed by the - finished Partitioning will include dictionaries of all unique - inspected values for each field. - schema : Schema, default None - Use this schema instead of inferring a schema from partition - values. 
Partition values will be validated against this schema - before accumulation into the Partitioning's dictionary. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - PartitioningFactory - To be used in the FileSystemFactoryOptions. - """ + ) -> PartitioningFactory: ... + class DatasetFactory(lib._Weakrefable): - """ - DatasetFactory is used to create a Dataset, inspect the Schema - of the fragments contained in it, and declare a partitioning. - """ root_partition: Expression - def finish(self, schema: lib.Schema | None = None) -> Dataset: - """ - Create a Dataset using the inspected schema or an explicit schema - (if given). - - Parameters - ---------- - schema : Schema, default None - The schema to conform the source to. If None, the inspected - schema is used. - - Returns - ------- - Dataset - """ - def inspect(self) -> lib.Schema: - """ - Inspect all data fragments and return a common Schema. - - Returns - ------- - Schema - """ + def finish(self, schema: lib.Schema | None = None) -> Dataset: ... + + def inspect(self) -> lib.Schema: ... + def inspect_schemas(self) -> list[lib.Schema]: ... + class FileSystemFactoryOptions(lib._Weakrefable): - """ - Influences the discovery of filesystem paths. - - Parameters - ---------- - partition_base_dir : str, optional - For the purposes of applying the partitioning, paths will be - stripped of the partition_base_dir. Files not matching the - partition_base_dir prefix will be skipped for partitioning discovery. - The ignored files will still be part of the Dataset, but will not - have partition information. - partitioning : Partitioning/PartitioningFactory, optional - Apply the Partitioning to every discovered Fragment. See Partitioning or - PartitioningFactory documentation. - exclude_invalid_files : bool, optional (default True) - If True, invalid files will be excluded (file format specific check). - This will incur IO for each files in a serial and single threaded - fashion. Disabling this feature will skip the IO, but unsupported - files may be present in the Dataset (resulting in an error at scan - time). - selector_ignore_prefixes : list, optional - When discovering from a Selector (and not from an explicit file list), - ignore files and directories matching any of these prefixes. - By default this is ['.', '_']. - """ partitioning: Partitioning partitioning_factory: PartitioningFactory @@ -1829,21 +523,8 @@ class FileSystemFactoryOptions(lib._Weakrefable): selector_ignore_prefixes: list[str] | None = None, ) -> None: ... + class FileSystemDatasetFactory(DatasetFactory): - """ - Create a DatasetFactory from a list of paths with schema inspection. - - Parameters - ---------- - filesystem : pyarrow.fs.FileSystem - Filesystem to discover. - paths_or_selector : pyarrow.fs.FileSelector or list of path-likes - Either a Selector object or a list of path-like objects. - format : FileFormat - Currently only ParquetFileFormat and IpcFileFormat are supported. - options : FileSystemFactoryOptions, optional - Various flags influencing the discovery of filesystem paths. - """ def __init__( self, @@ -1853,50 +534,35 @@ class FileSystemDatasetFactory(DatasetFactory): options: FileSystemFactoryOptions | None = None, ) -> None: ... + class UnionDatasetFactory(DatasetFactory): - """ - Provides a way to inspect/discover a Dataset's expected schema before - materialization. 
- - Parameters - ---------- - factories : list of DatasetFactory - """ + def __init__(self, factories: list[DatasetFactory]) -> None: ... + _RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch) + class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]): - """An iterator over a sequence of record batches.""" + def __iter__(self) -> Self: ... def __next__(self) -> _RecordBatchT: ... -class TaggedRecordBatch(NamedTuple): - """ - A combination of a record batch and the fragment it came from. - Parameters - ---------- - record_batch : RecordBatch - The record batch. - fragment : Fragment - Fragment of the record batch. - """ +class TaggedRecordBatch(NamedTuple): record_batch: lib.RecordBatch fragment: Fragment + class TaggedRecordBatchIterator(lib._Weakrefable): - """An iterator over a sequence of record batches with fragments.""" + def __iter__(self) -> Self: ... def __next__(self) -> TaggedRecordBatch: ... + class Scanner(lib._Weakrefable): - """A materialized scan operation with context and options bound. - A scanner is the class that glues the scan tasks, data fragments and data - sources together. - """ @staticmethod def from_dataset( dataset: Dataset, @@ -1910,63 +576,8 @@ class Scanner(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: - """ - Create Scanner from Dataset, - - Parameters - ---------- - dataset : Dataset - Dataset to scan. - columns : list[str] or dict[str, Expression], default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. 
- use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - """ + ) -> Scanner: ... + @staticmethod def from_fragment( fragment: Fragment, @@ -1981,65 +592,8 @@ class Scanner(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: - """ - Create Scanner from Fragment, - - Parameters - ---------- - fragment : Fragment - fragment to scan. - schema : Schema, optional - The schema of the fragment. - columns : list[str] or dict[str, Expression], default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - """ + ) -> Scanner: ... 
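As a usage reference for the Scanner.from_dataset options stubbed above (columns, filter, batch_size, use_threads), a small sketch; the dataset path, hive partitioning layout and column names are assumptions made for the example.

import pyarrow.dataset as ds
import pyarrow.compute as pc

# Hypothetical hive-partitioned Parquet dataset ("/year=2009/month=11/...").
dataset = ds.dataset("data/", format="parquet", partitioning="hive")
scanner = ds.Scanner.from_dataset(
    dataset,
    columns={"month": pc.field("month"), "source": pc.field("__filename")},
    filter=pc.field("year") == 2009,  # may be pushed down to partition info / Parquet statistics
    batch_size=64_000,
    use_threads=True,
)
table = scanner.to_table()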
+ @staticmethod def from_batches( source: Iterator[lib.RecordBatch] | RecordBatchReader, @@ -2054,196 +608,37 @@ class Scanner(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: - """ - Create a Scanner from an iterator of batches. - - This creates a scanner which can be used only once. It is - intended to support writing a dataset (which takes a scanner) - from a source which can be read only once (e.g. a - RecordBatchReader or generator). - - Parameters - ---------- - source : Iterator or Arrow-compatible stream object - The iterator of Batches. This can be a pyarrow RecordBatchReader, - any object that implements the Arrow PyCapsule Protocol for - streams, or an actual Python iterator of RecordBatches. - schema : Schema - The schema of the batches (required when passing a Python - iterator). - columns : list[str] or dict[str, Expression], default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - """ + ) -> Scanner: ... + @property - def dataset_schema(self) -> lib.Schema: - """The schema with which batches will be read from fragments.""" + def dataset_schema(self) -> lib.Schema: ... 
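A one-shot Scanner built from in-memory batches, matching the from_batches signature above; the schema and values are invented for illustration.

import pyarrow as pa
import pyarrow.dataset as ds

schema = pa.schema([("x", pa.int64())])
batches = [pa.record_batch([pa.array([1, 2, 3])], schema=schema)]

# schema= is required because a plain Python iterator carries no schema of its own.
scanner = ds.Scanner.from_batches(iter(batches), schema=schema)
print(scanner.to_table())  # the scanner can only be consumed once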
+ @property - def projected_schema(self) -> lib.Schema: - """ - The materialized schema of the data, accounting for projections. - - This is the schema of any data returned from the scanner. - """ - def to_batches(self) -> Iterator[lib.RecordBatch]: - """ - Consume a Scanner in record batches. - - Returns - ------- - record_batches : iterator of RecordBatch - """ - def scan_batches(self) -> TaggedRecordBatchIterator: - """ - Consume a Scanner in record batches with corresponding fragments. - - Returns - ------- - record_batches : iterator of TaggedRecordBatch - """ - def to_table(self) -> lib.Table: - """ - Convert a Scanner into a Table. - - Use this convenience utility with care. This will serially materialize - the Scan result in memory before creating the Table. - - Returns - ------- - Table - """ - def take(self, indices: Indices) -> lib.Table: - """ - Select rows of data by index. - - Will only consume as many batches of the underlying dataset as - needed. Otherwise, this is equivalent to - ``to_table().take(indices)``. - - Parameters - ---------- - indices : Array or array-like - indices of rows to select in the dataset. - - Returns - ------- - Table - """ - def head(self, num_rows: int) -> lib.Table: - """ - Load the first N rows of the dataset. - - Parameters - ---------- - num_rows : int - The number of rows to load. - - Returns - ------- - Table - """ - def count_rows(self) -> int: - """ - Count rows matching the scanner filter. - - Returns - ------- - count : int - """ - def to_reader(self) -> RecordBatchReader: - """Consume this scanner as a RecordBatchReader. - - Returns - ------- - RecordBatchReader - """ - -def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: - """ - Extract partition keys (equality constraints between a field and a scalar) - from an expression as a dict mapping the field's name to its value. - - NB: All expressions yielded by a HivePartitioning or DirectoryPartitioning - will be conjunctions of equality conditions and are accessible through this - function. Other subexpressions will be ignored. - - Parameters - ---------- - partition_expression : pyarrow.dataset.Expression - - Returns - ------- - dict - - Examples - -------- - - For example, an expression of - - is converted to {'part': 'A', 'year': 2016} - """ + def projected_schema(self) -> lib.Schema: ... + + def to_batches(self) -> Iterator[lib.RecordBatch]: ... + + def scan_batches(self) -> TaggedRecordBatchIterator: ... + + def to_table(self) -> lib.Table: ... + + def take(self, indices: Indices) -> lib.Table: ... + + def head(self, num_rows: int) -> lib.Table: ... + + def count_rows(self) -> int: ... + + def to_reader(self) -> RecordBatchReader: ... + + +def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: ... + class WrittenFile(lib._Weakrefable): - """ - Metadata information about files written as - part of a dataset write operation - - Parameters - ---------- - path : str - Path to the file. - metadata : pyarrow.parquet.FileMetaData, optional - For Parquet files, the Parquet file metadata. - size : int - The size of the file in bytes. - """ - def __init__(self, path: str, metadata: _parquet.FileMetaData | None, size: int) -> None: ... + + def __init__(self, path: str, metadata: _parquet.FileMetaData | + None, size: int) -> None: ... + def _filesystemdataset_write( data: Scanner, @@ -2262,37 +657,12 @@ def _filesystemdataset_write( create_dir: bool, ): ... 
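The get_partition_keys stub above can be exercised with a hand-built expression; the field names and values below are just examples of the equality constraints it extracts.

import pyarrow.dataset as ds
import pyarrow.compute as pc

expr = (pc.field("part") == "A") & (pc.field("year") == 2016)
print(ds.get_partition_keys(expr))  # {'part': 'A', 'year': 2016}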
+ class _ScanNodeOptions(ExecNodeOptions): def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ... + class ScanNodeOptions(_ScanNodeOptions): - """ - A Source node which yields batches from a Dataset scan. - - This is the option class for the "scan" node factory. - - This node is capable of applying pushdown projections or filters - to the file readers which reduce the amount of data that needs to - be read (if supported by the file format). But note that this does not - construct associated filter or project nodes to perform the final - filtering or projection. Rather, you may supply the same filter - expression or projection to the scan node that you also supply - to the filter or project node. - - Yielded batches will be augmented with fragment/batch indices when - implicit_ordering=True to enable stable ordering for simple ExecPlans. - - Parameters - ---------- - dataset : pyarrow.dataset.Dataset - The table which acts as the data source. - **kwargs : dict, optional - Scan options. See `Scanner.from_dataset` for possible arguments. - require_sequenced_output : bool, default False - Batches are yielded sequentially, like single-threaded - implicit_ordering : bool, default False - Preserve implicit ordering of data. - """ def __init__( self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs diff --git a/python/pyarrow-stubs/_dataset_orc.pyi b/python/pyarrow-stubs/_dataset_orc.pyi index d4e5784750f..62f49bf5d30 100644 --- a/python/pyarrow-stubs/_dataset_orc.pyi +++ b/python/pyarrow-stubs/_dataset_orc.pyi @@ -17,6 +17,7 @@ from ._dataset import FileFormat + class OrcFileFormat(FileFormat): def equals(self, other: OrcFileFormat) -> bool: ... @property diff --git a/python/pyarrow-stubs/_dataset_parquet.pyi b/python/pyarrow-stubs/_dataset_parquet.pyi index 007d3404a18..df9536ef725 100644 --- a/python/pyarrow-stubs/_dataset_parquet.pyi +++ b/python/pyarrow-stubs/_dataset_parquet.pyi @@ -38,19 +38,9 @@ from .lib import CacheOptions, Schema, _Weakrefable parquet_encryption_enabled: bool + class ParquetFileFormat(FileFormat): - """ - FileFormat for Parquet - - Parameters - ---------- - read_options : ParquetReadOptions - Read options for the file. - default_fragment_scan_options : ParquetFragmentScanOptions - Scan Options for the file. - **kwargs : dict - Additional options for read option or scan option - """ + def __init__( self, read_options: ParquetReadOptions | None = None, @@ -59,10 +49,13 @@ class ParquetFileFormat(FileFormat): ) -> None: ... @property def read_options(self) -> ParquetReadOptions: ... - def make_write_options(self) -> ParquetFileWriteOptions: ... # type: ignore[override] + def make_write_options( + self) -> ParquetFileWriteOptions: ... # type: ignore[override] + def equals(self, other: ParquetFileFormat) -> bool: ... @property def default_extname(self) -> str: ... + def make_fragment( self, file: StrPath | IO, @@ -71,49 +64,15 @@ class ParquetFileFormat(FileFormat): row_groups: Iterable[int] | None = None, *, file_size: int | None = None, - ) -> Fragment: - """ - Make a FileFragment from a given file. - - Parameters - ---------- - file : file-like object, path-like or str - The file or file path to make a fragment from. - filesystem : Filesystem, optional - If `filesystem` is given, `file` must be a string and specifies - the path of the file to read from the filesystem. - partition_expression : Expression, optional - An expression that is guaranteed true for all rows in the fragment. 
Allows - fragment to be potentially skipped while scanning with a filter. - row_groups : Iterable, optional - The indices of the row groups to include - file_size : int, optional - The size of the file in bytes. Can improve performance with high-latency filesystems - when file size needs to be known before reading. - - Returns - ------- - fragment : Fragment - The file fragment - """ + ) -> Fragment: ... + class _NameStats(TypedDict): min: Any max: Any + class RowGroupInfo: - """ - A wrapper class for RowGroup information - - Parameters - ---------- - id : integer - The group ID. - metadata : FileMetaData - The rowgroup metadata. - schema : Schema - Schema of the rows. - """ id: int metadata: FileMetaData @@ -127,8 +86,8 @@ class RowGroupInfo: @property def statistics(self) -> dict[str, _NameStats]: ... + class ParquetFileFragment(FileFragment): - """A Fragment representing a parquet file.""" def ensure_complete_metadata(self) -> None: ... @property @@ -136,79 +95,22 @@ class ParquetFileFragment(FileFragment): @property def metadata(self) -> FileMetaData: ... @property - def num_row_groups(self) -> int: - """ - Return the number of row groups viewed by this fragment (not the - number of row groups in the origin file). - """ + def num_row_groups(self) -> int: ... + def split_by_row_group( self, filter: Expression | None = None, schema: Schema | None = None - ) -> list[Fragment]: - """ - Split the fragment into multiple fragments. - - Yield a Fragment wrapping each row group in this ParquetFileFragment. - Row groups will be excluded whose metadata contradicts the optional - filter. - - Parameters - ---------- - filter : Expression, default None - Only include the row groups which satisfy this predicate (using - the Parquet RowGroup statistics). - schema : Schema, default None - Schema to use when filtering row groups. Defaults to the - Fragment's physical schema - - Returns - ------- - A list of Fragments - """ + ) -> list[Fragment]: ... + def subset( self, filter: Expression | None = None, schema: Schema | None = None, row_group_ids: list[int] | None = None, - ) -> ParquetFileFormat: - """ - Create a subset of the fragment (viewing a subset of the row groups). - - Subset can be specified by either a filter predicate (with optional - schema) or by a list of row group IDs. Note that when using a filter, - the resulting fragment can be empty (viewing no row groups). - - Parameters - ---------- - filter : Expression, default None - Only include the row groups which satisfy this predicate (using - the Parquet RowGroup statistics). - schema : Schema, default None - Schema to use when filtering row groups. Defaults to the - Fragment's physical schema - row_group_ids : list of ints - The row group IDs to include in the subset. Can only be specified - if `filter` is None. - - Returns - ------- - ParquetFileFragment - """ + ) -> ParquetFileFormat: ... + class ParquetReadOptions(_Weakrefable): - """ - Parquet format specific options for reading. - - Parameters - ---------- - dictionary_columns : list of string, default None - Names of columns which should be dictionary encoded as - they are read - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular - resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' - and therefore INT96 timestamps will be inferred as timestamps - in nanoseconds - """ + def __init__( self, dictionary_columns: list[str] | None, coerce_int96_timestamp_unit: str | None = None ) -> None: ... 
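To illustrate the ParquetFileFormat entry points stubbed above (make_fragment with row_groups, make_write_options), a sketch against a placeholder file path; nothing here is prescribed by the stubs themselves.

import pyarrow.dataset as ds
from pyarrow import fs

local = fs.LocalFileSystem()
fmt = ds.ParquetFileFormat()

# Wrap a single Parquet file as a fragment restricted to its first row group.
fragment = fmt.make_fragment("example.parquet", filesystem=local, row_groups=[0])
write_options = fmt.make_write_options()  # ParquetFileWriteOptions for use with write_dataset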
@@ -218,53 +120,16 @@ class ParquetReadOptions(_Weakrefable): def coerce_int96_timestamp_unit(self, unit: str) -> None: ... def equals(self, other: ParquetReadOptions) -> bool: ... + class ParquetFileWriteOptions(FileWriteOptions): def update(self, **kwargs) -> None: ... def _set_properties(self) -> None: ... def _set_arrow_properties(self) -> None: ... def _set_encryption_config(self) -> None: ... + @dataclass(kw_only=True) class ParquetFragmentScanOptions(FragmentScanOptions): - """ - Scan-specific options for Parquet fragments. - - Parameters - ---------- - use_buffered_stream : bool, default False - Read files through buffered input streams rather than loading entire - row groups at once. This may be enabled to reduce memory overhead. - Disabled by default. - buffer_size : int, default 8192 - Size of buffered stream, if enabled. Default is 8KB. - pre_buffer : bool, default True - If enabled, pre-buffer the raw Parquet data instead of issuing one - read per column chunk. This can improve performance on high-latency - filesystems (e.g. S3, GCS) by coalescing and issuing file reads in - parallel using a background I/O thread pool. - Set to False if you want to prioritize minimal memory usage - over maximum speed. - cache_options : pyarrow.CacheOptions, default None - Cache options used when pre_buffer is enabled. The default values should - be good for most use cases. You may want to adjust these for example if - you have exceptionally high latency to the file system. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None - If not None, use the provided ParquetDecryptionConfig to decrypt the - Parquet file. - decryption_properties : pyarrow.parquet.FileDecryptionProperties, default None - If not None, use the provided FileDecryptionProperties to decrypt encrypted - Parquet file. - page_checksum_verification : bool, default False - If True, verify the page checksum for each page read from the file. - """ use_buffered_stream: bool = False buffer_size: int = 8192 @@ -278,50 +143,17 @@ class ParquetFragmentScanOptions(FragmentScanOptions): def equals(self, other: ParquetFragmentScanOptions) -> bool: ... + @dataclass class ParquetFactoryOptions(_Weakrefable): - """ - Influences the discovery of parquet dataset. - - Parameters - ---------- - partition_base_dir : str, optional - For the purposes of applying the partitioning, paths will be - stripped of the partition_base_dir. Files not matching the - partition_base_dir prefix will be skipped for partitioning discovery. - The ignored files will still be part of the Dataset, but will not - have partition information. - partitioning : Partitioning, PartitioningFactory, optional - The partitioning scheme applied to fragments, see ``Partitioning``. - validate_column_chunk_paths : bool, default False - Assert that all ColumnChunk paths are consistent. The parquet spec - allows for ColumnChunk data to be stored in multiple files, but - ParquetDatasetFactory supports only a single file with all ColumnChunk - data. 
If this flag is set construction of a ParquetDatasetFactory will - raise an error if ColumnChunk data is not resident in a single file. - """ partition_base_dir: str | None = None partitioning: Partitioning | PartitioningFactory | None = None validate_column_chunk_paths: bool = False + class ParquetDatasetFactory(DatasetFactory): - """ - Create a ParquetDatasetFactory from a Parquet `_metadata` file. - - Parameters - ---------- - metadata_path : str - Path to the `_metadata` parquet metadata-only file generated with - `pyarrow.parquet.write_metadata`. - filesystem : pyarrow.fs.FileSystem - Filesystem to read the metadata_path from, and subsequent parquet - files. - format : ParquetFileFormat - Parquet format options. - options : ParquetFactoryOptions, optional - Various flags influencing the discovery of filesystem paths. - """ + def __init__( self, metadata_path: str, diff --git a/python/pyarrow-stubs/_dataset_parquet_encryption.pyi b/python/pyarrow-stubs/_dataset_parquet_encryption.pyi index be40c0b39b3..d8338776481 100644 --- a/python/pyarrow-stubs/_dataset_parquet_encryption.pyi +++ b/python/pyarrow-stubs/_dataset_parquet_encryption.pyi @@ -20,33 +20,9 @@ from ._parquet import FileDecryptionProperties from ._parquet_encryption import CryptoFactory, EncryptionConfiguration, KmsConnectionConfig from .lib import _Weakrefable -class ParquetEncryptionConfig(_Weakrefable): - """ - Core configuration class encapsulating parameters for high-level encryption - within the Parquet framework. - - The ParquetEncryptionConfig class serves as a bridge for passing encryption-related - parameters to the appropriate components within the Parquet library. It maintains references - to objects that define the encryption strategy, Key Management Service (KMS) configuration, - and specific encryption configurations for Parquet data. - Parameters - ---------- - crypto_factory : pyarrow.parquet.encryption.CryptoFactory - Shared pointer to a `CryptoFactory` object. The `CryptoFactory` is responsible for - creating cryptographic components, such as encryptors and decryptors. - kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig - Shared pointer to a `KmsConnectionConfig` object. This object holds the configuration - parameters necessary for connecting to a Key Management Service (KMS). - encryption_config : pyarrow.parquet.encryption.EncryptionConfiguration - Shared pointer to an `EncryptionConfiguration` object. This object defines specific - encryption settings for Parquet data, including the keys assigned to different columns. +class ParquetEncryptionConfig(_Weakrefable): - Raises - ------ - ValueError - Raised if `encryption_config` is None. - """ def __init__( self, crypto_factory: CryptoFactory, @@ -54,33 +30,9 @@ class ParquetEncryptionConfig(_Weakrefable): encryption_config: EncryptionConfiguration, ) -> None: ... -class ParquetDecryptionConfig(_Weakrefable): - """ - Core configuration class encapsulating parameters for high-level decryption - within the Parquet framework. - - ParquetDecryptionConfig is designed to pass decryption-related parameters to - the appropriate decryption components within the Parquet library. It holds references to - objects that define the decryption strategy, Key Management Service (KMS) configuration, - and specific decryption configurations for reading encrypted Parquet data. 
- Parameters - ---------- - crypto_factory : pyarrow.parquet.encryption.CryptoFactory - Shared pointer to a `CryptoFactory` object, pivotal in creating cryptographic - components for the decryption process. - kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig - Shared pointer to a `KmsConnectionConfig` object, containing parameters necessary - for connecting to a Key Management Service (KMS) during decryption. - decryption_config : pyarrow.parquet.encryption.DecryptionConfiguration - Shared pointer to a `DecryptionConfiguration` object, specifying decryption settings - for reading encrypted Parquet data. +class ParquetDecryptionConfig(_Weakrefable): - Raises - ------ - ValueError - Raised if `decryption_config` is None. - """ def __init__( self, crypto_factory: CryptoFactory, @@ -88,14 +40,19 @@ class ParquetDecryptionConfig(_Weakrefable): encryption_config: EncryptionConfiguration, ) -> None: ... + def set_encryption_config( opts: ParquetFileWriteOptions, config: ParquetEncryptionConfig, ) -> None: ... + + def set_decryption_properties( opts: ParquetFragmentScanOptions, config: FileDecryptionProperties, ): ... + + def set_decryption_config( opts: ParquetFragmentScanOptions, config: ParquetDecryptionConfig, diff --git a/python/pyarrow-stubs/_feather.pyi b/python/pyarrow-stubs/_feather.pyi index 373fe38cdce..edd3a089f82 100644 --- a/python/pyarrow-stubs/_feather.pyi +++ b/python/pyarrow-stubs/_feather.pyi @@ -21,7 +21,10 @@ from _typeshed import StrPath from .lib import Buffer, NativeFile, Table, _Weakrefable -class FeatherError(Exception): ... + +class FeatherError(Exception): + ... + def write_feather( table: Table, @@ -32,6 +35,7 @@ def write_feather( version: int = 2, ): ... + class FeatherReader(_Weakrefable): def __init__( self, diff --git a/python/pyarrow-stubs/_flight.pyi b/python/pyarrow-stubs/_flight.pyi index a79475a8796..e4d226a9f60 100644 --- a/python/pyarrow-stubs/_flight.pyi +++ b/python/pyarrow-stubs/_flight.pyi @@ -47,7 +47,7 @@ from .lib import ( _T = TypeVar("_T") class FlightCallOptions(_Weakrefable): - """RPC-layer options for a Flight call.""" + def __init__( self, @@ -55,168 +55,92 @@ class FlightCallOptions(_Weakrefable): write_options: IpcWriteOptions | None = None, headers: list[tuple[str, str]] | None = None, read_options: IpcReadOptions | None = None, - ) -> None: - """Create call options. - - Parameters - ---------- - timeout : float, None - A timeout for the call, in seconds. None means that the - timeout defaults to an implementation-specific value. - write_options : pyarrow.ipc.IpcWriteOptions, optional - IPC write options. The default options can be controlled - by environment variables (see pyarrow.ipc). - headers : List[Tuple[str, str]], optional - A list of arbitrary headers as key, value tuples - read_options : pyarrow.ipc.IpcReadOptions, optional - Serialization options for reading IPC format. - """ + ) -> None: ... + class CertKeyPair(NamedTuple): - """A TLS certificate and key for use in Flight.""" + cert: str key: str class FlightError(Exception): - """ - The base class for Flight-specific errors. - - A server may raise this class or one of its subclasses to provide - a more detailed error to clients. - - Parameters - ---------- - message : str, optional - The error message. - extra_info : bytes, optional - Extra binary error details that were provided by the - server/will be sent to the client. 
- - Attributes - ---------- - extra_info : bytes - Extra binary error details that were provided by the - server/will be sent to the client. - """ + extra_info: bytes -class FlightInternalError(FlightError, ArrowException): - """An error internal to the Flight server occurred.""" +class FlightInternalError(FlightError, ArrowException): ... + + +class FlightTimedOutError(FlightError, ArrowException): ... + + +class FlightCancelledError(FlightError, ArrowCancelled): ... + + +class FlightServerError(FlightError, ArrowException): ... -class FlightTimedOutError(FlightError, ArrowException): - """The Flight RPC call timed out.""" -class FlightCancelledError(FlightError, ArrowCancelled): - """The operation was cancelled.""" +class FlightUnauthenticatedError(FlightError, ArrowException): ... -class FlightServerError(FlightError, ArrowException): - """A server error occurred.""" -class FlightUnauthenticatedError(FlightError, ArrowException): - """The client is not authenticated.""" +class FlightUnauthorizedError(FlightError, ArrowException): ... -class FlightUnauthorizedError(FlightError, ArrowException): - """The client is not authorized to perform the given operation.""" -class FlightUnavailableError(FlightError, ArrowException): - """The server is not reachable or available.""" +class FlightUnavailableError(FlightError, ArrowException): ... + class FlightWriteSizeExceededError(ArrowInvalid): - """A write operation exceeded the client-configured limit.""" + limit: int actual: int class Action(_Weakrefable): - """An action executable on a Flight service.""" - def __init__(self, action_type: bytes | str, buf: Buffer | bytes) -> None: - """Create an action from a type and a buffer. - Parameters - ---------- - action_type : bytes or str - buf : Buffer or bytes-like object - """ + def __init__(self, action_type: bytes | str, buf: Buffer | bytes) -> None: ... + @property - def type(self) -> str: - """The action type.""" + def type(self) -> str: ... + @property - def body(self) -> Buffer: - """The action body (arguments for the action).""" - def serialize(self) -> bytes: - """Get the wire-format representation of this type. + def body(self) -> Buffer: ... - Useful when interoperating with non-Flight systems (e.g. REST - services) that may want to return Flight types. + def serialize(self) -> bytes: ... - """ @classmethod - def deserialize(cls, serialized: bytes) -> Self: - """Parse the wire-format representation of this type. - - Useful when interoperating with non-Flight systems (e.g. REST - services) that may want to return Flight types. + def deserialize(cls, serialized: bytes) -> Self: ... - """ class ActionType(NamedTuple): - """A type of action that is executable on a Flight service.""" + type: str description: str - def make_action(self, buf: Buffer | bytes) -> Action: - """Create an Action with this type. + def make_action(self, buf: Buffer | bytes) -> Action: ... - Parameters - ---------- - buf : obj - An Arrow buffer or Python bytes or bytes-like object. - """ class Result(_Weakrefable): - """A result from executing an Action.""" - def __init__(self, buf: Buffer | bytes) -> None: - """Create a new result. - - Parameters - ---------- - buf : Buffer or bytes-like object - """ + + def __init__(self, buf: Buffer | bytes) -> None: ... + @property - def body(self) -> Buffer: - """Get the Buffer containing the result.""" - def serialize(self) -> bytes: - """Get the wire-format representation of this type. + def body(self) -> Buffer: ... - Useful when interoperating with non-Flight systems (e.g. 
REST - services) that may want to return Flight types. + def serialize(self) -> bytes: ... - """ @classmethod - def deserialize(cls, serialized: bytes) -> Self: - """Parse the wire-format representation of this type. - - Useful when interoperating with non-Flight systems (e.g. REST - services) that may want to return Flight types. + def deserialize(cls, serialized: bytes) -> Self: ... - """ class BasicAuth(_Weakrefable): - """A container for basic auth.""" + def __init__( self, username: str | bytes | None = None, password: str | bytes | None = None - ) -> None: - """Create a new basic auth object. - - Parameters - ---------- - username : string - password : string - """ + ) -> None: ... + @property def username(self) -> bytes: ... @property @@ -226,29 +150,14 @@ class BasicAuth(_Weakrefable): def deserialize(serialized: str | bytes) -> BasicAuth: ... class DescriptorType(enum.Enum): - """ - The type of a FlightDescriptor. - Attributes - ---------- - - UNKNOWN - An unknown descriptor type. - - PATH - A Flight stream represented by a path. - - CMD - A Flight stream represented by an application-defined command. - - """ UNKNOWN = 0 PATH = 1 CMD = 2 class FlightMethod(enum.Enum): - """The implemented methods in Flight.""" + INVALID = 0 HANDSHAKE = 1 @@ -262,29 +171,29 @@ class FlightMethod(enum.Enum): DO_EXCHANGE = 9 class FlightDescriptor(_Weakrefable): - """A description of a data stream available from a Flight service.""" + @staticmethod - def for_path(*path: str | bytes) -> FlightDescriptor: - """Create a FlightDescriptor for a resource path.""" + def for_path(*path: str | bytes) -> FlightDescriptor: ... + @staticmethod - def for_command(command: str | bytes) -> FlightDescriptor: - """Create a FlightDescriptor for an opaque command.""" + def for_command(command: str | bytes) -> FlightDescriptor: ... + @property - def descriptor_type(self) -> DescriptorType: - """Get the type of this descriptor.""" + def descriptor_type(self) -> DescriptorType: ... + @property - def path(self) -> list[bytes] | None: - """Get the path for this descriptor.""" + def path(self) -> list[bytes] | None: ... + @property - def command(self) -> bytes | None: - """Get the command for this descriptor.""" + def command(self) -> bytes | None: ... + def serialize(self) -> bytes: ... @classmethod def deserialize(cls, serialized: bytes) -> Self: ... class Ticket(_Weakrefable): - """A ticket for requesting a Flight stream.""" + def __init__(self, ticket: str | bytes) -> None: ... @property def ticket(self) -> bytes: ... @@ -293,90 +202,60 @@ class Ticket(_Weakrefable): def deserialize(cls, serialized: bytes) -> Self: ... class Location(_Weakrefable): - """The location of a Flight service.""" + def __init__(self, uri: str | bytes) -> None: ... @property def uri(self) -> bytes: ... def equals(self, other: Location) -> bool: ... @staticmethod - def for_grpc_tcp(host: str | bytes, port: int) -> Location: - """Create a Location for a TCP-based gRPC service.""" + def for_grpc_tcp(host: str | bytes, port: int) -> Location: ... + @staticmethod - def for_grpc_tls(host: str | bytes, port: int) -> Location: - """Create a Location for a TLS-based gRPC service.""" + def for_grpc_tls(host: str | bytes, port: int) -> Location: ... + @staticmethod - def for_grpc_unix(path: str | bytes) -> Location: - """Create a Location for a domain socket-based gRPC service.""" + def for_grpc_unix(path: str | bytes) -> Location: ... 
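A few of the small Flight value types stubbed above, constructed directly; the host, port and payloads are arbitrary examples.

import pyarrow.flight as flight

location = flight.Location.for_grpc_tcp("localhost", 8815)
descriptor = flight.FlightDescriptor.for_path("datasets", "example")
action = flight.Action("refresh", b"")

# Each of these types round-trips through its wire format.
ticket = flight.Ticket(b"example-ticket")
restored = flight.Ticket.deserialize(ticket.serialize())
print(restored.ticket)  # b'example-ticket'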
+ class FlightEndpoint(_Weakrefable): - """A Flight stream, along with the ticket and locations to access it.""" + def __init__( self, ticket: Ticket | str | bytes, locations: list[str | Location], expiration_time: TimestampScalar | None = ..., app_metadata: bytes | str = ..., - ): - """Create a FlightEndpoint from a ticket and list of locations. - - Parameters - ---------- - ticket : Ticket or bytes - the ticket needed to access this flight - locations : list of string URIs - locations where this flight is available - expiration_time : TimestampScalar, default None - Expiration time of this stream. If present, clients may assume - they can retry DoGet requests. Otherwise, clients should avoid - retrying DoGet requests. - app_metadata : bytes or str, default "" - Application-defined opaque metadata. - - Raises - ------ - ArrowException - If one of the location URIs is not a valid URI. - """ + ): ... + @property - def ticket(self) -> Ticket: - """Get the ticket in this endpoint.""" + def ticket(self) -> Ticket: ... + @property - def locations(self) -> list[Location]: - """Get locations where this flight is available.""" + def locations(self) -> list[Location]: ... + def serialize(self) -> bytes: ... @property - def expiration_time(self) -> TimestampScalar | None: - """Get the expiration time of this stream. - - If present, clients may assume they can retry DoGet requests. - Otherwise, clients should avoid retrying DoGet requests. + def expiration_time(self) -> TimestampScalar | None: ... - """ @property - def app_metadata(self) -> bytes | str: - """Get application-defined opaque metadata.""" + def app_metadata(self) -> bytes | str: ... + @classmethod def deserialize(cls, serialized: bytes) -> Self: ... class SchemaResult(_Weakrefable): - """The serialized schema returned from a GetSchema request.""" - def __init__(self, schema: Schema) -> None: - """Create a SchemaResult from a schema. - - Parameters - ---------- - schema: Schema - the schema of the data in this flight. - """ + + def __init__(self, schema: Schema) -> None: ... + @property - def schema(self) -> Schema: - """The schema of the data in this flight.""" + def schema(self) -> Schema: ... + def serialize(self) -> bytes: ... @classmethod def deserialize(cls, serialized: bytes) -> Self: ... class FlightInfo(_Weakrefable): - """A description of a Flight stream.""" + def __init__( self, schema: Schema, @@ -386,62 +265,35 @@ class FlightInfo(_Weakrefable): total_bytes: int = ..., ordered: bool = ..., app_metadata: bytes | str = ..., - ) -> None: - """Create a FlightInfo object from a schema, descriptor, and endpoints. - - Parameters - ---------- - schema : Schema - the schema of the data in this flight. - descriptor : FlightDescriptor - the descriptor for this flight. - endpoints : list of FlightEndpoint - a list of endpoints where this flight is available. - total_records : int, default None - the total records in this flight, -1 or None if unknown. - total_bytes : int, default None - the total bytes in this flight, -1 or None if unknown. - ordered : boolean, default False - Whether endpoints are in the same order as the data. - app_metadata : bytes or str, default "" - Application-defined opaque metadata. - """ - @property - def schema(self) -> Schema: - """The schema of the data in this flight.""" + ) -> None: ... + @property - def descriptor(self) -> FlightDescriptor: - """The descriptor of the data in this flight.""" + def schema(self) -> Schema: ... 
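Assembling a FlightInfo by hand, as a get_flight_info implementation might, following the constructor stubbed above; the schema, command and gRPC URI are placeholders.

import pyarrow as pa
import pyarrow.flight as flight

schema = pa.schema([("x", pa.int64())])
descriptor = flight.FlightDescriptor.for_command(b"SELECT 1")
endpoint = flight.FlightEndpoint(b"ticket-1", ["grpc://localhost:8815"])
info = flight.FlightInfo(schema, descriptor, [endpoint],
                         total_records=-1, total_bytes=-1)
print(info.endpoints[0].locations)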
+ @property - def endpoints(self) -> list[FlightEndpoint]: - """The endpoints where this flight is available.""" + def descriptor(self) -> FlightDescriptor: ... + @property - def total_records(self) -> int: - """The total record count of this flight, or -1 if unknown.""" + def endpoints(self) -> list[FlightEndpoint]: ... + @property - def total_bytes(self) -> int: - """The size in bytes of the data in this flight, or -1 if unknown.""" + def total_records(self) -> int: ... + @property - def ordered(self) -> bool: - """Whether endpoints are in the same order as the data.""" + def total_bytes(self) -> int: ... + @property - def app_metadata(self) -> bytes | str: - """ - Application-defined opaque metadata. + def ordered(self) -> bool: ... - There is no inherent or required relationship between this and the - app_metadata fields in the FlightEndpoints or resulting FlightData - messages. Since this metadata is application-defined, a given - application could define there to be a relationship, but there is - none required by the spec. + @property + def app_metadata(self) -> bytes | str: ... - """ def serialize(self) -> bytes: ... @classmethod def deserialize(cls, serialized: bytes) -> Self: ... class FlightStreamChunk(_Weakrefable): - """A RecordBatch with application metadata on the side.""" + @property def data(self) -> RecordBatch | None: ... @property @@ -449,7 +301,7 @@ class FlightStreamChunk(_Weakrefable): def __iter__(self): ... class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): - """A reader for Flight streams.""" + # Needs to be separate class so the "real" class can subclass the # pure-Python mixin class @@ -457,115 +309,58 @@ class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): def __iter__(self) -> Self: ... def __next__(self) -> FlightStreamChunk: ... @property - def schema(self) -> Schema: - """Get the schema for this reader.""" - def read_all(self) -> Table: - """Read the entire contents of the stream as a Table.""" - def read_chunk(self) -> FlightStreamChunk: - """Read the next FlightStreamChunk along with any metadata. - - Returns - ------- - chunk : FlightStreamChunk - The next FlightStreamChunk in the stream. - - Raises - ------ - StopIteration - when the stream is finished - """ - def to_reader(self) -> RecordBatchReader: - """Convert this reader into a regular RecordBatchReader. - - This may fail if the schema cannot be read from the remote end. - - Returns - ------- - RecordBatchReader - """ - -class MetadataRecordBatchReader(_MetadataRecordBatchReader): - """The base class for readers for Flight streams. - - See Also - -------- - FlightStreamReader - """ + def schema(self) -> Schema: ... + + def read_all(self) -> Table: ... + + def read_chunk(self) -> FlightStreamChunk: ... + + def to_reader(self) -> RecordBatchReader: ... + + +class MetadataRecordBatchReader(_MetadataRecordBatchReader): ... + class FlightStreamReader(MetadataRecordBatchReader): - """A reader that can also be canceled.""" - def cancel(self) -> None: - """Cancel the read operation.""" - def read_all(self) -> Table: - """Read the entire contents of the stream as a Table.""" + + def cancel(self) -> None: ... + + def read_all(self) -> Table: ... + class MetadataRecordBatchWriter(_CRecordBatchWriter): - """A RecordBatchWriter that also allows writing application metadata. - - This class is a context manager; on exit, close() will be called. 
- """ - - def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: - """Prepare to write data to this stream with the given schema.""" - def write_metadata(self, buf: Buffer) -> None: - """Write Flight metadata by itself.""" - def write_batch(self, batch: RecordBatch) -> None: # type: ignore[override] - """ - Write RecordBatch to stream. - - Parameters - ---------- - batch : RecordBatch - """ - def write_table(self, table: Table, max_chunksize: int | None = None, **kwargs) -> None: - """ - Write Table to stream in (contiguous) RecordBatch objects. - - Parameters - ---------- - table : Table - max_chunksize : int, default None - Maximum number of rows for RecordBatch chunks. Individual chunks may - be smaller depending on the chunk layout of individual columns. - """ - def close(self) -> None: - """ - Close stream and write end-of-stream 0 marker. - """ - def write_with_metadata(self, batch: RecordBatch, buf: Buffer) -> None: - """Write a RecordBatch along with Flight metadata. - - Parameters - ---------- - batch : RecordBatch - The next RecordBatch in the stream. - buf : Buffer - Application-specific metadata for the batch as defined by - Flight. - """ + + + def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: ... + + def write_metadata(self, buf: Buffer) -> None: ... + + def write_batch(self, batch: RecordBatch) -> None: ... # type: ignore[override] + + def write_table(self, table: Table, max_chunksize: int | None = None, **kwargs) -> None: ... + + def close(self) -> None: ... + + def write_with_metadata(self, batch: RecordBatch, buf: Buffer) -> None: ... + class FlightStreamWriter(MetadataRecordBatchWriter): - """A writer that also allows closing the write side of a stream.""" - def done_writing(self) -> None: - """Indicate that the client is done writing, but not done reading.""" + + def done_writing(self) -> None: ... + class FlightMetadataReader(_Weakrefable): - """A reader for Flight metadata messages sent during a DoPut.""" - def read(self) -> Buffer | None: - """Read the next metadata message.""" + + def read(self) -> Buffer | None: ... + class FlightMetadataWriter(_Weakrefable): - """A sender for Flight metadata messages during a DoPut.""" - def write(self, message: Buffer) -> None: - """Write the next metadata message. - Parameters - ---------- - message : Buffer - """ + def write(self, message: Buffer) -> None: ... + class AsyncioCall(Generic[_T]): - """State for an async RPC using asyncio.""" + _future: asyncio.Future[_T] @@ -573,11 +368,7 @@ class AsyncioCall(Generic[_T]): def wakeup(self, result_or_exception: BaseException | _T) -> None: ... class AsyncioFlightClient: - """ - A FlightClient with an asyncio-based async interface. - This interface is EXPERIMENTAL. - """ def __init__(self, client: FlightClient) -> None: ... async def get_flight_info( @@ -588,40 +379,7 @@ class AsyncioFlightClient: ): ... class FlightClient(_Weakrefable): - """A client to a Flight service. - - Connect to a Flight service on the given host and port. - - Parameters - ---------- - location : str, tuple or Location - Location to connect to. Either a gRPC URI like `grpc://localhost:port`, - a tuple of (host, port) pair, or a Location instance. - tls_root_certs : bytes or None - PEM-encoded - cert_chain: bytes or None - Client certificate if using mutual TLS - private_key: bytes or None - Client private key for cert_chain is using mutual TLS - override_hostname : str or None - Override the hostname checked by TLS. Insecure, use with caution. 
- middleware : list optional, default None - A list of ClientMiddlewareFactory instances. - write_size_limit_bytes : int optional, default None - A soft limit on the size of a data payload sent to the - server. Enabled if positive. If enabled, writing a record - batch that (when serialized) exceeds this limit will raise an - exception; the client can retry the write with a smaller - batch. - disable_server_verification : boolean optional, default False - A flag that indicates that, if the client is connecting - with TLS, that it skips server verification. If this is - enabled, all other TLS settings are overridden. - generic_options : list optional, default None - A list of generic (string, int or string) option tuples passed - to the underlying transport. Effect is implementation - dependent. - """ + def __init__( self, location: str | tuple[str, int] | Location, @@ -638,14 +396,8 @@ class FlightClient(_Weakrefable): @property def supports_async(self) -> bool: ... def as_async(self) -> AsyncioFlightClient: ... - def wait_for_available(self, timeout: int = 5) -> None: - """Block until the server can be contacted. - - Parameters - ---------- - timeout : int, default 5 - The maximum seconds to wait. - """ + def wait_for_available(self, timeout: int = 5) -> None: ... + @deprecated( "Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead." ) @@ -658,439 +410,167 @@ class FlightClient(_Weakrefable): private_key: str | None = None, override_hostname: str | None = None, disable_server_verification: bool = False, - ) -> FlightClient: - """Connect to a Flight server. + ) -> FlightClient: ... - .. deprecated:: 0.15.0 - Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead. - """ def authenticate( self, auth_handler: ClientAuthHandler, options: FlightCallOptions | None = None - ) -> None: - """Authenticate to the server. - - Parameters - ---------- - auth_handler : ClientAuthHandler - The authentication mechanism to use. - options : FlightCallOptions - Options for this call. - """ + ) -> None: ... + def authenticate_basic_token( self, username: str, password: str, options: FlightCallOptions | None = None - ) -> tuple[str, str]: - """Authenticate to the server with HTTP basic authentication. - - Parameters - ---------- - username : string - Username to authenticate with - password : string - Password to authenticate with - options : FlightCallOptions - Options for this call - - Returns - ------- - tuple : Tuple[str, str] - A tuple representing the FlightCallOptions authorization - header entry of a bearer token. - """ - def list_actions(self, options: FlightCallOptions | None = None) -> list[Action]: - """List the actions available on a service.""" + ) -> tuple[str, str]: ... + + def list_actions(self, options: FlightCallOptions | None = None) -> list[Action]: ... + def do_action( self, action: Action, options: FlightCallOptions | None = None - ) -> Iterator[Result]: - """ - Execute an action on a service. - - Parameters - ---------- - action : str, tuple, or Action - Can be action type name (no body), type and body, or any Action - object - options : FlightCallOptions - RPC options - - Returns - ------- - results : iterator of Result values - """ + ) -> Iterator[Result]: ... + def list_flights( self, criteria: str | None = None, options: FlightCallOptions | None = None - ) -> Generator[FlightInfo, None, None]: - """List the flights available on a service.""" + ) -> Generator[FlightInfo, None, None]: ... 
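The FlightClient surface stubbed above (construction, basic-token auth, list_actions/do_action, list_flights) is easier to review next to a usage sketch. The sketch below is hedged: the server address, the credentials and the "healthcheck" action name are assumptions for illustration only.

    import pyarrow.flight as flight

    # Connect to an assumed local Flight endpoint.
    client = flight.FlightClient("grpc://localhost:8815")

    # authenticate_basic_token returns a (header-name, token) pair that can
    # be attached to later calls via FlightCallOptions headers.
    token_pair = client.authenticate_basic_token("user", "password")
    options = flight.FlightCallOptions(headers=[token_pair])

    # Enumerate available flights and server-defined actions.
    for info in client.list_flights(options=options):
        print(info.descriptor, info.total_records)

    for action_type in client.list_actions(options=options):
        print(action_type)

    # "healthcheck" is a hypothetical action; a given server may not define it.
    for result in client.do_action(flight.Action("healthcheck", b""), options=options):
        print(result.body.to_pybytes())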
+ def get_flight_info( self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None - ) -> FlightInfo: - """Request information about an available flight.""" + ) -> FlightInfo: ... + def get_schema( self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None - ) -> Schema: - """Request schema for an available flight.""" + ) -> Schema: ... + def do_get( self, ticket: Ticket, options: FlightCallOptions | None = None - ) -> FlightStreamReader: - """Request the data for a flight. + ) -> FlightStreamReader: ... - Returns - ------- - reader : FlightStreamReader - """ def do_put( self, descriptor: FlightDescriptor, schema: Schema, options: FlightCallOptions | None = None, - ) -> tuple[FlightStreamWriter, FlightStreamReader]: - """Upload data to a flight. - - Returns - ------- - writer : FlightStreamWriter - reader : FlightMetadataReader - """ + ) -> tuple[FlightStreamWriter, FlightStreamReader]: ... + def do_exchange( self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None - ) -> tuple[FlightStreamWriter, FlightStreamReader]: - """Start a bidirectional data exchange with a server. - - Parameters - ---------- - descriptor : FlightDescriptor - A descriptor for the flight. - options : FlightCallOptions - RPC options. - - Returns - ------- - writer : FlightStreamWriter - reader : FlightStreamReader - """ - def close(self) -> None: - """Close the client and disconnect.""" + ) -> tuple[FlightStreamWriter, FlightStreamReader]: ... + + def close(self) -> None: ... + def __enter__(self) -> Self: ... def __exit__(self, exc_type, exc_value, traceback) -> None: ... -class FlightDataStream(_Weakrefable): - """ - Abstract base class for Flight data streams. +class FlightDataStream(_Weakrefable): ... - See Also - -------- - RecordBatchStream - GeneratorStream - """ class RecordBatchStream(FlightDataStream): - """A Flight data stream backed by RecordBatches. - - The remainder of this DoGet request will be handled in C++, - without having to acquire the GIL. - """ def __init__( self, data_source: RecordBatchReader | Table, options: IpcWriteOptions | None = None - ) -> None: - """Create a RecordBatchStream from a data source. + ) -> None: ... - Parameters - ---------- - data_source : RecordBatchReader or Table - The data to stream to the client. - options : pyarrow.ipc.IpcWriteOptions, optional - Optional IPC options to control how to write the data. - """ class GeneratorStream(FlightDataStream): - """A Flight data stream backed by a Python generator.""" + def __init__( self, schema: Schema, generator: Iterable[FlightDataStream | Table | RecordBatch | RecordBatchReader], options: IpcWriteOptions | None = None, - ) -> None: - """Create a GeneratorStream from a Python generator. + ) -> None: ... - Parameters - ---------- - schema : Schema - The schema for the data to be returned. - generator : iterator or iterable - The generator should yield other FlightDataStream objects, - Tables, RecordBatches, or RecordBatchReaders. +class ServerCallContext(_Weakrefable): - options : pyarrow.ipc.IpcWriteOptions, optional - """ + def peer_identity(self) -> bytes: ... + + def peer(self) -> str: ... -class ServerCallContext(_Weakrefable): - """Per-call state/context.""" - def peer_identity(self) -> bytes: - """Get the identity of the authenticated peer. - - May be the empty string. 
- """ - def peer(self) -> str: - """Get the address of the peer.""" # Set safe=True as gRPC on Windows sometimes gives garbage bytes - def is_cancelled(self) -> bool: - """Check if the current RPC call has been canceled by the client.""" - def add_header(self, key: str, value: str) -> None: - """Add a response header.""" - def add_trailer(self, key: str, value: str) -> None: - """Add a response trailer.""" - def get_middleware(self, key: str) -> ServerMiddleware | None: - """ - Get a middleware instance by key. - - Returns None if the middleware was not found. - """ + def is_cancelled(self) -> bool: ... + + def add_header(self, key: str, value: str) -> None: ... + + def add_trailer(self, key: str, value: str) -> None: ... + + def get_middleware(self, key: str) -> ServerMiddleware | None: ... + class ServerAuthReader(_Weakrefable): - """A reader for messages from the client during an auth handshake.""" + def read(self) -> str: ... class ServerAuthSender(_Weakrefable): - """A writer for messages to the client during an auth handshake.""" + def write(self, message: str) -> None: ... class ClientAuthReader(_Weakrefable): - """A reader for messages from the server during an auth handshake.""" + def read(self) -> str: ... class ClientAuthSender(_Weakrefable): - """A writer for messages to the server during an auth handshake.""" + def write(self, message: str) -> None: ... class ServerAuthHandler(_Weakrefable): - """Authentication middleware for a server. - - To implement an authentication mechanism, subclass this class and - override its methods. - """ - def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): - """Conduct the handshake with the client. + def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): ... - May raise an error if the client cannot authenticate. + def is_valid(self, token: str) -> bool: ... - Parameters - ---------- - outgoing : ServerAuthSender - A channel to send messages to the client. - incoming : ServerAuthReader - A channel to read messages from the client. - """ - def is_valid(self, token: str) -> bool: - """Validate a client token, returning their identity. - May return an empty string (if the auth mechanism does not - name the peer) or raise an exception (if the token is - invalid). +class ClientAuthHandler(_Weakrefable): - Parameters - ---------- - token : bytes - The authentication token from the client. + def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): ... - """ + def get_token(self) -> str: ... -class ClientAuthHandler(_Weakrefable): - """Authentication plugin for a client.""" - def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): - """Conduct the handshake with the server. - - Parameters - ---------- - outgoing : ClientAuthSender - A channel to send messages to the server. - incoming : ClientAuthReader - A channel to read messages from the server. - """ - def get_token(self) -> str: - """Get the auth token for a call.""" class CallInfo(NamedTuple): - """Information about a particular RPC for Flight middleware.""" + method: FlightMethod class ClientMiddlewareFactory(_Weakrefable): - """A factory for new middleware instances. - - All middleware methods will be called from the same thread as the - RPC method implementation. That is, thread-locals set in the - client are accessible from the middleware itself. - """ - def start_call(self, info: CallInfo) -> ClientMiddleware | None: - """Called at the start of an RPC. 
+ def start_call(self, info: CallInfo) -> ClientMiddleware | None: ... - This must be thread-safe and must not raise exceptions. - - Parameters - ---------- - info : CallInfo - Information about the call. - - Returns - ------- - instance : ClientMiddleware - An instance of ClientMiddleware (the instance to use for - the call), or None if this call is not intercepted. - - """ class ClientMiddleware(_Weakrefable): - """Client-side middleware for a call, instantiated per RPC. - - Methods here should be fast and must be infallible: they should - not raise exceptions or stall indefinitely. - - """ - - def sending_headers(self) -> dict[str, list[str] | list[bytes]]: - """A callback before headers are sent. - - Returns - ------- - headers : dict - A dictionary of header values to add to the request, or - None if no headers are to be added. The dictionary should - have string keys and string or list-of-string values. - - Bytes values are allowed, but the underlying transport may - not support them or may restrict them. For gRPC, binary - values are only allowed on headers ending in "-bin". - - Header names must be lowercase ASCII. - - """ - def received_headers(self, headers: dict[str, list[str] | list[bytes]]): - """A callback when headers are received. - The default implementation does nothing. + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ... - Parameters - ---------- - headers : dict - A dictionary of headers from the server. Keys are strings - and values are lists of strings (for text headers) or - bytes (for binary headers). - """ + def received_headers(self, headers: dict[str, list[str] | list[bytes]]): ... - def call_completed(self, exception: ArrowException): - """A callback when the call finishes. - The default implementation does nothing. + def call_completed(self, exception: ArrowException): ... - Parameters - ---------- - exception : ArrowException - If the call errored, this is the equivalent - exception. Will be None if the call succeeded. - - """ class ServerMiddlewareFactory(_Weakrefable): - """A factory for new middleware instances. - - All middleware methods will be called from the same thread as the - RPC method implementation. That is, thread-locals set in the - middleware are accessible from the method itself. - """ def start_call( self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] - ) -> ServerMiddleware | None: - """Called at the start of an RPC. - - This must be thread-safe. - - Parameters - ---------- - info : CallInfo - Information about the call. - headers : dict - A dictionary of headers from the client. Keys are strings - and values are lists of strings (for text headers) or - bytes (for binary headers). - - Returns - ------- - instance : ServerMiddleware - An instance of ServerMiddleware (the instance to use for - the call), or None if this call is not intercepted. + ) -> ServerMiddleware | None: ... - Raises - ------ - exception : pyarrow.ArrowException - If an exception is raised, the call will be rejected with - the given error. - """ +class TracingServerMiddlewareFactory(ServerMiddlewareFactory): ... -class TracingServerMiddlewareFactory(ServerMiddlewareFactory): - """A factory for tracing middleware instances. - - This enables OpenTelemetry support in Arrow (if Arrow was compiled - with OpenTelemetry support enabled). A new span will be started on - each RPC call. 
The TracingServerMiddleware instance can then be - retrieved within an RPC handler to get the propagated context, - which can be used to start a new span on the Python side. - - Because the Python/C++ OpenTelemetry libraries do not - interoperate, spans on the C++ side are not directly visible to - the Python side and vice versa. - - """ class ServerMiddleware(_Weakrefable): - """Server-side middleware for a call, instantiated per RPC. - - Methods here should be fast and must be infallible: they should - not raise exceptions or stall indefinitely. - """ - def sending_headers(self) -> dict[str, list[str] | list[bytes]]: - """A callback before headers are sent. + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ... - Returns - ------- - headers : dict - A dictionary of header values to add to the response, or - None if no headers are to be added. The dictionary should - have string keys and string or list-of-string values. + def call_completed(self, exception: ArrowException): ... - Bytes values are allowed, but the underlying transport may - not support them or may restrict them. For gRPC, binary - values are only allowed on headers ending in "-bin". - - Header names must be lowercase ASCII. - - """ - def call_completed(self, exception: ArrowException): - """A callback when the call finishes. - - Parameters - ---------- - exception : pyarrow.ArrowException - If the call errored, this is the equivalent - exception. Will be None if the call succeeded. - - """ class TracingServerMiddleware(ServerMiddleware): trace_context: dict def __init__(self, trace_context: dict) -> None: ... class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): - """Wrapper to bundle server middleware into a single C++ one.""" + def __init__(self, factories: dict[str, ServerMiddlewareFactory]) -> None: ... def start_call( # type: ignore[override] @@ -1103,47 +583,12 @@ class _ServerMiddlewareWrapper(ServerMiddleware): def call_completed(self, exception: ArrowException) -> None: ... class _FlightServerFinalizer(_Weakrefable): - """ - A finalizer that shuts down the server on destruction. - See ARROW-16597. If the server is still active at interpreter - exit, the process may segfault. - """ def finalize(self) -> None: ... class FlightServerBase(_Weakrefable): - """A Flight service definition. - - To start the server, create an instance of this class with an - appropriate location. The server will be running as soon as the - instance is created; it is not required to call :meth:`serve`. - - Override methods to define your Flight service. - - Parameters - ---------- - location : str, tuple or Location optional, default None - Location to serve on. Either a gRPC URI like `grpc://localhost:port`, - a tuple of (host, port) pair, or a Location instance. - If None is passed then the server will be started on localhost with a - system provided random port. - auth_handler : ServerAuthHandler optional, default None - An authentication mechanism to use. May be None. - tls_certificates : list optional, default None - A list of (certificate, key) pairs. - verify_client : boolean optional, default False - If True, then enable mutual TLS: require the client to present - a client certificate, and validate the certificate. - root_certificates : bytes optional, default None - If enabling mutual TLS, this specifies the PEM-encoded root - certificate used to validate client certificates. - middleware : dict optional, default None - A dictionary of :class:`ServerMiddlewareFactory` instances. 
The - string keys can be used to retrieve the middleware instance within - RPC handlers (see :meth:`ServerCallContext.get_middleware`). - - """ + def __init__( self, location: str | tuple[str, int] | Location | None = None, @@ -1154,197 +599,46 @@ class FlightServerBase(_Weakrefable): middleware: dict[str, ServerMiddlewareFactory] | None = None, ): ... @property - def port(self) -> int: - """ - Get the port that this server is listening on. - - Returns a non-positive value if the operation is invalid - (e.g. init() was not called or server is listening on a domain - socket). - """ - def list_flights(self, context: ServerCallContext, criteria: str) -> Iterator[FlightInfo]: - """List flights available on this service. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - criteria : bytes - Filter criteria provided by the client. - - Returns - ------- - iterator of FlightInfo - - """ + def port(self) -> int: ... + + def list_flights(self, context: ServerCallContext, criteria: str) -> Iterator[FlightInfo]: ... + def get_flight_info( self, context: ServerCallContext, descriptor: FlightDescriptor - ) -> FlightInfo: - """Get information about a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - descriptor : FlightDescriptor - The descriptor for the flight provided by the client. - - Returns - ------- - FlightInfo - - """ - def get_schema(self, context: ServerCallContext, descriptor: FlightDescriptor) -> Schema: - """Get the schema of a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - descriptor : FlightDescriptor - The descriptor for the flight provided by the client. - - Returns - ------- - Schema - - """ + ) -> FlightInfo: ... + + def get_schema(self, context: ServerCallContext, descriptor: FlightDescriptor) -> Schema: ... + def do_put( self, context: ServerCallContext, descriptor: FlightDescriptor, reader: MetadataRecordBatchReader, writer: FlightMetadataWriter, - ) -> None: - """Write data to a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - descriptor : FlightDescriptor - The descriptor for the flight provided by the client. - reader : MetadataRecordBatchReader - A reader for data uploaded by the client. - writer : FlightMetadataWriter - A writer to send responses to the client. - - """ - def do_get(self, context: ServerCallContext, ticket: Ticket) -> FlightDataStream: - """Write data to a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - ticket : Ticket - The ticket for the flight. - - Returns - ------- - FlightDataStream - A stream of data to send back to the client. - - """ + ) -> None: ... + + def do_get(self, context: ServerCallContext, ticket: Ticket) -> FlightDataStream: ... 
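Since the handler docstrings (do_get, do_put, list_flights, ...) are removed above, here is a hedged end-to-end sketch of subclassing FlightServerBase. The class name, bind address and in-memory storage scheme are assumptions for illustration, not a recommended server implementation.

    import pyarrow.flight as flight

    class InMemoryFlightServer(flight.FlightServerBase):
        # Toy service: do_put stores an uploaded table keyed by the
        # descriptor command; do_get streams it back for a matching ticket.

        def __init__(self, location="grpc://0.0.0.0:8815"):
            super().__init__(location)
            self._tables = {}

        def do_put(self, context, descriptor, reader, writer):
            self._tables[descriptor.command] = reader.read_all()

        def do_get(self, context, ticket):
            return flight.RecordBatchStream(self._tables[ticket.ticket])

    if __name__ == "__main__":
        InMemoryFlightServer().serve()  # blocks until shutdown() is called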
+ def do_exchange( self, context: ServerCallContext, descriptor: FlightDescriptor, reader: MetadataRecordBatchReader, writer: MetadataRecordBatchWriter, - ) -> None: - """Write data to a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - descriptor : FlightDescriptor - The descriptor for the flight provided by the client. - reader : MetadataRecordBatchReader - A reader for data uploaded by the client. - writer : MetadataRecordBatchWriter - A writer to send responses to the client. - - """ - def list_actions(self, context: ServerCallContext) -> Iterable[Action]: - """List custom actions available on this server. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - - Returns - ------- - iterator of ActionType or tuple - - """ - def do_action(self, context: ServerCallContext, action: Action) -> Iterable[bytes]: - """Execute a custom action. - - This method should return an iterator, or it should be a - generator. Applications should override this method to - implement their own behavior. The default method raises a - NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - action : Action - The action to execute. - - Returns - ------- - iterator of bytes - - """ - def serve(self) -> None: - """Block until the server shuts down. - - This method only returns if shutdown() is called or a signal is - received. - """ - def run(self) -> None: - """Block until the server shuts down. - - .. deprecated:: 0.15.0 - Use the ``FlightServer.serve`` method instead - """ - def shutdown(self) -> None: - """Shut down the server, blocking until current requests finish. - - Do not call this directly from the implementation of a Flight - method, as then the server will block forever waiting for that - request to finish. Instead, call this method from a background - thread. - - This method should only be called once. - """ - def wait(self) -> None: - """Block until server is terminated with shutdown.""" + ) -> None: ... + + def list_actions(self, context: ServerCallContext) -> Iterable[Action]: ... + + def do_action(self, context: ServerCallContext, action: Action) -> Iterable[bytes]: ... + + def serve(self) -> None: ... + + def run(self) -> None: ... + + def shutdown(self) -> None: ... + + def wait(self) -> None: ... + def __enter__(self) -> Self: ... def __exit__(self, exc_type, exc_value, traceback): ... @@ -1359,39 +653,4 @@ def connect( write_size_limit_bytes: int | None = None, disable_server_verification: bool = False, generic_options: list[tuple[str, int | str]] | None = None, -) -> FlightClient: - """ - Connect to a Flight server. - - Parameters - ---------- - location : str, tuple, or Location - Location to connect to. Either a URI like "grpc://localhost:port", - a tuple of (host, port), or a Location instance. - tls_root_certs : bytes or None - PEM-encoded. - cert_chain: str or None - If provided, enables TLS mutual authentication. - private_key: str or None - If provided, enables TLS mutual authentication. - override_hostname : str or None - Override the hostname checked by TLS. Insecure, use with caution. - middleware : list or None - A list of ClientMiddlewareFactory instances to apply. 
- write_size_limit_bytes : int or None - A soft limit on the size of a data payload sent to the - server. Enabled if positive. If enabled, writing a record - batch that (when serialized) exceeds this limit will raise an - exception; the client can retry the write with a smaller - batch. - disable_server_verification : boolean or None - Disable verifying the server when using TLS. - Insecure, use with caution. - generic_options : list or None - A list of generic (string, int or string) options to pass to - the underlying transport. - - Returns - ------- - client : FlightClient - """ +) -> FlightClient: ... diff --git a/python/pyarrow-stubs/_fs.pyi b/python/pyarrow-stubs/_fs.pyi index 1f3667ef413..42ea8543738 100644 --- a/python/pyarrow-stubs/_fs.pyi +++ b/python/pyarrow-stubs/_fs.pyi @@ -36,77 +36,15 @@ from fsspec import AbstractFileSystem # type: ignore[import-untyped] from .lib import NativeFile, _Weakrefable + class FileType(enum.IntFlag): NotFound = enum.auto() Unknown = enum.auto() File = enum.auto() Directory = enum.auto() + class FileInfo(_Weakrefable): - """ - FileSystem entry info. - - Parameters - ---------- - path : str - The full path to the filesystem entry. - type : FileType - The type of the filesystem entry. - mtime : datetime or float, default None - If given, the modification time of the filesystem entry. - If a float is given, it is the number of seconds since the - Unix epoch. - mtime_ns : int, default None - If given, the modification time of the filesystem entry, - in nanoseconds since the Unix epoch. - `mtime` and `mtime_ns` are mutually exclusive. - size : int, default None - If given, the filesystem entry size in bytes. This should only - be given if `type` is `FileType.File`. - - Examples - -------- - Generate a file: - - >>> from pyarrow import fs - >>> local = fs.LocalFileSystem() - >>> path_fs = local_path + "/pyarrow-fs-example.dat" - >>> with local.open_output_stream(path_fs) as stream: - ... stream.write(b"data") - 4 - - Get FileInfo object using ``get_file_info()``: - - >>> file_info = local.get_file_info(path_fs) - >>> file_info - - - Inspect FileInfo attributes: - - >>> file_info.type - - - >>> file_info.is_file - True - - >>> file_info.path - '/.../pyarrow-fs-example.dat' - - >>> file_info.base_name - 'pyarrow-fs-example.dat' - - >>> file_info.size - 4 - - >>> file_info.extension - 'dat' - - >>> file_info.mtime # doctest: +SKIP - datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) - - >>> file_info.mtime_ns # doctest: +SKIP - 1656489370873922073 - """ def __init__( self, @@ -118,901 +56,161 @@ class FileInfo(_Weakrefable): size: int | None = None, ): ... @property - def type(self) -> FileType: - """ - Type of the file. - - The returned enum values can be the following: - - - FileType.NotFound: target does not exist - - FileType.Unknown: target exists but its type is unknown (could be a - special file such as a Unix socket or character device, or - Windows NUL / CON / ...) - - FileType.File: target is a regular file - - FileType.Directory: target is a regular directory - - Returns - ------- - type : FileType - """ + def type(self) -> FileType: ... + @property def is_file(self) -> bool: ... @property - def path(self) -> str: - """ - The full file path in the filesystem. - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.path - '/.../pyarrow-fs-example.dat' - """ - @property - def base_name(self) -> str: - """ - The file base name. - - Component after the last directory separator. 
- - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.base_name - 'pyarrow-fs-example.dat' - """ + def path(self) -> str: ... + @property - def size(self) -> int: - """ - The size in bytes, if available. + def base_name(self) -> str: ... - Only regular files are guaranteed to have a size. + @property + def size(self) -> int: ... - Returns - ------- - size : int or None - """ @property - def extension(self) -> str: - """ - The file extension. - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.extension - 'dat' - """ + def extension(self) -> str: ... + @property - def mtime(self) -> dt.datetime | None: - """ - The time of last modification, if available. - - Returns - ------- - mtime : datetime.datetime or None - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.mtime # doctest: +SKIP - datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) - """ + def mtime(self) -> dt.datetime | None: ... + @property - def mtime_ns(self) -> int | None: - """ - The time of last modification, if available, expressed in nanoseconds - since the Unix epoch. - - Returns - ------- - mtime_ns : int or None - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.mtime_ns # doctest: +SKIP - 1656489370873922073 - """ + def mtime_ns(self) -> int | None: ... + class FileSelector(_Weakrefable): - """ - File and directory selector. - - It contains a set of options that describes how to search for files and - directories. - - Parameters - ---------- - base_dir : str - The directory in which to select files. Relative paths also work, use - '.' for the current directory and '..' for the parent. - allow_not_found : bool, default False - The behavior if `base_dir` doesn't exist in the filesystem. - If false, an error is returned. - If true, an empty selection is returned. - recursive : bool, default False - Whether to recurse into subdirectories. - - Examples - -------- - List the contents of a directory and subdirectories: - - >>> selector_1 = fs.FileSelector(local_path, recursive=True) - >>> local.get_file_info(selector_1) # doctest: +SKIP - [, - , - ] - - List only the contents of the base directory: - - >>> selector_2 = fs.FileSelector(local_path) - >>> local.get_file_info(selector_2) # doctest: +SKIP - [, - ] - - Return empty selection if the directory doesn't exist: - - >>> selector_not_found = fs.FileSelector( - ... local_path + "/missing", recursive=True, allow_not_found=True - ... ) - >>> local.get_file_info(selector_not_found) - [] - """ base_dir: str allow_not_found: bool recursive: bool - def __init__(self, base_dir: str, allow_not_found: bool = False, recursive: bool = False): ... + def __init__(self, base_dir: str, allow_not_found: bool = False, + recursive: bool = False): ... + class FileSystem(_Weakrefable): - """ - Abstract file system API. - """ @classmethod - def from_uri(cls, uri: str) -> tuple[Self, str]: - """ - Create a new FileSystem from URI or Path. - - Recognized URI schemes are "file", "mock", "s3fs", "gs", "gcs", "hdfs" and "viewfs". - In addition, the argument can be a pathlib.Path object, or a string - describing an absolute local path. - - Parameters - ---------- - uri : string - URI-based path, for example: file:///some/local/path. - - Returns - ------- - tuple of (FileSystem, str path) - With (filesystem, path) tuple where path is the abstract path - inside the FileSystem instance. 
- - Examples - -------- - Create a new FileSystem subclass from a URI: - - >>> uri = "file:///{}/pyarrow-fs-example.dat".format(local_path) - >>> local_new, path_new = fs.FileSystem.from_uri(uri) - >>> local_new - >> path_new - '/.../pyarrow-fs-example.dat' - - Or from a s3 bucket: - - >>> fs.FileSystem.from_uri("s3://usgs-landsat/collection02/") - (, 'usgs-landsat/collection02') - """ - def equals(self, other: FileSystem) -> bool: - """ - Parameters - ---------- - other : pyarrow.fs.FileSystem - - Returns - ------- - bool - """ + def from_uri(cls, uri: str) -> tuple[Self, str]: ... + + def equals(self, other: FileSystem) -> bool: ... + @property - def type_name(self) -> str: - """ - The filesystem's type name. - """ - def get_file_info(self, paths_or_selector: str | FileSelector | list[str]) -> FileInfo | list[FileInfo]: - """ - Get info for the given files. - - Any symlink is automatically dereferenced, recursively. A non-existing - or unreachable file returns a FileStat object and has a FileType of - value NotFound. An exception indicates a truly exceptional condition - (low-level I/O error, etc.). - - Parameters - ---------- - paths_or_selector : FileSelector, path-like or list of path-likes - Either a selector object, a path-like object or a list of - path-like objects. The selector's base directory will not be - part of the results, even if it exists. If it doesn't exist, - use `allow_not_found`. - - Returns - ------- - FileInfo or list of FileInfo - Single FileInfo object is returned for a single path, otherwise - a list of FileInfo objects is returned. - - Examples - -------- - >>> local - - >>> local.get_file_info("/{}/pyarrow-fs-example.dat".format(local_path)) - - """ - def create_dir(self, path: str, *, recursive: bool = True) -> None: - """ - Create a directory and subdirectories. - - This function succeeds if the directory already exists. - - Parameters - ---------- - path : str - The path of the new directory. - recursive : bool, default True - Create nested directories as well. - """ - def delete_dir(self, path: str) -> None: - """ - Delete a directory and its contents, recursively. - - Parameters - ---------- - path : str - The path of the directory to be deleted. - """ + def type_name(self) -> str: ... + + def get_file_info(self, paths_or_selector: str | FileSelector | + list[str]) -> FileInfo | list[FileInfo]: ... + + def create_dir(self, path: str, *, recursive: bool = True) -> None: ... + + def delete_dir(self, path: str) -> None: ... + def delete_dir_contents( self, path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False - ) -> None: - """ - Delete a directory's contents, recursively. - - Like delete_dir, but doesn't delete the directory itself. - - Parameters - ---------- - path : str - The path of the directory to be deleted. - accept_root_dir : boolean, default False - Allow deleting the root directory's contents - (if path is empty or "/") - missing_dir_ok : boolean, default False - If False then an error is raised if path does - not exist - """ - def move(self, src: str, dest: str) -> None: - """ - Move / rename a file or directory. - - If the destination exists: - - if it is a non-empty directory, an error is returned - - otherwise, if it has the same type as the source, it is replaced - - otherwise, behavior is unspecified (implementation-dependent). - - Parameters - ---------- - src : str - The path of the file or the directory to be moved. - dest : str - The destination path where the file or directory is moved to. 
- - Examples - -------- - Create a new folder with a file: - - >>> local.create_dir("/tmp/other_dir") - >>> local.copy_file(path, "/tmp/move_example.dat") - - Move the file: - - >>> local.move("/tmp/move_example.dat", "/tmp/other_dir/move_example_2.dat") - - Inspect the file info: - - >>> local.get_file_info("/tmp/other_dir/move_example_2.dat") - - >>> local.get_file_info("/tmp/move_example.dat") - - - Delete the folder: - >>> local.delete_dir("/tmp/other_dir") - """ - def copy_file(self, src: str, dest: str) -> None: - """ - Copy a file. - - If the destination exists and is a directory, an error is returned. - Otherwise, it is replaced. - - Parameters - ---------- - src : str - The path of the file to be copied from. - dest : str - The destination path where the file is copied to. - - Examples - -------- - >>> local.copy_file(path, local_path + "/pyarrow-fs-example_copy.dat") - - Inspect the file info: - - >>> local.get_file_info(local_path + "/pyarrow-fs-example_copy.dat") - - >>> local.get_file_info(path) - - """ - def delete_file(self, path: str) -> None: - """ - Delete a file. - - Parameters - ---------- - path : str - The path of the file to be deleted. - """ - def open_input_file(self, path: str) -> NativeFile: - """ - Open an input file for random access reading. - - Parameters - ---------- - path : str - The source to open for reading. - - Returns - ------- - stream : NativeFile - - Examples - -------- - Print the data from the file with `open_input_file()`: - - >>> with local.open_input_file(path) as f: - ... print(f.readall()) - b'data' - """ + ) -> None: ... + + def move(self, src: str, dest: str) -> None: ... + + def copy_file(self, src: str, dest: str) -> None: ... + + def delete_file(self, path: str) -> None: ... + + def open_input_file(self, path: str) -> NativeFile: ... + def open_input_stream( self, path: str, compression: str | None = "detect", buffer_size: int | None = None - ) -> NativeFile: - """ - Open an input stream for sequential reading. - - Parameters - ---------- - path : str - The source to open for reading. - compression : str optional, default 'detect' - The compression algorithm to use for on-the-fly decompression. - If "detect" and source is a file path, then compression will be - chosen based on the file extension. - If None, no compression will be applied. Otherwise, a well-known - algorithm name must be supplied (e.g. "gzip"). - buffer_size : int optional, default None - If None or 0, no buffering will happen. Otherwise the size of the - temporary read buffer. - - Returns - ------- - stream : NativeFile - - Examples - -------- - Print the data from the file with `open_input_stream()`: - - >>> with local.open_input_stream(path) as f: - ... print(f.readall()) - b'data' - """ + ) -> NativeFile: ... + def open_output_stream( self, path: str, compression: str | None = "detect", buffer_size: int | None = None, metadata: dict[str, str] | None = None, - ) -> NativeFile: - """ - Open an output stream for sequential writing. - - If the target already exists, existing data is truncated. - - Parameters - ---------- - path : str - The source to open for writing. - compression : str optional, default 'detect' - The compression algorithm to use for on-the-fly compression. - If "detect" and source is a file path, then compression will be - chosen based on the file extension. - If None, no compression will be applied. Otherwise, a well-known - algorithm name must be supplied (e.g. "gzip"). 
- buffer_size : int optional, default None - If None or 0, no buffering will happen. Otherwise the size of the - temporary write buffer. - metadata : dict optional, default None - If not None, a mapping of string keys to string values. - Some filesystems support storing metadata along the file - (such as "Content-Type"). - Unsupported metadata keys will be ignored. - - Returns - ------- - stream : NativeFile - - Examples - -------- - >>> local = fs.LocalFileSystem() - >>> with local.open_output_stream(path) as stream: - ... stream.write(b"data") - 4 - """ + ) -> NativeFile: ... + def open_append_stream( self, path: str, compression: str | None = "detect", buffer_size: int | None = None, metadata: dict[str, str] | None = None, - ): - """ - Open an output stream for appending. - - If the target doesn't exist, a new empty file is created. - - .. note:: - Some filesystem implementations do not support efficient - appending to an existing file, in which case this method will - raise NotImplementedError. - Consider writing to multiple files (using e.g. the dataset layer) - instead. - - Parameters - ---------- - path : str - The source to open for writing. - compression : str optional, default 'detect' - The compression algorithm to use for on-the-fly compression. - If "detect" and source is a file path, then compression will be - chosen based on the file extension. - If None, no compression will be applied. Otherwise, a well-known - algorithm name must be supplied (e.g. "gzip"). - buffer_size : int optional, default None - If None or 0, no buffering will happen. Otherwise the size of the - temporary write buffer. - metadata : dict optional, default None - If not None, a mapping of string keys to string values. - Some filesystems support storing metadata along the file - (such as "Content-Type"). - Unsupported metadata keys will be ignored. - - Returns - ------- - stream : NativeFile - - Examples - -------- - Append new data to a FileSystem subclass with nonempty file: - - >>> with local.open_append_stream(path) as f: - ... f.write(b"+newly added") - 12 - - Print out the content to the file: - - >>> with local.open_input_file(path) as f: - ... print(f.readall()) - b'data+newly added' - """ - def normalize_path(self, path: str) -> str: - """ - Normalize filesystem path. - - Parameters - ---------- - path : str - The path to normalize - - Returns - ------- - normalized_path : str - The normalized path - """ + ): ... -class LocalFileSystem(FileSystem): - """ - A FileSystem implementation accessing files on the local machine. + def normalize_path(self, path: str) -> str: ... - Details such as symlinks are abstracted away (symlinks are always followed, - except when deleting an entry). - - Parameters - ---------- - use_mmap : bool, default False - Whether open_input_stream and open_input_file should return - a mmap'ed file or a regular file. - - Examples - -------- - Create a FileSystem object with LocalFileSystem constructor: - - >>> from pyarrow import fs - >>> local = fs.LocalFileSystem() - >>> local - - - and write data on to the file: - - >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: - ... stream.write(b"data") - 4 - >>> with local.open_input_stream("/tmp/local_fs.dat") as stream: - ... 
print(stream.readall()) - b'data' - - Create a FileSystem object inferred from a URI of the saved file: - - >>> local_new, path = fs.LocalFileSystem().from_uri("/tmp/local_fs.dat") - >>> local_new - >> path - '/tmp/local_fs.dat' - - Check if FileSystems `local` and `local_new` are equal: - - >>> local.equals(local_new) - True - - Compare two different FileSystems: - - >>> local2 = fs.LocalFileSystem(use_mmap=True) - >>> local.equals(local2) - False - - Copy a file and print out the data: - - >>> local.copy_file("/tmp/local_fs.dat", "/tmp/local_fs-copy.dat") - >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as stream: - ... print(stream.readall()) - b'data' - - Open an output stream for appending, add text and print the new data: - - >>> with local.open_append_stream("/tmp/local_fs-copy.dat") as f: - ... f.write(b"+newly added") - 12 - - >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as f: - ... print(f.readall()) - b'data+newly added' - - Create a directory, copy a file into it and then delete the whole directory: - - >>> local.create_dir("/tmp/new_folder") - >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") - >>> local.get_file_info("/tmp/new_folder") - - >>> local.delete_dir("/tmp/new_folder") - >>> local.get_file_info("/tmp/new_folder") - - - Create a directory, copy a file into it and then delete - the content of the directory: - - >>> local.create_dir("/tmp/new_folder") - >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") - >>> local.get_file_info("/tmp/new_folder/local_fs.dat") - - >>> local.delete_dir_contents("/tmp/new_folder") - >>> local.get_file_info("/tmp/new_folder") - - >>> local.get_file_info("/tmp/new_folder/local_fs.dat") - - - Create a directory, copy a file into it and then delete - the file from the directory: - - >>> local.create_dir("/tmp/new_folder") - >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") - >>> local.delete_file("/tmp/new_folder/local_fs.dat") - >>> local.get_file_info("/tmp/new_folder/local_fs.dat") - - >>> local.get_file_info("/tmp/new_folder") - - - Move the file: - >>> local.move("/tmp/local_fs-copy.dat", "/tmp/new_folder/local_fs-copy.dat") - >>> local.get_file_info("/tmp/new_folder/local_fs-copy.dat") - - >>> local.get_file_info("/tmp/local_fs-copy.dat") - - - To finish delete the file left: - >>> local.delete_file("/tmp/local_fs.dat") - """ +class LocalFileSystem(FileSystem): def __init__(self, *, use_mmap: bool = False) -> None: ... -class SubTreeFileSystem(FileSystem): - """ - Delegates to another implementation after prepending a fixed base path. - - This is useful to expose a logical view of a subtree of a filesystem, - for example a directory in a LocalFileSystem. - - Note, that this makes no security guarantee. For example, symlinks may - allow to "escape" the subtree and access other parts of the underlying - filesystem. - - Parameters - ---------- - base_path : str - The root of the subtree. - base_fs : FileSystem - FileSystem object the operations delegated to. - - Examples - -------- - Create a LocalFileSystem instance: - - >>> from pyarrow import fs - >>> local = fs.LocalFileSystem() - >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: - ... stream.write(b"data") - 4 - Create a directory and a SubTreeFileSystem instance: - - >>> local.create_dir("/tmp/sub_tree") - >>> subtree = fs.SubTreeFileSystem("/tmp/sub_tree", local) - - Write data into the existing file: - - >>> with subtree.open_append_stream("sub_tree_fs.dat") as f: - ... 
f.write(b"+newly added") - 12 - - Print out the attributes: - - >>> subtree.base_fs - - >>> subtree.base_path - '/tmp/sub_tree/' - - Get info for the given directory or given file: - - >>> subtree.get_file_info("") - - >>> subtree.get_file_info("sub_tree_fs.dat") - - - Delete the file and directory: - - >>> subtree.delete_file("sub_tree_fs.dat") - >>> local.delete_dir("/tmp/sub_tree") - >>> local.delete_file("/tmp/local_fs.dat") +class SubTreeFileSystem(FileSystem): - For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. - """ def __init__(self, base_path: str, base_fs: FileSystem): ... @property def base_path(self) -> str: ... @property def base_fs(self) -> FileSystem: ... + class _MockFileSystem(FileSystem): def __init__(self, current_time: dt.datetime | None = None) -> None: ... -class PyFileSystem(FileSystem): - """ - A FileSystem with behavior implemented in Python. - - Parameters - ---------- - handler : FileSystemHandler - The handler object implementing custom filesystem behavior. - - Examples - -------- - Create an fsspec-based filesystem object for GitHub: - - >>> from fsspec.implementations import github - >>> gfs = github.GithubFileSystem("apache", "arrow") # doctest: +SKIP - - Get a PyArrow FileSystem object: - >>> from pyarrow.fs import PyFileSystem, FSSpecHandler - >>> pa_fs = PyFileSystem(FSSpecHandler(gfs)) # doctest: +SKIP - - Use :func:`~pyarrow.fs.FileSystem` functionality ``get_file_info()``: +class PyFileSystem(FileSystem): - >>> pa_fs.get_file_info("README.md") # doctest: +SKIP - - """ def __init__(self, handler: FileSystemHandler) -> None: ... @property - def handler(self) -> FileSystemHandler: - """ - The filesystem's underlying handler. + def handler(self) -> FileSystemHandler: ... - Returns - ------- - handler : FileSystemHandler - """ class FileSystemHandler(ABC): - """ - An abstract class exposing methods to implement PyFileSystem's behavior. - """ - @abstractmethod - def get_type_name(self) -> str: - """ - Implement PyFileSystem.type_name. - """ + @abstractmethod - def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: - """ - Implement PyFileSystem.get_file_info(paths). - - Parameters - ---------- - paths : list of str - paths for which we want to retrieve the info. - """ + def get_type_name(self) -> str: ... + @abstractmethod - def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: - """ - Implement PyFileSystem.get_file_info(selector). + def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: ... - Parameters - ---------- - selector : FileSelector - selector for which we want to retrieve the info. - """ + @abstractmethod + def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: ... @abstractmethod - def create_dir(self, path: str, recursive: bool) -> None: - """ - Implement PyFileSystem.create_dir(...). - - Parameters - ---------- - path : str - path of the directory. - recursive : bool - if the parent directories should be created too. - """ + def create_dir(self, path: str, recursive: bool) -> None: ... + @abstractmethod - def delete_dir(self, path: str) -> None: - """ - Implement PyFileSystem.delete_dir(...). - - Parameters - ---------- - path : str - path of the directory. - """ + def delete_dir(self, path: str) -> None: ... + @abstractmethod - def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: - """ - Implement PyFileSystem.delete_dir_contents(...). - - Parameters - ---------- - path : str - path of the directory. 
- missing_dir_ok : bool - if False an error should be raised if path does not exist - """ + def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: ... + @abstractmethod - def delete_root_dir_contents(self) -> None: - """ - Implement PyFileSystem.delete_dir_contents("/", accept_root_dir=True). - """ + def delete_root_dir_contents(self) -> None: ... + @abstractmethod - def delete_file(self, path: str) -> None: - """ - Implement PyFileSystem.delete_file(...). - - Parameters - ---------- - path : str - path of the file. - """ + def delete_file(self, path: str) -> None: ... + @abstractmethod - def move(self, src: str, dest: str) -> None: - """ - Implement PyFileSystem.move(...). - - Parameters - ---------- - src : str - path of what should be moved. - dest : str - path of where it should be moved to. - """ + def move(self, src: str, dest: str) -> None: ... @abstractmethod - def copy_file(self, src: str, dest: str) -> None: - """ - Implement PyFileSystem.copy_file(...). - - Parameters - ---------- - src : str - path of what should be copied. - dest : str - path of where it should be copied to. - """ + def copy_file(self, src: str, dest: str) -> None: ... + @abstractmethod - def open_input_stream(self, path: str) -> NativeFile: - """ - Implement PyFileSystem.open_input_stream(...). - - Parameters - ---------- - path : str - path of what should be opened. - """ + def open_input_stream(self, path: str) -> NativeFile: ... + @abstractmethod - def open_input_file(self, path: str) -> NativeFile: - """ - Implement PyFileSystem.open_input_file(...). - - Parameters - ---------- - path : str - path of what should be opened. - """ + def open_input_file(self, path: str) -> NativeFile: ... + @abstractmethod - def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: - """ - Implement PyFileSystem.open_output_stream(...). - - Parameters - ---------- - path : str - path of what should be opened. - metadata : mapping - Mapping of string keys to string values. - Some filesystems support storing metadata along the file - (such as "Content-Type"). - """ + def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ... @abstractmethod - def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: - """ - Implement PyFileSystem.open_append_stream(...). - - Parameters - ---------- - path : str - path of what should be opened. - metadata : mapping - Mapping of string keys to string values. - Some filesystems support storing metadata along the file - (such as "Content-Type"). - """ + def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ... + @abstractmethod - def normalize_path(self, path: str) -> str: - """ - Implement PyFileSystem.normalize_path(...). - - Parameters - ---------- - path : str - path of what should be normalized. - """ + def normalize_path(self, path: str) -> str: ... + SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] diff --git a/python/pyarrow-stubs/_gcsfs.pyi b/python/pyarrow-stubs/_gcsfs.pyi index 0ced106615a..631c7ae4932 100644 --- a/python/pyarrow-stubs/_gcsfs.pyi +++ b/python/pyarrow-stubs/_gcsfs.pyi @@ -20,59 +20,8 @@ import datetime as dt from ._fs import FileSystem from .lib import KeyValueMetadata -class GcsFileSystem(FileSystem): - """ - Google Cloud Storage (GCS) backed FileSystem implementation - - By default uses the process described in https://google.aip.dev/auth/4110 - to resolve credentials. 
If not running on Google Cloud Platform (GCP), - this generally requires the environment variable - GOOGLE_APPLICATION_CREDENTIALS to point to a JSON file - containing credentials. - - Note: GCS buckets are special and the operations available on them may be - limited or more expensive than expected compared to local file systems. - Note: When pickling a GcsFileSystem that uses default credentials, resolution - credentials are not stored in the serialized data. Therefore, when unpickling - it is assumed that the necessary credentials are in place for the target - process. - - Parameters - ---------- - anonymous : boolean, default False - Whether to connect anonymously. - If true, will not attempt to look up credentials using standard GCP - configuration methods. - access_token : str, default None - GCP access token. If provided, temporary credentials will be fetched by - assuming this role; also, a `credential_token_expiration` must be - specified as well. - target_service_account : str, default None - An optional service account to try to impersonate when accessing GCS. This - requires the specified credential user or service account to have the necessary - permissions. - credential_token_expiration : datetime, default None - Expiration for credential generated with an access token. Must be specified - if `access_token` is specified. - default_bucket_location : str, default 'US' - GCP region to create buckets in. - scheme : str, default 'https' - GCS connection transport scheme. - endpoint_override : str, default None - Override endpoint with a connect string such as "localhost:9000" - default_metadata : mapping or pyarrow.KeyValueMetadata, default None - Default metadata for `open_output_stream`. This will be ignored if - non-empty metadata is passed to `open_output_stream`. - retry_time_limit : timedelta, default None - Set the maximum amount of time the GCS client will attempt to retry - transient errors. Subsecond granularity is ignored. - project_id : str, default None - The GCP project identifier to use for creating buckets. - If not set, the library uses the GOOGLE_CLOUD_PROJECT environment - variable. Most I/O operations do not need a project id, only applications - that create new buckets need a project id. - """ +class GcsFileSystem(FileSystem): def __init__( self, @@ -89,12 +38,7 @@ class GcsFileSystem(FileSystem): project_id: str | None = None, ): ... @property - def default_bucket_location(self) -> str: - """ - The GCP location this filesystem will write to. - """ + def default_bucket_location(self) -> str: ... + @property - def project_id(self) -> str: - """ - The GCP project id this filesystem will use. - """ + def project_id(self) -> str: ... diff --git a/python/pyarrow-stubs/_hdfs.pyi b/python/pyarrow-stubs/_hdfs.pyi index ed367379171..ee1253d64b6 100644 --- a/python/pyarrow-stubs/_hdfs.pyi +++ b/python/pyarrow-stubs/_hdfs.pyi @@ -19,42 +19,9 @@ from _typeshed import StrPath from ._fs import FileSystem -class HadoopFileSystem(FileSystem): - """ - HDFS backed FileSystem implementation - - Parameters - ---------- - host : str - HDFS host to connect to. Set to "default" for fs.defaultFS from - core-site.xml. - port : int, default 8020 - HDFS port to connect to. Set to 0 for default or logical (HA) nodes. - user : str, default None - Username when connecting to HDFS; None implies login user. - replication : int, default 3 - Number of copies each block will have. 
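A construction sketch for GcsFileSystem, condensed from the parameters documented in the removed docstring; anonymous access avoids credential lookup, and the object path in the comment is a placeholder.

    from pyarrow import fs

    gcs = fs.GcsFileSystem(anonymous=True, default_bucket_location="US")
    print(gcs.default_bucket_location)   # "US"
    # gcs.get_file_info("some-bucket/some/object")   # placeholder path, needs network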
- buffer_size : int, default 0 - If 0, no buffering will happen otherwise the size of the temporary read - and write buffer. - default_block_size : int, default None - None means the default configuration for HDFS, a typical block size is - 128 MB. - kerb_ticket : string or path, default None - If not None, the path to the Kerberos ticket cache. - extra_conf : dict, default None - Extra key/value pairs for configuration; will override any - hdfs-site.xml properties. - Examples - -------- - >>> from pyarrow import fs - >>> hdfs = fs.HadoopFileSystem( - ... host, port, user=user, kerb_ticket=ticket_cache_path - ... ) # doctest: +SKIP +class HadoopFileSystem(FileSystem): - For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. - """ def __init__( self, host: str, @@ -68,25 +35,4 @@ class HadoopFileSystem(FileSystem): extra_conf: dict | None = None, ): ... @staticmethod - def from_uri(uri: str) -> HadoopFileSystem: # type: ignore[override] - """ - Instantiate HadoopFileSystem object from an URI string. - - The following two calls are equivalent - - * ``HadoopFileSystem.from_uri('hdfs://localhost:8020/?user=test\ -&replication=1')`` - * ``HadoopFileSystem('localhost', port=8020, user='test', \ -replication=1)`` - - Parameters - ---------- - uri : str - A string URI describing the connection to HDFS. - In order to change the user, replication, buffer_size or - default_block_size pass the values as query parts. - - Returns - ------- - HadoopFileSystem - """ + def from_uri(uri: str) -> HadoopFileSystem: ... # type: ignore[override] diff --git a/python/pyarrow-stubs/_ipc.pyi b/python/pyarrow-stubs/_ipc.pyi index 1676e49e962..23d770070e7 100644 --- a/python/pyarrow-stubs/_ipc.pyi +++ b/python/pyarrow-stubs/_ipc.pyi @@ -34,6 +34,7 @@ from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakref from .io import Buffer, Codec, NativeFile from ._types import DictionaryMemo, KeyValueMetadata + class MetadataVersion(enum.IntEnum): V1 = enum.auto() V2 = enum.auto() @@ -41,22 +42,8 @@ class MetadataVersion(enum.IntEnum): V4 = enum.auto() V5 = enum.auto() + class WriteStats(NamedTuple): - """IPC write statistics - - Parameters - ---------- - num_messages : int - Number of messages. - num_record_batches : int - Number of record batches. - num_dictionary_batches : int - Number of dictionary batches. - num_dictionary_deltas : int - Delta of dictionaries. - num_replaced_dictionaries : int - Number of replaced dictionaries. - """ num_messages: int num_record_batches: int @@ -64,22 +51,8 @@ class WriteStats(NamedTuple): num_dictionary_deltas: int num_replaced_dictionaries: int + class ReadStats(NamedTuple): - """IPC read statistics - - Parameters - ---------- - num_messages : int - Number of messages. - num_record_batches : int - Number of record batches. - num_dictionary_batches : int - Number of dictionary batches. - num_dictionary_deltas : int - Delta of dictionaries. - num_replaced_dictionaries : int - Number of replaced dictionaries. - """ num_messages: int num_record_batches: int @@ -87,26 +60,13 @@ class ReadStats(NamedTuple): num_dictionary_deltas: int num_replaced_dictionaries: int + class IpcReadOptions(_Weakrefable): - """ - Serialization options for reading IPC format. - - Parameters - ---------- - ensure_native_endian : bool, default True - Whether to convert incoming data to platform-native endianness. 
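The two equivalent constructions from the removed HadoopFileSystem.from_uri docstring, restated as a sketch; host, port and user are placeholders, and both calls require libhdfs plus a reachable cluster.

    from pyarrow import fs

    hdfs = fs.HadoopFileSystem.from_uri("hdfs://localhost:8020/?user=test&replication=1")
    # ...is equivalent to:
    hdfs = fs.HadoopFileSystem("localhost", port=8020, user="test", replication=1)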
- use_threads : bool - Whether to use the global CPU thread pool to parallelize any - computational tasks like decompression - included_fields : list - If empty (the default), return all deserialized fields. - If non-empty, the values are the indices of fields to read on - the top-level schema - """ ensure_native_endian: bool use_threads: bool included_fields: list[int] + def __init__( self, *, @@ -115,40 +75,8 @@ class IpcReadOptions(_Weakrefable): included_fields: list[int] | None = None, ) -> None: ... + class IpcWriteOptions(_Weakrefable): - """ - Serialization options for the IPC format. - - Parameters - ---------- - metadata_version : MetadataVersion, default MetadataVersion.V5 - The metadata version to write. V5 is the current and latest, - V4 is the pre-1.0 metadata version (with incompatible Union layout). - allow_64bit : bool, default False - If true, allow field lengths that don't fit in a signed 32-bit int. - use_legacy_format : bool, default False - Whether to use the pre-Arrow 0.15 IPC format. - compression : str, Codec, or None - compression codec to use for record batch buffers. - If None then batch buffers will be uncompressed. - Must be "lz4", "zstd" or None. - To specify a compression_level use `pyarrow.Codec` - use_threads : bool - Whether to use the global CPU thread pool to parallelize any - computational tasks like compression. - emit_dictionary_deltas : bool - Whether to emit dictionary deltas. Default is false for maximum - stream compatibility. - unify_dictionaries : bool - If true then calls to write_table will attempt to unify dictionaries - across all batches in the table. This can help avoid the need for - replacement dictionaries (which the file format does not support) - but requires computing the unified dictionary and then remapping - the indices arrays. - - This parameter is ignored when writing to the IPC stream format as - the IPC stream format can support replacement dictionaries. - """ metadata_version: MetadataVersion allow_64bit: bool @@ -157,6 +85,7 @@ class IpcWriteOptions(_Weakrefable): use_threads: bool emit_dictionary_deltas: bool unify_dictionaries: bool + def __init__( self, *, @@ -169,10 +98,8 @@ class IpcWriteOptions(_Weakrefable): unify_dictionaries: bool = False, ) -> None: ... + class Message(_Weakrefable): - """ - Container for an Arrow IPC message with metadata and optional body - """ @property def type(self) -> str: ... @@ -183,416 +110,134 @@ class Message(_Weakrefable): @property def body(self) -> Buffer | None: ... def equals(self, other: Message) -> bool: ... + def serialize_to( self, sink: NativeFile, alignment: int = 8, memory_pool: MemoryPool | None = None - ): - """ - Write message to generic OutputStream - - Parameters - ---------- - sink : NativeFile - alignment : int, default 8 - Byte alignment for metadata and body - memory_pool : MemoryPool, default None - Uses default memory pool if not specified - """ - def serialize(self, alignment: int = 8, memory_pool: MemoryPool | None = None) -> Buffer: - """ - Write message as encapsulated IPC message - - Parameters - ---------- - alignment : int, default 8 - Byte alignment for metadata and body - memory_pool : MemoryPool, default None - Uses default memory pool if not specified - - Returns - ------- - serialized : Buffer - """ + ): ... + + def serialize(self, alignment: int = 8, memory_pool: MemoryPool | + None = None) -> Buffer: ... 
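A self-contained sketch of how IpcWriteOptions feeds the high-level writer API; pa.ipc.new_stream and the in-memory BufferOutputStream are standard pyarrow, and "zstd" assumes the codec was built in (true for the official wheels).

    import pyarrow as pa

    table = pa.table({"x": [1, 2, 3]})
    opts = pa.ipc.IpcWriteOptions(compression="zstd", emit_dictionary_deltas=True)
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema, options=opts) as writer:
        writer.write_table(table)
    print(writer.stats)           # WriteStats(num_messages=..., num_record_batches=..., ...)
    ipc_bytes = sink.getvalue()   # Buffer holding the serialized stream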
+ class MessageReader(_Weakrefable): - """ - Interface for reading Message objects from some source (like an - InputStream) - """ + @classmethod - def open_stream(cls, source: bytes | NativeFile | IOBase | SupportPyBuffer) -> Self: - """ - Open stream from source, if you want to use memory map use - MemoryMappedFile as source. - - Parameters - ---------- - source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object - A readable source, like an InputStream - """ + def open_stream(cls, source: bytes | NativeFile | + IOBase | SupportPyBuffer) -> Self: ... + def __iter__(self) -> Self: ... - def read_next_message(self) -> Message: - """ - Read next Message from the stream. - - Raises - ------ - StopIteration - At end of stream - """ + def read_next_message(self) -> Message: ... + __next__ = read_next_message # ---------------------------------------------------------------------- # File and stream readers and writers + class _CRecordBatchWriter(_Weakrefable): - """The base RecordBatchWriter wrapper. - - Provides common implementations of convenience methods. Should not - be instantiated directly by user code. - """ - def write(self, table_or_batch: Table | RecordBatch): - """ - Write RecordBatch or Table to stream. - - Parameters - ---------- - table_or_batch : {RecordBatch, Table} - """ + + def write(self, table_or_batch: Table | RecordBatch): ... + def write_batch( self, batch: RecordBatch, custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None, - ): - """ - Write RecordBatch to stream. - - Parameters - ---------- - batch : RecordBatch - custom_metadata : mapping or KeyValueMetadata - Keys and values must be string-like / coercible to bytes - """ - def write_table(self, table: Table, max_chunksize: int | None = None) -> None: - """ - Write Table to stream in (contiguous) RecordBatch objects. - - Parameters - ---------- - table : Table - max_chunksize : int, default None - Maximum number of rows for RecordBatch chunks. Individual chunks may - be smaller depending on the chunk layout of individual columns. - """ - def close(self) -> None: - """ - Close stream and write end-of-stream 0 marker. - """ + ): ... + + def write_table(self, table: Table, max_chunksize: int | None = None) -> None: ... + + def close(self) -> None: ... + def __enter__(self) -> Self: ... def __exit__(self, exc_type, exc_val, exc_tb): ... @property - def stats(self) -> WriteStats: - """ - Current IPC write statistics. - """ + def stats(self) -> WriteStats: ... + class _RecordBatchStreamWriter(_CRecordBatchWriter): @property def _use_legacy_format(self) -> bool: ... @property def _metadata_version(self) -> MetadataVersion: ... - def _open(self, sink, schema: Schema, options: IpcWriteOptions = IpcWriteOptions()): ... + def _open(self, sink, schema: Schema, + options: IpcWriteOptions = IpcWriteOptions()): ... class _ReadPandasMixin: - def read_pandas(self, **options) -> pd.DataFrame: - """ - Read contents of stream to a pandas.DataFrame. - - Read all record batches as a pyarrow.Table then convert it to a - pandas.DataFrame using Table.to_pandas. - - Parameters - ---------- - **options - Arguments to forward to :meth:`Table.to_pandas`. + def read_pandas(self, **options) -> pd.DataFrame: ... - Returns - ------- - df : pandas.DataFrame - """ class RecordBatchReader(_Weakrefable): - """Base class for reading stream of record batches. - - Record batch readers function as iterators of record batches that also - provide the schema (without the need to get any batches). 
- - Warnings - -------- - Do not call this class's constructor directly, use one of the - ``RecordBatchReader.from_*`` functions instead. - - Notes - ----- - To import and export using the Arrow C stream interface, use the - ``_import_from_c`` and ``_export_to_c`` methods. However, keep in mind this - interface is intended for expert users. - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([("x", pa.int64())]) - >>> def iter_record_batches(): - ... for i in range(2): - ... yield pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], schema=schema) - >>> reader = pa.RecordBatchReader.from_batches(schema, iter_record_batches()) - >>> print(reader.schema) - x: int64 - >>> for batch in reader: - ... print(batch) - pyarrow.RecordBatch - x: int64 - ---- - x: [1,2,3] - pyarrow.RecordBatch - x: int64 - ---- - x: [1,2,3] - """ def __iter__(self) -> Self: ... - def read_next_batch(self) -> RecordBatch: - """ - Read next RecordBatch from the stream. - - Raises - ------ - StopIteration: - At end of stream. - - Returns - ------- - RecordBatch - """ + def read_next_batch(self) -> RecordBatch: ... + __next__ = read_next_batch @property - def schema(self) -> Schema: - """ - Shared schema of the record batches in the stream. - - Returns - ------- - Schema - """ - def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: - """ - Read next RecordBatch from the stream along with its custom metadata. - - Raises - ------ - StopIteration: - At end of stream. - - Returns - ------- - batch : RecordBatch - custom_metadata : KeyValueMetadata - """ + def schema(self) -> Schema: ... + + def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: ... + def iter_batches_with_custom_metadata( self, - ) -> Iterator[RecordBatchWithMetadata]: - """ - Iterate over record batches from the stream along with their custom - metadata. - - Yields - ------ - RecordBatchWithMetadata - """ - def read_all(self) -> Table: - """ - Read all record batches as a pyarrow.Table. - - Returns - ------- - Table - """ - read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] - def close(self) -> None: - """ - Release any resources associated with the reader. - """ + ) -> Iterator[RecordBatchWithMetadata]: ... + + def read_all(self) -> Table: ... + + # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] + read_pandas = _ReadPandasMixin.read_pandas + def close(self) -> None: ... + def __enter__(self) -> Self: ... def __exit__(self, exc_type, exc_val, exc_tb): ... - def cast(self, target_schema: Schema) -> Self: - """ - Wrap this reader with one that casts each batch lazily as it is pulled. - Currently only a safe cast to target_schema is implemented. - - Parameters - ---------- - target_schema : Schema - Schema to cast to, the names and order of fields must match. - - Returns - ------- - RecordBatchReader - """ - def _export_to_c(self, out_ptr: int) -> None: - """ - Export to a C ArrowArrayStream struct, given its pointer. - - Parameters - ---------- - out_ptr: int - The raw pointer to a C ArrowArrayStream struct. - - Be careful: if you don't pass the ArrowArrayStream struct to a - consumer, array memory will leak. This is a low-level function - intended for expert users. - """ + def cast(self, target_schema: Schema) -> Self: ... + + def _export_to_c(self, out_ptr: int) -> None: ... 
+ @classmethod - def _import_from_c(cls, in_ptr: int) -> Self: - """ - Import RecordBatchReader from a C ArrowArrayStream struct, - given its pointer. - - Parameters - ---------- - in_ptr: int - The raw pointer to a C ArrowArrayStream struct. - - This is a low-level function intended for expert users. - """ - def __arrow_c_stream__(self, requested_schema=None): - """ - Export to a C ArrowArrayStream PyCapsule. - - Parameters - ---------- - requested_schema : PyCapsule, default None - The schema to which the stream should be casted, passed as a - PyCapsule containing a C ArrowSchema representation of the - requested schema. - - Returns - ------- - PyCapsule - A capsule containing a C ArrowArrayStream struct. - """ + def _import_from_c(cls, in_ptr: int) -> Self: ... + + def __arrow_c_stream__(self, requested_schema=None): ... + @classmethod - def _import_from_c_capsule(cls, stream) -> Self: - """ - Import RecordBatchReader from a C ArrowArrayStream PyCapsule. - - Parameters - ---------- - stream: PyCapsule - A capsule containing a C ArrowArrayStream PyCapsule. - - Returns - ------- - RecordBatchReader - """ + def _import_from_c_capsule(cls, stream) -> Self: ... + @classmethod - def from_stream(cls, data: SupportArrowStream, schema: Schema | None = None) -> Self: - """ - Create RecordBatchReader from a Arrow-compatible stream object. - - This accepts objects implementing the Arrow PyCapsule Protocol for - streams, i.e. objects that have a ``__arrow_c_stream__`` method. - - Parameters - ---------- - data : Arrow-compatible stream object - Any object that implements the Arrow PyCapsule Protocol for - streams. - schema : Schema, default None - The schema to which the stream should be casted, if supported - by the stream object. - - Returns - ------- - RecordBatchReader - """ + def from_stream(cls, data: SupportArrowStream, + schema: Schema | None = None) -> Self: ... + @classmethod - def from_batches(cls, schema: Schema, batches: Iterable[RecordBatch]) -> Self: - """ - Create RecordBatchReader from an iterable of batches. - - Parameters - ---------- - schema : Schema - The shared schema of the record batches - batches : Iterable[RecordBatch] - The batches that this reader will return. - - Returns - ------- - reader : RecordBatchReader - """ + def from_batches(cls, schema: Schema, batches: Iterable[RecordBatch]) -> Self: ... + class _RecordBatchStreamReader(RecordBatchReader): @property - def stats(self) -> ReadStats: - """ - Current IPC read statistics. - """ + def stats(self) -> ReadStats: ... -class _RecordBatchFileWriter(_RecordBatchStreamWriter): ... -class RecordBatchWithMetadata(NamedTuple): - """RecordBatch with its custom metadata +class _RecordBatchFileWriter(_RecordBatchStreamWriter): + ... + - Parameters - ---------- - batch : RecordBatch - custom_metadata : KeyValueMetadata - """ +class RecordBatchWithMetadata(NamedTuple): batch: RecordBatch custom_metadata: KeyValueMetadata + class _RecordBatchFileReader(_Weakrefable): @property - def num_record_batches(self) -> int: - """ - The number of record batches in the IPC file. - """ - def get_batch(self, i: int) -> RecordBatch: - """ - Read the record batch with the given index. - - Parameters - ---------- - i : int - The index of the record batch in the IPC file. - - Returns - ------- - batch : RecordBatch - """ + def num_record_batches(self) -> int: ... + + def get_batch(self, i: int) -> RecordBatch: ... 
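The iterator example removed from the RecordBatchReader docstring, kept here in sketch form, since from_batches is the main constructor these stubs describe.

    import pyarrow as pa

    schema = pa.schema([("x", pa.int64())])

    def iter_record_batches():
        for _ in range(2):
            yield pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], schema=schema)

    reader = pa.RecordBatchReader.from_batches(schema, iter_record_batches())
    print(reader.schema)      # x: int64
    for batch in reader:      # or reader.read_all() for a single Table
        print(batch.num_rows)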
+ get_record_batch = get_batch - def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: - """ - Read the record batch with the given index along with - its custom metadata - - Parameters - ---------- - i : int - The index of the record batch in the IPC file. - - Returns - ------- - batch : RecordBatch - custom_metadata : KeyValueMetadata - """ - def read_all(self) -> Table: - """ - Read all record batches as a pyarrow.Table - """ - read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] + def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: ... + + def read_all(self) -> Table: ... + + # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] + read_pandas = _ReadPandasMixin.read_pandas def __enter__(self) -> Self: ... def __exit__(self, exc_type, exc_val, exc_tb): ... @property @@ -600,105 +245,30 @@ class _RecordBatchFileReader(_Weakrefable): @property def stats(self) -> ReadStats: ... -def get_tensor_size(tensor: Tensor) -> int: - """ - Return total size of serialized Tensor including metadata and padding. - - Parameters - ---------- - tensor : Tensor - The tensor for which we want to known the size. - """ - -def get_record_batch_size(batch: RecordBatch) -> int: - """ - Return total size of serialized RecordBatch including metadata and padding. - - Parameters - ---------- - batch : RecordBatch - The recordbatch for which we want to know the size. - """ - -def write_tensor(tensor: Tensor, dest: NativeFile) -> int: - """ - Write pyarrow.Tensor to pyarrow.NativeFile object its current position. - - Parameters - ---------- - tensor : pyarrow.Tensor - dest : pyarrow.NativeFile - - Returns - ------- - bytes_written : int - Total number of bytes written to the file - """ - -def read_tensor(source: NativeFile) -> Tensor: - """Read pyarrow.Tensor from pyarrow.NativeFile object from current - position. If the file source supports zero copy (e.g. a memory map), then - this operation does not allocate any memory. This function not assume that - the stream is aligned - - Parameters - ---------- - source : pyarrow.NativeFile - - Returns - ------- - tensor : Tensor - - """ - -def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: - """ - Read length-prefixed message from file or buffer-like object - - Parameters - ---------- - source : pyarrow.NativeFile, file-like object, or buffer-like object - - Returns - ------- - message : Message - """ - -def read_schema(obj: Buffer | Message, dictionary_memo: DictionaryMemo | None = None) -> Schema: - """ - Read Schema from message or buffer - - Parameters - ---------- - obj : buffer or Message - dictionary_memo : DictionaryMemo, optional - Needed to be able to reconstruct dictionary-encoded fields - with read_record_batch - - Returns - ------- - schema : Schema - """ + +def get_tensor_size(tensor: Tensor) -> int: ... + + +def get_record_batch_size(batch: RecordBatch) -> int: ... + + +def write_tensor(tensor: Tensor, dest: NativeFile) -> int: ... + + +def read_tensor(source: NativeFile) -> Tensor: ... + + +def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: ... + + +def read_schema(obj: Buffer | Message, dictionary_memo: DictionaryMemo | + None = None) -> Schema: ... + def read_record_batch( obj: Message | SupportPyBuffer, schema: Schema, dictionary_memo: DictionaryMemo | None = None -) -> RecordBatch: - """ - Read RecordBatch from message, given a known schema. 
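A round-trip sketch for the message-level helpers declared above; RecordBatch.serialize() (part of the core API, not shown in this diff) produces an encapsulated IPC message that read_record_batch can decode against a known schema.

    import pyarrow as pa

    batch = pa.record_batch([pa.array([1, 2, 3])], names=["x"])
    print(pa.ipc.get_record_batch_size(batch))   # serialized size incl. metadata and padding
    buf = batch.serialize()                      # IPC message as a Buffer
    restored = pa.ipc.read_record_batch(buf, batch.schema)
    assert restored.equals(batch)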
If reading data from a - complete IPC stream, use ipc.open_stream instead - - Parameters - ---------- - obj : Message or Buffer-like - schema : Schema - dictionary_memo : DictionaryMemo, optional - If message contains dictionaries, must pass a populated - DictionaryMemo - - Returns - ------- - batch : RecordBatch - """ +) -> RecordBatch: ... + __all__ = [ "MetadataVersion", diff --git a/python/pyarrow-stubs/_json.pyi b/python/pyarrow-stubs/_json.pyi index f416b4b29c6..b52be2bf028 100644 --- a/python/pyarrow-stubs/_json.pyi +++ b/python/pyarrow-stubs/_json.pyi @@ -22,165 +22,49 @@ from _typeshed import StrPath from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable class ReadOptions(_Weakrefable): - """ - Options for reading JSON files. - - Parameters - ---------- - use_threads : bool, optional (default True) - Whether to use multiple threads to accelerate reading - block_size : int, optional - How much bytes to process at a time from the input stream. - This will determine multi-threading granularity as well as - the size of individual chunks in the Table. - """ + use_threads: bool - """ - Whether to use multiple threads to accelerate reading. - """ + block_size: int - """ - How much bytes to process at a time from the input stream. - This will determine multi-threading granularity as well as the size of - individual chunks in the Table. - """ def __init__(self, use_threads: bool | None = None, block_size: int | None = None): ... - def equals(self, other: ReadOptions) -> bool: - """ - Parameters - ---------- - other : pyarrow.json.ReadOptions + def equals(self, other: ReadOptions) -> bool: ... - Returns - ------- - bool - """ class ParseOptions(_Weakrefable): - """ - Options for parsing JSON files. - - Parameters - ---------- - explicit_schema : Schema, optional (default None) - Optional explicit schema (no type inference, ignores other fields). - newlines_in_values : bool, optional (default False) - Whether objects may be printed across multiple lines (for example - pretty printed). If false, input must end with an empty line. - unexpected_field_behavior : str, default "infer" - How JSON fields outside of explicit_schema (if given) are treated. - - Possible behaviors: - - - "ignore": unexpected JSON fields are ignored - - "error": error out on unexpected JSON fields - - "infer": unexpected JSON fields are type-inferred and included in - the output - """ + explicit_schema: Schema - """ - Optional explicit schema (no type inference, ignores other fields) - """ - newlines_in_values: bool - """ - Whether newline characters are allowed in JSON values. - Setting this to True reduces the performance of multi-threaded - JSON reading. - """ - unexpected_field_behavior: Literal["ignore", "error", "infer"] - """ - How JSON fields outside of explicit_schema (if given) are treated. - Possible behaviors: + newlines_in_values: bool - - "ignore": unexpected JSON fields are ignored - - "error": error out on unexpected JSON fields - - "infer": unexpected JSON fields are type-inferred and included in - the output + unexpected_field_behavior: Literal["ignore", "error", "infer"] - Set to "infer" by default. - """ def __init__( self, explicit_schema: Schema | None = None, newlines_in_values: bool | None = None, unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer", ): ... - def equals(self, other: ParseOptions) -> bool: - """ - Parameters - ---------- - other : pyarrow.json.ParseOptions + def equals(self, other: ParseOptions) -> bool: ... 
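A small sketch of how ReadOptions and ParseOptions feed read_json (declared just below); "data.jsonl" is a placeholder for a line-delimited JSON file.

    from pyarrow import json as pa_json

    parse_opts = pa_json.ParseOptions(unexpected_field_behavior="ignore")
    read_opts = pa_json.ReadOptions(use_threads=True, block_size=1 << 20)
    # table = pa_json.read_json("data.jsonl",
    #                           read_options=read_opts, parse_options=parse_opts)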
- Returns - ------- - bool - """ -class JSONStreamingReader(RecordBatchReader): - """An object that reads record batches incrementally from a JSON file. +class JSONStreamingReader(RecordBatchReader): ... - Should not be instantiated directly by user code. - """ def read_json( input_file: StrPath | IO[Any], read_options: ReadOptions | None = None, parse_options: ParseOptions | None = None, memory_pool: MemoryPool | None = None, -) -> Table: - """ - Read a Table from a stream of JSON data. - - Parameters - ---------- - input_file : str, path or file-like object - The location of JSON data. Currently only the line-delimited JSON - format is supported. - read_options : pyarrow.json.ReadOptions, optional - Options for the JSON reader (see ReadOptions constructor for defaults). - parse_options : pyarrow.json.ParseOptions, optional - Options for the JSON parser - (see ParseOptions constructor for defaults). - memory_pool : MemoryPool, optional - Pool to allocate Table memory from. - - Returns - ------- - :class:`pyarrow.Table` - Contents of the JSON file as a in-memory table. - """ +) -> Table: ... + def open_json( input_file: StrPath | IO[Any], read_options: ReadOptions | None = None, parse_options: ParseOptions | None = None, memory_pool: MemoryPool | None = None, -) -> JSONStreamingReader: - """ - Open a streaming reader of JSON data. - - Reading using this function is always single-threaded. - - Parameters - ---------- - input_file : string, path or file-like object - The location of JSON data. If a string or path, and if it ends - with a recognized compressed file extension (e.g. ".gz" or ".bz2"), - the data is automatically decompressed when reading. - read_options : pyarrow.json.ReadOptions, optional - Options for the JSON reader (see pyarrow.json.ReadOptions constructor - for defaults) - parse_options : pyarrow.json.ParseOptions, optional - Options for the JSON parser - (see pyarrow.json.ParseOptions constructor for defaults) - memory_pool : MemoryPool, optional - Pool to allocate RecordBatch memory from - - Returns - ------- - :class:`pyarrow.json.JSONStreamingReader` - """ +) -> JSONStreamingReader: ... + diff --git a/python/pyarrow-stubs/_parquet.pyi b/python/pyarrow-stubs/_parquet.pyi index c75337cbf3b..ce499fd1c16 100644 --- a/python/pyarrow-stubs/_parquet.pyi +++ b/python/pyarrow-stubs/_parquet.pyi @@ -110,6 +110,7 @@ _Compression: TypeAlias = Literal[ "UNKNOWN", ] + class _Statistics(TypedDict): has_min_max: bool min: Any | None @@ -119,6 +120,7 @@ class _Statistics(TypedDict): num_values: int physical_type: _PhysicalType + class Statistics(_Weakrefable): def to_dict(self) -> _Statistics: ... def equals(self, other: Statistics) -> bool: ... @@ -149,11 +151,13 @@ class Statistics(_Weakrefable): @property def converted_type(self) -> _ConvertedType | None: ... + class ParquetLogicalType(_Weakrefable): def to_json(self) -> str: ... @property def type(self) -> _LogicTypeName: ... + class _ColumnChunkMetaData(TypedDict): file_offset: int file_path: str | None @@ -170,6 +174,7 @@ class _ColumnChunkMetaData(TypedDict): total_compressed_size: int total_uncompressed_size: int + class ColumnChunkMetaData(_Weakrefable): def to_dict(self) -> _ColumnChunkMetaData: ... def equals(self, other: ColumnChunkMetaData) -> bool: ... @@ -212,15 +217,18 @@ class ColumnChunkMetaData(_Weakrefable): @property def metadata(self) -> dict[bytes, bytes] | None: ... 
+ class _SortingColumn(TypedDict): column_index: int descending: bool nulls_first: bool + class SortingColumn: def __init__( self, column_index: int, descending: bool = False, nulls_first: bool = False ) -> None: ... + @classmethod def from_ordering( cls, @@ -228,6 +236,7 @@ class SortingColumn: sort_keys: Sequence[tuple[str, Order]], null_placement: Literal["at_start", "at_end"] = "at_end", ) -> tuple[SortingColumn, ...]: ... + @staticmethod def to_ordering( schema: Schema, sorting_columns: tuple[SortingColumn, ...] @@ -241,6 +250,7 @@ class SortingColumn: def nulls_first(self) -> bool: ... def to_dict(self) -> _SortingColumn: ... + class _RowGroupMetaData(TypedDict): num_columns: int num_rows: int @@ -248,6 +258,7 @@ class _RowGroupMetaData(TypedDict): columns: list[ColumnChunkMetaData] sorting_columns: list[SortingColumn] + class RowGroupMetaData(_Weakrefable): def __init__(self, parent: FileMetaData, index: int) -> None: ... def equals(self, other: RowGroupMetaData) -> bool: ... @@ -262,6 +273,7 @@ class RowGroupMetaData(_Weakrefable): @property def sorting_columns(self) -> list[SortingColumn]: ... + class _FileMetaData(TypedDict): created_by: str num_columns: int @@ -270,6 +282,7 @@ class _FileMetaData(TypedDict): format_version: str serialized_size: int + class FileMetaData(_Weakrefable): def __hash__(self) -> int: ... def to_dict(self) -> _FileMetaData: ... @@ -293,7 +306,9 @@ class FileMetaData(_Weakrefable): def row_group(self, i: int) -> RowGroupMetaData: ... def set_file_path(self, path: str) -> None: ... def append_row_groups(self, other: FileMetaData) -> None: ... - def write_metadata_file(self, where: StrPath | Buffer | NativeFile | IO) -> None: ... + def write_metadata_file(self, where: StrPath | Buffer | + NativeFile | IO) -> None: ... + class ParquetSchema(_Weakrefable): def __init__(self, container: FileMetaData) -> None: ... @@ -306,6 +321,7 @@ class ParquetSchema(_Weakrefable): def equals(self, other: ParquetSchema) -> bool: ... def column(self, i: int) -> ColumnSchema: ... + class ColumnSchema(_Weakrefable): def __init__(self, schema: ParquetSchema, index: int) -> None: ... def equals(self, other: ColumnSchema) -> bool: ... @@ -330,8 +346,10 @@ class ColumnSchema(_Weakrefable): @property def scale(self) -> int | None: ... + class ParquetReader(_Weakrefable): def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open( self, source: StrPath | NativeFile | IO, @@ -357,6 +375,7 @@ class ParquetReader(_Weakrefable): def num_row_groups(self) -> int: ... def set_use_threads(self, use_threads: bool) -> None: ... def set_batch_size(self, batch_size: int) -> None: ... + def iter_batches( self, batch_size: int, @@ -364,25 +383,31 @@ class ParquetReader(_Weakrefable): column_indices: list[int] | None = None, use_threads: bool = True, ) -> Iterator[RecordBatch]: ... + def read_row_group( self, i: int, column_indices: list[int] | None = None, use_threads: bool = True ) -> Table: ... + def read_row_groups( self, row_groups: list[int], column_indices: list[int] | None = None, use_threads: bool = True, ) -> Table: ... + def read_all( self, column_indices: list[int] | None = None, use_threads: bool = True ) -> Table: ... - def scan_contents(self, column_indices: list[int] | None = None, batch_size: int = 65536): ... + def scan_contents( + self, column_indices: list[int] | None = None, batch_size: int = 65536): ... + def column_name_idx(self, column_name: str) -> int: ... def read_column(self, column_index: int) -> ChunkedArray: ... def close(self) -> None: ... 
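An inspection sketch tying FileMetaData, RowGroupMetaData, ColumnChunkMetaData and Statistics together; "example.parquet" is a placeholder path, and the file is written in place to keep the example self-contained.

    import pyarrow as pa
    import pyarrow.parquet as pq

    pq.write_table(pa.table({"x": [1, 2, 3]}), "example.parquet")
    md = pq.read_metadata("example.parquet")   # FileMetaData
    rg = md.row_group(0)                       # RowGroupMetaData
    col = rg.column(0)                         # ColumnChunkMetaData
    print(md.num_rows, rg.num_rows)
    print(col.statistics.min, col.statistics.max)   # Statistics for this freshly written column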
@property def closed(self) -> bool: ... + class ParquetWriter(_Weakrefable): def __init__( self, @@ -458,5 +483,10 @@ class ParquetWriter(_Weakrefable): @property def store_decimal_as_integer(self) -> bool: ... -class FileEncryptionProperties: ... -class FileDecryptionProperties: ... + +class FileEncryptionProperties: + ... + + +class FileDecryptionProperties: + ... diff --git a/python/pyarrow-stubs/_parquet_encryption.pyi b/python/pyarrow-stubs/_parquet_encryption.pyi index e1228cbdb5a..cf09b6ee39c 100644 --- a/python/pyarrow-stubs/_parquet_encryption.pyi +++ b/python/pyarrow-stubs/_parquet_encryption.pyi @@ -22,6 +22,7 @@ from typing import Callable from ._parquet import FileDecryptionProperties, FileEncryptionProperties from .lib import _Weakrefable + class EncryptionConfiguration(_Weakrefable): footer_key: str column_keys: dict[str, list[str]] @@ -45,15 +46,18 @@ class EncryptionConfiguration(_Weakrefable): data_key_length_bits: int | None = None, ) -> None: ... + class DecryptionConfiguration(_Weakrefable): cache_lifetime: dt.timedelta def __init__(self, *, cache_lifetime: dt.timedelta | None = None): ... + class KmsConnectionConfig(_Weakrefable): kms_instance_id: str kms_instance_url: str key_access_token: str custom_kms_conf: dict[str, str] + def __init__( self, *, @@ -64,17 +68,22 @@ class KmsConnectionConfig(_Weakrefable): ) -> None: ... def refresh_key_access_token(self, value: str) -> None: ... + class KmsClient(_Weakrefable): def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> str: ... def unwrap_key(self, wrapped_key: str, master_key_identifier: str) -> str: ... + class CryptoFactory(_Weakrefable): - def __init__(self, kms_client_factory: Callable[[KmsConnectionConfig], KmsClient]): ... + def __init__(self, kms_client_factory: Callable[[ + KmsConnectionConfig], KmsClient]): ... + def file_encryption_properties( self, kms_connection_config: KmsConnectionConfig, encryption_config: EncryptionConfiguration, ) -> FileEncryptionProperties: ... + def file_decryption_properties( self, kms_connection_config: KmsConnectionConfig, diff --git a/python/pyarrow-stubs/_s3fs.pyi b/python/pyarrow-stubs/_s3fs.pyi index f1399bc4b1e..f065d78f993 100644 --- a/python/pyarrow-stubs/_s3fs.pyi +++ b/python/pyarrow-stubs/_s3fs.pyi @@ -23,6 +23,7 @@ from typing_extensions import Required, NotRequired from ._fs import FileSystem from .lib import KeyValueMetadata + class _ProxyOptions(TypedDict): schema: Required[Literal["http", "https"]] host: Required[str] @@ -30,6 +31,7 @@ class _ProxyOptions(TypedDict): username: NotRequired[str] password: NotRequired[str] + class S3LogLevel(enum.IntEnum): Off = enum.auto() Fatal = enum.auto() @@ -39,6 +41,7 @@ class S3LogLevel(enum.IntEnum): Debug = enum.auto() Trace = enum.auto() + Off = S3LogLevel.Off Fatal = S3LogLevel.Fatal Error = S3LogLevel.Error @@ -47,6 +50,7 @@ Info = S3LogLevel.Info Debug = S3LogLevel.Debug Trace = S3LogLevel.Trace + def initialize_s3( log_level: S3LogLevel = S3LogLevel.Fatal, num_event_loop_threads: int = 1 ) -> None: ... @@ -55,12 +59,19 @@ def finalize_s3() -> None: ... def ensure_s3_finalized() -> None: ... def resolve_s3_region(bucket: str) -> str: ... + class S3RetryStrategy: max_attempts: int def __init__(self, max_attempts=3) -> None: ... -class AwsStandardS3RetryStrategy(S3RetryStrategy): ... -class AwsDefaultS3RetryStrategy(S3RetryStrategy): ... + +class AwsStandardS3RetryStrategy(S3RetryStrategy): + ... + + +class AwsDefaultS3RetryStrategy(S3RetryStrategy): + ... 
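An S3FileSystem construction sketch using resolve_s3_region and the retry strategies declared above; the bucket name is a placeholder, the retry_strategy keyword belongs to S3FileSystem.__init__ (its parameter list is elided in this hunk), and AwsStandardS3RetryStrategy is assumed to be re-exported through pyarrow.fs as in current releases.

    from pyarrow import fs

    # region = fs.resolve_s3_region("my-bucket")   # placeholder bucket, needs network
    s3 = fs.S3FileSystem(
        anonymous=True,
        region="us-east-1",
        retry_strategy=fs.AwsStandardS3RetryStrategy(max_attempts=5),
    )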
+ class S3FileSystem(FileSystem): def __init__( diff --git a/python/pyarrow-stubs/_stubs_typing.pyi b/python/pyarrow-stubs/_stubs_typing.pyi index 98479791103..56aa7fd1123 100644 --- a/python/pyarrow-stubs/_stubs_typing.pyi +++ b/python/pyarrow-stubs/_stubs_typing.pyi @@ -56,21 +56,27 @@ _V = TypeVar("_V", covariant=True) SingleOrList: TypeAlias = list[_T] | _T + class SupportEq(Protocol): def __eq__(self, other) -> bool: ... + class SupportLt(Protocol): def __lt__(self, other) -> bool: ... + class SupportGt(Protocol): def __gt__(self, other) -> bool: ... + class SupportLe(Protocol): def __le__(self, other) -> bool: ... + class SupportGe(Protocol): def __ge__(self, other) -> bool: ... + FilterTuple: TypeAlias = ( tuple[str, Literal["=", "==", "!="], SupportEq] | tuple[str, Literal["<"], SupportLt] @@ -80,22 +86,31 @@ FilterTuple: TypeAlias = ( | tuple[str, Literal["in", "not in"], Collection] ) -class Buffer(Protocol): ... -class SupportPyBuffer(Protocol): ... +class Buffer(Protocol): + ... + + +class SupportPyBuffer(Protocol): + ... + class SupportArrowStream(Protocol): def __arrow_c_stream__(self, requested_schema=None) -> Any: ... + class SupportArrowArray(Protocol): def __arrow_c_array__(self, requested_schema=None) -> Any: ... + class SupportArrowDeviceArray(Protocol): def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ... + class SupportArrowSchema(Protocol): def __arrow_c_schema(self) -> Any: ... + class NullableCollection(Protocol[_V]): # pyright: ignore[reportInvalidTypeVarUse] def __iter__(self) -> Iterator[_V] | Iterator[_V | None]: ... def __len__(self) -> int: ... diff --git a/python/pyarrow-stubs/_substrait.pyi b/python/pyarrow-stubs/_substrait.pyi index ee78e9720fe..12dd437412f 100644 --- a/python/pyarrow-stubs/_substrait.pyi +++ b/python/pyarrow-stubs/_substrait.pyi @@ -20,6 +20,7 @@ from typing import Any, Callable from ._compute import Expression from .lib import Buffer, RecordBatchReader, Schema, Table, _Weakrefable + def run_query( plan: Buffer | int, *, @@ -28,14 +29,18 @@ def run_query( ) -> RecordBatchReader: ... def _parse_json_plan(plan: bytes) -> Buffer: ... + class SubstraitSchema: schema: Schema expression: Expression def __init__(self, schema: Schema, expression: Expression) -> None: ... def to_pysubstrait(self) -> Any: ... + def serialize_schema(schema: Schema) -> SubstraitSchema: ... def deserialize_schema(buf: Buffer | bytes) -> Schema: ... + + def serialize_expressions( exprs: list[Expression], names: list[str], @@ -44,6 +49,7 @@ def serialize_expressions( allow_arrow_extensions: bool = False, ) -> Buffer: ... + class BoundExpressions(_Weakrefable): @property def schema(self) -> Schema: ... @@ -52,5 +58,6 @@ class BoundExpressions(_Weakrefable): @classmethod def from_substrait(cls, message: Buffer | bytes) -> BoundExpressions: ... + def deserialize_expressions(buf: Buffer | bytes) -> BoundExpressions: ... def get_supported_functions() -> list[str]: ... diff --git a/python/pyarrow-stubs/_types.pyi b/python/pyarrow-stubs/_types.pyi index 6596fb3e1d1..0cb4bba6a6f 100644 --- a/python/pyarrow-stubs/_types.pyi +++ b/python/pyarrow-stubs/_types.pyi @@ -45,669 +45,278 @@ from typing_extensions import TypeVar, deprecated from .io import Buffer from .scalar import ExtensionScalar -class _Weakrefable: ... -class _Metadata(_Weakrefable): ... + +class _Weakrefable: + ... + + +class _Metadata(_Weakrefable): + ... + class DataType(_Weakrefable): - """ - Base class of all Arrow data types. - - Each data type is an *instance* of this class. 
- - Examples - -------- - Instance of int64 type: - - >>> import pyarrow as pa - >>> pa.int64() - DataType(int64) - """ - def field(self, i: int) -> Field: - """ - Parameters - ---------- - i : int - - Returns - ------- - pyarrow.Field - """ + + def field(self, i: int) -> Field: ... + @property def id(self) -> int: ... @property - def bit_width(self) -> int: - """ - Bit width for fixed width type. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.int64() - DataType(int64) - >>> pa.int64().bit_width - 64 - """ + def bit_width(self) -> int: ... + @property - def byte_width(self) -> int: - """ - Byte width for fixed width type. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.int64() - DataType(int64) - >>> pa.int64().byte_width - 8 - """ + def byte_width(self) -> int: ... + @property - def num_fields(self) -> int: - """ - The number of child fields. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.int64() - DataType(int64) - >>> pa.int64().num_fields - 0 - >>> pa.list_(pa.string()) - ListType(list) - >>> pa.list_(pa.string()).num_fields - 1 - >>> struct = pa.struct({'x': pa.int32(), 'y': pa.string()}) - >>> struct.num_fields - 2 - """ + def num_fields(self) -> int: ... + @property - def num_buffers(self) -> int: - """ - Number of data buffers required to construct Array type - excluding children. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.int64().num_buffers - 2 - >>> pa.string().num_buffers - 3 - """ - def __hash__(self) -> int: - """ - Return hash(self). - """ - def equals(self, other: DataType | str, *, check_metadata: bool = False) -> bool: - """ - Return true if type is equivalent to passed value. - - Parameters - ---------- - other : DataType or string convertible to DataType - check_metadata : bool - Whether nested Field metadata equality should be checked as well. - - Returns - ------- - is_equal : bool - - Examples - -------- - >>> import pyarrow as pa - >>> pa.int64().equals(pa.string()) - False - >>> pa.int64().equals(pa.int64()) - True - """ - def to_pandas_dtype(self) -> np.generic: - """ - Return the equivalent NumPy / Pandas dtype. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.int64().to_pandas_dtype() - - """ - def _export_to_c(self, out_ptr: int) -> None: - """ - Export to a C ArrowSchema struct, given its pointer. - - Be careful: if you don't pass the ArrowSchema struct to a consumer, - its memory will leak. This is a low-level function intended for - expert users. - """ + def num_buffers(self) -> int: ... + + def __hash__(self) -> int: ... + + def equals(self, other: DataType | str, *, + check_metadata: bool = False) -> bool: ... + + def to_pandas_dtype(self) -> np.generic: ... + + def _export_to_c(self, out_ptr: int) -> None: ... + @classmethod - def _import_from_c(cls, in_ptr: int) -> Self: - """ - Import DataType from a C ArrowSchema struct, given its pointer. - - This is a low-level function intended for expert users. - """ - def __arrow_c_schema__(self) -> Any: - """ - Export to a ArrowSchema PyCapsule - - Unlike _export_to_c, this will not leak memory if the capsule is not used. - """ + def _import_from_c(cls, in_ptr: int) -> Self: ... + + def __arrow_c_schema__(self) -> Any: ... + @classmethod - def _import_from_c_capsule(cls, schema) -> Self: - """ - Import a DataType from a ArrowSchema PyCapsule + def _import_from_c_capsule(cls, schema) -> Self: ... - Parameters - ---------- - schema : PyCapsule - A valid PyCapsule with name 'arrow_schema' containing an - ArrowSchema pointer. 
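The doctest output dropped from the DataType docstrings, condensed into a few lines.

    import pyarrow as pa

    t = pa.int64()
    print(t.bit_width, t.byte_width, t.num_buffers)     # 64 8 2
    print(t.equals(pa.string()), t.equals(pa.int64()))  # False True
    print(pa.list_(pa.string()).num_fields)             # 1
    print(t.to_pandas_dtype())                          # <class 'numpy.int64'>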
- """ _AsPyType = TypeVar("_AsPyType") _DataTypeT = TypeVar("_DataTypeT", bound=DataType) -class _BasicDataType(DataType, Generic[_AsPyType]): ... -class NullType(_BasicDataType[None]): ... -class BoolType(_BasicDataType[bool]): ... -class UInt8Type(_BasicDataType[int]): ... -class Int8Type(_BasicDataType[int]): ... -class UInt16Type(_BasicDataType[int]): ... -class Int16Type(_BasicDataType[int]): ... -class Uint32Type(_BasicDataType[int]): ... -class Int32Type(_BasicDataType[int]): ... -class UInt64Type(_BasicDataType[int]): ... -class Int64Type(_BasicDataType[int]): ... -class Float16Type(_BasicDataType[float]): ... -class Float32Type(_BasicDataType[float]): ... -class Float64Type(_BasicDataType[float]): ... -class Date32Type(_BasicDataType[dt.date]): ... -class Date64Type(_BasicDataType[dt.date]): ... -class MonthDayNanoIntervalType(_BasicDataType[MonthDayNano]): ... -class StringType(_BasicDataType[str]): ... -class LargeStringType(_BasicDataType[str]): ... -class StringViewType(_BasicDataType[str]): ... -class BinaryType(_BasicDataType[bytes]): ... -class LargeBinaryType(_BasicDataType[bytes]): ... -class BinaryViewType(_BasicDataType[bytes]): ... -_Unit = TypeVar("_Unit", bound=Literal["s", "ms", "us", "ns"], default=Literal["us"]) -_Tz = TypeVar("_Tz", str, None, default=None) +class _BasicDataType(DataType, Generic[_AsPyType]): + ... + + +class NullType(_BasicDataType[None]): + ... + + +class BoolType(_BasicDataType[bool]): + ... + + +class UInt8Type(_BasicDataType[int]): + ... + + +class Int8Type(_BasicDataType[int]): + ... + + +class UInt16Type(_BasicDataType[int]): + ... + + +class Int16Type(_BasicDataType[int]): + ... + + +class Uint32Type(_BasicDataType[int]): + ... -class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): - """ - Concrete class for timestamp data types. - Examples - -------- - >>> import pyarrow as pa +class Int32Type(_BasicDataType[int]): + ... + + +class UInt64Type(_BasicDataType[int]): + ... + + +class Int64Type(_BasicDataType[int]): + ... + + +class Float16Type(_BasicDataType[float]): + ... + + +class Float32Type(_BasicDataType[float]): + ... + + +class Float64Type(_BasicDataType[float]): + ... - Create an instance of timestamp type: - >>> pa.timestamp('us') - TimestampType(timestamp[us]) +class Date32Type(_BasicDataType[dt.date]): + ... - Create an instance of timestamp type with timezone: - >>> pa.timestamp('s', tz='UTC') - TimestampType(timestamp[s, tz=UTC]) - """ +class Date64Type(_BasicDataType[dt.date]): + ... + + +class MonthDayNanoIntervalType(_BasicDataType[MonthDayNano]): + ... + + +class StringType(_BasicDataType[str]): + ... + + +class LargeStringType(_BasicDataType[str]): + ... + + +class StringViewType(_BasicDataType[str]): + ... + + +class BinaryType(_BasicDataType[bytes]): + ... + + +class LargeBinaryType(_BasicDataType[bytes]): + ... + + +class BinaryViewType(_BasicDataType[bytes]): + ... + + +_Unit = TypeVar("_Unit", bound=Literal["s", "ms", "us", "ns"], default=Literal["us"]) +_Tz = TypeVar("_Tz", str, None, default=None) + + +class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): + @property - def unit(self) -> _Unit: - """ - The timestamp unit ('s', 'ms', 'us' or 'ns'). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.timestamp('us') - >>> t.unit - 'us' - """ + def unit(self) -> _Unit: ... + @property - def tz(self) -> _Tz: - """ - The timestamp time zone, if any, or None. 
- - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.timestamp('s', tz='UTC') - >>> t.tz - 'UTC' - """ + def tz(self) -> _Tz: ... -_Time32Unit = TypeVar("_Time32Unit", bound=Literal["s", "ms"]) -class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]): - """ - Concrete class for time32 data types. +_Time32Unit = TypeVar("_Time32Unit", bound=Literal["s", "ms"]) - Supported time unit resolutions are 's' [second] - and 'ms' [millisecond]. - Examples - -------- - Create an instance of time32 type: +class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]): - >>> import pyarrow as pa - >>> pa.time32('ms') - Time32Type(time32[ms]) - """ @property - def unit(self) -> _Time32Unit: - """ - The time unit ('s' or 'ms'). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.time32('ms') - >>> t.unit - 'ms' - """ + def unit(self) -> _Time32Unit: ... -_Time64Unit = TypeVar("_Time64Unit", bound=Literal["us", "ns"]) -class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]): - """ - Concrete class for time64 data types. +_Time64Unit = TypeVar("_Time64Unit", bound=Literal["us", "ns"]) - Supported time unit resolutions are 'us' [microsecond] - and 'ns' [nanosecond]. - Examples - -------- - Create an instance of time64 type: +class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]): - >>> import pyarrow as pa - >>> pa.time64('us') - Time64Type(time64[us]) - """ @property - def unit(self) -> _Time64Unit: - """ - The time unit ('us' or 'ns'). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.time64('us') - >>> t.unit - 'us' - """ + def unit(self) -> _Time64Unit: ... -class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): - """ - Concrete class for duration data types. - Examples - -------- - Create an instance of duration type: +class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): - >>> import pyarrow as pa - >>> pa.duration('s') - DurationType(duration[s]) - """ @property - def unit(self) -> _Unit: - """ - The duration unit ('s', 'ms', 'us' or 'ns'). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.duration('s') - >>> t.unit - 's' - """ + def unit(self) -> _Unit: ... -class FixedSizeBinaryType(_BasicDataType[Decimal]): - """ - Concrete class for fixed-size binary data types. - Examples - -------- - Create an instance of fixed-size binary type: +class FixedSizeBinaryType(_BasicDataType[Decimal]): + ... - >>> import pyarrow as pa - >>> pa.binary(3) - FixedSizeBinaryType(fixed_size_binary[3]) - """ _Precision = TypeVar("_Precision", default=Any) _Scale = TypeVar("_Scale", default=Any) -class Decimal32Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): - """ - Concrete class for decimal32 data types. - Examples - -------- - Create an instance of decimal32 type: +class Decimal32Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): - >>> import pyarrow as pa - >>> pa.decimal32(5, 2) - Decimal32Type(decimal32(5, 2)) - """ @property - def precision(self) -> _Precision: - """ - The decimal precision, in number of decimal digits (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal32(5, 2) - >>> t.precision - 5 - """ + def precision(self) -> _Precision: ... + @property - def scale(self) -> _Scale: - """ - The decimal scale (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal32(5, 2) - >>> t.scale - 2 - """ + def scale(self) -> _Scale: ... 
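The unit/precision/scale accessors above, condensed from the removed doctests; pa.decimal32 assumes a pyarrow version that ships Decimal32Type (which these stubs declare).

    import pyarrow as pa

    ts = pa.timestamp("s", tz="UTC")
    print(ts.unit, ts.tz)                               # s UTC
    print(pa.time32("ms").unit, pa.time64("ns").unit)   # ms ns
    print(pa.duration("s").unit)                        # s
    d = pa.decimal32(5, 2)
    print(d.precision, d.scale)                         # 5 2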
-class Decimal64Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): - """ - Concrete class for decimal64 data types. - Examples - -------- - Create an instance of decimal64 type: +class Decimal64Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): - >>> import pyarrow as pa - >>> pa.decimal64(5, 2) - Decimal64Type(decimal64(5, 2)) - """ @property - def precision(self) -> _Precision: - """ - The decimal precision, in number of decimal digits (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal64(5, 2) - >>> t.precision - 5 - """ + def precision(self) -> _Precision: ... + @property - def scale(self) -> _Scale: - """ - The decimal scale (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal64(5, 2) - >>> t.scale - 2 - """ + def scale(self) -> _Scale: ... -class Decimal128Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): - """ - Concrete class for decimal128 data types. - Examples - -------- - Create an instance of decimal128 type: +class Decimal128Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): - >>> import pyarrow as pa - >>> pa.decimal128(5, 2) - Decimal128Type(decimal128(5, 2)) - """ @property - def precision(self) -> _Precision: - """ - The decimal precision, in number of decimal digits (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal128(5, 2) - >>> t.precision - 5 - """ + def precision(self) -> _Precision: ... + @property - def scale(self) -> _Scale: - """ - The decimal scale (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal128(5, 2) - >>> t.scale - 2 - """ + def scale(self) -> _Scale: ... -class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): - """ - Concrete class for decimal256 data types. - Examples - -------- - Create an instance of decimal256 type: +class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): - >>> import pyarrow as pa - >>> pa.decimal256(76, 38) - Decimal256Type(decimal256(76, 38)) - """ @property - def precision(self) -> _Precision: - """ - The decimal precision, in number of decimal digits (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal256(76, 38) - >>> t.precision - 76 - """ + def precision(self) -> _Precision: ... + @property - def scale(self) -> _Scale: - """ - The decimal scale (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal256(76, 38) - >>> t.scale - 38 - """ + def scale(self) -> _Scale: ... -class ListType(DataType, Generic[_DataTypeT]): - """ - Concrete class for list data types. - Examples - -------- - Create an instance of ListType: +class ListType(DataType, Generic[_DataTypeT]): - >>> import pyarrow as pa - >>> pa.list_(pa.string()) - ListType(list) - """ @property - def value_field(self) -> Field[_DataTypeT]: - """ - The field for list values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_(pa.string()).value_field - pyarrow.Field - """ + def value_field(self) -> Field[_DataTypeT]: ... + @property - def value_type(self) -> _DataTypeT: - """ - The data type of list values. + def value_type(self) -> _DataTypeT: ... - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_(pa.string()).value_type - DataType(string) - """ class LargeListType(DataType, Generic[_DataTypeT]): - """ - Concrete class for large list data types - (like ListType, but with 64-bit offsets). 
- - Examples - -------- - Create an instance of LargeListType: - - >>> import pyarrow as pa - >>> pa.large_list(pa.string()) - LargeListType(large_list) - """ + @property def value_field(self) -> Field[_DataTypeT]: ... @property - def value_type(self) -> _DataTypeT: - """ - The data type of large list values. + def value_type(self) -> _DataTypeT: ... - Examples - -------- - >>> import pyarrow as pa - >>> pa.large_list(pa.string()).value_type - DataType(string) - """ class ListViewType(DataType, Generic[_DataTypeT]): - """ - Concrete class for list view data types. - - Examples - -------- - Create an instance of ListViewType: - >>> import pyarrow as pa - >>> pa.list_view(pa.string()) - ListViewType(list_view) - """ @property - def value_field(self) -> Field[_DataTypeT]: - """ - The field for list view values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_view(pa.string()).value_field - pyarrow.Field - """ + def value_field(self) -> Field[_DataTypeT]: ... + @property - def value_type(self) -> _DataTypeT: - """ - The data type of list view values. + def value_type(self) -> _DataTypeT: ... - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_view(pa.string()).value_type - DataType(string) - """ class LargeListViewType(DataType, Generic[_DataTypeT]): - """ - Concrete class for large list view data types - (like ListViewType, but with 64-bit offsets). - - Examples - -------- - Create an instance of LargeListViewType: - - >>> import pyarrow as pa - >>> pa.large_list_view(pa.string()) - LargeListViewType(large_list_view) - """ + @property - def value_field(self) -> Field[_DataTypeT]: - """ - The field for large list view values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.large_list_view(pa.string()).value_field - pyarrow.Field - """ + def value_field(self) -> Field[_DataTypeT]: ... + @property - def value_type(self) -> _DataTypeT: - """ - The data type of large list view values. + def value_type(self) -> _DataTypeT: ... - Examples - -------- - >>> import pyarrow as pa - >>> pa.large_list_view(pa.string()).value_type - DataType(string) - """ class FixedSizeListType(DataType, Generic[_DataTypeT, _Size]): - """ - Concrete class for fixed size list data types. - Examples - -------- - Create an instance of FixedSizeListType: - - >>> import pyarrow as pa - >>> pa.list_(pa.int32(), 2) - FixedSizeListType(fixed_size_list[2]) - """ @property - def value_field(self) -> Field[_DataTypeT]: - """ - The field for list values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_(pa.int32(), 2).value_field - pyarrow.Field - """ + def value_field(self) -> Field[_DataTypeT]: ... + @property - def value_type(self) -> _DataTypeT: - """ - The data type of large list values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_(pa.int32(), 2).value_type - DataType(int32) - """ + def value_type(self) -> _DataTypeT: ... + @property - def list_size(self) -> _Size: - """ - The size of the fixed size lists. + def list_size(self) -> _Size: ... - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_(pa.int32(), 2).list_size - 2 - """ class DictionaryMemo(_Weakrefable): - """ - Tracking container for dictionary-encoded fields. - """ + ... 
+ _IndexT = TypeVar( "_IndexT", @@ -724,716 +333,167 @@ _BasicValueT = TypeVar("_BasicValueT", bound=_BasicDataType) _ValueT = TypeVar("_ValueT", bound=DataType) _Ordered = TypeVar("_Ordered", Literal[True], Literal[False], default=Literal[False]) -class DictionaryType(DataType, Generic[_IndexT, _BasicValueT, _Ordered]): - """ - Concrete class for dictionary data types. - - Examples - -------- - Create an instance of dictionary type: - >>> import pyarrow as pa - >>> pa.dictionary(pa.int64(), pa.utf8()) - DictionaryType(dictionary) - """ +class DictionaryType(DataType, Generic[_IndexT, _BasicValueT, _Ordered]): @property - def ordered(self) -> _Ordered: - """ - Whether the dictionary is ordered, i.e. whether the ordering of values - in the dictionary is important. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.dictionary(pa.int64(), pa.utf8()).ordered - False - """ - @property - def index_type(self) -> _IndexT: - """ - The data type of dictionary indices (a signed integer type). - - Examples - -------- - >>> import pyarrow as pa - >>> pa.dictionary(pa.int16(), pa.utf8()).index_type - DataType(int16) - """ + def ordered(self) -> _Ordered: ... + @property - def value_type(self) -> _BasicValueT: - """ - The dictionary value type. + def index_type(self) -> _IndexT: ... - The dictionary values are found in an instance of DictionaryArray. + @property + def value_type(self) -> _BasicValueT: ... - Examples - -------- - >>> import pyarrow as pa - >>> pa.dictionary(pa.int16(), pa.utf8()).value_type - DataType(string) - """ _K = TypeVar("_K", bound=DataType) -class MapType(DataType, Generic[_K, _ValueT, _Ordered]): - """ - Concrete class for map data types. - - Examples - -------- - Create an instance of MapType: - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32()) - MapType(map) - >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True) - MapType(map) - """ +class MapType(DataType, Generic[_K, _ValueT, _Ordered]): @property - def key_field(self) -> Field[_K]: - """ - The field for keys in the map entries. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32()).key_field - pyarrow.Field - """ + def key_field(self) -> Field[_K]: ... + @property - def key_type(self) -> _K: - """ - The data type of keys in the map entries. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32()).key_type - DataType(string) - """ + def key_type(self) -> _K: ... + @property - def item_field(self) -> Field[_ValueT]: - """ - The field for items in the map entries. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32()).item_field - pyarrow.Field - """ + def item_field(self) -> Field[_ValueT]: ... + @property - def item_type(self) -> _ValueT: - """ - The data type of items in the map entries. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32()).item_type - DataType(int32) - """ + def item_type(self) -> _ValueT: ... + @property - def keys_sorted(self) -> _Ordered: - """ - Should the entries be sorted according to keys. + def keys_sorted(self) -> _Ordered: ... - Examples - -------- - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True).keys_sorted - True - """ _Size = TypeVar("_Size", default=int) + class StructType(DataType): - """ - Concrete class for struct data types. - ``StructType`` supports direct indexing using ``[...]`` (implemented via - ``__getitem__``) to access its fields. 
- It will return the struct field with the given index or name. - - Examples - -------- - >>> import pyarrow as pa - - Accessing fields using direct indexing: - - >>> struct_type = pa.struct({'x': pa.int32(), 'y': pa.string()}) - >>> struct_type[0] - pyarrow.Field - >>> struct_type['y'] - pyarrow.Field - - Accessing fields using ``field()``: - - >>> struct_type.field(1) - pyarrow.Field - >>> struct_type.field('x') - pyarrow.Field - - # Creating a schema from the struct type's fields: - >>> pa.schema(list(struct_type)) - x: int32 - y: string - """ - def get_field_index(self, name: str) -> int: - """ - Return index of the unique field with the given name. - - Parameters - ---------- - name : str - The name of the field to look up. - - Returns - ------- - index : int - The index of the field with the given name; -1 if the - name isn't found or there are several fields with the given - name. - - Examples - -------- - >>> import pyarrow as pa - >>> struct_type = pa.struct({'x': pa.int32(), 'y': pa.string()}) - - Index of the field with a name 'y': - - >>> struct_type.get_field_index('y') - 1 - - Index of the field that does not exist: - - >>> struct_type.get_field_index('z') - -1 - """ - def field(self, i: int | str) -> Field: - """ - Select a field by its column name or numeric index. - - Parameters - ---------- - i : int or str - - Returns - ------- - pyarrow.Field - - Examples - -------- - - >>> import pyarrow as pa - >>> struct_type = pa.struct({'x': pa.int32(), 'y': pa.string()}) - - Select the second field: - - >>> struct_type.field(1) - pyarrow.Field - - Select the field named 'x': - - >>> struct_type.field('x') - pyarrow.Field - """ - def get_all_field_indices(self, name: str) -> list[int]: - """ - Return sorted list of indices for the fields with the given name. - - Parameters - ---------- - name : str - The name of the field to look up. - - Returns - ------- - indices : List[int] - - Examples - -------- - >>> import pyarrow as pa - >>> struct_type = pa.struct({'x': pa.int32(), 'y': pa.string()}) - >>> struct_type.get_all_field_indices('x') - [0] - """ - def __len__(self) -> int: - """ - Like num_fields(). - """ - def __iter__(self) -> Iterator[Field]: - """ - Iterate over struct fields, in order. - """ - __getitem__ = field # pyright: ignore[reportUnknownVariableType] - @property - def names(self) -> list[str]: - """ - Lists the field names. - - Examples - -------- - >>> import pyarrow as pa - >>> struct_type = pa.struct([('a', pa.int64()), ('b', pa.float64()), ('c', pa.string())]) - >>> struct_type.names - ['a', 'b', 'c'] - """ - @property - def fields(self) -> list[Field]: - """ - Lists all fields within the StructType. - - Examples - -------- - >>> import pyarrow as pa - >>> struct_type = pa.struct([('a', pa.int64()), ('b', pa.float64()), ('c', pa.string())]) - >>> struct_type.fields - [pyarrow.Field, pyarrow.Field, pyarrow.Field] - """ + def get_field_index(self, name: str) -> int: ... -class UnionType(DataType): - """ - Base class for union data types. + def field(self, i: int | str) -> Field: ... - Examples - -------- - Create an instance of a dense UnionType using ``pa.union``: + def get_all_field_indices(self, name: str) -> list[int]: ... - >>> import pyarrow as pa - >>> pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], - ... mode=pa.lib.UnionMode_DENSE), - (DenseUnionType(dense_union),) + def __len__(self) -> int: ... - Create an instance of a dense UnionType using ``pa.dense_union``: + def __iter__(self) -> Iterator[Field]: ... 
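As with the list types, the DictionaryType and MapType stubs above now only type their accessors. A short sketch of what those properties return (standard pyarrow API, nothing specific to this patch):

import pyarrow as pa

# DictionaryType: signed-integer index type, value type, and the ordered flag.
dct = pa.dictionary(pa.int16(), pa.utf8())
assert dct.index_type == pa.int16()
assert dct.value_type == pa.string()
assert dct.ordered is False

# MapType: key/item fields and types, plus keys_sorted (typed via _Ordered).
mp = pa.map_(pa.string(), pa.int32(), keys_sorted=True)
assert mp.key_type == pa.string()
assert mp.item_field.type == pa.int32()
assert mp.keys_sorted is True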
- >>> pa.dense_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) - DenseUnionType(dense_union) + __getitem__ = field # pyright: ignore[reportUnknownVariableType] + @property + def names(self) -> list[str]: ... - Create an instance of a sparse UnionType using ``pa.union``: + @property + def fields(self) -> list[Field]: ... - >>> pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], - ... mode=pa.lib.UnionMode_SPARSE), - (SparseUnionType(sparse_union),) - Create an instance of a sparse UnionType using ``pa.sparse_union``: +class UnionType(DataType): - >>> pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) - SparseUnionType(sparse_union) - """ @property - def mode(self) -> Literal["sparse", "dense"]: - """ - The mode of the union ("dense" or "sparse"). - - Examples - -------- - >>> import pyarrow as pa - >>> union = pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) - >>> union.mode - 'sparse' - """ - @property - def type_codes(self) -> list[int]: - """ - The type code to indicate each data type in this union. - - Examples - -------- - >>> import pyarrow as pa - >>> union = pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) - >>> union.type_codes - [0, 1] - """ - def __len__(self) -> int: - """ - Like num_fields(). - """ - def __iter__(self) -> Iterator[Field]: - """ - Iterate over union members, in order. - """ - def field(self, i: int) -> Field: - """ - Return a child field by its numeric index. - - Parameters - ---------- - i : int - - Returns - ------- - pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> union = pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) - >>> union[0] - pyarrow.Field - """ - __getitem__ = field # pyright: ignore[reportUnknownVariableType] + def mode(self) -> Literal["sparse", "dense"]: ... -class SparseUnionType(UnionType): - """ - Concrete class for sparse union types. + @property + def type_codes(self) -> list[int]: ... - Examples - -------- - Create an instance of a sparse UnionType using ``pa.union``: + def __len__(self) -> int: ... - >>> import pyarrow as pa - >>> pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], - ... mode=pa.lib.UnionMode_SPARSE), - (SparseUnionType(sparse_union),) + def __iter__(self) -> Iterator[Field]: ... - Create an instance of a sparse UnionType using ``pa.sparse_union``: + def field(self, i: int) -> Field: ... - >>> pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) - SparseUnionType(sparse_union) - """ - @property - def mode(self) -> Literal["sparse"]: - """ - The mode of the union ("dense" or "sparse"). - - Examples - -------- - >>> import pyarrow as pa - >>> union = pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) - >>> union.mode - 'sparse' - """ + __getitem__ = field # pyright: ignore[reportUnknownVariableType] -class DenseUnionType(UnionType): - """ - Concrete class for dense union types. - Examples - -------- - Create an instance of a dense UnionType using ``pa.union``: +class SparseUnionType(UnionType): - >>> import pyarrow as pa - >>> pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], - ... mode=pa.lib.UnionMode_DENSE), - (DenseUnionType(dense_union),) + @property + def mode(self) -> Literal["sparse"]: ... 
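The StructType and union type doctests removed here exercised indexed field access and the union metadata; condensed into one runnable sketch (illustrative only, assuming pyarrow is installed):

import pyarrow as pa

# StructType supports indexing by position or name (__getitem__ aliases field()).
struct_type = pa.struct({"x": pa.int32(), "y": pa.string()})
assert struct_type[0].name == "x"
assert struct_type["y"].type == pa.string()
assert struct_type.get_field_index("y") == 1
assert struct_type.names == ["x", "y"]

# Union types report their mode and the per-child type codes.
union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())])
assert union.mode == "sparse"
assert union.type_codes == [0, 1]
assert len(union) == 2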
- Create an instance of a dense UnionType using ``pa.dense_union``: - >>> pa.dense_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) - DenseUnionType(dense_union) - """ +class DenseUnionType(UnionType): @property - def mode(self) -> Literal["dense"]: - """ - The mode of the union ("dense" or "sparse"). - - Examples - -------- - >>> import pyarrow as pa - >>> union = pa.sparse_union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())]) - >>> union.mode - 'sparse' - """ + def mode(self) -> Literal["dense"]: ... + _RunEndType = TypeVar("_RunEndType", Int16Type, Int32Type, Int64Type) + class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]): - """ - Concrete class for run-end encoded types. - """ + @property def run_end_type(self) -> _RunEndType: ... @property def value_type(self) -> _BasicValueT: ... + _StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) + class BaseExtensionType(DataType): - """ - Concrete base class for extension types. - """ - def __arrow_ext_class__(self) -> type[ExtensionArray]: - """ - The associated array extension class - """ - def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: - """ - The associated scalar class - """ + + def __arrow_ext_class__(self) -> type[ExtensionArray]: ... + + def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: ... + @property - def extension_name(self) -> str: - """ - The extension type name. - """ + def extension_name(self) -> str: ... + @property - def storage_type(self) -> DataType: - """ - The underlying storage type. - """ - def wrap_array(self, storage: _StorageT) -> _StorageT: - """ - Wrap the given storage array as an extension array. - - Parameters - ---------- - storage : Array or ChunkedArray - - Returns - ------- - array : Array or ChunkedArray - Extension array wrapping the storage array - """ + def storage_type(self) -> DataType: ... + + def wrap_array(self, storage: _StorageT) -> _StorageT: ... + class ExtensionType(BaseExtensionType): - """ - Concrete base class for Python-defined extension types. - - Parameters - ---------- - storage_type : DataType - The underlying storage type for the extension type. - extension_name : str - A unique name distinguishing this extension type. The name will be - used when deserializing IPC data. - - Examples - -------- - Define a RationalType extension type subclassing ExtensionType: - - >>> import pyarrow as pa - >>> class RationalType(pa.ExtensionType): - ... def __init__(self, data_type: pa.DataType): - ... if not pa.types.is_integer(data_type): - ... raise TypeError(f"data_type must be an integer type not {data_type}") - ... super().__init__( - ... pa.struct( - ... [ - ... ("numer", data_type), - ... ("denom", data_type), - ... ], - ... ), - ... # N.B. This name does _not_ reference `data_type` so deserialization - ... # will work for _any_ integer `data_type` after registration - ... "my_package.rational", - ... ) - ... def __arrow_ext_serialize__(self) -> bytes: - ... # No parameters are necessary - ... return b"" - ... @classmethod - ... def __arrow_ext_deserialize__(cls, storage_type, serialized): - ... # return an instance of this subclass - ... 
return RationalType(storage_type[0].type) - - Register the extension type: - - >>> pa.register_extension_type(RationalType(pa.int64())) - - Create an instance of RationalType extension type: - - >>> rational_type = RationalType(pa.int32()) - - Inspect the extension type: - - >>> rational_type.extension_name - 'my_package.rational' - >>> rational_type.storage_type - StructType(struct) - - Wrap an array as an extension array: - - >>> storage_array = pa.array( - ... [ - ... {"numer": 10, "denom": 17}, - ... {"numer": 20, "denom": 13}, - ... ], - ... type=rational_type.storage_type - ... ) - >>> rational_array = rational_type.wrap_array(storage_array) - >>> rational_array - - -- is_valid: all not null - -- child 0 type: int32 - [ - 10, - 20 - ] - -- child 1 type: int32 - [ - 17, - 13 - ] - - Or do the same with creating an ExtensionArray: - - >>> rational_array = pa.ExtensionArray.from_storage(rational_type, storage_array) - >>> rational_array - - -- is_valid: all not null - -- child 0 type: int32 - [ - 10, - 20 - ] - -- child 1 type: int32 - [ - 17, - 13 - ] - - Unregister the extension type: - - >>> pa.unregister_extension_type("my_package.rational") - - Note that even though we registered the concrete type - ``RationalType(pa.int64())``, PyArrow will be able to deserialize - ``RationalType(integer_type)`` for any ``integer_type``, as the deserializer - will reference the name ``my_package.rational`` and the ``@classmethod`` - ``__arrow_ext_deserialize__``. - """ - - def __init__(self, storage_type: DataType, extension_name: str) -> None: - """ - Initialize an extension type instance. - - This should be called at the end of the subclass' - ``__init__`` method. - """ - def __arrow_ext_serialize__(self) -> bytes: - """ - Serialized representation of metadata to reconstruct the type object. - - This method should return a bytes object, and those serialized bytes - are stored in the custom metadata of the Field holding an extension - type in an IPC message. - The bytes are passed to ``__arrow_ext_deserialize`` and should hold - sufficient information to reconstruct the data type instance. - """ - @classmethod - def __arrow_ext_deserialize__(cls, storage_type: DataType, serialized: bytes) -> Self: - """ - Return an extension type instance from the storage type and serialized - metadata. - This method should return an instance of the ExtensionType subclass - that matches the passed storage type and serialized metadata (the - return value of ``__arrow_ext_serialize__``). - """ + def __init__(self, storage_type: DataType, extension_name: str) -> None: ... -class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): - """ - Concrete class for fixed shape tensor extension type. + def __arrow_ext_serialize__(self) -> bytes: ... - Examples - -------- - Create an instance of fixed shape tensor extension type: + @classmethod + def __arrow_ext_deserialize__( + cls, storage_type: DataType, serialized: bytes) -> Self: ... - >>> import pyarrow as pa - >>> pa.fixed_shape_tensor(pa.int32(), [2, 2]) - FixedShapeTensorType(extension) - Create an instance of fixed shape tensor extension type with - permutation: +class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): - >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), - ... permutation=[0, 2, 1]) - >>> tensor_type.permutation - [0, 2, 1] - """ - @property - def value_type(self) -> _ValueT: - """ - Data type of an individual tensor. - """ @property - def shape(self) -> list[int]: - """ - Shape of the tensors. 
- """ + def value_type(self) -> _ValueT: ... + @property - def dim_names(self) -> list[str] | None: - """ - Explicit names of the dimensions. - """ + def shape(self) -> list[int]: ... + @property - def permutation(self) -> list[int] | None: - """ - Indices of the dimensions ordering. - """ + def dim_names(self) -> list[str] | None: ... -class Bool8Type(BaseExtensionType): - """ - Concrete class for bool8 extension type. + @property + def permutation(self) -> list[int] | None: ... - Bool8 is an alternate representation for boolean - arrays using 8 bits instead of 1 bit per value. The underlying - storage type is int8. - Examples - -------- - Create an instance of bool8 extension type: +class Bool8Type(BaseExtensionType): + ... - >>> import pyarrow as pa - >>> pa.bool8() - Bool8Type(extension) - """ class UuidType(BaseExtensionType): - """ - Concrete class for UUID extension type. - """ - -class JsonType(BaseExtensionType): - """ - Concrete class for JSON extension type. - - Examples - -------- - Define the extension type for JSON array + ... - >>> import pyarrow as pa - >>> json_type = pa.json_(pa.large_utf8()) - Create an extension array +class JsonType(BaseExtensionType): + ... - >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] - >>> storage = pa.array(arr, pa.large_utf8()) - >>> pa.ExtensionArray.from_storage(json_type, storage) - - [ - null, - "{ "id":30, "values":["a", "b"] }" - ] - """ class OpaqueType(BaseExtensionType): - """ - Concrete class for opaque extension type. - - Opaque is a placeholder for a type from an external (often non-Arrow) - system that could not be interpreted. - - Examples - -------- - Create an instance of opaque extension type: - >>> import pyarrow as pa - >>> pa.opaque(pa.int32(), "geometry", "postgis") - OpaqueType(extension) - """ @property - def type_name(self) -> str: - """ - The name of the type in the external system. - """ + def type_name(self) -> str: ... + @property - def vendor_name(self) -> str: - """ - The name of the external system. - """ + def vendor_name(self) -> str: ... + # TODO # @deprecated( @@ -1467,2701 +527,347 @@ class OpaqueType(BaseExtensionType): # """ class UnknownExtensionType(ExtensionType): # type: ignore - """ - A concrete class for Python-defined extension types that refer to - an unknown Python implementation. - - Parameters - ---------- - storage_type : DataType - The storage type for which the extension is built. - serialized : bytes - The serialised output. - """ - def __init__(self, storage_type: DataType, serialized: bytes) -> None: - """ - Initialize self. See help(type(self)) for accurate signature. - """ - -def register_extension_type(ext_type: ExtensionType) -> None: # type: ignore - """ - Register a Python extension type. - - Registration is based on the extension name (so different registered types - need unique extension names). Registration needs an extension type - instance, but then works for any instance of the same subclass regardless - of parametrization of the type. - - Parameters - ---------- - ext_type : BaseExtensionType instance - The ExtensionType subclass to register. - - Examples - -------- - Define a RationalType extension type subclassing ExtensionType: - - >>> import pyarrow as pa - >>> class RationalType(pa.ExtensionType): - ... def __init__(self, data_type: pa.DataType): - ... if not pa.types.is_integer(data_type): - ... raise TypeError(f"data_type must be an integer type not {data_type}") - ... super().__init__( - ... pa.struct( - ... [ - ... ("numer", data_type), - ... 
("denom", data_type), - ... ], - ... ), - ... # N.B. This name does _not_ reference `data_type` so deserialization - ... # will work for _any_ integer `data_type` after registration - ... "my_package.rational", - ... ) - ... def __arrow_ext_serialize__(self) -> bytes: - ... # No parameters are necessary - ... return b"" - ... @classmethod - ... def __arrow_ext_deserialize__(cls, storage_type, serialized): - ... # return an instance of this subclass - ... return RationalType(storage_type[0].type) - - Register the extension type: - - >>> pa.register_extension_type(RationalType(pa.int64())) - - Unregister the extension type: - - >>> pa.unregister_extension_type("my_package.rational") - """ - -def unregister_extension_type(type_name: str) -> None: - """ - Unregister a Python extension type. - - Parameters - ---------- - type_name : str - The name of the ExtensionType subclass to unregister. - - Examples - -------- - Define a RationalType extension type subclassing ExtensionType: - - >>> import pyarrow as pa - >>> class RationalType(pa.ExtensionType): - ... def __init__(self, data_type: pa.DataType): - ... if not pa.types.is_integer(data_type): - ... raise TypeError(f"data_type must be an integer type not {data_type}") - ... super().__init__( - ... pa.struct( - ... [ - ... ("numer", data_type), - ... ("denom", data_type), - ... ], - ... ), - ... # N.B. This name does _not_ reference `data_type` so deserialization - ... # will work for _any_ integer `data_type` after registration - ... "my_package.rational", - ... ) - ... def __arrow_ext_serialize__(self) -> bytes: - ... # No parameters are necessary - ... return b"" - ... @classmethod - ... def __arrow_ext_deserialize__(cls, storage_type, serialized): - ... # return an instance of this subclass - ... return RationalType(storage_type[0].type) - - Register the extension type: - - >>> pa.register_extension_type(RationalType(pa.int64())) - - Unregister the extension type: - - >>> pa.unregister_extension_type("my_package.rational") - """ + + def __init__(self, storage_type: DataType, serialized: bytes) -> None: ... + + +def register_extension_type(ext_type: ExtensionType) -> None: ... # type: ignore + + +def unregister_extension_type(type_name: str) -> None: ... + class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]): - """ - KeyValueMetadata - - Parameters - ---------- - __arg0__ : dict - A dict of the key-value metadata - **kwargs : optional - additional key-value metadata - """ - def __init__(self, __arg0__: Mapping[bytes, bytes] | None = None, **kwargs) -> None: - """ - Initialize self. See help(type(self)) for accurate signature. - """ - def equals(self, other: KeyValueMetadata) -> bool: - """ - Parameters - ---------- - other : pyarrow.KeyValueMetadata - - Returns - ------- - bool - """ - def __len__(self) -> int: - """ - Return len(self). - """ - def __contains__(self, __key: object) -> bool: - """ - Return bool(key in self). - """ - def __getitem__(self, __key: Any) -> Any: - """ - Return self[key]. - """ - def __iter__(self) -> Iterator[bytes]: - """ - Implement iter(self). - """ - def get_all(self, key: str) -> list[bytes]: - """ - Parameters - ---------- - key : str - - Returns - ------- - list[byte] - """ - def to_dict(self) -> dict[bytes, bytes]: - """ - Convert KeyValueMetadata to dict. If a key occurs twice, the value for - the first one is returned - """ + + def __init__(self, __arg0__: Mapping[bytes, bytes] + | None = None, **kwargs) -> None: ... + + def equals(self, other: KeyValueMetadata) -> bool: ... 
+ + def __len__(self) -> int: ... + + def __contains__(self, __key: object) -> bool: ... + + def __getitem__(self, __key: Any) -> Any: ... + + def __iter__(self) -> Iterator[bytes]: ... + + def get_all(self, key: str) -> list[bytes]: ... + + def to_dict(self) -> dict[bytes, bytes]: ... + class Field(_Weakrefable, Generic[_DataTypeT]): - """ - A named field, with a data type, nullability, and optional metadata. - - Notes - ----- - Do not use this class's constructor directly; use pyarrow.field - - Examples - -------- - Create an instance of pyarrow.Field: - - >>> import pyarrow as pa - >>> pa.field('key', pa.int32()) - pyarrow.Field - >>> pa.field('key', pa.int32(), nullable=False) - pyarrow.Field - >>> field = pa.field('key', pa.int32(), - ... metadata={"key": "Something important"}) - >>> field - pyarrow.Field - >>> field.metadata - {b'key': b'Something important'} - - Use the field to create a struct type: - - >>> pa.struct([field]) - StructType(struct) - """ - - def equals(self, other: Field, check_metadata: bool = False) -> bool: - """ - Test if this field is equal to the other - - Parameters - ---------- - other : pyarrow.Field - check_metadata : bool, default False - Whether Field metadata equality should be checked as well. - - Returns - ------- - is_equal : bool - - Examples - -------- - >>> import pyarrow as pa - >>> f1 = pa.field('key', pa.int32()) - >>> f2 = pa.field('key', pa.int32(), nullable=False) - >>> f1.equals(f2) - False - >>> f1.equals(f1) - True - """ - def __hash__(self) -> int: - """ - Return hash(self). - """ + + def equals(self, other: Field, check_metadata: bool = False) -> bool: ... + + def __hash__(self) -> int: ... + @property - def nullable(self) -> bool: - """ - The field nullability. - - Examples - -------- - >>> import pyarrow as pa - >>> f1 = pa.field('key', pa.int32()) - >>> f2 = pa.field('key', pa.int32(), nullable=False) - >>> f1.nullable - True - >>> f2.nullable - False - """ + def nullable(self) -> bool: ... + @property - def name(self) -> str: - """ - The field name. - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field('key', pa.int32()) - >>> field.name - 'key' - """ + def name(self) -> str: ... + @property - def metadata(self) -> dict[bytes, bytes] | None: - """ - The field metadata (if any is set). - - Returns - ------- - metadata : dict or None - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field('key', pa.int32(), - ... metadata={"key": "Something important"}) - >>> field.metadata - {b'key': b'Something important'} - """ + def metadata(self) -> dict[bytes, bytes] | None: ... + @property def type(self) -> _DataTypeT: ... - def with_metadata(self, metadata: dict[bytes | str, bytes | str]) -> Self: - """ - Add metadata as dict of string keys and values to Field - - Parameters - ---------- - metadata : dict - Keys and values must be string-like / coercible to bytes - - Returns - ------- - field : pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field('key', pa.int32()) - - Create new field by adding metadata to existing one: - - >>> field_new = field.with_metadata({"key": "Something important"}) - >>> field_new - pyarrow.Field - >>> field_new.metadata - {b'key': b'Something important'} - """ - def remove_metadata(self) -> Self: - """ - Create new field without metadata, if any - - Returns - ------- - field : pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field('key', pa.int32(), - ... 
metadata={"key": "Something important"}) - >>> field.metadata - {b'key': b'Something important'} - - Create new field by removing the metadata from the existing one: - - >>> field_new = field.remove_metadata() - >>> field_new.metadata - """ - def with_type(self, new_type: _DataTypeT) -> Field[_DataTypeT]: - """ - A copy of this field with the replaced type - - Parameters - ---------- - new_type : pyarrow.DataType - - Returns - ------- - field : pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field('key', pa.int32()) - >>> field - pyarrow.Field - - Create new field by replacing type of an existing one: - - >>> field_new = field.with_type(pa.int64()) - >>> field_new - pyarrow.Field - """ - def with_name(self, name: str) -> Self: - """ - A copy of this field with the replaced name - - Parameters - ---------- - name : str - - Returns - ------- - field : pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field('key', pa.int32()) - >>> field - pyarrow.Field - - Create new field by replacing the name of an existing one: - - >>> field_new = field.with_name('lock') - >>> field_new - pyarrow.Field - """ - def with_nullable(self, nullable: bool) -> Field[_DataTypeT]: - """ - A copy of this field with the replaced nullability - - Parameters - ---------- - nullable : bool - - Returns - ------- - field: pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field('key', pa.int32()) - >>> field - pyarrow.Field - >>> field.nullable - True - - Create new field by replacing the nullability of an existing one: - - >>> field_new = field.with_nullable(False) - >>> field_new - pyarrow.Field - >>> field_new.nullable - False - """ - def flatten(self) -> list[Field]: - """ - Flatten this field. If a struct field, individual child fields - will be returned with their names prefixed by the parent's name. - - Returns - ------- - fields : List[pyarrow.Field] - - Examples - -------- - >>> import pyarrow as pa - >>> f1 = pa.field('bar', pa.float64(), nullable=False) - >>> f2 = pa.field('foo', pa.int32()).with_metadata({"key": "Something important"}) - >>> ff = pa.field('ff', pa.struct([f1, f2]), nullable=False) - - Flatten a struct field: - - >>> ff - pyarrow.Field not null> - >>> ff.flatten() - [pyarrow.Field, pyarrow.Field] - """ - def _export_to_c(self, out_ptr: int) -> None: - """ - Export to a C ArrowSchema struct, given its pointer. - - Be careful: if you don't pass the ArrowSchema struct to a consumer, - its memory will leak. This is a low-level function intended for - expert users. - """ + def with_metadata(self, metadata: dict[bytes | str, bytes | str]) -> Self: ... + + def remove_metadata(self) -> Self: ... + + def with_type(self, new_type: _DataTypeT) -> Field[_DataTypeT]: ... + + def with_name(self, name: str) -> Self: ... + + def with_nullable(self, nullable: bool) -> Field[_DataTypeT]: ... + + def flatten(self) -> list[Field]: ... + + def _export_to_c(self, out_ptr: int) -> None: ... + @classmethod - def _import_from_c(cls, in_ptr: int) -> Self: - """ - Import Field from a C ArrowSchema struct, given its pointer. - - This is a low-level function intended for expert users. - """ - def __arrow_c_schema__(self) -> Any: - """ - Export to a ArrowSchema PyCapsule - - Unlike _export_to_c, this will not leak memory if the capsule is not used. - """ + def _import_from_c(cls, in_ptr: int) -> Self: ... + + def __arrow_c_schema__(self) -> Any: ... 
+ @classmethod - def _import_from_c_capsule(cls, schema) -> Self: - """ - Import a Field from a ArrowSchema PyCapsule + def _import_from_c_capsule(cls, schema) -> Self: ... - Parameters - ---------- - schema : PyCapsule - A valid PyCapsule with name 'arrow_schema' containing an - ArrowSchema pointer. - """ class Schema(_Weakrefable): - """ - A named collection of types a.k.a schema. A schema defines the - column names and types in a record batch or table data structure. - They also contain metadata about the columns. For example, schemas - converted from Pandas contain metadata about their original Pandas - types so they can be converted back to the same types. - - Warnings - -------- - Do not call this class's constructor directly. Instead use - :func:`pyarrow.schema` factory function which makes a new Arrow - Schema object. - - Examples - -------- - Create a new Arrow Schema object: - - >>> import pyarrow as pa - >>> pa.schema([ - ... ('some_int', pa.int32()), - ... ('some_string', pa.string()) - ... ]) - some_int: int32 - some_string: string - - Create Arrow Schema with metadata: - - >>> pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}) - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - """ - - def __len__(self) -> int: - """ - Return len(self). - """ - def __getitem__(self, key: str) -> Field: - """ - Return self[key]. - """ + + def __len__(self) -> int: ... + + def __getitem__(self, key: str) -> Field: ... + _field = __getitem__ # pyright: ignore[reportUnknownVariableType] - def __iter__(self) -> Iterator[Field]: - """ - Implement iter(self). - """ - def __hash__(self) -> int: - """ - Return hash(self). - """ + def __iter__(self) -> Iterator[Field]: ... + + def __hash__(self) -> int: ... + def __sizeof__(self) -> int: ... @property - def pandas_metadata(self) -> dict: - """ - Return deserialized-from-JSON pandas metadata field (if it exists) - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> schema = pa.Table.from_pandas(df).schema - - Select pandas metadata field from Arrow Schema: - - >>> schema.pandas_metadata - {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, 'stop': 4, 'step': 1}], ... - """ + def pandas_metadata(self) -> dict: ... + @property - def names(self) -> list[str]: - """ - The schema's field names. - - Returns - ------- - list of str - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())]) - - Get the names of the schema's fields: - - >>> schema.names - ['n_legs', 'animals'] - """ + def names(self) -> list[str]: ... + @property - def types(self) -> list[DataType]: - """ - The schema's field types. - - Returns - ------- - list of DataType - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())]) - - Get the types of the schema's fields: - - >>> schema.types - [DataType(int64), DataType(string)] - """ + def types(self) -> list[DataType]: ... + @property - def metadata(self) -> dict[bytes, bytes]: - """ - The schema's metadata (if any is set). - - Returns - ------- - metadata: dict or None - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... 
pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}) - - Get the metadata of the schema's fields: - - >>> schema.metadata - {b'n_legs': b'Number of legs per animal'} - """ - def empty_table(self) -> Table: - """ - Provide an empty table according to the schema. - - Returns - ------- - table: pyarrow.Table - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())]) - - Create an empty table with schema's fields: - - >>> schema.empty_table() - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[]] - animals: [[]] - """ - def equals(self, other: Schema, check_metadata: bool = False) -> bool: - """ - Test if this schema is equal to the other - - Parameters - ---------- - other : pyarrow.Schema - check_metadata : bool, default False - Key/value metadata must be equal too - - Returns - ------- - is_equal : bool - - Examples - -------- - >>> import pyarrow as pa - >>> schema1 = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}) - >>> schema2 = pa.schema([ - ... ('some_int', pa.int32()), - ... ('some_string', pa.string()) - ... ]) - - Test two equal schemas: - - >>> schema1.equals(schema1) - True - - Test two unequal schemas: - - >>> schema1.equals(schema2) - False - """ + def metadata(self) -> dict[bytes, bytes]: ... + + def empty_table(self) -> Table: ... + + def equals(self, other: Schema, check_metadata: bool = False) -> bool: ... + @classmethod - def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | None = None) -> Schema: - """ - Returns implied schema from dataframe - - Parameters - ---------- - df : pandas.DataFrame - preserve_index : bool, default True - Whether to store the index as an additional column (or columns, for - MultiIndex) in the resulting `Table`. - The default of None will store the index as a column, except for - RangeIndex which is stored as metadata only. Use - ``preserve_index=True`` to force it to be stored as a column. - - Returns - ------- - pyarrow.Schema - - Examples - -------- - >>> import pandas as pd - >>> import pyarrow as pa - >>> df = pd.DataFrame({ - ... 'int': [1, 2], - ... 'str': ['a', 'b'] - ... }) - - Create an Arrow Schema from the schema of a pandas dataframe: - - >>> pa.Schema.from_pandas(df) - int: int64 - str: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, ... - """ - def field(self, i: int | str | bytes) -> Field: - """ - Select a field by its column name or numeric index. - - Parameters - ---------- - i : int or string - - Returns - ------- - pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())]) - - Select the second field: - - >>> schema.field(1) - pyarrow.Field - - Select the field of the column named 'n_legs': - - >>> schema.field('n_legs') - pyarrow.Field - """ + def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | + None = None) -> Schema: ... + + def field(self, i: int | str | bytes) -> Field: ... 
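Schema's inspection methods follow the same pattern. A small sketch of the accessors typed so far (names, types, metadata, field lookup), again assuming pyarrow is installed:

import pyarrow as pa

schema = pa.schema(
    [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())],
    metadata={"n_legs": "Number of legs per animal"},
)
assert schema.names == ["n_legs", "animals"]
assert schema.types == [pa.int64(), pa.string()]
assert schema.metadata == {b"n_legs": b"Number of legs per animal"}

# field() accepts a position or a name; __getitem__ is typed name-based above.
assert schema.field(1).name == "animals"
assert schema.field("n_legs").type == pa.int64()
assert schema["animals"].type == pa.string()

# empty_table() yields a zero-row Table carrying exactly this schema.
assert schema.empty_table().num_rows == 0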
+ @deprecated("Use 'field' instead") - def field_by_name(self, name: str) -> Field: - """ - DEPRECATED - - Parameters - ---------- - name : str - - Returns - ------- - field: pyarrow.Field - """ - def get_field_index(self, name: str) -> int: - """ - Return index of the unique field with the given name. - - Parameters - ---------- - name : str - The name of the field to look up. - - Returns - ------- - index : int - The index of the field with the given name; -1 if the - name isn't found or there are several fields with the given - name. - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())]) - - Get the index of the field named 'animals': - - >>> schema.get_field_index("animals") - 1 - - Index in case of several fields with the given name: - - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string()), - ... pa.field('animals', pa.bool_())], - ... metadata={"n_legs": "Number of legs per animal"}) - >>> schema.get_field_index("animals") - -1 - """ - def get_all_field_indices(self, name: str) -> list[int]: - """ - Return sorted list of indices for the fields with the given name. - - Parameters - ---------- - name : str - The name of the field to look up. - - Returns - ------- - indices : List[int] - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string()), - ... pa.field('animals', pa.bool_())]) - - Get the indexes of the fields named 'animals': - - >>> schema.get_all_field_indices("animals") - [1, 2] - """ - def append(self, field: Field) -> Schema: - """ - Append a field at the end of the schema. - - In contrast to Python's ``list.append()`` it does return a new - object, leaving the original Schema unmodified. - - Parameters - ---------- - field : Field - - Returns - ------- - schema: Schema - New object with appended field. - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())]) - - Append a field 'extra' at the end of the schema: - - >>> schema_new = schema.append(pa.field('extra', pa.bool_())) - >>> schema_new - n_legs: int64 - animals: string - extra: bool - - Original schema is unmodified: - - >>> schema - n_legs: int64 - animals: string - """ - def insert(self, i: int, field: Field) -> Schema: - """ - Add a field at position i to the schema. - - Parameters - ---------- - i : int - field : Field - - Returns - ------- - schema: Schema - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())]) - - Insert a new field on the second position: - - >>> schema.insert(1, pa.field('extra', pa.bool_())) - n_legs: int64 - extra: bool - animals: string - """ - def remove(self, i: int) -> Schema: - """ - Remove the field at index i from the schema. - - Parameters - ---------- - i : int - - Returns - ------- - schema: Schema - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())]) - - Remove the second field of the schema: - - >>> schema.remove(1) - n_legs: int64 - """ - def set(self, i: int, field: Field) -> Schema: - """ - Replace a field at position i in the schema. 
- - Parameters - ---------- - i : int - field : Field - - Returns - ------- - schema: Schema - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())]) - - Replace the second field of the schema with a new field 'extra': - - >>> schema.set(1, pa.field('replaced', pa.bool_())) - n_legs: int64 - replaced: bool - """ + def field_by_name(self, name: str) -> Field: ... + + def get_field_index(self, name: str) -> int: ... + + def get_all_field_indices(self, name: str) -> list[int]: ... + + def append(self, field: Field) -> Schema: ... + + def insert(self, i: int, field: Field) -> Schema: ... + + def remove(self, i: int) -> Schema: ... + + def set(self, i: int, field: Field) -> Schema: ... + @deprecated("Use 'with_metadata' instead") - def add_metadata(self, metadata: dict) -> Schema: - """ - DEPRECATED - - Parameters - ---------- - metadata : dict - Keys and values must be string-like / coercible to bytes - """ - def with_metadata(self, metadata: dict) -> Schema: - """ - Add metadata as dict of string keys and values to Schema - - Parameters - ---------- - metadata : dict - Keys and values must be string-like / coercible to bytes - - Returns - ------- - schema : pyarrow.Schema - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())]) - - Add metadata to existing schema field: - - >>> schema.with_metadata({"n_legs": "Number of legs per animal"}) - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - """ - def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: - """ - Write Schema to Buffer as encapsulated IPC message - - Parameters - ---------- - memory_pool : MemoryPool, default None - Uses default memory pool if not specified - - Returns - ------- - serialized : Buffer - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())]) - - Write schema to Buffer: - - >>> schema.serialize() - - """ - def remove_metadata(self) -> Schema: - """ - Create new schema without metadata, if any - - Returns - ------- - schema : pyarrow.Schema - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([ - ... pa.field('n_legs', pa.int64()), - ... pa.field('animals', pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}) - >>> schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Create a new schema with removing the metadata from the original: - - >>> schema.remove_metadata() - n_legs: int64 - animals: string - """ + def add_metadata(self, metadata: dict) -> Schema: ... + + def with_metadata(self, metadata: dict) -> Schema: ... + + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... + + def remove_metadata(self) -> Schema: ... 
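The mutation-style methods stubbed above (append/insert/remove/set, the metadata helpers, serialize) all return new objects rather than modifying the schema in place; a hedged sketch:

import pyarrow as pa

schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())])

# append/insert/remove/set return new Schema objects; schemas are immutable.
assert schema.append(pa.field("extra", pa.bool_())).names == ["n_legs", "animals", "extra"]
assert schema.insert(1, pa.field("extra", pa.bool_())).names == ["n_legs", "extra", "animals"]
assert schema.remove(1).names == ["n_legs"]
assert schema.set(1, pa.field("replaced", pa.bool_())).names == ["n_legs", "replaced"]

# Metadata handling mirrors Field: with_metadata/remove_metadata return copies.
tagged = schema.with_metadata({"n_legs": "Number of legs per animal"})
assert tagged.metadata == {b"n_legs": b"Number of legs per animal"}
assert tagged.remove_metadata().metadata is None

# serialize() writes the schema as an encapsulated IPC message into a Buffer.
assert isinstance(schema.serialize(), pa.Buffer)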
+ def to_string( self, truncate_metadata: bool = True, show_field_metadata: bool = True, show_schema_metadata: bool = True, - ) -> str: - """ - Return human-readable representation of Schema - - Parameters - ---------- - truncate_metadata : boolean, default True - Limit metadata key/value display to a single line of ~80 characters - or less - show_field_metadata : boolean, default True - Display Field-level KeyValueMetadata - show_schema_metadata : boolean, default True - Display Schema-level KeyValueMetadata - element_size_limit : int, default 100 - Maximum number of characters of a single element before it is truncated. - - Returns - ------- - str : the formatted output - """ - def _export_to_c(self, out_ptr: int) -> None: - """ - Export to a C ArrowSchema struct, given its pointer. - - Be careful: if you don't pass the ArrowSchema struct to a consumer, - its memory will leak. This is a low-level function intended for - expert users. - """ + ) -> str: ... + + def _export_to_c(self, out_ptr: int) -> None: ... + @classmethod - def _import_from_c(cls, in_ptr: int) -> Schema: - """ - Import Schema from a C ArrowSchema struct, given its pointer. - - This is a low-level function intended for expert users. - """ - def __arrow_c_schema__(self) -> Any: - """ - Export to a ArrowSchema PyCapsule - - Unlike _export_to_c, this will not leak memory if the capsule is not used. - """ + def _import_from_c(cls, in_ptr: int) -> Schema: ... + + def __arrow_c_schema__(self) -> Any: ... + @staticmethod - def _import_from_c_capsule(schema: Any) -> Schema: - """ - Import a Schema from a ArrowSchema PyCapsule + def _import_from_c_capsule(schema: Any) -> Schema: ... - Parameters - ---------- - schema : PyCapsule - A valid PyCapsule with name 'arrow_schema' containing an - ArrowSchema pointer. - """ def unify_schemas( schemas: list[Schema], *, promote_options: Literal["default", "permissive"] = "default" -) -> Schema: - """ - Unify schemas by merging fields by name. - - The resulting schema will contain the union of fields from all schemas. - Fields with the same name will be merged. Note that two fields with - different types will fail merging by default. - - - The unified field will inherit the metadata from the schema where - that field is first defined. - - The first N fields in the schema will be ordered the same as the - N fields in the first schema. - - The resulting schema will inherit its metadata from the first input - schema. - - Parameters - ---------- - schemas : list of Schema - Schemas to merge into a single one. - promote_options : str, default default - Accepts strings "default" and "permissive". - Default: null and only null can be unified with another type. - Permissive: types are promoted to the greater common denominator. - - Returns - ------- - Schema - - Raises - ------ - ArrowInvalid : - If any input schema contains fields with duplicate names. - If Fields of the same name are not mergeable. - """ +) -> Schema: ... + def field( name: SupportArrowSchema | str, type: _DataTypeT, nullable: bool = ..., metadata: dict[Any, Any] | None = None -) -> Field[_DataTypeT] | Field[Any]: - """ - Create a pyarrow.Field instance. - - Parameters - ---------- - name : str or bytes - Name of the field. - Alternatively, you can also pass an object that implements the Arrow - PyCapsule Protocol for schemas (has an ``__arrow_c_schema__`` method). - type : pyarrow.DataType or str - Arrow datatype of the field or a string matching one. - nullable : bool, default True - Whether the field's values are nullable. 
- metadata : dict, default None - Optional field metadata, the keys and values must be coercible to - bytes. - - Returns - ------- - field : pyarrow.Field - - Examples - -------- - Create an instance of pyarrow.Field: - - >>> import pyarrow as pa - >>> pa.field('key', pa.int32()) - pyarrow.Field - >>> pa.field('key', pa.int32(), nullable=False) - pyarrow.Field - - >>> field = pa.field('key', pa.int32(), - ... metadata={"key": "Something important"}) - >>> field - pyarrow.Field - >>> field.metadata - {b'key': b'Something important'} - - Use the field to create a struct type: - - >>> pa.struct([field]) - StructType(struct) - - A str can also be passed for the type parameter: - - >>> pa.field('key', 'int32') - pyarrow.Field - """ - -def null() -> NullType: - """ - Create instance of null type. - - Examples - -------- - Create an instance of a null type: - - >>> import pyarrow as pa - >>> pa.null() - DataType(null) - >>> print(pa.null()) - null - - Create a ``Field`` type with a null type and a name: - - >>> pa.field('null_field', pa.null()) - pyarrow.Field - """ - -def bool_() -> BoolType: - """ - Create instance of boolean type. - - Examples - -------- - Create an instance of a boolean type: - - >>> import pyarrow as pa - >>> pa.bool_() - DataType(bool) - >>> print(pa.bool_()) - bool - - Create a ``Field`` type with a boolean type - and a name: - - >>> pa.field('bool_field', pa.bool_()) - pyarrow.Field - """ - -def uint8() -> UInt8Type: - """ - Create instance of unsigned int8 type. - - Examples - -------- - Create an instance of unsigned int8 type: - - >>> import pyarrow as pa - >>> pa.uint8() - DataType(uint8) - >>> print(pa.uint8()) - uint8 - - Create an array with unsigned int8 type: - - >>> pa.array([0, 1, 2], type=pa.uint8()) - - [ - 0, - 1, - 2 - ] - """ - -def int8() -> Int8Type: - """ - Create instance of signed int8 type. - - Examples - -------- - Create an instance of int8 type: - - >>> import pyarrow as pa - >>> pa.int8() - DataType(int8) - >>> print(pa.int8()) - int8 - - Create an array with int8 type: - - >>> pa.array([0, 1, 2], type=pa.int8()) - - [ - 0, - 1, - 2 - ] - """ - -def uint16() -> UInt16Type: - """ - Create instance of unsigned uint16 type. - - Examples - -------- - Create an instance of unsigned int16 type: - - >>> import pyarrow as pa - >>> pa.uint16() - DataType(uint16) - >>> print(pa.uint16()) - uint16 - - Create an array with unsigned int16 type: - - >>> pa.array([0, 1, 2], type=pa.uint16()) - - [ - 0, - 1, - 2 - ] - """ - -def int16() -> Int16Type: - """ - Create instance of signed int16 type. - - Examples - -------- - Create an instance of int16 type: - - >>> import pyarrow as pa - >>> pa.int16() - DataType(int16) - >>> print(pa.int16()) - int16 - - Create an array with int16 type: - - >>> pa.array([0, 1, 2], type=pa.int16()) - - [ - 0, - 1, - 2 - ] - """ - -def uint32() -> Uint32Type: - """ - Create instance of unsigned uint32 type. - - Examples - -------- - Create an instance of unsigned int32 type: - - >>> import pyarrow as pa - >>> pa.uint32() - DataType(uint32) - >>> print(pa.uint32()) - uint32 - - Create an array with unsigned int32 type: - - >>> pa.array([0, 1, 2], type=pa.uint32()) - - [ - 0, - 1, - 2 - ] - """ - -def int32() -> Int32Type: - """ - Create instance of signed int32 type. 
- - Examples - -------- - Create an instance of int32 type: - - >>> import pyarrow as pa - >>> pa.int32() - DataType(int32) - >>> print(pa.int32()) - int32 - - Create an array with int32 type: - - >>> pa.array([0, 1, 2], type=pa.int32()) - - [ - 0, - 1, - 2 - ] - """ - -def int64() -> Int64Type: - """ - Create instance of signed int64 type. - - Examples - -------- - Create an instance of int64 type: - - >>> import pyarrow as pa - >>> pa.int64() - DataType(int64) - >>> print(pa.int64()) - int64 - - Create an array with int64 type: - - >>> pa.array([0, 1, 2], type=pa.int64()) - - [ - 0, - 1, - 2 - ] - """ - -def uint64() -> UInt64Type: - """ - Create instance of unsigned uint64 type. - - Examples - -------- - Create an instance of unsigned int64 type: - - >>> import pyarrow as pa - >>> pa.uint64() - DataType(uint64) - >>> print(pa.uint64()) - uint64 - - Create an array with unsigned uint64 type: - - >>> pa.array([0, 1, 2], type=pa.uint64()) - - [ - 0, - 1, - 2 - ] - """ - -def timestamp(unit: _Unit, tz: _Tz | None = None) -> TimestampType[_Unit, _Tz]: - """ - Create instance of timestamp type with resolution and optional time zone. - - Parameters - ---------- - unit : str - one of 's' [second], 'ms' [millisecond], 'us' [microsecond], or 'ns' - [nanosecond] - tz : str, default None - Time zone name. None indicates time zone naive - - Examples - -------- - Create an instance of timestamp type: - - >>> import pyarrow as pa - >>> pa.timestamp('us') - TimestampType(timestamp[us]) - >>> pa.timestamp('s', tz='America/New_York') - TimestampType(timestamp[s, tz=America/New_York]) - >>> pa.timestamp('s', tz='+07:30') - TimestampType(timestamp[s, tz=+07:30]) - - Use timestamp type when creating a scalar object: - - >>> from datetime import datetime - >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp('s', tz='UTC')) - - >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp('us')) - - - Returns - ------- - timestamp_type : TimestampType - """ - -def time32(unit: _Time32Unit) -> Time32Type[_Time32Unit]: - """ - Create instance of 32-bit time (time of day) type with unit resolution. - - Parameters - ---------- - unit : str - one of 's' [second], or 'ms' [millisecond] - - Returns - ------- - type : pyarrow.Time32Type - - Examples - -------- - >>> import pyarrow as pa - >>> pa.time32('s') - Time32Type(time32[s]) - >>> pa.time32('ms') - Time32Type(time32[ms]) - """ - -def time64(unit: _Time64Unit) -> Time64Type[_Time64Unit]: - """ - Create instance of 64-bit time (time of day) type with unit resolution. - - Parameters - ---------- - unit : str - One of 'us' [microsecond], or 'ns' [nanosecond]. - - Returns - ------- - type : pyarrow.Time64Type - - Examples - -------- - >>> import pyarrow as pa - >>> pa.time64('us') - Time64Type(time64[us]) - >>> pa.time64('ns') - Time64Type(time64[ns]) - """ - -def duration(unit: _Unit) -> DurationType[_Unit]: - """ - Create instance of a duration type with unit resolution. - - Parameters - ---------- - unit : str - One of 's' [second], 'ms' [millisecond], 'us' [microsecond], or - 'ns' [nanosecond]. 
- - Returns - ------- - type : pyarrow.DurationType - - Examples - -------- - Create an instance of duration type: - - >>> import pyarrow as pa - >>> pa.duration('us') - DurationType(duration[us]) - >>> pa.duration('s') - DurationType(duration[s]) - - Create an array with duration type: - - >>> pa.array([0, 1, 2], type=pa.duration('s')) - - [ - 0, - 1, - 2 - ] - """ - -def month_day_nano_interval() -> MonthDayNanoIntervalType: - """ - Create instance of an interval type representing months, days and - nanoseconds between two dates. - - Examples - -------- - Create an instance of an month_day_nano_interval type: - - >>> import pyarrow as pa - >>> pa.month_day_nano_interval() - DataType(month_day_nano_interval) - - Create a scalar with month_day_nano_interval type: - - >>> pa.scalar((1, 15, -30), type=pa.month_day_nano_interval()) - - """ - -def date32() -> Date32Type: - """ - Create instance of 32-bit date (days since UNIX epoch 1970-01-01). - - Examples - -------- - Create an instance of 32-bit date type: - - >>> import pyarrow as pa - >>> pa.date32() - DataType(date32[day]) - - Create a scalar with 32-bit date type: - - >>> from datetime import date - >>> pa.scalar(date(2012, 1, 1), type=pa.date32()) - - """ - -def date64() -> Date64Type: - """ - Create instance of 64-bit date (milliseconds since UNIX epoch 1970-01-01). - - Examples - -------- - Create an instance of 64-bit date type: - - >>> import pyarrow as pa - >>> pa.date64() - DataType(date64[ms]) - - Create a scalar with 64-bit date type: - - >>> from datetime import datetime - >>> pa.scalar(datetime(2012, 1, 1), type=pa.date64()) - - """ - -def float16() -> Float16Type: - """ - Create half-precision floating point type. - - Examples - -------- - Create an instance of float16 type: - - >>> import pyarrow as pa - >>> pa.float16() - DataType(halffloat) - >>> print(pa.float16()) - halffloat - - Create an array with float16 type: - - >>> arr = np.array([1.5, np.nan], dtype=np.float16) - >>> a = pa.array(arr, type=pa.float16()) - >>> a - - [ - 1.5, - nan - ] - - Note that unlike other float types, if you convert this array - to a python list, the types of its elements will be ``np.float16`` - - >>> [type(val) for val in a.to_pylist()] - [, ] - """ - -def float32() -> Float32Type: - """ - Create single-precision floating point type. - - Examples - -------- - Create an instance of float32 type: - - >>> import pyarrow as pa - >>> pa.float32() - DataType(float) - >>> print(pa.float32()) - float - - Create an array with float32 type: - - >>> pa.array([0.0, 1.0, 2.0], type=pa.float32()) - - [ - 0, - 1, - 2 - ] - """ - -def float64() -> Float64Type: - """ - Create double-precision floating point type. - - Examples - -------- - Create an instance of float64 type: - - >>> import pyarrow as pa - >>> pa.float64() - DataType(double) - >>> print(pa.float64()) - double - - Create an array with float64 type: - - >>> pa.array([0.0, 1.0, 2.0], type=pa.float64()) - - [ - 0, - 1, - 2 - ] - """ - -def decimal32(precision: _Precision, scale: _Scale | None = None) -> Decimal32Type[_Precision, _Scale| Literal[0]]: - """ - Create decimal type with precision and scale and 32-bit width. - - Arrow decimals are fixed-point decimal numbers encoded as a scaled - integer. The precision is the number of significant digits that the - decimal type can represent; the scale is the number of digits after - the decimal point (note the scale can be negative). 
- - As an example, ``decimal32(7, 3)`` can exactly represent the numbers - 1234.567 and -1234.567 (encoded internally as the 32-bit integers - 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. - - ``decimal32(5, -3)`` can exactly represent the number 12345000 - (encoded internally as the 32-bit integer 12345), but neither - 123450000 nor 1234500. - - If you need a precision higher than 9 significant digits, consider - using ``decimal64``, ``decimal128``, or ``decimal256``. - - Parameters - ---------- - precision : int - Must be between 1 and 9 - scale : int - - Returns - ------- - decimal_type : Decimal32Type - - Examples - -------- - Create an instance of decimal type: - - >>> import pyarrow as pa - >>> pa.decimal32(5, 2) - Decimal32Type(decimal32(5, 2)) - - Create an array with decimal type: - - >>> import decimal - >>> a = decimal.Decimal('123.45') - >>> pa.array([a], pa.decimal32(5, 2)) - - [ - 123.45 - ] - """ - -def decimal64(precision: _Precision, scale: _Scale | None = None) -> Decimal64Type[_Precision, _Scale | Literal[0]]: - """ - Create decimal type with precision and scale and 64-bit width. - - Arrow decimals are fixed-point decimal numbers encoded as a scaled - integer. The precision is the number of significant digits that the - decimal type can represent; the scale is the number of digits after - the decimal point (note the scale can be negative). - - As an example, ``decimal64(7, 3)`` can exactly represent the numbers - 1234.567 and -1234.567 (encoded internally as the 64-bit integers - 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. - - ``decimal64(5, -3)`` can exactly represent the number 12345000 - (encoded internally as the 64-bit integer 12345), but neither - 123450000 nor 1234500. - - If you need a precision higher than 18 significant digits, consider - using ``decimal128``, or ``decimal256``. - - Parameters - ---------- - precision : int - Must be between 1 and 18 - scale : int - - Returns - ------- - decimal_type : Decimal64Type - - Examples - -------- - Create an instance of decimal type: - - >>> import pyarrow as pa - >>> pa.decimal64(5, 2) - Decimal64Type(decimal64(5, 2)) - - Create an array with decimal type: - - >>> import decimal - >>> a = decimal.Decimal('123.45') - >>> pa.array([a], pa.decimal64(5, 2)) - - [ - 123.45 - ] - """ - -def decimal128(precision: _Precision, scale: _Scale | None = None) -> Decimal128Type[_Precision, _Scale | Literal[0]]: - """ - Create decimal type with precision and scale and 128-bit width. - - Arrow decimals are fixed-point decimal numbers encoded as a scaled - integer. The precision is the number of significant digits that the - decimal type can represent; the scale is the number of digits after - the decimal point (note the scale can be negative). - - As an example, ``decimal128(7, 3)`` can exactly represent the numbers - 1234.567 and -1234.567 (encoded internally as the 128-bit integers - 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. - - ``decimal128(5, -3)`` can exactly represent the number 12345000 - (encoded internally as the 128-bit integer 12345), but neither - 123450000 nor 1234500. - - If you need a precision higher than 38 significant digits, consider - using ``decimal256``. 
- - Parameters - ---------- - precision : int - Must be between 1 and 38 - scale : int - - Returns - ------- - decimal_type : Decimal128Type - - Examples - -------- - Create an instance of decimal type: - - >>> import pyarrow as pa - >>> pa.decimal128(5, 2) - Decimal128Type(decimal128(5, 2)) - - Create an array with decimal type: - - >>> import decimal - >>> a = decimal.Decimal('123.45') - >>> pa.array([a], pa.decimal128(5, 2)) - - [ - 123.45 - ] - """ - -def decimal256(precision: _Precision, scale: _Scale | None = None) -> Decimal256Type[_Precision, _Scale | Literal[0]]: - """ - Create decimal type with precision and scale and 256-bit width. - - Arrow decimals are fixed-point decimal numbers encoded as a scaled - integer. The precision is the number of significant digits that the - decimal type can represent; the scale is the number of digits after - the decimal point (note the scale can be negative). - - For most use cases, the maximum precision offered by ``decimal128`` - is sufficient, and it will result in a more compact and more efficient - encoding. ``decimal256`` is useful if you need a precision higher - than 38 significant digits. - - Parameters - ---------- - precision : int - Must be between 1 and 76 - scale : int - - Returns - ------- - decimal_type : Decimal256Type - """ - -def string() -> StringType: - """ - Create UTF8 variable-length string type. - - Examples - -------- - Create an instance of a string type: - - >>> import pyarrow as pa - >>> pa.string() - DataType(string) - - and use the string type to create an array: - - >>> pa.array(['foo', 'bar', 'baz'], type=pa.string()) - - [ - "foo", - "bar", - "baz" - ] - """ +) -> Field[_DataTypeT] | Field[Any]: ... -utf8 = string -""" -Alias for string(). -Examples --------- -Create an instance of a string type: +def null() -> NullType: ... ->>> import pyarrow as pa ->>> pa.utf8() -DataType(string) -and use the string type to create an array: +def bool_() -> BoolType: ... ->>> pa.array(['foo', 'bar', 'baz'], type=pa.utf8()) - -[ - "foo", - "bar", - "baz" -] -""" - -def binary(length: Literal[-1] | int = ...) -> BinaryType | FixedSizeBinaryType: - """ - Create variable-length or fixed size binary type. - - Parameters - ---------- - length : int, optional, default -1 - If length == -1 then return a variable length binary type. If length is - greater than or equal to 0 then return a fixed size binary type of - width `length`. - - Examples - -------- - Create an instance of a variable-length binary type: - - >>> import pyarrow as pa - >>> pa.binary() - DataType(binary) - - and use the variable-length binary type to create an array: - - >>> pa.array(['foo', 'bar', 'baz'], type=pa.binary()) - - [ - 666F6F, - 626172, - 62617A - ] - - Create an instance of a fixed-size binary type: - - >>> pa.binary(3) - FixedSizeBinaryType(fixed_size_binary[3]) - - and use the fixed-length binary type to create an array: - - >>> pa.array(['foo', 'bar', 'baz'], type=pa.binary(3)) - - [ - 666F6F, - 626172, - 62617A - ] - """ - -def large_binary() -> LargeBinaryType: - """ - Create large variable-length binary type. - - This data type may not be supported by all Arrow implementations. Unless - you need to represent data larger than 2GB, you should prefer binary(). 
- - Examples - -------- - Create an instance of large variable-length binary type: - - >>> import pyarrow as pa - >>> pa.large_binary() - DataType(large_binary) - - and use the type to create an array: - - >>> pa.array(['foo', 'bar', 'baz'], type=pa.large_binary()) - - [ - 666F6F, - 626172, - 62617A - ] - """ - -def large_string() -> LargeStringType: - """ - Create large UTF8 variable-length string type. - - This data type may not be supported by all Arrow implementations. Unless - you need to represent data larger than 2GB, you should prefer string(). - - Examples - -------- - Create an instance of large UTF8 variable-length binary type: - - >>> import pyarrow as pa - >>> pa.large_string() - DataType(large_string) - - and use the type to create an array: - - >>> pa.array(['foo', 'bar'] * 50, type=pa.large_string()) - - [ - "foo", - "bar", - ... - "foo", - "bar" - ] - """ -large_utf8 = large_string -""" -Alias for large_string(). +def uint8() -> UInt8Type: ... -Examples --------- -Create an instance of large UTF8 variable-length binary type: ->>> import pyarrow as pa ->>> pa.large_utf8() -DataType(large_string) +def int8() -> Int8Type: ... -and use the type to create an array: ->>> pa.array(['foo', 'bar'] * 50, type=pa.large_utf8()) - -[ - "foo", - "bar", - ... - "foo", - "bar" -] -""" +def uint16() -> UInt16Type: ... -def binary_view() -> BinaryViewType: - """ - Create a variable-length binary view type. - Examples - -------- - Create an instance of a string type: +def int16() -> Int16Type: ... - >>> import pyarrow as pa - >>> pa.binary_view() - DataType(binary_view) - """ -def string_view() -> StringViewType: - """ - Create UTF8 variable-length string view type. +def uint32() -> Uint32Type: ... - Examples - -------- - Create an instance of a string type: - >>> import pyarrow as pa - >>> pa.string_view() - DataType(string_view) - """ +def int32() -> Int32Type: ... -def list_( - value_type: _DataTypeT | Field[_DataTypeT], list_size: Literal[-1] | _Size | None = None -) -> ListType[_DataTypeT] | FixedSizeListType[_DataTypeT, _Size]: - """ - Create ListType instance from child data type or field. - - Parameters - ---------- - value_type : DataType or Field - list_size : int, optional, default -1 - If length == -1 then return a variable length list type. If length is - greater than or equal to 0 then return a fixed size list type. - - Returns - ------- - list_type : DataType - - Examples - -------- - Create an instance of ListType: - - >>> import pyarrow as pa - >>> pa.list_(pa.string()) - ListType(list) - >>> pa.list_(pa.int32(), 2) - FixedSizeListType(fixed_size_list[2]) - - Use the ListType to create a scalar: - - >>> pa.scalar(['foo', None], type=pa.list_(pa.string(), 2)) - - - or an array: - - >>> pa.array([[1, 2], [3, 4]], pa.list_(pa.int32(), 2)) - - [ - [ - 1, - 2 - ], - [ - 3, - 4 - ] - ] - """ - -def large_list(value_type: _DataTypeT | Field[_DataTypeT]) -> LargeListType[_DataTypeT]: - """ - Create LargeListType instance from child data type or field. - - This data type may not be supported by all Arrow implementations. - Unless you need to represent data larger than 2**31 elements, you should - prefer list_(). 
- - Parameters - ---------- - value_type : DataType or Field - - Returns - ------- - list_type : DataType - - Examples - -------- - Create an instance of LargeListType: - - >>> import pyarrow as pa - >>> pa.large_list(pa.int8()) - LargeListType(large_list) - - Use the LargeListType to create an array: - - >>> pa.array([[-1, 3]] * 5, type=pa.large_list(pa.int8())) - - [ - [ - -1, - 3 - ], - [ - -1, - 3 - ], - ... - """ -def list_view(value_type: _DataTypeT | Field[_DataTypeT]) -> ListViewType[_DataTypeT]: - """ - Create ListViewType instance from child data type or field. +def int64() -> Int64Type: ... - This data type may not be supported by all Arrow implementations - because it is an alternative to the ListType. - Parameters - ---------- - value_type : DataType or Field +def uint64() -> UInt64Type: ... - Returns - ------- - list_view_type : DataType - Examples - -------- - Create an instance of ListViewType: +def timestamp(unit: _Unit, tz: _Tz | None = None) -> TimestampType[_Unit, _Tz]: ... - >>> import pyarrow as pa - >>> pa.list_view(pa.string()) - ListViewType(list_view) - """ -def large_list_view( - value_type: _DataTypeT | Field[_DataTypeT], -) -> LargeListViewType[_DataTypeT]: - """ - Create LargeListViewType instance from child data type or field. +def time32(unit: _Time32Unit) -> Time32Type[_Time32Unit]: ... - This data type may not be supported by all Arrow implementations - because it is an alternative to the ListType. - Parameters - ---------- - value_type : DataType or Field +def time64(unit: _Time64Unit) -> Time64Type[_Time64Unit]: ... - Returns - ------- - list_view_type : DataType - Examples - -------- - Create an instance of LargeListViewType: +def duration(unit: _Unit) -> DurationType[_Unit]: ... - >>> import pyarrow as pa - >>> pa.large_list_view(pa.int8()) - LargeListViewType(large_list_view) - """ -def map_( - key_type: _K, item_type: _ValueT, key_sorted: _Ordered | None = None -) -> MapType[_K, _ValueT, _Ordered]: - """ - Create MapType instance from key and item data types or fields. - - Parameters - ---------- - key_type : DataType or Field - item_type : DataType or Field - keys_sorted : bool - - Returns - ------- - map_type : DataType - - Examples - -------- - Create an instance of MapType: - - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32()) - MapType(map) - >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True) - MapType(map) - - Use MapType to create an array: - - >>> data = [[{'key': 'a', 'value': 1}, {'key': 'b', 'value': 2}], [{'key': 'c', 'value': 3}]] - >>> pa.array(data, type=pa.map_(pa.string(), pa.int32(), keys_sorted=True)) - - [ - keys: - [ - "a", - "b" - ] - values: - [ - 1, - 2 - ], - keys: - [ - "c" - ] - values: - [ - 3 - ] - ] - """ +def month_day_nano_interval() -> MonthDayNanoIntervalType: ... -def dictionary( - index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered | None = None -) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: - """ - Dictionary (categorical, or simply encoded) type. - Parameters - ---------- - index_type : DataType - value_type : DataType - ordered : bool +def date32() -> Date32Type: ... - Returns - ------- - type : DictionaryType - Examples - -------- - Create an instance of dictionary type: +def date64() -> Date64Type: ... - >>> import pyarrow as pa - >>> pa.dictionary(pa.int64(), pa.utf8()) - DictionaryType(dictionary) - Use dictionary type to create an array: +def float16() -> Float16Type: ... + + +def float32() -> Float32Type: ... + + +def float64() -> Float64Type: ... 
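A minimal doctest-style sketch of the scalar type factories stubbed in this hunk (int32, timestamp, duration, float64); the expected reprs follow the examples in the docstrings removed above and are illustrative only:

    >>> import pyarrow as pa
    >>> pa.int32()
    DataType(int32)
    >>> pa.timestamp('s', tz='UTC')
    TimestampType(timestamp[s, tz=UTC])
    >>> pa.duration('us')
    DurationType(duration[us])
    >>> pa.float64()
    DataType(double)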
+ + +def decimal32(precision: _Precision, scale: _Scale | + None = None) -> Decimal32Type[_Precision, _Scale| Literal[0]]: ... + + +def decimal64(precision: _Precision, scale: _Scale | + None = None) -> Decimal64Type[_Precision, _Scale | Literal[0]]: ... + + +def decimal128(precision: _Precision, scale: _Scale | + None = None) -> Decimal128Type[_Precision, _Scale | Literal[0]]: ... + + +def decimal256(precision: _Precision, scale: _Scale | + None = None) -> Decimal256Type[_Precision, _Scale | Literal[0]]: ... + + +def string() -> StringType: ... + + +utf8 = string + + +def binary(length: Literal[-1] | int = ...) -> BinaryType | FixedSizeBinaryType: ... + + +def large_binary() -> LargeBinaryType: ... + + +def large_string() -> LargeStringType: ... + + +large_utf8 = large_string + + +def binary_view() -> BinaryViewType: ... + + +def string_view() -> StringViewType: ... + + +def list_( + value_type: _DataTypeT | Field[_DataTypeT], list_size: Literal[-1] | _Size | None = None +) -> ListType[_DataTypeT] | FixedSizeListType[_DataTypeT, _Size]: ... + + +def large_list(value_type: _DataTypeT | + Field[_DataTypeT]) -> LargeListType[_DataTypeT]: ... + + +def list_view(value_type: _DataTypeT | + Field[_DataTypeT]) -> ListViewType[_DataTypeT]: ... + + +def large_list_view( + value_type: _DataTypeT | Field[_DataTypeT], +) -> LargeListViewType[_DataTypeT]: ... + + +def map_( + key_type: _K, item_type: _ValueT, key_sorted: _Ordered | None = None +) -> MapType[_K, _ValueT, _Ordered]: ... + + +def dictionary( + index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered | None = None +) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... - >>> pa.array(["a", "b", None, "d"], pa.dictionary(pa.int64(), pa.utf8())) - - ... - -- dictionary: - [ - "a", - "b", - "d" - ] - -- indices: - [ - 0, - 1, - null, - 2 - ] - """ def struct( fields: Iterable[Field[Any] | tuple[str, Field[Any]] | tuple[str, DataType]] | Mapping[str, Field[Any]], -) -> StructType: - """ - Create StructType instance from fields. - - A struct is a nested type parameterized by an ordered sequence of types - (which can all be distinct), called its fields. - - Parameters - ---------- - fields : iterable of Fields or tuples, or mapping of strings to DataTypes - Each field must have a UTF8-encoded name, and these field names are - part of the type metadata. - - Examples - -------- - Create an instance of StructType from an iterable of tuples: - - >>> import pyarrow as pa - >>> fields = [ - ... ('f1', pa.int32()), - ... ('f2', pa.string()), - ... ] - >>> struct_type = pa.struct(fields) - >>> struct_type - StructType(struct) - - Retrieve a field from a StructType: - - >>> struct_type[0] - pyarrow.Field - >>> struct_type['f1'] - pyarrow.Field - - Create an instance of StructType from an iterable of Fields: - - >>> fields = [ - ... pa.field('f1', pa.int32()), - ... pa.field('f2', pa.string(), nullable=False), - ... ] - >>> pa.struct(fields) - StructType(struct) - - Returns - ------- - type : DataType - """ +) -> StructType: ... + def sparse_union( child_fields: list[Field[Any]], type_codes: list[int] | None = None -) -> SparseUnionType: - """ - Create SparseUnionType from child fields. - - A sparse union is a nested type where each logical value is taken from - a single child. A buffer of 8-bit type ids indicates which child - a given logical value is to be taken from. - - In a sparse union, each child array should have the same length as the - union array, regardless of the actual number of union values that - refer to it. 
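A short sketch of the parameterized factories declared above (list_, map_, dictionary), based on the examples in the docstrings removed in this hunk; reprs are indicative:

    >>> import pyarrow as pa
    >>> pa.list_(pa.string())
    ListType(list<item: string>)
    >>> pa.map_(pa.string(), pa.int32())
    MapType(map<string, int32>)
    >>> dict_type = pa.dictionary(pa.int64(), pa.utf8())
    >>> dict_type.index_type   # index and value types are exposed on DictionaryType
    DataType(int64)
    >>> dict_type.value_type
    DataType(string)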
- - Parameters - ---------- - child_fields : sequence of Field values - Each field must have a UTF8-encoded name, and these field names are - part of the type metadata. - type_codes : list of integers, default None - - Returns - ------- - type : SparseUnionType - """ +) -> SparseUnionType: ... + def dense_union( child_fields: list[Field[Any]], type_codes: list[int] | None = None -) -> DenseUnionType: - """ - Create DenseUnionType from child fields. - - A dense union is a nested type where each logical value is taken from - a single child, at a specific offset. A buffer of 8-bit type ids - indicates which child a given logical value is to be taken from, - and a buffer of 32-bit offsets indicates at which physical position - in the given child array the logical value is to be taken from. - - Unlike a sparse union, a dense union allows encoding only the child array - values which are actually referred to by the union array. This is - counterbalanced by the additional footprint of the offsets buffer, and - the additional indirection cost when looking up values. - - Parameters - ---------- - child_fields : sequence of Field values - Each field must have a UTF8-encoded name, and these field names are - part of the type metadata. - type_codes : list of integers, default None - - Returns - ------- - type : DenseUnionType - """ +) -> DenseUnionType: ... + def union( child_fields: list[Field[Any]], mode: Literal["sparse"] | Literal["dense"], type_codes: list[int] | None = None -) -> SparseUnionType | DenseUnionType: - """ - Create UnionType from child fields. - - A union is a nested type where each logical value is taken from a - single child. A buffer of 8-bit type ids indicates which child - a given logical value is to be taken from. - - Unions come in two flavors: sparse and dense - (see also `pyarrow.sparse_union` and `pyarrow.dense_union`). - - Parameters - ---------- - child_fields : sequence of Field values - Each field must have a UTF8-encoded name, and these field names are - part of the type metadata. - mode : str - Must be 'sparse' or 'dense' - type_codes : list of integers, default None - - Returns - ------- - type : UnionType - """ +) -> SparseUnionType | DenseUnionType: ... + def run_end_encoded( run_end_type: _RunEndType, value_type: _BasicValueT -) -> RunEndEncodedType[_RunEndType, _BasicValueT]: - """ - Create RunEndEncodedType from run-end and value types. - - Parameters - ---------- - run_end_type : pyarrow.DataType - The integer type of the run_ends array. Must be 'int16', 'int32', or 'int64'. - value_type : pyarrow.DataType - The type of the values array. - - Returns - ------- - type : RunEndEncodedType - """ - -def json_(storage_type: DataType = ...) -> JsonType: - """ - Create instance of JSON extension type. - - Parameters - ---------- - storage_type : DataType, default pyarrow.string() - The underlying data type. Can be on of the following types: - string, large_string, string_view. - - Returns - ------- - type : JsonType - - Examples - -------- - Create an instance of JSON extension type: - - >>> import pyarrow as pa - >>> pa.json_(pa.utf8()) - JsonType(extension) - - Use the JSON type to create an array: - - >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8())) - - [ - "{"a": 1}", - "{"b": 2}" - ] - """ - -def uuid() -> UuidType: - """ - Create UuidType instance. - - Returns - ------- - type : UuidType - """ +) -> RunEndEncodedType[_RunEndType, _BasicValueT]: ... + + +def json_(storage_type: DataType = ...) -> JsonType: ... + + +def uuid() -> UuidType: ... 
+ def fixed_shape_tensor( value_type: _ValueT, shape: Sequence[int], dim_names: Sequence[str] | None = None, permutation: Sequence[int] | None = None, -) -> FixedShapeTensorType[_ValueT]: - """ - Create instance of fixed shape tensor extension type with shape and optional - names of tensor dimensions and indices of the desired logical - ordering of dimensions. - - Parameters - ---------- - value_type : DataType - Data type of individual tensor elements. - shape : tuple or list of integers - The physical shape of the contained tensors. - dim_names : tuple or list of strings, default None - Explicit names to tensor dimensions. - permutation : tuple or list integers, default None - Indices of the desired ordering of the original dimensions. - The indices contain a permutation of the values ``[0, 1, .., N-1]`` where - N is the number of dimensions. The permutation indicates which dimension - of the logical layout corresponds to which dimension of the physical tensor. - For more information on this parameter see - :ref:`fixed_shape_tensor_extension`. - - Examples - -------- - Create an instance of fixed shape tensor extension type: - - >>> import pyarrow as pa - >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) - >>> tensor_type - FixedShapeTensorType(extension) - - Inspect the data type: - - >>> tensor_type.value_type - DataType(int32) - >>> tensor_type.shape - [2, 2] - - Create a table with fixed shape tensor extension array: - - >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] - >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) - >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage) - >>> pa.table([tensor], names=["tensor_array"]) - pyarrow.Table - tensor_array: extension - ---- - tensor_array: [[[1,2,3,4],[10,20,30,40],[100,200,300,400]]] - - Create an instance of fixed shape tensor extension type with names - of tensor dimensions: - - >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), - ... dim_names=['C', 'H', 'W']) - >>> tensor_type.dim_names - ['C', 'H', 'W'] - - Create an instance of fixed shape tensor extension type with - permutation: - - >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), - ... permutation=[0, 2, 1]) - >>> tensor_type.permutation - [0, 2, 1] - - Returns - ------- - type : FixedShapeTensorType - """ - -def bool8() -> Bool8Type: - """ - Create instance of bool8 extension type. - - Examples - -------- - Create an instance of bool8 extension type: - - >>> import pyarrow as pa - >>> type = pa.bool8() - >>> type - Bool8Type(extension) - - Inspect the data type: - - >>> type.storage_type - DataType(int8) - - Create a table with a bool8 array: - - >>> arr = [-1, 0, 1, 2, None] - >>> storage = pa.array(arr, pa.int8()) - >>> other = pa.ExtensionArray.from_storage(type, storage) - >>> pa.table([other], names=["unknown_col"]) - pyarrow.Table - unknown_col: extension - ---- - unknown_col: [[-1,0,1,2,null]] - - Returns - ------- - type : Bool8Type - """ - -def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueType: - """ - Create instance of opaque extension type. - - Parameters - ---------- - storage_type : DataType - The underlying data type. - type_name : str - The name of the type in the external system. - vendor_name : str - The name of the external system. 
- - Examples - -------- - Create an instance of an opaque extension type: - - >>> import pyarrow as pa - >>> type = pa.opaque(pa.binary(), "other", "jdbc") - >>> type - OpaqueType(extension) - - Inspect the data type: - - >>> type.storage_type - DataType(binary) - >>> type.type_name - 'other' - >>> type.vendor_name - 'jdbc' - - Create a table with an opaque array: - - >>> arr = [None, b"foobar"] - >>> storage = pa.array(arr, pa.binary()) - >>> other = pa.ExtensionArray.from_storage(type, storage) - >>> pa.table([other], names=["unknown_col"]) - pyarrow.Table - unknown_col: extension - ---- - unknown_col: [[null,666F6F626172]] - - Returns - ------- - type : OpaqueType - """ - -def type_for_alias(name: Any) -> DataType: - """ - Return DataType given a string alias if one exists. - - Parameters - ---------- - name : str - The alias of the DataType that should be retrieved. - - Returns - ------- - type : DataType - """ +) -> FixedShapeTensorType[_ValueT]: ... + + +def bool8() -> Bool8Type: ... + + +def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueType: ... + + +def type_for_alias(name: Any) -> DataType: ... + def schema( fields: Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType], metadata: dict[bytes | str, bytes | str] | None = None, -) -> Schema: - """ - Construct pyarrow.Schema from collection of fields. - - Parameters - ---------- - fields : iterable of Fields or tuples, or mapping of strings to DataTypes - Can also pass an object that implements the Arrow PyCapsule Protocol - for schemas (has an ``__arrow_c_schema__`` method). - metadata : dict, default None - Keys and values must be coercible to bytes. - - Examples - -------- - Create a Schema from iterable of tuples: - - >>> import pyarrow as pa - >>> pa.schema([ - ... ('some_int', pa.int32()), - ... ('some_string', pa.string()), - ... pa.field('some_required_string', pa.string(), nullable=False) - ... ]) - some_int: int32 - some_string: string - some_required_string: string not null - - Create a Schema from iterable of Fields: - - >>> pa.schema([ - ... pa.field('some_int', pa.int32()), - ... pa.field('some_string', pa.string()) - ... ]) - some_int: int32 - some_string: string - - DataTypes can also be passed as strings. The following is equivalent to the - above example: - - >>> pa.schema([ - ... pa.field('some_int', "int32"), - ... pa.field('some_string', "string") - ... ]) - some_int: int32 - some_string: string - - Or more concisely: - - >>> pa.schema([ - ... ('some_int', "int32"), - ... ('some_string', "string") - ... ]) - some_int: int32 - some_string: string - - Returns - ------- - schema : pyarrow.Schema - """ - -def from_numpy_dtype(dtype: np.dtype[Any]) -> DataType: - """ - Convert NumPy dtype to pyarrow.DataType. - - Parameters - ---------- - dtype : the numpy dtype to convert - - - Examples - -------- - Create a pyarrow DataType from NumPy dtype: - - >>> import pyarrow as pa - >>> import numpy as np - >>> pa.from_numpy_dtype(np.dtype('float16')) - DataType(halffloat) - >>> pa.from_numpy_dtype('U') - DataType(string) - >>> pa.from_numpy_dtype(bool) - DataType(bool) - >>> pa.from_numpy_dtype(np.str_) - DataType(string) - """ +) -> Schema: ... + + +def from_numpy_dtype(dtype: np.dtype[Any]) -> DataType: ... 
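For reference, schema() and from_numpy_dtype() as exercised in the docstrings removed above (a minimal sketch):

    >>> import pyarrow as pa
    >>> import numpy as np
    >>> pa.schema([('some_int', pa.int32()), ('some_string', pa.string())])
    some_int: int32
    some_string: string
    >>> pa.from_numpy_dtype(np.dtype('float16'))
    DataType(halffloat)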
+ __all__ = [ "_Weakrefable", diff --git a/python/pyarrow-stubs/acero.pyi b/python/pyarrow-stubs/acero.pyi index 2abb608b32c..b3bc83382fb 100644 --- a/python/pyarrow-stubs/acero.pyi +++ b/python/pyarrow-stubs/acero.pyi @@ -32,6 +32,7 @@ from .compute import Expression, FunctionOptions _StrOrExpr: TypeAlias = str | Expression + class Declaration(lib._Weakrefable): def __init__( self, @@ -44,16 +45,23 @@ class Declaration(lib._Weakrefable): def to_reader(self, use_threads: bool = True) -> lib.RecordBatchReader: ... def to_table(self, use_threads: bool = True) -> lib.Table: ... -class ExecNodeOptions(lib._Weakrefable): ... + +class ExecNodeOptions(lib._Weakrefable): + ... + class TableSourceNodeOptions(ExecNodeOptions): def __init__(self, table: lib.Table) -> None: ... + class FilterNodeOptions(ExecNodeOptions): def __init__(self, filter_expression: Expression) -> None: ... + class ProjectNodeOptions(ExecNodeOptions): - def __init__(self, expressions: list[Expression], names: list[str] | None = None) -> None: ... + def __init__(self, expressions: list[Expression], + names: list[str] | None = None) -> None: ... + class AggregateNodeOptions(ExecNodeOptions): def __init__( @@ -62,6 +70,7 @@ class AggregateNodeOptions(ExecNodeOptions): keys: list[_StrOrExpr] | None = None, ) -> None: ... + class OrderByNodeOptions(ExecNodeOptions): def __init__( self, @@ -70,6 +79,7 @@ class OrderByNodeOptions(ExecNodeOptions): null_placement: Literal["at_start", "at_end"] = "at_end", ) -> None: ... + class HashJoinNodeOptions(ExecNodeOptions): def __init__( self, @@ -91,6 +101,7 @@ class HashJoinNodeOptions(ExecNodeOptions): output_suffix_for_right: str = "", ) -> None: ... + class AsofJoinNodeOptions(ExecNodeOptions): def __init__( self, diff --git a/python/pyarrow-stubs/array.pyi b/python/pyarrow-stubs/array.pyi index 3027d689372..7aa67fc8955 100644 --- a/python/pyarrow-stubs/array.pyi +++ b/python/pyarrow-stubs/array.pyi @@ -45,7 +45,7 @@ from pyarrow._stubs_typing import ( SupportArrowArray, SupportArrowDeviceArray, ) -from pyarrow.lib import ( # type: ignore[attr-defined] +from pyarrow.lib import ( # type: ignore[attr-defined] Buffer, Device, # type: ignore[reportAttributeAccessIssue] MemoryManager, # type: ignore[reportAttributeAccessIssue] @@ -84,6 +84,7 @@ from ._types import ( ) from ._stubs_typing import NullableCollection + def array( values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Any | None = None, @@ -92,302 +93,57 @@ def array( from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> ArrayLike: - """ - Create pyarrow.Array instance from a Python object. - - Parameters - ---------- - obj : sequence, iterable, ndarray, pandas.Series, Arrow-compatible array - If both type and size are specified may be a single use iterable. If - not strongly-typed, Arrow type will be inferred for resulting array. - Any Arrow-compatible array that implements the Arrow PyCapsule Protocol - (has an ``__arrow_c_array__`` or ``__arrow_c_device_array__`` method) - can be passed as well. - type : pyarrow.DataType - Explicit type to attempt to coerce to, otherwise will be inferred from - the data. - mask : array[bool], optional - Indicate which values are null (True) or not null (False). - size : int64, optional - Size of the elements. If the input is larger than size bail at this - length. 
For iterators, if size is larger than the input iterator this - will be treated as a "max size", but will involve an initial allocation - of size followed by a resize to the actual size (so if you know the - exact size specifying it correctly will give you better performance). - from_pandas : bool, default None - Use pandas's semantics for inferring nulls from values in - ndarray-like data. If passed, the mask tasks precedence, but - if a value is unmasked (not-null), but still null according to - pandas semantics, then it is null. Defaults to False if not - passed explicitly by user, or True if a pandas object is - passed in. - safe : bool, default True - Check for overflows or other unsafe conversions. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the currently-set default - memory pool. - - Returns - ------- - array : pyarrow.Array or pyarrow.ChunkedArray - A ChunkedArray instead of an Array is returned if: - - - the object data overflowed binary storage. - - the object's ``__arrow_array__`` protocol method returned a chunked - array. - - Notes - ----- - Timezone will be preserved in the returned array for timezone-aware data, - else no timezone will be returned for naive timestamps. - Internally, UTC values are stored for timezone-aware data with the - timezone set in the data type. - - Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by - default converted as MonthDayNanoIntervalArray. relativedelta leapdays - are ignored as are all absolute fields on both objects. datetime.timedelta - can also be converted to MonthDayNanoIntervalArray but this requires - passing MonthDayNanoIntervalType explicitly. - - Converting to dictionary array will promote to a wider integer type for - indices if the number of distinct values cannot be represented, even if - the index type was explicitly set. This means that if there are more than - 127 values the returned dictionary array's index type will be at least - pa.int16() even if pa.int8() was passed to the function. Note that an - explicit index type will not be demoted even if it is wider than required. - - Examples - -------- - >>> import pandas as pd - >>> import pyarrow as pa - >>> pa.array(pd.Series([1, 2])) - - [ - 1, - 2 - ] - - >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string())) - - ... - -- dictionary: - [ - "a", - "b" - ] - -- indices: - [ - 0, - 1, - 0 - ] - - >>> import numpy as np - >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool)) - - [ - 1, - null - ] - - >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())) - >>> arr.type.index_type - DataType(int16) - """ +) -> ArrayLike: ... + def asarray( values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: _DataTypeT | Any | None = None, -) -> Array[Scalar[_DataTypeT]] | ArrayLike: - """ - Convert to pyarrow.Array, inferring type if not provided. - - Parameters - ---------- - values : array-like - This can be a sequence, numpy.ndarray, pyarrow.Array or - pyarrow.ChunkedArray. If a ChunkedArray is passed, the output will be - a ChunkedArray, otherwise the output will be a Array. - type : string or DataType - Explicitly construct the array with this type. Attempt to cast if - indicated type is different. - - Returns - ------- - arr : Array or ChunkedArray - """ +) -> Array[Scalar[_DataTypeT]] | ArrayLike: ... 
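A small sketch of the array()/asarray() behaviour documented in the stripped docstrings (explicit typing, null handling, and pass-through inference); outputs are illustrative:

    >>> import pyarrow as pa
    >>> arr = pa.array([1.0, 2.0, None], type=pa.float32())
    >>> arr.type
    DataType(float)
    >>> arr.null_count
    1
    >>> pa.asarray(arr).type   # already an Array, type is preserved
    DataType(float)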
+ def nulls( size: int, type: Any | None = None, memory_pool: MemoryPool | None = None, -) -> ArrayLike: - """ - Create a strongly-typed Array instance with all elements null. - - Parameters - ---------- - size : int - Array length. - type : pyarrow.DataType, default None - Explicit type for the array. By default use NullType. - memory_pool : MemoryPool, default None - Arrow MemoryPool to use for allocations. Uses the default memory - pool if not passed. - - Returns - ------- - arr : Array - - Examples - -------- - >>> import pyarrow as pa - >>> pa.nulls(10) - - 10 nulls - - >>> pa.nulls(3, pa.uint32()) - - [ - null, - null, - null - ] - """ +) -> ArrayLike: ... + def repeat( value: Any, size: int, memory_pool: MemoryPool | None = None, -) -> ArrayLike: - """ - Create an Array instance whose slots are the given scalar. - - Parameters - ---------- - value : Scalar-like object - Either a pyarrow.Scalar or any python object coercible to a Scalar. - size : int - Number of times to repeat the scalar in the output Array. - memory_pool : MemoryPool, default None - Arrow MemoryPool to use for allocations. Uses the default memory - pool if not passed. - - Returns - ------- - arr : Array - - Examples - -------- - >>> import pyarrow as pa - >>> pa.repeat(10, 3) - - [ - 10, - 10, - 10 - ] - - >>> pa.repeat([1, 2], 2) - - [ - [ - 1, - 2 - ], - [ - 1, - 2 - ] - ] - - >>> pa.repeat("string", 3) - - [ - "string", - "string", - "string" - ] - - >>> pa.repeat(pa.scalar({'a': 1, 'b': [1, 2]}), 2) - - -- is_valid: all not null - -- child 0 type: int64 - [ - 1, - 1 - ] - -- child 1 type: list - [ - [ - 1, - 2 - ], - [ - 1, - 2 - ] - ] - """ - -def infer_type(values: Iterable[Any], mask: Mask, from_pandas: bool = False) -> DataType: - """ - Attempt to infer Arrow data type that can hold the passed Python - sequence type in an Array object - - Parameters - ---------- - values : array-like - Sequence to infer type from. - mask : ndarray (bool type), optional - Optional exclusion mask where True marks null, False non-null. - from_pandas : bool, default False - Use pandas's NA/null sentinel values for type inference. - - Returns - ------- - type : DataType - """ +) -> ArrayLike: ... + + +def infer_type(values: Iterable[Any], mask: Mask, + from_pandas: bool = False) -> DataType: ... + class ArrayStatistics(_Weakrefable): - """ - The class for statistics of an array. - """ + @property - def null_count(self) -> int: - """ - The number of nulls. - """ + def null_count(self) -> int: ... + @property - def distinct_count(self) -> int: - """ - The number of distinct values. - """ + def distinct_count(self) -> int: ... + @property - def min(self) -> Any: - """ - The minimum value. - """ + def min(self) -> Any: ... + @property - def is_min_exact(self) -> bool: - """ - Whether the minimum value is an exact value or not. - """ + def is_min_exact(self) -> bool: ... + @property - def max(self) -> Any: - """ - The maximum value. - """ + def max(self) -> Any: ... @property - def is_max_exact(self) -> bool: - """ - Whether the maximum value is an exact value or not. - """ + def is_max_exact(self) -> bool: ... 
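nulls() and repeat(), illustrated along the lines of the removed docstrings (a minimal sketch):

    >>> import pyarrow as pa
    >>> len(pa.nulls(3, pa.uint32()))   # all-null array of the requested type
    3
    >>> pa.repeat("string", 3).to_pylist()
    ['string', 'string', 'string']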
+ _ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series) + class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): def to_pandas( self, @@ -407,287 +163,38 @@ class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): maps_as_pydicts: Literal["None", "lossy", "strict"] | None = None, types_mapper: Callable[[DataType], ExtensionDtype | None] | None = None, coerce_temporal_nanoseconds: bool = False, - ) -> _ConvertAs: - """ - Convert to a pandas-compatible NumPy array or DataFrame, as appropriate - - Parameters - ---------- - memory_pool : MemoryPool, default None - Arrow MemoryPool to use for allocations. Uses the default memory - pool if not passed. - categories : list, default empty - List of fields that should be returned as pandas.Categorical. Only - applies to table-like data structures. - strings_to_categorical : bool, default False - Encode string (UTF8) and binary types to pandas.Categorical. - zero_copy_only : bool, default False - Raise an ArrowException if this function call would require copying - the underlying data. - integer_object_nulls : bool, default False - Cast integers with nulls to objects - date_as_object : bool, default True - Cast dates to objects. If False, convert to datetime64 dtype with - the equivalent time unit (if supported). Note: in pandas version - < 2.0, only datetime64[ns] conversion is supported. - timestamp_as_object : bool, default False - Cast non-nanosecond timestamps (np.datetime64) to objects. This is - useful in pandas version 1.x if you have timestamps that don't fit - in the normal date range of nanosecond timestamps (1678 CE-2262 CE). - Non-nanosecond timestamps are supported in pandas version 2.0. - If False, all timestamps are converted to datetime64 dtype. - use_threads : bool, default True - Whether to parallelize the conversion using multiple threads. - deduplicate_objects : bool, default True - Do not create multiple copies Python objects when created, to save - on memory use. Conversion will be slower. - ignore_metadata : bool, default False - If True, do not use the 'pandas' metadata to reconstruct the - DataFrame index, if present - safe : bool, default True - For certain data types, a cast is needed in order to store the - data in a pandas DataFrame or Series (e.g. timestamps are always - stored as nanoseconds in pandas). This option controls whether it - is a safe cast or not. - split_blocks : bool, default False - If True, generate one internal "block" for each column when - creating a pandas.DataFrame from a RecordBatch or Table. While this - can temporarily reduce memory note that various pandas operations - can trigger "consolidation" which may balloon memory use. - self_destruct : bool, default False - EXPERIMENTAL: If True, attempt to deallocate the originating Arrow - memory while converting the Arrow object to pandas. If you use the - object after calling to_pandas with this option it will crash your - program. - - Note that you may not see always memory usage improvements. For - example, if multiple columns share an underlying allocation, - memory can't be freed until all columns are converted. - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. 
- This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. - types_mapper : function, default None - A function mapping a pyarrow DataType to a pandas ExtensionDtype. - This can be used to override the default pandas type for conversion - of built-in pyarrow types or in absence of pandas_metadata in the - Table schema. The function receives a pyarrow DataType and is - expected to return a pandas ExtensionDtype or ``None`` if the - default conversion should be used for that type. If you have - a dictionary mapping, you can pass ``dict.get`` as function. - coerce_temporal_nanoseconds : bool, default False - Only applicable to pandas version >= 2.0. - A legacy option to coerce date32, date64, duration, and timestamp - time units to nanoseconds when converting to pandas. This is the - default behavior in pandas version 1.x. Set this option to True if - you'd like to use this coercion when using pandas version >= 2.0 - for backwards compatibility (not recommended otherwise). - - Returns - ------- - pandas.Series or pandas.DataFrame depending on type of object - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - - Convert a Table to pandas DataFrame: - - >>> table = pa.table([ - ... pa.array([2, 4, 5, 100]), - ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - ... ], names=['n_legs', 'animals']) - >>> table.to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - 2 5 Brittle stars - 3 100 Centipede - >>> isinstance(table.to_pandas(), pd.DataFrame) - True - - Convert a RecordBatch to pandas DataFrame: - - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> batch = pa.record_batch([n_legs, animals], - ... names=["n_legs", "animals"]) - >>> batch - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - >>> batch.to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - 2 5 Brittle stars - 3 100 Centipede - >>> isinstance(batch.to_pandas(), pd.DataFrame) - True - - Convert a Chunked Array to pandas Series: - - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.to_pandas() - 0 2 - 1 2 - 2 4 - 3 4 - 4 5 - 5 100 - dtype: int64 - >>> isinstance(n_legs.to_pandas(), pd.Series) - True - """ + ) -> _ConvertAs: ... + _CastAs = TypeVar("_CastAs", bound=DataType) _Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) _ScalarT = TypeVar("_ScalarT", bound=Scalar) + class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): - """ - The base class for all Arrow arrays. - """ - - def diff(self, other: Self) -> str: - """ - Compare contents of this array against another one. - - Return a string containing the result of diffing this array - (on the left side) against the other array (on the right side). - - Parameters - ---------- - other : Array - The other array to compare this array with. - - Returns - ------- - diff : str - A human-readable printout of the differences. 
- - Examples - -------- - >>> import pyarrow as pa - >>> left = pa.array(["one", "two", "three"]) - >>> right = pa.array(["two", None, "two-and-a-half", "three"]) - >>> print(left.diff(right)) # doctest: +SKIP - - @@ -0, +0 @@ - -"one" - @@ -2, +1 @@ - +null - +"two-and-a-half" - """ + + def diff(self, other: Self) -> str: ... + def cast( self, target_type: _CastAs, safe: bool = True, options: CastOptions | None = None, memory_pool: MemoryPool | None = None, - ) -> Array[Scalar[_CastAs]]: - """ - Cast array values to another data type - - See :func:`pyarrow.compute.cast` for usage. - - Parameters - ---------- - target_type : DataType, default None - Type to cast array to. - safe : boolean, default True - Whether to check for conversion errors such as overflow. - options : CastOptions, default None - Additional checks pass by CastOptions - memory_pool : MemoryPool, optional - memory pool to use for allocations during function execution. - - Returns - ------- - cast : Array - """ - def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: - """ - Return zero-copy "view" of array as another data type. - - The data types must have compatible columnar buffer layouts - - Parameters - ---------- - target_type : DataType - Type to construct view as. - - Returns - ------- - view : Array - """ - def sum(self, **kwargs) -> _Scalar_co: - """ - Sum the values in a numerical array. - - See :func:`pyarrow.compute.sum` for full usage. - - Parameters - ---------- - **kwargs : dict, optional - Options to pass to :func:`pyarrow.compute.sum`. - - Returns - ------- - sum : Scalar - A scalar containing the sum value. - """ + ) -> Array[Scalar[_CastAs]]: ... + + def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: ... + + def sum(self, **kwargs) -> _Scalar_co: ... + @property def type(self: Array[Scalar[_DataTypeT]]) -> _DataTypeT: ... - def unique(self) -> Self: - """ - Compute distinct elements in array. - - Returns - ------- - unique : Array - An array of the same data type, with deduplicated elements. - """ - def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: - """ - Compute dictionary-encoded representation of array. - - See :func:`pyarrow.compute.dictionary_encode` for full usage. - - Parameters - ---------- - null_encoding : str, default "mask" - How to handle null entries. - - Returns - ------- - encoded : DictionaryArray - A dictionary-encoded version of this array. - """ - def value_counts(self) -> StructArray: - """ - Compute counts of unique elements in array. - - Returns - ------- - StructArray - An array of structs - """ + def unique(self) -> Self: ... + + def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: ... + + def value_counts(self) -> StructArray: ... + @staticmethod def from_pandas( obj: pd.Series | np.ndarray | ArrayLike, @@ -696,39 +203,8 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): type: _DataTypeT | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, - ) -> Array[Scalar[_DataTypeT]] | Array[Scalar]: - """ - Convert pandas.Series to an Arrow Array. - - This method uses Pandas semantics about what values indicate - nulls. See pyarrow.array for more general conversion from arrays or - sequences to Arrow arrays. - - Parameters - ---------- - obj : ndarray, pandas.Series, array-like - mask : array (boolean), optional - Indicate which values are null (True) or not null (False). 
- type : pyarrow.DataType - Explicit type to attempt to coerce to, otherwise will be inferred - from the data. - safe : bool, default True - Check for overflows or other unsafe conversions. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the currently-set default - memory pool. - - Notes - ----- - Localized timestamps will currently be returned as UTC (pandas's native - representation). Timezone-naive data will be implicitly interpreted as - UTC. - - Returns - ------- - array : pyarrow.Array or pyarrow.ChunkedArray - ChunkedArray is returned if object data overflows binary buffer. - """ + ) -> Array[Scalar[_DataTypeT]] | Array[Scalar]: ... + @staticmethod def from_buffers( type: _DataTypeT, @@ -737,68 +213,18 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): null_count: int = -1, offset=0, children: NullableCollection[Array[Scalar[_DataTypeT]]] | None = None, - ) -> Array[Scalar[_DataTypeT]]: - """ - Construct an Array from a sequence of buffers. - - The concrete type returned depends on the datatype. - - Parameters - ---------- - type : DataType - The value type of the array. - length : int - The number of values in the array. - buffers : List[Buffer] - The buffers backing this array. - null_count : int, default -1 - The number of null entries in the array. Negative value means that - the null count is not known. - offset : int, default 0 - The array's logical offset (in values, not in bytes) from the - start of each buffer. - children : List[Array], default None - Nested type children with length matching type.num_fields. - - Returns - ------- - array : Array - """ + ) -> Array[Scalar[_DataTypeT]]: ... + @property def null_count(self) -> int: ... @property - def nbytes(self) -> int: - """ - Total number of bytes consumed by the elements of the array. - - In other words, the sum of bytes from all buffer - ranges referenced. + def nbytes(self) -> int: ... - Unlike `get_total_buffer_size` this method will account for array - offsets. + def get_total_buffer_size(self) -> int: ... - If buffers are shared between arrays then the shared - portion will be counted multiple times. - - The dictionary of dictionary arrays will always be counted in their - entirety even if the array only references a portion of the dictionary. - """ - def get_total_buffer_size(self) -> int: - """ - The sum of bytes in each buffer referenced by the array. - - An array may only reference a portion of a buffer. - This method will overestimate in this case and return the - byte size of the entire buffer. - - If a buffer is referenced multiple times then it will - only be counted once. - """ def __sizeof__(self) -> int: ... - def __iter__(self) -> Iterator[_Scalar_co]: - """ - Implement iter(self). - """ + def __iter__(self) -> Iterator[_Scalar_co]: ... + def to_string( self, *, @@ -807,166 +233,37 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): window: int = 10, container_window: int = 2, skip_new_lines: bool = False, - ) -> str: - """ - Render a "pretty-printed" string representation of the Array. - - Note: for data on a non-CPU device, the full array is copied to CPU - memory. - - Parameters - ---------- - indent : int, default 2 - How much to indent the internal items in the string to - the right, by default ``2``. - top_level_indent : int, default 0 - How much to indent right the entire content of the array, - by default ``0``. 
- window : int - How many primitive items to preview at the begin and end - of the array when the array is bigger than the window. - The other items will be ellipsed. - container_window : int - How many container items (such as a list in a list array) - to preview at the begin and end of the array when the array - is bigger than the window. - skip_new_lines : bool - If the array should be rendered as a single line of text - or if each element should be on its own line. - element_size_limit : int, default 100 - Maximum number of characters of a single element before it is truncated. - """ + ) -> str: ... + format = to_string - def equals(self, other: Self) -> bool: - """ - Parameters - ---------- - other : pyarrow.Array - - Returns - ------- - bool - """ - def __len__(self) -> int: - """ - Return len(self). - """ - def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: - """ - Return BooleanArray indicating the null values. - - Parameters - ---------- - nan_is_null : bool (optional, default False) - Whether floating-point NaN values should also be considered null. - - Returns - ------- - array : boolean Array - """ - def is_nan(self) -> BooleanArray: - """ - Return BooleanArray indicating the NaN values. - - Returns - ------- - array : boolean Array - """ - def is_valid(self) -> BooleanArray: - """ - Return BooleanArray indicating the non-null values. - """ + def equals(self, other: Self) -> bool: ... + + def __len__(self) -> int: ... + + def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: ... + + def is_nan(self) -> BooleanArray: ... + + def is_valid(self) -> BooleanArray: ... + def fill_null( self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType - ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: - """ - See :func:`pyarrow.compute.fill_null` for usage. - - Parameters - ---------- - fill_value : any - The replacement value for null entries. - - Returns - ------- - result : Array - A new array with nulls replaced by the given value. - """ - def __getitem__(self, key: int | builtins.slice) -> _Scalar_co | Self: - """ - Slice or return value at given index - - Parameters - ---------- - key : integer or slice - Slices with step not equal to 1 (or None) will produce a copy - rather than a zero-copy view - - Returns - ------- - value : Scalar (index) or Array (slice) - """ - def slice(self, offset: int = 0, length: int | None = None) -> Self: - """ - Compute zero-copy slice of this array. - - Parameters - ---------- - offset : int, default 0 - Offset from start of array to slice. - length : int, default None - Length of slice (default is until end of Array starting from - offset). - - Returns - ------- - sliced : Array - An array with the same datatype, containing the sliced values. - """ - def take(self, indices: Indices) -> Self: - """ - Select values from an array. - - See :func:`pyarrow.compute.take` for full usage. - - Parameters - ---------- - indices : Array or array-like - The indices in the array whose values will be returned. - - Returns - ------- - taken : Array - An array with the same datatype, containing the taken values. - """ - def drop_null(self) -> Self: - """ - Remove missing values from an array. - """ + ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: ... + + def __getitem__(self, key: int | builtins.slice) -> _Scalar_co | Self: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def take(self, indices: Indices) -> Self: ... + + def drop_null(self) -> Self: ... 
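The selection and null-handling methods stubbed in this hunk (is_null, fill_null, take, drop_null), sketched on a small array; the values follow from the semantics described in the removed docstrings:

    >>> import pyarrow as pa
    >>> arr = pa.array([1, None, 3, 4])
    >>> arr.is_null().to_pylist()
    [False, True, False, False]
    >>> arr.fill_null(0).to_pylist()
    [1, 0, 3, 4]
    >>> arr.take(pa.array([0, 2])).to_pylist()
    [1, 3]
    >>> arr.drop_null().to_pylist()
    [1, 3, 4]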
+ def filter( self, mask: Mask, *, null_selection_behavior: Literal["drop", "emit_null"] = "drop", - ) -> Self: - """ - Select values from an array. - - See :func:`pyarrow.compute.filter` for full usage. - - Parameters - ---------- - mask : Array or array-like - The boolean mask to filter the array with. - null_selection_behavior : str, default "drop" - How nulls in the mask should be handled. - - Returns - ------- - filtered : Array - An array of the same type, with only the elements selected by - the boolean mask. - """ + ) -> Self: ... def index( self: Array[_ScalarT] | Array[Scalar[_BasicDataType[_AsPyType]]], @@ -975,567 +272,190 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): end: int | None = None, *, memory_pool: MemoryPool | None = None, - ) -> Int64Scalar: - """ - Find the first index of a value. - - See :func:`pyarrow.compute.index` for full usage. - - Parameters - ---------- - value : Scalar or object - The value to look for in the array. - start : int, optional - The start index where to look for `value`. - end : int, optional - The end index where to look for `value`. - memory_pool : MemoryPool, optional - A memory pool for potential memory allocations. - - Returns - ------- - index : Int64Scalar - The index of the value in the array (-1 if not found). - """ - def sort(self, order: Order = "ascending", **kwargs) -> Self: - """ - Sort the Array - - Parameters - ---------- - order : str, default "ascending" - Which order to sort values in. - Accepted values are "ascending", "descending". - **kwargs : dict, optional - Additional sorting options. - As allowed by :class:`SortOptions` - - Returns - ------- - result : Array - """ - def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... - def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> np.ndarray: - """ - Return a NumPy view or copy of this array. - - By default, tries to return a view of this array. This is only - supported for primitive arrays with the same memory layout as NumPy - (i.e. integers, floating point, ..) and without any nulls. - - For the extension arrays, this method simply delegates to the - underlying storage array. - - Parameters - ---------- - zero_copy_only : bool, default True - If True, an exception will be raised if the conversion to a numpy - array would require copying the underlying data (e.g. in presence - of nulls, or for non-primitive types). - writable : bool, default False - For numpy arrays created with zero copy (view on the Arrow data), - the resulting array is not writable (Arrow data is immutable). - By setting this to True, a copy of the array is made to ensure - it is writable. - - Returns - ------- - array : numpy.ndarray - """ + ) -> Int64Scalar: ... + + def sort(self, order: Order = "ascending", **kwargs) -> Self: ... + + def __array__(self, dtype: np.dtype | None = None, + copy: bool | None = None) -> np.ndarray: ... + + def to_numpy(self, zero_copy_only: bool = True, + writable: bool = False) -> np.ndarray: ... + def to_pylist( self: Array[Scalar[_BasicDataType[_AsPyType]]], *, maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[_AsPyType | None]: - """ - Convert to a list of native Python objects. - - Parameters - ---------- - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. 
- The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - - If 'lossy', whenever duplicate keys are detected, a warning will be printed. - The last seen value of a duplicate key will be in the Python dictionary. - If 'strict', this instead results in an exception being raised when detected. - - Returns - ------- - lst : list - """ + ) -> list[_AsPyType | None]: ... + tolist = to_pylist - def validate(self, *, full: bool = False) -> None: - """ - Perform validation checks. An exception is raised if validation fails. - - By default only cheap validation checks are run. Pass `full=True` - for thorough validation checks (potentially O(n)). - - Parameters - ---------- - full : bool, default False - If True, run expensive checks, otherwise cheap checks only. - - Raises - ------ - ArrowInvalid - """ + def validate(self, *, full: bool = False) -> None: ... + @property - def offset(self) -> int: - """ - A relative position into another array's data. - - The purpose is to enable zero-copy slicing. This value defaults to zero - but must be applied on all operations with the physical storage - buffers. - """ - def buffers(self) -> list[Buffer | None]: - """ - Return a list of Buffer objects pointing to this array's physical - storage. - - To correctly interpret these buffers, you need to also apply the offset - multiplied with the size of the stored data type. - """ - def copy_to(self, destination: MemoryManager | Device) -> Self: - """ - Construct a copy of the array with all buffers on destination - device. - - This method recursively copies the array's buffers and those of its - children onto the destination MemoryManager device and returns the - new Array. - - Parameters - ---------- - destination : pyarrow.MemoryManager or pyarrow.Device - The destination device to copy the array to. - - Returns - ------- - Array - """ - def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: - """ - Export to a C ArrowArray struct, given its pointer. - - If a C ArrowSchema struct pointer is also given, the array type - is exported to it at the same time. - - Parameters - ---------- - out_ptr: int - The raw pointer to a C ArrowArray struct. - out_schema_ptr: int (optional) - The raw pointer to a C ArrowSchema struct. - - Be careful: if you don't pass the ArrowArray struct to a consumer, - array memory will leak. This is a low-level function intended for - expert users. - """ + def offset(self) -> int: ... + + def buffers(self) -> list[Buffer | None]: ... + + def copy_to(self, destination: MemoryManager | Device) -> Self: ... + + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + @classmethod - def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: - """ - Import Array from a C ArrowArray struct, given its pointer - and the imported array type. - - Parameters - ---------- - in_ptr: int - The raw pointer to a C ArrowArray struct. - type: DataType or int - Either a DataType object, or the raw pointer to a C ArrowSchema - struct. - - This is a low-level function intended for expert users. - """ - def __arrow_c_array__(self, requested_schema=None) -> Any: - """ - Get a pair of PyCapsules containing a C ArrowArray representation of the object. 
- - Parameters - ---------- - requested_schema : PyCapsule | None - A PyCapsule containing a C ArrowSchema representation of a requested - schema. PyArrow will attempt to cast the array to this data type. - If None, the array will be returned as-is, with a type matching the - one returned by :meth:`__arrow_c_schema__()`. - - Returns - ------- - Tuple[PyCapsule, PyCapsule] - A pair of PyCapsules containing a C ArrowSchema and ArrowArray, - respectively. - """ + def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: ... + + def __arrow_c_array__(self, requested_schema=None) -> Any: ... + @classmethod def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... - def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: - """ - Export to a C ArrowDeviceArray struct, given its pointer. - - If a C ArrowSchema struct pointer is also given, the array type - is exported to it at the same time. - - Parameters - ---------- - out_ptr: int - The raw pointer to a C ArrowDeviceArray struct. - out_schema_ptr: int (optional) - The raw pointer to a C ArrowSchema struct. - - Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, - array memory will leak. This is a low-level function intended for - expert users. - """ + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + @classmethod - def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: - """ - Import Array from a C ArrowDeviceArray struct, given its pointer - and the imported array type. - - Parameters - ---------- - in_ptr: int - The raw pointer to a C ArrowDeviceArray struct. - type: DataType or int - Either a DataType object, or the raw pointer to a C ArrowSchema - struct. - - This is a low-level function intended for expert users. - """ - - def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: - """ - Get a pair of PyCapsules containing a C ArrowDeviceArray representation - of the object. - - Parameters - ---------- - requested_schema : PyCapsule | None - A PyCapsule containing a C ArrowSchema representation of a requested - schema. PyArrow will attempt to cast the array to this data type. - If None, the array will be returned as-is, with a type matching the - one returned by :meth:`__arrow_c_schema__()`. - kwargs - Currently no additional keyword arguments are supported, but - this method will accept any keyword with a value of ``None`` - for compatibility with future keywords. - - Returns - ------- - Tuple[PyCapsule, PyCapsule] - A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, - respectively. - """ + def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: ... + + def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ... + @classmethod def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... - def __dlpack__(self, stream: int | None = None) -> Any: - """ - Export a primitive array as a DLPack capsule. - - Parameters - ---------- - stream : int, optional - A Python integer representing a pointer to a stream. Currently not supported. - Stream is provided by the consumer to the producer to instruct the producer - to ensure that operations can safely be performed on the array. - - Returns - ------- - capsule : PyCapsule - A DLPack capsule for the array, pointing to a DLManagedTensor. - """ - def __dlpack_device__(self) -> tuple[int, int]: - """ - Return the DLPack device tuple this arrays resides on. 
- - Returns - ------- - tuple : Tuple[int, int] - Tuple with index specifying the type of the device (where - CPU = 1, see cpp/src/arrow/c/dpack_abi.h) and index of the - device which is 0 by default for CPU. - """ - @property - def device_type(self) -> DeviceAllocationType: - """ - The device type where the array resides. + def __dlpack__(self, stream: int | None = None) -> Any: ... - Returns - ------- - DeviceAllocationType - """ + def __dlpack_device__(self) -> tuple[int, int]: ... @property - def is_cpu(self) -> bool: - """ - Whether the array is CPU-accessible. - """ + def device_type(self) -> DeviceAllocationType: ... + @property - def statistics(self) -> ArrayStatistics | None: - """ - Statistics of the array. - """ + def is_cpu(self) -> bool: ... + + @property + def statistics(self) -> ArrayStatistics | None: ... + class NullArray(Array[NullScalar]): - """ - Concrete class for Arrow arrays of null data type. - """ + ... + class BooleanArray(Array[BooleanScalar]): - """ - Concrete class for Arrow arrays of boolean data type. - """ + @property def false_count(self) -> int: ... @property def true_count(self) -> int: ... + class NumericArray(Array[_ScalarT]): - """ - A base class for Arrow numeric arrays. - """ + ... + + class IntegerArray(NumericArray[_ScalarT]): - """ - A base class for Arrow integer arrays. - """ + ... + + class FloatingPointArray(NumericArray[_ScalarT]): - """ - A base class for Arrow floating-point arrays. - """ + ... + + class Int8Array(IntegerArray[Int8Scalar]): - """ - Concrete class for Arrow arrays of int8 data type. - """ + ... + + class UInt8Array(IntegerArray[UInt8Scalar]): - """ - Concrete class for Arrow arrays of uint8 data type. - """ + ... + + class Int16Array(IntegerArray[Int16Scalar]): - """ - Concrete class for Arrow arrays of int16 data type. - """ + ... + + class UInt16Array(IntegerArray[UInt16Scalar]): - """ - Concrete class for Arrow arrays of uint16 data type. - """ + ... + + class Int32Array(IntegerArray[Int32Scalar]): - """ - Concrete class for Arrow arrays of int32 data type. - """ + ... + + class UInt32Array(IntegerArray[UInt32Scalar]): - """ - Concrete class for Arrow arrays of uint32 data type. - """ + ... + + class Int64Array(IntegerArray[Int64Scalar]): - """ - Concrete class for Arrow arrays of int64 data type. - """ + ... + + class UInt64Array(IntegerArray[UInt64Scalar]): - """ - Concrete class for Arrow arrays of uint64 data type. - """ + ... + + class Date32Array(NumericArray[Date32Scalar]): - """ - Concrete class for Arrow arrays of date32 data type. - """ + ... + + class Date64Array(NumericArray[Date64Scalar]): - """ - Concrete class for Arrow arrays of date64 data type. - """ + ... + + class TimestampArray(NumericArray[TimestampScalar[_Unit, _Tz]]): - """ - Concrete class for Arrow arrays of timestamp data type. - """ + ... + + class Time32Array(NumericArray[Time32Scalar[_Time32Unit]]): - """ - Concrete class for Arrow arrays of time32 data type. - """ + ... + + class Time64Array(NumericArray[Time64Scalar[_Time64Unit]]): - """ - Concrete class for Arrow arrays of time64 data type. - """ + ... + + class DurationArray(NumericArray[DurationScalar[_Unit]]): - """ - Concrete class for Arrow arrays of duration data type. - """ + ... + + class MonthDayNanoIntervalArray(Array[MonthDayNanoIntervalScalar]): - """ - Concrete class for Arrow arrays of interval[MonthDayNano] type. - """ + ... + + class HalfFloatArray(FloatingPointArray[HalfFloatScalar]): - """ - Concrete class for Arrow arrays of float16 data type. - """ + ... 
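The concrete array classes stubbed above are the types ``pa.array`` returns at runtime for the corresponding Arrow data types. A short sketch (standard pyarrow calls, not introduced by this patch) showing how the stubbed classes line up with inferred and explicit types:

>>> import datetime
>>> import pyarrow as pa
>>> isinstance(pa.array([1, 2], type=pa.int8()), pa.Int8Array)          # True
>>> isinstance(pa.array([1.5], type=pa.float32()), pa.FloatArray)       # True
>>> isinstance(pa.array([True, None]), pa.BooleanArray)                 # True
>>> isinstance(pa.array([datetime.date(2020, 1, 1)]), pa.Date32Array)   # True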
+ + class FloatArray(FloatingPointArray[FloatScalar]): - """ - Concrete class for Arrow arrays of float32 data type. - """ + ... + + class DoubleArray(FloatingPointArray[DoubleScalar]): - """ - Concrete class for Arrow arrays of float64 data type. - """ + ... + + class FixedSizeBinaryArray(Array[FixedSizeBinaryScalar]): - """ - Concrete class for Arrow arrays of a fixed-size binary data type. - """ + ... + + class Decimal32Array(FixedSizeBinaryArray): - """ - """ + ... + + class Decimal64Array(FixedSizeBinaryArray): - """ - Concrete class for Arrow arrays of decimal64 data type. - """ + ... + + class Decimal128Array(FixedSizeBinaryArray): - """ - Concrete class for Arrow arrays of decimal128 data type. - """ + ... + + class Decimal256Array(FixedSizeBinaryArray): - """ - Concrete class for Arrow arrays of decimal256 data type. - """ + ... + class BaseListArray(Array[_ScalarT]): - def flatten(self, recursive: bool = False) -> Array: - """ - Unnest this [Large]ListArray/[Large]ListViewArray/FixedSizeListArray - according to 'recursive'. - - Note that this method is different from ``self.values`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. - - Parameters - ---------- - recursive : bool, default False, optional - When True, flatten this logical list-array recursively until an - array of non-list values is formed. - - When False, flatten only the top level. - - Returns - ------- - result : Array - - Examples - -------- - - Basic logical list-array's flatten - >>> import pyarrow as pa - >>> values = [1, 2, 3, 4] - >>> offsets = [2, 1, 0] - >>> sizes = [2, 2, 2] - >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 3, - 4 - ], - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - >>> array.flatten() - - [ - 3, - 4, - 2, - 3, - 1, - 2 - ] - - When recursive=True, nested list arrays are flattened recursively - until an array of non-list values is formed. - - >>> array = pa.array([ - ... None, - ... [ - ... [1, None, 2], - ... None, - ... [3, 4] - ... ], - ... [], - ... [ - ... [], - ... [5, 6], - ... None - ... ], - ... [ - ... [7, 8] - ... ] - ... ], type=pa.list_(pa.list_(pa.int64()))) - >>> array.flatten(True) - - [ - 1, - null, - 2, - 3, - 4, - 5, - 6, - 7, - 8 - ] - """ - def value_parent_indices(self) -> Int64Array: - """ - Return array of same length as list child values array where each - output value is the index of the parent list array slot containing each - child value. - - Examples - -------- - >>> import pyarrow as pa - >>> arr = pa.array([[1, 2, 3], [], None, [4]], - ... type=pa.list_(pa.int32())) - >>> arr.value_parent_indices() - - [ - 0, - 0, - 0, - 3 - ] - """ - def value_lengths(self) -> Int32Array: - """ - Return integers array with values equal to the respective length of - each list element. Null list values are null in the output. - - Examples - -------- - >>> import pyarrow as pa - >>> arr = pa.array([[1, 2, 3], [], None, [4]], - ... type=pa.list_(pa.int32())) - >>> arr.value_lengths() - - [ - 3, - 0, - null, - 1 - ] - """ + def flatten(self, recursive: bool = False) -> Array: ... + + def value_parent_indices(self) -> Int64Array: ... + + def value_lengths(self) -> Int32Array: ... + class ListArray(BaseListArray[_ScalarT]): - """ - Concrete class for Arrow arrays of a list data type. 
- """ + @classmethod def from_arrays( cls, @@ -1545,158 +465,17 @@ class ListArray(BaseListArray[_ScalarT]): type: _DataTypeT | None = None, pool: MemoryPool | None = None, mask: Mask | None = None, - ) -> ListArray[ListScalar[_DataTypeT | Int64Type | Float64Type | StringType | BinaryType]] | ListArray: - """ - Construct ListArray from arrays of int32 offsets and values. - - Parameters - ---------- - offsets : Array (int32 type) - values : Array (any type) - type : DataType, optional - If not specified, a default ListType with the values' type is - used. - pool : MemoryPool, optional - mask : Array (boolean type), optional - Indicate which values are null (True) or not null (False). - - Returns - ------- - list_array : ListArray - - Examples - -------- - >>> import pyarrow as pa - >>> values = pa.array([1, 2, 3, 4]) - >>> offsets = pa.array([0, 2, 4]) - >>> pa.ListArray.from_arrays(offsets, values) - - [ - [ - 1, - 2 - ], - [ - 3, - 4 - ] - ] - >>> # nulls in the offsets array become null lists - >>> offsets = pa.array([0, None, 2, 4]) - >>> pa.ListArray.from_arrays(offsets, values) - - [ - [ - 1, - 2 - ], - null, - [ - 3, - 4 - ] - ] - """ + ) -> ListArray[ListScalar[_DataTypeT | Int64Type | Float64Type | StringType | BinaryType]] | ListArray: ... + @property - def values(self) -> Array: - """ - Return the underlying array of values which backs the ListArray - ignoring the array's offset. - - If any of the list elements are null, but are backed by a - non-empty sub-list, those elements will be included in the - output. - - Compare with :meth:`flatten`, which returns only the non-null - values taking into consideration the array's offset. - - Returns - ------- - values : Array - - See Also - -------- - ListArray.flatten : ... - - Examples - -------- - - The values include null elements from sub-lists: - - >>> import pyarrow as pa - >>> array = pa.array([[1, 2], None, [3, 4, None, 6]]) - >>> array.values - - [ - 1, - 2, - 3, - 4, - null, - 6 - ] - - If an array is sliced, the slice still uses the same - underlying data as the original array, just with an - offset. Since values ignores the offset, the values are the - same: - - >>> sliced = array.slice(1, 2) - >>> sliced - - [ - null, - [ - 3, - 4, - null, - 6 - ] - ] - >>> sliced.values - - [ - 1, - 2, - 3, - 4, - null, - 6 - ] - """ + def values(self) -> Array: ... + @property - def offsets(self) -> Int32Array: - """ - Return the list offsets as an int32 array. - - The returned array will not have a validity bitmap, so you cannot - expect to pass it to `ListArray.from_arrays` and get back the same - list array if the original one has nulls. - - Returns - ------- - offsets : Int32Array - - Examples - -------- - >>> import pyarrow as pa - >>> array = pa.array([[1, 2], None, [3, 4, 5]]) - >>> array.offsets - - [ - 0, - 2, - 2, - 5 - ] - """ + def offsets(self) -> Int32Array: ... + class LargeListArray(BaseListArray[LargeListScalar[_DataTypeT]]): - """ - Concrete class for Arrow arrays of a large list data type. - Identical to ListArray, but 64-bit offsets. - """ @classmethod def from_arrays( cls, @@ -1706,113 +485,17 @@ class LargeListArray(BaseListArray[LargeListScalar[_DataTypeT]]): type: _DataTypeT | None = None, pool: MemoryPool | None = None, mask: Mask | None = None, - ) -> LargeListArray[_DataTypeT] | LargeListArray[_DataTypeT]: - """ - Construct LargeListArray from arrays of int64 offsets and values. 
- - Parameters - ---------- - offsets : Array (int64 type) - values : Array (any type) - type : DataType, optional - If not specified, a default ListType with the values' type is - used. - pool : MemoryPool, optional - mask : Array (boolean type), optional - Indicate which values are null (True) or not null (False). - - Returns - ------- - list_array : LargeListArray - """ - @property - def values(self) -> Array: - """ - Return the underlying array of values which backs the LargeListArray - ignoring the array's offset. - - If any of the list elements are null, but are backed by a - non-empty sub-list, those elements will be included in the - output. - - Compare with :meth:`flatten`, which returns only the non-null - values taking into consideration the array's offset. - - Returns - ------- - values : Array - - See Also - -------- - LargeListArray.flatten : ... - - Examples - -------- - - The values include null elements from the sub-lists: - - >>> import pyarrow as pa - >>> array = pa.array( - ... [[1, 2], None, [3, 4, None, 6]], - ... type=pa.large_list(pa.int32()), - ... ) - >>> array.values - - [ - 1, - 2, - 3, - 4, - null, - 6 - ] - - If an array is sliced, the slice still uses the same - underlying data as the original array, just with an - offset. Since values ignores the offset, the values are the - same: - - >>> sliced = array.slice(1, 2) - >>> sliced - - [ - null, - [ - 3, - 4, - null, - 6 - ] - ] - >>> sliced.values - - [ - 1, - 2, - 3, - 4, - null, - 6 - ] - """ + ) -> LargeListArray[_DataTypeT] | LargeListArray[_DataTypeT]: ... + @property - def offsets(self) -> Int64Array: - """ - Return the list offsets as an int64 array. + def values(self) -> Array: ... - The returned array will not have a validity bitmap, so you cannot - expect to pass it to `LargeListArray.from_arrays` and get back the - same list array if the original one has nulls. + @property + def offsets(self) -> Int64Array: ... - Returns - ------- - offsets : Int64Array - """ class ListViewArray(BaseListArray[ListViewScalar[_DataTypeT]]): - """ - Concrete class for Arrow arrays of a list view data type. - """ + @classmethod def from_arrays( cls, @@ -1822,194 +505,20 @@ class ListViewArray(BaseListArray[ListViewScalar[_DataTypeT]]): type: _DataTypeT | None = None, pool: MemoryPool | None = None, mask: Mask | None = None, - ) -> ListViewArray[_DataTypeT] | ListViewArray[_DataTypeT]: - """ - Construct ListViewArray from arrays of int32 offsets, sizes, and values. - - Parameters - ---------- - offsets : Array (int32 type) - sizes : Array (int32 type) - values : Array (any type) - type : DataType, optional - If not specified, a default ListType with the values' type is - used. - pool : MemoryPool, optional - mask : Array (boolean type), optional - Indicate which values are null (True) or not null (False). 
- - Returns - ------- - list_view_array : ListViewArray - - Examples - -------- - >>> import pyarrow as pa - >>> values = pa.array([1, 2, 3, 4]) - >>> offsets = pa.array([0, 1, 2]) - >>> sizes = pa.array([2, 2, 2]) - >>> pa.ListViewArray.from_arrays(offsets, sizes, values) - - [ - [ - 1, - 2 - ], - [ - 2, - 3 - ], - [ - 3, - 4 - ] - ] - >>> # use a null mask to represent null values - >>> mask = pa.array([False, True, False]) - >>> pa.ListViewArray.from_arrays(offsets, sizes, values, mask=mask) - - [ - [ - 1, - 2 - ], - null, - [ - 3, - 4 - ] - ] - >>> # null values can be defined in either offsets or sizes arrays - >>> # WARNING: this will result in a copy of the offsets or sizes arrays - >>> offsets = pa.array([0, None, 2]) - >>> pa.ListViewArray.from_arrays(offsets, sizes, values) - - [ - [ - 1, - 2 - ], - null, - [ - 3, - 4 - ] - ] - """ + ) -> ListViewArray[_DataTypeT] | ListViewArray[_DataTypeT]: ... + @property - def values(self) -> Array: - """ - Return the underlying array of values which backs the ListViewArray - ignoring the array's offset and sizes. - - The values array may be out of order and/or contain additional values - that are not found in the logical representation of the array. The only - guarantee is that each non-null value in the ListView Array is contiguous. - - Compare with :meth:`flatten`, which returns only the non-null - values taking into consideration the array's order and offset. - - Returns - ------- - values : Array - - Examples - -------- - The values include null elements from sub-lists: - - >>> import pyarrow as pa - >>> values = [1, 2, None, 3, 4] - >>> offsets = [0, 0, 1] - >>> sizes = [2, 0, 4] - >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 1, - 2 - ], - [], - [ - 2, - null, - 3, - 4 - ] - ] - >>> array.values - - [ - 1, - 2, - null, - 3, - 4 - ] - """ + def values(self) -> Array: ... + @property - def offsets(self) -> Int32Array: - """ - Return the list offsets as an int32 array. - - The returned array will not have a validity bitmap, so you cannot - expect to pass it to `ListViewArray.from_arrays` and get back the same - list array if the original one has nulls. - - Returns - ------- - offsets : Int32Array - - Examples - -------- - >>> import pyarrow as pa - >>> values = [1, 2, None, 3, 4] - >>> offsets = [0, 0, 1] - >>> sizes = [2, 0, 4] - >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array.offsets - - [ - 0, - 0, - 1 - ] - """ + def offsets(self) -> Int32Array: ... + @property - def sizes(self) -> Int32Array: - """ - Return the list sizes as an int32 array. - - The returned array will not have a validity bitmap, so you cannot - expect to pass it to `ListViewArray.from_arrays` and get back the same - list array if the original one has nulls. - - Returns - ------- - sizes : Int32Array - - Examples - -------- - >>> import pyarrow as pa - >>> values = [1, 2, None, 3, 4] - >>> offsets = [0, 0, 1] - >>> sizes = [2, 0, 4] - >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array.sizes - - [ - 2, - 0, - 4 - ] - """ + def sizes(self) -> Int32Array: ... + class LargeListViewArray(BaseListArray[LargeListScalar[_DataTypeT]]): - """ - Concrete class for Arrow arrays of a large list view data type. - Identical to ListViewArray, but with 64-bit offsets. 
- """ @classmethod def from_arrays( cls, @@ -2019,199 +528,20 @@ class LargeListViewArray(BaseListArray[LargeListScalar[_DataTypeT]]): type: _DataTypeT | None = None, pool: MemoryPool | None = None, mask: Mask | None = None, - ) -> LargeListViewArray[_DataTypeT]: - """ - Construct LargeListViewArray from arrays of int64 offsets and values. - - Parameters - ---------- - offsets : Array (int64 type) - sizes : Array (int64 type) - values : Array (any type) - type : DataType, optional - If not specified, a default ListType with the values' type is - used. - pool : MemoryPool, optional - mask : Array (boolean type), optional - Indicate which values are null (True) or not null (False). - - Returns - ------- - list_view_array : LargeListViewArray - - Examples - -------- - >>> import pyarrow as pa - >>> values = pa.array([1, 2, 3, 4]) - >>> offsets = pa.array([0, 1, 2]) - >>> sizes = pa.array([2, 2, 2]) - >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) - - [ - [ - 1, - 2 - ], - [ - 2, - 3 - ], - [ - 3, - 4 - ] - ] - >>> # use a null mask to represent null values - >>> mask = pa.array([False, True, False]) - >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values, mask=mask) - - [ - [ - 1, - 2 - ], - null, - [ - 3, - 4 - ] - ] - >>> # null values can be defined in either offsets or sizes arrays - >>> # WARNING: this will result in a copy of the offsets or sizes arrays - >>> offsets = pa.array([0, None, 2]) - >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) - - [ - [ - 1, - 2 - ], - null, - [ - 3, - 4 - ] - ] - """ + ) -> LargeListViewArray[_DataTypeT]: ... + @property - def values(self) -> Array: - """ - Return the underlying array of values which backs the LargeListArray - ignoring the array's offset. - - The values array may be out of order and/or contain additional values - that are not found in the logical representation of the array. The only - guarantee is that each non-null value in the ListView Array is contiguous. - - Compare with :meth:`flatten`, which returns only the non-null - values taking into consideration the array's order and offset. - - Returns - ------- - values : Array - - See Also - -------- - LargeListArray.flatten : ... - - Examples - -------- - - The values include null elements from sub-lists: - - >>> import pyarrow as pa - >>> values = [1, 2, None, 3, 4] - >>> offsets = [0, 0, 1] - >>> sizes = [2, 0, 4] - >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 1, - 2 - ], - [], - [ - 2, - null, - 3, - 4 - ] - ] - >>> array.values - - [ - 1, - 2, - null, - 3, - 4 - ] - """ + def values(self) -> Array: ... + @property - def offsets(self) -> Int64Array: - """ - Return the list view offsets as an int64 array. - - The returned array will not have a validity bitmap, so you cannot - expect to pass it to `LargeListViewArray.from_arrays` and get back the - same list array if the original one has nulls. - - Returns - ------- - offsets : Int64Array - - Examples - -------- - - >>> import pyarrow as pa - >>> values = [1, 2, None, 3, 4] - >>> offsets = [0, 0, 1] - >>> sizes = [2, 0, 4] - >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) - >>> array.offsets - - [ - 0, - 0, - 1 - ] - """ + def offsets(self) -> Int64Array: ... + @property - def sizes(self) -> Int64Array: - """ - Return the list view sizes as an int64 array. 
- - The returned array will not have a validity bitmap, so you cannot - expect to pass it to `LargeListViewArray.from_arrays` and get back the - same list array if the original one has nulls. - - Returns - ------- - sizes : Int64Array - - Examples - -------- - - >>> import pyarrow as pa - >>> values = [1, 2, None, 3, 4] - >>> offsets = [0, 0, 1] - >>> sizes = [2, 0, 4] - >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) - >>> array.sizes - - [ - 2, - 0, - 4 - ] - """ + def sizes(self) -> Int64Array: ... + class FixedSizeListArray(BaseListArray[FixedSizeListScalar[_DataTypeT, _Size]]): - """ - Concrete class for Arrow arrays of a fixed size list data type. - """ + @classmethod def from_arrays( cls, @@ -2220,109 +550,18 @@ class FixedSizeListArray(BaseListArray[FixedSizeListScalar[_DataTypeT, _Size]]): *, type: None = None, mask: Mask | None = None, - ) -> FixedSizeListArray[_DataTypeT, _Size | None]: - """ - Construct FixedSizeListArray from array of values and a list length. - - Parameters - ---------- - values : Array (any type) - list_size : int - The fixed length of the lists. - type : DataType, optional - If not specified, a default ListType with the values' type and - `list_size` length is used. - mask : Array (boolean type), optional - Indicate which values are null (True) or not null (False). - - - Returns - ------- - FixedSizeListArray - - Examples - -------- - - Create from a values array and a list size: - - >>> import pyarrow as pa - >>> values = pa.array([1, 2, 3, 4]) - >>> arr = pa.FixedSizeListArray.from_arrays(values, 2) - >>> arr - - [ - [ - 1, - 2 - ], - [ - 3, - 4 - ] - ] - - Or create from a values array, list size and matching type: - - >>> typ = pa.list_(pa.field("values", pa.int64()), 2) - >>> arr = pa.FixedSizeListArray.from_arrays(values, type=typ) - >>> arr - - [ - [ - 1, - 2 - ], - [ - 3, - 4 - ] - ] - """ + ) -> FixedSizeListArray[_DataTypeT, _Size | None]: ... + @property - def values(self) -> BaseListArray[ListScalar[_DataTypeT]]: - """ - Return the underlying array of values which backs the - FixedSizeListArray ignoring the array's offset. - - Note even null elements are included. - - Compare with :meth:`flatten`, which returns only the non-null - sub-list values. - - Returns - ------- - values : Array - - See Also - -------- - FixedSizeListArray.flatten : ... - - Examples - -------- - >>> import pyarrow as pa - >>> array = pa.array( - ... [[1, 2], None, [3, None]], - ... type=pa.list_(pa.int32(), 2) - ... ) - >>> array.values - - [ - 1, - 2, - null, - null, - 3, - null - ] - """ + def values(self) -> BaseListArray[ListScalar[_DataTypeT]]: ... + _MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) _MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) + class MapArray(BaseListArray[MapScalar[_MapKeyT, _MapItemT]]): - """ - Concrete class for Arrow arrays of a map data type. - """ + @classmethod def from_arrays( cls, @@ -2334,159 +573,28 @@ class MapArray(BaseListArray[MapScalar[_MapKeyT, _MapItemT]]): type: MapType[_MapKeyT, _MapItemT] | None = None, pool: MemoryPool | None = None, mask: Mask | None = None, - ) -> MapArray[_MapKeyT, _MapItemT]: - """ - Construct MapArray from arrays of int32 offsets and key, item arrays. - - Parameters - ---------- - offsets : array-like or sequence (int32 type) - keys : array-like or sequence (any type) - items : array-like or sequence (any type) - type : DataType, optional - If not specified, a default MapArray with the keys' and items' type is used. 
- pool : MemoryPool - mask : Array (boolean type), optional - Indicate which values are null (True) or not null (False). - - Returns - ------- - map_array : MapArray - - Examples - -------- - First, let's understand the structure of our dataset when viewed in a rectangular data model. - The total of 5 respondents answered the question "How much did you like the movie x?". - The value -1 in the integer array means that the value is missing. The boolean array - represents the null bitmask corresponding to the missing values in the integer array. - - >>> import pyarrow as pa - >>> movies_rectangular = np.ma.masked_array([ - ... [10, -1, -1], - ... [8, 4, 5], - ... [-1, 10, 3], - ... [-1, -1, -1], - ... [-1, -1, -1] - ... ], - ... [ - ... [False, True, True], - ... [False, False, False], - ... [True, False, False], - ... [True, True, True], - ... [True, True, True], - ... ]) - - To represent the same data with the MapArray and from_arrays, the data is - formed like this: - - >>> offsets = [ - ... 0, # -- row 1 start - ... 1, # -- row 2 start - ... 4, # -- row 3 start - ... 6, # -- row 4 start - ... 6, # -- row 5 start - ... 6, # -- row 5 end - ... ] - >>> movies = [ - ... "Dark Knight", # ---------------------------------- row 1 - ... "Dark Knight", "Meet the Parents", "Superman", # -- row 2 - ... "Meet the Parents", "Superman", # ----------------- row 3 - ... ] - >>> likings = [ - ... 10, # -------- row 1 - ... 8, 4, 5, # --- row 2 - ... 10, 3 # ------ row 3 - ... ] - >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() - 0 [(Dark Knight, 10)] - 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... - 2 [(Meet the Parents, 10), (Superman, 3)] - 3 [] - 4 [] - dtype: object - - If the data in the empty rows needs to be marked as missing, it's possible - to do so by modifying the offsets argument, so that we specify `None` as - the starting positions of the rows we want marked as missing. The end row - offset still has to refer to the existing value from keys (and values): - - >>> offsets = [ - ... 0, # ----- row 1 start - ... 1, # ----- row 2 start - ... 4, # ----- row 3 start - ... None, # -- row 4 start - ... None, # -- row 5 start - ... 6, # ----- row 5 end - ... ] - >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() - 0 [(Dark Knight, 10)] - 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... - 2 [(Meet the Parents, 10), (Superman, 3)] - 3 None - 4 None - dtype: object - """ + ) -> MapArray[_MapKeyT, _MapItemT]: ... + @property - def keys(self) -> Array: - """ - Flattened array of keys across all maps in array - """ + def keys(self) -> Array: ... + @property - def items(self) -> Array: - """ - Flattened array of items across all maps in array - """ + def items(self) -> Array: ... + class UnionArray(Array[UnionScalar]): - """ - Concrete class for Arrow arrays of a Union data type. - """ + @deprecated("Use fields() instead") - def child(self, pos: int) -> Field: - """ - DEPRECATED, use field() instead. - - Parameters - ---------- - pos : int - The physical index of the union child field (not its type code). - - Returns - ------- - field : pyarrow.Field - The given child field. - """ - def field(self, pos: int) -> Array: - """ - Return the given child field as an individual array. - - For sparse unions, the returned array has its offset, length, - and null count adjusted. - - For dense unions, the returned array is unchanged. - - Parameters - ---------- - pos : int - The physical index of the union child field (not its type code). 
- - Returns - ------- - field : Array - The given child field. - """ + def child(self, pos: int) -> Field: ... + + def field(self, pos: int) -> Array: ... + @property - def type_codes(self) -> Int8Array: - """ - Get the type codes array. - """ + def type_codes(self) -> Int8Array: ... + @property - def offsets(self) -> Int32Array: - """ - Get the value offsets array (dense arrays only). + def offsets(self) -> Int32Array: ... - Does not account for any slice offset. - """ @staticmethod def from_dense( types: Int8Array, @@ -2494,50 +602,19 @@ class UnionArray(Array[UnionScalar]): children: NullableCollection[Array], field_names: list[str] | None = None, type_codes: Int8Array | None = None, - ) -> UnionArray: - """ - Construct dense UnionArray from arrays of int8 types, int32 offsets and - children arrays - - Parameters - ---------- - types : Array (int8 type) - value_offsets : Array (int32 type) - children : list - field_names : list - type_codes : list - - Returns - ------- - union_array : UnionArray - """ + ) -> UnionArray: ... + @staticmethod def from_sparse( types: Int8Array, children: NullableCollection[Array], field_names: list[str] | None = None, type_codes: Int8Array | None = None, - ) -> UnionArray: - """ - Construct sparse UnionArray from arrays of int8 types and children - arrays - - Parameters - ---------- - types : Array (int8 type) - children : list - field_names : list - type_codes : list - - Returns - ------- - union_array : UnionArray - """ + ) -> UnionArray: ... + class StringArray(Array[StringScalar]): - """ - Concrete class for Arrow arrays of string (or utf8) data type. - """ + @staticmethod def from_buffers( # type: ignore[override] length: int, @@ -2546,30 +623,11 @@ class StringArray(Array[StringScalar]): null_bitmap: Buffer | None = None, null_count: int | None = -1, offset: int | None = 0, - ) -> StringArray: - """ - Construct a StringArray from value_offsets and data buffers. - If there are nulls in the data, also a null_bitmap and the matching - null_count must be passed. - - Parameters - ---------- - length : int - value_offsets : Buffer - data : Buffer - null_bitmap : Buffer, optional - null_count : int, default 0 - offset : int, default 0 - - Returns - ------- - string_array : StringArray - """ + ) -> StringArray: ... + class LargeStringArray(Array[LargeStringScalar]): - """ - Concrete class for Arrow arrays of large string (or utf8) data type. - """ + @staticmethod def from_buffers( # type: ignore[override] length: int, @@ -2578,71 +636,39 @@ class LargeStringArray(Array[LargeStringScalar]): null_bitmap: Buffer | None = None, null_count: int | None = -1, offset: int | None = 0, - ) -> StringArray: - """ - Construct a LargeStringArray from value_offsets and data buffers. - If there are nulls in the data, also a null_bitmap and the matching - null_count must be passed. - - Parameters - ---------- - length : int - value_offsets : Buffer - data : Buffer - null_bitmap : Buffer, optional - null_count : int, default 0 - offset : int, default 0 - - Returns - ------- - string_array : StringArray - """ + ) -> StringArray: ... + class StringViewArray(Array[StringViewScalar]): - """ - Concrete class for Arrow arrays of string (or utf8) view data type. - """ + ... + class BinaryArray(Array[BinaryScalar]): - """ - Concrete class for Arrow arrays of variable-sized binary data type. - """ + @property - def total_values_length(self) -> int: - """ - The number of bytes from beginning to end of the data buffer addressed - by the offsets of this BinaryArray. 
- """ + def total_values_length(self) -> int: ... + class LargeBinaryArray(Array[LargeBinaryScalar]): - """ - Concrete class for Arrow arrays of large variable-sized binary data type. - """ + @property - def total_values_length(self) -> int: - """ - The number of bytes from beginning to end of the data buffer addressed - by the offsets of this LargeBinaryArray. - """ + def total_values_length(self) -> int: ... + class BinaryViewArray(Array[BinaryViewScalar]): - """ - Concrete class for Arrow arrays of variable-sized binary view data type. - """ + ... + class DictionaryArray(Array[DictionaryScalar[_IndexT, _BasicValueT]]): - """ - Concrete class for dictionary-encoded Arrow arrays. - """ + def dictionary_encode(self) -> Self: ... # type: ignore[override] - def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: - """ - Decodes the DictionaryArray to an Array. - """ + def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: ... + @property def indices(self) -> Array[Scalar[_IndexT]]: ... @property def dictionary(self) -> Array[Scalar[_BasicValueT]]: ... + @staticmethod def from_buffers( # type: ignore[override] type: _BasicValueT, @@ -2651,30 +677,8 @@ class DictionaryArray(Array[DictionaryScalar[_IndexT, _BasicValueT]]): dictionary: Array | np.ndarray | pd.Series, null_count: int = -1, offset: int = 0, - ) -> DictionaryArray[Any, _BasicValueT]: - """ - Construct a DictionaryArray from buffers. - - Parameters - ---------- - type : pyarrow.DataType - length : int - The number of values in the array. - buffers : List[Buffer] - The buffers backing the indices array. - dictionary : pyarrow.Array, ndarray or pandas.Series - The array of values referenced by the indices. - null_count : int, default -1 - The number of null entries in the indices array. Negative value means that - the null count is not known. - offset : int, default 0 - The array's logical offset (in values, not in bytes) from the - start of each buffer. - - Returns - ------- - dict_array : DictionaryArray - """ + ) -> DictionaryArray[Any, _BasicValueT]: ... + @staticmethod def from_arrays( indices: Indices, @@ -2684,64 +688,15 @@ class DictionaryArray(Array[DictionaryScalar[_IndexT, _BasicValueT]]): from_pandas: bool = False, safe: bool = True, memory_pool: MemoryPool | None = None, - ) -> DictionaryArray: - """ - Construct a DictionaryArray from indices and values. - - Parameters - ---------- - indices : pyarrow.Array, numpy.ndarray or pandas.Series, int type - Non-negative integers referencing the dictionary values by zero - based index. - dictionary : pyarrow.Array, ndarray or pandas.Series - The array of values referenced by the indices. - mask : ndarray or pandas.Series, bool type - True values indicate that indices are actually null. - ordered : bool, default False - Set to True if the category values are ordered. - from_pandas : bool, default False - If True, the indices should be treated as though they originated in - a pandas.Categorical (null encoded as -1). - safe : bool, default True - If True, check that the dictionary indices are in range. - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise uses default pool. - - Returns - ------- - dict_array : DictionaryArray - """ + ) -> DictionaryArray: ... + class StructArray(Array[StructScalar]): - """ - Concrete class for Arrow arrays of a struct data type. - """ - def field(self, index: int | str) -> Array: - """ - Retrieves the child array belonging to field. 
- - Parameters - ---------- - index : Union[int, str] - Index / position or name of the field. - - Returns - ------- - result : Array - """ - def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: - """ - Return one individual array for each field in the struct. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool. - - Returns - ------- - result : List[Array] - """ + + def field(self, index: int | str) -> Array: ... + + def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: ... + @staticmethod def from_arrays( arrays: Iterable[Array], @@ -2750,78 +705,21 @@ class StructArray(Array[StructScalar]): mask=None, memory_pool: MemoryPool | None = None, type: StructType | None = None, - ) -> StructArray: - """ - Construct StructArray from collection of arrays representing - each field in the struct. - - Either field names, field instances or a struct type must be passed. - - Parameters - ---------- - arrays : sequence of Array - names : List[str] (optional) - Field names for each struct child. - fields : List[Field] (optional) - Field instances for each struct child. - mask : pyarrow.Array[bool] (optional) - Indicate which values are null (True) or not null (False). - memory_pool : MemoryPool (optional) - For memory allocations, if required, otherwise uses default pool. - type : pyarrow.StructType (optional) - Struct type for name and type of each child. - - Returns - ------- - result : StructArray - """ - def sort(self, order: Order = "ascending", by: str | None = None, **kwargs) -> StructArray: - """ - Sort the StructArray - - Parameters - ---------- - order : str, default "ascending" - Which order to sort values in. - Accepted values are "ascending", "descending". - by : str or None, default None - If to sort the array by one of its fields - or by the whole array. - **kwargs : dict, optional - Additional sorting options. - As allowed by :class:`SortOptions` - - Returns - ------- - result : StructArray - """ + ) -> StructArray: ... + + def sort(self, order: Order = "ascending", by: str | + None = None, **kwargs) -> StructArray: ... + class RunEndEncodedArray(Array[RunEndEncodedScalar[_RunEndType, _BasicValueT]]): - """ - Concrete class for Arrow run-end encoded arrays. - """ + @staticmethod def from_arrays( run_ends: Int16Array | Int32Array | Int64Array, values: Array, type: DataType | None = None, - ) -> RunEndEncodedArray[Int16Type | Int32Type | Int64Type, _BasicValueT]: # type: ignore[type-var] - """ - Construct RunEndEncodedArray from run_ends and values arrays. - - Parameters - ---------- - run_ends : Array (int16, int32, or int64 type) - The run_ends array. - values : Array (any type) - The values array. - type : pyarrow.DataType, optional - The run_end_encoded(run_end_type, value_type) array type. - - Returns - ------- - RunEndEncodedArray - """ + ) -> RunEndEncodedArray[Int16Type | Int32Type | Int64Type, _BasicValueT]: ... # type: ignore[type-var] + @staticmethod def from_buffers( # type: ignore[override] type: DataType, @@ -2830,424 +728,72 @@ class RunEndEncodedArray(Array[RunEndEncodedScalar[_RunEndType, _BasicValueT]]): null_count: int = -1, offset=0, children: tuple[Array, Array] | None = None, - ) -> RunEndEncodedArray[Any, _BasicValueT]: - """ - Construct a RunEndEncodedArray from all the parameters that make up an - Array. - - RunEndEncodedArrays do not have buffers, only children arrays, but this - implementation is needed to satisfy the Array interface. 
- - Parameters - ---------- - type : DataType - The run_end_encoded(run_end_type, value_type) type. - length : int - The logical length of the run-end encoded array. Expected to match - the last value of the run_ends array (children[0]) minus the offset. - buffers : List[Buffer] - Empty List or [None]. - null_count : int, default -1 - The number of null entries in the array. Run-end encoded arrays - are specified to not have valid bits and null_count always equals 0. - offset : int, default 0 - The array's logical offset (in values, not in bytes) from the - start of each buffer. - children : List[Array] - Nested type children containing the run_ends and values arrays. - - Returns - ------- - RunEndEncodedArray - """ - @property - def run_ends(self) -> Array[Scalar[_RunEndType]]: - """ - An array holding the logical indexes of each run-end. + ) -> RunEndEncodedArray[Any, _BasicValueT]: ... - The physical offset to the array is applied. - """ @property - def values(self) -> Array[Scalar[_BasicValueT]]: - """ - An array holding the values of each run. - - The physical offset to the array is applied. - """ - def find_physical_offset(self) -> int: - """ - Find the physical offset of this REE array. + def run_ends(self) -> Array[Scalar[_RunEndType]]: ... - This is the offset of the run that contains the value of the first - logical element of this array considering its offset. + @property + def values(self) -> Array[Scalar[_BasicValueT]]: ... - This function uses binary-search, so it has a O(log N) cost. - """ - def find_physical_length(self) -> int: - """ - Find the physical length of this REE array. + def find_physical_offset(self) -> int: ... - The physical length of an REE is the number of physical values (and - run-ends) necessary to represent the logical range of values from offset - to length. + def find_physical_length(self) -> int: ... - This function uses binary-search, so it has a O(log N) cost. - """ _ArrayT = TypeVar("_ArrayT", bound=Array) + class ExtensionArray(Array[ExtensionScalar], Generic[_ArrayT]): - """ - Concrete class for Arrow extension arrays. - """ + @property def storage(self) -> Any: ... + @staticmethod - def from_storage(typ: BaseExtensionType, storage: _ArrayT) -> ExtensionArray[_ArrayT]: - """ - Construct ExtensionArray from type and storage array. - - Parameters - ---------- - typ : DataType - The extension type for the result array. - storage : Array - The underlying storage for the result array. - - Returns - ------- - ext_array : ExtensionArray - """ + def from_storage(typ: BaseExtensionType, + storage: _ArrayT) -> ExtensionArray[_ArrayT]: ... + class JsonArray(ExtensionArray[_ArrayT]): - """ - Concrete class for Arrow arrays of JSON data type. - - This does not guarantee that the JSON data actually - is valid JSON. - - Examples - -------- - Define the extension type for JSON array - - >>> import pyarrow as pa - >>> json_type = pa.json_(pa.large_utf8()) - - Create an extension array - - >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] - >>> storage = pa.array(arr, pa.large_utf8()) - >>> pa.ExtensionArray.from_storage(json_type, storage) - - [ - null, - "{ "id":30, "values":["a", "b"] }" - ] - """ - """ - Concrete class for Arrow arrays of JSON data type. - - This does not guarantee that the JSON data actually - is valid JSON. 
- - Examples - -------- - Define the extension type for JSON array - - >>> import pyarrow as pa - >>> json_type = pa.json_(pa.large_utf8()) - - Create an extension array - - >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] - >>> storage = pa.array(arr, pa.large_utf8()) - >>> pa.ExtensionArray.from_storage(json_type, storage) - - [ - null, - "{ "id":30, "values":["a", "b"] }" - ] - """ + ... + class UuidArray(ExtensionArray[_ArrayT]): - """ - Concrete class for Arrow arrays of UUID data type. - """ + ... + class FixedShapeTensorArray(ExtensionArray[_ArrayT]): - """ - Concrete class for fixed shape tensor extension arrays. - - Examples - -------- - Define the extension type for tensor array - - >>> import pyarrow as pa - >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) - - Create an extension array - - >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] - >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) - >>> pa.ExtensionArray.from_storage(tensor_type, storage) - - [ - [ - 1, - 2, - 3, - 4 - ], - [ - 10, - 20, - 30, - 40 - ], - [ - 100, - 200, - 300, - 400 - ] - ] - """ - - def to_numpy_ndarray(self) -> np.ndarray: - """ - Convert fixed shape tensor extension array to a multi-dimensional numpy.ndarray. - - The resulting ndarray will have (ndim + 1) dimensions. - The size of the first dimension will be the length of the fixed shape tensor array - and the rest of the dimensions will match the permuted shape of the fixed - shape tensor. - - The conversion is zero-copy. - - Returns - ------- - numpy.ndarray - Ndarray representing tensors in the fixed shape tensor array concatenated - along the first dimension. - """ - def to_tensor(self) -> Tensor: - """ - Convert fixed shape tensor extension array to a pyarrow.Tensor. - - The resulting Tensor will have (ndim + 1) dimensions. - The size of the first dimension will be the length of the fixed shape tensor array - and the rest of the dimensions will match the permuted shape of the fixed - shape tensor. - - The conversion is zero-copy. - - Returns - ------- - pyarrow.Tensor - Tensor representing tensors in the fixed shape tensor array concatenated - along the first dimension. - """ + + def to_numpy_ndarray(self) -> np.ndarray: ... + + def to_tensor(self) -> Tensor: ... @classmethod - def from_numpy_ndarray(cls, obj: np.ndarray) -> Self: - """ - Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. - The first dimension of ndarray will become the length of the fixed - shape tensor array. - If input array data is not contiguous a copy will be made. - - Parameters - ---------- - obj : numpy.ndarray - dim_names : tuple or list of strings, default None - Explicit names to tensor dimensions. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> arr = np.array( - ... [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], - ... dtype=np.float32) - >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - - [ - [ - 1, - 2, - 3, - 4, - 5, - 6 - ], - [ - 1, - 2, - 3, - 4, - 5, - 6 - ] - ] - """ + def from_numpy_ndarray(cls, obj: np.ndarray) -> Self: ... + class OpaqueArray(ExtensionArray[_ArrayT]): - """ - Concrete class for opaque extension arrays. - - Examples - -------- - Define the extension type for an opaque array - - >>> import pyarrow as pa - >>> opaque_type = pa.opaque( - ... pa.binary(), - ... type_name="geometry", - ... vendor_name="postgis", - ... 
) - - Create an extension array - - >>> arr = [None, b"data"] - >>> storage = pa.array(arr, pa.binary()) - >>> pa.ExtensionArray.from_storage(opaque_type, storage) - - [ - null, - 64617461 - ] - """ + ... + class Bool8Array(ExtensionArray): - """ - Concrete class for bool8 extension arrays. - - Examples - -------- - Define the extension type for an bool8 array - - >>> import pyarrow as pa - >>> bool8_type = pa.bool8() - - Create an extension array - - >>> arr = [-1, 0, 1, 2, None] - >>> storage = pa.array(arr, pa.int8()) - >>> pa.ExtensionArray.from_storage(bool8_type, storage) - - [ - -1, - 0, - 1, - 2, - null - ] - """ - - def to_numpy(self, zero_copy_only: bool = ..., writable: bool = ...) -> np.ndarray: - """ - Return a NumPy bool view or copy of this array. - - By default, tries to return a view of this array. This is only - supported for arrays without any nulls. - - Parameters - ---------- - zero_copy_only : bool, default True - If True, an exception will be raised if the conversion to a numpy - array would require copying the underlying data (e.g. in presence - of nulls). - writable : bool, default False - For numpy arrays created with zero copy (view on the Arrow data), - the resulting array is not writable (Arrow data is immutable). - By setting this to True, a copy of the array is made to ensure - it is writable. - - Returns - ------- - array : numpy.ndarray - """ + + def to_numpy(self, zero_copy_only: bool = ..., + writable: bool = ...) -> np.ndarray: ... + @classmethod - def from_storage(cls, storage: Int8Array) -> Self: # type: ignore[override] - """ - Construct Bool8Array from Int8Array storage. - - Parameters - ---------- - storage : Int8Array - The underlying storage for the result array. - - Returns - ------- - bool8_array : Bool8Array - """ + def from_storage(cls, storage: Int8Array) -> Self: ... # type: ignore[override] + @classmethod - def from_numpy(cls, obj: np.ndarray) -> Self: - """ - Convert numpy array to a bool8 extension array without making a copy. - The input array must be 1-dimensional, with either bool_ or int8 dtype. - - Parameters - ---------- - obj : numpy.ndarray - - Returns - ------- - bool8_array : Bool8Array - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> arr = np.array([True, False, True], dtype=np.bool_) - >>> pa.Bool8Array.from_numpy(arr) - - [ - 1, - 0, - 1 - ] - """ - -def concat_arrays(arrays: Iterable[_ArrayT], memory_pool: MemoryPool | None = None) -> _ArrayT: - """ - Concatenate the given arrays. - - The contents of the input arrays are copied into the returned array. - - Raises - ------ - ArrowInvalid - If not all of the arrays have the same type. - - Parameters - ---------- - arrays : iterable of pyarrow.Array - Arrays to concatenate, must be identically typed. - memory_pool : MemoryPool, default None - For memory allocations. If None, the default pool is used. - - Examples - -------- - >>> import pyarrow as pa - >>> arr1 = pa.array([2, 4, 5, 100]) - >>> arr2 = pa.array([2, 4]) - >>> pa.concat_arrays([arr1, arr2]) - - [ - 2, - 4, - 5, - 100, - 2, - 4 - ] - """ - -def _empty_array(type: _DataTypeT) -> Array[Scalar[_DataTypeT]]: - """ - Create empty array of the given type. - """ + def from_numpy(cls, obj: np.ndarray) -> Self: ... + + +def concat_arrays(arrays: Iterable[_ArrayT], + memory_pool: MemoryPool | None = None) -> _ArrayT: ... + + +def _empty_array(type: _DataTypeT) -> Array[Scalar[_DataTypeT]]: ... 
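The module-level helpers and extension-array constructors stubbed above behave as documented in the docstrings this patch drops; a condensed sketch based on those removed examples (expected results noted in comments):

>>> import pyarrow as pa
>>> pa.concat_arrays([pa.array([2, 4, 5]), pa.array([2, 4])])    # Int64Array: [2, 4, 5, 2, 4]
>>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2])
>>> storage = pa.array([[1, 2, 3, 4]], pa.list_(pa.int32(), 4))
>>> pa.ExtensionArray.from_storage(tensor_type, storage)         # FixedShapeTensorArray with one 2x2 tensor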
+ __all__ = [ "array", diff --git a/python/pyarrow-stubs/builder.pyi b/python/pyarrow-stubs/builder.pyi index 39372f8e512..c379bd83afb 100644 --- a/python/pyarrow-stubs/builder.pyi +++ b/python/pyarrow-stubs/builder.pyi @@ -21,86 +21,33 @@ from pyarrow.lib import MemoryPool, _Weakrefable from .array import StringArray, StringViewArray + class StringBuilder(_Weakrefable): - """ - Builder class for UTF8 strings. - This class exposes facilities for incrementally adding string values and - building the null bitmap for a pyarrow.Array (type='string'). - """ def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def append(self, value: str | bytes | None): - """ - Append a single value to the builder. - - The value can either be a string/bytes object or a null value - (np.nan or None). - - Parameters - ---------- - value : string/bytes or np.nan/None - The value to append to the string array builder. - """ - def append_values(self, values: Iterable[str | bytes | None]): - """ - Append all the values from an iterable. - - Parameters - ---------- - values : iterable of string/bytes or np.nan/None values - The values to append to the string array builder. - """ - def finish(self) -> StringArray: - """ - Return result of builder as an Array object; also resets the builder. - - Returns - ------- - array : pyarrow.Array - """ + def append(self, value: str | bytes | None): ... + + def append_values(self, values: Iterable[str | bytes | None]): ... + + def finish(self) -> StringArray: ... + @property def null_count(self) -> int: ... def __len__(self) -> int: ... + class StringViewBuilder(_Weakrefable): - """ - Builder class for UTF8 string views. - This class exposes facilities for incrementally adding string values and - building the null bitmap for a pyarrow.Array (type='string_view'). - """ def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def append(self, value: str | bytes | None): - """ - Append a single value to the builder. - - The value can either be a string/bytes object or a null value - (np.nan or None). - - Parameters - ---------- - value : string/bytes or np.nan/None - The value to append to the string array builder. - """ - def append_values(self, values: Iterable[str | bytes | None]): - """ - Append all the values from an iterable. - - Parameters - ---------- - values : iterable of string/bytes or np.nan/None values - The values to append to the string array builder. - """ - def finish(self) -> StringViewArray: - """ - Return result of builder as an Array object; also resets the builder. - - Returns - ------- - array : pyarrow.Array - """ + def append(self, value: str | bytes | None): ... + + def append_values(self, values: Iterable[str | bytes | None]): ... + + def finish(self) -> StringViewArray: ... + @property def null_count(self) -> int: ... def __len__(self) -> int: ... + __all__ = ["StringBuilder", "StringViewBuilder"] diff --git a/python/pyarrow-stubs/compute.pyi b/python/pyarrow-stubs/compute.pyi index dcedb34b14a..235e8ffc34d 100644 --- a/python/pyarrow-stubs/compute.pyi +++ b/python/pyarrow-stubs/compute.pyi @@ -111,67 +111,11 @@ from . import lib _P = ParamSpec("_P") _R = TypeVar("_R") -def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: - """Reference a column of the dataset. - - Stores only the field's name. Type and other information is known only when - the expression is bound to a dataset having an explicit scheme. - - Nested references are allowed by passing multiple names or a tuple of - names. 
For example ``('foo', 'bar')`` references the field named "bar" - inside the field named "foo". - - Parameters - ---------- - *name_or_index : string, multiple strings, tuple or int - The name or index of the (possibly nested) field the expression - references to. - - Returns - ------- - field_expr : Expression - Reference to the given field - - Examples - -------- - >>> import pyarrow.compute as pc - >>> pc.field("a") - - >>> pc.field(1) - - >>> pc.field(("a", "b")) - >> pc.field("a", "b") - Expression: - """Expression representing a scalar value. - - Creates an Expression object representing a scalar value that can be used - in compute expressions and predicates. - - Parameters - ---------- - value : bool, int, float or string - Python value of the scalar. This function accepts any value that can be - converted to a ``pyarrow.Scalar`` using ``pa.scalar()``. - - Notes - ----- - This function differs from ``pyarrow.scalar()`` in the following way: - - * ``pyarrow.scalar()`` creates a ``pyarrow.Scalar`` object that represents - a single value in Arrow's memory model. - * ``pyarrow.compute.scalar()`` creates an ``Expression`` object representing - a scalar value that can be used in compute expressions, predicates, and - dataset filtering operations. - - Returns - ------- - scalar_expr : Expression - An Expression representing the scalar value - """ +def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: ... + + +def scalar(value: bool | float | str) -> Expression: ... + def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ... @@ -274,53 +218,10 @@ def all( min_count: int = 1, options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar: - """ - Test whether all elements in a boolean array evaluate to true. - - Null values are ignored by default. - If the `skip_nulls` option is set to false, then Kleene logic is used. - See "kleene_and" for more details on Kleene logic. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.BooleanScalar: ... + any = _clone_signature(all) -""" -Test whether any element in a boolean array evaluates to true. - -Null values are ignored by default. -If the `skip_nulls` option is set to false, then Kleene logic is used. -See "kleene_or" for more details on Kleene logic. - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" def approximate_median( array: NumericScalar | NumericArray, @@ -330,28 +231,8 @@ def approximate_median( min_count: int = 1, options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: - """ - Approximate median of a numeric array with T-Digest algorithm. - - Nulls and NaNs are ignored. - A null scalar is returned if there is no valid data point. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.DoubleScalar: ... + def count( array: lib.Array | lib.ChunkedArray, @@ -360,25 +241,8 @@ def count( *, options: CountOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: - """ - Count the number of null / non-null values. - - By default, only non-null values are counted. - This can be changed through CountOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - mode : str, default "only_valid" - Which values to count in the input. - Accepted values are "only_valid", "only_null", "all". - options : pyarrow.compute.CountOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.Int64Scalar: ... + def count_distinct( array: lib.Array | lib.ChunkedArray, @@ -387,25 +251,8 @@ def count_distinct( *, options: CountOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: - """ - Count the number of unique values. - - By default, only non-null values are counted. - This can be changed through CountOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - mode : str, default "only_valid" - Which values to count in the input. - Accepted values are "only_valid", "only_null", "all". - options : pyarrow.compute.CountOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.Int64Scalar: ... + def first( array: lib.Array[_ScalarT] | lib.ChunkedArray[_ScalarT], @@ -415,29 +262,8 @@ def first( min_count: int = 1, options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _ScalarT: - """ - Compute the first value in each group. - - Null values are ignored by default. - If skip_nulls = false, then this will return the first and last values - regardless if it is null - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. 
- memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _ScalarT: ... + def first_last( array: lib.Array[Any] | lib.ChunkedArray[Any], @@ -447,29 +273,8 @@ def first_last( min_count: int = 1, options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.StructScalar: - """ - Compute the first and last values of an array. - - Null values are ignored by default. - If skip_nulls = false, then this will return the first and last values - regardless if it is null - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.StructScalar: ... + def index( data: lib.Array[Any] | lib.ChunkedArray[Any], @@ -478,139 +283,13 @@ def index( end: int | None = None, *, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: - """ - Find the index of the first occurrence of a given value. - - Parameters - ---------- - data : Array-like - value : Scalar-like object - The value to search for. - start : int, optional - end : int, optional - memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - Returns - ------- - index : int - the index, or -1 if not found - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> arr = pa.array(["Lorem", "ipsum", "dolor", "sit", "Lorem", "ipsum"]) - >>> pc.index(arr, "ipsum") - - >>> pc.index(arr, "ipsum", start=2) - - >>> pc.index(arr, "amet") - - """ +) -> lib.Int64Scalar: ... + last = _clone_signature(first) -""" -Compute the first and last values of an array. - -Null values are ignored by default. -If skip_nulls = false, then this will return the first and last values -regardless if it is null - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True -In [15]: print(pc.last.__doc__) -Compute the first value in each group. - -Null values are ignored by default. -If skip_nulls = false, then this will return the first and last values -regardless if it is null - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" max = _clone_signature(first) -""" -Compute the minimum or maximum values of a numeric array. - -Null values are ignored by default. -This can be changed through ScalarAggregateOptions. - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. 
- If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" min = _clone_signature(first) -""" -Compute the minimum or maximum values of a numeric array. - -Null values are ignored by default. -This can be changed through ScalarAggregateOptions. - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" min_max = _clone_signature(first_last) -""" -Compute the minimum and maximum values of a numeric array. - -Null values are ignored by default. -This can be changed through ScalarAggregateOptions. - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def mean( array: FloatScalar | FloatArray @@ -623,33 +302,8 @@ def mean( min_count: int = 1, options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Scalar[Any]: - """ - Compute the mean of a numeric array. - - Null values are ignored by default. Minimum count of non-null - values can be set and null is returned if too few are present. - This can be changed through ScalarAggregateOptions. - The result is a double for integer and floating point arguments, - and a decimal with the same bit-width/precision/scale for decimal arguments. - For integers and floats, NaN is returned if min_count = 0 and - there are no values. For decimals, null is returned instead. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.Scalar[Any]: ... 
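Illustrative only, not applied by the patch: a short sketch of the expression helpers and scalar aggregations whose stubs appear above (pc.field, pc.scalar, count, min_max, mean, index), plus the string builder from the builder.pyi hunk. The StringBuilder import location is an assumption based on the stub's pyarrow.lib imports.

import pyarrow as pa
import pyarrow.compute as pc
from pyarrow.lib import StringBuilder  # assumed runtime location of the builder class

# Expressions: field() references a (possibly nested) column, scalar() wraps a literal.
expr = pc.field("a") > pc.scalar(3)

# Scalar aggregations with the return types declared above.
arr = pa.array([1, 2, 2, None, 5])
pc.count(arr)      # Int64Scalar, non-null values only by default
pc.min_max(arr)    # StructScalar with "min" and "max" fields
pc.mean(arr)       # DoubleScalar for integer input
pc.index(arr, 2)   # Int64Scalar: first occurrence, or -1 if absent

# Incremental string building; finish() returns a StringArray and resets the builder.
b = StringBuilder()
b.append_values(["foo", None, "bar"])
strings = b.finish()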
+ def mode( array: NumericScalar | NumericArray, @@ -660,46 +314,8 @@ def mode( min_count: int = 0, options: ModeOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.StructArray: - """ - Compute the modal (most common) values of a numeric array. - - Compute the n most common values and their respective occurrence counts. - The output has type `struct`, where T is the - input type. - The results are ordered by descending `count` first, and ascending `mode` - when breaking ties. - Nulls are ignored. If there are no non-null values in the array, - an empty array is returned. - - Parameters - ---------- - array : Array-like - Argument to compute function. - n : int, default 1 - Number of distinct most-common values to return. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ModeOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2]) - >>> modes = pc.mode(arr, 2) - >>> modes[0] - - >>> modes[1] - - """ +) -> lib.StructArray: ... + def product( array: _ScalarT | lib.NumericArray[_ScalarT], @@ -709,29 +325,8 @@ def product( min_count: int = 1, options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _ScalarT: - """ - Compute the product of values in a numeric array. - - Null values are ignored by default. Minimum count of non-null - values can be set and null is returned if too few are present. - This can be changed through ScalarAggregateOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _ScalarT: ... + def quantile( array: NumericScalar | NumericArray, @@ -743,43 +338,8 @@ def quantile( min_count: int = 0, options: QuantileOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: - """ - Compute an array of quantiles of a numeric array or chunked array. - - By default, 0.5 quantile (median) is returned. - If quantile lies between two data points, an interpolated value is - returned based on selected interpolation method. - Nulls and NaNs are ignored. - An array of nulls is returned if there is no valid data point. - - Parameters - ---------- - array : Array-like - Argument to compute function. - q : double or sequence of double, default 0.5 - Probability levels of the quantiles to compute. All values must be in - [0, 1]. - interpolation : str, default "linear" - How to break ties between competing data points for a given quantile. 
- Accepted values are: - - - "linear": compute an interpolation - - "lower": always use the smallest of the two data points - - "higher": always use the largest of the two data points - - "nearest": select the data point that is closest to the quantile - - "midpoint": compute the (unweighted) mean of the two data points - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.QuantileOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.DoubleArray: ... + def stddev( array: NumericScalar | NumericArray, @@ -790,32 +350,8 @@ def stddev( min_count: int = 0, options: VarianceOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: - """ - Calculate the standard deviation of a numeric array. - - The number of degrees of freedom can be controlled using VarianceOptions. - By default (`ddof` = 0), the population standard deviation is calculated. - Nulls are ignored. If there are not enough non-null values in the array - to satisfy `ddof`, null is returned. - - Parameters - ---------- - array : Array-like - Argument to compute function. - ddof : int, default 0 - Number of degrees of freedom. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.VarianceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.DoubleScalar: ... + def sum( array: _NumericScalarT | NumericArray[_NumericScalarT], @@ -825,29 +361,8 @@ def sum( min_count: int = 1, options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT: - """ - Compute the sum of a numeric array. - - Null values are ignored by default. Minimum count of non-null - values can be set and null is returned if too few are present. - This can be changed through ScalarAggregateOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _NumericScalarT: ... + def tdigest( array: NumericScalar | NumericArray, @@ -860,37 +375,8 @@ def tdigest( min_count: int = 0, options: TDigestOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: - """ - Approximate quantiles of a numeric array with T-Digest algorithm. - - By default, 0.5 quantile (median) is returned. - Nulls and NaNs are ignored. 
- An array of nulls is returned if there is no valid data point. - - Parameters - ---------- - array : Array-like - Argument to compute function. - q : double or sequence of double, default 0.5 - Probability levels of the quantiles to approximate. All values must be - in [0, 1]. - delta : int, default 100 - Compression parameter for the T-digest algorithm. - buffer_size : int, default 500 - Buffer size for the T-digest algorithm. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.TDigestOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - """ +) -> lib.DoubleArray: ... + def variance( array: NumericScalar | NumericArray, @@ -901,32 +387,8 @@ def variance( min_count: int = 0, options: VarianceOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: - """ - Calculate the variance of a numeric array. - - The number of degrees of freedom can be controlled using VarianceOptions. - By default (`ddof` = 0), the population variance is calculated. - Nulls are ignored. If there are not enough non-null values in the array - to satisfy `ddof`, null is returned. - - Parameters - ---------- - array : Array-like - Argument to compute function. - ddof : int, default 0 - Number of degrees of freedom. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.VarianceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.DoubleScalar: ... + def top_k_unstable( values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, @@ -934,43 +396,8 @@ def top_k_unstable( sort_keys: list | None = None, *, memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: - """ - Select the indices of the top-k ordered elements from array- or table-like - data. - - This is a specialization for :func:`select_k_unstable`. Output is not - guaranteed to be stable. - - Parameters - ---------- - values : Array, ChunkedArray, RecordBatch, or Table - Data to sort and get top indices from. - k : int - The number of `k` elements to keep. - sort_keys : List-like - Column key names to order by when input is table-like data. - memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - Returns - ------- - result : Array - Indices of the top-k ordered elements - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) - >>> pc.top_k_unstable(arr, k=3) - - [ - 5, - 4, - 2 - ] - """ +) -> lib.Array: ... 
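Again illustrative only: the statistics and top-k selection helpers stubbed above, reusing the input from the removed top_k_unstable docstring example.

import pyarrow as pa
import pyarrow.compute as pc

nums = pa.array([4.0, 1.0, 3.0, None, 2.0])
pc.quantile(nums, q=[0.25, 0.75])  # DoubleArray of interpolated quantiles
pc.stddev(nums, ddof=1)            # sample standard deviation, DoubleScalar
pc.sum(nums)                       # DoubleScalar in the input's numeric family

arr = pa.array(["a", "b", "c", None, "e", "f"])
pc.top_k_unstable(arr, k=3)        # indices of the three largest values: [5, 4, 2]
pc.bottom_k_unstable(arr, k=3)     # indices of the three smallest values: [0, 1, 2]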
+ def bottom_k_unstable( values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, @@ -978,79 +405,18 @@ def bottom_k_unstable( sort_keys: list | None = None, *, memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: - """ - Select the indices of the bottom-k ordered elements from - array- or table-like data. - - This is a specialization for :func:`select_k_unstable`. Output is not - guaranteed to be stable. - - Parameters - ---------- - values : Array, ChunkedArray, RecordBatch, or Table - Data to sort and get bottom indices from. - k : int - The number of `k` elements to keep. - sort_keys : List-like - Column key names to order by when input is table-like data. - memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - Returns - ------- - result : Array of indices - Indices of the bottom-k ordered elements - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) - >>> pc.bottom_k_unstable(arr, k=3) - - [ - 0, - 1, - 2 - ] - """ +) -> lib.Array: ... + # ========================= 2. Element-wise (“scalar”) functions ========================= # ========================= 2.1 Arithmetic ========================= def abs( x: _NumericOrDurationT | _NumericOrDurationArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericOrDurationT | _NumericOrDurationArrayT | Expression: - """ - Calculate the absolute value of the argument element-wise. - - Results will wrap around on integer overflow. - Use function "abs_checked" if you want overflow - to return an error. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _NumericOrDurationT | _NumericOrDurationArrayT | Expression: ... -abs_checked = _clone_signature(abs) -""" -Calculate the absolute value of the argument element-wise. -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "abs". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" +abs_checked = _clone_signature(abs) def add( x: _NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT | Expression, @@ -1058,41 +424,10 @@ def add( /, *, memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: - """ - Add the arguments element-wise. - - Results will wrap around on integer overflow. - Use function "add_checked" if you want overflow - to return an error. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - y : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -add_checked = _clone_signature(add) -""" -Add the arguments element-wise. - -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "add". +) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: ... -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" +add_checked = _clone_signature(add) def divide( x: _NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT | Expression, @@ -1100,126 +435,26 @@ def divide( /, *, memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: - """ - Divide the arguments element-wise. - - Integer division by zero returns an error. However, integer overflow - wraps around, and floating-point division by zero returns an infinite. - Use function "divide_checked" if you want to get an error - in all the aforementioned cases. - - Parameters - ---------- - dividend : Array-like or scalar-like - Argument to compute function. - divisor : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - """ +) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: ... + divide_checked = _clone_signature(divide) -""" -Divide the arguments element-wise. - -An error is returned when trying to divide by zero, or when -integer overflow is encountered. - -Parameters ----------- -dividend : Array-like or scalar-like - Argument to compute function. -divisor : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def exp( exponent: _FloatArrayT | ArrayOrChunkedArray[NonFloatNumericScalar] | _FloatScalarT | NonFloatNumericScalar | lib.DoubleScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> _FloatArrayT | lib.DoubleArray | _FloatScalarT | lib.DoubleScalar | Expression: - """ - Compute Euler's number raised to the power of specified exponent, element-wise. - - If exponent is null the result will be null. +) -> _FloatArrayT | lib.DoubleArray | _FloatScalarT | lib.DoubleScalar | Expression: ... - Parameters - ---------- - exponent : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ multiply = _clone_signature(add) -""" -Multiply the arguments element-wise. - -Results will wrap around on integer overflow. -Use function "multiply_checked" if you want overflow -to return an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" multiply_checked = _clone_signature(add) -""" -Multiply the arguments element-wise. - -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "multiply". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def negate( x: _NumericOrDurationT | _NumericOrDurationArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericOrDurationT | _NumericOrDurationArrayT | Expression: - """ - Negate the argument element-wise. - - Results will wrap around on integer overflow. - Use function "negate_checked" if you want overflow - to return an error. 
- - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -negate_checked = _clone_signature(negate) -""" -Negate the arguments element-wise. +) -> _NumericOrDurationT | _NumericOrDurationArrayT | Expression: ... -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "negate". -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" +negate_checked = _clone_signature(negate) def power( base: _NumericScalarT | _NumericArrayT | Expression | _NumericArrayT | NumericScalar, @@ -1227,39 +462,10 @@ def power( /, *, memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT | _NumericArrayT | Expression: - """ - Raise arguments to power element-wise. - - Integer to negative integer power returns an error. However, integer overflow - wraps around. If either base or exponent is null the result will be null. - - Parameters - ---------- - base : Array-like or scalar-like - Argument to compute function. - exponent : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _NumericScalarT | _NumericArrayT | Expression: ... + power_checked = _clone_signature(power) -""" -Raise arguments to power element-wise. - -An error is returned when integer to negative integer power is encountered, -or integer overflow is encountered. - -Parameters ----------- -base : Array-like or scalar-like - Argument to compute function. -exponent : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def sign( x: NumericOrDurationArray | NumericOrDurationScalar | Expression, /, *, memory_pool: lib.MemoryPool | None = None @@ -1268,262 +474,42 @@ def sign( | lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] | lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar | Expression -): - """ - Get the signedness of the arguments element-wise. - - Output is any of (-1,1) for nonzero inputs and 0 for zero input. - NaN values return NaN. Integral values return signedness as Int8 and - floating-point values return it with the same type as the input values. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - """ +): ... -def sqrt(x: NumericArray | NumericScalar | Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatArray | FloatScalar | Expression: - """ - Takes the square root of arguments element-wise. - A negative argument returns a NaN. For a variant that returns an - error, use function "sqrt_checked". +def sqrt(x: NumericArray | NumericScalar | Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatArray | FloatScalar | Expression: ... - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - """ sqrt_checked = _clone_signature(sqrt) -""" -Takes the square root of arguments element-wise. 
- -A negative argument returns an error. For a variant that returns a -NaN, use function "sqrt". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" subtract = _clone_signature(add) -""" -Subtract the arguments element-wise. - -Results will wrap around on integer overflow. -Use function "subtract_checked" if you want overflow -to return an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" subtract_checked = _clone_signature(add) -""" -Subtract the arguments element-wise. - -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "subtract". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" # ========================= 2.1 Bit-wise functions ========================= def bit_wise_and( x: _NumericScalarT | _NumericArrayT | NumericScalar | Expression | ArrayOrChunkedArray[NumericScalar], y: _NumericScalarT | _NumericArrayT | NumericScalar | Expression | ArrayOrChunkedArray[NumericScalar], /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericScalarT | _NumericArrayT | Expression: - """ - Bit-wise AND the arguments element-wise. - - Null values return null. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - y : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _NumericScalarT | _NumericArrayT | Expression: ... + def bit_wise_not( x: _NumericScalarT | _NumericArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericScalarT | _NumericArrayT | Expression: - """ - Bit-wise negate the arguments element-wise. - - Null values return null. +) -> _NumericScalarT | _NumericArrayT | Expression: ... - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ bit_wise_or = _clone_signature(bit_wise_and) -""" -Bit-wise OR the arguments element-wise. - -Null values return null. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" bit_wise_xor = _clone_signature(bit_wise_and) -""" -Bit-wise XOR the arguments element-wise. - -Null values return null. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" shift_left = _clone_signature(bit_wise_and) -""" -Left shift `x` by `y`. - -The shift operates as if on the two's complement representation of the number. 
-In other words, this is equivalent to multiplying `x` by 2 to the power `y`, -even if overflow occurs. -`x` is returned if `y` (the amount to shift by) is (1) negative or -(2) greater than or equal to the precision of `x`. -Use function "shift_left_checked" if you want an invalid shift amount -to return an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" shift_left_checked = _clone_signature(bit_wise_and) -""" -Left shift `x` by `y`. - -The shift operates as if on the two's complement representation of the number. -In other words, this is equivalent to multiplying `x` by 2 to the power `y`, -even if overflow occurs. -An error is raised if `y` (the amount to shift by) is (1) negative or -(2) greater than or equal to the precision of `x`. -See "shift_left" for a variant that doesn't fail for an invalid shift amount. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" shift_right = _clone_signature(bit_wise_and) -""" -Right shift `x` by `y`. - -This is equivalent to dividing `x` by 2 to the power `y`. -`x` is returned if `y` (the amount to shift by) is: (1) negative or -(2) greater than or equal to the precision of `x`. -Use function "shift_right_checked" if you want an invalid shift amount -to return an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" shift_right_checked = _clone_signature(bit_wise_and) -""" -Right shift `x` by `y`. - -This is equivalent to dividing `x` by 2 to the power `y`. -An error is raised if `y` (the amount to shift by) is (1) negative or -(2) greater than or equal to the precision of `x`. -See "shift_right" for a variant that doesn't fail for an invalid shift amount - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" # ========================= 2.2 Rounding functions ========================= -def ceil(x: _FloatScalarT | _FloatArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatScalarT | _FloatArrayT | Expression: - """ - Round up to the nearest integer. +def ceil(x: _FloatScalarT | _FloatArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatScalarT | _FloatArrayT | Expression: ... - Compute the smallest integer value not less in magnitude than `x`. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ floor = _clone_signature(ceil) -""" -Round down to the nearest integer. - -Compute the largest integer value not greater in magnitude than `x`. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def round( x: _NumericScalarT | _NumericArrayT | Expression, @@ -1544,30 +530,8 @@ def round( *, options: RoundOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT | _NumericArrayT | Expression: - """ - Round to a given precision. - - Options are used to control the number of digits and rounding mode. - Default behavior is to round to the nearest integer and - use half-to-even rule to break ties. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - ndigits : int, default 0 - Number of fractional digits to round to. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - options : pyarrow.compute.RoundOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _NumericScalarT | _NumericArrayT | Expression: ... + def round_to_multiple( x: _NumericScalarT | _NumericArrayT | Expression, @@ -1588,31 +552,8 @@ def round_to_multiple( *, options: RoundToMultipleOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT | _NumericArrayT | Expression: - """ - Round to a given multiple. - - Options are used to control the rounding multiple and rounding mode. - Default behavior is to round to the nearest integer and - use half-to-even rule to break ties. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - multiple : numeric scalar, default 1.0 - Multiple to round to. Should be a scalar of a type compatible - with the argument to be rounded. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - options : pyarrow.compute.RoundToMultipleOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _NumericScalarT | _NumericArrayT | Expression: ... + def round_binary( x: _NumericScalarT | _NumericArrayT | Expression, @@ -1633,1058 +574,132 @@ def round_binary( *, options: RoundBinaryOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT | lib.NumericArray[_NumericScalarT] | _NumericArrayT | Expression: - """ - Round to the given precision. - - Options are used to control the rounding mode. - Default behavior is to use the half-to-even rule to break ties. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - s : Array-like or scalar-like - Argument to compute function. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - options : pyarrow.compute.RoundBinaryOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
- """ +) -> _NumericScalarT | lib.NumericArray[_NumericScalarT] | _NumericArrayT | Expression: ... -trunc = _clone_signature(ceil) -""" -Compute the integral part. -Compute the nearest integer not greater in magnitude than `x`. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" +trunc = _clone_signature(ceil) # ========================= 2.3 Logarithmic functions ========================= def ln( x: FloatScalar | FloatArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] | Expression: - """ - Compute natural logarithm. - - Non-positive values return -inf or NaN. Null values return null. - Use function "ln_checked" if you want non-positive values to raise an error. +) -> lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] | Expression: ... - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ ln_checked = _clone_signature(ln) -""" -Compute natural logarithm. - -Non-positive values raise an error. Null values return null. -Use function "ln" if you want non-positive values to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" log10 = _clone_signature(ln) -""" -Compute base 10 logarithm. - -Non-positive values return -inf or NaN. Null values return null. -Use function "log10_checked" if you want non-positive values -to raise an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" log10_checked = _clone_signature(ln) -""" -Compute base 10 logarithm. - -Non-positive values raise an error. Null values return null. -Use function "log10" if you want non-positive values -to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" log1p = _clone_signature(ln) -""" -Compute natural log of (1+x). - -Values <= -1 return -inf or NaN. Null values return null. -This function may be more precise than log(1 + x) for x close to zero. -Use function "log1p_checked" if you want invalid values to raise an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" log1p_checked = _clone_signature(ln) -""" -Compute natural log of (1+x). - -Values <= -1 return -inf or NaN. Null values return null. -This function may be more precise than log(1 + x) for x close to zero. -Use function "log1p" if you want invalid values to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" log2 = _clone_signature(ln) -""" -Compute base 2 logarithm. - -Non-positive values return -inf or NaN. Null values return null. -Use function "log2_checked" if you want non-positive values -to raise an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" log2_checked = _clone_signature(ln) -""" -Compute base 2 logarithm. - -Non-positive values raise an error. Null values return null. -Use function "log2" if you want non-positive values -to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def logb( x: FloatScalar | FloatArray | Expression | Any, b: FloatScalar | FloatArray | Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] | Expression | Any: - """ - Compute base `b` logarithm. - - Values <= 0 return -inf or NaN. Null values return null. - Use function "logb_checked" if you want non-positive values to raise an error. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - b : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] | Expression | Any: ... + logb_checked = _clone_signature(logb) -""" -Compute base `b` logarithm. - -Values <= 0 return -inf or NaN. Null values return null. -Use function "logb" if you want non-positive values to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -b : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" # ========================= 2.4 Trigonometric functions ========================= acos = _clone_signature(ln) -""" -Compute the inverse cosine. - -NaN is returned for invalid input values; -to raise an error instead, see "acos_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" acos_checked = _clone_signature(ln) -""" -Compute the inverse cosine. - -Invalid input values raise an error; -to return NaN instead, see "acos". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" asin = _clone_signature(ln) -""" -Compute the inverse sine. - -NaN is returned for invalid input values; -to raise an error instead, see "asin_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" asin_checked = _clone_signature(ln) -""" -Compute the inverse sine. - -Invalid input values raise an error; -to return NaN instead, see "asin". 
- -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" atan = _clone_signature(ln) -""" -Compute the inverse tangent of x. - -The return value is in the range [-pi/2, pi/2]; -for a full return range [-pi, pi], see "atan2". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" cos = _clone_signature(ln) -""" -Compute the cosine. - -NaN is returned for invalid input values; -to raise an error instead, see "cos_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" cos_checked = _clone_signature(ln) -""" -Compute the cosine. - -Infinite values raise an error; -to return NaN instead, see "cos". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" sin = _clone_signature(ln) -""" -Compute the sine. - -NaN is returned for invalid input values; -to raise an error instead, see "sin_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" sin_checked = _clone_signature(ln) -""" -Compute the sine. - -Invalid input values raise an error; -to return NaN instead, see "sin". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" tan = _clone_signature(ln) -""" -Compute the tangent. - -NaN is returned for invalid input values; -to raise an error instead, see "tan_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" tan_checked = _clone_signature(ln) -""" -Compute the tangent. - -Infinite values raise an error; -to return NaN instead, see "tan". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def atan2( y: FloatScalar | FloatArray | Expression | Any, x: FloatScalar | FloatArray | Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] | Expression: - """ - Compute the inverse tangent of y/x. - - The return value is in the range [-pi, pi]. - - Parameters - ---------- - y : Array-like or scalar-like - Argument to compute function. - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] | Expression: ... 
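One more illustrative sketch, not part of the patch: the element-wise arithmetic, rounding, logarithmic, and trigonometric kernels stubbed above, contrasting a wrapping kernel with its _checked variant.

import pyarrow as pa
import pyarrow.compute as pc

x = pa.array([1, -2, 100], pa.int8())
pc.abs(x)             # Int8Array; wraps on overflow, abs_checked raises instead
pc.add_checked(x, 1)  # Int8Array; raises ArrowInvalid on overflow rather than wrapping

pc.round(pa.array([2.5, 3.5]), ndigits=0, round_mode="half_to_even")  # [2, 4]
pc.ln_checked(pa.array([1.0, 2.0]))          # raises on non-positive input; ln() returns -inf/NaN
pc.atan2(pa.array([0.0]), pa.array([-1.0]))  # DoubleArray, result in [-pi, pi] (here ~pi)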
+ # ========================= 2.5 Comparisons functions ========================= def equal( x: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, y: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar | lib.BooleanArray | Expression: - """ - Compare values for equality (x == y). - - A null on either side emits a null comparison result. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - y : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + greater = _clone_signature(equal) -""" -Compare values for ordered inequality (x > y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" greater_equal = _clone_signature(equal) -""" -Compare values for ordered inequality (x >= y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" less = _clone_signature(equal) -""" -Compare values for ordered inequality (x < y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" less_equal = _clone_signature(equal) -""" -Compare values for ordered inequality (x <= y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" not_equal = _clone_signature(equal) -""" -Compare values for inequality (x != y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def max_element_wise( *args: ScalarOrArray[_Scalar_CoT] | Expression, skip_nulls: bool = True, options: ElementWiseAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _Scalar_CoT | Expression: - """ - Find the element-wise maximum value. - - Nulls are ignored (by default) or propagated. - NaN is preferred over null, but not over any valid value. - - Parameters - ---------- - *args : Array-like or scalar-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - options : pyarrow.compute.ElementWiseAggregateOptions, optional - Alternative way of passing options. 
- memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _Scalar_CoT | Expression: ... + min_element_wise = _clone_signature(max_element_wise) -""" -Find the element-wise minimum value. - -Nulls are ignored (by default) or propagated. -NaN is preferred over null, but not over any valid value. - -Parameters ----------- -*args : Array-like or scalar-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -options : pyarrow.compute.ElementWiseAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" # ========================= 2.6 Logical functions ========================= def and_( x: lib.BooleanScalar | BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar], y: lib.BooleanScalar | BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar], /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar | lib.BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar]: - """ - Logical 'and' boolean values. - - When a null is encountered in either input, a null is output. - For a different null behavior, see function "and_kleene". - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - y : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.BooleanScalar | lib.BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar]: ... + and_kleene = _clone_signature(and_) -""" -Logical 'and' boolean values (Kleene logic). - -This function behaves as follows with nulls: - -- true and null = null -- null and true = null -- false and null = false -- null and false = false -- null and null = null - -In other words, in this context a null value really means "unknown", -and an unknown value 'and' false is always false. -For a different null behavior, see function "and". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" and_not = _clone_signature(and_) -""" -Logical 'and not' boolean values. - -When a null is encountered in either input, a null is output. -For a different null behavior, see function "and_not_kleene". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" and_not_kleene = _clone_signature(and_) -""" -Logical 'and not' boolean values (Kleene logic). - -This function behaves as follows with nulls: - -- true and not null = null -- null and not false = null -- false and not null = false -- null and not true = false -- null and not null = null - -In other words, in this context a null value really means "unknown", -and an unknown value 'and not' true is always false, as is false -'and not' an unknown value. -For a different null behavior, see function "and_not". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. 
-y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" or_ = _clone_signature(and_) -""" -Logical 'or' boolean values. - -When a null is encountered in either input, a null is output. -For a different null behavior, see function "or_kleene". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" or_kleene = _clone_signature(and_) -""" -Logical 'or' boolean values (Kleene logic). - -This function behaves as follows with nulls: - -- true or null = true -- null or true = true -- false or null = null -- null or false = null -- null or null = null - -In other words, in this context a null value really means "unknown", -and an unknown value 'or' true is always true. -For a different null behavior, see function "or". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" xor = _clone_signature(and_) -""" -Logical 'xor' boolean values. - -When a null is encountered in either input, a null is output. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def invert( x: lib.BooleanScalar | _BooleanArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar | _BooleanArrayT | Expression: - """ - Invert boolean values. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.BooleanScalar | _BooleanArrayT | Expression: ... + # ========================= 2.10 String predicates ========================= def ascii_is_alnum( strings: StringScalar | StringArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar | lib.BooleanArray | Expression: - """ - Classify strings as ASCII alphanumeric. - - For each string in `strings`, emit true iff the string is non-empty - and consists only of alphanumeric ASCII characters. Null strings emit null. +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ ascii_is_alpha = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII alphabetic. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of alphabetic ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" ascii_is_decimal = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII decimal. 
- -For each string in `strings`, emit true iff the string is non-empty -and consists only of decimal ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" ascii_is_lower = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII lowercase. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of lowercase ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" ascii_is_printable = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII printable. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of printable ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" ascii_is_space = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII whitespace. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of whitespace ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" ascii_is_upper = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII uppercase. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of uppercase ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_is_alnum = _clone_signature(ascii_is_alnum) -""" -Classify strings as alphanumeric. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of alphanumeric Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_is_alpha = _clone_signature(ascii_is_alnum) -""" -Classify strings as alphabetic. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of alphabetic Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_is_decimal = _clone_signature(ascii_is_alnum) -""" -Classify strings as decimal. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of decimal Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" utf8_is_digit = _clone_signature(ascii_is_alnum) -""" -Classify strings as digits. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of Unicode digits. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_is_lower = _clone_signature(ascii_is_alnum) -""" -Classify strings as lowercase. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of lowercase Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_is_numeric = _clone_signature(ascii_is_alnum) -""" -Classify strings as numeric. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of numeric Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_is_printable = _clone_signature(ascii_is_alnum) -""" -Classify strings as printable. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of printable Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_is_space = _clone_signature(ascii_is_alnum) -""" -Classify strings as whitespace. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of whitespace Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_is_upper = _clone_signature(ascii_is_alnum) -""" -Classify strings as uppercase. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of uppercase Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" ascii_is_title = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII titlecase. - -For each string in `strings`, emit true iff the string is title-cased, -i.e. it has at least one cased character, each uppercase character -follows an uncased character, and each lowercase character follows -an uppercase character. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_is_title = _clone_signature(ascii_is_alnum) -""" -Classify strings as titlecase. - -For each string in `strings`, emit true iff the string is title-cased, -i.e. it has at least one cased character, each uppercase character -follows an uncased character, and each lowercase character follows -an uppercase character. 
- -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" string_is_ascii = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII. - -For each string in `strings`, emit true iff the string consists only -of ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" # ========================= 2.11 String transforms ========================= def ascii_capitalize( strings: _StringScalarT | _StringArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> _StringScalarT | _StringArrayT | Expression: - """ - Capitalize the first character of ASCII input. +) -> _StringScalarT | _StringArrayT | Expression: ... - For each string in `strings`, return a capitalized version. - - This function assumes the input is fully ASCII. If it may contain - non-ASCII characters, use "utf8_capitalize" instead. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ ascii_lower = _clone_signature(ascii_capitalize) -""" -Transform ASCII input to lowercase. - -For each string in `strings`, return a lowercase version. - -This function assumes the input is fully ASCII. If it may contain -non-ASCII characters, use "utf8_lower" instead. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" ascii_reverse = _clone_signature(ascii_capitalize) -""" -Reverse ASCII input. - -For each ASCII string in `strings`, return a reversed version. - -This function assumes the input is fully ASCII. If it may contain -non-ASCII characters, use "utf8_reverse" instead. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" ascii_swapcase = _clone_signature(ascii_capitalize) -""" -Transform ASCII input by inverting casing. - -For each string in `strings`, return a string with opposite casing. - -This function assumes the input is fully ASCII. If it may contain -non-ASCII characters, use "utf8_swapcase" instead. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" ascii_title = _clone_signature(ascii_capitalize) -""" -Titlecase each word of ASCII input. - -For each string in `strings`, return a titlecased version. -Each word in the output will start with an uppercase character and its -remaining characters will be lowercase. - -This function assumes the input is fully ASCII. If it may contain -non-ASCII characters, use "utf8_title" instead. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" ascii_upper = _clone_signature(ascii_capitalize) -""" -Transform ASCII input to uppercase. 
- -For each string in `strings`, return an uppercase version. - -This function assumes the input is fully ASCII. It it may contain -non-ASCII characters, use "utf8_upper" instead. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def binary_length( strings: lib.BinaryScalar | lib.StringScalar | lib.LargeBinaryScalar | lib.LargeStringScalar @@ -2694,20 +709,8 @@ def binary_length( | lib.ChunkedArray[lib.LargeBinaryScalar] | lib.ChunkedArray[lib.LargeStringScalar] | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array | Expression: - """ - Compute string lengths. - - For each string in `strings`, emit its length of bytes. - Null values emit null. +) -> lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array | Expression: ... - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ def binary_repeat( strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression, @@ -2715,21 +718,8 @@ def binary_repeat( /, *, memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryScalarT | lib.Array[_StringOrBinaryScalarT] | _StringOrBinaryArrayT | Expression: - """ - Repeat a binary string. - - For each binary string in `strings`, return a replicated version. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - num_repeats : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _StringOrBinaryScalarT | lib.Array[_StringOrBinaryScalarT] | _StringOrBinaryArrayT | Expression: ... + def binary_replace_slice( strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression, @@ -2740,48 +730,13 @@ def binary_replace_slice( *, options: ReplaceSliceOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression: - """ - Replace a slice of a binary string. - - For each string in `strings`, replace a slice of the string defined by `start` - and `stop` indices with the given `replacement`. `start` is inclusive - and `stop` is exclusive, and both are measured in bytes. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing at (inclusive). - stop : int - Index to stop slicing at (exclusive). - replacement : str - What to replace the slice with. - options : pyarrow.compute.ReplaceSliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression: ... + def binary_reverse( strings: _BinaryScalarT | _BinaryArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> _BinaryScalarT | _BinaryArrayT | Expression: - """ - Reverse binary input. - - For each binary string in `strings`, return a reversed version. +) -> _BinaryScalarT | _BinaryArrayT | Expression: ... - This function reverses the binary data at a byte-level. 
- - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ def replace_substring( strings: _StringScalarT | _StringArrayT | Expression, @@ -2792,109 +747,24 @@ def replace_substring( max_replacements: int | None = None, options: ReplaceSubstringOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT | _StringArrayT | Expression: - """ - Replace matching non-overlapping substrings with replacement. - - For each string in `strings`, replace non-overlapping substrings that match - the given literal `pattern` with the given `replacement`. - If `max_replacements` is given and not equal to -1, it limits the - maximum amount replacements per input, counted from the left. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - Substring pattern to look for inside input values. - replacement : str - What to replace the pattern with. - max_replacements : int or None, default None - The maximum number of strings to replace in each - input value (unlimited if None). - options : pyarrow.compute.ReplaceSubstringOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _StringScalarT | _StringArrayT | Expression: ... + replace_substring_regex = _clone_signature(replace_substring) -""" -Replace matching non-overlapping substrings with replacement. - -For each string in `strings`, replace non-overlapping substrings that match -the given regular expression `pattern` with the given `replacement`. -If `max_replacements` is given and not equal to -1, it limits the -maximum amount replacements per input, counted from the left. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -replacement : str - What to replace the pattern with. -max_replacements : int or None, default None - The maximum number of strings to replace in each - input value (unlimited if None). -options : pyarrow.compute.ReplaceSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def utf8_capitalize( strings: _StringScalarT | _StringArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> _StringScalarT | _StringArrayT | Expression: - """ - Capitalize the first character of input. - - For each string in `strings`, return a capitalized version, - with the first character uppercased and the others lowercased. +) -> _StringScalarT | _StringArrayT | Expression: ... - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ def utf8_length( strings: lib.StringScalar | lib.LargeStringScalar | lib.StringArray | lib.ChunkedArray[lib.StringScalar] | lib.LargeStringArray | lib.ChunkedArray[lib.LargeStringScalar] | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array | Expression: - """ - Compute UTF8 string lengths. 
+) -> lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array | Expression: ... - For each string in `strings`, emit its length in UTF8 characters. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ utf8_lower = _clone_signature(utf8_capitalize) -""" -Transform input to lowercase. - -For each string in `strings`, return a lowercase version. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def utf8_replace_slice( strings: _StringScalarT | _StringArrayT | Expression, @@ -2905,89 +775,13 @@ def utf8_replace_slice( *, options: ReplaceSliceOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT | _StringArrayT | Expression: - """ - Replace a slice of a string. - - For each string in `strings`, replace a slice of the string defined by `start` - and `stop` indices with the given `replacement`. `start` is inclusive - and `stop` is exclusive, and both are measured in UTF8 characters. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing at (inclusive). - stop : int - Index to stop slicing at (exclusive). - replacement : str - What to replace the slice with. - options : pyarrow.compute.ReplaceSliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _StringScalarT | _StringArrayT | Expression: ... + utf8_reverse = _clone_signature(utf8_capitalize) -""" -Reverse input. - -For each string in `strings`, return a reversed version. - -This function operates on Unicode codepoints, not grapheme -clusters. Hence, it will not correctly reverse grapheme clusters -composed of multiple codepoints. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_swapcase = _clone_signature(utf8_capitalize) -""" -Transform input lowercase characters to uppercase and uppercase characters to lowercase. - -For each string in `strings`, return an opposite case version. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_title = _clone_signature(utf8_capitalize) -""" -Titlecase each word of input. - -For each string in `strings`, return a titlecased version. -Each word in the output will start with an uppercase character and its -remaining characters will be lowercase. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_upper = _clone_signature(utf8_capitalize) -""" -Transform input to uppercase. - -For each string in `strings`, return an uppercase version. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory poo -""" # ========================= 2.12 String padding ========================= def ascii_center( @@ -2999,157 +793,14 @@ def ascii_center( *, options: PadOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT | _StringArrayT | Expression: - """ - Center strings by padding with a given character. - - For each string in `strings`, emit a centered string by padding both sides - with the given ASCII character. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - width : int - Desired string length. - padding : str, default " " - What to pad the string with. Should be one byte or codepoint. - lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). - options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _StringScalarT | _StringArrayT | Expression: ... + ascii_lpad = _clone_signature(ascii_center) -""" -Right-align strings by padding with a given character. - -For each string in `strings`, emit a right-aligned string by prepending -the given ASCII character. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" ascii_rpad = _clone_signature(ascii_center) -""" -Left-align strings by padding with a given character. - -For each string in `strings`, emit a left-aligned string by appending -the given ASCII character. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_center = _clone_signature(ascii_center) -""" -Center strings by padding with a given character. - -For each string in `strings`, emit a centered string by padding both sides -with the given UTF8 codeunit. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. 
-lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_lpad = _clone_signature(ascii_center) -""" -Right-align strings by padding with a given character. - -For each string in `strings`, emit a right-aligned string by prepending -the given UTF8 codeunit. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_rpad = _clone_signature(ascii_center) -""" -Left-align strings by padding with a given character. - -For each string in `strings`, emit a left-aligned string by appending -the given UTF8 codeunit. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" # ========================= 2.13 String trimming ========================= def ascii_ltrim( @@ -3159,127 +810,14 @@ def ascii_ltrim( *, options: TrimOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT | _StringArrayT | Expression: - """ - Trim leading characters. - - For each string in `strings`, remove any leading characters - from the `characters` option (as given in TrimOptions). - Null values emit null. - Both the `strings` and the `characters` are interpreted as - ASCII; to trim non-ASCII characters, use `utf8_ltrim`. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - characters : str - Individual characters to be trimmed from the string. - options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _StringScalarT | _StringArrayT | Expression: ... + ascii_rtrim = _clone_signature(ascii_ltrim) -""" -Trim trailing characters. - -For each string in `strings`, remove any trailing characters -from the `characters` option (as given in TrimOptions). -Null values emit null. -Both the `strings` and the `characters` are interpreted as -ASCII; to trim non-ASCII characters, use `utf8_rtrim`. 
- -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" ascii_trim = _clone_signature(ascii_ltrim) -""" -Trim leading and trailing characters. - -For each string in `strings`, remove any leading or trailing characters -from the `characters` option (as given in TrimOptions). -Null values emit null. -Both the `strings` and the `characters` are interpreted as -ASCII; to trim non-ASCII characters, use `utf8_trim`. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_ltrim = _clone_signature(ascii_ltrim) -""" -Trim leading characters. - -For each string in `strings`, remove any leading characters -from the `characters` option (as given in TrimOptions). -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_rtrim = _clone_signature(ascii_ltrim) -""" -Trim trailing characters. - -For each string in `strings`, remove any trailing characters -from the `characters` option (as given in TrimOptions). -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_trim = _clone_signature(ascii_ltrim) -""" -Trim leading and trailing characters. - -For each string in `strings`, remove any leading or trailing characters -from the `characters` option (as given in TrimOptions). -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def ascii_ltrim_whitespace( strings: _StringScalarT | _StringArrayT | Expression, @@ -3287,97 +825,14 @@ def ascii_ltrim_whitespace( *, options: TrimOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT | _StringArrayT | Expression: - """ - Trim leading ASCII whitespace characters. - - For each string in `strings`, emit a string with leading ASCII whitespace - characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode - whitespace characters. Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. 
- memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _StringScalarT | _StringArrayT | Expression: ... + ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim trailing ASCII whitespace characters. - -For each string in `strings`, emit a string with trailing ASCII whitespace -characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode -whitespace characters. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim leading and trailing ASCII whitespace characters. - -For each string in `strings`, emit a string with leading and trailing ASCII -whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode -whitespace characters. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_ltrim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim leading whitespace characters. - -For each string in `strings`, emit a string with leading whitespace -characters removed, where whitespace characters are defined by the Unicode -standard. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim trailing whitespace characters. - -For each string in `strings`, emit a string with trailing whitespace -characters removed, where whitespace characters are defined by the Unicode -standard. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim leading and trailing whitespace characters. - -For each string in `strings`, emit a string with leading and trailing -whitespace characters removed, where whitespace characters are defined -by the Unicode standard. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" # ========================= 2.14 String splitting ========================= def ascii_split_whitespace( @@ -3388,31 +843,8 @@ def ascii_split_whitespace( reverse: bool = False, options: SplitOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[_StringScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]] | Expression: - """ - Split string according to any ASCII whitespace. - - Split each string according any non-zero length sequence of ASCII - whitespace characters. The output for each string input is a list - of strings. - - The maximum number of splits and direction of splitting - (forward, reverse) can optionally be defined in SplitOptions. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. 
- max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). - reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. - options : pyarrow.compute.SplitOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.ListArray[_StringScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]] | Expression: ... + def split_pattern( strings: _StringOrBinaryScalarT | lib.Array[lib.Scalar[_DataTypeT]] | Expression, @@ -3423,86 +855,11 @@ def split_pattern( reverse: bool = False, options: SplitOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[_StringOrBinaryScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]] | Expression: - """ - Split string according to separator. - - Split each string according to the exact `pattern` defined in - SplitPatternOptions. The output for each string input is a list - of strings. - - The maximum number of splits and direction of splitting - (forward, reverse) can optionally be defined in SplitPatternOptions. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - String pattern to split on. - max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). - reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. - options : pyarrow.compute.SplitPatternOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.ListArray[_StringOrBinaryScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]] | Expression: ... + split_pattern_regex = _clone_signature(split_pattern) -""" -Split string according to regex pattern. - -Split each string according to the regex `pattern` defined in -SplitPatternOptions. The output for each string input is a list -of strings. - -The maximum number of splits and direction of splitting -(forward, reverse) can optionally be defined in SplitPatternOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - String pattern to split on. -max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). -reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. -options : pyarrow.compute.SplitPatternOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" utf8_split_whitespace = _clone_signature(ascii_split_whitespace) -""" -Split string according to any Unicode whitespace. - -Split each string according any non-zero length sequence of Unicode -whitespace characters. The output for each string input is a list -of strings. - -The maximum number of splits and direction of splitting -(forward, reverse) can optionally be defined in SplitOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). 
-reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. -options : pyarrow.compute.SplitOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" # ========================= 2.15 String component extraction ========================= def extract_regex( @@ -3512,49 +869,14 @@ def extract_regex( *, options: ExtractRegexOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.StructScalar | lib.StructArray | Expression: - """ - Extract substrings captured by a regex pattern. - - For each string in `strings`, match the regular expression and, if - successful, emit a struct with field names and values coming from the - regular expression's named capture groups. If the input is null or the - regular expression fails matching, a null output value is emitted. - - Regular expression matching is done using the Google RE2 library. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - Regular expression with named capture fields. - options : pyarrow.compute.ExtractRegexOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.StructScalar | lib.StructArray | Expression: ... + # ========================= 2.16 String join ========================= def binary_join( strings, separator, /, *, memory_pool: lib.MemoryPool | None = None -) -> StringScalar | StringArray: - """ - Join a list of strings together with a separator. - - Concatenate the strings in `list`. The `separator` is inserted - between each given string. - Any null input and any null `list` element emits a null output. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - separator : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> StringScalar | StringArray: ... + def binary_join_element_wise( *strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression, @@ -3562,30 +884,8 @@ def binary_join_element_wise( null_replacement: str = "", options: JoinOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression: - """ - Join string arguments together, with the last argument as separator. - - Concatenate the `strings` except for the last one. The last argument - in `strings` is inserted between each given string. - Any null separator element emits a null output. Null elements either - emit a null (the default), are skipped, or replaced with a given string. - - Parameters - ---------- - *strings : Array-like or scalar-like - Argument to compute function. - null_handling : str, default "emit_null" - How to handle null values in the inputs. - Accepted values are "emit_null", "skip", "replace". - null_replacement : str, default "" - Replacement string to emit for null inputs if `null_handling` - is "replace". - options : pyarrow.compute.JoinOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression: ... 
+ # ========================= 2.17 String Slicing ========================= def binary_slice( @@ -3597,34 +897,8 @@ def binary_slice( *, options: SliceOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _BinaryScalarT | _BinaryArrayT | Expression: - """ - Slice binary string. - - For each binary string in `strings`, emit the substring defined by - (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is - inclusive and `stop` is exclusive. All three values are measured in - bytes. - If `step` is negative, the string will be advanced in reversed order. - An error is raised if `step` is zero. - Null inputs emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing at (inclusive). - stop : int or None, default None - If given, index to stop slicing at (exclusive). - If not given, slicing will stop at the end. - step : int, default 1 - Slice step. - options : pyarrow.compute.SliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _BinaryScalarT | _BinaryArrayT | Expression: ... + def utf8_slice_codeunits( strings: _StringScalarT | _StringArrayT | Expression, @@ -3635,34 +909,8 @@ def utf8_slice_codeunits( *, options: SliceOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT | _StringArrayT | Expression: - """ - Slice string. - - For each string in `strings`, emit the substring defined by - (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is - inclusive and `stop` is exclusive. All three values are measured in - UTF8 codeunits. - If `step` is negative, the string will be advanced in reversed order. - An error is raised if `step` is zero. - Null inputs emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing at (inclusive). - stop : int or None, default None - If given, index to stop slicing at (exclusive). - If not given, slicing will stop at the end. - step : int, default 1 - Slice step. - options : pyarrow.compute.SliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _StringScalarT | _StringArrayT | Expression: ... + # ========================= 2.18 Containment tests ========================= def count_substring( @@ -3678,49 +926,10 @@ def count_substring( ignore_case: bool = False, options: MatchSubstringOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array | Expression: - """ - Count occurrences of substring. - - For each string in `strings`, emit the number of occurrences of the given - literal pattern. - Null inputs emit null. The pattern must be given in MatchSubstringOptions. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - Substring pattern to look for inside input values. - ignore_case : bool, default False - Whether to perform a case-insensitive match. - options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
- """ +) -> lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array | Expression: ... + count_substring_regex = _clone_signature(count_substring) -""" -Count occurrences of substring. - -For each string in `strings`, emit the number of occurrences of the given -regular expression pattern. -Null inputs emit null. The pattern must be given in MatchSubstringOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def ends_with( strings: StringScalar | BinaryScalar | StringArray | BinaryArray | Expression, @@ -3730,72 +939,11 @@ def ends_with( ignore_case: bool = False, options: MatchSubstringOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar | lib.BooleanArray | Expression: - """ - Check if strings end with a literal pattern. - - For each string in `strings`, emit true iff it ends with a given pattern. - The pattern must be given in MatchSubstringOptions. - If ignore_case is set, only simple case folding is performed. - - Null inputs emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - Substring pattern to look for inside input values. - ignore_case : bool, default False - Whether to perform a case-insensitive match. - options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + find_substring = _clone_signature(count_substring) -""" -Find first occurrence of substring. - -For each string in `strings`, emit the index in bytes of the first occurrence -of the given literal pattern, or -1 if not found. -Null inputs emit null. The pattern must be given in MatchSubstringOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" find_substring_regex = _clone_signature(count_substring) -""" -Find location of first match of regex pattern. - -For each string in `strings`, emit the index in bytes of the first occurrence -of the given literal pattern, or -1 if not found. -Null inputs emit null. The pattern must be given in MatchSubstringOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" def index_in( values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, @@ -3805,31 +953,8 @@ def index_in( skip_nulls: bool = False, options: SetLookupOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Scalar | lib.Int32Array | Expression: - """ - Return index of each element in a set of values. - - For each element in `values`, return its index in a given set of - values, or null if it is not found there. - The set of values to look for must be given in SetLookupOptions. - By default, nulls are matched against the value set, this can be - changed in SetLookupOptions. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - value_set : Array - Set of values to look for in the input. - skip_nulls : bool, default False - If False, nulls in the input are matched in the value_set just - like regular values. - If True, nulls in the input always fail matching. - options : pyarrow.compute.SetLookupOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.Int32Scalar | lib.Int32Array | Expression: ... + def is_in( values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, @@ -3839,169 +964,23 @@ def is_in( skip_nulls: bool = False, options: SetLookupOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar | lib.BooleanArray: - """ - Find each element in a set of values. - - For each element in `values`, return true if it is found in a given - set of values, false otherwise. - The set of values to look for must be given in SetLookupOptions. - By default, nulls are matched against the value set, this can be - changed in SetLookupOptions. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - value_set : Array - Set of values to look for in the input. - skip_nulls : bool, default False - If False, nulls in the input are matched in the value_set just - like regular values. - If True, nulls in the input always fail matching. - options : pyarrow.compute.SetLookupOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.BooleanScalar | lib.BooleanArray: ... + match_like = _clone_signature(ends_with) -""" -Match strings against SQL-style LIKE pattern. - -For each string in `strings`, emit true iff it matches a given pattern -at any position. '%' will match any number of characters, '_' will -match exactly one character, and any other character matches itself. -To match a literal '%', '_', or '\', precede the character with a backslash. -Null inputs emit null. The pattern must be given in MatchSubstringOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" match_substring = _clone_signature(ends_with) -""" -Match strings against literal pattern. - -For each string in `strings`, emit true iff it contains a given pattern. -Null inputs emit null. 
-The pattern must be given in MatchSubstringOptions. -If ignore_case is set, only simple case folding is performed. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" match_substring_regex = _clone_signature(ends_with) -""" -Match strings against regex pattern. - -For each string in `strings`, emit true iff it matches a given pattern -at any position. The pattern must be given in MatchSubstringOptions. -If ignore_case is set, only simple case folding is performed. - -Null inputs emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" starts_with = _clone_signature(ends_with) -""" -Check if strings start with a literal pattern. - -For each string in `strings`, emit true iff it starts with a given pattern. -The pattern must be given in MatchSubstringOptions. -If ignore_case is set, only simple case folding is performed. - -Null inputs emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" # ========================= 2.19 Categorizations ========================= def is_finite( values: NumericScalar | lib.NullScalar | NumericArray | lib.NullArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar | lib.BooleanArray | Expression: - """ - Return true if value is finite. - - For each input value, emit true iff the value is finite - (i.e. neither NaN, inf, nor -inf). +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ is_inf = _clone_signature(is_finite) -""" -Return true if infinity. - -For each input value, emit true iff the value is infinite (inf or -inf). - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" is_nan = _clone_signature(is_finite) -""" -Return true if NaN. - -For each input value, emit true iff the value is NaN. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
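The categorization predicates (`is_finite`, `is_inf`, `is_nan`, plus the null checks that follow below) all emit boolean outputs; for example, on a hypothetical float input (illustrative only):

import pyarrow as pa
import pyarrow.compute as pc

x = pa.array([1.0, float("nan"), None])
pc.is_nan(x)                      # [false, true, null]
pc.is_finite(x)                   # [true, false, null]
pc.is_null(x, nan_is_null=True)   # [false, true, true]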
-""" def is_null( values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, @@ -4010,174 +989,30 @@ def is_null( nan_is_null: bool = False, options: NullOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar | lib.BooleanArray | Expression: - """ - Return true if null (and optionally NaN). - - For each input value, emit true iff the value is null. - True may also be emitted for NaN values by setting the `nan_is_null` flag. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - nan_is_null : bool, default False - Whether floating-point NaN values are considered null. - options : pyarrow.compute.NullOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + def is_valid( values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar | lib.BooleanArray | Expression: - """ - Return true if non-null. - - For each input value, emit true iff the value is valid (i.e. non-null). +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ true_unless_null = _clone_signature(is_valid) -""" -Return true if non-null, else return null. -For each input value, emit true iff the value -is valid (non-null), otherwise emit null. +# ========================= 2.20 Selecting / multiplexing ========================= +def case_when(cond, /, *cases, memory_pool: lib.MemoryPool | None = None): ... -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -# ========================= 2.20 Selecting / multiplexing ========================= -def case_when(cond, /, *cases, memory_pool: lib.MemoryPool | None = None): - """ - Choose values based on multiple conditions. - - `cond` must be a struct of Boolean values. `cases` can be a mix - of scalar and array arguments (of any type, but all must be the - same type or castable to a common type), with either exactly one - datum per child of `cond`, or one more `cases` than children of - `cond` (in which case we have an "else" value). - - Each row of the output will be the corresponding value of the - first datum in `cases` for which the corresponding child of `cond` - is true, or otherwise the "else" value (if given), or null. - - Essentially, this implements a switch-case or if-else, if-else... statement. - - Parameters - ---------- - cond : Array-like or scalar-like - Argument to compute function. - *cases : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): - """ - Choose values from several arrays. - - For each row, the value of the first argument is used as a 0-based index - into the list of `values` arrays (i.e. index 0 selects the first of the - `values` arrays). The output value is the corresponding value of the - selected argument. 
- - If an index is null, the output will be null. - - Parameters - ---------- - indices : Array-like or scalar-like - Argument to compute function. - *values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): ... + def coalesce( *values: _ScalarOrArrayT, memory_pool: lib.MemoryPool | None = None -) -> _ScalarOrArrayT: - """ - Select the first non-null value. - - Each row of the output will be the value from the first corresponding input - for which the value is not null. If all inputs are null in a row, the output - will be null. - - Parameters - ---------- - *values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _ScalarOrArrayT: ... + fill_null = coalesce -"""Replace each null element in values with a corresponding -element from fill_value. - -If fill_value is scalar-like, then every null element in values -will be replaced with fill_value. If fill_value is array-like, -then the i-th element in values will be replaced with the i-th -element in fill_value. - -The fill_value's type must be the same as that of values, or it -must be able to be implicitly casted to the array's type. - -This is an alias for :func:`coalesce`. - -Parameters ----------- -values : Array, ChunkedArray, or Scalar-like object - Each null element is replaced with the corresponding value - from fill_value. -fill_value : Array, ChunkedArray, or Scalar-like object - If not same type as values, will attempt to cast. - -Returns -------- -result : depends on inputs - Values with all null elements replaced - -Examples --------- ->>> import pyarrow as pa ->>> arr = pa.array([1, 2, None, 3], type=pa.int8()) ->>> fill_value = pa.scalar(5, type=pa.int8()) ->>> arr.fill_null(fill_value) - -[ - 1, - 2, - 5, - 3 -] ->>> arr = pa.array([1, 2, None, 4, None]) ->>> arr.fill_null(pa.array([10, 20, 30, 40, 50])) - -[ - 1, - 2, - 30, - 4, - 50 -] -""" def if_else( cond: ArrayLike | ScalarLike, @@ -4186,25 +1021,8 @@ def if_else( /, *, memory_pool: lib.MemoryPool | None = None, -) -> ArrayLike | ScalarLike: - """ - Choose values based on a condition. - - `cond` must be a Boolean scalar/ array. - `left` or `right` must be of the same type scalar/ array. - `null` values in `cond` will be promoted to the output. - - Parameters - ---------- - cond : Array-like or scalar-like - Argument to compute function. - left : Array-like or scalar-like - Argument to compute function. - right : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> ArrayLike | ScalarLike: ... + # ========================= 2.21 Structural transforms ========================= @@ -4213,21 +1031,8 @@ def list_value_length( /, *, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Array | lib.Int64Array | Expression: - """ - Compute list lengths. - - `lists` must have a list-like type. - For each non-null value in `lists`, its length is emitted. - Null values emit a null in the output. - - Parameters - ---------- - lists : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
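The multiplexing kernels above (`if_else`, `coalesce`, and its `fill_null` alias) are among the most common of this group; a small sketch of how the stubbed signatures are used, not part of the patch:

import pyarrow as pa
import pyarrow.compute as pc

a = pa.array([1, None, 3])
pc.fill_null(a, 0)                                # [1, 0, 3]
pc.coalesce(a, pa.array([10, 20, 30]))            # [1, 20, 3]
pc.if_else(pa.array([True, False, True]), a, 99)  # [1, 99, 3]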
- """ +) -> lib.Int32Array | lib.Int64Array | Expression: ... + def make_struct( *args: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, @@ -4236,29 +1041,8 @@ def make_struct( field_metadata: list[lib.KeyValueMetadata] | None = None, options: MakeStructOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.StructScalar | lib.StructArray | Expression: - """ - Wrap Arrays into a StructArray. - - Names of the StructArray's fields are - specified through MakeStructOptions. - - Parameters - ---------- - *args : Array-like or scalar-like - Argument to compute function. - field_names : sequence of str - Names of the struct fields to create. - field_nullability : sequence of bool, optional - Nullability information for each struct field. - If omitted, all fields are nullable. - field_metadata : sequence of KeyValueMetadata, optional - Metadata for each struct field. - options : pyarrow.compute.MakeStructOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.StructScalar | lib.StructArray | Expression: ... + # ========================= 2.22 Conversions ========================= def ceil_temporal( @@ -4284,163 +1068,11 @@ def ceil_temporal( calendar_based_origin: bool = False, options: RoundTemporalOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _TemporalScalarT | _TemporalArrayT | Expression: - """ - Round temporal values up to nearest multiple of specified time unit. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - timestamps : Array-like or scalar-like - Argument to compute function. - multiple : int, default 1 - Number of units to round to. - unit : str, default "day" - The unit in which `multiple` is expressed. - Accepted values are "year", "quarter", "month", "week", "day", - "hour", "minute", "second", "millisecond", "microsecond", - "nanosecond". - week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. - ceil_is_strictly_greater : bool, default False - If True, ceil returns a rounded value that is strictly greater than the - input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would - yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 - if set to False. - This applies to the ceil_temporal function only. - calendar_based_origin : bool, default False - By default, the origin is 1970-01-01T00:00:00. By setting this to True, - rounding origin will be beginning of one less precise calendar unit. - E.g.: rounding to hours will use beginning of day as origin. - - By default time is rounded to a multiple of units since - 1970-01-01T00:00:00. By setting calendar_based_origin to true, - time will be rounded to number of units since the last greater - calendar unit. - For example: rounding to multiple of days since the beginning of the - month or to hours since the beginning of the day. - Exceptions: week and quarter are not used as greater units, - therefore days will be rounded to the beginning of the month not - week. Greater unit of week is a year. - Note that ceiling and rounding might change sorting order of an array - near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to - 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to - YYYY-mm-dd 20:00:00. 
On the other hand YYYY-mm-dd+1 00:00:00 will - ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the - order of an already ordered array. - options : pyarrow.compute.RoundTemporalOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _TemporalScalarT | _TemporalArrayT | Expression: ... + floor_temporal = _clone_signature(ceil_temporal) -""" -Round temporal values down to nearest multiple of specified time unit. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -timestamps : Array-like or scalar-like - Argument to compute function. -multiple : int, default 1 - Number of units to round to. -unit : str, default "day" - The unit in which `multiple` is expressed. - Accepted values are "year", "quarter", "month", "week", "day", - "hour", "minute", "second", "millisecond", "microsecond", - "nanosecond". -week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. -ceil_is_strictly_greater : bool, default False - If True, ceil returns a rounded value that is strictly greater than the - input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would - yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 - if set to False. - This applies to the ceil_temporal function only. -calendar_based_origin : bool, default False - By default, the origin is 1970-01-01T00:00:00. By setting this to True, - rounding origin will be beginning of one less precise calendar unit. - E.g.: rounding to hours will use beginning of day as origin. - - By default time is rounded to a multiple of units since - 1970-01-01T00:00:00. By setting calendar_based_origin to true, - time will be rounded to number of units since the last greater - calendar unit. - For example: rounding to multiple of days since the beginning of the - month or to hours since the beginning of the day. - Exceptions: week and quarter are not used as greater units, - therefore days will be rounded to the beginning of the month not - week. Greater unit of week is a year. - Note that ceiling and rounding might change sorting order of an array - near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to - 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to - YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will - ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the - order of an already ordered array. -options : pyarrow.compute.RoundTemporalOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" round_temporal = _clone_signature(ceil_temporal) -""" -Round temporal values to the nearest multiple of specified time unit. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -timestamps : Array-like or scalar-like - Argument to compute function. -multiple : int, default 1 - Number of units to round to. -unit : str, default "day" - The unit in which `multiple` is expressed. - Accepted values are "year", "quarter", "month", "week", "day", - "hour", "minute", "second", "millisecond", "microsecond", - "nanosecond". 
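`ceil_temporal`, `floor_temporal` and `round_temporal` share one signature; the `multiple`/`unit` pair controls the rounding grid, e.g. (illustrative, not part of the diff):

import pyarrow as pa
import pyarrow.compute as pc

ts = pc.strptime(pa.array(["2024-03-10 13:47:12"]),
                 format="%Y-%m-%d %H:%M:%S", unit="s")
pc.floor_temporal(ts, multiple=15, unit="minute")   # 2024-03-10 13:45:00
pc.ceil_temporal(ts, unit="hour")                   # 2024-03-10 14:00:00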
-week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. -ceil_is_strictly_greater : bool, default False - If True, ceil returns a rounded value that is strictly greater than the - input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would - yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 - if set to False. - This applies to the ceil_temporal function only. -calendar_based_origin : bool, default False - By default, the origin is 1970-01-01T00:00:00. By setting this to True, - rounding origin will be beginning of one less precise calendar unit. - E.g.: rounding to hours will use beginning of day as origin. - - By default time is rounded to a multiple of units since - 1970-01-01T00:00:00. By setting calendar_based_origin to true, - time will be rounded to number of units since the last greater - calendar unit. - For example: rounding to multiple of days since the beginning of the - month or to hours since the beginning of the day. - Exceptions: week and quarter are not used as greater units, - therefore days will be rounded to the beginning of the month not - week. Greater unit of week is a year. - Note that ceiling and rounding might change sorting order of an array - near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to - 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to - YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will - ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the - order of an already ordered array. -options : pyarrow.compute.RoundTemporalOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def cast( arr: lib.Scalar | lib.Array | lib.ChunkedArray, @@ -4448,60 +1080,8 @@ def cast( safe: bool | None = None, options: CastOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Scalar[_DataTypeT] | lib.Array[lib.Scalar[_DataTypeT]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]]: - """ - Cast array values to another data type. Can also be invoked as an array - instance method. - - Parameters - ---------- - arr : Array-like - target_type : DataType or str - Type to cast to - safe : bool, default True - Check for overflows or other unsafe conversions - options : CastOptions, default None - Additional checks pass by CastOptions - memory_pool : MemoryPool, optional - memory pool to use for allocations during function execution. - - Examples - -------- - >>> from datetime import datetime - >>> import pyarrow as pa - >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)]) - >>> arr.type - TimestampType(timestamp[us]) - - You can use ``pyarrow.DataType`` objects to specify the target type: - - >>> cast(arr, pa.timestamp("ms")) - - [ - 2010-01-01 00:00:00.000, - 2015-01-01 00:00:00.000 - ] - - >>> cast(arr, pa.timestamp("ms")).type - TimestampType(timestamp[ms]) - - Alternatively, it is also supported to use the string aliases for these - types: - - >>> arr.cast("timestamp[ms]") - - [ - 2010-01-01 00:00:00.000, - 2015-01-01 00:00:00.000 - ] - >>> arr.cast("timestamp[ms]").type - TimestampType(timestamp[ms]) - - Returns - ------- - casted : Array - The cast result as a new Array - """ +) -> lib.Scalar[_DataTypeT] | lib.Array[lib.Scalar[_DataTypeT]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... 
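The removed `cast` docstring carried the doctest above; the call pattern itself is unchanged and can still be sketched as follows (not part of the patch):

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, 2, 3])
pc.cast(arr, pa.float64())   # DoubleArray [1.0, 2.0, 3.0]
arr.cast("int16")            # string-alias form of the same cast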
+ def strftime( timestamps: TemporalScalar | TemporalArray | Expression, @@ -4511,34 +1091,8 @@ def strftime( *, options: StrftimeOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.StringScalar | lib.StringArray | Expression: - """ - Format temporal values according to a format string. - - For each input value, emit a formatted string. - The time format string and locale can be set using StrftimeOptions. - The output precision of the "%S" (seconds) format code depends on - the input time precision: it is an integer for timestamps with - second precision, a real number with the required number of fractional - digits for higher precisions. - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database, or if the specified locale - does not exist on this system. - - Parameters - ---------- - timestamps : Array-like or scalar-like - Argument to compute function. - format : str, default "%Y-%m-%dT%H:%M:%S" - Pattern for formatting input values. - locale : str, default "C" - Locale to use for locale-specific format specifiers. - options : pyarrow.compute.StrftimeOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.StringScalar | lib.StringArray | Expression: ... + def strptime( strings: StringScalar | StringArray | Expression, @@ -4549,53 +1103,14 @@ def strptime( *, options: StrptimeOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.TimestampScalar | lib.TimestampArray | Expression: - """ - Parse timestamps. - - For each string in `strings`, parse it as a timestamp. - The timestamp unit and the expected string pattern must be given - in StrptimeOptions. Null inputs emit null. If a non-null string - fails parsing, an error is returned by default. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - format : str - Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". - Note that the semantics of the format follow the C/C++ strptime, not the Python one. - There are differences in behavior, for example how the "%y" placeholder - handles years with less than four digits. - unit : str - Timestamp unit of the output. - Accepted values are "s", "ms", "us", "ns". - error_is_null : boolean, default False - Return null on parsing errors if true or raise if false. - options : pyarrow.compute.StrptimeOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.TimestampScalar | lib.TimestampArray | Expression: ... + # ========================= 2.23 Temporal component extraction ========================= def day( values: TemporalScalar | TemporalArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Scalar | lib.Int64Array | Expression: - """ - Extract day number. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... 
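`strptime` and `strftime`, stubbed above, are rough inverses; a brief sketch (illustrative; the format string follows C strptime semantics, as the removed docstring noted):

import pyarrow as pa
import pyarrow.compute as pc

ts = pc.strptime(pa.array(["2024/01/15"]), format="%Y/%m/%d", unit="s")
pc.strftime(ts, format="%d %B %Y")   # rendered with the default "C" locale
pc.day(ts)                           # Int64Array: [15]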
+ def day_of_week( values: TemporalScalar | TemporalArray | Expression, @@ -4605,50 +1120,10 @@ def day_of_week( week_start: int = 1, options: DayOfWeekOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar | lib.Int64Array | Expression: - """ - Extract day of the week number. - - By default, the week starts on Monday represented by 0 and ends on Sunday - represented by 6. - `DayOfWeekOptions.week_start` can be used to set another starting day using - the ISO numbering convention (1=start week on Monday, 7=start week on Sunday). - Day numbers can start at 0 or 1 based on `DayOfWeekOptions.count_from_zero`. - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - count_from_zero : bool, default True - If True, number days from 0, otherwise from 1. - week_start : int, default 1 - Which day does the week start with (Monday=1, Sunday=7). - How this value is numbered is unaffected by `count_from_zero`. - options : pyarrow.compute.DayOfWeekOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + day_of_year = _clone_signature(day) -""" -Extract day of year number. - -January 1st maps to day number 1, February 1st to 32, etc. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def hour( values: lib.TimestampScalar[Any] | lib.Time32Scalar[Any] | lib.Time64Scalar[Any] @@ -4659,80 +1134,22 @@ def hour( /, *, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar | lib.Int64Array | Expression: - """ - Extract hour value. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + def is_dst( values: lib.TimestampScalar | lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar] | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar | lib.BooleanArray | Expression: - """ - Extracts if currently observing daylight savings. - - IsDaylightSavings returns true if a timestamp has a daylight saving - offset in the given timezone. - Null values emit null. - An error is returned if the values do not have a defined timezone. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... 
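For the component-extraction kernels in this hunk the defaults already follow ISO conventions; for instance (a hedged example, not part of the change):

import pyarrow as pa
import pyarrow.compute as pc

ts = pc.strptime(pa.array(["2024-03-10 13:45:00"]),
                 format="%Y-%m-%d %H:%M:%S", unit="s")
pc.day_of_week(ts)                         # 0 = Monday ... 6 = Sunday by default
pc.day_of_week(ts, count_from_zero=False)  # 1 = Monday ... 7 = Sunday
pc.hour(ts)                                # [13]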
+ def iso_week( values: lib.TimestampScalar | lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]] | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Scalar | lib.Int64Array | Expression: - """ - Extract ISO week of year number. - - First ISO week has the majority (4 or more) of its days in January. - ISO week starts on Monday. The week number starts with 1 and can run - up to 53. - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + iso_year = _clone_signature(iso_week) -""" -Extract ISO year number. - -First week of an ISO year has the majority (4 or more) of its days in January. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def is_leap_year( values: lib.TimestampScalar[Any] | lib.Date32Scalar | lib.Date64Scalar | lib.TimestampArray @@ -4744,199 +1161,20 @@ def is_leap_year( /, *, memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar | lib.BooleanArray | Expression: - """ - Extract if year is a leap year. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + microsecond = _clone_signature(iso_week) -""" -Extract microsecond values. - -Microsecond returns number of microseconds since the last full millisecond. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" millisecond = _clone_signature(iso_week) -""" -Extract millisecond values. - -Millisecond returns number of milliseconds since the last full second. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" minute = _clone_signature(iso_week) -""" -Extract minute values. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" month = _clone_signature(day_of_week) -""" -Extract month number. 
- -Month is encoded as January=1, December=12. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" nanosecond = _clone_signature(hour) -""" -Extract nanosecond values. - -Nanosecond returns number of nanoseconds since the last full microsecond. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" quarter = _clone_signature(day_of_week) -""" -Extract quarter of year number. - -First quarter maps to 1 and forth quarter maps to 4. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" second = _clone_signature(hour) -""" -Extract second values. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" subsecond = _clone_signature(hour) -""" -Extract subsecond values. - -Subsecond returns the fraction of a second since the last full second. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" us_week = _clone_signature(iso_week) -""" -Extract US week of year number. - -First US week has the majority (4 or more) of its days in January. -US week starts on Monday. The week number starts with 1 and can run -up to 53. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" us_year = _clone_signature(iso_week) -""" -Extract US epidemiological year number. - -First week of US epidemiological year has the majority (4 or more) of -it's days in January. Last week of US epidemiological year has the -year's last Wednesday in it. US epidemiological week starts on Sunday. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" year = _clone_signature(iso_week) -""" -Extract year number. - -Null values emit null. 
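All of the `_clone_signature` aliases above (month, quarter, iso_week, us_week, year, and friends) take only the values plus an optional memory pool, so at runtime they read like this sketch (illustrative only):

import pyarrow as pa
import pyarrow.compute as pc

ts = pc.strptime(pa.array(["2024-03-10"]), format="%Y-%m-%d", unit="s")
pc.year(ts), pc.month(ts), pc.quarter(ts), pc.iso_week(ts)
# -> Int64Arrays [2024], [3], [1], [10]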
-An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" def week( values: lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] | Expression, @@ -4947,274 +1185,39 @@ def week( first_week_is_fully_in_year: bool = False, options: WeekOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar | lib.Int64Array | Expression: - """ - Extract week of year number. - - First week has the majority (4 or more) of its days in January. - Year can have 52 or 53 weeks. Week numbering can start with 0 or 1 using - DayOfWeekOptions.count_from_zero. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. - count_from_zero : bool, default False - If True, dates at the start of a year that fall into the last week - of the previous year emit 0. - If False, they emit 52 or 53 (the week number of the last week - of the previous year). - first_week_is_fully_in_year : bool, default False - If True, week number 0 is fully in January. - If False, a week that begins on December 29, 30 or 31 is considered - to be week number 0 of the following year. - options : pyarrow.compute.WeekOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + def year_month_day( values: TemporalScalar | TemporalArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.StructScalar | lib.StructArray | Expression: - """ - Extract (year, month, day) struct. - - Null values emit null. - An error is returned in the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.StructScalar | lib.StructArray | Expression: ... + # ========================= 2.24 Temporal difference ========================= -def day_time_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): - """ - Compute the number of days and milliseconds between two timestamps. - - Returns the number of days and milliseconds from `start` to `end`. - That is, first the difference in days is computed as if both - timestamps were truncated to the day, then the difference between time times - of the two timestamps is computed as if both times were truncated to the - millisecond. - Null values return null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +def day_time_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): ... 
+ def days_between( start, end, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Scalar | lib.Int64Array: - """ - Compute the number of days between two timestamps. - - Returns the number of day boundaries crossed from `start` to `end`. - That is, the difference is calculated as if the timestamps were - truncated to the day. - Null values emit null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.Int64Scalar | lib.Int64Array: ... + hours_between = _clone_signature(days_between) -""" -Compute the number of hours between two timestamps. - -Returns the number of hour boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the hour. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" microseconds_between = _clone_signature(days_between) -""" -Compute the number of microseconds between two timestamps. - -Returns the number of microsecond boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the microsecond. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" milliseconds_between = _clone_signature(days_between) -""" -Compute the number of millisecond boundaries between two timestamps. - -Returns the number of millisecond boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the millisecond. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" minutes_between = _clone_signature(days_between) -""" -Compute the number of millisecond boundaries between two timestamps. - -Returns the number of millisecond boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the millisecond. -Null values emit null. -In [152]: print(pc.minutes_between.__doc__) -Compute the number of minute boundaries between two timestamps. - -Returns the number of minute boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the minute. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" def month_day_nano_interval_between( start, end, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray: - """ - Compute the number of months, days and nanoseconds between two timestamps. - - Returns the number of months, days, and nanoseconds from `start` to `end`. - That is, first the difference in months is computed as if both timestamps - were truncated to the months, then the difference between the days - is computed, and finally the difference between the times of the two - timestamps is computed as if both times were truncated to the nanosecond. - Null values return null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def month_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): - """ - Compute the number of months between two timestamps. - - Returns the number of month boundaries crossed from `start` to `end`. - That is, the difference is calculated as if the timestamps were - truncated to the month. - Null values emit null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray: ... + + +def month_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): ... + nanoseconds_between = _clone_signature(days_between) -""" -Compute the number of nanoseconds between two timestamps. - -Returns the number of nanosecond boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the nanosecond. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" quarters_between = _clone_signature(days_between) -""" -Compute the number of quarters between two timestamps. - -Returns the number of quarter start boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the quarter. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" seconds_between = _clone_signature(days_between) -""" -Compute the number of seconds between two timestamps. - -Returns the number of second boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the second. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" def weeks_between( start, @@ -5225,50 +1228,10 @@ def weeks_between( week_start: int = 1, options: DayOfWeekOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar | lib.Int64Array: - """ - Compute the number of weeks between two timestamps. - - Returns the number of week boundaries crossed from `start` to `end`. - That is, the difference is calculated as if the timestamps were - truncated to the week. - Null values emit null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - count_from_zero : bool, default True - If True, number days from 0, otherwise from 1. - week_start : int, default 1 - Which day does the week start with (Monday=1, Sunday=7). - How this value is numbered is unaffected by `count_from_zero`. - options : pyarrow.compute.DayOfWeekOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.Int64Scalar | lib.Int64Array: ... + years_between = _clone_signature(days_between) -""" -Compute the number of years between two timestamps. - -Returns the number of year boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the year. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" # ========================= 2.25 Timezone handling ========================= def assume_timezone( @@ -5280,58 +1243,14 @@ def assume_timezone( nonexistent: Literal["raise", "earliest", "latest"] = "raise", options: AssumeTimezoneOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] | Expression: - """ - Convert naive timestamp to timezone-aware timestamp. - - Input timestamps are assumed to be relative to the timezone given in the - `timezone` option. They are converted to UTC-relative timestamps and - the output type has its timezone set to the value of the `timezone` - option. Null values emit null. - This function is meant to be used when an external system produces - "timezone-naive" timestamps which need to be converted to - "timezone-aware" timestamps. An error is returned if the timestamps - already have a defined timezone. - - Parameters - ---------- - timestamps : Array-like or scalar-like - Argument to compute function. - timezone : str - Timezone to assume for the input. - ambiguous : str, default "raise" - How to handle timestamps that are ambiguous in the assumed timezone. - Accepted values are "raise", "earliest", "latest". - nonexistent : str, default "raise" - How to handle timestamps that don't exist in the assumed timezone. - Accepted values are "raise", "earliest", "latest". - options : pyarrow.compute.AssumeTimezoneOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] | Expression: ... 
+ def local_timestamp( timestamps: lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.TimestampScalar | lib.TimestampArray | Expression: - """ - Convert timestamp to a timezone-naive local time timestamp. - - LocalTimestamp converts timezone-aware timestamp to local timestamp - of the given timestamp's timezone and removes timezone metadata. - Alternative name for this timestamp is also wall clock time. - If input is in UTC or without timezone, then unchanged input values - without timezone metadata are returned. - Null values emit null. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.TimestampScalar | lib.TimestampArray | Expression: ... + # ========================= 2.26 Random number generation ========================= def random( @@ -5340,28 +1259,8 @@ def random( initializer: Literal["system"] | int = "system", options: RandomOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: - """ - Generate numbers in the range [0, 1). - - Generated values are uniformly-distributed, double-precision - in range [0, 1). Algorithm and seed can be changed via RandomOptions. - - Parameters - ---------- - n : int - Number of values to generate, must be greater than or equal to 0 - initializer : int or str - How to initialize the underlying random generator. - If an integer is given, it is used as a seed. - If "system" is given, the random generator is initialized with - a system-specific source of (hopefully true) randomness. - Other values are invalid. - options : pyarrow.compute.RandomOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.DoubleArray: ... + # ========================= 3. Array-wise (“vector”) functions ========================= @@ -5374,168 +1273,15 @@ def cumulative_sum( skip_nulls: bool = False, options: CumulativeSumOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT | Expression: - """ - Compute the cumulative sum over a numeric input. - - `values` must be numeric. Return an array/chunked array which is the - cumulative sum computed over `values`. Results will wrap around on - integer overflow. Use function "cumulative_sum_checked" if you want - overflow to return an error. The default start is 0. - - Parameters - ---------- - values : Array-like - Argument to compute function. - start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. - skip_nulls : bool, default False - When false, the first encountered null is propagated. - options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _NumericArrayT | Expression: ... + cumulative_sum_checked = _clone_signature(cumulative_sum) -""" -Compute the cumulative sum over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative sum computed over `values`. This function returns an error -on overflow. 
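Section 3 opens with the cumulative kernels; together with `random` above, and the checked/min/max clones that follow, they can be sketched as (illustrative only):

import pyarrow as pa
import pyarrow.compute as pc

pc.random(3, initializer=42)            # 3 reproducible doubles in [0, 1)
pc.cumulative_sum(pa.array([1, 2, 3]))  # [1, 3, 6]
pc.cumulative_max(pa.array([2, 1, 5]))  # [2, 2, 5]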
For a variant that doesn't fail on overflow, use -function "cumulative_sum". The default start is 0. - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" cumulative_prod = _clone_signature(cumulative_sum) -""" -Compute the cumulative product over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative product computed over `values`. Results will wrap around on -integer overflow. Use function "cumulative_prod_checked" if you want -overflow to return an error. The default start is 1. - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" cumulative_prod_checked = _clone_signature(cumulative_sum) -""" -Compute the cumulative product over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative product computed over `values`. This function returns an error -on overflow. For a variant that doesn't fail on overflow, use -function "cumulative_prod". The default start is 1. - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" cumulative_max = _clone_signature(cumulative_sum) -""" -Compute the cumulative max over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative max computed over `values`. The default start is the minimum -value of input type (so that any other value will replace the -start as the new maximum). - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" cumulative_min = _clone_signature(cumulative_sum) -""" -Compute the cumulative min over a numeric input. - -`values` must be numeric. 
Return an array/chunked array which is the -cumulative min computed over `values`. The default start is the maximum -value of input type (so that any other value will replace the -start as the new minimum). - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" cumulative_mean = _clone_signature(cumulative_sum) -""" -Compute the cumulative max over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative max computed over `values`. The default start is the minimum -value of input type (so that any other value will replace the -start as the new maximum). - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" # ========================= 3.2 Associative transforms ========================= def dictionary_encode( @@ -5615,45 +1361,6 @@ def drop_null( filter = array_filter take = array_take -""" -Select values (or records) from array- or table-like data given integer -selection indices. - -The result will be of the same type(s) as the input, with elements taken -from the input array (or record batch / table fields) at the given -indices. If an index is null then the corresponding value in the output -will be null. - -Parameters ----------- -data : Array, ChunkedArray, RecordBatch, or Table -indices : Array, ChunkedArray - Must be of integer type -boundscheck : boolean, default True - Whether to boundscheck the indices. If False and there is an out of - bounds index, will likely cause the process to crash. -memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - -Returns -------- -result : depends on inputs - Selected values for the given indices - -Examples --------- ->>> import pyarrow as pa ->>> arr = pa.array(["a", "b", "c", None, "e", "f"]) ->>> indices = pa.array([0, None, 4, 3]) ->>> arr.take(indices) - -[ - "a", - null, - "e", - null -] -""" # ========================= 3.4 Containment tests ========================= def indices_nonzero( @@ -5665,20 +1372,8 @@ def indices_nonzero( /, *, memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array | Expression: - """ - Return the indices of the values in the array that are non-zero. - - For each input value, check if it's zero, false or null. Emit the index - of the value in the array if it's none of the those. +) -> lib.UInt64Array | Expression: ... - Parameters - ---------- - values : Array-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
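`take`, `filter` and `indices_nonzero` from the selection hunks above behave as the removed doctest described; a compact sketch (not part of the patch):

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(["a", "b", "c", None])
pc.take(arr, pa.array([0, None, 2]))                  # ["a", null, "c"]
pc.filter(arr, pa.array([True, False, True, False]))  # ["a", "c"]
pc.indices_nonzero(pa.array([0, 5, 0, 7]))            # UInt64Array: [1, 3]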
- """ # ========================= 3.5 Sorts and partitions ========================= def array_sort_indices( @@ -5689,33 +1384,8 @@ def array_sort_indices( null_placement: _Placement = "at_end", options: ArraySortOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array | Expression: - """ - Return the indices that would sort an array. - - This function computes an array of indices that define a stable sort - of the input array. By default, Null values are considered greater - than any other value and are therefore sorted at the end of the array. - For floating-point types, NaNs are considered greater than any - other non-null value, but smaller than null values. - - The handling of nulls and NaNs can be changed in ArraySortOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - order : str, default "ascending" - Which order to sort values in. - Accepted values are "ascending", "descending". - null_placement : str, default "at_end" - Where nulls in the input should be sorted. - Accepted values are "at_start", "at_end". - options : pyarrow.compute.ArraySortOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.UInt64Array | Expression: ... + def partition_nth_indices( array: lib.Array | lib.ChunkedArray | Expression, @@ -5725,39 +1395,8 @@ def partition_nth_indices( null_placement: _Placement = "at_end", options: PartitionNthOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array | Expression: - """ - Return the indices that would partition an array around a pivot. - - This functions computes an array of indices that define a non-stable - partial sort of the input array. - - The output is such that the `N`'th index points to the `N`'th element - of the input in sorted order, and all indices before the `N`'th point - to elements in the input less or equal to elements at or after the `N`'th. - - By default, null values are considered greater than any other value - and are therefore partitioned towards the end of the array. - For floating-point types, NaNs are considered greater than any - other non-null value, but smaller than null values. - - The pivot index `N` must be given in PartitionNthOptions. - The handling of nulls and NaNs can also be changed in PartitionNthOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - pivot : int - Index into the equivalent sorted array of the pivot element. - null_placement : str, default "at_end" - Where nulls in the input should be partitioned. - Accepted values are "at_start", "at_end". - options : pyarrow.compute.PartitionNthOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.UInt64Array | Expression: ... + def rank( input: lib.Array | lib.ChunkedArray, @@ -5768,49 +1407,8 @@ def rank( tiebreaker: Literal["min", "max", "first", "dense"] = "first", options: RankOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: - """ - Compute ordinal ranks of an array (1-based). - - This function computes a rank of the input array. - By default, null values are considered greater than any other value and - are therefore sorted at the end of the input. 
For floating-point types, - NaNs are considered greater than any other non-null value, but smaller - than null values. The default tiebreaker is to assign ranks in order of - when ties appear in the input. - - The handling of nulls, NaNs and tiebreakers can be changed in RankOptions. - - Parameters - ---------- - input : Array-like or scalar-like - Argument to compute function. - sort_keys : sequence of (name, order) tuples or str, default "ascending" - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - Alternatively, one can simply pass "ascending" or "descending" as a string - if the input is array-like. - null_placement : str, default "at_end" - Where nulls in input should be sorted. - Accepted values are "at_start", "at_end". - tiebreaker : str, default "first" - Configure how ties between equal values are handled. - Accepted values are: - - - "min": Ties get the smallest possible rank in sorted order. - - "max": Ties get the largest possible rank in sorted order. - - "first": Ranks are assigned in order of when ties appear in the - input. This ensures the ranks are a stable permutation - of the input. - - "dense": The ranks span a dense [1, M] interval where M is the - number of distinct values in the input. - options : pyarrow.compute.RankOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.UInt64Array: ... + def select_k_unstable( input: lib.Array | lib.ChunkedArray | Expression, @@ -5820,35 +1418,8 @@ def select_k_unstable( *, options: SelectKOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array | Expression: - """ - Select the indices of the first `k` ordered elements from the input. - - This function selects an array of indices of the first `k` ordered elements - from the `input` array, record batch or table specified in the column keys - (`options.sort_keys`). Output is not guaranteed to be stable. - Null values are considered greater than any other value and are - therefore ordered at the end. For floating-point types, NaNs are considered - greater than any other non-null value, but smaller than null values. - - Parameters - ---------- - input : Array-like or scalar-like - Argument to compute function. - k : int - Number of leading values to select in sorted order - (i.e. the largest values if sort order is "descending", - the smallest otherwise). - sort_keys : sequence of (name, order) tuples - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - options : pyarrow.compute.SelectKOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.UInt64Array | Expression: ... + def sort_indices( input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table | Expression, @@ -5858,58 +1429,15 @@ def sort_indices( null_placement: _Placement = "at_end", options: SortOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array | Expression: - """ - Return the indices that would sort an array, record batch or table. 
- - This function computes an array of indices that define a stable sort - of the input array, record batch or table. By default, null values are - considered greater than any other value and are therefore sorted at the - end of the input. For floating-point types, NaNs are considered greater - than any other non-null value, but smaller than null values. - - The handling of nulls and NaNs can be changed in SortOptions. - - Parameters - ---------- - input : Array-like or scalar-like - Argument to compute function. - sort_keys : sequence of (name, order) tuples - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - null_placement : str, default "at_end" - Where nulls in input should be sorted, only applying to - columns/fields mentioned in `sort_keys`. - Accepted values are "at_start", "at_end". - options : pyarrow.compute.SortOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.UInt64Array | Expression: ... + # ========================= 3.6 Structural transforms ========================= def list_element( lists: lib.Array[ListScalar[_DataTypeT]] | lib.ChunkedArray[ListScalar[_DataTypeT]] | ListScalar[_DataTypeT] | Expression, index: ScalarLike, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Array[lib.Scalar[_DataTypeT]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]] | _DataTypeT | Expression: - """ - Compute elements using of nested list values using an index. - - `lists` must have a list-like type. - For each value in each list of `lists`, the element at `index` - is emitted. Null values emit a null in the output. - - Parameters - ---------- - lists : Array-like or scalar-like - Argument to compute function. - index : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.Array[lib.Scalar[_DataTypeT]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]] | _DataTypeT | Expression: ... + def list_flatten( lists: ArrayOrChunkedArray[ListScalar[Any]] | Expression, @@ -5918,49 +1446,13 @@ def list_flatten( *, options: ListFlattenOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[Any] | Expression: - """ - Flatten list values. - - `lists` must have a list-like type (lists, list-views, and - fixed-size lists). - Return an array with the top list level flattened unless - `recursive` is set to true in ListFlattenOptions. When that - is that case, flattening happens recursively until a non-list - array is formed. - - Null list values do not emit anything to the output. - - Parameters - ---------- - lists : Array-like - Argument to compute function. - recursive : bool, default False - When True, the list array is flattened recursively until an array - of non-list values is formed. - options : pyarrow.compute.ListFlattenOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.ListArray[Any] | Expression: ... + def list_parent_indices( lists: ArrayOrChunkedArray[Any] | Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Array | Expression: - """ - Compute parent indices of nested list values. 
- - `lists` must have a list-like or list-view type. - For each value in each list of `lists`, the top-level list index - is emitted. - - Parameters - ---------- - lists : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.Int64Array | Expression: ... + def list_slice( lists: ArrayOrChunkedArray[Any] | Expression, @@ -5972,35 +1464,8 @@ def list_slice( *, options: ListSliceOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[Any] | Expression: - """ - Compute slice of list-like array. - - `lists` must have a list-like type. - For each list element, compute a slice, returning a new list array. - A variable or fixed size list array is returned, depending on options. - - Parameters - ---------- - lists : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing inner list elements (inclusive). - stop : Optional[int], default None - If given, index to stop slicing at (exclusive). - If not given, slicing will stop at the end. (NotImplemented) - step : int, default 1 - Slice step. - return_fixed_size_list : Optional[bool], default None - Whether to return a FixedSizeListArray. If true _and_ stop is after - a list element's length, nulls will be appended to create the - requested slice size. The default of `None` will return the same - type which was passed in. - options : pyarrow.compute.ListSliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> lib.ListArray[Any] | Expression: ... + def map_lookup( container, @@ -6010,28 +1475,8 @@ def map_lookup( *, options: MapLookupOptions | None = None, memory_pool: lib.MemoryPool | None = None, -): - """ - Find the items corresponding to a given key in a Map. - - For a given query key (passed via MapLookupOptions), extract - either the FIRST, LAST or ALL items from a Map that have - matching keys. - - Parameters - ---------- - container : Array-like or scalar-like - Argument to compute function. - query_key : Scalar or Object can be converted to Scalar - The key to search for. - occurrence : str - The occurrence(s) to return from the Map - Accepted values are "first", "last", or "all". - options : pyarrow.compute.MapLookupOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +): ... + def struct_field( values, @@ -6040,62 +1485,14 @@ def struct_field( *, options: StructFieldOptions | None = None, memory_pool: lib.MemoryPool | None = None, -): - """ - Extract children of a struct or union by index. - - Given a list of indices (passed via StructFieldOptions), extract - the child array or scalar with the given child index, recursively. - - For union inputs, nulls are emitted for union values that reference - a different child than specified. Also, the indices are always - in physical order, not logical type codes - for example, the first - child is always index 0. - - An empty list of indices returns the argument unchanged. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. 
- indices : List[str], List[bytes], List[int], Expression, bytes, str, or int - List of indices for chained field lookup, for example `[4, 1]` - will look up the second nested field in the fifth outer field. - options : pyarrow.compute.StructFieldOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def fill_null_backward(values, /, *, memory_pool: lib.MemoryPool | None = None): - """ - Carry non-null values backward to fill null slots. - - Given an array, propagate next valid observation backward to previous valid - or nothing if all next values are null. - - Parameters - ---------- - values : Array-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def fill_null_forward(values, /, *, memory_pool: lib.MemoryPool | None = None): - """ - Carry non-null values forward to fill null slots. - - Given an array, propagate last valid observation forward to next valid - or nothing if all previous values are null. - - Parameters - ---------- - values : Array-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +): ... + + +def fill_null_backward(values, /, *, memory_pool: lib.MemoryPool | None = None): ... + + +def fill_null_forward(values, /, *, memory_pool: lib.MemoryPool | None = None): ... + def replace_with_mask( values, @@ -6104,28 +1501,8 @@ def replace_with_mask( /, *, memory_pool: lib.MemoryPool | None = None, -): - """ - Replace items selected with a mask. - - Given an array and a boolean mask (either scalar or of equal length), - along with replacement values (either scalar or array), - each element of the array for which the corresponding mask element is - true will be replaced by the next value from the replacements, - or with null if the mask is null. - Hence, for replacement arrays, len(replacements) == sum(mask == true). - - Parameters - ---------- - values : Array-like - Argument to compute function. - mask : Array-like - Argument to compute function. - replacements : Array-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +): ... + # ========================= 3.7 Pairwise functions ========================= def pairwise_diff( @@ -6135,51 +1512,7 @@ def pairwise_diff( *, options: PairwiseOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT | Expression: - """ - Compute first order difference of an array. - - Computes the first order difference of an array, It internally calls - the scalar function "subtract" to compute - differences, so its - behavior and supported types are the same as - "subtract". The period can be specified in :struct:`PairwiseOptions`. - - Results will wrap around on integer overflow. Use function - "pairwise_diff_checked" if you want overflow to return an error. - - Parameters - ---------- - input : Array-like - Argument to compute function. - period : int, default 1 - Period for applying the period function. - options : pyarrow.compute.PairwiseOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ +) -> _NumericOrTemporalArrayT | Expression: ... 
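# Editorial sketch (not part of the patch): how a few of the compute kernels typed
# above are expected to behave, assuming a pyarrow build in which cumulative_sum,
# pairwise_diff and fill_null_forward are all available.
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, 2, 4, 8])
pc.cumulative_sum(arr)                         # [1, 3, 7, 15]; wraps on integer overflow
pc.pairwise_diff(arr)                          # [null, 1, 2, 4]; first slot has no predecessor
pc.fill_null_forward(pa.array([1, None, 3]))   # [1, 1, 3]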
+ pairwise_diff_checked = _clone_signature(pairwise_diff) -""" -Compute first order difference of an array. - -Computes the first order difference of an array, It internally calls -the scalar function "subtract_checked" (or the checked variant) to compute -differences, so its behavior and supported types are the same as -"subtract_checked". The period can be specified in :struct:`PairwiseOptions`. - -This function returns an error on overflow. For a variant that doesn't -fail on overflow, use function "pairwise_diff". - -Parameters ----------- -input : Array-like - Argument to compute function. -period : int, default 1 - Period for applying the period function. -options : pyarrow.compute.PairwiseOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" diff --git a/python/pyarrow-stubs/config.pyi b/python/pyarrow-stubs/config.pyi index 7c2eb8a9c98..62555a506f3 100644 --- a/python/pyarrow-stubs/config.pyi +++ b/python/pyarrow-stubs/config.pyi @@ -17,11 +17,13 @@ from typing import NamedTuple + class VersionInfo(NamedTuple): major: int minor: int patch: int + class BuildInfo(NamedTuple): version: str version_info: VersionInfo @@ -35,17 +37,21 @@ class BuildInfo(NamedTuple): package_kind: str build_type: str + class RuntimeInfo(NamedTuple): simd_level: str detected_simd_level: str + cpp_build_info: BuildInfo cpp_version: str cpp_version_info: VersionInfo + def runtime_info() -> RuntimeInfo: ... def set_timezone_db_path(path: str) -> None: ... + __all__ = [ "VersionInfo", "BuildInfo", diff --git a/python/pyarrow-stubs/dataset.pyi b/python/pyarrow-stubs/dataset.pyi index 6cb7fed43e6..160ed19ee4b 100644 --- a/python/pyarrow-stubs/dataset.pyi +++ b/python/pyarrow-stubs/dataset.pyi @@ -128,10 +128,13 @@ __all__ = [ _DatasetFormat: TypeAlias = Literal["parquet", "ipc", "arrow", "feather", "csv"] + @overload def partitioning( schema: Schema, ) -> Partitioning: ... + + @overload def partitioning( schema: Schema, @@ -139,6 +142,8 @@ def partitioning( flavor: Literal["filename"], dictionaries: dict[str, Array] | None = None, ) -> Partitioning: ... + + @overload def partitioning( schema: Schema, @@ -146,12 +151,16 @@ def partitioning( flavor: Literal["filename"], dictionaries: Literal["infer"], ) -> PartitioningFactory: ... + + @overload def partitioning( field_names: list[str], *, flavor: Literal["filename"], ) -> PartitioningFactory: ... + + @overload def partitioning( schema: Schema, @@ -159,11 +168,15 @@ def partitioning( flavor: Literal["hive"], dictionaries: Literal["infer"], ) -> PartitioningFactory: ... + + @overload def partitioning( *, flavor: Literal["hive"], ) -> PartitioningFactory: ... + + @overload def partitioning( schema: Schema, @@ -171,6 +184,8 @@ def partitioning( flavor: Literal["hive"], dictionaries: dict[str, Array] | None = None, ) -> Partitioning: ... + + def parquet_dataset( metadata_path: StrPath, schema: Schema | None = None, @@ -179,6 +194,8 @@ def parquet_dataset( partitioning: Partitioning | PartitioningFactory | None = None, partition_base_dir: str | None = None, ) -> FileSystemDataset: ... + + @overload def dataset( source: StrPath | Sequence[StrPath], @@ -190,6 +207,8 @@ def dataset( exclude_invalid_files: bool | None = None, ignore_prefixes: list[str] | None = None, ) -> FileSystemDataset: ... 
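# Editorial sketch (not part of the patch): the path-based overload above resolving to
# FileSystemDataset; "data/" is a hypothetical directory of Parquet files and the
# hive-flavored partitioning factory is only one possible argument.
import pyarrow.dataset as ds

dset = ds.dataset("data/", format="parquet",
                  partitioning=ds.partitioning(flavor="hive"))
table = dset.to_table()   # dset should type-check as FileSystemDataset per this overload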
+ + @overload def dataset( source: list[Dataset], @@ -201,6 +220,8 @@ def dataset( exclude_invalid_files: bool | None = None, ignore_prefixes: list[str] | None = None, ) -> UnionDataset: ... + + @overload def dataset( source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, @@ -212,6 +233,8 @@ def dataset( exclude_invalid_files: bool | None = None, ignore_prefixes: list[str] | None = None, ) -> InMemoryDataset: ... + + @overload def dataset( source: RecordBatch | Table, @@ -223,6 +246,8 @@ def dataset( exclude_invalid_files: bool | None = None, ignore_prefixes: list[str] | None = None, ) -> InMemoryDataset: ... + + def write_dataset( data: Dataset | Table | RecordBatch | RecordBatchReader | list[Table] | Iterable[RecordBatch], base_dir: StrPath, @@ -241,6 +266,7 @@ def write_dataset( min_rows_per_group: int = 0, max_rows_per_group: int = 1024 * 1024, file_visitor: Callable[[str], None] | None = None, - existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"] = "error", + existing_data_behavior: Literal["error", + "overwrite_or_ignore", "delete_matching"] = "error", create_dir: bool = True, ): ... diff --git a/python/pyarrow-stubs/device.pyi b/python/pyarrow-stubs/device.pyi index 6c4f1fdeeea..d77fe2504af 100644 --- a/python/pyarrow-stubs/device.pyi +++ b/python/pyarrow-stubs/device.pyi @@ -19,6 +19,7 @@ import enum from pyarrow.lib import _Weakrefable + class DeviceAllocationType(enum.Flag): CPU = enum.auto() CUDA = enum.auto() @@ -35,71 +36,33 @@ class DeviceAllocationType(enum.Flag): WEBGPU = enum.auto() HEXAGON = enum.auto() -class Device(_Weakrefable): - """ - Abstract interface for hardware devices - This object represents a device with access to some memory spaces. - When handling a Buffer or raw memory address, it allows deciding in which - context the raw memory address should be interpreted - (e.g. CPU-accessible memory, or embedded memory on some particular GPU). - """ +class Device(_Weakrefable): @property - def type_name(self) -> str: - """ - A shorthand for this device's type. - """ + def type_name(self) -> str: ... + @property - def device_id(self) -> int: - """ - A device ID to identify this device if there are multiple of this type. + def device_id(self) -> int: ... - If there is no "device_id" equivalent (such as for the main CPU device on - non-numa systems) returns -1. - """ @property - def is_cpu(self) -> bool: - """ - Whether this device is the main CPU device. + def is_cpu(self) -> bool: ... - This shorthand method is very useful when deciding whether a memory address - is CPU-accessible. - """ @property - def device_type(self) -> DeviceAllocationType: - """ - Return the DeviceAllocationType of this device. - """ + def device_type(self) -> DeviceAllocationType: ... -class MemoryManager(_Weakrefable): - """ - An object that provides memory management primitives. - A MemoryManager is always tied to a particular Device instance. - It can also have additional parameters (such as a MemoryPool to - allocate CPU memory). +class MemoryManager(_Weakrefable): - """ @property - def device(self) -> Device: - """ - The device this MemoryManager is tied to. - """ + def device(self) -> Device: ... + @property - def is_cpu(self) -> bool: - """ - Whether this MemoryManager is tied to the main CPU device. + def is_cpu(self) -> bool: ... - This shorthand method is very useful when deciding whether a memory - address is CPU-accessible. - """ -def default_cpu_memory_manager() -> MemoryManager: - """ - Return the default CPU MemoryManager instance. 
+def default_cpu_memory_manager() -> MemoryManager: ... - The returned singleton instance uses the default MemoryPool. - """ -__all__ = ["DeviceAllocationType", "Device", "MemoryManager", "default_cpu_memory_manager"] +__all__ = ["DeviceAllocationType", "Device", + "MemoryManager", "default_cpu_memory_manager"] diff --git a/python/pyarrow-stubs/feather.pyi b/python/pyarrow-stubs/feather.pyi index ce8d83dbcd9..10281e91152 100644 --- a/python/pyarrow-stubs/feather.pyi +++ b/python/pyarrow-stubs/feather.pyi @@ -32,18 +32,25 @@ __all__ = [ "read_table", ] + class FeatherDataset: path_or_paths: str | list[str] validate_schema: bool - def __init__(self, path_or_paths: str | list[str], validate_schema: bool = True) -> None: ... + def __init__(self, path_or_paths: str | + list[str], validate_schema: bool = True) -> None: ... + def read_table(self, columns: list[str] | None = None) -> Table: ... def validate_schemas(self, piece, table: Table) -> None: ... + def read_pandas( self, columns: list[str] | None = None, use_threads: bool = True ) -> pd.DataFrame: ... + def check_chunked_overflow(name: str, col) -> None: ... + + def write_feather( df: pd.DataFrame | Table, dest: StrPath | IO, @@ -52,6 +59,8 @@ def write_feather( chunksize: int | None = None, version: Literal[1, 2] = 2, ) -> None: ... + + def read_feather( source: StrPath | IO, columns: list[str] | None = None, @@ -59,6 +68,8 @@ def read_feather( memory_map: bool = False, **kwargs, ) -> pd.DataFrame: ... + + def read_table( source: StrPath | IO, columns: list[str] | None = None, diff --git a/python/pyarrow-stubs/fs.pyi b/python/pyarrow-stubs/fs.pyi index 6c5a0af8d19..61a557ea428 100644 --- a/python/pyarrow-stubs/fs.pyi +++ b/python/pyarrow-stubs/fs.pyi @@ -45,6 +45,7 @@ from pyarrow._s3fs import ( # noqa FileStats = FileInfo + def copy_files( source: str, destination: str, @@ -55,10 +56,12 @@ def copy_files( use_threads: bool = True, ) -> None: ... + class FSSpecHandler(FileSystemHandler): # type: ignore[misc] fs: SupportedFileSystem def __init__(self, fs: SupportedFileSystem) -> None: ... + __all__ = [ # _fs "FileSelector", diff --git a/python/pyarrow-stubs/interchange/buffer.pyi b/python/pyarrow-stubs/interchange/buffer.pyi index 6890a24030c..e1d8ae949c9 100644 --- a/python/pyarrow-stubs/interchange/buffer.pyi +++ b/python/pyarrow-stubs/interchange/buffer.pyi @@ -19,6 +19,7 @@ import enum from pyarrow.lib import Buffer + class DlpackDeviceType(enum.IntEnum): CPU = 1 CUDA = 2 @@ -29,6 +30,7 @@ class DlpackDeviceType(enum.IntEnum): VPI = 9 ROCM = 10 + class _PyArrowBuffer: def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ... 
@property diff --git a/python/pyarrow-stubs/interchange/column.pyi b/python/pyarrow-stubs/interchange/column.pyi index 970ad3e07be..04861a72b0b 100644 --- a/python/pyarrow-stubs/interchange/column.pyi +++ b/python/pyarrow-stubs/interchange/column.pyi @@ -23,6 +23,7 @@ from pyarrow.lib import Array, ChunkedArray from .buffer import _PyArrowBuffer + class DtypeKind(enum.IntEnum): INT = 0 UINT = 1 @@ -32,8 +33,10 @@ class DtypeKind(enum.IntEnum): DATETIME = 22 CATEGORICAL = 23 + Dtype: TypeAlias = tuple[DtypeKind, int, str, str] + class ColumnNullType(enum.IntEnum): NON_NULLABLE = 0 USE_NAN = 1 @@ -41,26 +44,34 @@ class ColumnNullType(enum.IntEnum): USE_BITMASK = 3 USE_BYTEMASK = 4 + class ColumnBuffers(TypedDict): data: tuple[_PyArrowBuffer, Dtype] validity: tuple[_PyArrowBuffer, Dtype] | None offsets: tuple[_PyArrowBuffer, Dtype] | None + class CategoricalDescription(TypedDict): is_ordered: bool is_dictionary: bool categories: _PyArrowColumn | None + class Endianness(enum.Enum): LITTLE = "<" BIG = ">" NATIVE = "=" NA = "|" -class NoBufferPresent(Exception): ... + +class NoBufferPresent(Exception): + ... + class _PyArrowColumn: - def __init__(self, column: Array | ChunkedArray, allow_copy: bool = True) -> None: ... + def __init__(self, column: Array | ChunkedArray, + allow_copy: bool = True) -> None: ... + def size(self) -> int: ... @property def offset(self) -> int: ... diff --git a/python/pyarrow-stubs/interchange/dataframe.pyi b/python/pyarrow-stubs/interchange/dataframe.pyi index fb97e9a414f..cafbe0fc200 100644 --- a/python/pyarrow-stubs/interchange/dataframe.pyi +++ b/python/pyarrow-stubs/interchange/dataframe.pyi @@ -26,10 +26,12 @@ from typing import Any, Iterable, Sequence from pyarrow.interchange.column import _PyArrowColumn from pyarrow.lib import RecordBatch, Table + class _PyArrowDataFrame: def __init__( self, df: Table | RecordBatch, nan_as_null: bool = False, allow_copy: bool = True ) -> None: ... + def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True ) -> _PyArrowDataFrame: ... diff --git a/python/pyarrow-stubs/interchange/from_dataframe.pyi b/python/pyarrow-stubs/interchange/from_dataframe.pyi index b13d5976337..e7f1c6e91ff 100644 --- a/python/pyarrow-stubs/interchange/from_dataframe.pyi +++ b/python/pyarrow-stubs/interchange/from_dataframe.pyi @@ -26,27 +26,39 @@ from .column import ( DtypeKind, ) + class DataFrameObject(Protocol): - def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> Any: ... + def __dataframe__(self, nan_as_null: bool = False, + allow_copy: bool = True) -> Any: ... + ColumnObject: TypeAlias = Any + def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: ... -def protocol_df_chunk_to_pyarrow(df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: ... + +def protocol_df_chunk_to_pyarrow( + df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: ... + def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... + def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... + def categorical_column_to_dictionary( col: ColumnObject, allow_copy: bool = True ) -> DictionaryArray: ... + def parse_datetime_format_str(format_str: str) -> tuple[str, str]: ... + def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: ... + def buffers_to_array( buffers: ColumnBuffers, data_type: tuple[DtypeKind, int, str, str], @@ -56,6 +68,7 @@ def buffers_to_array( allow_copy: bool = True, ) -> Array: ... 
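# Editorial sketch (not part of the patch): the public interchange entry point typed
# above, assuming a pandas version (>= 1.5) whose DataFrame implements __dataframe__.
import pandas as pd
import pyarrow.interchange as pi

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
table = pi.from_dataframe(df)   # pyarrow.Table built via the interchange protocol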
+ def validity_buffer_from_mask( validity_buff: Buffer, validity_dtype: Dtype, @@ -65,6 +78,7 @@ def validity_buffer_from_mask( allow_copy: bool = True, ) -> Buffer: ... + def validity_buffer_nan_sentinel( data_pa_buffer: Buffer, data_type: Dtype, diff --git a/python/pyarrow-stubs/io.pyi b/python/pyarrow-stubs/io.pyi index 3d630498a1d..ea259f02142 100644 --- a/python/pyarrow-stubs/io.pyi +++ b/python/pyarrow-stubs/io.pyi @@ -40,290 +40,76 @@ from pyarrow.lib import MemoryPool, _Weakrefable from .device import Device, DeviceAllocationType, MemoryManager from ._types import KeyValueMetadata -def have_libhdfs() -> bool: - """ - Return true if HDFS (HadoopFileSystem) library is set up correctly. - """ - -def io_thread_count() -> int: - """ - Return the number of threads to use for I/O operations. - - Many operations, such as scanning a dataset, will implicitly make - use of this pool. The number of threads is set to a fixed value at - startup. It can be modified at runtime by calling - :func:`set_io_thread_count()`. - - See Also - -------- - set_io_thread_count : Modify the size of this pool. - cpu_count : The analogous function for the CPU thread pool. - """ - -def set_io_thread_count(count: int) -> None: - """ - Set the number of threads to use for I/O operations. - - Many operations, such as scanning a dataset, will implicitly make - use of this pool. - - Parameters - ---------- - count : int - The max number of threads that may be used for I/O. - Must be positive. - - See Also - -------- - io_thread_count : Get the size of this pool. - set_cpu_count : The analogous function for the CPU thread pool. - """ +def have_libhdfs() -> bool: ... -Mode: TypeAlias = Literal["rb", "wb", "rb+", "ab"] -class NativeFile(_Weakrefable): - """ - The base class for all Arrow streams. +def io_thread_count() -> int: ... + + +def set_io_thread_count(count: int) -> None: ... - Streams are either readable, writable, or both. - They optionally support seeking. - While this class exposes methods to read or write data from Python, the - primary intent of using a Arrow stream is to pass it to other Arrow - facilities that will make use of it, such as Arrow IPC routines. +Mode: TypeAlias = Literal["rb", "wb", "rb+", "ab"] + +class NativeFile(_Weakrefable): - Be aware that there are subtle differences with regular Python files, - e.g. destroying a writable Arrow stream without closing it explicitly - will not flush any pending data. - """ _default_chunk_size: int def __enter__(self) -> Self: ... def __exit__(self, *args) -> None: ... @property - def mode(self) -> Mode: - """ - The file mode. Currently instances of NativeFile may support: - - * rb: binary read - * wb: binary write - * rb+: binary read and write - * ab: binary append - """ + def mode(self) -> Mode: ... + def readable(self) -> bool: ... def seekable(self) -> bool: ... def isatty(self) -> bool: ... - def fileno(self) -> int: - """ - NOT IMPLEMENTED - """ + def fileno(self) -> int: ... + @property def closed(self) -> bool: ... def close(self) -> None: ... 
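# Editorial sketch (not part of the patch): the NativeFile surface typed in this class,
# exercised through BufferReader, a concrete readable NativeFile.
import pyarrow as pa

f = pa.BufferReader(b"reader data")
f.mode        # 'rb'
f.read(6)     # b'reader'
f.tell()      # 6
f.seek(7)     # 7
f.read()      # b'data'
f.close()
f.closed      # True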
- def size(self) -> int: - """ - Return file size - """ - def metadata(self) -> KeyValueMetadata: - """ - Return file metadata - """ - def tell(self) -> int: - """ - Return current stream position - """ - def seek(self, position: int, whence: int = 0) -> int: - """ - Change current file stream position - - Parameters - ---------- - position : int - Byte offset, interpreted relative to value of whence argument - whence : int, default 0 - Point of reference for seek offset - - Notes - ----- - Values of whence: - * 0 -- start of stream (the default); offset should be zero or positive - * 1 -- current stream position; offset may be negative - * 2 -- end of stream; offset is usually negative - - Returns - ------- - int - The new absolute stream position. - """ - def flush(self) -> None: - """ - Flush the stream, if applicable. - - An error is raised if stream is not writable. - """ - def write(self, data: bytes | SupportPyBuffer) -> int: - """ - Write data to the file. - - Parameters - ---------- - data : bytes-like object or exporter of buffer protocol - - Returns - ------- - int - nbytes: number of bytes written - """ - def read(self, nbytes: int | None = None) -> bytes: - """ - Read and return up to n bytes. - - If *nbytes* is None, then the entire remaining file contents are read. - - Parameters - ---------- - nbytes : int, default None - - Returns - ------- - data : bytes - """ - def get_stream(self, file_offset: int, nbytes: int) -> Self: - """ - Return an input stream that reads a file segment independent of the - state of the file. - - Allows reading portions of a random access file as an input stream - without interfering with each other. - - Parameters - ---------- - file_offset : int - nbytes : int - - Returns - ------- - stream : NativeFile - """ - def read_at(self, nbytes: int, offset: int) -> bytes: - """ - Read indicated number of bytes at offset from the file - - Parameters - ---------- - nbytes : int - offset : int - - Returns - ------- - data : bytes - """ - def read1(self, nbytes: int | None = None) -> bytes: - """ - Read and return up to n bytes. - - Unlike read(), if *nbytes* is None then a chunk is read, not the - entire file. - - Parameters - ---------- - nbytes : int, default None - The maximum number of bytes to read. - - Returns - ------- - data : bytes - """ + def size(self) -> int: ... + + def metadata(self) -> KeyValueMetadata: ... + + def tell(self) -> int: ... + + def seek(self, position: int, whence: int = 0) -> int: ... + + def flush(self) -> None: ... + + def write(self, data: bytes | SupportPyBuffer) -> int: ... + + def read(self, nbytes: int | None = None) -> bytes: ... + + def get_stream(self, file_offset: int, nbytes: int) -> Self: ... + + def read_at(self, nbytes: int, offset: int) -> bytes: ... + + def read1(self, nbytes: int | None = None) -> bytes: ... + def readall(self) -> bytes: ... - def readinto(self, b: SupportPyBuffer) -> int: - """ - Read into the supplied buffer - - Parameters - ---------- - b : buffer-like object - A writable buffer object (such as a bytearray). - - Returns - ------- - written : int - number of bytes written - """ - - def readline(self, size: int | None = None) -> bytes: - """Read and return a line of bytes from the file. - - If size is specified, read at most size bytes. - - Line terminator is always b"\\n". - - Parameters - ---------- - size : int - maximum number of bytes read - """ - def readlines(self, hint: int | None = None) -> list[bytes]: - """ - NOT IMPLEMENTED. 
Read lines of the file - - Parameters - ---------- - hint : int - maximum number of bytes read until we stop - """ - def __iter__(self) -> Self: - """ - Implement iter(self). - """ + def readinto(self, b: SupportPyBuffer) -> int: ... + + + def readline(self, size: int | None = None) -> bytes: ... + + def readlines(self, hint: int | None = None) -> list[bytes]: ... + + def __iter__(self) -> Self: ... + def __next__(self) -> bytes: ... - def read_buffer(self, nbytes: int | None = None) -> Buffer: - """ - Read from buffer. - - Parameters - ---------- - nbytes : int, optional - maximum number of bytes read - """ - def truncate(self) -> None: - """ - NOT IMPLEMENTED - """ - def writelines(self, lines: list[bytes]): - """ - Write lines to the file. - - Parameters - ---------- - lines : iterable - Iterable of bytes-like objects or exporters of buffer protocol - """ - def download(self, stream_or_path: StrPath | IOBase, buffer_size: int | None = None) -> None: - """ - Read this file completely to a local path or destination stream. - - This method first seeks to the beginning of the file. - - Parameters - ---------- - stream_or_path : str or file-like object - If a string, a local file path to write to; otherwise, - should be a writable stream. - buffer_size : int, optional - The buffer size to use for data transfers. - """ - def upload(self, stream: IOBase, buffer_size: int | None) -> None: - """ - Write from a source stream to this file. - - Parameters - ---------- - stream : file-like object - Source stream to pipe to this file. - buffer_size : int, optional - The buffer size to use for data transfers. - """ + def read_buffer(self, nbytes: int | None = None) -> Buffer: ... + + def truncate(self) -> None: ... + + def writelines(self, lines: list[bytes]): ... + + def download(self, stream_or_path: StrPath | IOBase, buffer_size: int | None = None) -> None: ... + + def upload(self, stream: IOBase, buffer_size: int | None) -> None: ... + def writable(self): ... @@ -331,183 +117,29 @@ class NativeFile(_Weakrefable): # Python file-like objects class PythonFile(NativeFile): - """ - A stream backed by a Python file object. - - This class allows using Python file objects with arbitrary Arrow - functions, including functions written in another language than Python. - - As a downside, there is a non-zero redirection cost in translating - Arrow stream calls to Python method calls. Furthermore, Python's - Global Interpreter Lock may limit parallelism in some situations. - - Examples - -------- - >>> import io - >>> import pyarrow as pa - >>> pa.PythonFile(io.BytesIO()) - - - Create a stream for writing: - - >>> buf = io.BytesIO() - >>> f = pa.PythonFile(buf, mode = 'w') - >>> f.writable() - True - >>> f.write(b'PythonFile') - 10 - >>> buf.getvalue() - b'PythonFile' - >>> f.close() - >>> f - - - Create a stream for reading: - - >>> buf = io.BytesIO(b'PythonFile') - >>> f = pa.PythonFile(buf, mode = 'r') - >>> f.mode - 'rb' - >>> f.read() - b'PythonFile' - >>> f - - >>> f.close() - >>> f - - """ - def __init__(self, handle: IOBase, mode: Literal["r", "w"] | None = None) -> None: ... - def truncate(self, pos: int | None = None) -> None: - """ - Parameters - ---------- - pos : int, optional - """ - -class MemoryMappedFile(NativeFile): - """ - A stream that represents a memory-mapped file. - - Supports 'r', 'r+', 'w' modes. - Examples - -------- - Create a new file with memory map: + def __init__(self, handle: IOBase, mode: Literal["r", "w"] | None = None) -> None: ... 
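# Editorial sketch (not part of the patch), condensed from the PythonFile docstring
# removed above: wrapping a Python file object so Arrow facilities can use it.
import io
import pyarrow as pa

buf = io.BytesIO()
f = pa.PythonFile(buf, mode="w")
f.write(b"PythonFile")     # 10
buf.getvalue()             # b'PythonFile'
f.close()

f = pa.PythonFile(io.BytesIO(b"PythonFile"), mode="r")
f.mode                     # 'rb'
f.read()                   # b'PythonFile'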
+ def truncate(self, pos: int | None = None) -> None: ... - >>> import pyarrow as pa - >>> mmap = pa.create_memory_map('example_mmap.dat', 10) - >>> mmap - - >>> mmap.close() - Open an existing file with memory map: +class MemoryMappedFile(NativeFile): - >>> with pa.memory_map('example_mmap.dat') as mmap: - ... mmap - ... - - """ @classmethod - def create(cls, path: str, size: int) -> Self: - """ - Create a MemoryMappedFile - - Parameters - ---------- - path : str - Where to create the file. - size : int - Size of the memory mapped file. - """ + def create(cls, path: str, size: int) -> Self: ... + def _open(self, path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r"): ... - def resize(self, new_size: int) -> None: - """ - Resize the map and underlying file. + def resize(self, new_size: int) -> None: ... - Parameters - ---------- - new_size : new size in bytes - """ def memory_map( path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r" -) -> MemoryMappedFile: - """ - Open memory map at file path. Size of the memory map cannot change. - - Parameters - ---------- - path : str - mode : {'r', 'r+', 'w'}, default 'r' - Whether the file is opened for reading ('r'), writing ('w') - or both ('r+'). - - Returns - ------- - mmap : MemoryMappedFile - - Examples - -------- - Reading from a memory map without any memory allocation or copying: - - >>> import pyarrow as pa - >>> with pa.output_stream('example_mmap.txt') as stream: - ... stream.write(b'Constructing a buffer referencing the mapped memory') - ... - 51 - >>> with pa.memory_map('example_mmap.txt') as mmap: - ... mmap.read_at(6,45) - ... - b'memory' - """ +) -> MemoryMappedFile: ... + create_memory_map = MemoryMappedFile.create class OSFile(NativeFile): - """ - A stream backed by a regular file descriptor. - - Examples - -------- - Create a new file to write to: - - >>> import pyarrow as pa - >>> with pa.OSFile('example_osfile.arrow', mode='w') as f: - ... f.writable() - ... f.write(b'OSFile') - ... f.seekable() - ... - True - 6 - False - - Open the file to read: - - >>> with pa.OSFile('example_osfile.arrow', mode='r') as f: - ... f.mode - ... f.read() - ... - 'rb' - b'OSFile' - - Open the file to append: - - >>> with pa.OSFile('example_osfile.arrow', mode='ab') as f: - ... f.mode - ... f.write(b' is super!') - ... - 'ab' - 10 - >>> with pa.OSFile('example_osfile.arrow') as f: - ... f.read() - ... - b'OSFile is super!' - - Inspect created OSFile: - - >>> pa.OSFile('example_osfile.arrow') - - """ + def __init__( self, path: str, @@ -516,435 +148,125 @@ class OSFile(NativeFile): ) -> None: ... class FixedSizeBufferWriter(NativeFile): - """ - A stream writing to a Arrow buffer. - - Examples - -------- - Create a stream to write to ``pyarrow.Buffer``: - - >>> import pyarrow as pa - >>> buf = pa.allocate_buffer(5) - >>> with pa.output_stream(buf) as stream: - ... stream.write(b'abcde') - ... stream - ... - 5 - - - Inspect the buffer: - - >>> buf.to_pybytes() - b'abcde' - >>> buf - - """ + def __init__(self, buffer: Buffer) -> None: ... - def set_memcopy_threads(self, num_threads: int) -> None: - """ - Parameters - ---------- - num_threads : int - """ - def set_memcopy_blocksize(self, blocksize: int) -> None: - """ - Parameters - ---------- - blocksize : int64 - """ - def set_memcopy_threshold(self, threshold: int) -> None: - """ - Parameters - ---------- - threshold : int64 - """ + def set_memcopy_threads(self, num_threads: int) -> None: ... + + def set_memcopy_blocksize(self, blocksize: int) -> None: ... 
+ + def set_memcopy_threshold(self, threshold: int) -> None: ... + # ---------------------------------------------------------------------- # Arrow buffers class Buffer(_Weakrefable): - """ - The base class for all Arrow buffers. - - A buffer represents a contiguous memory area. Many buffers will own - their memory, though not all of them do. - """ - def __len__(self) -> int: - """ - Return len(self). - """ + + def __len__(self) -> int: ... + def _assert_cpu(self) -> None: ... @property - def size(self) -> int: - """ - The buffer size in bytes. - """ + def size(self) -> int: ... + @property - def address(self) -> int: - """ - The buffer's address, as an integer. - - The returned address may point to CPU or device memory. - Use `is_cpu()` to disambiguate. - """ - def hex(self) -> bytes: - """ - Compute hexadecimal representation of the buffer. - - Returns - ------- - : bytes - """ + def address(self) -> int: ... + + def hex(self) -> bytes: ... + @property - def is_mutable(self) -> bool: - """ - Whether the buffer is mutable. - """ + def is_mutable(self) -> bool: ... + @property - def is_cpu(self) -> bool: - """ - Whether the buffer is CPU-accessible. - """ + def is_cpu(self) -> bool: ... + @property - def device(self) -> Device: - """ - The device where the buffer resides. - - Returns - ------- - Device - """ + def device(self) -> Device: ... + @property - def memory_manager(self) -> MemoryManager: - """ - The memory manager associated with the buffer. - - Returns - ------- - MemoryManager - """ + def memory_manager(self) -> MemoryManager: ... + @property - def device_type(self) -> DeviceAllocationType: - """ - The device type where the buffer resides. - - Returns - ------- - DeviceAllocationType - """ + def device_type(self) -> DeviceAllocationType: ... + @property def parent(self) -> Buffer | None: ... - def __getitem__(self, key: builtins.slice | int) -> Self | int: - """ - Return self[key]. - """ - def slice(self, offset: int = 0, length: int | None = None) -> Self: - """ - Slice this buffer. Memory is not copied. - - You can also use the Python slice notation ``buffer[start:stop]``. - - Parameters - ---------- - offset : int, default 0 - Offset from start of buffer to slice. - length : int, default None - Length of slice (default is until end of Buffer starting from - offset). - - Returns - ------- - sliced : Buffer - A logical view over this buffer. - """ - def equals(self, other: Self) -> bool: - """ - Determine if two buffers contain exactly the same data. - - Parameters - ---------- - other : Buffer - - Returns - ------- - are_equal : bool - True if buffer contents and size are equal - """ + def __getitem__(self, key: builtins.slice | int) -> Self | int: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def equals(self, other: Self) -> bool: ... + def __reduce_ex__(self, protocol: SupportsIndex) -> str | tuple[Any, ...]: ... - def to_pybytes(self) -> bytes: - """ - Return this buffer as a Python bytes object. Memory is copied. - """ + def to_pybytes(self) -> bytes: ... + class ResizableBuffer(Buffer): - """ - A base class for buffers that can be resized. - """ - - def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: - """ - Resize buffer to indicated size. - - Parameters - ---------- - new_size : int - New size of buffer (padding may be added internally). - shrink_to_fit : bool, default False - If this is true, the buffer is shrunk when new_size is less - than the current size. - If this is false, the buffer is never shrunk. 
- """ + + + def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: ... + def allocate_buffer( size: int, memory_pool: MemoryPool | None = None, resizable: Literal[False] | Literal[True] | None = None -) -> Buffer | ResizableBuffer: - """ - Allocate a mutable buffer. - - Parameters - ---------- - size : int - Number of bytes to allocate (plus internal padding) - memory_pool : MemoryPool, optional - The pool to allocate memory from. - If not given, the default memory pool is used. - resizable : bool, default False - If true, the returned buffer is resizable. - - Returns - ------- - buffer : Buffer or ResizableBuffer - """ +) -> Buffer | ResizableBuffer: ... + # ---------------------------------------------------------------------- # Arrow Stream class BufferOutputStream(NativeFile): - """ - An output stream that writes to a resizable buffer. - - The buffer is produced as a result when ``getvalue()`` is called. - - Examples - -------- - Create an output stream, write data to it and finalize it with - ``getvalue()``: - - >>> import pyarrow as pa - >>> f = pa.BufferOutputStream() - >>> f.write(b'pyarrow.Buffer') - 14 - >>> f.closed - False - >>> f.getvalue() - - >>> f.closed - True - """ + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def getvalue(self) -> Buffer: - """ - Finalize output stream and return result as pyarrow.Buffer. + def getvalue(self) -> Buffer: ... - Returns - ------- - value : Buffer - """ class MockOutputStream(NativeFile): ... class BufferReader(NativeFile): - """ - Zero-copy reader from objects convertible to Arrow buffer. - - Parameters - ---------- - obj : Python bytes or pyarrow.Buffer - - Examples - -------- - Create an Arrow input stream and inspect it: - - >>> import pyarrow as pa - >>> data = b'reader data' - >>> buf = memoryview(data) - >>> with pa.input_stream(buf) as stream: - ... stream.size() - ... stream.read(6) - ... stream.seek(7) - ... stream.read(15) - ... - 11 - b'reader' - 7 - b'data' - """ - def __init__(self, obj) -> None: - """ - Initialize self. See help(type(self)) for accurate signature. - """ + + def __init__(self, obj) -> None: ... + class CompressedInputStream(NativeFile): - """ - An input stream wrapper which decompresses data on the fly. - - Parameters - ---------- - stream : string, path, pyarrow.NativeFile, or file-like object - Input stream object to wrap with the compression. - compression : str - The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd"). - - Examples - -------- - Create an output stream which compresses the data: - - >>> import pyarrow as pa - >>> data = b"Compressed stream" - >>> raw = pa.BufferOutputStream() - >>> with pa.CompressedOutputStream(raw, "gzip") as compressed: - ... compressed.write(data) - ... - 17 - - Create an input stream with decompression referencing the - buffer with compressed data: - - >>> cdata = raw.getvalue() - >>> with pa.input_stream(cdata, compression="gzip") as compressed: - ... compressed.read() - ... - b'Compressed stream' - - which actually translates to the use of ``BufferReader``and - ``CompressedInputStream``: - - >>> raw = pa.BufferReader(cdata) - >>> with pa.CompressedInputStream(raw, "gzip") as compressed: - ... compressed.read() - ... - b'Compressed stream' - """ + def __init__( self, stream: StrPath | NativeFile | IOBase, compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], - ) -> None: - """ - Initialize self. See help(type(self)) for accurate signature. - """ + ) -> None: ... 
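# Editorial sketch (not part of the patch), condensed from the docstrings removed
# above: a gzip round trip pairing CompressedOutputStream with CompressedInputStream.
import pyarrow as pa

raw = pa.BufferOutputStream()
with pa.CompressedOutputStream(raw, "gzip") as compressed:
    compressed.write(b"Compressed stream")   # 17

cdata = raw.getvalue()
with pa.CompressedInputStream(pa.BufferReader(cdata), "gzip") as decompressed:
    decompressed.read()                      # b'Compressed stream'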
+ class CompressedOutputStream(NativeFile): - """ - An output stream wrapper which compresses data on the fly. - - Parameters - ---------- - stream : string, path, pyarrow.NativeFile, or file-like object - Input stream object to wrap with the compression. - compression : str - The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd"). - - Examples - -------- - Create an output stream which compresses the data: - - >>> import pyarrow as pa - >>> data = b"Compressed stream" - >>> raw = pa.BufferOutputStream() - >>> with pa.CompressedOutputStream(raw, "gzip") as compressed: - ... compressed.write(data) - ... - 17 - """ + def __init__( self, stream: StrPath | NativeFile | IOBase, compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], - ) -> None: - """ - Initialize self. See help(type(self)) for accurate signature. - """ + ) -> None: ... + class BufferedInputStream(NativeFile): - """ - An input stream that performs buffered reads from - an unbuffered input stream, which can mitigate the overhead - of many small reads in some cases. - - Parameters - ---------- - stream : NativeFile - The input stream to wrap with the buffer - buffer_size : int - Size of the temporary read buffer. - memory_pool : MemoryPool - The memory pool used to allocate the buffer. - """ + def __init__( self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None - ) -> None: - """ - Initialize self. See help(type(self)) for accurate signature. - """ - def detach(self) -> NativeFile: - """ - Release the raw InputStream. - Further operations on this stream are invalid. - - Returns - ------- - raw : NativeFile - The underlying raw input stream - """ + ) -> None: ... + + def detach(self) -> NativeFile: ... + class BufferedOutputStream(NativeFile): - """ - An output stream that performs buffered reads from - an unbuffered output stream, which can mitigate the overhead - of many small writes in some cases. - - Parameters - ---------- - stream : NativeFile - The writable output stream to wrap with the buffer - buffer_size : int - Size of the buffer that should be added. - memory_pool : MemoryPool - The memory pool used to allocate the buffer. - """ + def __init__( self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None - ) -> None: - """ - Initialize self. See help(type(self)) for accurate signature. - """ - def detach(self) -> NativeFile: - """ - Flush any buffered writes and release the raw OutputStream. - Further operations on this stream are invalid. - - Returns - ------- - raw : NativeFile - The underlying raw output stream. - """ + ) -> None: ... + + def detach(self) -> NativeFile: ... + class TransformInputStream(NativeFile): - """ - Transform an input stream. - - Parameters - ---------- - stream : NativeFile - The stream to transform. - transform_func : callable - The transformation to apply. - """ - def __init__(self, stream: NativeFile, transform_func: Callable[[Buffer], Any]) -> None: - """ - Initialize self. See help(type(self)) for accurate signature. - """ + + def __init__(self, stream: NativeFile, transform_func: Callable[[Buffer], Any]) -> None: ... + class Transcoder: def __init__(self, decoder, encoder) -> None: ... @@ -952,82 +274,21 @@ class Transcoder: def transcoding_input_stream( stream: NativeFile, src_encoding: str, dest_encoding: str -) -> TransformInputStream: - """ - Add a transcoding transformation to the stream. - Incoming data will be decoded according to ``src_encoding`` and - then re-encoded according to ``dest_encoding``. 
- - Parameters - ---------- - stream : NativeFile - The stream to which the transformation should be applied. - src_encoding : str - The codec to use when reading data. - dest_encoding : str - The codec to use for emitted data. - """ - -def py_buffer(obj: SupportPyBuffer) -> Buffer: - """ - Construct an Arrow buffer from a Python bytes-like or buffer-like object - - Parameters - ---------- - obj : object - the object from which the buffer should be constructed. - """ - -def foreign_buffer(address: int, size: int, base: Any | None = None) -> Buffer: - """ - Construct an Arrow buffer with the given *address* and *size*. - - The buffer will be optionally backed by the Python *base* object, if given. - The *base* object will be kept alive as long as this buffer is alive, - including across language boundaries (for example if the buffer is - referenced by C++ code). - - Parameters - ---------- - address : int - The starting address of the buffer. The address can - refer to both device or host memory but it must be - accessible from device after mapping it with - `get_device_address` method. - size : int - The size of device buffer in bytes. - base : {None, object} - Object that owns the referenced memory. - """ +) -> TransformInputStream: ... + + +def py_buffer(obj: SupportPyBuffer) -> Buffer: ... + + +def foreign_buffer(address: int, size: int, base: Any | None = None) -> Buffer: ... + def as_buffer(o: Buffer | SupportPyBuffer) -> Buffer: ... # --------------------------------------------------------------------- class CacheOptions(_Weakrefable): - """ - Cache options for a pre-buffered fragment scan. - - Parameters - ---------- - hole_size_limit : int, default 8KiB - The maximum distance in bytes between two consecutive ranges; beyond - this value, ranges are not combined. - range_size_limit : int, default 32MiB - The maximum size in bytes of a combined range; if combining two - consecutive ranges would produce a range of a size greater than this, - they are not combined - lazy : bool, default True - lazy = false: request all byte ranges when PreBuffer or WillNeed is called. - lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader - needs them. - lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the - range that is currently being read. - prefetch_limit : int, default 0 - The maximum number of ranges to be prefetched. This is only used for - lazy cache to asynchronously read some ranges after reading the target - range. - """ + hole_size_limit: int range_size_limit: int @@ -1040,10 +301,8 @@ class CacheOptions(_Weakrefable): range_size_limit: int | None = None, lazy: bool = True, prefetch_limit: int = 0, - ) -> None: - """ - Initialize self. See help(type(self)) for accurate signature. - """ + ) -> None: ... + @classmethod def from_network_metrics( cls, @@ -1051,211 +310,45 @@ class CacheOptions(_Weakrefable): transfer_bandwidth_mib_per_sec: int, ideal_bandwidth_utilization_frac: float = 0.9, max_ideal_request_size_mib: int = 64, - ) -> Self: - """ - Create suitable CacheOptions based on provided network metrics. - - Typically this will be used with object storage solutions like Amazon S3, - Google Cloud Storage and Azure Blob Storage. - - Parameters - ---------- - time_to_first_byte_millis : int - Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call - setup latency of a new read request. The value is a positive integer. - transfer_bandwidth_mib_per_sec : int - Data transfer Bandwidth (BW) in MiB/sec (per connection). 
The value is a positive - integer. - ideal_bandwidth_utilization_frac : int, default 0.9 - Transfer bandwidth utilization fraction (per connection) to maximize the net - data load. The value is a positive float less than 1. - max_ideal_request_size_mib : int, default 64 - The maximum single data request size (in MiB) to maximize the net data load. - - Returns - ------- - CacheOptions - """ + ) -> Self: ... + class Codec(_Weakrefable): - """ - Compression codec. - - Parameters - ---------- - compression : str - Type of compression codec to initialize, valid values are: 'gzip', - 'bz2', 'brotli', 'lz4' (or 'lz4_frame'), 'lz4_raw', 'zstd' and - 'snappy'. - compression_level : int, None - Optional parameter specifying how aggressively to compress. The - possible ranges and effect of this parameter depend on the specific - codec chosen. Higher values compress more but typically use more - resources (CPU/RAM). Some codecs support negative values. - - gzip - The compression_level maps to the memlevel parameter of - deflateInit2. Higher levels use more RAM but are faster - and should have higher compression ratios. - - bz2 - The compression level maps to the blockSize100k parameter of - the BZ2_bzCompressInit function. Higher levels use more RAM - but are faster and should have higher compression ratios. - - brotli - The compression level maps to the BROTLI_PARAM_QUALITY - parameter. Higher values are slower and should have higher - compression ratios. - - lz4/lz4_frame/lz4_raw - The compression level parameter is not supported and must - be None - - zstd - The compression level maps to the compressionLevel parameter - of ZSTD_initCStream. Negative values are supported. Higher - values are slower and should have higher compression ratios. - - snappy - The compression level parameter is not supported and must - be None - - - Raises - ------ - ValueError - If invalid compression value is passed. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.Codec.is_available('gzip') - True - >>> codec = pa.Codec('gzip') - >>> codec.name - 'gzip' - >>> codec.compression_level - 9 - """ - def __init__(self, compression: Compression, compression_level: int | None = None) -> None: - """ - Initialize self. See help(type(self)) for accurate signature. - """ + + def __init__(self, compression: Compression, compression_level: int | None = None) -> None: ... + @classmethod - def detect(cls, path: StrPath) -> Self: - """ - Detect and instantiate compression codec based on file extension. - - Parameters - ---------- - path : str, path-like - File-path to detect compression from. - - Raises - ------ - TypeError - If the passed value is not path-like. - ValueError - If the compression can't be detected from the path. - - Returns - ------- - Codec - """ + def detect(cls, path: StrPath) -> Self: ... + @staticmethod - def is_available(compression: Compression) -> bool: - """ - Returns whether the compression support has been built and enabled. - - Parameters - ---------- - compression : str - Type of compression codec, - refer to Codec docstring for a list of supported ones. - - Returns - ------- - bool - """ + def is_available(compression: Compression) -> bool: ... + @staticmethod - def supports_compression_level(compression: Compression) -> int: - """ - Returns true if the compression level parameter is supported - for the given codec. - - Parameters - ---------- - compression : str - Type of compression codec, - refer to Codec docstring for a list of supported ones. 
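For illustration, a minimal sketch of constructing the CacheOptions stubbed above; the numeric values are arbitrary placeholders, and where the options are ultimately consumed (e.g. a dataset scan) is not shown here.

import pyarrow as pa

# Merge byte ranges that are less than 4 KiB apart, cap merged ranges at
# 16 MiB, and prefetch two merged ranges ahead of the current read.
opts = pa.CacheOptions(
    hole_size_limit=4 * 1024,
    range_size_limit=16 * 1024 * 1024,
    lazy=True,
    prefetch_limit=2,
)
print(opts.hole_size_limit, opts.range_size_limit)

# Or derive settings from measured object-store characteristics.
net_opts = pa.CacheOptions.from_network_metrics(
    time_to_first_byte_millis=100,
    transfer_bandwidth_mib_per_sec=200,
)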
- """ + def supports_compression_level(compression: Compression) -> int: ... + @staticmethod - def default_compression_level(compression: Compression) -> int: - """ - Returns the compression level that Arrow will use for the codec if - None is specified. - - Parameters - ---------- - compression : str - Type of compression codec, - refer to Codec docstring for a list of supported ones. - """ + def default_compression_level(compression: Compression) -> int: ... + @staticmethod - def minimum_compression_level(compression: Compression) -> int: - """ - Returns the smallest valid value for the compression level - - Parameters - ---------- - compression : str - Type of compression codec, - refer to Codec docstring for a list of supported ones. - """ + def minimum_compression_level(compression: Compression) -> int: ... + @staticmethod - def maximum_compression_level(compression: Compression) -> int: - """ - Returns the largest valid value for the compression level - - Parameters - ---------- - compression : str - Type of compression codec, - refer to Codec docstring for a list of supported ones. - """ + def maximum_compression_level(compression: Compression) -> int: ... + @property - def name(self) -> Compression: - """ - Returns the name of the codec - """ + def name(self) -> Compression: ... + @property - def compression_level(self) -> int: - """ - Returns the compression level parameter of the codec - """ + def compression_level(self) -> int: ... + def compress( self, buf: Buffer | bytes | SupportPyBuffer, *, asbytes: Literal[False] | Literal[True] | None = None, memory_pool: MemoryPool | None = None, - ) -> Buffer | bytes: - """ - Compress data from buffer-like object. - - Parameters - ---------- - buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol - asbytes : bool, default False - Return result as Python bytes object, otherwise Buffer - memory_pool : MemoryPool, default None - Memory pool to use for buffer allocations, if any - - Returns - ------- - compressed : pyarrow.Buffer or bytes (if asbytes=True) - """ + ) -> Buffer | bytes: ... + def decompress( self, buf: Buffer | bytes | SupportPyBuffer, @@ -1263,24 +356,8 @@ class Codec(_Weakrefable): *, asbytes: Literal[False] | Literal[True] | None = None, memory_pool: MemoryPool | None = None, - ) -> Buffer | bytes: - """ - Decompress data from buffer-like object. - - Parameters - ---------- - buf : pyarrow.Buffer, bytes, or memoryview-compatible object - decompressed_size : int, default None - Size of the decompressed result - asbytes : boolean, default False - Return result as Python bytes object, otherwise Buffer - memory_pool : MemoryPool, default None - Memory pool to use for buffer allocations, if any. - - Returns - ------- - uncompressed : pyarrow.Buffer or bytes (if asbytes=True) - """ + ) -> Buffer | bytes: ... + def compress( buf: Buffer | bytes | SupportPyBuffer, @@ -1288,25 +365,8 @@ def compress( *, asbytes: Literal[False] | Literal[True] | None = None, memory_pool: MemoryPool | None = None, -) -> Buffer | bytes: - """ - Compress data from buffer-like object. - - Parameters - ---------- - buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol - codec : str, default 'lz4' - Compression codec. - Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'} - asbytes : bool, default False - Return result as Python bytes object, otherwise Buffer. - memory_pool : MemoryPool, default None - Memory pool to use for buffer allocations, if any. 
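A small usage sketch for the Codec stubs above, assuming a pyarrow build with gzip support; the module-level compress()/decompress() functions below wrap the same functionality.

import pyarrow as pa

# Check availability before constructing the codec.
if pa.Codec.is_available("gzip"):
    codec = pa.Codec("gzip")
    raw = b"some bytes worth compressing" * 100
    compressed = codec.compress(raw, asbytes=True)
    # Pass the uncompressed length explicitly; not every codec can infer it.
    restored = codec.decompress(compressed, decompressed_size=len(raw), asbytes=True)
    assert restored == raw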
- - Returns - ------- - compressed : pyarrow.Buffer or bytes (if asbytes=True) - """ +) -> Buffer | bytes: ... + def decompress( buf: Buffer | bytes | SupportPyBuffer, @@ -1315,148 +375,22 @@ def decompress( *, asbytes: Literal[False] | Literal[True] | None = None, memory_pool: MemoryPool | None = None, -) -> Buffer | bytes: - """ - Decompress data from buffer-like object. - - Parameters - ---------- - buf : pyarrow.Buffer, bytes, or memoryview-compatible object - Input object to decompress data from. - decompressed_size : int, default None - Size of the decompressed result - codec : str, default 'lz4' - Compression codec. - Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'} - asbytes : bool, default False - Return result as Python bytes object, otherwise Buffer. - memory_pool : MemoryPool, default None - Memory pool to use for buffer allocations, if any. - - Returns - ------- - uncompressed : pyarrow.Buffer or bytes (if asbytes=True) - """ +) -> Buffer | bytes: ... + def input_stream( source: StrPath | Buffer | IOBase, compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", buffer_size: int | None = None, -) -> BufferReader: - """ - Create an Arrow input stream. - - Parameters - ---------- - source : str, Path, buffer, or file-like object - The source to open for reading. - compression : str optional, default 'detect' - The compression algorithm to use for on-the-fly decompression. - If "detect" and source is a file path, then compression will be - chosen based on the file extension. - If None, no compression will be applied. - Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size : int, default None - If None or 0, no buffering will happen. Otherwise the size of the - temporary read buffer. - - Examples - -------- - Create a readable BufferReader (NativeFile) from a Buffer or a memoryview object: - - >>> import pyarrow as pa - >>> buf = memoryview(b"some data") - >>> with pa.input_stream(buf) as stream: - ... stream.read(4) - ... - b'some' - - Create a readable OSFile (NativeFile) from a string or file path: - - >>> import gzip - >>> with gzip.open('example.gz', 'wb') as f: - ... f.write(b'some data') - ... - 9 - >>> with pa.input_stream('example.gz') as stream: - ... stream.read() - ... - b'some data' - - Create a readable PythonFile (NativeFile) from a a Python file object: - - >>> with open('example.txt', mode='w') as f: - ... f.write('some text') - ... - 9 - >>> with pa.input_stream('example.txt') as stream: - ... stream.read(6) - ... - b'some t' - """ +) -> BufferReader: ... + def output_stream( source: StrPath | Buffer | IOBase, compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", buffer_size: int | None = None, -) -> NativeFile: - """ - Create an Arrow output stream. - - Parameters - ---------- - source : str, Path, buffer, file-like object - The source to open for writing. - compression : str optional, default 'detect' - The compression algorithm to use for on-the-fly compression. - If "detect" and source is a file path, then compression will be - chosen based on the file extension. - If None, no compression will be applied. - Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size : int, default None - If None or 0, no buffering will happen. Otherwise the size of the - temporary write buffer. 
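The round trip below, adapted from the docstring examples removed above, shows py_buffer() together with output_stream()/input_stream() on an in-memory buffer.

import pyarrow as pa

# py_buffer() wraps the bytearray zero-copy; the Arrow streams then write
# into and read from that same memory.
data = b"buffer data"
scratch = bytearray(len(data))
buf = pa.py_buffer(scratch)

with pa.output_stream(buf) as out:
    out.write(data)

with pa.input_stream(buf) as stream:
    print(stream.read())  # b'buffer data'

Passing a path ending in ".gz" instead would trigger the compression="detect" behaviour described above.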
- - Examples - -------- - Create a writable NativeFile from a pyarrow Buffer: - - >>> import pyarrow as pa - >>> data = b"buffer data" - >>> empty_obj = bytearray(11) - >>> buf = pa.py_buffer(empty_obj) - >>> with pa.output_stream(buf) as stream: - ... stream.write(data) - ... - 11 - >>> with pa.input_stream(buf) as stream: - ... stream.read(6) - ... - b'buffer' - - or from a memoryview object: - - >>> buf = memoryview(empty_obj) - >>> with pa.output_stream(buf) as stream: - ... stream.write(data) - ... - 11 - >>> with pa.input_stream(buf) as stream: - ... stream.read() - ... - b'buffer data' - - Create a writable NativeFile from a string or file path: - - >>> with pa.output_stream('example_second.txt') as stream: - ... stream.write(b'Write some data') - ... - 15 - >>> with pa.input_stream('example_second.txt') as stream: - ... stream.read() - ... - b'Write some data' - """ +) -> NativeFile: ... + __all__ = [ "have_libhdfs", diff --git a/python/pyarrow-stubs/ipc.pyi b/python/pyarrow-stubs/ipc.pyi index 985cf0678f9..a6e7c71dd12 100644 --- a/python/pyarrow-stubs/ipc.pyi +++ b/python/pyarrow-stubs/ipc.pyi @@ -39,6 +39,7 @@ from pyarrow.lib import ( write_tensor, ) + class RecordBatchStreamReader(lib._RecordBatchStreamReader): def __init__( self, @@ -48,6 +49,7 @@ class RecordBatchStreamReader(lib._RecordBatchStreamReader): memory_pool: lib.MemoryPool | None = None, ) -> None: ... + class RecordBatchStreamWriter(lib._RecordBatchStreamWriter): def __init__( self, @@ -58,6 +60,7 @@ class RecordBatchStreamWriter(lib._RecordBatchStreamWriter): options: IpcWriteOptions | None = None, ) -> None: ... + class RecordBatchFileReader(lib._RecordBatchFileReader): def __init__( self, @@ -68,6 +71,7 @@ class RecordBatchFileReader(lib._RecordBatchFileReader): memory_pool: lib.MemoryPool | None = None, ) -> None: ... + class RecordBatchFileWriter(lib._RecordBatchFileWriter): def __init__( self, @@ -78,6 +82,7 @@ class RecordBatchFileWriter(lib._RecordBatchFileWriter): options: IpcWriteOptions | None = None, ) -> None: ... + def new_stream( sink: str | lib.NativeFile | IOBase, schema: lib.Schema, @@ -85,12 +90,16 @@ def new_stream( use_legacy_format: bool | None = None, options: IpcWriteOptions | None = None, ) -> RecordBatchStreamWriter: ... + + def open_stream( source: bytes | lib.Buffer | lib.NativeFile | IOBase, *, options: IpcReadOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> RecordBatchStreamReader: ... + + def new_file( sink: str | lib.NativeFile | IOBase, schema: lib.Schema, @@ -98,6 +107,8 @@ def new_file( use_legacy_format: bool | None = None, options: IpcWriteOptions | None = None, ) -> RecordBatchFileWriter: ... + + def open_file( source: bytes | lib.Buffer | lib.NativeFile | IOBase, footer_offset: int | None = None, @@ -105,10 +116,16 @@ def open_file( options: IpcReadOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> RecordBatchFileReader: ... + + def serialize_pandas( df: pd.DataFrame, *, nthreads: int | None = None, preserve_index: bool | None = None ) -> lib.Buffer: ... -def deserialize_pandas(buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ... + + +def deserialize_pandas( + buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ... 
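As a usage sketch for the ipc.pyi stubs in this hunk, the snippet below writes a record batch to an in-memory IPC stream and reads it back; new_file/open_file work the same way for the file format.

import pyarrow as pa
import pyarrow.ipc as ipc

batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3]})

# Write a stream into an in-memory sink, then read it back as a Table.
sink = pa.BufferOutputStream()
with ipc.new_stream(sink, batch.schema) as writer:
    writer.write_batch(batch)

reader = ipc.open_stream(sink.getvalue())
print(reader.read_all())

serialize_pandas()/deserialize_pandas() wrap the same stream format for a single pandas DataFrame.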
+ __all__ = [ "IpcReadOptions", diff --git a/python/pyarrow-stubs/lib.pyi b/python/pyarrow-stubs/lib.pyi index 565feb4b3db..eea11a2e8f1 100644 --- a/python/pyarrow-stubs/lib.pyi +++ b/python/pyarrow-stubs/lib.pyi @@ -40,45 +40,15 @@ class MonthDayNano(NamedTuple): months: int nanoseconds: int -def cpu_count() -> int: - """ - Return the number of threads to use in parallel operations. - The number of threads is determined at startup by inspecting the - ``OMP_NUM_THREADS`` and ``OMP_THREAD_LIMIT`` environment variables. - If neither is present, it will default to the number of hardware threads - on the system. It can be modified at runtime by calling - :func:`set_cpu_count()`. +def cpu_count() -> int: ... - See Also - -------- - set_cpu_count : Modify the size of this pool. - io_thread_count : The analogous function for the I/O thread pool. - """ -def set_cpu_count(count: int) -> None: - """ - Set the number of threads to use in parallel operations. +def set_cpu_count(count: int) -> None: ... - Parameters - ---------- - count : int - The number of concurrent threads that should be used. - See Also - -------- - cpu_count : Get the size of this pool. - set_io_thread_count : The analogous function for the I/O thread pool. - """ +def is_threading_enabled() -> bool: ... -def is_threading_enabled() -> bool: - """ - Returns True if threading is enabled in libarrow. - - If it isn't enabled, then python shouldn't create any - threads either, because we're probably on a system where - threading doesn't work (e.g. Emscripten). - """ Type_NA: int Type_BOOL: int diff --git a/python/pyarrow-stubs/memory.pyi b/python/pyarrow-stubs/memory.pyi index 4fc723a1950..ab5db5b1f06 100644 --- a/python/pyarrow-stubs/memory.pyi +++ b/python/pyarrow-stubs/memory.pyi @@ -18,165 +18,60 @@ from pyarrow.lib import _Weakrefable class MemoryPool(_Weakrefable): - """ - Base class for memory allocation. - - Besides tracking its number of allocated bytes, a memory pool also - takes care of the required 64-byte alignment for Arrow data. - """ - - def release_unused(self) -> None: - """ - Attempt to return to the OS any memory being held onto by the pool. - - This function should not be called except potentially for - benchmarking or debugging as it could be expensive and detrimental to - performance. - - This is best effort and may not have any effect on some memory pools - or in some situations (e.g. fragmentation). - """ - def bytes_allocated(self) -> int: - """ - Return the number of bytes that are currently allocated from this - memory pool. - """ - def total_bytes_allocated(self) -> int: - """ - Return the total number of bytes that have been allocated from this - memory pool. - """ - def max_memory(self) -> int | None: - """ - Return the peak memory allocation in this memory pool. - This can be an approximate number in multi-threaded applications. - - None is returned if the pool implementation doesn't know how to - compute this number. - """ - def num_allocations(self) -> int: - """ - Return the number of allocations or reallocations that were made - using this memory pool. - """ - def print_stats(self) -> None: - """ - Print statistics about this memory pool. - - The output format is implementation-specific. Not all memory pools - implement this method. - """ + + + def release_unused(self) -> None: ... + + def bytes_allocated(self) -> int: ... + + def total_bytes_allocated(self) -> int: ... + + def max_memory(self) -> int | None: ... + + def num_allocations(self) -> int: ... + + def print_stats(self) -> None: ... 
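A brief sketch of the thread-pool helpers whose docstrings are stripped in the lib.pyi hunk above.

import pyarrow as pa

# The CPU pool drives parallel compute kernels; the I/O pool is sized
# independently via io_thread_count()/set_io_thread_count().
print(pa.cpu_count(), pa.io_thread_count())
pa.set_cpu_count(max(1, pa.cpu_count() - 1))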
+ @property - def backend_name(self) -> str: - """ - The name of the backend used by this MemoryPool (e.g. "jemalloc"). - """ + def backend_name(self) -> str: ... + class LoggingMemoryPool(MemoryPool): ... -class ProxyMemoryPool(MemoryPool): - """ - Memory pool implementation that tracks the number of bytes and - maximum memory allocated through its direct calls, while redirecting - to another memory pool. - """ - -def default_memory_pool() -> MemoryPool: - """ - Return the process-global memory pool. - - Examples - -------- - >>> default_memory_pool() - - """ - -def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: - """ - Create and return a MemoryPool instance that redirects to the - *parent*, but with separate allocation statistics. - - Parameters - ---------- - parent : MemoryPool - The real memory pool that should be used for allocations. - """ - -def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: - """ - Create and return a MemoryPool instance that redirects to the - *parent*, but also dumps allocation logs on stderr. - - Parameters - ---------- - parent : MemoryPool - The real memory pool that should be used for allocations. - """ - -def system_memory_pool() -> MemoryPool: - """ - Return a memory pool based on the C malloc heap. - """ - -def jemalloc_memory_pool() -> MemoryPool: - """ - Return a memory pool based on the jemalloc heap. - - NotImplementedError is raised if jemalloc support is not enabled. - """ - -def mimalloc_memory_pool() -> MemoryPool: - """ - Return a memory pool based on the mimalloc heap. - - NotImplementedError is raised if mimalloc support is not enabled. - """ - -def set_memory_pool(pool: MemoryPool) -> None: - """ - Set the default memory pool. - - Parameters - ---------- - pool : MemoryPool - The memory pool that should be used by default. - """ - -def log_memory_allocations(enable: bool = True) -> None: - """ - Enable or disable memory allocator logging for debugging purposes - - Parameters - ---------- - enable : bool, default True - Pass False to disable logging - """ - -def total_allocated_bytes() -> int: - """ - Return the currently allocated bytes from the default memory pool. - Other memory pools may not be accounted for. - """ - -def jemalloc_set_decay_ms(decay_ms: int) -> None: - """ - Set arenas.dirty_decay_ms and arenas.muzzy_decay_ms to indicated number of - milliseconds. A value of 0 (the default) results in dirty / muzzy memory - pages being released right away to the OS, while a higher value will result - in a time-based decay. See the jemalloc docs for more information - - It's best to set this at the start of your application. - - Parameters - ---------- - decay_ms : int - Number of milliseconds to set for jemalloc decay conf parameters. Note - that this change will only affect future memory arenas - """ - -def supported_memory_backends() -> list[str]: - """ - Return a list of available memory pool backends - """ +class ProxyMemoryPool(MemoryPool): ... + + +def default_memory_pool() -> MemoryPool: ... + + +def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: ... + + +def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: ... + + +def system_memory_pool() -> MemoryPool: ... + + +def jemalloc_memory_pool() -> MemoryPool: ... + + +def mimalloc_memory_pool() -> MemoryPool: ... + + +def set_memory_pool(pool: MemoryPool) -> None: ... + + +def log_memory_allocations(enable: bool = True) -> None: ... + + +def total_allocated_bytes() -> int: ... + + +def jemalloc_set_decay_ms(decay_ms: int) -> None: ... 
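For the memory-pool stubs above, a minimal sketch that tracks allocations through a proxy pool; which backends (jemalloc, mimalloc, system) are available depends on the build.

import pyarrow as pa

# A proxy pool forwards to the default pool but keeps its own statistics.
base = pa.default_memory_pool()
tracked = pa.proxy_memory_pool(base)

arr = pa.array(range(100_000), memory_pool=tracked)
print(tracked.bytes_allocated(), tracked.max_memory())
print(base.backend_name, pa.total_allocated_bytes())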
+ + +def supported_memory_backends() -> list[str]: ... + __all__ = [ "MemoryPool", diff --git a/python/pyarrow-stubs/orc.pyi b/python/pyarrow-stubs/orc.pyi index 557f38a2b9e..5e0289e61f7 100644 --- a/python/pyarrow-stubs/orc.pyi +++ b/python/pyarrow-stubs/orc.pyi @@ -29,152 +29,73 @@ from . import _orc from ._fs import SupportedFileSystem from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table -class ORCFile: - """ - Reader interface for a single ORC file - Parameters - ---------- - source : str or pyarrow.NativeFile - Readable source. For passing Python file objects or byte buffers, - see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. - """ +class ORCFile: reader: _orc.ORCReader def __init__(self, source: StrPath | NativeFile | IO) -> None: ... @property - def metadata(self) -> KeyValueMetadata: - """The file metadata, as an arrow KeyValueMetadata""" + def metadata(self) -> KeyValueMetadata: ... + @property - def schema(self) -> Schema: - """The file schema, as an arrow schema""" + def schema(self) -> Schema: ... + @property - def nrows(self) -> int: - """The number of rows in the file""" + def nrows(self) -> int: ... + @property - def nstripes(self) -> int: - """The number of stripes in the file""" + def nstripes(self) -> int: ... + @property - def file_version(self) -> str: - """Format version of the ORC file, must be 0.11 or 0.12""" + def file_version(self) -> str: ... + @property - def software_version(self) -> str: - """Software instance and version that wrote this file""" + def software_version(self) -> str: ... + @property - def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: - """Compression codec of the file""" + def compression(self) -> Literal["UNCOMPRESSED", + "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + @property - def compression_size(self) -> int: - """Number of bytes to buffer for the compression codec in the file""" + def compression_size(self) -> int: ... + @property - def writer(self) -> str: - """Name of the writer that wrote this file. - If the writer is unknown then its Writer ID - (a number) is returned""" + def writer(self) -> str: ... + @property - def writer_version(self) -> str: - """Version of the writer""" + def writer_version(self) -> str: ... + @property - def row_index_stride(self) -> int: - """Number of rows per an entry in the row index or 0 - if there is no row index""" + def row_index_stride(self) -> int: ... + @property - def nstripe_statistics(self) -> int: - """Number of stripe statistics""" + def nstripe_statistics(self) -> int: ... + @property - def content_length(self) -> int: - """Length of the data stripes in the file in bytes""" + def content_length(self) -> int: ... + @property - def stripe_statistics_length(self) -> int: - """The number of compressed bytes in the file stripe statistics""" + def stripe_statistics_length(self) -> int: ... + @property - def file_footer_length(self) -> int: - """The number of compressed bytes in the file footer""" + def file_footer_length(self) -> int: ... + @property - def file_postscript_length(self) -> int: - """The number of bytes in the file postscript""" + def file_postscript_length(self) -> int: ... + @property - def file_length(self) -> int: - """The number of bytes in the file""" - def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: - """Read a single stripe from the file. - - Parameters - ---------- - n : int - The stripe index - columns : list - If not None, only these columns will be read from the stripe. 
A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e' - - Returns - ------- - pyarrow.RecordBatch - Content of the stripe as a RecordBatch. - """ - def read(self, columns: list[str] | None = None) -> Table: - """Read the whole file. - - Parameters - ---------- - columns : list - If not None, only these columns will be read from the file. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. Output always follows the - ordering of the file and not the `columns` list. - - Returns - ------- - pyarrow.Table - Content of the file as a Table. - """ + def file_length(self) -> int: ... + + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... + + def read(self, columns: list[str] | None = None) -> Table: ... + class ORCWriter: - """ - Writer interface for a single ORC file - - Parameters - ---------- - where : str or pyarrow.io.NativeFile - Writable target. For passing Python file objects or byte buffers, - see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream - or pyarrow.io.FixedSizeBufferWriter. - file_version : {"0.11", "0.12"}, default "0.12" - Determine which ORC file version to use. - `Hive 0.11 / ORC v0 `_ - is the older version - while `Hive 0.12 / ORC v1 `_ - is the newer one. - batch_size : int, default 1024 - Number of rows the ORC writer writes at a time. - stripe_size : int, default 64 * 1024 * 1024 - Size of each ORC stripe in bytes. - compression : string, default 'uncompressed' - The compression codec. - Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} - Note that LZ0 is currently not supported. - compression_block_size : int, default 64 * 1024 - Size of each compression block in bytes. - compression_strategy : string, default 'speed' - The compression strategy i.e. speed vs size reduction. - Valid values: {'SPEED', 'COMPRESSION'} - row_index_stride : int, default 10000 - The row index stride i.e. the number of rows per - an entry in the row index. - padding_tolerance : double, default 0.0 - The padding tolerance. - dictionary_key_size_threshold : double, default 0.0 - The dictionary key size threshold. 0 to disable dictionary encoding. - 1 to always enable dictionary encoding. - bloom_filter_columns : None, set-like or list-like, default None - Columns that use the bloom filter. - bloom_filter_fpp : double, default 0.05 - Upper limit of the false-positive rate of the bloom filter. - """ writer: _orc.ORCWriter is_open: bool + def __init__( self, where: StrPath | NativeFile | IO, @@ -182,7 +103,8 @@ class ORCWriter: file_version: str = "0.12", batch_size: int = 1024, stripe_size: int = 64 * 1024 * 1024, - compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression: Literal["UNCOMPRESSED", "ZLIB", + "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", compression_block_size: int = 65536, compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", row_index_stride: int = 10000, @@ -193,47 +115,17 @@ class ORCWriter: ): ... def __enter__(self) -> Self: ... def __exit__(self, *args, **kwargs) -> None: ... - def write(self, table: Table) -> None: - """ - Write the table into an ORC file. The schema of the table must - be equal to the schema used when opening the ORC file. - - Parameters - ---------- - table : pyarrow.Table - The table to be written into the ORC file - """ - def close(self) -> None: - """ - Close the ORC file - """ + def write(self, table: Table) -> None: ... 
+ + def close(self) -> None: ... + def read_table( source: StrPath | NativeFile | IO, columns: list[str] | None = None, filesystem: SupportedFileSystem | None = None, -) -> Table: - """ - Read a Table from an ORC file. - - Parameters - ---------- - source : str, pyarrow.NativeFile, or file-like object - If a string passed, can be a single file name. For file-like objects, - only read a single file. Use pyarrow.BufferReader to read a file - contained in a bytes or buffer-like object. - columns : list - If not None, only these columns will be read from the file. A column - name may be a prefix of a nested field, e.g. 'a' will select 'a.b', - 'a.c', and 'a.d.e'. Output always follows the ordering of the file and - not the `columns` list. If empty, no columns will be read. Note - that the table will still have the correct num_rows set despite having - no columns. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - """ +) -> Table: ... + def write_table( table: Table, @@ -242,7 +134,8 @@ def write_table( file_version: str = "0.12", batch_size: int = 1024, stripe_size: int = 64 * 1024 * 1024, - compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression: Literal["UNCOMPRESSED", "ZLIB", + "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", compression_block_size: int = 65536, compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", row_index_stride: int = 10000, @@ -250,47 +143,4 @@ def write_table( dictionary_key_size_threshold: float = 0.0, bloom_filter_columns: list[int] | None = None, bloom_filter_fpp: float = 0.05, -) -> None: - """ - Write a table into an ORC file. - - Parameters - ---------- - table : pyarrow.lib.Table - The table to be written into the ORC file - where : str or pyarrow.io.NativeFile - Writable target. For passing Python file objects or byte buffers, - see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream - or pyarrow.io.FixedSizeBufferWriter. - file_version : {"0.11", "0.12"}, default "0.12" - Determine which ORC file version to use. - `Hive 0.11 / ORC v0 `_ - is the older version - while `Hive 0.12 / ORC v1 `_ - is the newer one. - batch_size : int, default 1024 - Number of rows the ORC writer writes at a time. - stripe_size : int, default 64 * 1024 * 1024 - Size of each ORC stripe in bytes. - compression : string, default 'uncompressed' - The compression codec. - Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} - Note that LZ0 is currently not supported. - compression_block_size : int, default 64 * 1024 - Size of each compression block in bytes. - compression_strategy : string, default 'speed' - The compression strategy i.e. speed vs size reduction. - Valid values: {'SPEED', 'COMPRESSION'} - row_index_stride : int, default 10000 - The row index stride i.e. the number of rows per - an entry in the row index. - padding_tolerance : double, default 0.0 - The padding tolerance. - dictionary_key_size_threshold : double, default 0.0 - The dictionary key size threshold. 0 to disable dictionary encoding. - 1 to always enable dictionary encoding. - bloom_filter_columns : None, set-like or list-like, default None - Columns that use the bloom filter. - bloom_filter_fpp : double, default 0.05 - Upper limit of the false-positive rate of the bloom filter. - """ +) -> None: ... 
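A round-trip sketch for the orc.pyi stubs above, assuming a pyarrow build with ORC enabled; "example.orc" is a placeholder path.

import pyarrow as pa
from pyarrow import orc

table = pa.table({"n_legs": [2, 4, 100], "animal": ["Flamingo", "Horse", "Centipede"]})

# write_table() accepts the tuning knobs listed above (stripe_size,
# compression, ...); the defaults are used here.
orc.write_table(table, "example.orc")

f = orc.ORCFile("example.orc")
print(f.nrows, f.nstripes, f.compression)
print(orc.read_table("example.orc", columns=["animal"]))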
diff --git a/python/pyarrow-stubs/pandas_compat.pyi b/python/pyarrow-stubs/pandas_compat.pyi index 82fcb19ad97..f25d1ad24a6 100644 --- a/python/pyarrow-stubs/pandas_compat.pyi +++ b/python/pyarrow-stubs/pandas_compat.pyi @@ -26,12 +26,14 @@ from .lib import Array, DataType, Schema, Table _T = TypeVar("_T") + def get_logical_type_map() -> dict[int, str]: ... def get_logical_type(arrow_type: DataType) -> str: ... def get_numpy_logical_type_map() -> dict[type[np.generic], str]: ... def get_logical_type_from_numpy(pandas_collection) -> str: ... def get_extension_dtype_info(column) -> tuple[str, dict[str, Any]]: ... + class _ColumnMetadata(TypedDict): name: str field_name: str @@ -39,9 +41,12 @@ class _ColumnMetadata(TypedDict): numpy_type: str metadata: dict | None + def get_column_metadata( column: pd.Series | pd.Index, name: str, arrow_type: DataType, field_name: str ) -> _ColumnMetadata: ... + + def construct_metadata( columns_to_convert: list[pd.Series], df: pd.DataFrame, @@ -52,9 +57,13 @@ def construct_metadata( types: list[DataType], column_field_names: list[str] = ..., ) -> dict[bytes, bytes]: ... + + def dataframe_to_types( df: pd.DataFrame, preserve_index: bool | None, columns: list[str] | None = None ) -> tuple[list[str], list[DataType], dict[bytes, bytes]]: ... + + def dataframe_to_arrays( df: pd.DataFrame, schema: Schema, @@ -65,6 +74,8 @@ def dataframe_to_arrays( ) -> tuple[Array, Schema, int]: ... def get_datetimetz_type(values: _T, dtype, type_) -> tuple[_T, DataType]: ... def make_datetimetz(unit: str, tz: str) -> DatetimeTZDtype: ... + + def table_to_dataframe( options, table: Table, categories=None, ignore_metadata: bool = False, types_mapper=None ) -> pd.DataFrame: ... diff --git a/python/pyarrow-stubs/parquet/core.pyi b/python/pyarrow-stubs/parquet/core.pyi index 67882f3a747..8cb4f152ff7 100644 --- a/python/pyarrow-stubs/parquet/core.pyi +++ b/python/pyarrow-stubs/parquet/core.pyi @@ -77,12 +77,19 @@ __all__ = ( "filters_to_expression", ) -def filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... + +def filters_to_expression( + filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... + @deprecated("use filters_to_expression") -def _filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... +def _filters_to_expression( + filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... + + +_Compression: TypeAlias = Literal["gzip", "bz2", + "brotli", "lz4", "zstd", "snappy", "none"] -_Compression: TypeAlias = Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"] class ParquetFile: reader: ParquetReader @@ -118,6 +125,7 @@ class ParquetFile: def close(self, force: bool = False) -> None: ... @property def closed(self) -> bool: ... + def read_row_group( self, i: int, @@ -125,6 +133,7 @@ class ParquetFile: use_threads: bool = True, use_pandas_metadata: bool = False, ) -> Table: ... + def read_row_groups( self, row_groups: list, @@ -132,6 +141,7 @@ class ParquetFile: use_threads: bool = True, use_pandas_metadata: bool = False, ) -> Table: ... + def iter_batches( self, batch_size: int = 65536, @@ -140,13 +150,16 @@ class ParquetFile: use_threads: bool = True, use_pandas_metadata: bool = False, ) -> Iterator[RecordBatch]: ... + def read( self, columns: list | None = None, use_threads: bool = True, use_pandas_metadata: bool = False, ) -> Table: ... - def scan_contents(self, columns: list | None = None, batch_size: int = 65536) -> int: ... 
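As a usage sketch for the ParquetFile and read_table/write_table stubs in this hunk; "example.parquet" is a placeholder path.

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"x": [1, 2, 3], "y": ["a", "b", "c"]})
pq.write_table(table, "example.parquet")

# ParquetFile exposes row-group level access and batched iteration.
pf = pq.ParquetFile("example.parquet")
print(pf.metadata.num_row_groups)
for batch in pf.iter_batches(batch_size=2, columns=["x"]):
    print(batch.num_rows)

# Or read everything back in one call.
print(pq.read_table("example.parquet", columns=["y"]))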
+ def scan_contents(self, columns: list | None = None, + batch_size: int = 65536) -> int: ... + class ParquetWriter: flavor: str @@ -186,14 +199,18 @@ class ParquetWriter: ) -> None: ... def __enter__(self) -> Self: ... def __exit__(self, *args, **kwargs) -> Literal[False]: ... + def write( self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None ) -> None: ... - def write_batch(self, batch: RecordBatch, row_group_size: int | None = None) -> None: ... + def write_batch(self, batch: RecordBatch, + row_group_size: int | None = None) -> None: ... + def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... def close(self) -> None: ... def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: ... + class ParquetDataset: def __init__( self, @@ -220,6 +237,7 @@ class ParquetDataset: def equals(self, other: ParquetDataset) -> bool: ... @property def schema(self) -> Schema: ... + def read( self, columns: list[str] | None = None, @@ -236,6 +254,7 @@ class ParquetDataset: @property def partitioning(self) -> Partitioning: ... + def read_table( source: SingleOrList[str] | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO], *, @@ -258,10 +277,12 @@ def read_table( page_checksum_verification: bool = False, ) -> Table: ... + def read_pandas( source: str | Path | NativeFile | IO, columns: list | None = None, **kwargs ) -> Table: ... + def write_table( table: Table, where: str | Path | NativeFile | IO, @@ -292,6 +313,7 @@ def write_table( **kwargs, ) -> None: ... + def write_to_dataset( table: Table, root_path: str | Path, @@ -307,6 +329,7 @@ def write_to_dataset( **kwargs, ) -> None: ... + def write_metadata( schema: Schema, where: str | NativeFile, @@ -315,6 +338,7 @@ def write_metadata( **kwargs, ) -> None: ... + def read_metadata( where: str | Path | IO | NativeFile, memory_map: bool = False, @@ -322,6 +346,7 @@ def read_metadata( filesystem: SupportedFileSystem | None = None, ) -> FileMetaData: ... + def read_schema( where: str | Path | IO | NativeFile, memory_map: bool = False, diff --git a/python/pyarrow-stubs/scalar.pyi b/python/pyarrow-stubs/scalar.pyi index 0bcd97dd038..4563b97fef7 100644 --- a/python/pyarrow-stubs/scalar.pyi +++ b/python/pyarrow-stubs/scalar.pyi @@ -65,567 +65,267 @@ _AsPyTypeV = TypeVar("_AsPyTypeV") _DataType_co = TypeVar("_DataType_co", bound=DataType, covariant=True) class Scalar(_Weakrefable, Generic[_DataType_co]): - """ - The base class for scalars. - """ + @property - def type(self) -> _DataType_co: - """ - Data type of the Scalar object. - """ + def type(self) -> _DataType_co: ... + @property - def is_valid(self) -> bool: - """ - Holds a valid (non-null) value. - """ + def is_valid(self) -> bool: ... + def cast( self, target_type: None | _DataTypeT, safe: bool = True, options: CastOptions | None = None, memory_pool: MemoryPool | None = None, - ) -> Self | Scalar[_DataTypeT]: - """ - Cast scalar value to another data type. - - See :func:`pyarrow.compute.cast` for usage. - - Parameters - ---------- - target_type : DataType, default None - Type to cast scalar to. - safe : boolean, default True - Whether to check for conversion errors such as overflow. - options : CastOptions, default None - Additional checks pass by CastOptions - memory_pool : MemoryPool, optional - memory pool to use for allocations during function execution. - - Returns - ------- - scalar : A Scalar of the given target data type. - """ - def validate(self, *, full: bool = False) -> None: - """ - Perform validation checks. 
An exception is raised if validation fails. - - By default only cheap validation checks are run. Pass `full=True` - for thorough validation checks (potentially O(n)). - - Parameters - ---------- - full : bool, default False - If True, run expensive checks, otherwise cheap checks only. - - Raises - ------ - ArrowInvalid - """ - def equals(self, other: Scalar) -> bool: - """ - Parameters - ---------- - other : pyarrow.Scalar - - Returns - ------- - bool - """ - def __hash__(self) -> int: - """ - Return hash(self). - """ - def as_py(self: Scalar[Any], *, maps_as_pydicts: Literal["lossy", "strict"] | None = None) -> Any: - """ - Return this value as a Python representation. - - Parameters - ---------- - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - - If 'lossy', whenever duplicate keys are detected, a warning will be printed. - The last seen value of a duplicate key will be in the Python dictionary. - If 'strict', this instead results in an exception being raised when detected. - """ + ) -> Self | Scalar[_DataTypeT]: ... + + def validate(self, *, full: bool = False) -> None: ... + + def equals(self, other: Scalar) -> bool: ... + + def __hash__(self) -> int: ... + + def as_py(self: Scalar[Any], *, maps_as_pydicts: Literal["lossy", "strict"] | None = None) -> Any: ... + _NULL: TypeAlias = None NA = _NULL -class NullScalar(Scalar[NullType]): - """ - Concrete class for null scalars. - """ -class BooleanScalar(Scalar[BoolType]): - """ - Concrete class for boolean scalars. - """ -class UInt8Scalar(Scalar[UInt8Type]): - """ - Concrete class for uint8 scalars. - """ -class Int8Scalar(Scalar[Int8Type]): - """ - Concrete class for int8 scalars. - """ -class UInt16Scalar(Scalar[UInt16Type]): - """ - Concrete class for uint16 scalars. - """ -class Int16Scalar(Scalar[Int16Type]): - """ - Concrete class for int16 scalars. - """ -class UInt32Scalar(Scalar[Uint32Type]): - """ - Concrete class for uint32 scalars. - """ -class Int32Scalar(Scalar[Int32Type]): - """ - Concrete class for int32 scalars. - """ -class UInt64Scalar(Scalar[UInt64Type]): - """ - Concrete class for uint64 scalars. - """ -class Int64Scalar(Scalar[Int64Type]): - """ - Concrete class for int64 scalars. - """ -class HalfFloatScalar(Scalar[Float16Type]): - """ - Concrete class for float scalars. - """ -class FloatScalar(Scalar[Float32Type]): - """ - Concrete class for float scalars. - """ -class DoubleScalar(Scalar[Float64Type]): - """ - Concrete class for double scalars. - """ -class Decimal32Scalar(Scalar[Decimal32Type[_Precision, _Scale]]): - """ - Concrete class for decimal32 scalars. - """ -class Decimal64Scalar(Scalar[Decimal64Type[_Precision, _Scale]]): - """ - Concrete class for decimal64 scalars. - """ -class Decimal128Scalar(Scalar[Decimal128Type[_Precision, _Scale]]): - """ - Concrete class for decimal128 scalars. - """ -class Decimal256Scalar(Scalar[Decimal256Type[_Precision, _Scale]]): - """ - Concrete class for decimal256 scalars. - """ -class Date32Scalar(Scalar[Date32Type]): - """ - Concrete class for date32 scalars. - """ +class NullScalar(Scalar[NullType]): ... + +class BooleanScalar(Scalar[BoolType]): ... + +class UInt8Scalar(Scalar[UInt8Type]): ... + +class Int8Scalar(Scalar[Int8Type]): ... 
+ +class UInt16Scalar(Scalar[UInt16Type]): ... + +class Int16Scalar(Scalar[Int16Type]): ... + +class UInt32Scalar(Scalar[Uint32Type]): ... + +class Int32Scalar(Scalar[Int32Type]): ... + +class UInt64Scalar(Scalar[UInt64Type]): ... + +class Int64Scalar(Scalar[Int64Type]): ... + +class HalfFloatScalar(Scalar[Float16Type]): ... + +class FloatScalar(Scalar[Float32Type]): ... + +class DoubleScalar(Scalar[Float64Type]): ... + +class Decimal32Scalar(Scalar[Decimal32Type[_Precision, _Scale]]): ... + +class Decimal64Scalar(Scalar[Decimal64Type[_Precision, _Scale]]): ... + +class Decimal128Scalar(Scalar[Decimal128Type[_Precision, _Scale]]): ... + +class Decimal256Scalar(Scalar[Decimal256Type[_Precision, _Scale]]): ... + +class Date32Scalar(Scalar[Date32Type]): ... + class Date64Scalar(Scalar[Date64Type]): - """ - Concrete class for date64 scalars. - """ + @property def value(self) -> dt.date | None: ... class Time32Scalar(Scalar[Time32Type[_Time32Unit]]): - """ - Concrete class for time32 scalars. - """ + @property def value(self) -> dt.time | None: ... class Time64Scalar(Scalar[Time64Type[_Time64Unit]]): - """ - Concrete class for time64 scalars. - """ + @property def value(self) -> dt.time | None: ... class TimestampScalar(Scalar[TimestampType[_Unit, _Tz]]): - """ - Concrete class for timestamp scalars. - """ + @property def value(self) -> int | None: ... class DurationScalar(Scalar[DurationType[_Unit]]): - """ - Concrete class for duration scalars. - """ + @property def value(self) -> dt.timedelta | None: ... class MonthDayNanoIntervalScalar(Scalar[MonthDayNanoIntervalType]): - """ - Concrete class for month, day, nanosecond interval scalars. - """ + @property - def value(self) -> MonthDayNano | None: - """ - Same as self.as_py() - """ + def value(self) -> MonthDayNano | None: ... + class BinaryScalar(Scalar[BinaryType]): - """ - Concrete class for binary-like scalars. - """ - def as_buffer(self) -> Buffer: - """ - Return a view over this value as a Buffer object. - """ + + def as_buffer(self) -> Buffer: ... + class LargeBinaryScalar(Scalar[LargeBinaryType]): - """ - """ - def as_buffer(self) -> Buffer: - """ - BinaryScalar.as_buffer(self) - Return a view over this value as a Buffer object. - """ + def as_buffer(self) -> Buffer: ... + class FixedSizeBinaryScalar(Scalar[FixedSizeBinaryType]): - """ - """ - def as_buffer(self) -> Buffer: - """ - BinaryScalar.as_buffer(self) - Return a view over this value as a Buffer object. - """ + def as_buffer(self) -> Buffer: ... + class StringScalar(Scalar[StringType]): - """ - Concrete class for string-like (utf8) scalars. - """ - def as_buffer(self) -> Buffer: - """ - BinaryScalar.as_buffer(self) - Return a view over this value as a Buffer object. - """ + def as_buffer(self) -> Buffer: ... + class LargeStringScalar(Scalar[LargeStringType]): - """ - """ - def as_buffer(self) -> Buffer: - """ - BinaryScalar.as_buffer(self) - Return a view over this value as a Buffer object. - """ + def as_buffer(self) -> Buffer: ... + class BinaryViewScalar(Scalar[BinaryViewType]): - """ - """ - def as_buffer(self) -> Buffer: - """ - BinaryScalar.as_buffer(self) - Return a view over this value as a Buffer object. - """ + def as_buffer(self) -> Buffer: ... + class StringViewScalar(Scalar[StringViewType]): - """ - """ - def as_buffer(self) -> Buffer: - """ - BinaryScalar.as_buffer(self) - Return a view over this value as a Buffer object. - """ + def as_buffer(self) -> Buffer: ... + class ListScalar(Scalar[ListType[_DataTypeT]]): - """ - Concrete class for list-like scalars. 
- """ + @property def values(self) -> Array | None: ... - def __len__(self) -> int: - """ - Return the number of values. - """ - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: - """ - Return the value at the given index. - """ - def __iter__(self) -> Iterator[Array]: - """ - Iterate over this element's values. - """ + def __len__(self) -> int: ... + + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + class FixedSizeListScalar(Scalar[FixedSizeListType[_DataTypeT, _Size]]): - """ - """ + @property def values(self) -> Array | None: ... - def __len__(self) -> int: - """ - ListScalar.__len__(self) + def __len__(self) -> int: ... - Return the number of values. - """ - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: - """ - ListScalar.__getitem__(self, i) + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - Return the value at the given index. - """ - def __iter__(self) -> Iterator[Array]: - """ - ListScalar.__iter__(self) + def __iter__(self) -> Iterator[Array]: ... - Iterate over this element's values. - """ class LargeListScalar(Scalar[LargeListType[_DataTypeT]]): - """ - """ + @property def values(self) -> Array | None: ... - def __len__(self) -> int: - """ - ListScalar.__len__(self) + def __len__(self) -> int: ... - Return the number of values. - """ - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: - """ - ListScalar.__getitem__(self, i) + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - Return the value at the given index. - """ - def __iter__(self) -> Iterator[Array]: - """ - ListScalar.__iter__(self) + def __iter__(self) -> Iterator[Array]: ... - Iterate over this element's values. - """ class ListViewScalar(Scalar[ListViewType[_DataTypeT]]): - """ - """ + @property def values(self) -> Array | None: ... - def __len__(self) -> int: - """ - ListScalar.__len__(self) + def __len__(self) -> int: ... - Return the number of values. - """ - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: - """ - ListScalar.__getitem__(self, i) + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - Return the value at the given index. - """ - def __iter__(self) -> Iterator[Array]: - """ - ListScalar.__iter__(self) + def __iter__(self) -> Iterator[Array]: ... - Iterate over this element's values. - """ class LargeListViewScalar(Scalar[LargeListViewType[_DataTypeT]]): - """ - """ + @property def values(self) -> Array | None: ... - def __len__(self) -> int: - """ - ListScalar.__len__(self) + def __len__(self) -> int: ... - Return the number of values. - """ - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: - """ - ListScalar.__getitem__(self, i) + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - Return the value at the given index. - """ - def __iter__(self) -> Iterator[Array]: - """ - ListScalar.__iter__(self) + def __iter__(self) -> Iterator[Array]: ... - Iterate over this element's values. - """ class StructScalar(Scalar[StructType], collections.abc.Mapping[str, Scalar]): - """ - Concrete class for struct scalars. - """ - def __len__(self) -> int: - """ - Return len(self). - """ - def __iter__(self) -> Iterator[str]: - """ - Implement iter(self). - """ - def __getitem__(self, key: int | str) -> Scalar[Any]: - """ - Return the child value for the given field. - - Parameters - ---------- - key : Union[int, str] - Index / position or name of the field. - - Returns - ------- - result : Scalar - """ + + def __len__(self) -> int: ... + + def __iter__(self) -> Iterator[str]: ... 
+ + def __getitem__(self, key: int | str) -> Scalar[Any]: ... + def _as_py_tuple(self) -> list[tuple[str, Any]]: ... class MapScalar(Scalar[MapType[_K, _ValueT]]): - """ - Concrete class for map scalars. - """ + @property def values(self) -> Array | None: ... - def __len__(self) -> int: - """ - ListScalar.__len__(self) - - Return the number of values. - """ - def __getitem__(self, i: int) -> tuple[Scalar[_K], _ValueT, Any]: - """ - Return the value at the given index or key. - """ + def __len__(self) -> int: ... + + def __getitem__(self, i: int) -> tuple[Scalar[_K], _ValueT, Any]: ... + def __iter__( self: Scalar[ MapType[_BasicDataType[_AsPyTypeK], _BasicDataType[_AsPyTypeV]],] | Scalar[MapType[Any, _BasicDataType[_AsPyTypeV]]] | Scalar[MapType[_BasicDataType[_AsPyTypeK], Any]] - ) -> Iterator[tuple[_AsPyTypeK, _AsPyTypeV]] | Iterator[tuple[Any, _AsPyTypeV]] | Iterator[tuple[_AsPyTypeK, Any]]: - """ - Iterate over this element's values. - """ + ) -> Iterator[tuple[_AsPyTypeK, _AsPyTypeV]] | Iterator[tuple[Any, _AsPyTypeV]] | Iterator[tuple[_AsPyTypeK, Any]]: ... + class DictionaryScalar(Scalar[DictionaryType[_IndexT, _BasicValueT]]): - """ - Concrete class for dictionary-encoded scalars. - """ + @property - def index(self) -> Scalar[_IndexT]: - """ - Return this value's underlying index as a scalar. - """ + def index(self) -> Scalar[_IndexT]: ... + @property - def value(self) -> Scalar[_BasicValueT]: - """ - Return the encoded value as a scalar. - """ + def value(self) -> Scalar[_BasicValueT]: ... + @property def dictionary(self) -> Array: ... class RunEndEncodedScalar(Scalar[RunEndEncodedType[_RunEndType, _BasicValueT]]): - """ - Concrete class for RunEndEncoded scalars. - """ + @property - def value(self) -> tuple[int, _BasicValueT] | None: - """ - Return underlying value as a scalar. - """ + def value(self) -> tuple[int, _BasicValueT] | None: ... + class UnionScalar(Scalar[UnionType]): - """ - Concrete class for Union scalars. - """ + @property - def value(self) -> Any | None: - """ - Return underlying value as a scalar. - """ + def value(self) -> Any | None: ... + @property - def type_code(self) -> str: - """ - Return the union type code for this scalar. - """ + def type_code(self) -> str: ... + class ExtensionScalar(Scalar[ExtensionType]): - """ - Concrete class for Extension scalars. - """ + @property - def value(self) -> Any | None: - """ - Return storage value as a scalar. - """ + def value(self) -> Any | None: ... + @staticmethod - def from_storage(typ: BaseExtensionType, value) -> ExtensionScalar: - """ - Construct ExtensionScalar from type and storage value. - - Parameters - ---------- - typ : DataType - The extension type for the result scalar. - value : object - The storage value for the result scalar. - - Returns - ------- - ext_scalar : ExtensionScalar - """ - -class Bool8Scalar(Scalar[Bool8Type]): - """ - Concrete class for bool8 extension scalar. - """ -class UuidScalar(Scalar[UuidType]): - """ - Concrete class for Uuid extension scalar. - """ -class JsonScalar(Scalar[JsonType]): - """ - Concrete class for JSON extension scalar. - """ -class OpaqueScalar(Scalar[OpaqueType]): - """ - Concrete class for opaque extension scalar. - """ + def from_storage(typ: BaseExtensionType, value) -> ExtensionScalar: ... + + +class Bool8Scalar(Scalar[Bool8Type]): ... + +class UuidScalar(Scalar[UuidType]): ... + +class JsonScalar(Scalar[JsonType]): ... + +class OpaqueScalar(Scalar[OpaqueType]): ... 
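The scalar() factory and the concrete Scalar subclasses above are exercised by a short sketch, adapted from the docstring examples removed in this hunk.

import pyarrow as pa

# pa.scalar() picks the concrete Scalar subclass from the value, or from an
# explicit type; as_py() converts back to a plain Python object.
s = pa.scalar(42)                                  # Int64Scalar
print(type(s).__name__, s.type, s.as_py())

t = pa.scalar([1, 2], type=pa.list_(pa.int16()))   # ListScalar
print(len(t), t[0].as_py())

print(s.cast(pa.float64()).as_py())                # 42.0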
+ class FixedShapeTensorScalar(ExtensionScalar): - """ - Concrete class for fixed shape tensor extension scalar. - """ - def to_numpy(self) -> np.ndarray: - """ - Convert fixed shape tensor scalar to a numpy.ndarray. - - The resulting ndarray's shape matches the permuted shape of the - fixed shape tensor scalar. - The conversion is zero-copy. - - Returns - ------- - numpy.ndarray - """ - def to_tensor(self) -> Tensor: - """ - Convert fixed shape tensor extension scalar to a pyarrow.Tensor, using shape - and strides derived from corresponding FixedShapeTensorType. - - The conversion is zero-copy. - - Returns - ------- - pyarrow.Tensor - Tensor represented stored in FixedShapeTensorScalar. - """ + + def to_numpy(self) -> np.ndarray: ... + + def to_tensor(self) -> Tensor: ... + def scalar( value: Any, @@ -633,45 +333,8 @@ def scalar( *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, -) -> Scalar[_DataTypeT]: - """ - Create a pyarrow.Scalar instance from a Python object. - - Parameters - ---------- - value : Any - Python object coercible to arrow's type system. - type : pyarrow.DataType - Explicit type to attempt to coerce to, otherwise will be inferred from - the value. - from_pandas : bool, default None - Use pandas's semantics for inferring nulls from values in - ndarray-like data. Defaults to False if not passed explicitly by user, - or True if a pandas object is passed in. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the currently-set default - memory pool. - - Returns - ------- - scalar : pyarrow.Scalar - - Examples - -------- - >>> import pyarrow as pa - - >>> pa.scalar(42) - - - >>> pa.scalar("string") - - - >>> pa.scalar([1, 2]) - - - >>> pa.scalar([1, 2], type=pa.list_(pa.int16())) - - """ +) -> Scalar[_DataTypeT]: ... + __all__ = [ "Scalar", diff --git a/python/pyarrow-stubs/table.pyi b/python/pyarrow-stubs/table.pyi index a9b861e2b78..29784d274df 100644 --- a/python/pyarrow-stubs/table.pyi +++ b/python/pyarrow-stubs/table.pyi @@ -137,67 +137,15 @@ NarySelector: TypeAlias = list[str] | tuple[str, ...] ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): - """ - An array-like composed from a (possibly empty) collection of pyarrow.Arrays - - Warnings - -------- - Do not call this class's constructor directly. - - Examples - -------- - To construct a ChunkedArray object use :func:`pyarrow.chunked_array`: - - >>> import pyarrow as pa - >>> pa.chunked_array([], type=pa.int8()) - - [ - ... - ] - - >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> isinstance(pa.chunked_array([[2, 2, 4], [4, 5, 100]]), pa.ChunkedArray) - True - """ + @property def data(self) -> Self: ... @property - def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: - """ - Return data type of a ChunkedArray. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.type - DataType(int64) - """ - def length(self) -> int: - """ - Return length of a ChunkedArray. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.length() - 6 - """ + def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: ... + + def length(self) -> int: ... 
+ __len__ = length def to_string( self, @@ -206,666 +154,57 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): window: int = 5, container_window: int = 2, skip_new_lines: bool = False, - ) -> str: - """ - Render a "pretty-printed" string representation of the ChunkedArray - - Parameters - ---------- - indent : int - How much to indent right the content of the array, - by default ``0``. - window : int - How many items to preview within each chunk at the begin and end - of the chunk when the chunk is bigger than the window. - The other elements will be ellipsed. - container_window : int - How many chunks to preview at the begin and end - of the array when the array is bigger than the window. - The other elements will be ellipsed. - This setting also applies to list columns. - skip_new_lines : bool - If the array should be rendered as a single line of text - or if each element should be on its own line. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.to_string(skip_new_lines=True) - '[[2,2,4],[4,5,100]]' - """ + ) -> str: ... + format = to_string - def validate(self, *, full: bool = False) -> None: - """ - Perform validation checks. An exception is raised if validation fails. - - By default only cheap validation checks are run. Pass `full=True` - for thorough validation checks (potentially O(n)). - - Parameters - ---------- - full : bool, default False - If True, run expensive checks, otherwise cheap checks only. - - Raises - ------ - ArrowInvalid - """ + def validate(self, *, full: bool = False) -> None: ... + @property - def null_count(self) -> int: - """ - Number of null entries - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.null_count - 1 - """ + def null_count(self) -> int: ... + @property - def nbytes(self) -> int: - """ - Total number of bytes consumed by the elements of the chunked array. - - In other words, the sum of bytes from all buffer ranges referenced. - - Unlike `get_total_buffer_size` this method will account for array - offsets. - - If buffers are shared between arrays then the shared - portion will only be counted multiple times. - - The dictionary of dictionary arrays will always be counted in their - entirety even if the array only references a portion of the dictionary. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.nbytes - 49 - """ - def get_total_buffer_size(self) -> int: - """ - The sum of bytes in each buffer referenced by the chunked array. - - An array may only reference a portion of a buffer. - This method will overestimate in this case and return the - byte size of the entire buffer. - - If a buffer is referenced multiple times then it will - only be counted once. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.get_total_buffer_size() - 49 - """ + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + def __sizeof__(self) -> int: ... 
- def __getitem__(self, key: int | builtins.slice) -> Self | _Scalar_co: - """ - Slice or return value at given index - - Parameters - ---------- - key : integer or slice - Slices with step not equal to 1 (or None) will produce a copy - rather than a zero-copy view - - Returns - ------- - value : Scalar (index) or ChunkedArray (slice) - """ + def __getitem__(self, key: int | builtins.slice) -> Self | _Scalar_co: ... + def getitem(self, i: int) -> Scalar: ... - def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[BooleanScalar]: - """ - Return boolean array indicating the null values. - - Parameters - ---------- - nan_is_null : bool (optional, default False) - Whether floating-point NaN values should also be considered null. - - Returns - ------- - array : boolean Array or ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.is_null() - - [ - [ - false, - false, - false, - false, - true, - false - ] - ] - """ - def is_nan(self) -> ChunkedArray[BooleanScalar]: - """ - Return boolean array indicating the NaN values. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> arr = pa.chunked_array([[2, np.nan, 4], [4, None, 100]]) - >>> arr.is_nan() - - [ - [ - false, - true, - false, - false, - null, - false - ] - ] - """ - def is_valid(self) -> ChunkedArray[BooleanScalar]: - """ - Return boolean array indicating the non-null values. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.is_valid() - - [ - [ - true, - true, - true - ], - [ - true, - false, - true - ] - ] - """ - def fill_null(self, fill_value: Scalar[_DataTypeT]) -> Self: - """ - Replace each null element in values with fill_value. - - See :func:`pyarrow.compute.fill_null` for full usage. - - Parameters - ---------- - fill_value : any - The replacement value for null entries. - - Returns - ------- - result : Array or ChunkedArray - A new array with nulls replaced by the given value. - - Examples - -------- - >>> import pyarrow as pa - >>> fill_value = pa.scalar(5, type=pa.int8()) - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.fill_null(fill_value) - - [ - [ - 2, - 2, - 4, - 4, - 5, - 100 - ] - ] - """ - def equals(self, other: Self) -> bool: - """ - Return whether the contents of two chunked arrays are equal. - - Parameters - ---------- - other : pyarrow.ChunkedArray - Chunked array to compare against. - - Returns - ------- - are_equal : bool - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> animals = pa.chunked_array( - ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) - ... ) - >>> n_legs.equals(n_legs) - True - >>> n_legs.equals(animals) - False - """ - def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: - """ - Return a NumPy copy of this array (experimental). - - Parameters - ---------- - zero_copy_only : bool, default False - Introduced for signature consistence with pyarrow.Array.to_numpy. - This must be False here since NumPy arrays' buffer must be contiguous. - - Returns - ------- - array : numpy.ndarray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.to_numpy() - array([ 2, 2, 4, 4, 5, 100]) - """ + def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[BooleanScalar]: ... 
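For the null-handling methods above, a condensed sketch taken from the removed doctests:

import pyarrow as pa

n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
n_legs.is_null()                                # boolean ChunkedArray marking the null slot
n_legs.fill_null(pa.scalar(5, type=pa.int8()))  # [2, 2, 4, 4, 5, 100]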
+ + def is_nan(self) -> ChunkedArray[BooleanScalar]: ... + + def is_valid(self) -> ChunkedArray[BooleanScalar]: ... + + def fill_null(self, fill_value: Scalar[_DataTypeT]) -> Self: ... + + def equals(self, other: Self) -> bool: ... + + def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: ... + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... def cast( self, target_type: None | _CastAs = None, safe: bool | None = None, options: CastOptions | None = None, - ) -> Self | ChunkedArray[Scalar[_CastAs]]: - """ - Cast array values to another data type - - See :func:`pyarrow.compute.cast` for usage. - - Parameters - ---------- - target_type : DataType, None - Type to cast array to. - safe : boolean, default True - Whether to check for conversion errors such as overflow. - options : CastOptions, default None - Additional checks pass by CastOptions - - Returns - ------- - cast : Array or ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.type - DataType(int64) - - Change the data type of an array: - - >>> n_legs_seconds = n_legs.cast(pa.duration("s")) - >>> n_legs_seconds.type - DurationType(duration[s]) - """ - def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: - """ - Compute dictionary-encoded representation of array. - - See :func:`pyarrow.compute.dictionary_encode` for full usage. - - Parameters - ---------- - null_encoding : str, default "mask" - How to handle null entries. - - Returns - ------- - encoded : ChunkedArray - A dictionary-encoded version of this array. - - Examples - -------- - >>> import pyarrow as pa - >>> animals = pa.chunked_array( - ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) - ... ) - >>> animals.dictionary_encode() - - [ - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog", - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 0, - 1, - 2 - ], - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog", - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 3, - 4, - 5 - ] - ] - """ - def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: - """ - Flatten this ChunkedArray. If it has a struct type, the column is - flattened into one array per struct field. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - result : list of ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> c_arr = pa.chunked_array(n_legs.value_counts()) - >>> c_arr - - [ - -- is_valid: all not null - -- child 0 type: int64 - [ - 2, - 4, - 5, - 100 - ] - -- child 1 type: int64 - [ - 2, - 2, - 1, - 1 - ] - ] - >>> c_arr.flatten() - [ - [ - [ - 2, - 4, - 5, - 100 - ] - ], - [ - [ - 2, - 2, - 1, - 1 - ] - ]] - >>> c_arr.type - StructType(struct) - >>> n_legs.type - DataType(int64) - """ - def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_Scalar_co]: - """ - Flatten this ChunkedArray into a single non-chunked array. 
- - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - result : Array - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.combine_chunks() - - [ - 2, - 2, - 4, - 4, - 5, - 100 - ] - """ - def unique(self) -> ChunkedArray[_Scalar_co]: - """ - Compute distinct elements in array - - Returns - ------- - pyarrow.Array - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.unique() - - [ - 2, - 4, - 5, - 100 - ] - """ - def value_counts(self) -> StructArray: - """ - Compute counts of unique elements in array. - - Returns - ------- - An array of structs - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.value_counts() - - -- is_valid: all not null - -- child 0 type: int64 - [ - 2, - 4, - 5, - 100 - ] - -- child 1 type: int64 - [ - 2, - 2, - 1, - 1 - ] - """ - def slice(self, offset: int = 0, length: int | None = None) -> Self: - """ - Compute zero-copy slice of this ChunkedArray - - Parameters - ---------- - offset : int, default 0 - Offset from start of array to slice - length : int, default None - Length of slice (default is until end of batch starting from - offset) - - Returns - ------- - sliced : ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.slice(2, 2) - - [ - [ - 4 - ], - [ - 4 - ] - ] - """ - def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop") -> Self: - """ - Select values from the chunked array. - - See :func:`pyarrow.compute.filter` for full usage. - - Parameters - ---------- - mask : Array or array-like - The boolean mask to filter the chunked array with. - null_selection_behavior : str, default "drop" - How nulls in the mask should be handled. - - Returns - ------- - filtered : Array or ChunkedArray - An array of the same type, with only the elements selected by - the boolean mask. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> mask = pa.array([True, False, None, True, False, True]) - >>> n_legs.filter(mask) - - [ - [ - 2 - ], - [ - 4, - 100 - ] - ] - >>> n_legs.filter(mask, null_selection_behavior="emit_null") - - [ - [ - 2, - null - ], - [ - 4, - 100 - ] - ] - """ + ) -> Self | ChunkedArray[Scalar[_CastAs]]: ... + + def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: ... + + def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: ... + + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_Scalar_co]: ... + + def unique(self) -> ChunkedArray[_Scalar_co]: ... + + def value_counts(self) -> StructArray: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop") -> Self: ... 
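The selection methods stubbed above keep the behaviour shown in the removed doctests; a condensed sketch:

import pyarrow as pa

n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
n_legs.unique()     # [2, 4, 5, 100]
n_legs.slice(2, 2)  # zero-copy slice: [4, 4]
mask = pa.array([True, False, None, True, False, True])
n_legs.filter(mask)                                       # [2, 4, 100]; mask nulls dropped
n_legs.filter(mask, null_selection_behavior="emit_null")  # [2, null, 4, 100]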
+ def index( self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], value: Scalar[_DataTypeT] | _AsPyType, @@ -873,444 +212,49 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): end: int | None = None, *, memory_pool: MemoryPool | None = None, - ) -> Int64Scalar: - """ - Find the first index of a value. - - See :func:`pyarrow.compute.index` for full usage. - - Parameters - ---------- - value : Scalar or object - The value to look for in the array. - start : int, optional - The start index where to look for `value`. - end : int, optional - The end index where to look for `value`. - memory_pool : MemoryPool, optional - A memory pool for potential memory allocations. - - Returns - ------- - index : Int64Scalar - The index of the value in the array (-1 if not found). - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.index(4) - - >>> n_legs.index(4, start=3) - - """ - def take(self, indices: Indices) -> Self: - """ - Select values from the chunked array. - - See :func:`pyarrow.compute.take` for full usage. - - Parameters - ---------- - indices : Array or array-like - The indices in the array whose values will be returned. - - Returns - ------- - taken : Array or ChunkedArray - An array with the same datatype, containing the taken values. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.take([1, 4, 5]) - - [ - [ - 2, - 5, - 100 - ] - ] - """ - def drop_null(self) -> Self: - """ - Remove missing values from a chunked array. - See :func:`pyarrow.compute.drop_null` for full description. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - null - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.drop_null() - - [ - [ - 2, - 2 - ], - [ - 4, - 5, - 100 - ] - ] - """ - def sort(self, order: Order = "ascending", **kwargs) -> Self: - """ - Sort the ChunkedArray - - Parameters - ---------- - order : str, default "ascending" - Which order to sort values in. - Accepted values are "ascending", "descending". - **kwargs : dict, optional - Additional sorting options. - As allowed by :class:`SortOptions` - - Returns - ------- - result : ChunkedArray - """ - def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: - """ - Unify dictionaries across all chunks. - - This method returns an equivalent chunked array, but where all - chunks share the same dictionary values. Dictionary indices are - transposed accordingly. - - If there are no dictionaries in the chunked array, it is returned - unchanged. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - result : ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() - >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() - >>> c_arr = pa.chunked_array([arr_1, arr_2]) - >>> c_arr - - [ - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog" - ] - -- indices: - [ - 0, - 1, - 2 - ], - ... - -- dictionary: - [ - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 0, - 1, - 2 - ] - ] - >>> c_arr.unify_dictionaries() - - [ - ... 
- -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog", - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 0, - 1, - 2 - ], - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog", - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 3, - 4, - 5 - ] - ] - """ + ) -> Int64Scalar: ... + + def take(self, indices: Indices) -> Self: ... + + def drop_null(self) -> Self: ... + + def sort(self, order: Order = "ascending", **kwargs) -> Self: ... + + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ... + @property - def num_chunks(self) -> int: - """ - Number of underlying chunks. - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) - >>> n_legs.num_chunks - 2 - """ - def chunk(self, i: int) -> ChunkedArray[_Scalar_co]: - """ - Select a chunk by its index. - - Parameters - ---------- - i : int - - Returns - ------- - pyarrow.Array - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) - >>> n_legs.chunk(1) - - [ - 4, - 5, - 100 - ] - """ + def num_chunks(self) -> int: ... + + def chunk(self, i: int) -> ChunkedArray[_Scalar_co]: ... + @property - def chunks(self) -> list[Array[_Scalar_co]]: - """ - Convert to a list of single-chunked arrays. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - null - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.chunks - [ - [ - 2, - 2, - null - ], - [ - 4, - 5, - 100 - ]] - """ + def chunks(self) -> list[Array[_Scalar_co]]: ... + def iterchunks( self: ArrayOrChunkedArray[_ScalarT], - ) -> Generator[Array, None, None]: - """ - Convert to an iterator of ChunkArrays. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> for i in n_legs.iterchunks(): - ... print(i.null_count) - 0 - 1 - - """ + ) -> Generator[Array, None, None]: ... + def __iter__(self) -> Iterator[_Scalar_co]: ... def to_pylist( self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], *, maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[_AsPyType | None]: - """ - Convert to a list of native Python objects. - - Parameters - ---------- - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - - If 'lossy', whenever duplicate keys are detected, a warning will be printed. - The last seen value of a duplicate key will be in the Python dictionary. - If 'strict', this instead results in an exception being raised when detected. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.to_pylist() - [2, 2, 4, 4, None, 100] - """ - def __arrow_c_stream__(self, requested_schema=None) -> Any: - """ - Export to a C ArrowArrayStream PyCapsule. - - Parameters - ---------- - requested_schema : PyCapsule, default None - The schema to which the stream should be casted, passed as a - PyCapsule containing a C ArrowSchema representation of the - requested schema. - - Returns - ------- - PyCapsule - A capsule containing a C ArrowArrayStream struct. 
- """ + ) -> list[_AsPyType | None]: ... + + def __arrow_c_stream__(self, requested_schema=None) -> Any: ... + @classmethod - def _import_from_c_capsule(cls, stream) -> Self: - """ - Import ChunkedArray from a C ArrowArrayStream PyCapsule. - - Parameters - ---------- - stream: PyCapsule - A capsule containing a C ArrowArrayStream PyCapsule. - - Returns - ------- - ChunkedArray - """ + def _import_from_c_capsule(cls, stream) -> Self: ... + @property - def is_cpu(self) -> bool: - """ - Whether all chunks in the ChunkedArray are CPU-accessible. - """ + def is_cpu(self) -> bool: ... + def chunked_array( arrays: Iterable[NullableCollection[Any]] | Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray] | Iterable[Array[_ScalarT]], type: DataType | str | None = None, -) -> ChunkedArray[Scalar[Any]] | ChunkedArray[_ScalarT]: - """ - Construct chunked array from list of array-like objects - - Parameters - ---------- - arrays : Array, list of Array, or array-like - Must all be the same data type. Can be empty only if type also passed. - Any Arrow-compatible array that implements the Arrow PyCapsule Protocol - (has an ``__arrow_c_array__`` or ``__arrow_c_stream__`` method) can be - passed as well. - type : DataType or string coercible to DataType - - Returns - ------- - ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> pa.chunked_array([], type=pa.int8()) - - [ - ... - ] - - >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - """ +) -> ChunkedArray[Scalar[Any]] | ChunkedArray[_ScalarT]: ... + _ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any]) @@ -1318,1408 +262,129 @@ class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True - ) -> _PyArrowDataFrame: - """ - Return the dataframe interchange object implementing the interchange protocol. - - Parameters - ---------- - nan_as_null : bool, default False - Whether to tell the DataFrame to overwrite null values in the data - with ``NaN`` (or ``NaT``). - allow_copy : bool, default True - Whether to allow memory copying when exporting. If set to False - it would cause non-zero-copy exports to fail. - - Returns - ------- - DataFrame interchange object - The object which consuming library can use to ingress the dataframe. - - Notes - ----- - Details on the interchange protocol: - https://data-apis.org/dataframe-protocol/latest/index.html - `nan_as_null` currently has no effect; once support for nullable extension - dtypes is added, this value should be propagated to columns. - """ - def __getitem__(self, key: int | str | slice) -> _ColumnT | Self: - """ - Slice or return column at given index or column name - - Parameters - ---------- - key : integer, str, or slice - Slices with step not equal to 1 (or None) will produce a copy - rather than a zero-copy view - - Returns - ------- - Array (from RecordBatch) or ChunkedArray (from Table) for column input. - RecordBatch or Table for slice input. - """ + ) -> _PyArrowDataFrame: ... + + def __getitem__(self, key: int | str | slice) -> _ColumnT | Self: ... + def __len__(self) -> int: ... - def column(self, i: int | str) -> _ColumnT: - """ - Select single column from Table or RecordBatch. - - Parameters - ---------- - i : int or string - The index or name of the column to retrieve. 
- - Returns - ------- - column : Array (for RecordBatch) or ChunkedArray (for Table) - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Select a column by numeric index: - - >>> table.column(0) - - [ - [ - 2, - 4, - 5, - 100 - ] - ] - - Select a column by its name: - - >>> table.column("animals") - - [ - [ - "Flamingo", - "Horse", - "Brittle stars", - "Centipede" - ] - ] - """ + def column(self, i: int | str) -> _ColumnT: ... + @property - def column_names(self) -> list[str]: - """ - Names of the Table or RecordBatch columns. - - Returns - ------- - list of str - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> table = pa.Table.from_arrays( - ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], - ... names=["n_legs", "animals"], - ... ) - >>> table.column_names - ['n_legs', 'animals'] - """ + def column_names(self) -> list[str]: ... + @property - def columns(self) -> list[_ColumnT]: - """ - List of all columns in numerical order. - - Returns - ------- - columns : list of Array (for RecordBatch) or list of ChunkedArray (for Table) - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.columns - [ - [ - [ - null, - 4, - 5, - null - ] - ], - [ - [ - "Flamingo", - "Horse", - null, - "Centipede" - ] - ]] - """ - def drop_null(self) -> Self: - """ - Remove rows that contain missing values from a Table or RecordBatch. - - See :func:`pyarrow.compute.drop_null` for full usage. - - Returns - ------- - Table or RecordBatch - A tabular object with the same schema, with rows containing - no missing values. - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [None, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", None, "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.drop_null() - pyarrow.Table - year: double - n_legs: int64 - animals: string - ---- - year: [[2022,2021]] - n_legs: [[4,100]] - animals: [["Horse","Centipede"]] - """ - def field(self, i: int | str) -> Field: - """ - Select a schema field by its column name or numeric index. - - Parameters - ---------- - i : int or string - The index or name of the field to retrieve. - - Returns - ------- - Field - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.field(0) - pyarrow.Field - >>> table.field(1) - pyarrow.Field - """ + def columns(self) -> list[_ColumnT]: ... + + def drop_null(self) -> Self: ... + + def field(self, i: int | str) -> Field: ... 
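A sketch of the shared _Tabular column accessors, using `pa.table` purely for illustration (any Table or RecordBatch constructor works the same way):

import pyarrow as pa

table = pa.table({
    "n_legs": [2, 4, 5, 100],
    "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"],
})
table.column_names       # ['n_legs', 'animals']
table.column("animals")  # ChunkedArray for a Table, Array for a RecordBatch
table.field(0)           # pyarrow.Field<n_legs: int64>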
+ @classmethod def from_pydict( cls, mapping: Mapping[str, ArrayOrChunkedArray[Any] | list[Any] | np.ndarray], schema: Schema | None = None, metadata: Mapping[str | bytes, str | bytes] | None = None, - ) -> Self: - """ - Construct a Table or RecordBatch from Arrow arrays or columns. - - Parameters - ---------- - mapping : dict or Mapping - A mapping of strings to Arrays or Python lists. - schema : Schema, default None - If not passed, will be inferred from the Mapping values. - metadata : dict or Mapping, default None - Optional metadata for the schema (if inferred). - - Returns - ------- - Table or RecordBatch - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> pydict = {"n_legs": n_legs, "animals": animals} - - Construct a Table from a dictionary of arrays: - - >>> pa.Table.from_pydict(pydict) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> pa.Table.from_pydict(pydict).schema - n_legs: int64 - animals: string - - Construct a Table from a dictionary of arrays with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from a dictionary of arrays with pyarrow schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.Table.from_pydict(pydict, schema=my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - """ + ) -> Self: ... + @classmethod def from_pylist( cls, mapping: Sequence[Mapping[str, Any]], schema: Schema | None = None, metadata: Mapping[str | bytes, str | bytes] | None = None, - ) -> Self: - """ - Construct a Table or RecordBatch from list of rows / dictionaries. - - Parameters - ---------- - mapping : list of dicts of rows - A mapping of strings to row values. - schema : Schema, default None - If not passed, will be inferred from the first row of the - mapping values. - metadata : dict or Mapping, default None - Optional metadata for the schema (if inferred). - - Returns - ------- - Table or RecordBatch - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] - - Construct a Table from a list of rows: - - >>> pa.Table.from_pylist(pylist) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4]] - animals: [["Flamingo","Dog"]] - - Construct a Table from a list of rows with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.Table.from_pylist(pylist, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from a list of rows with pyarrow schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... 
) - >>> pa.Table.from_pylist(pylist, schema=my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - """ - def itercolumns(self) -> Generator[_ColumnT, None, None]: - """ - Iterator over all columns in their numerical order. - - Yields - ------ - Array (for RecordBatch) or ChunkedArray (for Table) - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> for i in table.itercolumns(): - ... print(i.null_count) - 2 - 1 - """ + ) -> Self: ... + + def itercolumns(self) -> Generator[_ColumnT, None, None]: ... + @property def num_columns(self) -> int: ... @property def num_rows(self) -> int: ... @property - def shape(self) -> tuple[int, int]: - """ - Dimensions of the table or record batch: (#rows, #columns). - - Returns - ------- - (int, int) - Number of rows and number of columns. - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table.shape - (4, 2) - """ + def shape(self) -> tuple[int, int]: ... + @property def schema(self) -> Schema: ... @property def nbytes(self) -> int: ... - def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: - """ - Sort the Table or RecordBatch by one or multiple columns. - - Parameters - ---------- - sorting : str or list[tuple(name, order)] - Name of the column to use to sort (ascending), or - a list of multiple sorting conditions where - each entry is a tuple with column name - and sorting order ("ascending" or "descending") - **kwargs : dict, optional - Additional sorting options. - As allowed by :class:`SortOptions` - - Returns - ------- - Table or RecordBatch - A new tabular object sorted according to the sort keys. - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pandas as pd - >>> import pyarrow as pa - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.sort_by("animal") - pyarrow.Table - year: int64 - n_legs: int64 - animal: string - ---- - year: [[2019,2021,2021,2020,2022,2022]] - n_legs: [[5,100,4,2,4,2]] - animal: [["Brittle stars","Centipede","Dog","Flamingo","Horse","Parrot"]] - """ - def take(self, indices: Indices) -> Self: - """ - Select rows from a Table or RecordBatch. - - See :func:`pyarrow.compute.take` for full usage. - - Parameters - ---------- - indices : Array or array-like - The indices in the tabular object whose rows will be returned. - - Returns - ------- - Table or RecordBatch - A tabular object with the same schema, containing the taken rows. - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> table = pa.Table.from_pandas(df) - >>> table.take([1, 3]) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2022,2021]] - n_legs: [[4,100]] - animals: [["Horse","Centipede"]] - """ + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: ... + + def take(self, indices: Indices) -> Self: ... + def filter( self, mask: Mask | Expression, null_selection_behavior: NullSelectionBehavior = "drop" - ) -> Self: - """ - Select rows from the table or record batch based on a boolean mask. - - The Table can be filtered based on a mask, which will be passed to - :func:`pyarrow.compute.filter` to perform the filtering, or it can - be filtered through a boolean :class:`.Expression` - - Parameters - ---------- - mask : Array or array-like or .Expression - The boolean mask or the :class:`.Expression` to filter the table with. - null_selection_behavior : str, default "drop" - How nulls in the mask should be handled, does nothing if - an :class:`.Expression` is used. - - Returns - ------- - filtered : Table or RecordBatch - A tabular object of the same schema, with only the rows selected - by applied filtering - - Examples - -------- - Using a Table (works similarly for RecordBatch): - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - Define an expression and select rows: - - >>> import pyarrow.compute as pc - >>> expr = pc.field("year") <= 2020 - >>> table.filter(expr) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2019]] - n_legs: [[2,5]] - animals: [["Flamingo","Brittle stars"]] - - Define a mask and select rows: - - >>> mask = [True, True, False, None] - >>> table.filter(mask) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022]] - n_legs: [[2,4]] - animals: [["Flamingo","Horse"]] - >>> table.filter(mask, null_selection_behavior="emit_null") - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022,null]] - n_legs: [[2,4,null]] - animals: [["Flamingo","Horse",null]] - """ + ) -> Self: ... + def to_pydict( self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None - ) -> dict[str, list[Any]]: - """ - Convert the Table or RecordBatch to a dict or OrderedDict. - - Parameters - ---------- - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - - If 'lossy', whenever duplicate keys are detected, a warning will be printed. - The last seen value of a duplicate key will be in the Python dictionary. - If 'strict', this instead results in an exception being raised when detected. - - Returns - ------- - dict - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... 
) - >>> table = pa.Table.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> table.to_pydict() - {'n_legs': [2, 2, 4, 4, 5, 100], 'animals': ['Flamingo', 'Parrot', ..., 'Centipede']} - """ + ) -> dict[str, list[Any]]: ... + def to_pylist( self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None - ) -> list[dict[str, Any]]: - """ - Convert the Table or RecordBatch to a list of rows / dictionaries. - - Parameters - ---------- - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - - If 'lossy', whenever duplicate keys are detected, a warning will be printed. - The last seen value of a duplicate key will be in the Python dictionary. - If 'strict', this instead results in an exception being raised when detected. - - Returns - ------- - list - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> data = [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]] - >>> table = pa.table(data, names=["n_legs", "animals"]) - >>> table.to_pylist() - [{'n_legs': 2, 'animals': 'Flamingo'}, {'n_legs': 4, 'animals': 'Horse'}, ... - """ - def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: - """ - Return human-readable string representation of Table or RecordBatch. - - Parameters - ---------- - show_metadata : bool, default False - Display Field-level and Schema-level KeyValueMetadata. - preview_cols : int, default 0 - Display values of the columns for the first N columns. - - Returns - ------- - str - """ + ) -> list[dict[str, Any]]: ... + + def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: ... + def remove_column(self, i: int) -> Self: ... - def drop_columns(self, columns: str | list[str]) -> Self: - """ - Drop one or more columns and return a new Table or RecordBatch. - - Parameters - ---------- - columns : str or list[str] - Field name(s) referencing existing column(s). - - Raises - ------ - KeyError - If any of the passed column names do not exist. - - Returns - ------- - Table or RecordBatch - A tabular object without the column(s). - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Drop one column: - - >>> table.drop_columns("animals") - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[2,4,5,100]] - - Drop one or more columns: - - >>> table.drop_columns(["n_legs", "animals"]) - pyarrow.Table - ... - ---- - """ + def drop_columns(self, columns: str | list[str]) -> Self: ... + def add_column( self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] ) -> Self: ... def append_column( self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] - ) -> Self: - """ - Append column at end of columns. - - Parameters - ---------- - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array or value coercible to array - Column data. 
- - Returns - ------- - Table or RecordBatch - New table or record batch with the passed column added. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Append column at the end: - - >>> year = [2021, 2022, 2019, 2021] - >>> table.append_column("year", [year]) - pyarrow.Table - n_legs: int64 - animals: string - year: int64 - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - year: [[2021,2022,2019,2021]] - """ + ) -> Self: ... + class RecordBatch(_Tabular[Array]): - """ - Batch of rows of columns of equal length - - Warnings - -------- - Do not call this class's constructor directly, use one of the - ``RecordBatch.from_*`` functions instead. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Constructing a RecordBatch from arrays: - - >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - Constructing a RecordBatch from pandas DataFrame: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022], - ... "month": [3, 5, 7, 9], - ... "day": [1, 5, 9, 13], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.RecordBatch.from_pandas(df) - pyarrow.RecordBatch - year: int64 - month: int64 - day: int64 - n_legs: int64 - animals: string - ---- - year: [2020,2022,2021,2022] - month: [3,5,7,9] - day: [1,5,9,13] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - >>> pa.RecordBatch.from_pandas(df).to_pandas() - year month day n_legs animals - 0 2020 3 1 2 Flamingo - 1 2022 5 5 4 Horse - 2 2021 7 9 5 Brittle stars - 3 2022 9 13 100 Centipede - - Constructing a RecordBatch from pylist: - - >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] - >>> pa.RecordBatch.from_pylist(pylist).to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Dog - - You can also construct a RecordBatch using :func:`pyarrow.record_batch`: - - >>> pa.record_batch([n_legs, animals], names=names).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - >>> pa.record_batch(df) - pyarrow.RecordBatch - year: int64 - month: int64 - day: int64 - n_legs: int64 - animals: string - ---- - year: [2020,2022,2021,2022] - month: [3,5,7,9] - day: [1,5,9,13] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - """ - - def validate(self, *, full: bool = False) -> None: - """ - Perform validation checks. An exception is raised if validation fails. - - By default only cheap validation checks are run. Pass `full=True` - for thorough validation checks (potentially O(n)). - - Parameters - ---------- - full : bool, default False - If True, run expensive checks, otherwise cheap checks only. 
- - Raises - ------ - ArrowInvalid - """ + + + def validate(self, *, full: bool = False) -> None: ... + def replace_schema_metadata( self, metadata: dict[str | bytes, str | bytes] | None = None - ) -> Self: - """ - Create shallow copy of record batch by replacing schema - key-value metadata with the indicated new metadata (which may be None, - which deletes any existing metadata - - Parameters - ---------- - metadata : dict, default None - - Returns - ------- - shallow_copy : RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - - Constructing a RecordBatch with schema and metadata: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64())], metadata={"n_legs": "Number of legs per animal"} - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs], schema=my_schema) - >>> batch.schema - n_legs: int64 - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Shallow copy of a RecordBatch with deleted schema metadata: - - >>> batch.replace_schema_metadata().schema - n_legs: int64 - """ + ) -> Self: ... + @property - def num_columns(self) -> int: - """ - Number of columns - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.num_columns - 2 - """ + def num_columns(self) -> int: ... + @property - def num_rows(self) -> int: - """ - Number of rows - - Due to the definition of a RecordBatch, all columns have the same - number of rows. - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.num_rows - 6 - """ + def num_rows(self) -> int: ... + @property - def schema(self) -> Schema: - """ - Schema of the RecordBatch and its columns - - Returns - ------- - pyarrow.Schema - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.schema - n_legs: int64 - animals: string - """ + def schema(self) -> Schema: ... + @property - def nbytes(self) -> int: - """ - Total number of bytes consumed by the elements of the record batch. - - In other words, the sum of bytes from all buffer ranges referenced. - - Unlike `get_total_buffer_size` this method will account for array - offsets. - - If buffers are shared between arrays then the shared - portion will only be counted multiple times. - - The dictionary of dictionary arrays will always be counted in their - entirety even if the array only references a portion of the dictionary. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... 
) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.nbytes - 116 - """ - def get_total_buffer_size(self) -> int: - """ - The sum of bytes in each buffer referenced by the record batch - - An array may only reference a portion of a buffer. - This method will overestimate in this case and return the - byte size of the entire buffer. - - If a buffer is referenced multiple times then it will - only be counted once. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.get_total_buffer_size() - 120 - """ + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + def __sizeof__(self) -> int: ... def add_column( self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list - ) -> Self: - """ - Add column to RecordBatch at position i. - - A new record batch is returned with the column added, the original record batch - object is left unchanged. - - Parameters - ---------- - i : int - Index to place the column at. - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array or value coercible to array - Column data. - - Returns - ------- - RecordBatch - New record batch with the passed column added. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - - Add column: - - >>> year = [2021, 2022, 2019, 2021] - >>> batch.add_column(0, "year", year) - pyarrow.RecordBatch - year: int64 - n_legs: int64 - animals: string - ---- - year: [2021,2022,2019,2021] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - - Original record batch is left unchanged: - - >>> batch - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - """ - def remove_column(self, i: int) -> Self: - """ - Create new RecordBatch with the indicated column removed. - - Parameters - ---------- - i : int - Index of column to remove. - - Returns - ------- - Table - New record batch without the column. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - >>> batch.remove_column(1) - pyarrow.RecordBatch - n_legs: int64 - ---- - n_legs: [2,4,5,100] - """ - def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: - """ - Replace column in RecordBatch at position. - - Parameters - ---------- - i : int - Index to place the column at. - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array or value coercible to array - Column data. - - Returns - ------- - RecordBatch - New record batch with the passed column set. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... 
} - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - - Replace a column: - - >>> year = [2021, 2022, 2019, 2021] - >>> batch.set_column(1, "year", year) - pyarrow.RecordBatch - n_legs: int64 - year: int64 - ---- - n_legs: [2,4,5,100] - year: [2021,2022,2019,2021] - """ - def rename_columns(self, names: list[str] | dict[str, str]) -> Self: - """ - Create new record batch with columns renamed to provided names. - - Parameters - ---------- - names : list[str] or dict[str, str] - List of new column names or mapping of old column names to new column names. - - If a mapping of old to new column names is passed, then all columns which are - found to match a provided old column name will be renamed to the new column name. - If any column names are not found in the mapping, a KeyError will be raised. - - Raises - ------ - KeyError - If any of the column names passed in the names mapping do not exist. - - Returns - ------- - RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - >>> new_names = ["n", "name"] - >>> batch.rename_columns(new_names) - pyarrow.RecordBatch - n: int64 - name: string - ---- - n: [2,4,5,100] - name: ["Flamingo","Horse","Brittle stars","Centipede"] - >>> new_names = {"n_legs": "n", "animals": "name"} - >>> batch.rename_columns(new_names) - pyarrow.RecordBatch - n: int64 - name: string - ---- - n: [2,4,5,100] - name: ["Flamingo","Horse","Brittle stars","Centipede"] - """ - def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: - """ - Write RecordBatch to Buffer as encapsulated IPC message, which does not - include a Schema. - - To reconstruct a RecordBatch from the encapsulated IPC message Buffer - returned by this function, a Schema must be passed separately. See - Examples. - - Parameters - ---------- - memory_pool : MemoryPool, default None - Uses default memory pool if not specified - - Returns - ------- - serialized : Buffer - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> buf = batch.serialize() - >>> buf - - - Reconstruct RecordBatch from IPC message Buffer and original Schema - - >>> pa.ipc.read_record_batch(buf, batch.schema) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - """ - def slice(self, offset: int = 0, length: int | None = None) -> Self: - """ - Compute zero-copy slice of this RecordBatch - - Parameters - ---------- - offset : int, default 0 - Offset from start of record batch to slice - length : int, default None - Length of slice (default is until end of batch starting from - offset) - - Returns - ------- - sliced : RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... 
) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - >>> batch.slice(offset=3).to_pandas() - n_legs animals - 0 4 Horse - 1 5 Brittle stars - 2 100 Centipede - >>> batch.slice(length=2).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - >>> batch.slice(offset=3, length=1).to_pandas() - n_legs animals - 0 4 Horse - """ - def equals(self, other: Self, check_metadata: bool = False) -> bool: - """ - Check if contents of two record batches are equal. - - Parameters - ---------- - other : pyarrow.RecordBatch - RecordBatch to compare against. - check_metadata : bool, default False - Whether schema metadata equality should be checked as well. - - Returns - ------- - are_equal : bool - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch_0 = pa.record_batch([]) - >>> batch_1 = pa.RecordBatch.from_arrays( - ... [n_legs, animals], - ... names=["n_legs", "animals"], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> batch.equals(batch) - True - >>> batch.equals(batch_0) - False - >>> batch.equals(batch_1) - True - >>> batch.equals(batch_1, check_metadata=True) - False - """ - def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: - """ - Select columns of the RecordBatch. - - Returns a new RecordBatch with the specified columns, and metadata - preserved. - - Parameters - ---------- - columns : list-like - The column names or integer indices to select. - - Returns - ------- - RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"]) - - Select columns my indices: - - >>> batch.select([1]) - pyarrow.RecordBatch - animals: string - ---- - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - - Select columns by names: - - >>> batch.select(["n_legs"]) - pyarrow.RecordBatch - n_legs: int64 - ---- - n_legs: [2,2,4,4,5,100] - """ + ) -> Self: ... + + def remove_column(self, i: int) -> Self: ... + + def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ... + + def rename_columns(self, names: list[str] | dict[str, str]) -> Self: ... + + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def equals(self, other: Self, check_metadata: bool = False) -> bool: ... + + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: ... + def cast( self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None - ) -> Self: - """ - Cast record batch values to another schema. - - Parameters - ---------- - target_schema : Schema - Schema to cast to, the names and order of fields must match. - safe : bool, default True - Check for overflows or other unsafe conversions. 
- options : CastOptions, default None - Additional checks pass by CastOptions - - Returns - ------- - RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - >>> batch.schema - n_legs: int64 - animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... - - Define new schema and cast batch values: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] - ... ) - >>> batch.cast(target_schema=my_schema) - pyarrow.RecordBatch - n_legs: duration[s] - animals: string - ---- - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - """ + ) -> Self: ... + @classmethod def from_arrays( cls, @@ -2727,72 +392,8 @@ class RecordBatch(_Tabular[Array]): names: list[str] | None = None, schema: Schema | None = None, metadata: Mapping[str | bytes, str | bytes] | None = None, - ) -> Self: - """ - Construct a RecordBatch from multiple pyarrow.Arrays - - Parameters - ---------- - arrays : list of pyarrow.Array - One for each field in RecordBatch - names : list of str, optional - Names for the batch fields. If not passed, schema must be passed - schema : Schema, default None - Schema for the created batch. If not passed, names must be passed - metadata : dict or Mapping, default None - Optional metadata for the schema (if inferred). - - Returns - ------- - pyarrow.RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> names = ["n_legs", "animals"] - - Construct a RecordBatch from pyarrow Arrays using names: - - >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - Construct a RecordBatch from pyarrow Arrays using schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - """ + ) -> Self: ... + @classmethod def from_pandas( cls, @@ -2801,387 +402,52 @@ class RecordBatch(_Tabular[Array]): preserve_index: bool | None = None, nthreads: int | None = None, columns: list[str] | None = None, - ) -> Self: - """ - Convert pandas.DataFrame to an Arrow RecordBatch - - Parameters - ---------- - df : pandas.DataFrame - schema : pyarrow.Schema, optional - The expected schema of the RecordBatch. This can be used to - indicate the type of columns if we cannot infer it automatically. - If passed, the output will have exactly this schema. 
Columns - specified in the schema that are not found in the DataFrame columns - or its index will raise an error. Additional columns or index - levels in the DataFrame which are not specified in the schema will - be ignored. - preserve_index : bool, optional - Whether to store the index as an additional column in the resulting - ``RecordBatch``. The default of None will store the index as a - column, except for RangeIndex which is stored as metadata only. Use - ``preserve_index=True`` to force it to be stored as a column. - nthreads : int, default None - If greater than 1, convert columns to Arrow in parallel using - indicated number of threads. By default, this follows - :func:`pyarrow.cpu_count` (may use up to system CPU count threads). - columns : list, optional - List of column to be converted. If None, use all columns. - - Returns - ------- - pyarrow.RecordBatch - - - Examples - -------- - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022], - ... "month": [3, 5, 7, 9], - ... "day": [1, 5, 9, 13], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - Convert pandas DataFrame to RecordBatch: - - >>> import pyarrow as pa - >>> pa.RecordBatch.from_pandas(df) - pyarrow.RecordBatch - year: int64 - month: int64 - day: int64 - n_legs: int64 - animals: string - ---- - year: [2020,2022,2021,2022] - month: [3,5,7,9] - day: [1,5,9,13] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - - Convert pandas DataFrame to RecordBatch using schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.RecordBatch.from_pandas(df, schema=my_schema) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - - Convert pandas DataFrame to RecordBatch specifying columns: - - >>> pa.RecordBatch.from_pandas(df, columns=["n_legs"]) - pyarrow.RecordBatch - n_legs: int64 - ---- - n_legs: [2,4,5,100] - """ + ) -> Self: ... + @classmethod def from_struct_array( cls, struct_array: StructArray | ChunkedArray[StructScalar] - ) -> Self: - """ - Construct a RecordBatch from a StructArray. - - Each field in the StructArray will become a column in the resulting - ``RecordBatch``. - - Parameters - ---------- - struct_array : StructArray - Array to construct the record batch from. - - Returns - ------- - pyarrow.RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) - >>> pa.RecordBatch.from_struct_array(struct).to_pandas() - animals n_legs year - 0 Parrot 2 NaN - 1 None 4 2022.0 - """ - def to_struct_array(self) -> StructArray: - """ - Convert to a struct array. - """ + ) -> Self: ... + + def to_struct_array(self) -> StructArray: ... + def to_tensor( self, null_to_nan: bool = False, row_major: bool = True, memory_pool: MemoryPool | None = None, - ) -> Tensor: - """ - Convert to a :class:`~pyarrow.Tensor`. - - RecordBatches that can be converted have fields of type signed or unsigned - integer or float, including all bit-widths. - - ``null_to_nan`` is ``False`` by default and this method will raise an error in case - any nulls are present. RecordBatches with nulls can be converted with ``null_to_nan`` - set to ``True``. 
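An illustrative round trip through the from_pandas / from_struct_array / to_struct_array stubs above, assuming pandas is installed:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"n_legs": [2, 4, 5, 100],
                   "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"]})
batch = pa.RecordBatch.from_pandas(df, columns=["n_legs"], preserve_index=False)

struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}])
from_struct = pa.RecordBatch.from_struct_array(struct)   # one column per struct field
round_trip = from_struct.to_struct_array()               # back to a StructArray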
In this case null values are converted to ``NaN`` and integer type - arrays are promoted to the appropriate float type. - - Parameters - ---------- - null_to_nan : bool, default False - Whether to write null values in the result as ``NaN``. - row_major : bool, default True - Whether resulting Tensor is row-major or column-major - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Examples - -------- - >>> import pyarrow as pa - >>> batch = pa.record_batch( - ... [ - ... pa.array([1, 2, 3, 4, None], type=pa.int32()), - ... pa.array([10, 20, 30, 40, None], type=pa.float32()), - ... ], - ... names=["a", "b"], - ... ) - - >>> batch - pyarrow.RecordBatch - a: int32 - b: float - ---- - a: [1,2,3,4,null] - b: [10,20,30,40,null] - - Convert a RecordBatch to row-major Tensor with null values - written as ``NaN``s - - >>> batch.to_tensor(null_to_nan=True) - - type: double - shape: (5, 2) - strides: (16, 8) - >>> batch.to_tensor(null_to_nan=True).to_numpy() - array([[ 1., 10.], - [ 2., 20.], - [ 3., 30.], - [ 4., 40.], - [nan, nan]]) - - Convert a RecordBatch to column-major Tensor - - >>> batch.to_tensor(null_to_nan=True, row_major=False) - - type: double - shape: (5, 2) - strides: (8, 40) - >>> batch.to_tensor(null_to_nan=True, row_major=False).to_numpy() - array([[ 1., 10.], - [ 2., 20.], - [ 3., 30.], - [ 4., 40.], - [nan, nan]]) - """ - def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): - """ - Export to a C ArrowArray struct, given its pointer. - - If a C ArrowSchema struct pointer is also given, the record batch - schema is exported to it at the same time. - - Parameters - ---------- - out_ptr: int - The raw pointer to a C ArrowArray struct. - out_schema_ptr: int (optional) - The raw pointer to a C ArrowSchema struct. - - Be careful: if you don't pass the ArrowArray struct to a consumer, - array memory will leak. This is a low-level function intended for - expert users. - """ + ) -> Tensor: ... + + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): ... + @classmethod - def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: - """ - Import RecordBatch from a C ArrowArray struct, given its pointer - and the imported schema. - - Parameters - ---------- - in_ptr: int - The raw pointer to a C ArrowArray struct. - type: Schema or int - Either a Schema object, or the raw pointer to a C ArrowSchema - struct. - - This is a low-level function intended for expert users. - """ - def __arrow_c_array__(self, requested_schema=None): - """ - Get a pair of PyCapsules containing a C ArrowArray representation of the object. - - Parameters - ---------- - requested_schema : PyCapsule | None - A PyCapsule containing a C ArrowSchema representation of a requested - schema. PyArrow will attempt to cast the batch to this schema. - If None, the batch will be returned as-is, with a schema matching the - one returned by :meth:`__arrow_c_schema__()`. - - Returns - ------- - Tuple[PyCapsule, PyCapsule] - A pair of PyCapsules containing a C ArrowSchema and ArrowArray, - respectively. - """ - def __arrow_c_stream__(self, requested_schema=None): - """ - Export the batch as an Arrow C stream PyCapsule. - - Parameters - ---------- - requested_schema : PyCapsule, default None - The schema to which the stream should be casted, passed as a - PyCapsule containing a C ArrowSchema representation of the - requested schema. - Currently, this is not supported and will raise a - NotImplementedError if the schema doesn't match the current schema. 
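A short sketch of the to_tensor stub above, following the stripped example in which nulls become NaN and integer columns are promoted to float:

import pyarrow as pa

batch = pa.record_batch(
    [pa.array([1, 2, 3, 4, None], type=pa.int32()),
     pa.array([10, 20, 30, 40, None], type=pa.float32())],
    names=["a", "b"],
)

tensor = batch.to_tensor(null_to_nan=True, row_major=True)
matrix = tensor.to_numpy()    # 5 x 2 float64 array; the last row is NaN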
- - Returns - ------- - PyCapsule - """ + def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: ... + + def __arrow_c_array__(self, requested_schema=None): ... + + def __arrow_c_stream__(self, requested_schema=None): ... + @classmethod - def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: - """ - Import RecordBatch from a pair of PyCapsules containing a C ArrowSchema - and ArrowArray, respectively. - - Parameters - ---------- - schema_capsule : PyCapsule - A PyCapsule containing a C ArrowSchema representation of the schema. - array_capsule : PyCapsule - A PyCapsule containing a C ArrowArray representation of the array. - - Returns - ------- - pyarrow.RecordBatch - """ - def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: - """ - Export to a C ArrowDeviceArray struct, given its pointer. - - If a C ArrowSchema struct pointer is also given, the record batch - schema is exported to it at the same time. - - Parameters - ---------- - out_ptr: int - The raw pointer to a C ArrowDeviceArray struct. - out_schema_ptr: int (optional) - The raw pointer to a C ArrowSchema struct. - - Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, - array memory will leak. This is a low-level function intended for - expert users. - """ + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... + + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + @classmethod - def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: - """ - Import RecordBatch from a C ArrowDeviceArray struct, given its pointer - and the imported schema. - - Parameters - ---------- - in_ptr: int - The raw pointer to a C ArrowDeviceArray struct. - type: Schema or int - Either a Schema object, or the raw pointer to a C ArrowSchema - struct. - - This is a low-level function intended for expert users. - """ - def __arrow_c_device_array__(self, requested_schema=None, **kwargs): - """ - Get a pair of PyCapsules containing a C ArrowDeviceArray representation - of the object. - - Parameters - ---------- - requested_schema : PyCapsule | None - A PyCapsule containing a C ArrowSchema representation of a requested - schema. PyArrow will attempt to cast the batch to this data type. - If None, the batch will be returned as-is, with a type matching the - one returned by :meth:`__arrow_c_schema__()`. - kwargs - Currently no additional keyword arguments are supported, but - this method will accept any keyword with a value of ``None`` - for compatibility with future keywords. - - Returns - ------- - Tuple[PyCapsule, PyCapsule] - A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, - respectively. - """ + def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: ... + + def __arrow_c_device_array__(self, requested_schema=None, **kwargs): ... + @classmethod - def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: - """ - Import RecordBatch from a pair of PyCapsules containing a - C ArrowSchema and ArrowDeviceArray, respectively. - - Parameters - ---------- - schema_capsule : PyCapsule - A PyCapsule containing a C ArrowSchema representation of the schema. - array_capsule : PyCapsule - A PyCapsule containing a C ArrowDeviceArray representation of the array. - - Returns - ------- - pyarrow.RecordBatch - """ + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... 
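A hedged sketch of the PyCapsule export/import pair typed above; these are low-level, expert-only entry points per the docstrings being removed, and the round trip below is only an illustration:

import pyarrow as pa

batch = pa.record_batch({"n_legs": [2, 4], "animals": ["Parrot", "Dog"]})

# Export as (ArrowSchema, ArrowArray) capsules, then import them back.
schema_capsule, array_capsule = batch.__arrow_c_array__()
copy = pa.RecordBatch._import_from_c_capsule(schema_capsule, array_capsule)
assert copy.equals(batch)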
+ @property - def device_type(self) -> DeviceAllocationType: - """ - The device type where the arrays in the RecordBatch reside. - - Returns - ------- - DeviceAllocationType - """ + def device_type(self) -> DeviceAllocationType: ... + @property - def is_cpu(self) -> bool: - """ - Whether the RecordBatch's arrays are CPU-accessible. - """ - def copy_to(self, destination: MemoryManager | Device) -> Self: - """ - Copy the entire RecordBatch to destination device. - - This copies each column of the record batch to create - a new record batch where all underlying buffers for the columns have - been copied to the destination MemoryManager. - - Parameters - ---------- - destination : pyarrow.MemoryManager or pyarrow.Device - The destination device to copy the array to. - - Returns - ------- - RecordBatch - """ + def is_cpu(self) -> bool: ... + + def copy_to(self, destination: MemoryManager | Device) -> Self: ... + def table_to_blocks(options, table: Table, categories, extension_columns): ... @@ -3197,523 +463,30 @@ JoinType: TypeAlias = Literal[ ] class Table(_Tabular[ChunkedArray[Any]]): - """ - A collection of top-level named, equal length Arrow arrays. - - Warnings - -------- - Do not call this class's constructor directly, use one of the ``from_*`` - methods instead. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Construct a Table from arrays: - - >>> pa.Table.from_arrays([n_legs, animals], names=names) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from a RecordBatch: - - >>> batch = pa.record_batch([n_legs, animals], names=names) - >>> pa.Table.from_batches([batch]) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from pandas DataFrame: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.Table.from_pandas(df) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022,2019,2021]] - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from a dictionary of arrays: - - >>> pydict = {"n_legs": n_legs, "animals": animals} - >>> pa.Table.from_pydict(pydict) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> pa.Table.from_pydict(pydict).schema - n_legs: int64 - animals: string - - Construct a Table from a dictionary of arrays with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from a list of rows: - - >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"year": 2021, "animals": "Centipede"}] - >>> pa.Table.from_pylist(pylist) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,null]] - animals: [["Flamingo","Centipede"]] - - Construct a Table from a list of rows with pyarrow schema: - - >>> my_schema = pa.schema( - ... [ - ... 
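For the Table constructors whose examples are stripped above (from_arrays, from_batches, from_pydict, from_pylist), a compact usage sketch:

import pyarrow as pa

n_legs = pa.array([2, 4, 5, 100])
animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"])
names = ["n_legs", "animals"]

t1 = pa.Table.from_arrays([n_legs, animals], names=names)
t2 = pa.Table.from_batches([pa.record_batch([n_legs, animals], names=names)])
t3 = pa.Table.from_pydict({"n_legs": n_legs, "animals": animals},
                          metadata={"n_legs": "Number of legs per animal"})
t4 = pa.Table.from_pylist([{"n_legs": 2, "animals": "Flamingo"},
                           {"year": 2021, "animals": "Centipede"}])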
pa.field("year", pa.int64()), - ... pa.field("n_legs", pa.int64()), - ... pa.field("animals", pa.string()), - ... ], - ... metadata={"year": "Year of entry"}, - ... ) - >>> pa.Table.from_pylist(pylist, schema=my_schema).schema - year: int64 - n_legs: int64 - animals: string - -- schema metadata -- - year: 'Year of entry' - - Construct a Table with :func:`pyarrow.table`: - - >>> pa.table([n_legs, animals], names=names) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - - def validate(self, *, full: bool = False) -> None: - """ - Perform validation checks. An exception is raised if validation fails. - - By default only cheap validation checks are run. Pass `full=True` - for thorough validation checks (potentially O(n)). - - Parameters - ---------- - full : bool, default False - If True, run expensive checks, otherwise cheap checks only. - - Raises - ------ - ArrowInvalid - """ - def slice(self, offset: int = 0, length: int | None = None) -> Self: - """ - Compute zero-copy slice of this Table. - - Parameters - ---------- - offset : int, default 0 - Offset from start of table to slice. - length : int, default None - Length of slice (default is until end of table starting from - offset). - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.slice(length=3) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022,2019]] - n_legs: [[2,4,5]] - animals: [["Flamingo","Horse","Brittle stars"]] - >>> table.slice(offset=2) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2019,2021]] - n_legs: [[5,100]] - animals: [["Brittle stars","Centipede"]] - >>> table.slice(offset=2, length=1) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2019]] - n_legs: [[5]] - animals: [["Brittle stars"]] - """ - def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: - """ - Select columns of the Table. - - Returns a new Table with the specified columns, and metadata - preserved. - - Parameters - ---------- - columns : list-like - The column names or integer indices to select. - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.select([0, 1]) - pyarrow.Table - year: int64 - n_legs: int64 - ---- - year: [[2020,2022,2019,2021]] - n_legs: [[2,4,5,100]] - >>> table.select(["year"]) - pyarrow.Table - year: int64 - ---- - year: [[2020,2022,2019,2021]] - """ + + + def validate(self, *, full: bool = False) -> None: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: ... 
+ def replace_schema_metadata( self, metadata: dict[str | bytes, str | bytes] | None = None - ) -> Self: - """ - Create shallow copy of table by replacing schema - key-value metadata with the indicated new metadata (which may be None), - which deletes any existing metadata. - - Parameters - ---------- - metadata : dict, default None - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Constructing a Table with pyarrow schema and metadata: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> table = pa.table(df, my_schema) - >>> table.schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - pandas: ... - - Create a shallow copy of a Table with deleted schema metadata: - - >>> table.replace_schema_metadata().schema - n_legs: int64 - animals: string - - Create a shallow copy of a Table with new schema metadata: - - >>> metadata = {"animals": "Which animal"} - >>> table.replace_schema_metadata(metadata=metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - animals: 'Which animal' - """ - def flatten(self, memory_pool: MemoryPool | None = None) -> Self: - """ - Flatten this Table. - - Each column with a struct type is flattened - into one column per struct field. Other columns are left unchanged. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) - >>> month = pa.array([4, 6]) - >>> table = pa.Table.from_arrays([struct, month], names=["a", "month"]) - >>> table - pyarrow.Table - a: struct - child 0, animals: string - child 1, n_legs: int64 - child 2, year: int64 - month: int64 - ---- - a: [ - -- is_valid: all not null - -- child 0 type: string - ["Parrot",null] - -- child 1 type: int64 - [2,4] - -- child 2 type: int64 - [null,2022]] - month: [[4,6]] - - Flatten the columns with struct field: - - >>> table.flatten() - pyarrow.Table - a.animals: string - a.n_legs: int64 - a.year: int64 - month: int64 - ---- - a.animals: [["Parrot",null]] - a.n_legs: [[2,4]] - a.year: [[null,2022]] - month: [[4,6]] - """ - def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: - """ - Make a new table by combining the chunks this table has. - - All the underlying chunks in the ChunkedArray of each column are - concatenated into zero or one chunk. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool. - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> animals = pa.chunked_array( - ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] - ... 
) - >>> names = ["n_legs", "animals"] - >>> table = pa.table([n_legs, animals], names=names) - >>> table - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,2,4],[4,5,100]] - animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] - >>> table.combine_chunks() - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,2,4,4,5,100]] - animals: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] - """ - def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: - """ - Unify dictionaries across all chunks. - - This method returns an equivalent table, but where all chunks of - each column share the same dictionary values. Dictionary indices - are transposed accordingly. - - Columns without dictionaries are returned unchanged. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() - >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() - >>> c_arr = pa.chunked_array([arr_1, arr_2]) - >>> table = pa.table([c_arr], names=["animals"]) - >>> table - pyarrow.Table - animals: dictionary - ---- - animals: [ -- dictionary: - ["Flamingo","Parrot","Dog"] -- indices: - [0,1,2], -- dictionary: - ["Horse","Brittle stars","Centipede"] -- indices: - [0,1,2]] - - Unify dictionaries across both chunks: - - >>> table.unify_dictionaries() - pyarrow.Table - animals: dictionary - ---- - animals: [ -- dictionary: - ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: - [0,1,2], -- dictionary: - ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: - [3,4,5]] - """ - def equals(self, other: Self, check_metadata: bool = False) -> Self: - """ - Check if contents of two tables are equal. - - Parameters - ---------- - other : pyarrow.Table - Table to compare against. - check_metadata : bool, default False - Whether schema metadata equality should be checked as well. - - Returns - ------- - bool - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> names = ["n_legs", "animals"] - >>> table = pa.Table.from_arrays([n_legs, animals], names=names) - >>> table_0 = pa.Table.from_arrays([]) - >>> table_1 = pa.Table.from_arrays( - ... [n_legs, animals], names=names, metadata={"n_legs": "Number of legs per animal"} - ... ) - >>> table.equals(table) - True - >>> table.equals(table_0) - False - >>> table.equals(table_1) - True - >>> table.equals(table_1, check_metadata=True) - False - """ + ) -> Self: ... + + def flatten(self, memory_pool: MemoryPool | None = None) -> Self: ... + + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: ... + + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ... + + def equals(self, other: Self, check_metadata: bool = False) -> Self: ... + def cast( self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None - ) -> Self: - """ - Cast table values to another schema. - - Parameters - ---------- - target_schema : Schema - Schema to cast to, the names and order of fields must match. - safe : bool, default True - Check for overflows or other unsafe conversions. 
- options : CastOptions, default None - Additional checks pass by CastOptions - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.schema - n_legs: int64 - animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... - - Define new schema and cast table values: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] - ... ) - >>> table.cast(target_schema=my_schema) - pyarrow.Table - n_legs: duration[s] - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ + ) -> Self: ... + @classmethod def from_pandas( cls, @@ -3723,70 +496,8 @@ class Table(_Tabular[ChunkedArray[Any]]): nthreads: int | None = None, columns: list[str] | None = None, safe: bool = True, - ) -> Self: - """ - Convert pandas.DataFrame to an Arrow Table. - - The column types in the resulting Arrow Table are inferred from the - dtypes of the pandas.Series in the DataFrame. In the case of non-object - Series, the NumPy dtype is translated to its Arrow equivalent. In the - case of `object`, we need to guess the datatype by looking at the - Python objects in this Series. - - Be aware that Series of the `object` dtype don't carry enough - information to always lead to a meaningful Arrow type. In the case that - we cannot infer a type, e.g. because the DataFrame is of length 0 or - the Series only contains None/nan objects, the type is set to - null. This behavior can be avoided by constructing an explicit schema - and passing it to this function. - - Parameters - ---------- - df : pandas.DataFrame - schema : pyarrow.Schema, optional - The expected schema of the Arrow Table. This can be used to - indicate the type of columns if we cannot infer it automatically. - If passed, the output will have exactly this schema. Columns - specified in the schema that are not found in the DataFrame columns - or its index will raise an error. Additional columns or index - levels in the DataFrame which are not specified in the schema will - be ignored. - preserve_index : bool, optional - Whether to store the index as an additional column in the resulting - ``Table``. The default of None will store the index as a column, - except for RangeIndex which is stored as metadata only. Use - ``preserve_index=True`` to force it to be stored as a column. - nthreads : int, default None - If greater than 1, convert columns to Arrow in parallel using - indicated number of threads. By default, this follows - :func:`pyarrow.cpu_count` (may use up to system CPU count threads). - columns : list, optional - List of column to be converted. If None, use all columns. - safe : bool, default True - Check for overflows or other unsafe conversions. - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.Table.from_pandas(df) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ + ) -> Self: ... 
+ @classmethod def from_arrays( cls, @@ -3794,630 +505,55 @@ class Table(_Tabular[ChunkedArray[Any]]): names: list[str] | None = None, schema: Schema | None = None, metadata: Mapping[str | bytes, str | bytes] | None = None, - ) -> Self: - """ - Construct a Table from Arrow arrays. - - Parameters - ---------- - arrays : list of pyarrow.Array or pyarrow.ChunkedArray - Equal-length arrays that should form the table. - names : list of str, optional - Names for the table columns. If not passed, schema must be passed. - schema : Schema, default None - Schema for the created table. If not passed, names must be passed. - metadata : dict or Mapping, default None - Optional metadata for the schema (if inferred). - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Construct a Table from arrays: - - >>> pa.Table.from_arrays([n_legs, animals], names=names) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from arrays with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from arrays with pyarrow schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"animals": "Name of the animal species"}, - ... ) - >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - animals: 'Name of the animal species' - """ + ) -> Self: ... + @classmethod def from_struct_array( cls, struct_array: StructArray | ChunkedArray[StructScalar] - ) -> Self: - """ - Construct a Table from a StructArray. - - Each field in the StructArray will become a column in the resulting - ``Table``. - - Parameters - ---------- - struct_array : StructArray or ChunkedArray - Array to construct the table from. - - Returns - ------- - pyarrow.Table - - Examples - -------- - >>> import pyarrow as pa - >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) - >>> pa.Table.from_struct_array(struct).to_pandas() - animals n_legs year - 0 Parrot 2 NaN - 1 None 4 2022.0 - """ + ) -> Self: ... + def to_struct_array( self, max_chunksize: int | None = None - ) -> ChunkedArray[StructScalar]: - """ - Convert to a chunked array of struct type. - - Parameters - ---------- - max_chunksize : int, default None - Maximum number of rows for ChunkedArray chunks. Individual chunks - may be smaller depending on the chunk layout of individual columns. - - Returns - ------- - ChunkedArray - """ + ) -> ChunkedArray[StructScalar]: ... 
+ @classmethod - def from_batches(cls, batches: Iterable[RecordBatch], schema: Schema | None = None) -> Self: - """ - Construct a Table from a sequence or iterator of Arrow RecordBatches. - - Parameters - ---------- - batches : sequence or iterator of RecordBatch - Sequence of RecordBatch to be converted, all schemas must be equal. - schema : Schema, default None - If not passed, will be inferred from the first RecordBatch. - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - >>> batch = pa.record_batch([n_legs, animals], names=names) - >>> batch.to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - 2 5 Brittle stars - 3 100 Centipede - - Construct a Table from a RecordBatch: - - >>> pa.Table.from_batches([batch]) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from a sequence of RecordBatches: - - >>> pa.Table.from_batches([batch, batch]) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100],[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Flamingo","Horse","Brittle stars","Centipede"]] - """ - def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: - """ - Convert Table to a list of RecordBatch objects. - - Note that this method is zero-copy, it merely exposes the same data - under a different API. - - Parameters - ---------- - max_chunksize : int, default None - Maximum number of rows for each RecordBatch chunk. Individual chunks - may be smaller depending on the chunk layout of individual columns. - - Returns - ------- - list[RecordBatch] - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Convert a Table to a RecordBatch: - - >>> table.to_batches()[0].to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - 2 5 Brittle stars - 3 100 Centipede - - Convert a Table to a list of RecordBatches: - - >>> table.to_batches(max_chunksize=2)[0].to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - >>> table.to_batches(max_chunksize=2)[1].to_pandas() - n_legs animals - 0 5 Brittle stars - 1 100 Centipede - """ - def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: - """ - Convert the Table to a RecordBatchReader. - - Note that this method is zero-copy, it merely exposes the same data - under a different API. - - Parameters - ---------- - max_chunksize : int, default None - Maximum number of rows for each RecordBatch chunk. Individual chunks - may be smaller depending on the chunk layout of individual columns. - - Returns - ------- - RecordBatchReader - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Convert a Table to a RecordBatchReader: - - >>> table.to_reader() - - - >>> reader = table.to_reader() - >>> reader.schema - n_legs: int64 - animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... 
- >>> reader.read_all() - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ + def from_batches(cls, batches: Iterable[RecordBatch], schema: Schema | None = None) -> Self: ... + + def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: ... + + def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: ... + @property - def schema(self) -> Schema: - """ - Schema of the table and its columns. - - Returns - ------- - Schema - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.schema - n_legs: int64 - animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' ... - """ + def schema(self) -> Schema: ... + @property - def num_columns(self) -> int: - """ - Number of columns in this table. - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.num_columns - 2 - """ + def num_columns(self) -> int: ... + @property - def num_rows(self) -> int: - """ - Number of rows in this table. - - Due to the definition of a table, all columns have the same number of - rows. - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.num_rows - 4 - """ + def num_rows(self) -> int: ... + @property - def nbytes(self) -> int: - """ - Total number of bytes consumed by the elements of the table. - - In other words, the sum of bytes from all buffer ranges referenced. - - Unlike `get_total_buffer_size` this method will account for array - offsets. - - If buffers are shared between arrays then the shared - portion will only be counted multiple times. - - The dictionary of dictionary arrays will always be counted in their - entirety even if the array only references a portion of the dictionary. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.nbytes - 72 - """ - def get_total_buffer_size(self) -> int: - """ - The sum of bytes in each buffer referenced by the table. - - An array may only reference a portion of a buffer. - This method will overestimate in this case and return the - byte size of the entire buffer. - - If a buffer is referenced multiple times then it will - only be counted once. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.get_total_buffer_size() - 76 - """ + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + def __sizeof__(self) -> int: ... 
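A usage sketch for the from_batches / to_batches / to_reader stubs and the size properties typed above:

import pyarrow as pa

table = pa.table({"n_legs": [2, 4, 5, 100],
                  "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"]})

batches = table.to_batches(max_chunksize=2)        # zero-copy list of RecordBatch
reader = table.to_reader(max_chunksize=2)          # RecordBatchReader over the same data
rebuilt = pa.Table.from_batches(batches, schema=table.schema)

n_cols, n_rows = table.num_columns, table.num_rows # 2, 4
sizes = (table.nbytes, table.get_total_buffer_size())  # see the notes on offsets above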
def add_column( self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] - ) -> Self: - """ - Add column to Table at position. - - A new table is returned with the column added, the original table - object is left unchanged. - - Parameters - ---------- - i : int - Index to place the column at. - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array, list of Array, or values coercible to arrays - Column data. - - Returns - ------- - Table - New table with the passed column added. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Add column: - - >>> year = [2021, 2022, 2019, 2021] - >>> table.add_column(0, "year", [year]) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2021,2022,2019,2021]] - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Original table is left unchanged: - - >>> table - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - def remove_column(self, i: int) -> Self: - """ - Create new Table with the indicated column removed. - - Parameters - ---------- - i : int - Index of column to remove. - - Returns - ------- - Table - New table without the column. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.remove_column(1) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[2,4,5,100]] - """ + ) -> Self: ... + + def remove_column(self, i: int) -> Self: ... + def set_column( self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] - ) -> Self: - """ - Replace column in Table at position. - - Parameters - ---------- - i : int - Index to place the column at. - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array, list of Array, or values coercible to arrays - Column data. - - Returns - ------- - Table - New table with the passed column set. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Replace a column: - - >>> year = [2021, 2022, 2019, 2021] - >>> table.set_column(1, "year", [year]) - pyarrow.Table - n_legs: int64 - year: int64 - ---- - n_legs: [[2,4,5,100]] - year: [[2021,2022,2019,2021]] - """ - def rename_columns(self, names: list[str] | dict[str, str]) -> Self: - """ - Create new table with columns renamed to provided names. - - Parameters - ---------- - names : list[str] or dict[str, str] - List of new column names or mapping of old column names to new column names. - - If a mapping of old to new column names is passed, then all columns which are - found to match a provided old column name will be renamed to the new column name. - If any column names are not found in the mapping, a KeyError will be raised. 
- - Raises - ------ - KeyError - If any of the column names passed in the names mapping do not exist. - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> new_names = ["n", "name"] - >>> table.rename_columns(new_names) - pyarrow.Table - n: int64 - name: string - ---- - n: [[2,4,5,100]] - name: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> new_names = {"n_legs": "n", "animals": "name"} - >>> table.rename_columns(new_names) - pyarrow.Table - n: int64 - name: string - ---- - n: [[2,4,5,100]] - name: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - def drop(self, columns: str | list[str]) -> Self: - """ - Drop one or more columns and return a new table. - - Alias of Table.drop_columns, but kept for backwards compatibility. - - Parameters - ---------- - columns : str or list[str] - Field name(s) referencing existing column(s). - - Returns - ------- - Table - New table without the column(s). - """ - def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: - """ - Declare a grouping over the columns of the table. - - Resulting grouping can then be used to perform aggregations - with a subsequent ``aggregate()`` method. - - Parameters - ---------- - keys : str or list[str] - Name of the columns that should be used as the grouping key. - use_threads : bool, default True - Whether to use multithreading or not. When set to True (the - default), no stable ordering of the output is guaranteed. - - Returns - ------- - TableGroupBy - - See Also - -------- - TableGroupBy.aggregate - - Examples - -------- - >>> import pandas as pd - >>> import pyarrow as pa - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.group_by("year").aggregate([("n_legs", "sum")]) - pyarrow.Table - year: int64 - n_legs_sum: int64 - ---- - year: [[2020,2022,2021,2019]] - n_legs_sum: [[2,6,104,5]] - """ + ) -> Self: ... + + def rename_columns(self, names: list[str] | dict[str, str]) -> Self: ... + + def drop(self, columns: str | list[str]) -> Self: ... + + def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: ... + def join( self, right_table: Self, @@ -4428,110 +564,8 @@ class Table(_Tabular[ChunkedArray[Any]]): right_suffix: str | None = None, coalesce_keys: bool = True, use_threads: bool = True, - ) -> Self: - """ - Perform a join between this table and another one. - - Result of the join will be a new Table, where further - operations can be applied. - - Parameters - ---------- - right_table : Table - The table to join to the current one, acting as the right table - in the join operation. - keys : str or list[str] - The columns from current table that should be used as keys - of the join operation left side. - right_keys : str or list[str], default None - The columns from the right_table that should be used as keys - on the join operation right side. - When ``None`` use the same key names as the left table. 
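A sketch of the column-manipulation and group_by stubs above (add_column, set_column, rename_columns, drop, group_by); the ("animals", "count") aggregation pair is illustrative:

import pyarrow as pa

table = pa.table({"n_legs": [2, 4, 5, 100],
                  "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"]})

with_year = table.add_column(0, "year", [[2021, 2022, 2019, 2021]])
replaced = table.set_column(1, "year", [[2021, 2022, 2019, 2021]])
renamed = table.rename_columns({"n_legs": "n", "animals": "name"})   # dict or list form
dropped = with_year.drop(["year"])
counts = table.group_by("n_legs").aggregate([("animals", "count")])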
- join_type : str, default "left outer" - The kind of join that should be performed, one of - ("left semi", "right semi", "left anti", "right anti", - "inner", "left outer", "right outer", "full outer") - left_suffix : str, default None - Which suffix to add to left column names. This prevents confusion - when the columns in left and right tables have colliding names. - right_suffix : str, default None - Which suffix to add to the right column names. This prevents confusion - when the columns in left and right tables have colliding names. - coalesce_keys : bool, default True - If the duplicated keys should be omitted from one of the sides - in the join result. - use_threads : bool, default True - Whether to use multithreading or not. - - Returns - ------- - Table - - Examples - -------- - >>> import pandas as pd - >>> import pyarrow as pa - >>> df1 = pd.DataFrame({"id": [1, 2, 3], "year": [2020, 2022, 2019]}) - >>> df2 = pd.DataFrame( - ... {"id": [3, 4], "n_legs": [5, 100], "animal": ["Brittle stars", "Centipede"]} - ... ) - >>> t1 = pa.Table.from_pandas(df1) - >>> t2 = pa.Table.from_pandas(df2) - - Left outer join: - - >>> t1.join(t2, "id").combine_chunks().sort_by("year") - pyarrow.Table - id: int64 - year: int64 - n_legs: int64 - animal: string - ---- - id: [[3,1,2]] - year: [[2019,2020,2022]] - n_legs: [[5,null,null]] - animal: [["Brittle stars",null,null]] - - Full outer join: - - >>> t1.join(t2, "id", join_type="full outer").combine_chunks().sort_by("year") - pyarrow.Table - id: int64 - year: int64 - n_legs: int64 - animal: string - ---- - id: [[3,1,2,4]] - year: [[2019,2020,2022,null]] - n_legs: [[5,null,null,100]] - animal: [["Brittle stars",null,null,"Centipede"]] - - Right outer join: - - >>> t1.join(t2, "id", join_type="right outer").combine_chunks().sort_by("year") - pyarrow.Table - year: int64 - id: int64 - n_legs: int64 - animal: string - ---- - year: [[2019,null]] - id: [[3,4]] - n_legs: [[5,100]] - animal: [["Brittle stars","Centipede"]] - - Right anti join - - >>> t1.join(t2, "id", join_type="right anti") - pyarrow.Table - id: int64 - n_legs: int64 - animal: string - ---- - id: [[4]] - n_legs: [[100]] - animal: [["Centipede"]] - """ + ) -> Self: ... + def join_asof( self, right_table: Self, @@ -4540,109 +574,13 @@ class Table(_Tabular[ChunkedArray[Any]]): tolerance: int, right_on: str | list[str] | None = None, right_by: str | list[str] | None = None, - ) -> Self: - """ - Perform an asof join between this table and another one. - - This is similar to a left-join except that we match on nearest key rather - than equal keys. Both tables must be sorted by the key. This type of join - is most useful for time series data that are not perfectly aligned. - - Optionally match on equivalent keys with "by" before searching with "on". - - Result of the join will be a new Table, where further - operations can be applied. - - Parameters - ---------- - right_table : Table - The table to join to the current one, acting as the right table - in the join operation. - on : str - The column from current table that should be used as the "on" key - of the join operation left side. - - An inexact match is used on the "on" key, i.e. a row is considered a - match if and only if left_on - tolerance <= right_on <= left_on. - - The input dataset must be sorted by the "on" key. Must be a single - field of a common type. - - Currently, the "on" key must be an integer, date, or timestamp type. 
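An illustration of Table.join as typed above, using join types listed in the removed docstring:

import pyarrow as pa

t1 = pa.table({"id": [1, 2, 3], "year": [2020, 2022, 2019]})
t2 = pa.table({"id": [3, 4], "n_legs": [5, 100],
               "animal": ["Brittle stars", "Centipede"]})

left = t1.join(t2, keys="id")                            # default "left outer"
full = t1.join(t2, keys="id", join_type="full outer")
anti = t1.join(t2, keys="id", join_type="right anti")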
- by : str or list[str] - The columns from current table that should be used as the keys - of the join operation left side. The join operation is then done - only for the matches in these columns. - tolerance : int - The tolerance for inexact "on" key matching. A right row is considered - a match with the left row ``right.on - left.on <= tolerance``. The - ``tolerance`` may be: - - - negative, in which case a past-as-of-join occurs; - - or positive, in which case a future-as-of-join occurs; - - or zero, in which case an exact-as-of-join occurs. - - The tolerance is interpreted in the same units as the "on" key. - right_on : str or list[str], default None - The columns from the right_table that should be used as the on key - on the join operation right side. - When ``None`` use the same key name as the left table. - right_by : str or list[str], default None - The columns from the right_table that should be used as keys - on the join operation right side. - When ``None`` use the same key names as the left table. - - Returns - ------- - Table - - Example - -------- - >>> import pyarrow as pa - >>> t1 = pa.table({"id": [1, 3, 2, 3, 3], "year": [2020, 2021, 2022, 2022, 2023]}) - >>> t2 = pa.table( - ... { - ... "id": [3, 4], - ... "year": [2020, 2021], - ... "n_legs": [5, 100], - ... "animal": ["Brittle stars", "Centipede"], - ... } - ... ) - - >>> t1.join_asof(t2, on="year", by="id", tolerance=-2) - pyarrow.Table - id: int64 - year: int64 - n_legs: int64 - animal: string - ---- - id: [[1,3,2,3,3]] - year: [[2020,2021,2022,2022,2023]] - n_legs: [[null,5,null,5,null]] - animal: [[null,"Brittle stars",null,"Brittle stars",null]] - """ - def __arrow_c_stream__(self, requested_schema=None): - """ - Export the table as an Arrow C stream PyCapsule. - - Parameters - ---------- - requested_schema : PyCapsule, default None - The schema to which the stream should be casted, passed as a - PyCapsule containing a C ArrowSchema representation of the - requested schema. - Currently, this is not supported and will raise a - NotImplementedError if the schema doesn't match the current schema. - - Returns - ------- - PyCapsule - """ + ) -> Self: ... + + def __arrow_c_stream__(self, requested_schema=None): ... + @property - def is_cpu(self) -> bool: - """ - Whether all ChunkedArrays are CPU-accessible. - """ + def is_cpu(self) -> bool: ... + def record_batch( data: dict[str, list[Any] | Array[Any]] @@ -4653,138 +591,8 @@ def record_batch( names: list[str] | None = None, schema: Schema | None = None, metadata: Mapping[str | bytes, str | bytes] | None = None, -) -> RecordBatch: - """ - Create a pyarrow.RecordBatch from another Python data structure or sequence - of arrays. - - Parameters - ---------- - data : dict, list, pandas.DataFrame, Arrow-compatible table - A mapping of strings to Arrays or Python lists, a list of Arrays, - a pandas DataFame, or any tabular object implementing the - Arrow PyCapsule Protocol (has an ``__arrow_c_array__`` or - ``__arrow_c_device_array__`` method). - names : list, default None - Column names if list of arrays passed as data. Mutually exclusive with - 'schema' argument. - schema : Schema, default None - The expected schema of the RecordBatch. If not passed, will be inferred - from the data. Mutually exclusive with 'names' argument. - metadata : dict or Mapping, default None - Optional metadata for the schema (if schema not passed). 
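A sketch of Table.join_asof based on the example stripped above; both tables must be sorted by the "on" key, and the tolerance is interpreted in the same units as that key:

import pyarrow as pa

t1 = pa.table({"id": [1, 3, 2, 3, 3], "year": [2020, 2021, 2022, 2022, 2023]})
t2 = pa.table({"id": [3, 4], "year": [2020, 2021],
               "n_legs": [5, 100], "animal": ["Brittle stars", "Centipede"]})

# Negative tolerance: match right rows whose "year" is at most 2 behind the left row.
joined = t1.join_asof(t2, on="year", by="id", tolerance=-2)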
- - Returns - ------- - RecordBatch - - See Also - -------- - RecordBatch.from_arrays, RecordBatch.from_pandas, table - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Construct a RecordBatch from a python dictionary: - - >>> pa.record_batch({"n_legs": n_legs, "animals": animals}) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - >>> pa.record_batch({"n_legs": n_legs, "animals": animals}).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - Creating a RecordBatch from a list of arrays with names: - - >>> pa.record_batch([n_legs, animals], names=names) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - - Creating a RecordBatch from a list of arrays with names and metadata: - - >>> my_metadata = {"n_legs": "How many legs does an animal have?"} - >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'How many legs does an animal have?' - - Creating a RecordBatch from a pandas DataFrame: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022], - ... "month": [3, 5, 7, 9], - ... "day": [1, 5, 9, 13], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.record_batch(df) - pyarrow.RecordBatch - year: int64 - month: int64 - day: int64 - n_legs: int64 - animals: string - ---- - year: [2020,2022,2021,2022] - month: [3,5,7,9] - day: [1,5,9,13] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - - >>> pa.record_batch(df).to_pandas() - year month day n_legs animals - 0 2020 3 1 2 Flamingo - 1 2022 5 5 4 Horse - 2 2021 7 9 5 Brittle stars - 3 2022 9 13 100 Centipede - - Creating a RecordBatch from a pandas DataFrame with schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.record_batch(df, my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - pandas: ... - >>> pa.record_batch(df, my_schema).to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - 2 5 Brittle stars - 3 100 Centipede - """ +) -> RecordBatch: ... + def table( data: dict[str, list[Any] | Array[Any]] @@ -4797,223 +605,19 @@ def table( schema: Schema | None = None, metadata: Mapping[str | bytes, str | bytes] | None = None, nthreads: int | None = None, -) -> Table: - """ - Create a pyarrow.Table from a Python data structure or sequence of arrays. 
- - Parameters - ---------- - data : dict, list, pandas.DataFrame, Arrow-compatible table - A mapping of strings to Arrays or Python lists, a list of arrays or - chunked arrays, a pandas DataFame, or any tabular object implementing - the Arrow PyCapsule Protocol (has an ``__arrow_c_array__``, - ``__arrow_c_device_array__`` or ``__arrow_c_stream__`` method). - names : list, default None - Column names if list of arrays passed as data. Mutually exclusive with - 'schema' argument. - schema : Schema, default None - The expected schema of the Arrow Table. If not passed, will be inferred - from the data. Mutually exclusive with 'names' argument. - If passed, the output will have exactly this schema (raising an error - when columns are not found in the data and ignoring additional data not - specified in the schema, when data is a dict or DataFrame). - metadata : dict or Mapping, default None - Optional metadata for the schema (if schema not passed). - nthreads : int, default None - For pandas.DataFrame inputs: if greater than 1, convert columns to - Arrow in parallel using indicated number of threads. By default, - this follows :func:`pyarrow.cpu_count` (may use up to system CPU count - threads). - - Returns - ------- - Table - - See Also - -------- - Table.from_arrays, Table.from_pandas, Table.from_pydict - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Construct a Table from a python dictionary: - - >>> pa.table({"n_legs": n_legs, "animals": animals}) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from arrays: - - >>> pa.table([n_legs, animals], names=names) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from arrays with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.table([n_legs, animals], names=names, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from pandas DataFrame: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.table(df) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022,2019,2021]] - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from pandas DataFrame with pyarrow schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.table(df, my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - pandas: '{"index_columns": [], "column_indexes": [{"name": null, ... - - Construct a Table from chunked arrays: - - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> animals = pa.chunked_array( - ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] - ... 
) - >>> table = pa.table([n_legs, animals], names=names) - >>> table - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,2,4],[4,5,100]] - animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] - """ +) -> Table: ... + def concat_tables( tables: Iterable[Table], memory_pool: MemoryPool | None = None, promote_options: Literal["none", "default", "permissive"] = "none", **kwargs: Any, -) -> Table: - """ - Concatenate pyarrow.Table objects. - - If promote_options="none", a zero-copy concatenation will be performed. The schemas - of all the Tables must be the same (except the metadata), otherwise an - exception will be raised. The result Table will share the metadata with the - first table. - - If promote_options="default", any null type arrays will be casted to the type of other - arrays in the column of the same name. If a table is missing a particular - field, null values of the appropriate type will be generated to take the - place of the missing field. The new schema will share the metadata with the - first table. Each field in the new schema will share the metadata with the - first table which has the field defined. Note that type promotions may - involve additional allocations on the given ``memory_pool``. - - If promote_options="permissive", the behavior of default plus types will be promoted - to the common denominator that fits all the fields. - - Parameters - ---------- - tables : iterable of pyarrow.Table objects - Pyarrow tables to concatenate into a single Table. - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool. - promote_options : str, default none - Accepts strings "none", "default" and "permissive". - **kwargs : dict, optional - - Examples - -------- - >>> import pyarrow as pa - >>> t1 = pa.table( - ... [ - ... pa.array([2, 4, 5, 100]), - ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), - ... ], - ... names=["n_legs", "animals"], - ... ) - >>> t2 = pa.table([pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"]) - >>> pa.concat_tables([t1, t2]) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100],[2,4]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Parrot","Dog"]] - - """ +) -> Table: ... + class TableGroupBy: - """ - A grouping of columns in a table on which to perform aggregations. - - Parameters - ---------- - table : pyarrow.Table - Input table to execute the aggregation on. - keys : str or list[str] - Name of the grouped columns. - use_threads : bool, default True - Whether to use multithreading or not. When set to True (the default), - no stable ordering of the output is guaranteed. - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.table( - ... [ - ... pa.array(["a", "a", "b", "b", "c"]), - ... pa.array([1, 2, 3, 4, 5]), - ... ], - ... names=["keys", "values"], - ... ) - - Grouping of columns: - - >>> pa.TableGroupBy(t, "keys") - - - Perform aggregations: - - >>> pa.TableGroupBy(t, "keys").aggregate([("values", "sum")]) - pyarrow.Table - keys: string - values_sum: int64 - ---- - keys: [["a","b","c"]] - values_sum: [[3,7,5]] - """ + keys: str | list[str] def __init__(self, table: Table, keys: str | list[str], use_threads: bool = True): ... @@ -5023,138 +627,16 @@ class TableGroupBy: tuple[ColumnSelector, Aggregation] | tuple[ColumnSelector, Aggregation, AggregateOptions | None] ], - ) -> Table: - """ - Perform an aggregation over the grouped columns of the table. 
- - Parameters - ---------- - aggregations : list[tuple(str, str)] or \ -list[tuple(str, str, FunctionOptions)] - List of tuples, where each tuple is one aggregation specification - and consists of: aggregation column name followed - by function name and optionally aggregation function option. - Pass empty list to get a single row for each group. - The column name can be a string, an empty list or a list of - column names, for unary, nullary and n-ary aggregation functions - respectively. - - For the list of function names and respective aggregation - function options see :ref:`py-grouped-aggrs`. - - Returns - ------- - Table - Results of the aggregation functions. - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.table([ - ... pa.array(["a", "a", "b", "b", "c"]), - ... pa.array([1, 2, 3, 4, 5]), - ... ], names=["keys", "values"]) - - Sum the column "values" over the grouped column "keys": - - >>> t.group_by("keys").aggregate([("values", "sum")]) - pyarrow.Table - keys: string - values_sum: int64 - ---- - keys: [["a","b","c"]] - values_sum: [[3,7,5]] - - Count the rows over the grouped column "keys": - - >>> t.group_by("keys").aggregate([([], "count_all")]) - pyarrow.Table - keys: string - count_all: int64 - ---- - keys: [["a","b","c"]] - count_all: [[2,2,1]] - - Do multiple aggregations: - - >>> t.group_by("keys").aggregate([ - ... ("values", "sum"), - ... ("keys", "count") - ... ]) - pyarrow.Table - keys: string - values_sum: int64 - keys_count: int64 - ---- - keys: [["a","b","c"]] - values_sum: [[3,7,5]] - keys_count: [[2,2,1]] - - Count the number of non-null values for column "values" - over the grouped column "keys": - - >>> import pyarrow.compute as pc - >>> t.group_by(["keys"]).aggregate([ - ... ("values", "count", pc.CountOptions(mode="only_valid")) - ... ]) - pyarrow.Table - keys: string - values_count: int64 - ---- - keys: [["a","b","c"]] - values_count: [[2,2,1]] - - Get a single row for each group in column "keys": - - >>> t.group_by("keys").aggregate([]) - pyarrow.Table - keys: string - ---- - keys: [["a","b","c"]] - """ + ) -> Table: ... + def _table(self) -> Table: ... @property def _use_threads(self) -> bool: ... def concat_batches( recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None -) -> RecordBatch: - """ - Concatenate pyarrow.RecordBatch objects. - - All recordbatches must share the same Schema, - the operation implies a copy of the data to merge - the arrays of the different RecordBatches. - - Parameters - ---------- - recordbatches : iterable of pyarrow.RecordBatch objects - Pyarrow record batches to concatenate into a single RecordBatch. - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool. - - Examples - -------- - >>> import pyarrow as pa - >>> t1 = pa.record_batch( - ... [ - ... pa.array([2, 4, 5, 100]), - ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), - ... ], - ... names=["n_legs", "animals"], - ... ) - >>> t2 = pa.record_batch( - ... [pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"] - ... ) - >>> pa.concat_batches([t1, t2]) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,4,5,100,2,4] - animals: ["Flamingo","Horse","Brittle stars","Centipede","Parrot","Dog"] - - """ +) -> RecordBatch: ... 
+ __all__ = [ "ChunkedArray", diff --git a/python/pyarrow-stubs/tensor.pyi b/python/pyarrow-stubs/tensor.pyi index 7e9b86ea1cd..471f0ec1e98 100644 --- a/python/pyarrow-stubs/tensor.pyi +++ b/python/pyarrow-stubs/tensor.pyi @@ -29,219 +29,44 @@ from scipy.sparse import coo_matrix, csr_matrix from sparse import COO # type: ignore class Tensor(_Weakrefable): - """ - A n-dimensional array a.k.a Tensor. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) - - type: int32 - shape: (2, 3) - strides: (12, 4) - """ + @classmethod - def from_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: - """ - Create a Tensor from a numpy array. - - Parameters - ---------- - obj : numpy.ndarray - The source numpy array - dim_names : list, optional - Names of each dimension of the Tensor. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) - - type: int32 - shape: (2, 3) - strides: (12, 4) - """ - def to_numpy(self) -> np.ndarray: - """ - Convert arrow::Tensor to numpy.ndarray with zero copy - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) - >>> tensor.to_numpy() - array([[ 2, 2, 4], - [ 4, 5, 100]], dtype=int32) - """ - def equals(self, other: Tensor) -> bool: - """ - Return true if the tensors contains exactly equal data. - - Parameters - ---------- - other : Tensor - The other tensor to compare for equality. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) - >>> y = np.array([[2, 2, 4], [4, 5, 10]], np.int32) - >>> tensor2 = pa.Tensor.from_numpy(y, dim_names=["a","b"]) - >>> tensor.equals(tensor) - True - >>> tensor.equals(tensor2) - False - """ - def dim_name(self, i: int) -> str: - """ - Returns the name of the i-th tensor dimension. - - Parameters - ---------- - i : int - The physical index of the tensor dimension. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) - >>> tensor.dim_name(0) - 'dim1' - >>> tensor.dim_name(1) - 'dim2' - """ - @property - def dim_names(self) -> list[str]: - """ - Names of this tensor dimensions. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) - >>> tensor.dim_names - ['dim1', 'dim2'] - """ - @property - def is_mutable(self) -> bool: - """ - Is this tensor mutable or immutable. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) - >>> tensor.is_mutable - True - """ - @property - def is_contiguous(self) -> bool: - """ - Is this tensor contiguous in memory. 
- - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) - >>> tensor.is_contiguous - True - """ - @property - def ndim(self) -> int: - """ - The dimension (n) of this tensor. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) - >>> tensor.ndim - 2 - """ - @property - def size(self) -> str: - """ - The size of this tensor. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) - >>> tensor.size - 6 - """ - @property - def shape(self) -> tuple[int, ...]: - """ - The shape of this tensor. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) - >>> tensor.shape - (2, 3) - """ - @property - def strides(self) -> tuple[int, ...]: - """ - Strides of this tensor. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"]) - >>> tensor.strides - (12, 4) - """ + def from_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + + def to_numpy(self) -> np.ndarray: ... + + def equals(self, other: Tensor) -> bool: ... + + def dim_name(self, i: int) -> str: ... + + @property + def dim_names(self) -> list[str]: ... + + @property + def is_mutable(self) -> bool: ... + + @property + def is_contiguous(self) -> bool: ... + + @property + def ndim(self) -> int: ... + + @property + def size(self) -> str: ... + + @property + def shape(self) -> tuple[int, ...]: ... + + @property + def strides(self) -> tuple[int, ...]: ... + class SparseCOOTensor(_Weakrefable): - """ - A sparse COO tensor. - """ + @classmethod - def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: - """ - Convert numpy.ndarray to arrow::SparseCOOTensor - - Parameters - ---------- - obj : numpy.ndarray - Data used to populate the rows. - dim_names : list[str], optional - Names of the dimensions. - - Returns - ------- - pyarrow.SparseCOOTensor - """ + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + @classmethod def from_numpy( @@ -250,80 +75,27 @@ class SparseCOOTensor(_Weakrefable): coords: np.ndarray, shape: tuple[int, ...], dim_names: list[str] | None = None, - ) -> Self: - """ - Create arrow::SparseCOOTensor from numpy.ndarrays - - Parameters - ---------- - data : numpy.ndarray - Data used to populate the rows. - coords : numpy.ndarray - Coordinates of the data. - shape : tuple - Shape of the tensor. - dim_names : list, optional - Names of the dimensions. - """ + ) -> Self: ... + @classmethod - def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: - """ - Convert scipy.sparse.coo_array or scipy.sparse.coo_matrix to arrow::SparseCOOTensor - - Parameters - ---------- - obj : scipy.sparse.coo_array or scipy.sparse.coo_matrix - The scipy array or matrix that should be converted. - dim_names : list, optional - Names of the dimensions. - """ + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: ... 
+ @classmethod - def from_pydata_sparse(cls, obj: COO, dim_names: list[str] | None = None) -> Self: - """ - Convert pydata/sparse.COO to arrow::SparseCOOTensor. - - Parameters - ---------- - obj : pydata.sparse.COO - The sparse multidimensional array that should be converted. - dim_names : list, optional - Names of the dimensions. - """ + def from_pydata_sparse(cls, obj: COO, dim_names: list[str] | None = None) -> Self: ... + @classmethod - def from_tensor(cls, obj: Tensor) -> Self: - """ - Convert arrow::Tensor to arrow::SparseCOOTensor. - - Parameters - ---------- - obj : Tensor - The tensor that should be converted. - """ - def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: - """ - Convert arrow::SparseCOOTensor to numpy.ndarrays with zero copy. - """ - def to_scipy(self) -> coo_matrix: - """ - Convert arrow::SparseCOOTensor to scipy.sparse.coo_array. - """ - def to_pydata_sparse(self) -> COO: - """ - Convert arrow::SparseCOOTensor to pydata/sparse.COO. - """ - def to_tensor(self) -> Tensor: - """ - Convert arrow::SparseCOOTensor to arrow::Tensor. - """ - def equals(self, other: Self) -> bool: - """ - Return true if sparse tensors contains exactly equal data. - - Parameters - ---------- - other : SparseCOOTensor - The other tensor to compare for equality. - """ + def from_tensor(cls, obj: Tensor) -> Self: ... + + def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: ... + + def to_scipy(self) -> coo_matrix: ... + + def to_pydata_sparse(self) -> COO: ... + + def to_tensor(self) -> Tensor: ... + + def equals(self, other: Self) -> bool: ... + @property def is_mutable(self) -> bool: ... @property @@ -332,19 +104,8 @@ class SparseCOOTensor(_Weakrefable): def size(self) -> str: ... @property def shape(self) -> tuple[int, ...]: ... - def dim_name(self, i: int) -> str: - """ - Returns the name of the i-th tensor dimension. - - Parameters - ---------- - i : int - The physical index of the tensor dimension. + def dim_name(self, i: int) -> str: ... - Returns - ------- - str - """ @property def dim_names(self) -> list[str]: ... @property @@ -353,26 +114,11 @@ class SparseCOOTensor(_Weakrefable): def has_canonical_format(self) -> bool: ... class SparseCSRMatrix(_Weakrefable): - """ - A sparse CSR matrix. - """ + @classmethod - def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: - """ - Convert numpy.ndarray to arrow::SparseCSRMatrix - - Parameters - ---------- - obj : numpy.ndarray - The dense numpy array that should be converted. - dim_names : list, optional - The names of the dimensions. - - Returns - ------- - pyarrow.SparseCSRMatrix - """ + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + @classmethod def from_numpy( cls, @@ -381,67 +127,22 @@ class SparseCSRMatrix(_Weakrefable): indices: np.ndarray, shape: tuple[int, ...], dim_names: list[str] | None = None, - ) -> Self: - """ - Create arrow::SparseCSRMatrix from numpy.ndarrays. - - Parameters - ---------- - data : numpy.ndarray - Data used to populate the sparse matrix. - indptr : numpy.ndarray - Range of the rows, - The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. - indices : numpy.ndarray - Column indices of the corresponding non-zero values. - shape : tuple - Shape of the matrix. - dim_names : list, optional - Names of the dimensions. - """ + ) -> Self: ... 
+ @classmethod - def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: - """ - Convert scipy.sparse.csr_array or scipy.sparse.csr_matrix to arrow::SparseCSRMatrix. - - Parameters - ---------- - obj : scipy.sparse.csr_array or scipy.sparse.csr_matrix - The scipy matrix that should be converted. - dim_names : list, optional - Names of the dimensions. - """ + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: ... + @classmethod - def from_tensor(cls, obj: Tensor) -> Self: - """ - Convert arrow::Tensor to arrow::SparseCSRMatrix. - - Parameters - ---------- - obj : Tensor - The dense tensor that should be converted. - """ - def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Convert arrow::SparseCSRMatrix to numpy.ndarrays with zero copy. - """ - def to_scipy(self) -> csr_matrix: - """ - Convert arrow::SparseCSRMatrix to scipy.sparse.csr_array. - """ - def to_tensor(self) -> Tensor: - """ - Convert arrow::SparseCSRMatrix to arrow::Tensor. - """ - def equals(self, other: Self) -> bool: - """ - Return true if sparse tensors contains exactly equal data. - - Parameters - ---------- - other : SparseCSRMatrix - The other tensor to compare for equality. - """ + def from_tensor(cls, obj: Tensor) -> Self: ... + + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... + + def to_scipy(self) -> csr_matrix: ... + + def to_tensor(self) -> Tensor: ... + + def equals(self, other: Self) -> bool: ... + @property def is_mutable(self) -> bool: ... @property @@ -450,45 +151,19 @@ class SparseCSRMatrix(_Weakrefable): def size(self) -> str: ... @property def shape(self) -> tuple[int, ...]: ... - def dim_name(self, i: int) -> str: - """ - Returns the name of the i-th tensor dimension. - - Parameters - ---------- - i : int - The physical index of the tensor dimension. + def dim_name(self, i: int) -> str: ... - Returns - ------- - str - """ @property def dim_names(self) -> list[str]: ... @property def non_zero_length(self) -> int: ... class SparseCSCMatrix(_Weakrefable): - """ - A sparse CSC matrix. - """ + @classmethod - def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: - """ - Convert numpy.ndarray to arrow::SparseCSCMatrix - - Parameters - ---------- - obj : numpy.ndarray - Data used to populate the rows. - dim_names : list[str], optional - Names of the dimensions. - - Returns - ------- - pyarrow.SparseCSCMatrix - """ + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + @classmethod def from_numpy( cls, @@ -497,67 +172,22 @@ class SparseCSCMatrix(_Weakrefable): indices: np.ndarray, shape: tuple[int, ...], dim_names: list[str] | None = None, - ) -> Self: - """ - Create arrow::SparseCSCMatrix from numpy.ndarrays - - Parameters - ---------- - data : numpy.ndarray - Data used to populate the sparse matrix. - indptr : numpy.ndarray - Range of the rows, - The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. - indices : numpy.ndarray - Column indices of the corresponding non-zero values. - shape : tuple - Shape of the matrix. - dim_names : list, optional - Names of the dimensions. - """ + ) -> Self: ... + @classmethod - def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: - """ - Convert scipy.sparse.csc_array or scipy.sparse.csc_matrix to arrow::SparseCSCMatrix - - Parameters - ---------- - obj : scipy.sparse.csc_array or scipy.sparse.csc_matrix - The scipy matrix that should be converted. 
- dim_names : list, optional - Names of the dimensions. - """ + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: ... + @classmethod - def from_tensor(cls, obj: Tensor) -> Self: - """ - Convert arrow::Tensor to arrow::SparseCSCMatrix - - Parameters - ---------- - obj : Tensor - The dense tensor that should be converted. - """ - def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Convert arrow::SparseCSCMatrix to numpy.ndarrays with zero copy - """ - def to_scipy(self) -> csr_matrix: - """ - Convert arrow::SparseCSCMatrix to scipy.sparse.csc_array - """ - def to_tensor(self) -> Tensor: - """ - Convert arrow::SparseCSCMatrix to arrow::Tensor - """ - def equals(self, other: Self) -> bool: - """ - Return true if sparse tensors contains exactly equal data - - Parameters - ---------- - other : SparseCSCMatrix - The other tensor to compare for equality. - """ + def from_tensor(cls, obj: Tensor) -> Self: ... + + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... + + def to_scipy(self) -> csr_matrix: ... + + def to_tensor(self) -> Tensor: ... + + def equals(self, other: Self) -> bool: ... + @property def is_mutable(self) -> bool: ... @property @@ -566,52 +196,19 @@ class SparseCSCMatrix(_Weakrefable): def size(self) -> str: ... @property def shape(self) -> tuple[int, ...]: ... - def dim_name(self, i: int) -> str: - """ - Returns the name of the i-th tensor dimension. - - Parameters - ---------- - i : int - The physical index of the tensor dimension. + def dim_name(self, i: int) -> str: ... - Returns - ------- - str - """ @property def dim_names(self) -> list[str]: ... @property def non_zero_length(self) -> int: ... class SparseCSFTensor(_Weakrefable): - """ - A sparse CSF tensor. - CSF is a generalization of compressed sparse row (CSR) index. - - CSF index recursively compresses each dimension of a tensor into a set - of prefix trees. Each path from a root to leaf forms one tensor - non-zero index. CSF is implemented with two arrays of buffers and one - arrays of integers. - """ @classmethod - def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: - """ - Convert numpy.ndarray to arrow::SparseCSFTensor - - Parameters - ---------- - obj : numpy.ndarray - Data used to populate the rows. - dim_names : list[str], optional - Names of the dimensions. - - Returns - ------- - pyarrow.SparseCSFTensor - """ + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + @classmethod def from_numpy( cls, @@ -621,59 +218,17 @@ class SparseCSFTensor(_Weakrefable): shape: tuple[int, ...], axis_order: list[int] | None = None, dim_names: list[str] | None = None, - ) -> Self: - """ - Create arrow::SparseCSFTensor from numpy.ndarrays - - Parameters - ---------- - data : numpy.ndarray - Data used to populate the sparse tensor. - indptr : numpy.ndarray - The sparsity structure. - Each two consecutive dimensions in a tensor correspond to - a buffer in indices. - A pair of consecutive values at `indptr[dim][i]` - `indptr[dim][i + 1]` signify a range of nodes in - `indices[dim + 1]` who are children of `indices[dim][i]` node. - indices : numpy.ndarray - Stores values of nodes. - Each tensor dimension corresponds to a buffer in indptr. - shape : tuple - Shape of the matrix. - axis_order : list, optional - the sequence in which dimensions were traversed to - produce the prefix tree. - dim_names : list, optional - Names of the dimensions. - """ + ) -> Self: ... 
+ @classmethod - def from_tensor(cls, obj: Tensor) -> Self: - """ - Convert arrow::Tensor to arrow::SparseCSFTensor - - Parameters - ---------- - obj : Tensor - The dense tensor that should be converted. - """ - def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Convert arrow::SparseCSFTensor to numpy.ndarrays with zero copy - """ - def to_tensor(self) -> Tensor: - """ - Convert arrow::SparseCSFTensor to arrow::Tensor - """ - def equals(self, other: Self) -> bool: - """ - Return true if sparse tensors contains exactly equal data - - Parameters - ---------- - other : SparseCSFTensor - The other tensor to compare for equality. - """ + def from_tensor(cls, obj: Tensor) -> Self: ... + + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... + + def to_tensor(self) -> Tensor: ... + + def equals(self, other: Self) -> bool: ... + @property def is_mutable(self) -> bool: ... @property @@ -682,19 +237,8 @@ class SparseCSFTensor(_Weakrefable): def size(self) -> str: ... @property def shape(self) -> tuple[int, ...]: ... - def dim_name(self, i: int) -> str: - """ - Returns the name of the i-th tensor dimension. - - Parameters - ---------- - i : int - The physical index of the tensor dimension. - - Returns - ------- - str - """ + def dim_name(self, i: int) -> str: ... + @property def dim_names(self) -> list[str]: ... @property diff --git a/python/pyarrow-stubs/types.pyi b/python/pyarrow-stubs/types.pyi index 98181f6acc2..def5e3771ab 100644 --- a/python/pyarrow-stubs/types.pyi +++ b/python/pyarrow-stubs/types.pyi @@ -87,7 +87,8 @@ _Decimal: TypeAlias = ( _Date: TypeAlias = Date32Type | Date64Type _Time: TypeAlias = Time32Type[Any] | Time64Type[Any] _Interval: TypeAlias = MonthDayNanoIntervalType -_Temporal: TypeAlias = TimestampType[Any, Any] | DurationType[Any] | _Time | _Date | _Interval +_Temporal: TypeAlias = TimestampType[Any, + Any] | DurationType[Any] | _Time | _Date | _Interval _Union: TypeAlias = SparseUnionType | DenseUnionType _Nested: TypeAlias = ( ListType[Any] @@ -100,6 +101,7 @@ _Nested: TypeAlias = ( | _Union ) + def is_null(t: DataType) -> TypeIs[NullType]: ... def is_boolean(t: DataType) -> TypeIs[BoolType]: ... def is_integer(t: DataType) -> TypeIs[_Integer]: ... @@ -157,6 +159,7 @@ def is_boolean_value(obj: Any) -> bool: ... def is_integer_value(obj: Any) -> bool: ... def is_float_value(obj: Any) -> bool: ... + __all__ = [ "is_binary", "is_binary_view", diff --git a/python/pyarrow-stubs/util.pyi b/python/pyarrow-stubs/util.pyi index 5c9687bb83f..db74524d77d 100644 --- a/python/pyarrow-stubs/util.pyi +++ b/python/pyarrow-stubs/util.pyi @@ -22,9 +22,11 @@ from typing import Any, Protocol, Sequence, TypeVar _F = TypeVar("_F", bound=Callable) _N = TypeVar("_N") + class _DocStringComponents(Protocol): _docstring_components: list[str] + def doc( *docstrings: str | _DocStringComponents | Callable | None, **params: Any ) -> Callable[[_F], _F]: ... @@ -32,6 +34,8 @@ def _is_iterable(obj) -> bool: ... def _is_path_like(path) -> bool: ... def _stringify_path(path: str | PathLike) -> str: ... def product(seq: Sequence[_N]) -> _N: ... + + def get_contiguous_span( shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int ) -> tuple[int, int]: ... 
From b1f43b2f72747a9881060439cbf503723537662e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 17 Sep 2025 21:48:08 +0200 Subject: [PATCH 20/26] ReplaceEllipsis to replace ellipsis with docstrings --- dev/update_stub_docstrings.py | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py index 17f7e8e1aa1..2ad7a877f34 100644 --- a/dev/update_stub_docstrings.py +++ b/dev/update_stub_docstrings.py @@ -117,6 +117,46 @@ def leave_SimpleString(self, original_node, updated_node): return updated_node +class ReplaceEllipsis(libcst.CSTTransformer): + def __init__(self, package, namespace): + self.stack = [namespace] if namespace else [] + self.indentation = 0 + self.package = package + + def _get_docstring(self, name, indentation): + # print(name) + try: + obj = self.package.get_member(name) + if obj.has_docstring: + indentation_prefix = indentation * " " + docstring = indent(obj.docstring.value, indentation_prefix) + docstring = f'"""\n{docstring}\n{indentation_prefix}"""' + # print(f"{name} has {len(docstring)} long docstring.") + return docstring + except KeyError: + print(f"{name} has no docstring.") + return "" + + def visit_FunctionDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_FunctionDef(self, original_node, updated_node): + node_name = ".".join(self.stack) + indentation = self.indentation + self.stack.pop() + self.indentation -= 1 + + if isinstance(updated_node.body.body[0].value, libcst.Ellipsis): + print(node_name) + docstring = self._get_docstring(node_name, indentation) + if docstring and len(docstring) > 0: + new_docstring = libcst.SimpleString(value=docstring) + new_body = updated_node.body.with_changes(body=[libcst.Expr(value=new_docstring)]) + return updated_node.with_changes(body=new_body) + return updated_node + + @click.command() @click.option('--pyarrow_folder', '-f', type=click.Path(resolve_path=True)) def update_stub_files(pyarrow_folder): From 9db449971741b81c0734d5900fd7f8640fe56edc Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 19 Sep 2025 12:52:57 +0200 Subject: [PATCH 21/26] minor fixes --- python/pyarrow-stubs/_parquet.pyi | 2 +- python/pyarrow-stubs/lib.pyi | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyarrow-stubs/_parquet.pyi b/python/pyarrow-stubs/_parquet.pyi index ce499fd1c16..35ee2b41fde 100644 --- a/python/pyarrow-stubs/_parquet.pyi +++ b/python/pyarrow-stubs/_parquet.pyi @@ -127,7 +127,7 @@ class Statistics(_Weakrefable): @property def has_min_max(self) -> bool: ... @property - def hash_null_count(self) -> bool: ... + def has_null_count(self) -> bool: ... @property def has_distinct_count(self) -> bool: ... 
@property diff --git a/python/pyarrow-stubs/lib.pyi b/python/pyarrow-stubs/lib.pyi index eea11a2e8f1..43c40b61cf8 100644 --- a/python/pyarrow-stubs/lib.pyi +++ b/python/pyarrow-stubs/lib.pyi @@ -19,7 +19,6 @@ from typing import NamedTuple from .array import * -# from .benchmark import * from .builder import * from .compat import * from .config import * From 33fbbb90013ff449c09476d519a7511599a95a0c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 19 Sep 2025 19:27:30 +0200 Subject: [PATCH 22/26] add ellipsis to _ipc.pyi --- python/pyarrow-stubs/_ipc.pyi | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/python/pyarrow-stubs/_ipc.pyi b/python/pyarrow-stubs/_ipc.pyi index 23d770070e7..6e83541bf5c 100644 --- a/python/pyarrow-stubs/_ipc.pyi +++ b/python/pyarrow-stubs/_ipc.pyi @@ -36,6 +36,7 @@ from ._types import DictionaryMemo, KeyValueMetadata class MetadataVersion(enum.IntEnum): + ... V1 = enum.auto() V2 = enum.auto() V3 = enum.auto() @@ -44,7 +45,7 @@ class MetadataVersion(enum.IntEnum): class WriteStats(NamedTuple): - + ... num_messages: int num_record_batches: int num_dictionary_batches: int @@ -53,7 +54,7 @@ class WriteStats(NamedTuple): class ReadStats(NamedTuple): - + ... num_messages: int num_record_batches: int num_dictionary_batches: int @@ -62,7 +63,7 @@ class ReadStats(NamedTuple): class IpcReadOptions(_Weakrefable): - + ... ensure_native_endian: bool use_threads: bool included_fields: list[int] @@ -77,7 +78,7 @@ class IpcReadOptions(_Weakrefable): class IpcWriteOptions(_Weakrefable): - + ... metadata_version: MetadataVersion allow_64bit: bool use_legacy_format: bool @@ -100,7 +101,7 @@ class IpcWriteOptions(_Weakrefable): class Message(_Weakrefable): - + ... @property def type(self) -> str: ... @property @@ -120,7 +121,7 @@ class Message(_Weakrefable): class MessageReader(_Weakrefable): - + ... @classmethod def open_stream(cls, source: bytes | NativeFile | IOBase | SupportPyBuffer) -> Self: ... @@ -135,7 +136,7 @@ class MessageReader(_Weakrefable): class _CRecordBatchWriter(_Weakrefable): - + ... def write(self, table_or_batch: Table | RecordBatch): ... def write_batch( @@ -155,6 +156,7 @@ class _CRecordBatchWriter(_Weakrefable): class _RecordBatchStreamWriter(_CRecordBatchWriter): + ... @property def _use_legacy_format(self) -> bool: ... @property @@ -164,11 +166,12 @@ class _RecordBatchStreamWriter(_CRecordBatchWriter): class _ReadPandasMixin: + ... def read_pandas(self, **options) -> pd.DataFrame: ... class RecordBatchReader(_Weakrefable): - + ... def __iter__(self) -> Self: ... def read_next_batch(self) -> RecordBatch: ... @@ -211,6 +214,7 @@ class RecordBatchReader(_Weakrefable): class _RecordBatchStreamReader(RecordBatchReader): + ... @property def stats(self) -> ReadStats: ... @@ -220,12 +224,13 @@ class _RecordBatchFileWriter(_RecordBatchStreamWriter): class RecordBatchWithMetadata(NamedTuple): - + ... batch: RecordBatch custom_metadata: KeyValueMetadata class _RecordBatchFileReader(_Weakrefable): + ... @property def num_record_batches(self) -> int: ... 
From 86e7ba4a76db025ebae5ec9212578867365b7fb0 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 19 Sep 2025 20:10:29 +0200 Subject: [PATCH 23/26] docstring update script --- dev/update_stub_docstrings.py | 188 +++++++++++++--------------------- 1 file changed, 74 insertions(+), 114 deletions(-) diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py index 2ad7a877f34..dceda807545 100644 --- a/dev/update_stub_docstrings.py +++ b/dev/update_stub_docstrings.py @@ -30,157 +30,117 @@ import click # TODO: perhaps replace griffe with importlib import griffe +from griffe import AliasResolutionError import libcst -class DocUpdater(libcst.CSTTransformer): - def __init__(self, package, namespace): - self.stack = [namespace] if namespace else [] - self._docstring = None - self.indentation = 0 - self.package = package +def _get_docstring(name, package, indentation): + # print("extract_docstrings", name) + try: + obj = package.get_member(name) + except (KeyError, ValueError, AliasResolutionError): + # Some cython __init__ symbols can't be found + # e.g. pyarrow.lib.OSFile.__init__ + stack = name.split(".") + parent_name = ".".join(stack[:-1]) - def _get_docstring(self, name): - # print("extract_docstrings", name) try: - obj = self.package.get_member(name) - except KeyError: - # Some cython __init__ symbols can't be found - # e.g. pyarrow.lib.OSFile.__init__ - parent_name = ".".join(self.stack[:-1]) - - try: - obj = self.package.get_member(parent_name).all_members[self.stack[-1]] - except KeyError: - # print(f"{name} not found in {self.package.name}, it's probably ok.") - return None - - if obj.has_docstring: - docstring = obj.docstring.value - # remove signature if present in docstring - if docstring.startswith(obj.name) or ( - (hasattr(obj.parent, "name") and - docstring.startswith(f"{obj.parent.name}.{obj.name}"))): - return "\n".join(docstring.splitlines()[2:]) - else: - return docstring - return None + obj = package.get_member(parent_name).all_members[stack[-1]] + except (KeyError, ValueError, AliasResolutionError): + print(f"{name} not found in {package.name}, it's probably ok.") + return None + + if obj.has_docstring: + docstring = obj.docstring.value + # remove signature if present in docstring + if docstring.startswith(obj.name) or ( + (hasattr(obj.parent, "name") and + docstring.startswith(f"{obj.parent.name}.{obj.name}"))): + docstring = "\n".join(docstring.splitlines()[2:]) + indentation_prefix = indentation * " " + docstring = indent(docstring + '\n"""', indentation_prefix) + docstring = '"""\n' + docstring + + return docstring + return None + +def _has_ellipsis(node): + if hasattr(node.body.body[0], "value") and isinstance(node.body.body[0].value, libcst.Ellipsis): + return True + return False - def visit_ClassDef(self, node): - # TODO: class docstrings? 
- self.stack.append(node.name.value) - self.indentation += 1 - node_name = ".".join(self.stack) - docstring = self._get_docstring(node_name) - - if docstring: - if not node.get_docstring(clean=False): - print("Missing docstring (in annotations) for:", node_name) - return False - self._docstring = f'"""{node.get_docstring(clean=False)}"""' - return True - return False - - def visit_FunctionDef(self, node): - self.stack.append(node.name.value) - self.indentation += 1 - node_name = ".".join(self.stack) - docstring = self._get_docstring(node_name) - if docstring: - if not node.get_docstring(clean=False): - print("Missing docstring (in annotations) for:", node_name) - return False - self._docstring = f'"""{node.get_docstring(clean=False)}"""' - return True - return False +class ReplaceEllipsis(libcst.CSTTransformer): + def __init__(self, package, namespace): + self.package = package + self.base_namespace = namespace + self.stack = [] + self.indentation = 0 - def leave_ClassDef(self, original_node, updated_node): - self.stack.pop() - self.indentation -= 1 - return updated_node + def _replace_ellipsis(self, original_node, updated_node): + name = ".".join(self.stack) + if self.base_namespace: + name = self.base_namespace + "." + name - def leave_FunctionDef(self, original_node, updated_node): + if _has_ellipsis(updated_node): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None and len(docstring) > 0: + new_docstring = libcst.SimpleString(value=docstring) + new_body = [ + libcst.SimpleWhitespace(self.indentation * " "), + libcst.Expr(value=new_docstring), + libcst.Newline() + ] + new_body = libcst.IndentedBlock(body=new_body) + updated_node = updated_node.with_changes(body=new_body) self.stack.pop() self.indentation -= 1 return updated_node - def leave_SimpleString(self, original_node, updated_node): - node_name = ".".join(self.stack) - - if original_node.value == self._docstring: - indentation = self.indentation * " " - indented_docstring = indent(self._get_docstring(node_name), indentation) - docstring = f'"""\n{indented_docstring}\n{indentation}"""' - return updated_node.with_changes(value=docstring) - - return updated_node - - -class ReplaceEllipsis(libcst.CSTTransformer): - def __init__(self, package, namespace): - self.stack = [namespace] if namespace else [] - self.indentation = 0 - self.package = package - - def _get_docstring(self, name, indentation): - # print(name) - try: - obj = self.package.get_member(name) - if obj.has_docstring: - indentation_prefix = indentation * " " - docstring = indent(obj.docstring.value, indentation_prefix) - docstring = f'"""\n{docstring}\n{indentation_prefix}"""' - # print(f"{name} has {len(docstring)} long docstring.") - return docstring - except KeyError: - print(f"{name} has no docstring.") - return "" + def visit_ClassDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + def leave_ClassDef(self, original_node, updated_node): + return self._replace_ellipsis(original_node, updated_node) def visit_FunctionDef(self, node): self.stack.append(node.name.value) self.indentation += 1 - def leave_FunctionDef(self, original_node, updated_node): - node_name = ".".join(self.stack) - indentation = self.indentation - self.stack.pop() - self.indentation -= 1 - - if isinstance(updated_node.body.body[0].value, libcst.Ellipsis): - print(node_name) - docstring = self._get_docstring(node_name, indentation) - if docstring and len(docstring) > 0: - new_docstring = libcst.SimpleString(value=docstring) - new_body 
= updated_node.body.with_changes(body=[libcst.Expr(value=new_docstring)]) - return updated_node.with_changes(body=new_body) - return updated_node + return self._replace_ellipsis(original_node, updated_node) @click.command() @click.option('--pyarrow_folder', '-f', type=click.Path(resolve_path=True)) -def update_stub_files(pyarrow_folder): +def add_docs_to_stub_files(pyarrow_folder): print("Updating docstrings of stub files in:", pyarrow_folder) package = griffe.load("pyarrow", try_relative_path=True, force_inspection=True, resolve_aliases=True) + lib_modules = ["array", "builder", "compat", "config", "device", "error", "io", + "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor", "_types"] for stub_file in Path(pyarrow_folder).rglob('*.pyi'): if stub_file.name == "_stubs_typing.pyi": continue - print(f"[{stub_file}]") with open(stub_file, 'r') as f: tree = libcst.parse_module(f.read()) - if stub_file.name != "__init__.pyi": - modified_tree = tree.visit(DocUpdater(package, "lib")) - else: - modified_tree = tree.visit(DocUpdater(package, None)) + module = stub_file.with_suffix('').name + if module in lib_modules: + module = "lib" + elif stub_file.parent.name in ["parquet", "interchange"]: + module = f"{stub_file.parent.name}.{module}" + elif module == "__init__": + module = "" + + modified_tree = tree.visit(ReplaceEllipsis(package, module)) with open(stub_file, "w") as f: f.write(modified_tree.code) + print("\n") if __name__ == "__main__": docstrings_map = {} - update_stub_files(obj={}) + add_docs_to_stub_files(obj={}) From 133af4d0c5bd82ef821f15fbc9583c94a5a83934 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 19 Sep 2025 23:50:38 +0200 Subject: [PATCH 24/26] move ipc around --- dev/update_stub_docstrings.py | 4 ++-- python/pyarrow-stubs/_cuda.pyi | 2 +- python/pyarrow/{_ipc.py => ipc.py} | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) rename python/pyarrow/{_ipc.py => ipc.py} (99%) diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py index dceda807545..0a9ce41e1f6 100644 --- a/dev/update_stub_docstrings.py +++ b/dev/update_stub_docstrings.py @@ -122,12 +122,12 @@ def add_docs_to_stub_files(pyarrow_folder): for stub_file in Path(pyarrow_folder).rglob('*.pyi'): if stub_file.name == "_stubs_typing.pyi": continue - print(f"[{stub_file}]") + module = stub_file.with_suffix('').name + print(f"[{stub_file} {module}]") with open(stub_file, 'r') as f: tree = libcst.parse_module(f.read()) - module = stub_file.with_suffix('').name if module in lib_modules: module = "lib" elif stub_file.parent.name in ["parquet", "interchange"]: diff --git a/python/pyarrow-stubs/_cuda.pyi b/python/pyarrow-stubs/_cuda.pyi index 3ec866ad668..929f448f396 100644 --- a/python/pyarrow-stubs/_cuda.pyi +++ b/python/pyarrow-stubs/_cuda.pyi @@ -19,7 +19,7 @@ from typing import Any import cuda # type: ignore[import-not-found] -from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-untyped] +from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-not-found] from . 
import lib from ._stubs_typing import ArrayLike diff --git a/python/pyarrow/_ipc.py b/python/pyarrow/ipc.py similarity index 99% rename from python/pyarrow/_ipc.py rename to python/pyarrow/ipc.py index 4e236678788..19d3d46f3ba 100644 --- a/python/pyarrow/_ipc.py +++ b/python/pyarrow/ipc.py @@ -278,3 +278,4 @@ def deserialize_pandas(buf, *, use_threads=True): with pa.RecordBatchStreamReader(buffer_reader) as reader: table = reader.read_all() return table.to_pandas(use_threads=use_threads) + From 5e488be497386db649880ff66684c7fe17fbb937 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 20 Sep 2025 16:36:15 +0200 Subject: [PATCH 25/26] improve script --- .github/workflows/python.yml | 2 +- dev/update_stub_docstrings.py | 108 +++++++++++++++++++++++++++------- python/pyarrow-stubs/_fs.pyi | 2 +- python/pyarrow/ipc.py | 1 - 4 files changed, 90 insertions(+), 23 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 700218024a5..59bcba0837c 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -140,7 +140,7 @@ jobs: - name: Type check with mypy and pyright run: |- - python -m pip install mypy pyright scipy-stubs pandas-stubs types-python-dateutil types-psutil types-requests griffe libcst + python -m pip install mypy pyright scipy-stubs pandas-stubs types-python-dateutil types-psutil types-requests griffe libcst types-cffi pushd python; # pip install -e . mypy pyarrow-stubs pyarrow/tests/test_array.py pyarrow/tests/test_io.py diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py index 0a9ce41e1f6..7eb1ee2925d 100644 --- a/dev/update_stub_docstrings.py +++ b/dev/update_stub_docstrings.py @@ -21,7 +21,7 @@ # Usage # ===== # -# python ../dev/update_stub_docstrings.py -f ./pyarrow/ +# python ./dev/update_stub_docstrings.py -f ./python/pyarrow-stubs from pathlib import Path @@ -32,6 +32,7 @@ import griffe from griffe import AliasResolutionError import libcst +from libcst import matchers as m def _get_docstring(name, package, indentation): @@ -52,23 +53,21 @@ def _get_docstring(name, package, indentation): if obj.has_docstring: docstring = obj.docstring.value - # remove signature if present in docstring + # Remove signature if present in docstring if docstring.startswith(obj.name) or ( (hasattr(obj.parent, "name") and docstring.startswith(f"{obj.parent.name}.{obj.name}"))): docstring = "\n".join(docstring.splitlines()[2:]) + # Skip empty docstrings + if docstring.strip() == "": + return None + # Indent docstring indentation_prefix = indentation * " " docstring = indent(docstring + '\n"""', indentation_prefix) docstring = '"""\n' + docstring - return docstring return None -def _has_ellipsis(node): - if hasattr(node.body.body[0], "value") and isinstance(node.body.body[0].value, libcst.Ellipsis): - return True - return False - class ReplaceEllipsis(libcst.CSTTransformer): def __init__(self, package, namespace): @@ -77,37 +76,105 @@ def __init__(self, package, namespace): self.stack = [] self.indentation = 0 - def _replace_ellipsis(self, original_node, updated_node): + # Insert module level docstring if _clone_signature is used + def leave_Module(self, original_node, updated_node): + new_body = [] + clone_matcher = m.SimpleStatementLine( + body=[m.Assign( + value=m.Call(func=m.Name(value="_clone_signature")) + ), m.ZeroOrMore()] + ) + for statement in updated_node.body: + new_body.append(statement) + if m.matches(statement, clone_matcher): + name = statement.body[0].targets[0].target.value + if self.base_namespace: + 
name = f"{self.base_namespace}.{name}" + docstring = _get_docstring(name, self.package, 0) + if docstring is not None: + new_expr = libcst.Expr(value=libcst.SimpleString(docstring)) + new_line = libcst.SimpleStatementLine(body=[new_expr]) + new_body.append(new_line) + + return updated_node.with_changes(body=new_body) + + def visit_ClassDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_ClassDef(self, original_node, updated_node): name = ".".join(self.stack) if self.base_namespace: name = self.base_namespace + "." + name - if _has_ellipsis(updated_node): + class_matcher_1 = m.ClassDef( + name=m.Name(), + body=m.IndentedBlock( + body=[m.SimpleStatementLine( + body=[m.Expr(m.Ellipsis()), m.ZeroOrMore()] + ), m.ZeroOrMore()] + ) + ) + class_matcher_2 = m.ClassDef( + name=m.Name(), + body=m.IndentedBlock( + body=[m.FunctionDef(), m.ZeroOrMore()] + ) + ) + + if m.matches(updated_node, class_matcher_1): docstring = _get_docstring(name, self.package, self.indentation) - if docstring is not None and len(docstring) > 0: + if docstring is not None: + new_node = libcst.SimpleString(value=docstring) + updated_node = updated_node.deep_replace( + updated_node.body.body[0].body[0].value, new_node) + + if m.matches(updated_node, class_matcher_2): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None: new_docstring = libcst.SimpleString(value=docstring) new_body = [ libcst.SimpleWhitespace(self.indentation * " "), libcst.Expr(value=new_docstring), libcst.Newline() - ] + ] + list(updated_node.body.body) new_body = libcst.IndentedBlock(body=new_body) updated_node = updated_node.with_changes(body=new_body) + self.stack.pop() self.indentation -= 1 return updated_node - def visit_ClassDef(self, node): - self.stack.append(node.name.value) - self.indentation += 1 - def leave_ClassDef(self, original_node, updated_node): - return self._replace_ellipsis(original_node, updated_node) - def visit_FunctionDef(self, node): self.stack.append(node.name.value) self.indentation += 1 + def leave_FunctionDef(self, original_node, updated_node): - return self._replace_ellipsis(original_node, updated_node) + name = ".".join(self.stack) + if self.base_namespace: + name = self.base_namespace + "." 
+ name + + function_matcher = m.FunctionDef( + name=m.Name(), + body=m.SimpleStatementSuite( + body=[m.Expr( + m.Ellipsis() + )])) + if m.matches(original_node, function_matcher): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None: + new_docstring = libcst.SimpleString(value=docstring) + new_body = [ + libcst.SimpleWhitespace(self.indentation * " "), + libcst.Expr(value=new_docstring), + libcst.Newline() + ] + new_body = libcst.IndentedBlock(body=new_body) + updated_node = updated_node.with_changes(body=new_body) + + self.stack.pop() + self.indentation -= 1 + return updated_node @click.command() @@ -117,7 +184,8 @@ def add_docs_to_stub_files(pyarrow_folder): package = griffe.load("pyarrow", try_relative_path=True, force_inspection=True, resolve_aliases=True) lib_modules = ["array", "builder", "compat", "config", "device", "error", "io", - "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor", "_types"] + "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor", + "_types"] for stub_file in Path(pyarrow_folder).rglob('*.pyi'): if stub_file.name == "_stubs_typing.pyi": diff --git a/python/pyarrow-stubs/_fs.pyi b/python/pyarrow-stubs/_fs.pyi index 42ea8543738..59f803b801e 100644 --- a/python/pyarrow-stubs/_fs.pyi +++ b/python/pyarrow-stubs/_fs.pyi @@ -32,7 +32,7 @@ else: from typing import Union, overload -from fsspec import AbstractFileSystem # type: ignore[import-untyped] +from fsspec import AbstractFileSystem # type: ignore[import-not-found] from .lib import NativeFile, _Weakrefable diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py index 19d3d46f3ba..4e236678788 100644 --- a/python/pyarrow/ipc.py +++ b/python/pyarrow/ipc.py @@ -278,4 +278,3 @@ def deserialize_pandas(buf, *, use_threads=True): with pa.RecordBatchStreamReader(buffer_reader) as reader: table = reader.read_all() return table.to_pandas(use_threads=use_threads) - From b4e326a1c94c47d36a9a07136140895f967c5d24 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 20 Sep 2025 17:38:26 +0200 Subject: [PATCH 26/26] change to CI --- .github/workflows/python.yml | 14 +++++++------- python/pyarrow-stubs/_fs.pyi | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 59bcba0837c..f28e1a65739 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -140,13 +140,13 @@ jobs: - name: Type check with mypy and pyright run: |- - python -m pip install mypy pyright scipy-stubs pandas-stubs types-python-dateutil types-psutil types-requests griffe libcst types-cffi - pushd python; - # pip install -e . - mypy pyarrow-stubs pyarrow/tests/test_array.py pyarrow/tests/test_io.py - pyright pyarrow-stubs - # python ../dev/update_stub_docstrings.py -f ./pyarrow - # git status --porcelain=1 + python -m pip install mypy pyright griffe libcst scipy-stubs pandas-stubs types-python-dateutil types-psutil types-requests griffe libcst types-cffi + pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pyarrow + cd python + mypy ./pyarrow-stubs ./pyarrow/tests/test_array.py ./pyarrow/tests/test_io.py + pyright ./pyarrow-stubs + cd .. 
+ python ./dev/update_stub_docstrings.py -f ./python/pyarrow-stubs macos: name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} Python 3 diff --git a/python/pyarrow-stubs/_fs.pyi b/python/pyarrow-stubs/_fs.pyi index 59f803b801e..9ec5c543c58 100644 --- a/python/pyarrow-stubs/_fs.pyi +++ b/python/pyarrow-stubs/_fs.pyi @@ -32,7 +32,7 @@ else: from typing import Union, overload -from fsspec import AbstractFileSystem # type: ignore[import-not-found] +from fsspec import AbstractFileSystem # type: ignore from .lib import NativeFile, _Weakrefable