tlc.core.schema_helper#

Helper methods for working with Schemas

Module Contents#

Classes#

Class

Description

SchemaHelper

A class with helper methods for working with Schema objects

API#

class tlc.core.schema_helper.SchemaHelper#

A class with helper methods for working with Schema objects

ARROW_TYPE_TO_SCALAR_VALUE_MAPPING = None#

A mapping from PyArrow types to ScalarValue types.

SCALAR_VALUE_TYPE_TO_ARROW_TYPE_MAPPING = None#

A mapping from ScalarValue types to PyArrow types.

static object_input_urls(obj: Any, schema: tlc.core.schema.Schema) list[tlc.core.url.Url]#

Returns a list of all URLs referenced by this object, from scalar strings or lists of strings.

Note: the returned URLs are likely to be relative to the object’s URL.

static from_pyarrow_datatype(data_type: pyarrow.DataType) tlc.core.schema.ScalarValue | None#

Converts a DataType to a ScalarValue.

Parameters:

data_type – The pyarrow DataType object to convert.

Returns:

The type of the scalar value that corresponds to the pyarrow DataType.

static scalar_value_to_pyarrow_datatype(value: tlc.core.schema.ScalarValue) pyarrow.DataType#

Converts a ScalarValue to a pyarrow DataType.

Parameters:

value – The scalar value to convert.

Returns:

The corresponding pyarrow datatype.

static to_pyarrow_datatype(schema_or_value: tlc.core.schema.Schema | tlc.core.schema.ScalarValue) pyarrow.DataType#

Converts a Schema or ScalarValue to a pyarrow DataType.

Currently supports scalar types, lists of scalar types, structs, and lists of structs.

Parameters:

schema_or_value – The schema or scalar value to convert.

Returns:

The corresponding pyarrow datatype.

static tlc_schema_to_pyarrow_schema(tlc_schema: tlc.core.schema.Schema) pyarrow.Schema#

Convert a 3LC schema to a PyArrow schema.

Parameters:

tlc_schema – The 3LC schema to convert.

Returns:

The PyArrow schema.

static find_pyarrow_types(arrow_schema: pyarrow.Schema, scalar_types: list[pyarrow.DataType]) list[dict[str, object]]#

Find all the paths in an Arrow schema that correspond to scalar types.

static pyarrow_list_to_tlc_schema(arrow_schema: pyarrow.Schema, **schema_kwargs: Any) tlc.core.schema.Schema#
static pyarrow_schema_to_tlc_schema(arrow_schema: pyarrow.Schema, **schema_kwargs: Any) tlc.core.schema.Schema#

Convert a PyArrow schema to a 3LC schema.

Parameters:
  • arrow_schema – The PyArrow schema to convert.

  • schema_kwargs – Additional keyword arguments to pass to the Schema constructor.

Returns:

The 3LC schema.

static cast_scalar(value: Any, value_type: tlc.core.schema.ScalarValue) Any#

Cast a value which is a ScalarValue into its corresponding python type.

static cast_value(value: typing.Any, value_schema: tlc.core.schema.Schema, on_error: typing.Literal[raise, discard] = 'raise') Any#

Cast any value into its corresponding python type based on the Schema.

static default_scalar(value_type: tlc.core.schema.ScalarValue) Any#

Returns the default value for a ScalarValue.

static default_value(schema: tlc.core.schema.Schema) Any#

Returns the default value for a schema.

A schema holds either:

  • a ScalarValue (schema.value) which corresponds to a scalar type (potentially an array of scalars)

  • a dict of sub-Schemas (schema.values) corresponding to compound types (potentially an array)

static is_computable(schema: tlc.core.schema.Schema) bool#

Returns True if the schema is computable.

static add_schema_to_existing_schema_at_location(added_schema: tlc.core.schema.Schema, existing_schema: tlc.core.schema.Schema, location: list[str]) None#

Adds the added schema to the existing schema at the given location.

static is_pseudo_scalar(schema: tlc.core.schema.Schema) bool#

Returns True if the schema is a pseudo-scalar.

When a schema has a size0 with min=1 and max=1, it is considered a pseudo-scalar. This is a trick we use when unrolling/rolling up tables. We want to treat table cells with 1-element lists as scalars.

static get_nested_schema(schema: tlc.core.schema.Schema, path: str) tlc.core.schema.Schema | None#

Retrieves a nested schema from a schema.

Parameters:
  • schema – The schema to retrieve the nested schema from.

  • path – The (dot-separated) path to the nested schema.

Returns:

The nested schema, or None if the path doesn’t exist.

static create_sparse_schema_from_scalar_value(path: str, scalar_value: tlc.core.schema.ScalarValue) tlc.core.schema.Schema#

Creates a sparse schema from a path and a scalar value.

Parameters:
  • path – The (dot-separated) path to the nested schema.

  • scalar_value – The scalar value to create the sparse schema from.

Returns:

The sparse schema.

static create_sparse_schema_from_schema(path: str, schema: tlc.core.schema.Schema) tlc.core.schema.Schema#

Creates a sparse schema from a path and a schema.

Parameters:
  • path – The (dot-separated) path to the nested schema.

  • schema – The schema to create the sparse schema from.

Returns:

The sparse schema.

static top_level_url_values(schema: tlc.core.schema.Schema) list[str]#

Return a list of the keys of the top-level sub-schemas that represent atomic URL values.

This function does not return the keys of nested URL values.

Parameters:

schema – The schema to retrieve the URL values from.

Returns:

A list of sub-value keys corresponding to URL values.

static nested_url_columns(schema: tlc.core.schema.Schema, column_path_to_here: list[str] | None = None) list[list[str]]#

Get columns from the schema that have string roles URL/X. Each column is represented as a list of strings, with subsequent strings denoting nested columns.

Parameters:
  • schema – The schema to retrieve the URL columns from.

  • column_path_to_here – The path to the current schema.

static is_embedding_value(schema: tlc.core.schema.Schema) bool#

Returns True if the schema is an atomic schema describing an unreduced embedding value.

static is_numeric_value(schema: tlc.core.schema.Schema) bool#

Returns True if the schema is an atomic schema describing a numeric value.

static to_simple_value_map(value_map: dict[float, tlc.core.schema.MapElement]) dict[int, str]#

Converts a value map with float keys and MapElement values to a map with int keys and str values.