Source code for nested_dask.datasets.generation
from nested_pandas import datasets
import nested_dask as nd
[docs]
def generate_data(n_base, n_layer, npartitions=1, seed=None) -> nd.NestedFrame:
"""Generates a toy dataset.
Docstring copied from nested-pandas.
Parameters
----------
n_base : int
The number of rows to generate for the base layer
n_layer : int, or dict
The number of rows per n_base row to generate for a nested layer.
Alternatively, a dictionary of layer label, layer_size pairs may be
specified to created multiple nested columns with custom sizing.
npartitions: int
The number of partitions to split the data into.
seed : int
A seed to use for random generation of data
Returns
-------
NestedFrame
The constructed Dask NestedFrame.
Examples
--------
>>> import nested_dask as nd
>>> nd.datasets.generate_data(10,100)
>>> nd.datasets.generate_data(10, {"nested_a": 100, "nested_b": 200})
"""
# Use nested-pandas generator
base_nf = datasets.generate_data(n_base, n_layer, seed=seed)
# Convert to nested-dask
base_nf = nd.NestedFrame.from_pandas(base_nf).repartition(npartitions=npartitions)
return base_nf
[docs]
def generate_parquet_file(n_base, n_layer, path, file_per_layer=True, npartitions=1, seed=None):
"""Generates a toy dataset and outputs it to one or more parquet files.
Parameters
----------
n_base : int
The number of rows to generate for the base layer
n_layer : int, or dict
The number of rows per n_base row to generate for a nested layer.
Alternatively, a dictionary of layer label, layer_size pairs may be
specified to created multiple nested columns with custom sizing.
path : str,
The path to the parquet file to write to if `file_per_layer` is `False`,
and otherwise the path to the directory to write the parquet file for
each layer.
file_per_layer : bool, default=True
TODO: Currently only True is supported.
If True, write each layer to its own parquet file. Otherwise, write
the generated to a single parquet file representing a nested dataset.
npartitions : int, default=1
The number of Dask partitions to split the generated data into for each layer.
seed : int, default=None
A seed to use for random generation of data
Returns
-------
None
"""
nf = generate_data(n_base, n_layer, npartitions, seed)
nf.to_parquet(path, by_layer=file_per_layer, write_index=False)