Skip to content

Modalities — Ingesters & Preprocessors

API reference for the data modality components: HuggingFaceIngester, KaggleIngester, OpenMLIngester, TimeSeriesIngester, WindowTransformer, ImageIngester, TextIngester, GraphIngester, GraphStateEncoder, and the NumpyIngester/CSVIngester extensions for sparse data and multi-label support.


HuggingFaceIngester

quprep.ingest.huggingface_ingester.HuggingFaceIngester(split='train', modality='auto', target_columns=None, numeric_only=True, image_column=None, image_size=(28, 28), grayscale=True, normalize=True, text_column=None, text_method='tfidf', max_features=512, text_model='all-MiniLM-L6-v2', edge_index_column='edge_index', node_feature_column='x', n_graph_features=None, token=None)

Load a HuggingFace dataset into a Dataset.

Automatically detects the dataset modality (tabular, image, text) from the HuggingFace feature schema. Pass modality explicitly to override detection or to load graph datasets.

Requires pip install quprep[huggingface].

Parameters:

Name Type Description Default
split str

Dataset split to load (e.g. "train", "test"). Default "train".

'train'
modality str

One of "auto" (default), "tabular", "image", "text", "graph". When "auto" the modality is inferred from the dataset's feature schema.

'auto'
target_columns str or list of str

Column(s) to treat as labels. Works for all modalities.

None
numeric_only bool

Tabular only. Drop non-numeric columns (default True).

True
image_column str

Image column name. Auto-detected when modality="auto"/"image".

None
image_size tuple of (int, int)

(height, width) to resize images before flattening. Default (28, 28).

(28, 28)
grayscale bool

Convert images to grayscale (default True).

True
normalize bool

Divide pixel values by 255 (default True).

True
text_column str

Text column name. Auto-detected when modality="auto"/"text".

None
text_method str

"tfidf" (default, no extra deps) or "sentence_transformers".

'tfidf'
max_features int or None

Max TF-IDF vocabulary size (default 512).

512
text_model str

sentence-transformers model name. Default "all-MiniLM-L6-v2".

'all-MiniLM-L6-v2'
edge_index_column str

Graph only. Column containing edge indices in COO format [2, E]. Default "edge_index".

'edge_index'
node_feature_column str

Graph only. Column containing node feature matrix. Default "x".

'x'
n_graph_features int or None

Pad/truncate graph feature vectors to this length. Default: auto.

None
token str or bool

HuggingFace auth token for gated datasets.

None

Examples:

Tabular (auto-detected)::

ds = HuggingFaceIngester(split="train", target_columns="label").load(
    "imodels/credit-card"
)

Image (auto-detected from HF feature schema)::

ds = HuggingFaceIngester(
    split="train", target_columns="label", image_size=(28, 28)
).load("ylecun/mnist")

Text (auto-detected)::

ds = HuggingFaceIngester(
    split="train", target_columns="label", text_method="tfidf"
).load("imdb")

Graph (explicit)::

ds = HuggingFaceIngester(
    modality="graph", split="train", target_columns="y"
).load("graphs-datasets/ogbg-molhiv")
Source code in quprep/ingest/huggingface_ingester.py
def __init__(
    self,
    split: str = "train",
    modality: str = "auto",
    target_columns: str | list[str] | None = None,
    # tabular
    numeric_only: bool = True,
    # image
    image_column: str | None = None,
    image_size: tuple[int, int] = (28, 28),
    grayscale: bool = True,
    normalize: bool = True,
    # text
    text_column: str | None = None,
    text_method: str = "tfidf",
    max_features: int | None = 512,
    text_model: str = "all-MiniLM-L6-v2",
    # graph
    edge_index_column: str = "edge_index",
    node_feature_column: str = "x",
    n_graph_features: int | None = None,
    # auth
    token: str | bool | None = None,
):
    """Record the loader options, rejecting an unknown ``modality`` up front."""
    # "auto" is accepted in addition to every concrete supported modality.
    allowed = ("auto", *_SUPPORTED_MODALITIES)
    if modality not in allowed:
        raise ValueError(f"modality must be one of {allowed}, got '{modality}'")

    # Options shared by every modality.
    self.split, self.modality = split, modality
    self.target_columns = target_columns

    # Tabular options.
    self.numeric_only = numeric_only

    # Image options.
    self.image_column, self.image_size = image_column, image_size
    self.grayscale, self.normalize = grayscale, normalize

    # Text options.
    self.text_column, self.text_method = text_column, text_method
    self.max_features, self.text_model = max_features, text_model

    # Graph options.
    self.edge_index_column = edge_index_column
    self.node_feature_column = node_feature_column
    self.n_graph_features = n_graph_features

    # Authentication.
    self.token = token

Functions

load(dataset_name, config_name=None)

Load a HuggingFace dataset by name.

Parameters:

Name Type Description Default
dataset_name str

HuggingFace dataset identifier, e.g. "imdb", "ylecun/mnist".

required
config_name str

Dataset configuration/subset name. E.g. "en" for multilingual datasets (maps to the name argument in load_dataset).

None

Returns:

Type Description
Dataset

Raises:

Type Description
ImportError

If datasets (or Pillow / sentence-transformers / networkx) is not installed.

NotImplementedError

If the dataset contains a modality QuPrep does not yet support (e.g. audio, video).

ValueError

If no usable columns remain after filtering.

Source code in quprep/ingest/huggingface_ingester.py
def load(self, dataset_name: str, config_name: str | None = None) -> Dataset:
    """
    Fetch a HuggingFace dataset and convert it into a ``Dataset``.

    Parameters
    ----------
    dataset_name : str
        Identifier on the HuggingFace hub, e.g. ``"imdb"`` or
        ``"ylecun/mnist"``.
    config_name : str, optional
        Configuration/subset of the dataset (for instance ``"en"`` for a
        multilingual dataset).  Forwarded to ``load_dataset`` as ``name``.

    Returns
    -------
    Dataset

    Raises
    ------
    ImportError
        If ``datasets`` (or Pillow / sentence-transformers / networkx) is
        not installed.
    NotImplementedError
        If the dataset contains a modality QuPrep does not yet support
        (e.g. audio, video).
    ValueError
        If no usable columns remain after filtering.
    """
    try:
        from datasets import load_dataset
    except ImportError as e:
        raise ImportError(
            "HuggingFaceIngester requires the 'datasets' package. "
            "Install it with: pip install quprep[huggingface]"
        ) from e

    # Only forward the optional arguments the caller actually provided.
    optional = {"name": config_name, "token": self.token}
    load_kwargs: dict = {"split": self.split}
    load_kwargs.update(
        {key: value for key, value in optional.items() if value is not None}
    )

    hf_dataset = load_dataset(dataset_name, **load_kwargs)

    # An explicit modality wins; "auto" defers to schema-based detection,
    # which also reports the column it keyed on.
    requested = self.modality
    if requested == "auto":
        modality, detected_col = self._detect_modality(
            hf_dataset.features, dataset_name
        )
    else:
        modality, detected_col = requested, None

    meta_base = {
        "source": f"huggingface:{dataset_name}",
        "split": self.split,
        "config": config_name,
        "modality": modality,
    }

    if modality == "tabular":
        # pandas is imported lazily: only the tabular path needs it here.
        # NOTE(review): this helper receives config_name instead of
        # meta_base -- presumably it builds its own metadata; confirm.
        import pandas as pd

        result = self._load_tabular(hf_dataset, dataset_name, config_name, pd)
    elif modality == "image":
        result = self._load_image(hf_dataset, dataset_name, meta_base, detected_col)
    elif modality == "text":
        result = self._load_text(hf_dataset, dataset_name, meta_base, detected_col)
    elif modality == "graph":
        result = self._load_graph(hf_dataset, dataset_name, meta_base)
    else:
        # Should never reach here — validated in __init__
        raise ValueError(f"Unknown modality '{modality}'")  # pragma: no cover
    return result

KaggleIngester

quprep.ingest.kaggle_ingester.KaggleIngester(target_columns=None, numeric_only=True, file_name=None, force=False)

Load a Kaggle dataset or competition file into a Dataset.

Requires pip install quprep[kaggle] and a Kaggle API token stored at ~/.kaggle/kaggle.json (or set via the KAGGLE_USERNAME / KAGGLE_KEY environment variables).

The ingester downloads the dataset to a temporary directory, finds the first (or specified) CSV file, and ingests it exactly like CSVIngester (quprep.ingest.csv_ingester.CSVIngester).

Parameters:

Name Type Description Default
target_columns str or list of str

Column name(s) to treat as labels rather than features.

None
numeric_only bool

If True (default), drop non-numeric columns after label extraction. If False, non-numeric columns are stored in Dataset.categorical_data.

True
file_name str

Specific file to download from the dataset (e.g. "train.csv"). When None (default), all files are downloaded and the first CSV found is used.

None
force bool

Re-download even if the file already exists locally. Default False.

False

Examples:

Dataset (owner/name format)::

from quprep.ingest.kaggle_ingester import KaggleIngester

ds = KaggleIngester(target_columns="label").load("heptapod/titanic")

Competition data::

ds = KaggleIngester(file_name="train.csv").load_competition("titanic")

Specific file::

ds = KaggleIngester(file_name="test.csv").load("owner/dataset-name")
Source code in quprep/ingest/kaggle_ingester.py
def __init__(
    self,
    target_columns: str | list[str] | None = None,
    numeric_only: bool = True,
    file_name: str | None = None,
    force: bool = False,
):
    """Record the ingestion options; nothing is downloaded at construction."""
    # How the CSV is interpreted once it has been fetched.
    self.target_columns, self.numeric_only = target_columns, numeric_only
    # What to fetch and whether to force the download.
    self.file_name, self.force = file_name, force

Functions

load(dataset)

Download and load a Kaggle dataset.

Parameters:

Name Type Description Default
dataset str

Kaggle dataset identifier in "owner/dataset-name" format, e.g. "heptapod/titanic" or "zillow/zecon".

required

Returns:

Type Description
Dataset

Raises:

Type Description
ImportError

If kaggle is not installed.

FileNotFoundError

If no CSV file is found in the downloaded dataset.

ValueError

If no numeric columns remain after filtering.

Source code in quprep/ingest/kaggle_ingester.py
def load(self, dataset: str) -> Dataset:
    """
    Fetch a Kaggle dataset and ingest one of its CSV files.

    Parameters
    ----------
    dataset : str
        Dataset identifier written as ``"owner/dataset-name"``, for example
        ``"heptapod/titanic"`` or ``"zillow/zecon"``.

    Returns
    -------
    Dataset

    Raises
    ------
    ImportError
        If ``kaggle`` is not installed.
    FileNotFoundError
        If no CSV file is found in the downloaded dataset.
    ValueError
        If no numeric columns remain after filtering.
    """
    api = self._get_api()
    wanted = self.file_name

    # Everything is downloaded into a scratch directory that is removed when
    # the block exits, so the CSV must be ingested before leaving it.
    with tempfile.TemporaryDirectory() as workdir:
        shared = {"path": workdir, "force": self.force, "quiet": True}
        if wanted is None:
            # No specific file requested: fetch the whole dataset, unzipped.
            api.dataset_download_files(dataset, unzip=True, **shared)
        else:
            api.dataset_download_file(dataset, file_name=wanted, **shared)

        located = self._find_csv(workdir, wanted)
        return self._ingest_csv(
            located,
            source=f"kaggle:dataset:{dataset}",
            extra_meta={"dataset": dataset},
        )

load_competition(competition)

Download and load a Kaggle competition data file.

Parameters:

Name Type Description Default
competition str

Competition identifier, e.g. "titanic" or "house-prices-advanced-regression-techniques".

required

Returns:

Type Description
Dataset

Raises:

Type Description
ImportError

If kaggle is not installed.

FileNotFoundError

If no CSV file is found in the downloaded competition data.

ValueError

If no numeric columns remain after filtering.

Source code in quprep/ingest/kaggle_ingester.py
def load_competition(self, competition: str) -> Dataset:
    """
    Fetch Kaggle competition data and ingest one of its CSV files.

    Parameters
    ----------
    competition : str
        Competition slug, for example ``"titanic"`` or
        ``"house-prices-advanced-regression-techniques"``.

    Returns
    -------
    Dataset

    Raises
    ------
    ImportError
        If ``kaggle`` is not installed.
    FileNotFoundError
        If no CSV file is found in the downloaded competition data.
    ValueError
        If no numeric columns remain after filtering.
    """
    api = self._get_api()
    wanted = self.file_name

    # Everything is downloaded into a scratch directory that is removed when
    # the block exits, so the CSV must be ingested before leaving it.
    with tempfile.TemporaryDirectory() as workdir:
        shared = {"path": workdir, "force": self.force, "quiet": True}
        if wanted is None:
            api.competition_download_files(competition, **shared)
            # No unzip flag is passed to the bulk competition download, so
            # downloaded archives are expanded explicitly (presumably what
            # _unzip_all does -- confirm against the helper).
            self._unzip_all(workdir)
        else:
            api.competition_download_file(competition, file_name=wanted, **shared)

        located = self._find_csv(workdir, wanted)
        return self._ingest_csv(
            located,
            source=f"kaggle:competition:{competition}",
            extra_meta={"competition": competition},
        )

OpenMLIngester

quprep.ingest.openml_ingester.OpenMLIngester(target_column=None, numeric_only=True, version=None, cache_format='pickle')

Load an OpenML dataset into a Dataset.

Requires pip install quprep[openml].

OpenML datasets are identified by an integer dataset ID or by name. The ingester calls openml.datasets.get_dataset and then uses OpenMLDataset.get_data to retrieve a pandas DataFrame, which is processed the same way as in CSVIngester (quprep.ingest.csv_ingester.CSVIngester).

Parameters:

Name Type Description Default
target_column str or None

Name of the target / label column. When None (default) the dataset's default target is used if it exists; otherwise no labels are extracted.

None
numeric_only bool

If True (default), drop non-numeric columns after label extraction. If False, non-numeric columns are stored in Dataset.categorical_data.

True
version int or None

Specific dataset version to load. None uses the latest active version.

None
cache_format str

Cache format for the downloaded dataset files. "pickle" (default) or "feather".

'pickle'

Examples:

By dataset ID::

from quprep.ingest.openml_ingester import OpenMLIngester

ds = OpenMLIngester(target_column="class").load(61)   # iris

By dataset name::

ds = OpenMLIngester(target_column="class").load("iris")

No target (unsupervised)::

ds = OpenMLIngester().load(554)   # MNIST_784

Full pipeline::

import quprep as qd

result = qd.Pipeline(encoder=qd.AngleEncoder()).fit_transform(
    qd.OpenMLIngester(target_column="class").load("iris")
)
Source code in quprep/ingest/openml_ingester.py
def __init__(
    self,
    target_column: str | None = None,
    numeric_only: bool = True,
    version: int | None = None,
    cache_format: str = "pickle",
):
    """
    Record the ingestion options; nothing is downloaded at construction.

    Parameters
    ----------
    target_column : str or None
        Label column name; ``None`` defers to the dataset's default target.
    numeric_only : bool
        Whether non-numeric columns are dropped rather than kept as
        categorical data.
    version : int or None
        Dataset version to request; ``None`` leaves the choice to OpenML.
    cache_format : str
        ``"pickle"`` (default) or ``"feather"``.

    Raises
    ------
    ValueError
        If ``cache_format`` is not one of the supported formats.
    """
    # Fail fast on a mistyped cache format instead of only discovering it
    # inside load(), after the dataset ID has already been resolved.
    valid_formats = ("pickle", "feather")
    if cache_format not in valid_formats:
        raise ValueError(
            f"cache_format must be one of {valid_formats}, got '{cache_format}'"
        )
    self.target_column = target_column
    self.numeric_only = numeric_only
    self.version = version
    self.cache_format = cache_format

Functions

load(dataset_id)

Load an OpenML dataset by ID or name.

Parameters:

Name Type Description Default
dataset_id int or str

OpenML dataset ID (e.g. 61 for Iris) or dataset name (e.g. "iris"). When a name is given, the most recently published version matching the name is used (or the version specified by the version parameter).

required

Returns:

Type Description
Dataset

Raises:

Type Description
ImportError

If openml is not installed.

ValueError

If no numeric columns remain after filtering, or the dataset cannot be found.

Source code in quprep/ingest/openml_ingester.py
def load(self, dataset_id: int | str) -> Dataset:
    """
    Fetch an OpenML dataset (by ID or by name) and convert it to a Dataset.

    Parameters
    ----------
    dataset_id : int or str
        Numeric OpenML dataset ID (``61`` is Iris) or a dataset name such
        as ``"iris"``.  For a name, the most recently published version
        matching it is used unless the ``version`` parameter pins one.

    Returns
    -------
    Dataset

    Raises
    ------
    ImportError
        If ``openml`` is not installed.
    ValueError
        If no numeric columns remain after filtering, or the dataset
        cannot be found.
    """
    try:
        import openml
    except ImportError as e:
        raise ImportError(
            "OpenMLIngester requires the 'openml' package. "
            "Install it with: pip install quprep[openml]"
        ) from e

    import pandas as pd

    def _is_non_numeric(series) -> bool:
        # Categorical, object and pandas "string" columns are non-numeric.
        dtype = series.dtype
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(series)
            or dtype.name == "string"
        )

    # A string name is translated to a numeric ID first.
    resolved_id = self._resolve_id(dataset_id, openml)

    oml_dataset = openml.datasets.get_dataset(
        resolved_id,
        download_data=True,
        version=self.version,
        cache_format=self.cache_format,
    )

    # An explicit target_column wins over the dataset's default target.
    target = self.target_column or oml_dataset.default_target_attribute

    # Only the feature frame and the target are used from the 4-tuple.
    frame, y, _categorical_indicator, _attribute_names = oml_dataset.get_data(
        target=target,
        dataset_format="dataframe",
    )

    df: pd.DataFrame = frame  # type: ignore[assignment]

    labels: np.ndarray | None = None
    if y is not None:
        labels = np.asarray(y)
        if labels.ndim <= 1:
            labels = labels.ravel()

    all_feature_names = list(df.columns)
    all_feature_types = _detect_feature_types(df)

    # One flag per column: True when the column is kept as a numeric feature.
    keep_flags = [not _is_non_numeric(df[col]) for col in df.columns]

    numeric_cols: list = []
    cat_cols: list = []
    for col, keep in zip(df.columns, keep_flags):
        (numeric_cols if keep else cat_cols).append(col)

    if not numeric_cols:
        # NOTE(review): this is raised regardless of numeric_only, so the
        # "set numeric_only=False" hint may not help -- confirm the intent.
        raise ValueError(
            f"No numeric columns found in OpenML dataset '{dataset_id}'. "
            f"Available columns: {all_feature_names}. "
            "Check target_column or set numeric_only=False."
        )

    data = df[numeric_cols].to_numpy(dtype=float)
    numeric_types = [
        ftype for ftype, keep in zip(all_feature_types, keep_flags) if keep
    ]

    if self.numeric_only:
        categorical_data = {}
    else:
        categorical_data = {col: df[col].tolist() for col in cat_cols}

    metadata = {
        "source": f"openml:{resolved_id}",
        "dataset_id": resolved_id,
        "dataset_name": oml_dataset.name,
        "version": oml_dataset.version,
        "original_columns": all_feature_names,
        "original_types": all_feature_types,
        "n_dropped_categorical": len(cat_cols),
        "target_column": target,
    }
    return Dataset(
        data=data,
        feature_names=numeric_cols,
        feature_types=numeric_types,
        categorical_data=categorical_data,
        metadata=metadata,
        labels=labels,
    )

GraphIngester

quprep.ingest.graph_ingester.GraphIngester(features='all', n_features=None)

Convert graph data into a Dataset of feature vectors (lossy path).

Extracts a fixed-size feature vector from each graph so that existing encoders (AngleEncoder, AmplitudeEncoder, etc.) can be applied without modification. Features are drawn from the graph's Laplacian spectrum and degree sequence — both are proven to carry structural information relevant to graph classification tasks.

For the structure-preserving (lossless) path, use GraphStateEncoder (quprep.encode.graph_state.GraphStateEncoder) directly.

Parameters:

Name Type Description Default
features str

Which features to extract:

  • 'laplacian_eigenvalues' — sorted eigenvalues of the normalized Laplacian (captures global topology).
  • 'degree' — sorted node degree sequence.
  • 'all' (default) — concatenation of both.
  • 'adjacency' — flattened upper triangle of the adjacency matrix (lossless path). Use with GraphStateEncoder (quprep.encode.graph_state.GraphStateEncoder) for structure-preserving quantum encoding. All graphs in a batch must have the same number of nodes.
'all'
n_features int or None

Pad or truncate the feature vector to exactly this length. Not supported for features='adjacency' (lossless path requires exact adjacency structure).

None

Examples:

Lossy path — feature vectors for any standard encoder::

import numpy as np
import quprep as qd

adj = np.array([[0,1,1],[1,0,1],[1,1,0]], dtype=float)
dataset = qd.GraphIngester().load(adj)

Lossless path — full adjacency for GraphStateEncoder::

from quprep.core.pipeline import Pipeline
enc = qd.GraphIngester(features="adjacency")
dataset = enc.load(adj)
result = Pipeline(encoder=qd.GraphStateEncoder()).fit_transform(dataset)

Batch of graphs (lossy)::

import networkx as nx
graphs = [nx.path_graph(5), nx.cycle_graph(6), nx.complete_graph(4)]
dataset = qd.GraphIngester(n_features=8).load(graphs)
print(dataset.data.shape)   # (3, 8)
Source code in quprep/ingest/graph_ingester.py
def __init__(
    self,
    features: str = "all",
    n_features: int | None = None,
):
    """Validate and record which graph features to extract."""
    if features not in _VALID_FEATURES:
        raise ValueError(f"features must be one of {_VALID_FEATURES}, got '{features}'")

    # A fixed output length cannot be combined with the lossless path.
    wants_fixed_length = n_features is not None
    if wants_fixed_length and features == "adjacency":
        raise ValueError(
            "n_features is not supported for features='adjacency'. "
            "The adjacency path is lossless — truncating would corrupt edge structure."
        )

    self.features, self.n_features = features, n_features

Functions

load(source)

Load graph(s) and return a Dataset of feature vectors.

Parameters:

Name Type Description Default
source np.ndarray, networkx.Graph, or list
  • np.ndarray — square adjacency matrix, shape (n, n).
  • networkx.Graph / DiGraph — converted to adjacency matrix internally (requires networkx; pure NumPy path available).
  • list — each element is a graph (ndarray or networkx Graph); all are embedded and stacked into a single Dataset.
required

Returns:

Type Description
Dataset

data shape is (n_graphs, n_features). metadata["modality"] is "graph". metadata["features"] is the feature set used. metadata["n_nodes"] is a list of node counts per graph.

Raises:

Type Description
ValueError

If the adjacency matrix is not square, or no graphs are provided.

Source code in quprep/ingest/graph_ingester.py
def load(self, source) -> Dataset:
    """
    Turn one graph, or a list of graphs, into a Dataset of feature vectors.

    Parameters
    ----------
    source : np.ndarray, networkx.Graph, or list
        - **np.ndarray** — square adjacency matrix, shape ``(n, n)``.
        - **networkx.Graph / DiGraph** — converted to adjacency matrix
          internally (requires ``networkx``; pure NumPy path available).
        - **list** — every element is itself a graph (ndarray or networkx
          Graph); each becomes one row of the resulting Dataset.

    Returns
    -------
    Dataset
        ``data`` has shape ``(n_graphs, n_features)``.  The metadata holds
        ``"modality"`` (always ``"graph"``), ``"features"`` (the feature
        set used), ``"n_nodes"`` (node count per graph) and
        ``"n_features"`` (row length).

    Raises
    ------
    ValueError
        If the adjacency matrix is not square, or no graphs are provided.
        Also raised for ``features='adjacency'`` when the per-graph vectors
        differ in length.
    """
    # Treat a single graph as a batch of one; only a list counts as a batch.
    is_batch = isinstance(source, list)
    if is_batch and not source:
        raise ValueError("Empty graph list.")
    graphs = source if is_batch else [source]

    vectors = [self._graph_to_vec(graph) for graph in graphs]
    n_nodes = [self._n_nodes(graph) for graph in graphs]

    if self.features == "adjacency":
        # Lossless path: rows are stacked as-is, so lengths must agree.
        distinct_lengths = {vec.shape[0] for vec in vectors}
        if len(distinct_lengths) > 1:
            raise ValueError(
                "features='adjacency' requires all graphs to have the same number "
                f"of nodes, but got node counts: {n_nodes}. "
                "Use features='all' with n_features= for variable-size batches."
            )
        matrix = np.stack(vectors, axis=0)
    else:
        # Lossy path: bring every vector to the requested length, or to the
        # longest vector in the batch when none was requested.
        target_len = self.n_features or max(vec.shape[0] for vec in vectors)
        resized = [self._fit_length(vec, target_len) for vec in vectors]
        matrix = np.stack(resized, axis=0)

    width = matrix.shape[1]
    metadata = {
        "modality": "graph",
        "features": self.features,
        "n_nodes": n_nodes,
        "n_features": width,
    }
    return Dataset(
        data=matrix,
        feature_names=[f"gfeat_{i}" for i in range(width)],
        feature_types=["continuous"] * width,
        metadata=metadata,
    )

GraphStateEncoder

quprep.encode.graph_state.GraphStateEncoder

Bases: BaseEncoder

Encode a graph as a graph state circuit (lossless path).

Produces the graph state $|G\rangle = \prod_{(i,j) \in E} CZ_{ij}\, H^{\otimes n} |0\rangle^{\otimes n}$. The circuit preserves the full graph structure — every edge becomes a CZ entangling gate.

Two usage patterns:

Pipeline path (recommended) — pair with GraphIngester (quprep.ingest.graph_ingester.GraphIngester) using features='adjacency'::

dataset = GraphIngester(features="adjacency").load(adj)
result  = Pipeline(encoder=GraphStateEncoder()).fit_transform(dataset)

Direct path — pass the adjacency matrix directly::

result = GraphStateEncoder().encode_graph(adj)

For feature-based (lossy) encoding, use GraphIngester (quprep.ingest.graph_ingester.GraphIngester) with any standard encoder instead.

Examples:

Direct graph encoding::

import numpy as np
import quprep as qd

adj = np.array([[0,1,1,0],[1,0,1,0],[1,1,0,1],[0,0,1,0]], dtype=float)
encoder = qd.GraphStateEncoder()
result = encoder.encode_graph(adj)
print(result.metadata["n_qubits"])   # 4
print(result.metadata["edges"])      # [(0,1),(0,2),(1,2),(2,3)]

Functions

encode(x)

Encode a flattened upper-triangle adjacency vector as a graph state.

Parameters:

Name Type Description Default
x (ndarray, shape(n * (n - 1) // 2))

Flattened upper triangle of the adjacency matrix (row-major, values thresholded at 0.5 to determine edge presence). Use encode_graph to pass an adjacency matrix directly.

required

Returns:

Type Description
EncodedResult
Source code in quprep/encode/graph_state.py
def encode(self, x: np.ndarray) -> EncodedResult:
    """
    Build a graph state from a flattened upper-triangle adjacency vector.

    Parameters
    ----------
    x : np.ndarray, shape (n*(n-1)//2,)
        Strict upper triangle of the adjacency matrix in row-major order
        (values thresholded at 0.5 to determine edge presence).
        Use :meth:`encode_graph` to pass an adjacency matrix directly.

    Returns
    -------
    EncodedResult

    Raises
    ------
    ValueError
        If ``len(x)`` is not of the form ``n*(n-1)//2``.
    """
    flat = np.asarray(x, dtype=float)
    n_entries = len(flat)

    # Invert n_entries = n*(n-1)/2 for n, then confirm the inversion is exact.
    n_nodes = int((1 + np.sqrt(1 + 8 * n_entries)) / 2)
    if n_nodes * (n_nodes - 1) // 2 != n_entries:
        raise ValueError(
            f"Input length {n_entries} is not a valid upper-triangle size. "
            "Expected n*(n-1)//2 for some integer n."
        )

    # Fill the strict upper triangle, then mirror it to get a symmetric matrix.
    upper = np.zeros((n_nodes, n_nodes))
    upper[np.triu_indices(n_nodes, k=1)] = flat
    return self._from_adj(upper + upper.T)

encode_batch_graphs(graphs)

Encode a list of adjacency matrices.

Parameters:

Name Type Description Default
graphs list of np.ndarray

Each element is a square adjacency matrix.

required

Returns:

Type Description
list of EncodedResult
Source code in quprep/encode/graph_state.py
def encode_batch_graphs(self, graphs: list[np.ndarray]) -> list[EncodedResult]:
    """
    Encode several adjacency matrices, one result per graph.

    Parameters
    ----------
    graphs : list of np.ndarray
        Square adjacency matrices; each one is passed to
        :meth:`encode_graph`.

    Returns
    -------
    list of EncodedResult
        Results in the same order as ``graphs``.
    """
    return list(map(self.encode_graph, graphs))

encode_graph(adj)

Encode directly from a square adjacency matrix.

Parameters:

Name Type Description Default
adj (ndarray, shape(n, n))

Square adjacency matrix. Values > 0.5 are treated as edges.

required

Returns:

Type Description
EncodedResult
Source code in quprep/encode/graph_state.py
def encode_graph(self, adj: np.ndarray) -> EncodedResult:
    """
    Build a graph state straight from a square adjacency matrix.

    Parameters
    ----------
    adj : np.ndarray, shape (n, n)
        Square adjacency matrix. Values > 0.5 are treated as edges.

    Returns
    -------
    EncodedResult

    Raises
    ------
    ValueError
        If ``adj`` is not a 2-D array with equal side lengths.
    """
    matrix = np.asarray(adj, dtype=float)
    is_square = matrix.ndim == 2 and matrix.shape[0] == matrix.shape[1]
    if not is_square:
        raise ValueError(f"adj must be a square 2-D array, got shape {matrix.shape}")
    return self._from_adj(matrix)

ImageIngester

quprep.ingest.image_ingester.ImageIngester(size=(28, 28), grayscale=True, normalize=True)

Ingest image files into a Dataset of flattened pixel vectors.

Loads single images or entire directories. When the directory contains subdirectories, the subdirectory name is used as the class label (ImageFolder convention). Pixel values are optionally normalized to [0, 1] and resized to a common shape before flattening.

Requires pip install quprep[image] (Pillow).

Parameters:

Name Type Description Default
size tuple of (int, int) or None

(height, width) to resize each image to before flattening. If None, images are used at their original resolution — all images in a batch must then be the same size.

(28, 28)
grayscale bool

If True (default), convert images to grayscale (1 channel). If False, keep RGB (3 channels).

True
normalize bool

If True (default), divide pixel values by 255 to map to [0.0, 1.0]. Set to False to keep raw [0, 255] integers.

True

Examples:

Single image::

ingester = ImageIngester(size=(28, 28))
dataset = ingester.load("cat.png")

Directory with class labels::

# images/cat/img1.jpg, images/dog/img1.jpg
ingester = ImageIngester(size=(32, 32))
dataset = ingester.load("images/")
print(dataset.labels)        # ['cat', 'cat', ..., 'dog', ...]
print(dataset.data.shape)    # (n_images, 32*32)
Source code in quprep/ingest/image_ingester.py
def __init__(
    self,
    size: tuple[int, int] | None = (28, 28),
    grayscale: bool = True,
    normalize: bool = True,
):
    """Record the image preprocessing options; no files are read here."""
    self.size, self.grayscale, self.normalize = size, grayscale, normalize

Functions

load(source)

Load one or more images and return a Dataset.

Parameters:

Name Type Description Default
source str or Path

Path to a single image file or to a directory of images. Supported formats: PNG, JPG/JPEG, BMP, TIFF, WebP.

Directory loading — two layouts are supported:

  • Flat: all image files at the top level → no labels.
  • Subfolders: each subdirectory is a class; images inside are samples → dataset.labels holds the class name strings.
required

Returns:

Type Description
Dataset

data shape is (n_samples, n_pixels) where n_pixels = height × width (grayscale) or height × width × 3 (RGB). metadata["modality"] is "image". metadata["size"] is the (H, W) tuple used. metadata["channels"] is 1 (grayscale) or 3 (RGB).

Raises:

Type Description
ImportError

If Pillow is not installed.

FileNotFoundError

If source does not exist.

ValueError

If no supported image files are found, or images have mismatched sizes when size=None.

Source code in quprep/ingest/image_ingester.py
def load(self, source: str | Path) -> Dataset:
    """
    Load one or more images and return a Dataset.

    Parameters
    ----------
    source : str or Path
        Path to a single image file or to a directory of images.
        Supported formats: PNG, JPG/JPEG, BMP, TIFF, WebP.

        *Directory loading* — two layouts are supported:

        - **Flat**: all image files at the top level → no labels.
        - **Subfolders**: each subdirectory is a class; images inside
          are samples → ``dataset.labels`` holds the class name strings.

    Returns
    -------
    Dataset
        ``data`` shape is ``(n_samples, n_pixels)`` where
        ``n_pixels = height × width`` (grayscale) or
        ``height × width × 3`` (RGB).
        ``metadata["modality"]`` is ``"image"``.
        ``metadata["size"]`` is ``self.size``: the ``(H, W)`` resize
        target, or ``None`` when no resizing was requested.
        ``metadata["channels"]`` is ``1`` (grayscale) or ``3`` (RGB).
        ``metadata["n_images"]`` is set for directory sources only.

    Raises
    ------
    ImportError
        If Pillow is not installed.
    FileNotFoundError
        If ``source`` does not exist.
    ValueError
        If no supported image files are found, or images have
        mismatched sizes when ``size=None``.
    """
    # Pillow is an optional dependency, so it is imported lazily.
    try:
        from PIL import Image
    except ImportError as e:
        raise ImportError(
            "ImageIngester requires Pillow. Install it with: pip install quprep[image]"
        ) from e

    source = Path(source)
    if not source.exists():
        raise FileNotFoundError(f"Path not found: {source}")

    channels = 1 if self.grayscale else 3

    if source.is_file():
        # NOTE(review): _load_one appears to return a flat 1-D pixel vector
        # (it is reshaped to a single row below) — confirm against the helper.
        arr = self._load_one(source, Image)
        n_pixels = arr.shape[0]
        return Dataset(
            data=arr.reshape(1, -1),
            feature_names=[f"px_{i}" for i in range(n_pixels)],
            feature_types=["continuous"] * n_pixels,
            metadata={
                "source": str(source),
                "modality": "image",
                "size": self.size,
                "channels": channels,
            },
        )

    # --- directory ---
    paths, labels = self._collect(source)

    arrays = [self._load_one(p, Image) for p in paths]
    self._check_shapes(arrays, paths)

    data = np.stack(arrays, axis=0)          # (n, n_pixels)
    n_pixels = data.shape[1]
    # Labels are only attached when at least one image has a class name.
    label_arr = np.array(labels) if any(lbl is not None for lbl in labels) else None

    return Dataset(
        data=data,
        feature_names=[f"px_{i}" for i in range(n_pixels)],
        feature_types=["continuous"] * n_pixels,
        metadata={
            "source": str(source),
            "modality": "image",
            "size": self.size,
            "channels": channels,
            "n_images": len(paths),
        },
        labels=label_arr,
    )

TimeSeriesIngester

quprep.ingest.timeseries_ingester.TimeSeriesIngester(time_column=None, delimiter=None, encoding='utf-8', target_columns=None)

Ingest a time-series CSV file into a Dataset.

Reads a CSV where rows are timesteps and columns are features. An optional datetime column is extracted as a time index and stored in Dataset.metadata["time_index"] rather than treated as a feature.

The resulting Dataset preserves temporal ordering. Pass it to a WindowTransformer (quprep.preprocess.window.WindowTransformer) to produce sliding-window samples ready for quantum encoding.

Parameters:

Name Type Description Default
time_column str or None

Name of the column containing timestamps. Parsed with pandas.to_datetime. If None, no time column is extracted and an integer index is stored instead.

None
delimiter str or None

Field delimiter. Auto-detected from file extension if None: '.tsv' → tab, everything else → comma.

None
encoding str

File encoding. Defaults to 'utf-8'.

'utf-8'
target_columns str or list of str

Column name(s) to treat as labels rather than features. Stored in Dataset.labels.

None
Source code in quprep/ingest/timeseries_ingester.py
def __init__(
    self,
    time_column: str | None = None,
    delimiter: str | None = None,
    encoding: str = "utf-8",
    target_columns: str | list[str] | None = None,
):
    self.time_column = time_column
    self.delimiter = delimiter
    self.encoding = encoding
    self.target_columns = target_columns

Functions

load(path)

Load a time-series CSV and return a Dataset.

Parameters:

Name Type Description Default
path str or Path
required

Returns:

Type Description
Dataset

metadata["time_index"] holds the parsed timestamps (list of pandas.Timestamp) or a plain integer range if no time_column was specified. metadata["modality"] is set to "time_series".

Raises:

Type Description
FileNotFoundError
Source code in quprep/ingest/timeseries_ingester.py
def load(self, path: str | Path) -> Dataset:
    """
    Load a time-series CSV and return a Dataset.

    Only numeric columns are kept as features; the time column and any
    target columns are removed from the feature matrix first.

    Parameters
    ----------
    path : str or Path
        CSV/TSV file to read. When no delimiter was configured, ``.tsv``
        files are read tab-separated and everything else comma-separated.

    Returns
    -------
    Dataset
        ``metadata["time_index"]`` holds the parsed timestamps (list of
        ``pandas.Timestamp``; unparseable values are coerced rather than
        raising) or a plain integer range if no ``time_column`` was
        specified or the column is absent from the file.
        ``metadata["modality"]`` is set to ``"time_series"``.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.

    Warns
    -----
    UserWarning
        If ``time_column`` was given but is not a column of the file; the
        integer index fallback is used in that case.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    delimiter = self.delimiter
    if delimiter is None:
        delimiter = "\t" if path.suffix.lower() == ".tsv" else ","

    df = pd.read_csv(path, delimiter=delimiter, encoding=self.encoding)

    # --- extract time column ---
    if self.time_column and self.time_column in df.columns:
        time_index = pd.to_datetime(df[self.time_column], errors="coerce").tolist()
        df = df.drop(columns=[self.time_column])
    else:
        if self.time_column:
            # A misspelled column name would otherwise be dropped silently
            # with the other non-numeric columns; surface it to the caller.
            import warnings

            warnings.warn(
                f"time_column '{self.time_column}' not found in {path}; "
                "falling back to an integer time index.",
                UserWarning,
                stacklevel=2,
            )
        time_index = list(range(len(df)))

    # --- extract label columns ---
    labels = None
    if self.target_columns is not None:
        cols = (
            [self.target_columns]
            if isinstance(self.target_columns, str)
            else list(self.target_columns)
        )
        labels = df[cols].to_numpy()
        # A single target is exposed as a 1-D vector, not an (n, 1) matrix.
        if labels.shape[1] == 1:
            labels = labels.ravel()
        df = df.drop(columns=cols)

    numeric_cols = df.select_dtypes(include="number").columns.tolist()
    data = df[numeric_cols].to_numpy(dtype=float)

    return Dataset(
        data=data,
        feature_names=numeric_cols,
        feature_types=["continuous"] * len(numeric_cols),
        metadata={
            "source": str(path),
            "time_index": time_index,
            "modality": "time_series",
        },
        labels=labels,
    )

WindowTransformer

quprep.preprocess.window.WindowTransformer(window_size=16, step=1, flatten=True)

Convert a time series Dataset into a set of sliding-window samples.

Each window becomes one row in the output Dataset, with features laid out as [feat0_lag(W-1), feat1_lag(W-1), ..., feat0_lag0, feat1_lag0] (oldest timestep first). The output is a standard 2-D Dataset that can be fed directly into any QuPrep encoder.

Parameters:

Name Type Description Default
window_size int

Number of consecutive timesteps per window. Must be ≤ the number of rows in the input Dataset.

16
step int

Stride between consecutive windows. Default 1 (fully overlapping). Set to window_size for non-overlapping windows.

1
flatten bool

If True (default), each window is flattened to a 1-D vector of length window_size × n_features. If False, the raw (window_size, n_features) array is stored per window — useful for inspection but not compatible with the standard encoders.

True

Examples:

>>> from quprep.ingest.timeseries_ingester import TimeSeriesIngester
>>> from quprep.preprocess.window import WindowTransformer
>>> from quprep.encode.angle import AngleEncoder
>>> from quprep.core.pipeline import Pipeline
>>>
>>> pipeline = Pipeline(
...     ingester=TimeSeriesIngester(time_column="date"),
...     preprocessor=WindowTransformer(window_size=8, step=1),
...     encoder=AngleEncoder(),
... )
>>> result = pipeline.fit_transform("timeseries.csv")
Source code in quprep/preprocess/window.py
def __init__(
    self,
    window_size: int = 16,
    step: int = 1,
    flatten: bool = True,
):
    """
    Validate and store the windowing parameters.

    Raises
    ------
    ValueError
        If ``window_size`` or ``step`` is smaller than 1
        (``window_size`` is checked first).
    """
    for label, value in (("window_size", window_size), ("step", step)):
        if value < 1:
            raise ValueError(f"{label} must be >= 1, got {value}")

    self.flatten = flatten
    self.step = step
    self.window_size = window_size
    # Filled in by fit(); stays None until then.
    self._n_features: int | None = None

Functions

fit(dataset)

Fit the transformer (records input feature count).

Parameters:

Name Type Description Default
dataset Dataset
required

Returns:

Type Description
WindowTransformer

Returns self for chaining.

Source code in quprep/preprocess/window.py
def fit(self, dataset: Dataset) -> WindowTransformer:
    """
    Remember how many features the input dataset has.

    Parameters
    ----------
    dataset : Dataset
        Dataset whose ``n_features`` is recorded.

    Returns
    -------
    WindowTransformer
        The same instance, so calls can be chained.
    """
    n_features = dataset.n_features
    self._n_features = n_features
    return self

fit_transform(dataset)

Fit and transform in one call.

Source code in quprep/preprocess/window.py
def fit_transform(self, dataset: Dataset) -> Dataset:
    """Run ``fit`` on *dataset*, then return ``transform`` of the same dataset."""
    fitted = self.fit(dataset)
    return fitted.transform(dataset)

transform(dataset)

Apply sliding-window extraction and return a new Dataset.

Parameters:

Name Type Description Default
dataset Dataset

Time series dataset with shape (n_timesteps, n_features).

required

Returns:

Type Description
Dataset

Shape (n_windows, window_size × n_features) where n_windows = (n_timesteps - window_size) // step + 1.

Raises:

Type Description
ValueError

If n_timesteps < window_size.

Source code in quprep/preprocess/window.py
def transform(self, dataset: Dataset) -> Dataset:
    """
    Cut the time series into sliding windows and return them as a new Dataset.

    Parameters
    ----------
    dataset : Dataset
        Time series dataset with shape (n_timesteps, n_features).

    Returns
    -------
    Dataset
        One row per window. With ``flatten=True`` the shape is
        (n_windows, window_size × n_features), where
        ``n_windows = (n_timesteps - window_size) // step + 1``.
        With ``flatten=False`` each window keeps its 2-D shape and the
        original feature names are reused. Labels and timestamps, when
        present, are taken from the last timestep of each window.

    Raises
    ------
    ValueError
        If ``n_timesteps < window_size``.
    """
    series = dataset.data
    n_timesteps, n_features = series.shape
    width = self.window_size

    if n_timesteps < width:
        raise ValueError(
            f"Time series has {n_timesteps} timesteps but "
            f"window_size={self.window_size}. Reduce window_size or "
            "provide more data."
        )

    time_index = dataset.metadata.get("time_index")

    # First row of every window, and the row that closes it (most recent step).
    starts = range(0, n_timesteps - width + 1, self.step)
    closing_rows = [start + width - 1 for start in starts]

    if self.flatten:
        windows = [series[start : start + width].flatten() for start in starts]
    else:
        windows = [series[start : start + width] for start in starts]
    windows_array = np.array(windows)

    # Each window is tagged with the timestamp of its closing row.
    if time_index is not None:
        window_time_index = [time_index[row] for row in closing_rows]
    else:
        window_time_index = []

    # Names follow <original_feat>_lag<k>, oldest step first, k=0 most recent.
    base_names = dataset.feature_names or [f"x{i}" for i in range(n_features)]
    if not self.flatten:
        feature_names = base_names
    else:
        feature_names = []
        for lag in range(width - 1, -1, -1):
            feature_names.extend(f"{feat}_lag{lag}" for feat in base_names)

    # Carry over all metadata except the per-timestep index, which no longer
    # lines up with the windowed rows.
    meta = {key: value for key, value in dataset.metadata.items() if key != "time_index"}
    meta["window_size"] = self.window_size
    meta["step"] = self.step
    meta["original_n_timesteps"] = n_timesteps
    meta["window_time_index"] = window_time_index or None
    meta["modality"] = "time_series_windowed"

    # Window label = label at the window's closing row.
    labels = None
    if dataset.labels is not None:
        y = dataset.labels
        labels = np.array([y[row] for row in closing_rows])

    return Dataset(
        data=windows_array,
        feature_names=feature_names,
        feature_types=["continuous"] * len(feature_names),
        metadata=meta,
        labels=labels,
    )

TextIngester

quprep.ingest.text_ingester.TextIngester(method='tfidf', model='all-MiniLM-L6-v2', max_features=512, text_column=None, target_column=None, delimiter=',')

Convert text data into a Dataset of dense feature vectors.

Two embedding methods are supported:

  • tfidf (default, no extra deps) — TF-IDF bag-of-words via sklearn. Sparse output is converted to dense. Use a PCAReducer (quprep.reduce.pca.PCAReducer) afterwards to bring the feature count down to a circuit-friendly size.
  • sentence_transformers — semantic sentence embeddings via the sentence-transformers library. Produces compact (384–768d) dense vectors that are directly suitable for angle or amplitude encoding. Requires pip install quprep[text].

Parameters:

Name Type Description Default
method str

'tfidf' (default) or 'sentence_transformers'.

'tfidf'
model str

Sentence-transformers model name. Only used when method='sentence_transformers'. Default: 'all-MiniLM-L6-v2' (384-d, fast, good quality).

'all-MiniLM-L6-v2'
max_features int or None

Maximum vocabulary size for TF-IDF. Ignored for sentence_transformers. Default 512.

512
text_column str or None

Column name containing text when loading a CSV file. Required for CSV sources; ignored for .txt files and list inputs.

None
target_column str or list of str or None

Column name(s) to treat as labels. Stored in Dataset.labels.

None
delimiter str

CSV delimiter. Default ','.

','

Examples:

From a list of strings::

ingester = TextIngester(method="tfidf", max_features=64)
dataset = ingester.load(["quantum is great", "machine learning rocks"])

From a text file (one sentence per line)::

dataset = TextIngester().load("corpus.txt")

From a CSV::

ingester = TextIngester(text_column="review", target_column="sentiment")
dataset = ingester.load("reviews.csv")
print(dataset.labels)   # sentiment column values

With sentence transformers::

ingester = TextIngester(method="sentence_transformers")
dataset = ingester.load(sentences)
# dataset.data.shape → (n, 384) — directly encode with AngleEncoder
Source code in quprep/ingest/text_ingester.py
def __init__(
    self,
    method: str = "tfidf",
    model: str = "all-MiniLM-L6-v2",
    max_features: int | None = 512,
    text_column: str | None = None,
    target_column: str | list[str] | None = None,
    delimiter: str = ",",
):
    """
    Validate the embedding method and store the ingestion options.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported embedding methods.
    """
    is_supported = method in _VALID_METHODS
    if not is_supported:
        raise ValueError(f"method must be one of {_VALID_METHODS}, got '{method}'")

    # Embedding configuration.
    self.method, self.model, self.max_features = method, model, max_features
    # Options used when the source is a CSV file.
    self.text_column, self.target_column, self.delimiter = (
        text_column,
        target_column,
        delimiter,
    )

Functions

load(source)

Load text data and return a Dataset of feature vectors.

Parameters:

Name Type Description Default
source list of str, str, or Path
  • list of str — texts are used directly.
  • .txt file — each non-empty line is one text sample.
  • .csv file — text_column must be set; rows become samples.
required

Returns:

Type Description
Dataset

data shape is (n_samples, n_features) where n_features is max_features (TF-IDF) or the embedding dimension (sentence_transformers). metadata["modality"] is "text". metadata["method"] is the embedding method used.

Raises:

Type Description
ImportError

If method='sentence_transformers' and the package is not installed.

FileNotFoundError

If a file path is provided but does not exist.

ValueError

If a CSV is passed but text_column is not set, or the column is not found.

Source code in quprep/ingest/text_ingester.py
def load(self, source) -> Dataset:
    """
    Turn text input into a Dataset of embedding vectors.

    Parameters
    ----------
    source : list of str, str, or Path
        - **list of str** — texts are used directly.
        - **.txt file** — each non-empty line is one text sample.
        - **.csv file** — ``text_column`` must be set; rows become samples.

    Returns
    -------
    Dataset
        ``data`` shape is ``(n_samples, n_features)``; ``n_features`` is
        the width of the matrix produced by the embedding step.
        ``metadata["modality"]`` is ``"text"``.
        ``metadata["method"]`` is the embedding method used.
        ``metadata["n_samples"]`` and ``metadata["n_features"]`` record
        the number of texts and the embedding width.

    Raises
    ------
    ImportError
        If ``method='sentence_transformers'`` and the package is not
        installed.
    FileNotFoundError
        If a file path is provided but does not exist.
    ValueError
        If a CSV is passed but ``text_column`` is not set, or the
        column is not found.
    """
    texts, labels = self._load_texts(source)
    embeddings = self._embed(texts)
    width = embeddings.shape[1]

    column_names = [f"emb_{idx}" for idx in range(width)]
    info = {
        "modality": "text",
        "method": self.method,
        "n_samples": len(texts),
        "n_features": width,
    }
    return Dataset(
        data=embeddings,
        feature_names=column_names,
        feature_types=["continuous"] * width,
        metadata=info,
        labels=labels,
    )

NumpyIngester — sparse & multi-label

quprep.ingest.numpy_ingester.NumpyIngester

Wrap a NumPy array, Pandas DataFrame, or SciPy sparse matrix as a Dataset.

Functions

load(data, y=None)

Convert array-like data to a Dataset.

Parameters:

Name Type Description Default
data np.ndarray, pd.DataFrame, or scipy.sparse matrix

2-D numeric array or DataFrame. 1-D arrays are treated as a single-feature column. Sparse matrices are converted to dense.

required
y ndarray or array-like

Target labels. Shape (n_samples,) for single-target or (n_samples, n_labels) for multi-label. Stored in Dataset.labels.

None

Returns:

Type Description
Dataset

Raises:

Type Description
TypeError

If data is not a recognisable array-like type.

ValueError

If data has more than 2 dimensions.

Source code in quprep/ingest/numpy_ingester.py
def load(self, data, y=None) -> Dataset:
    """
    Convert array-like data to a Dataset.

    Parameters
    ----------
    data : np.ndarray, pd.DataFrame, or scipy.sparse matrix
        2-D numeric array or DataFrame. 1-D arrays are treated as a
        single-feature column. Sparse matrices are converted to dense.
        For a DataFrame only the numeric columns are kept; feature names
        and feature types describe exactly those columns.
    y : np.ndarray or array-like, optional
        Target labels. Shape (n_samples,) for single-target or
        (n_samples, n_labels) for multi-label. Stored in ``Dataset.labels``.

    Returns
    -------
    Dataset

    Raises
    ------
    TypeError
        If data is not a recognisable array-like type.
    ValueError
        If data has more than 2 dimensions.
    """
    # --- sparse matrix support (scipy is optional) ---
    try:
        import scipy.sparse as _sp
    except ImportError:
        _sp = None
    if _sp is not None and _sp.issparse(data):
        data = data.toarray()

    labels = np.asarray(y) if y is not None else None

    # --- DataFrame support (pandas is optional) ---
    try:
        import pandas as pd
    except ImportError:
        pd = None

    if pd is not None and isinstance(data, pd.DataFrame):
        # Names and types are derived from the same numeric sub-frame as the
        # data matrix, so all three always have one entry per data column.
        # NOTE(review): assumes _detect_feature_types returns one entry per
        # column of the frame it is given — confirm against the helper.
        numeric = data.select_dtypes(include=[np.number])
        return Dataset(
            data=numeric.to_numpy(dtype=float),
            feature_names=list(numeric.columns.astype(str)),
            feature_types=_detect_feature_types(numeric),
            labels=labels,
        )

    if not isinstance(data, np.ndarray):
        try:
            data = np.asarray(data, dtype=float)
        except (TypeError, ValueError) as e:
            raise TypeError(
                f"Expected np.ndarray, pd.DataFrame, or scipy.sparse matrix, "
                f"got {type(data).__name__}"
            ) from e

    # A 1-D vector is interpreted as n_samples rows of a single feature.
    if data.ndim == 1:
        data = data.reshape(-1, 1)

    if data.ndim != 2:
        raise ValueError(f"Expected 2-D array, got shape {data.shape}")

    data = data.astype(float)
    n_features = data.shape[1]
    feature_names = [f"x{i}" for i in range(n_features)]
    feature_types = ["continuous"] * n_features

    return Dataset(
        data=data,
        feature_names=feature_names,
        feature_types=feature_types,
        labels=labels,
    )

stream(data, y=None, chunksize=1000)

Yield Dataset chunks from a NumPy array without duplicating it in RAM.

Parameters:

Name Type Description Default
data ndarray or DataFrame

2-D array. Processed identically to load().

required
y ndarray or array-like

Target labels.

None
chunksize int

Rows per chunk.

1000

Yields:

Type Description
Dataset
Source code in quprep/ingest/numpy_ingester.py
def stream(self, data, y=None, chunksize: int = 1000):
    """
    Yield Dataset chunks of at most ``chunksize`` rows from array-like data.

    The input is first normalised through :meth:`load`, then sliced in row
    order. Each yielded chunk holds its own copy of its rows, so chunks can
    be modified independently of one another.

    Parameters
    ----------
    data : np.ndarray or pd.DataFrame
        2-D array.  Processed identically to :meth:`load`.
    y : np.ndarray or array-like, optional
        Target labels.
    chunksize : int
        Rows per chunk. Must be >= 1.

    Yields
    ------
    Dataset
        ``metadata["chunk"]`` is the zero-based chunk index.

    Raises
    ------
    ValueError
        If ``chunksize`` is smaller than 1. As this is a generator, the
        error surfaces when iteration starts.
    """
    # Without this check a negative chunksize would silently yield nothing.
    if chunksize < 1:
        raise ValueError(f"chunksize must be >= 1, got {chunksize}")

    # Normalise to ndarray first (reuse load() logic for type handling)
    base = self.load(data, y=y)
    X = base.data
    labels = base.labels
    n = len(X)

    for chunk_idx, start in enumerate(range(0, n, chunksize)):
        end = min(start + chunksize, n)
        chunk_labels = labels[start:end] if labels is not None else None
        yield Dataset(
            data=X[start:end].copy(),
            feature_names=list(base.feature_names),
            feature_types=list(base.feature_types),
            metadata={"chunk": chunk_idx},
            labels=chunk_labels,
        )

CSVIngester — multi-label

quprep.ingest.csv_ingester.CSVIngester(delimiter=None, encoding='utf-8', target_columns=None)

Ingest CSV and TSV files into a Dataset.

Supports automatic type detection (continuous, discrete, binary, categorical) and basic dataset profiling on load.

Parameters:

Name Type Description Default
delimiter str or None

Field delimiter. Auto-detected from file extension if None: '.tsv' → tab, everything else → comma.

None
encoding str

File encoding. Defaults to 'utf-8'.

'utf-8'
target_columns str or list of str

Column name(s) to treat as labels rather than features. These columns are extracted and stored in Dataset.labels instead of Dataset.data. Supports single-target (str) and multi-label (list of str) use cases.

None
Source code in quprep/ingest/csv_ingester.py
def __init__(
    self,
    delimiter: str | None = None,
    encoding: str = "utf-8",
    target_columns: str | list[str] | None = None,
):
    self.delimiter = delimiter
    self.encoding = encoding
    self.target_columns = target_columns

Functions

load(path)

Load a CSV/TSV file and return a Dataset.

Numeric columns go into data. Non-numeric (categorical) columns are stored in categorical_data for CategoricalEncoder to process. NaN values are preserved as-is for the Imputer to handle. Columns listed in target_columns are extracted as Dataset.labels.

Parameters:

Name Type Description Default
path str or Path
required

Returns:

Type Description
Dataset

Raises:

Type Description
FileNotFoundError
Source code in quprep/ingest/csv_ingester.py
def load(self, path: str | Path) -> Dataset:
    """
    Read a CSV/TSV file into a Dataset.

    Columns with object or categorical dtype are stored in
    ``categorical_data`` (for CategoricalEncoder to process); every other
    column goes into ``data`` as float. NaN values are kept as-is for the
    Imputer to handle. Columns named in ``target_columns`` are removed
    from the features and returned as ``Dataset.labels``.

    Parameters
    ----------
    path : str or Path
        File to read. When no delimiter was configured, ``.tsv`` files
        are read tab-separated and everything else comma-separated.

    Returns
    -------
    Dataset
        ``metadata`` records the source path plus the names and detected
        types of all non-label columns.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    if self.delimiter is not None:
        delimiter = self.delimiter
    elif path.suffix.lower() == ".tsv":
        delimiter = "\t"
    else:
        delimiter = ","

    df = pd.read_csv(path, delimiter=delimiter, encoding=self.encoding)

    # Split the label columns off before any feature processing.
    labels = None
    if self.target_columns is not None:
        if isinstance(self.target_columns, str):
            cols = [self.target_columns]
        else:
            cols = list(self.target_columns)
        labels = df[cols].to_numpy()
        # A single target is exposed as a 1-D vector, not an (n, 1) matrix.
        if labels.shape[1] == 1:
            labels = labels.ravel()
        df = df.drop(columns=cols)

    all_feature_names = list(df.columns)
    all_feature_types = _detect_feature_types(df)

    # True for columns that must be routed to categorical_data.
    is_categorical = [
        isinstance(df[name].dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(df[name])
        for name in df.columns
    ]
    numeric_cols = [name for name, cat in zip(df.columns, is_categorical) if not cat]
    cat_cols = [name for name, cat in zip(df.columns, is_categorical) if cat]
    numeric_types = [
        ftype for ftype, cat in zip(all_feature_types, is_categorical) if not cat
    ]

    if numeric_cols:
        data = df[numeric_cols].to_numpy(dtype=float)
    else:
        # Keep the row count even when there is no numeric feature at all.
        data = np.empty((len(df), 0))

    categorical_data = {name: df[name].tolist() for name in cat_cols}

    info = {
        "source": str(path),
        "original_columns": all_feature_names,
        "original_types": all_feature_types,
    }
    return Dataset(
        data=data,
        feature_names=numeric_cols,
        feature_types=numeric_types,
        categorical_data=categorical_data,
        metadata=info,
        labels=labels,
    )

stream(path, chunksize=1000)

Yield Dataset chunks from a CSV file without loading it fully into RAM.

Column detection (numeric vs categorical), label extraction, and feature-type inference mirror load() — each chunk is a self-contained Dataset.

Parameters:

Name Type Description Default
path str or Path
required
chunksize int

Rows per chunk.

1000

Yields:

Type Description
Dataset

Raises:

Type Description
FileNotFoundError
Source code in quprep/ingest/csv_ingester.py
def stream(self, path: str | Path, chunksize: int = 1000):
    """
    Yield Dataset chunks from a CSV file without loading it fully into RAM.

    Column detection (numeric vs categorical), label extraction, and
    feature-type inference mirror :meth:`load` — each chunk is a
    self-contained Dataset.

    Parameters
    ----------
    path : str or Path
    chunksize : int
        Rows per chunk.

    Yields
    ------
    Dataset

    Raises
    ------
    FileNotFoundError
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    delimiter = self.delimiter
    if delimiter is None:
        delimiter = "\t" if path.suffix.lower() == ".tsv" else ","

    reader = pd.read_csv(
        path,
        delimiter=delimiter,
        encoding=self.encoding,
        chunksize=chunksize,
    )

    for chunk_idx, df in enumerate(reader):
        labels = None
        if self.target_columns is not None:
            cols = (
                [self.target_columns]
                if isinstance(self.target_columns, str)
                else list(self.target_columns)
            )
            labels = df[cols].to_numpy()
            if labels.shape[1] == 1:
                labels = labels.ravel()
            df = df.drop(columns=cols)

        all_feature_names = list(df.columns)
        all_feature_types = _detect_feature_types(df)

        numeric_mask = [
            not (
                isinstance(df[col].dtype, pd.CategoricalDtype)
                or pd.api.types.is_object_dtype(df[col])
            )
            for col in df.columns
        ]
        numeric_cols = [c for c, keep in zip(df.columns, numeric_mask) if keep]
        cat_cols = [c for c, keep in zip(df.columns, numeric_mask) if not keep]
        numeric_types = [t for t, keep in zip(all_feature_types, numeric_mask) if keep]

        data = (
            df[numeric_cols].to_numpy(dtype=float)
            if numeric_cols
            else np.empty((len(df), 0))
        )
        categorical_data = {col: df[col].tolist() for col in cat_cols}

        yield Dataset(
            data=data,
            feature_names=numeric_cols,
            feature_types=numeric_types,
            categorical_data=categorical_data,
            metadata={
                "source": str(path),
                "chunk": chunk_idx,
                "original_columns": all_feature_names,
                "original_types": all_feature_types,
            },
            labels=labels,
        )