Coverage for packages / dqm-ml-job / src / dqm_ml_job / dataloaders / proto.py: 100%
10 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-15 10:11 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-15 10:11 +0000
"""Protocol definitions for data loaders and selections.

This module contains the DataLoader and DataSelection protocol classes
that define the interface for data loading implementations.
"""
from typing import Any, Protocol, runtime_checkable
@runtime_checkable
class DataSelection(Protocol):
    """
    Protocol for a specific subset of data discovered by a DataLoader.

    A DataSelection represents a concrete set of samples (e.g., a specific folder,
    a filtered view of a database, or a single file) and provides an iterator
    over data batches.

    This is a structural interface: a class conforms by providing the members
    below, without inheriting from DataSelection. Because the protocol is
    ``runtime_checkable``, ``isinstance(obj, DataSelection)`` verifies only that
    the members exist, not their signatures or types.

    Attributes:
        name: String attribute that every selection must expose.
    """

    name: str

    # The method bodies are intentionally docstring-only: protocol members
    # declare the interface, and implementations supply the behavior.

    def bootstrap(self, columns_list: list[str]) -> None:
        """
        Perform initial setup for the selection before iteration starts.

        Args:
            columns_list: List of column names that must be loaded for this selection.
        """

    def get_nb_batches(self) -> int:
        """
        Return the estimated number of batches in this selection.

        Used primarily for progress bar estimation.
        """

    def __iter__(self) -> Any:
        """
        Iterate over the selection, yielding pyarrow.RecordBatch objects.
        """
@runtime_checkable
class DataLoader(Protocol):
    """
    Protocol for Data Loader factories.

    A DataLoader is responsible for scanning a source (disk, DB, S3) and
    discovering available DataSelections based on its configuration.

    This is a structural interface: a class conforms by providing
    ``get_selections``, without inheriting from DataLoader. Because the protocol
    is ``runtime_checkable``, ``isinstance(obj, DataLoader)`` verifies only that
    the method exists, not its signature or return type.
    """

    # The method body is intentionally docstring-only: protocol members declare
    # the interface, and implementations supply the behavior.

    def get_selections(self) -> list[DataSelection]:
        """
        Discover and return the list of available selections for this loader.

        Returns:
            A list of initialized DataSelection instances.
        """