Coverage for packages / dqm-ml-job / src / dqm_ml_job / dataloaders / proto.py: 100%

10 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-15 10:11 +0000

1"""Protocol definitions for data loaders and selections. 

2 

3This module contains the DataLoader and DataSelection protocol classes 

4that define the interface for data loading implementations. 

5""" 

6 

7from typing import Any, Protocol, runtime_checkable 

8 

9 

@runtime_checkable
class DataSelection(Protocol):
    """Structural interface for one subset of data discovered by a DataLoader.

    A DataSelection represents a concrete set of samples (e.g., a specific
    folder, a filtered view of a database, or a single file) and provides an
    iterator over data batches.

    The protocol is ``runtime_checkable``, so ``isinstance(obj, DataSelection)``
    works; such a check only verifies that the members below exist, not that
    their signatures or types match.

    Attributes:
        name: Name of the selection.
    """

    name: str

    def bootstrap(self, columns_list: list[str]) -> None:
        """Perform initial setup for the selection before iteration starts.

        Args:
            columns_list: List of column names that must be loaded for this
                selection.
        """

    def get_nb_batches(self) -> int:
        """Return the estimated number of batches in this selection.

        Used primarily for progress bar estimation.

        Returns:
            The estimated batch count.
        """

    def __iter__(self) -> Any:
        """Iterate over the selection, yielding pyarrow.RecordBatch objects.

        Returns:
            An iterator over the batches of this selection.
        """

41 

42 

@runtime_checkable
class DataLoader(Protocol):
    """Structural interface for Data Loader factories.

    A DataLoader is responsible for scanning a source (disk, DB, S3) and
    discovering available DataSelections based on its configuration.

    The protocol is ``runtime_checkable``, so ``isinstance(obj, DataLoader)``
    works; such a check only verifies that ``get_selections`` exists, not its
    signature or return type.
    """

    def get_selections(self) -> list[DataSelection]:
        """Discover and return the list of available selections for this loader.

        Returns:
            A list of initialized DataSelection instances.
        """