Coverage for packages / dqm-ml-job / src / dqm_ml_job / dataloaders / pandas.py: 90%

36 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-15 10:11 +0000

1"""Pandas data loader for reading CSV files. 

2 

3This module contains the PandasDataLoader and PandasDataSelection classes 

4for loading and iterating over CSV file data using Pandas. 

5""" 

6 

7import logging 

8from typing import Any 

9 

10import pandas as pd 

11import pyarrow as pa 

12 

13# COMPATIBILITY: switch to `from typing import Any, override` once support for Python 3.10 and 3.11 is dropped

14from typing_extensions import override 

15 

16from dqm_ml_job.dataloaders.proto import DataSelection 

17 

18logger = logging.getLogger(__name__) 

19 

20 

class PandasDataSelection(DataSelection):
    """CSV-backed data selection that is read into memory with Pandas.

    Constructing the selection does not touch the file. ``bootstrap`` reads
    it into a DataFrame, and iterating the selection afterwards yields that
    DataFrame as a single PyArrow RecordBatch.

    Attributes:
        name: Name identifier for this selection.
        path: Path to the CSV file.
        data: The loaded pandas DataFrame, or None until ``bootstrap`` runs.
    """

    def __init__(self, name: str, path: str):
        """Record the selection's name and CSV path without reading the file.

        Args:
            name: Name identifier for this selection.
            path: Path to the CSV file.
        """
        self.name = name
        self.path = path
        # Stays None until bootstrap() has read the file.
        self.data: pd.DataFrame | None = None

    @override
    def bootstrap(self, columns_list: list[str] | None = None) -> None:
        """Read the CSV file at ``self.path`` into ``self.data``.

        Args:
            columns_list: Currently ignored; every column in the file is
                loaded.
        """
        frame = pd.read_csv(self.path, sep=",")
        self.data = frame

    def __len__(self) -> int:
        """Return the number of loaded rows, or 0 if nothing is loaded yet."""
        if self.data is None:
            return 0
        return len(self.data)

    @override
    def get_nb_batches(self) -> int:
        """Return 1 once data is loaded (it is served as one batch), else 0."""
        return 0 if self.data is None else 1

    @override
    def __iter__(self) -> Any:
        """Yield the loaded DataFrame as one PyArrow RecordBatch.

        Nothing is yielded when no data has been loaded.
        """
        if self.data is None:
            return
        yield pa.RecordBatch.from_pandas(self.data)

    @override
    def __repr__(self) -> str:
        """Return a short description showing the selection's name and path."""
        # NOTE: the label reads "PandasSelection", not the class name.
        return f"PandasSelection(name='{self.name}', path='{self.path}')"

64 

65 

66class PandasDataLoader: 

67 """Data loader for CSV files using Pandas. 

68 

69 This loader reads CSV files and provides DataSelections for 

70 processing by the DQM pipeline. 

71 

72 Attributes: 

73 type: The loader type identifier ("csv"). 

74 """ 

75 

76 type: str = "csv" 

77 

78 def __init__(self, name: str, config: dict[str, Any] | None = None): 

79 """Initialize the Pandas data loader. 

80 

81 Args: 

82 name: Unique name for this loader instance. 

83 config: Configuration dictionary containing: 

84 - path: Path to CSV file (required) 

85 

86 Raises: 

87 ValueError: If required config keys are missing. 

88 """ 

89 if not config or "path" not in config: 89 ↛ 90line 89 didn't jump to line 90 because the condition on line 89 was never true

90 raise ValueError(f"Configuration for dataloader '{name}' must contain 'path'") 

91 self.name = name 

92 self.path = config["path"] 

93 

94 def get_selections(self) -> list[DataSelection]: 

95 """Create a PandasDataSelection for the CSV file. 

96 

97 Returns: 

98 A list containing a single PandasDataSelection instance. 

99 """ 

100 return [PandasDataSelection(name=self.name, path=self.path)]