Coverage for packages/dqm-ml-job/src/dqm_ml_job/dataloaders/pandas.py: 90%

"""Pandas data loader for reading CSV files.

This module contains the PandasDataLoader and PandasDataSelection classes
for loading and iterating over CSV file data using Pandas.
"""

import logging
from typing import Any

import pandas as pd
import pyarrow as pa

# COMPATIBILITY : from typing import Any, override # When support of 3.10 and 3.11 will be removed
from typing_extensions import override

from dqm_ml_job.dataloaders.proto import DataSelection

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

class PandasDataSelection(DataSelection):
    """A selection of data from a CSV file loaded via Pandas.

    The CSV file is not read at construction time; it is read when
    ``bootstrap`` is called. Once loaded, iterating over the selection
    yields the whole table as a single PyArrow RecordBatch.

    Attributes:
        name: Name identifier for this selection.
        path: Path to the CSV file.
        data: The loaded pandas DataFrame, or ``None`` until ``bootstrap``
            has been called.
    """

    def __init__(self, name: str, path: str):
        """Initialize a Pandas data selection without touching the file.

        Args:
            name: Name identifier for this selection.
            path: Path to the CSV file.
        """
        self.name = name
        self.path = path
        self.data: pd.DataFrame | None = None

    @override
    def bootstrap(self, columns_list: list[str] | None = None) -> None:
        """Read the comma-separated file at ``self.path`` into ``self.data``.

        Args:
            columns_list: Currently ignored; every column of the CSV is
                loaded regardless of the value passed.
        """
        # For CSV, we currently load everything
        self.data = pd.read_csv(self.path, sep=",")

    def __len__(self) -> int:
        """Return the number of loaded rows, or 0 when nothing is loaded."""
        if self.data is None:
            return 0
        return len(self.data)

    @override
    def get_nb_batches(self) -> int:
        """Return 1 once data has been loaded, otherwise 0."""
        if self.data is None:
            return 0
        return 1

    @override
    def __iter__(self) -> Any:
        """Yield the loaded DataFrame as one PyArrow RecordBatch.

        Nothing is yielded when no data has been loaded.
        """
        if self.data is None:
            return
        yield pa.RecordBatch.from_pandas(self.data)

    @override
    def __repr__(self) -> str:
        """Return a short description containing the name and the path."""
        # NOTE(review): the label reads 'PandasSelection' although the class is
        # PandasDataSelection — confirm whether that is intentional before
        # changing it, since callers or logs may rely on the current text.
        return f"PandasSelection(name='{self.name}', path='{self.path}')"

class PandasDataLoader:
    """Data loader for CSV files using Pandas.

    This loader validates its configuration at construction time and
    hands out a single PandasDataSelection for the configured CSV file.

    Attributes:
        type: The loader type identifier ("csv").
        name: Unique name of this loader instance.
        path: Path to the CSV file, taken from ``config["path"]``.
    """

    type: str = "csv"

    def __init__(self, name: str, config: dict[str, Any] | None = None):
        """Initialize the Pandas data loader.

        Args:
            name: Unique name for this loader instance.
            config: Configuration dictionary containing:
                - path: Path to CSV file (required)

        Raises:
            ValueError: If ``config`` is missing, empty, or has no
                ``"path"`` key.
        """
        # A missing or empty config is rejected the same way as one
        # without the required key.
        if not config or "path" not in config:
            raise ValueError(f"Configuration for dataloader '{name}' must contain 'path'")
        self.name = name
        self.path = config["path"]

    def get_selections(self) -> list[DataSelection]:
        """Create a PandasDataSelection for the CSV file.

        Returns:
            A list containing a single PandasDataSelection instance that
            carries this loader's name and path.
        """
        return [PandasDataSelection(name=self.name, path=self.path)]

Coverage for packages/dqm-ml-job/src/dqm_ml_job/dataloaders/pandas.py: 90%

36 statements