Coverage for packages / dqm-ml-job / src / dqm_ml_job / dataloaders / pandas.py: 90%
36 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-15 10:11 +0000
1"""Pandas data loader for reading CSV files.
3This module contains the PandasDataLoader and PandasDataSelection classes
4for loading and iterating over CSV file data using Pandas.
5"""
7import logging
8from typing import Any
10import pandas as pd
11import pyarrow as pa
13# COMPATIBILITY : from typing import Any, override # When support of 3.10 and 3.11 will be removed
14from typing_extensions import override
16from dqm_ml_job.dataloaders.proto import DataSelection
18logger = logging.getLogger(__name__)
class PandasDataSelection(DataSelection):
    """CSV-backed data selection read into memory with Pandas.

    Nothing is read at construction time. ``bootstrap`` loads the CSV file
    into a DataFrame, and iterating the selection afterwards yields that
    DataFrame converted to a single PyArrow RecordBatch.

    Attributes:
        name: Name identifier for this selection.
        path: Path to the CSV file.
        data: The loaded pandas DataFrame, or None until ``bootstrap`` runs.
    """

    def __init__(self, name: str, path: str):
        """Record the selection's identity without touching the file.

        Args:
            name: Name identifier for this selection.
            path: Path to the CSV file.
        """
        self.name = name
        self.path = path
        # Stays None until bootstrap() has read the file.
        self.data: pd.DataFrame | None = None

    @override
    def bootstrap(self, columns_list: list[str] | None = None) -> None:
        """Read the comma-separated file at ``self.path`` into ``self.data``.

        Args:
            columns_list: Accepted for interface compatibility but not used
                here; every column of the CSV is loaded.
        """
        frame = pd.read_csv(self.path, sep=",")
        self.data = frame

    def __len__(self) -> int:
        """Return the number of loaded rows, or 0 if nothing is loaded yet."""
        if self.data is None:
            return 0
        return len(self.data)

    @override
    def get_nb_batches(self) -> int:
        """Return 1 once data is loaded (it is served as one batch), else 0."""
        return int(self.data is not None)

    @override
    def __iter__(self) -> Any:
        """Yield the loaded DataFrame as a single PyArrow RecordBatch.

        Yields nothing when ``bootstrap`` has not populated ``self.data``.
        """
        if self.data is None:
            return
        yield pa.RecordBatch.from_pandas(self.data)

    @override
    def __repr__(self) -> str:
        """Return a short description containing the name and path."""
        return f"PandasSelection(name='{self.name}', path='{self.path}')"
class PandasDataLoader:
    """CSV data loader built on Pandas.

    The loader validates its configuration and exposes the configured CSV
    file as a single data selection.

    Attributes:
        type: The loader type identifier ("csv").
    """

    type: str = "csv"

    def __init__(self, name: str, config: dict[str, Any] | None = None):
        """Validate the configuration and remember the CSV path.

        Args:
            name: Unique name for this loader instance.
            config: Configuration dictionary containing:
                - path: Path to CSV file (required)

        Raises:
            ValueError: If ``config`` is missing, empty, or has no 'path' key.
        """
        # A missing/empty config and a config without 'path' are rejected alike.
        if not (config and "path" in config):
            raise ValueError(f"Configuration for dataloader '{name}' must contain 'path'")
        self.name = name
        self.path = config["path"]

    def get_selections(self) -> list[DataSelection]:
        """Build the selection describing the configured CSV file.

        Returns:
            A list containing a single PandasDataSelection instance that
            carries this loader's name and path.
        """
        selection = PandasDataSelection(name=self.name, path=self.path)
        return [selection]