Skip to content

dqm_ml_job.dataloaders.pandas

Pandas data loader for reading CSV files.

This module contains the PandasDataLoader and PandasDataSelection classes for loading and iterating over CSV file data using Pandas.

logger = logging.getLogger(__name__) module-attribute

PandasDataLoader

Data loader for CSV files using Pandas.

This loader reads CSV files and provides DataSelections for processing by the DQM pipeline.

Attributes:

Name Type Description
type str

The loader type identifier ("csv").

Source code in packages/dqm-ml-job/src/dqm_ml_job/dataloaders/pandas.py
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
class PandasDataLoader:
    """CSV-backed data loader built on Pandas.

    The loader remembers where a single CSV file lives and hands it to
    the DQM pipeline wrapped in DataSelection objects.

    Attributes:
        type: The loader type identifier ("csv").
    """

    type: str = "csv"

    def __init__(self, name: str, config: dict[str, Any] | None = None):
        """Set up the loader from its name and configuration.

        Args:
            name: Unique name for this loader instance.
            config: Configuration dictionary containing:
                - path: Path to CSV file (required)

        Raises:
            ValueError: If ``config`` is missing, empty, or has no
                ``path`` key.
        """
        # A missing or empty config is treated like one without "path".
        settings = config or {}
        if "path" not in settings:
            raise ValueError(f"Configuration for dataloader '{name}' must contain 'path'")
        self.name = name
        self.path = settings["path"]

    def get_selections(self) -> list[DataSelection]:
        """Build the selection backed by this loader's CSV file.

        Returns:
            A one-element list holding a PandasDataSelection created
            from this loader's name and path.
        """
        selection = PandasDataSelection(name=self.name, path=self.path)
        return [selection]

name = name instance-attribute

path = config['path'] instance-attribute

type: str = 'csv' class-attribute instance-attribute

__init__(name: str, config: dict[str, Any] | None = None)

Initialize the Pandas data loader.

Parameters:

Name Type Description Default
name str

Unique name for this loader instance.

required
config dict[str, Any] | None

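Configuration dictionary. It must contain the key `path`: the path to the CSV file (required). Although the parameter defaults to `None`, a missing or empty configuration raises `ValueError`.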

None

Raises:

Type Description
ValueError

If required config keys are missing.

Source code in packages/dqm-ml-job/src/dqm_ml_job/dataloaders/pandas.py
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def __init__(self, name: str, config: dict[str, Any] | None = None):
    """Initialize the Pandas data loader.

    Args:
        name: Unique name for this loader instance.
        config: Configuration dictionary containing:
            - path: Path to CSV file (required)

    Raises:
        ValueError: If required config keys are missing.
    """
    if not config or "path" not in config:
        raise ValueError(f"Configuration for dataloader '{name}' must contain 'path'")
    self.name = name
    self.path = config["path"]

get_selections() -> list[DataSelection]

Create a PandasDataSelection for the CSV file.

Returns:

Type Description
list[DataSelection]

A list containing a single PandasDataSelection instance.

Source code in packages/dqm-ml-job/src/dqm_ml_job/dataloaders/pandas.py
 94
 95
 96
 97
 98
 99
100
def get_selections(self) -> list[DataSelection]:
    """Build the selection backed by this loader's CSV file.

    Returns:
        A one-element list holding a PandasDataSelection created
        from this loader's name and path.
    """
    selection = PandasDataSelection(name=self.name, path=self.path)
    return [selection]

PandasDataSelection

Bases: DataSelection

A selection of data from a CSV file loaded via Pandas.

This class represents data loaded from a CSV file and provides an iterator over PyArrow RecordBatches.

Attributes:

Name Type Description
name str

Name identifier for this selection.

path str

Path to the CSV file.

data DataFrame | None

The loaded pandas DataFrame.

Source code in packages/dqm-ml-job/src/dqm_ml_job/dataloaders/pandas.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
class PandasDataSelection(DataSelection):
    """A selection of data from a CSV file loaded via Pandas.

    This class represents data loaded from a CSV file and provides
    an iterator over PyArrow RecordBatches.

    Attributes:
        name: Name identifier for this selection.
        path: Path to the CSV file.
        data: The loaded pandas DataFrame, or None until ``bootstrap``
            has been called.
    """

    def __init__(self, name: str, path: str):
        """Initialize a Pandas data selection.

        Args:
            name: Name identifier for this selection.
            path: Path to the CSV file.
        """
        self.name = name
        self.path = path
        self.data: pd.DataFrame | None = None

    @override
    def bootstrap(self, columns_list: list[str] | None = None) -> None:
        """Read the CSV file at ``path`` into ``data``.

        Args:
            columns_list: Accepted for interface compatibility but
                currently ignored; every column of the file is loaded.
        """
        # For CSV, we currently load everything
        self.data = pd.read_csv(self.path, sep=",")

    def __len__(self) -> int:
        """Return the number of loaded rows, or 0 if nothing is loaded."""
        return len(self.data) if self.data is not None else 0

    @override
    def get_nb_batches(self) -> int:
        """Return 1 once data is loaded (even if empty), otherwise 0."""
        return 1 if self.data is not None else 0

    @override
    def __iter__(self) -> Any:
        """Yield the loaded DataFrame as a single PyArrow RecordBatch.

        Nothing is yielded while ``data`` is still None.
        """
        if self.data is not None:
            yield pa.RecordBatch.from_pandas(self.data)

    @override
    def __repr__(self) -> str:
        """Return a debug representation showing the name and path."""
        # Use the real class name: the previous hard-coded "PandasSelection"
        # did not match this class and would also be wrong for subclasses.
        return f"{type(self).__name__}(name='{self.name}', path='{self.path}')"

data: pd.DataFrame | None = None instance-attribute

name = name instance-attribute

path = path instance-attribute

__init__(name: str, path: str)

Initialize a Pandas data selection.

Parameters:

Name Type Description Default
name str

Name identifier for this selection.

required
path str

Path to the CSV file.

required
Source code in packages/dqm-ml-job/src/dqm_ml_job/dataloaders/pandas.py
33
34
35
36
37
38
39
40
41
42
def __init__(self, name: str, path: str):
    """Record which CSV file this selection refers to.

    Only attributes are assigned here; ``data`` starts out as None.

    Args:
        name: Name identifier for this selection.
        path: Path to the CSV file.
    """
    self.name, self.path = name, path
    self.data: pd.DataFrame | None = None

__iter__() -> Any

Source code in packages/dqm-ml-job/src/dqm_ml_job/dataloaders/pandas.py
56
57
58
59
@override
def __iter__(self) -> Any:
    if self.data is not None:
        yield pa.RecordBatch.from_pandas(self.data)

__len__() -> int

Source code in packages/dqm-ml-job/src/dqm_ml_job/dataloaders/pandas.py
49
50
def __len__(self) -> int:
    """Return the number of loaded rows, or 0 if nothing is loaded."""
    if self.data is None:
        return 0
    return len(self.data)

__repr__() -> str

Source code in packages/dqm-ml-job/src/dqm_ml_job/dataloaders/pandas.py
61
62
63
@override
def __repr__(self) -> str:
    return f"PandasSelection(name='{self.name}', path='{self.path}')"

bootstrap(columns_list: list[str] | None = None) -> None

Source code in packages/dqm-ml-job/src/dqm_ml_job/dataloaders/pandas.py
44
45
46
47
@override
def bootstrap(self, columns_list: list[str] | None = None) -> None:
    # For CSV, we currently load everything
    self.data = pd.read_csv(self.path, sep=",")

get_nb_batches() -> int

Source code in packages/dqm-ml-job/src/dqm_ml_job/dataloaders/pandas.py
52
53
54
@override
def get_nb_batches(self) -> int:
    return 1 if self.data is not None else 0