Skip to content

dqm_ml_core.api.data_processor

Base data metric processor class.

This module contains the DatametricProcessor base class that all metric processors must inherit from. It provides the streaming architecture for processing large datasets.

logger = logging.getLogger(__name__) module-attribute

DatametricProcessor

Base class for all Data Quality metrics and feature extractors.

The processor follows a streaming lifecycle designed to handle large datasets without loading them entirely into memory:

  1. Feature Extraction (compute_features): Transformation of raw data into relevant features (e.g., image -> luminosity).
  2. Batch Aggregation (compute_batch_metric): Compression of features into intermediate statistics (e.g., count, partial sum, histogram).
  3. Global Computation (compute): Final aggregation of all batch-level statistics into dataset-level scores.
Source code in packages/dqm-ml-core/src/dqm_ml_core/api/data_processor.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
class DatametricProcessor:
    """
    Base class for all Data Quality metrics and feature extractors.

    The processor follows a streaming lifecycle designed to handle large datasets
    without loading them entirely into memory:

    1. Feature Extraction (`compute_features`): Transformation of raw data into
       relevant features (e.g., image -> luminosity).
    2. Batch Aggregation (`compute_batch_metric`): Compression of features into
       intermediate statistics (e.g., count, partial sum, histogram).
    3. Global Computation (`compute`): Final aggregation of all batch-level
       statistics into dataset-level scores.
    """

    def __init__(self, name: str, config: dict[str, Any] | None = None):
        """
        Initialize the dataset processor.

        Args:
            name: Unique name of the processor instance.
            config: Optional configuration dictionary. Recognized keys are
                ``input_columns`` (a list of raw column names) and
                ``output_columns`` (a dict of output-column mappings).

        Raises:
            ValueError: If ``input_columns`` is present but not a list, or
                ``output_columns`` is present but not a dict.
        """
        self.name = name
        self.config = config or {}

        # Validate input_columns if present; default to "no raw columns needed".
        if "input_columns" in self.config:
            if not isinstance(self.config["input_columns"], list):
                raise ValueError(
                    f"Metric {name} config key 'input_columns' must be a list, "
                    f"got {type(self.config['input_columns'])}"
                )
            self.input_columns = self.config["input_columns"]
        else:
            self.input_columns = []

        # Validate output_columns if present; default to "no outputs declared".
        # NOTE(review): this is stored as `outputs_columns` but never read by
        # the base class -- `generated_features`/`generated_metrics` look up
        # `output_features`/`output_metrics` instead. Kept unchanged for
        # subclass compatibility; confirm which attribute subclasses rely on.
        if "output_columns" in self.config:
            if not isinstance(self.config["output_columns"], dict):
                raise ValueError(
                    f"Metric {name} config key 'output_columns' must be a dict, "
                    f"got {type(self.config['output_columns'])}"
                )
            self.outputs_columns = self.config["output_columns"]
        else:
            self.outputs_columns = {}

    def needed_columns(self) -> list[str]:
        """
        Return the list of raw input columns required for feature extraction.

        Returns:
            A list of column names (empty when none are configured).
        """
        return getattr(self, "input_columns", [])

    def generated_features(self) -> list[str]:
        """
        Return the list of columns generated by this processor during feature extraction.

        Returns:
            A list of feature names (empty unless a subclass defines an
            ``output_features`` mapping).
        """
        outputs = getattr(self, "output_features", {})
        return list(outputs.values())

    def generated_metrics(self) -> list[str]:
        """
        Return the names of the final metrics produced by this processor.

        Returns:
            A list of metric names (empty unless a subclass defines an
            ``output_metrics`` mapping).
        """
        outputs = getattr(self, "output_metrics", {})
        return list(outputs.values())

    def compute_features(self, batch: pa.RecordBatch, prev_features: dict[str, pa.Array]) -> dict[str, pa.Array]:
        """
        Transform a raw data batch into features.

        The base implementation passes through the required raw columns,
        skipping any column already produced by a preceding processor and
        logging a warning for columns missing from the batch.

        Args:
            batch: The input pyarrow RecordBatch.
            prev_features: Features already computed by preceding processors.

        Returns:
            A dictionary mapping feature names to pyarrow Arrays.
        """
        features = {}

        for col in self.needed_columns():
            if col in prev_features:
                # Feature already computed upstream; no need to add it again.
                continue

            if col not in batch.schema.names:
                # Missing columns are skipped (best effort) rather than fatal.
                logger.warning(f"[{self.name}] column '{col}' not found in batch")
                continue
            features[col] = batch.column(col)

        return features

    def compute_batch_metric(self, features: dict[str, pa.Array]) -> dict[str, pa.Array]:
        """
        Aggregate features into intermediate statistics for the current batch.

        This method is critical for scalability. It should return a compact
        representation of the data (e.g., partial sums) that can be
        efficiently combined later. The base implementation aggregates nothing.

        Args:
            features: Dictionary of feature arrays computed on the batch.

        Returns:
            A dictionary of aggregated statistics (empty by default).
        """
        return {}

    def compute(self, batch_metrics: dict[str, pa.Array]) -> dict[str, Any]:
        """
        Perform the final dataset-level metric calculation.

        The base implementation produces no metrics; subclasses override it to
        combine the per-batch statistics into final scores.

        Args:
            batch_metrics: The aggregated intermediate statistics from all batches.

        Returns:
            A dictionary containing the final metrics (empty by default).
        """
        return {}

    def compute_delta(self, source: dict[str, Any], target: dict[str, Any]) -> dict[str, Any]:
        """
        Compare metrics between two different data selections.

        The base implementation reports no differences; subclasses override it
        to produce distance or difference scores.

        Args:
            source: Final metrics from the source data selection.
            target: Final metrics from the target data selection.

        Returns:
            A dictionary containing distance or difference scores (empty by default).
        """
        return {}

config = config or {} instance-attribute

input_columns = self.config['input_columns'] instance-attribute

name = name instance-attribute

outputs_columns = self.config['output_columns'] instance-attribute

__init__(name: str, config: dict[str, Any] | None)

Initialize the dataset processor.

Parameters:

Name Type Description Default
name str

Unique name of the processor instance.

required
config dict[str, Any] | None

Configuration dictionary (optional).

required
Source code in packages/dqm-ml-core/src/dqm_ml_core/api/data_processor.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def __init__(self, name: str, config: dict[str, Any] | None = None):
    """
    Initialize the dataset processor.

    Args:
        name: Unique name of the processor instance.
        config: Optional configuration dictionary. Recognized keys are
            ``input_columns`` (a list of raw column names) and
            ``output_columns`` (a dict of output-column mappings).

    Raises:
        ValueError: If ``input_columns`` is present but not a list, or
            ``output_columns`` is present but not a dict.
    """
    self.name = name
    self.config = config or {}

    # Validate input_columns if present; default to "no raw columns needed".
    if "input_columns" in self.config:
        if not isinstance(self.config["input_columns"], list):
            raise ValueError(
                f"Metric {name} config key 'input_columns' must be a list, "
                f"got {type(self.config['input_columns'])}"
            )
        self.input_columns = self.config["input_columns"]
    else:
        self.input_columns = []

    # Validate output_columns if present; default to "no outputs declared".
    # NOTE(review): stored as `outputs_columns` (plural "outputs") -- confirm
    # which attribute name the rest of the codebase reads.
    if "output_columns" in self.config:
        if not isinstance(self.config["output_columns"], dict):
            raise ValueError(
                f"Metric {name} config key 'output_columns' must be a dict, "
                f"got {type(self.config['output_columns'])}"
            )
        self.outputs_columns = self.config["output_columns"]
    else:
        self.outputs_columns = {}

compute(batch_metrics: dict[str, pa.Array]) -> dict[str, Any]

Perform the final dataset-level metric calculation.

Parameters:

Name Type Description Default
batch_metrics dict[str, Array]

The aggregated intermediate statistics from all batches.

required

Returns:

Type Description
dict[str, Any]

A dictionary containing the final metrics.

Source code in packages/dqm-ml-core/src/dqm_ml_core/api/data_processor.py
135
136
137
138
139
140
141
142
143
144
145
def compute(self, batch_metrics: dict[str, pa.Array]) -> dict[str, Any]:
    """
    Perform the final dataset-level metric calculation.

    The base implementation produces no metrics; subclasses override it to
    fold the accumulated per-batch statistics into dataset-level scores.

    Args:
        batch_metrics: The aggregated intermediate statistics from all batches.

    Returns:
        A dictionary containing the final metrics (empty by default).
    """
    return {}

compute_batch_metric(features: dict[str, pa.Array]) -> dict[str, pa.Array]

Aggregate features into intermediate statistics for the current batch.

This method is critical for scalability. It should return a compact representation of the data (e.g., partial sums) that can be efficiently combined later.

Parameters:

Name Type Description Default
features dict[str, Array]

Dictionary of feature arrays computed on the batch.

required

Returns:

Type Description
dict[str, Array]

A dictionary of aggregated statistics.

Source code in packages/dqm-ml-core/src/dqm_ml_core/api/data_processor.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def compute_batch_metric(self, features: dict[str, pa.Array]) -> dict[str, pa.Array]:
    """
    Aggregate features into intermediate statistics for the current batch.

    This hook is central to scalability: overrides should emit a compact
    summary of the batch (partial sums, counts, histograms, ...) that can be
    merged cheaply across batches later. The base implementation aggregates
    nothing.

    Args:
        features: Dictionary of feature arrays computed on the batch.

    Returns:
        A dictionary of aggregated statistics (empty by default).
    """
    return {}

compute_delta(source: dict[str, Any], target: dict[str, Any]) -> dict[str, Any]

Compare metrics between two different data selections.

Parameters:

Name Type Description Default
source dict[str, Any]

Final metrics from the source data selection.

required
target dict[str, Any]

Final metrics from the target data selection.

required

Returns:

Type Description
dict[str, Any]

A dictionary containing distance or difference scores.

Source code in packages/dqm-ml-core/src/dqm_ml_core/api/data_processor.py
147
148
149
150
151
152
153
154
155
156
157
158
def compute_delta(self, source: dict[str, Any], target: dict[str, Any]) -> dict[str, Any]:
    """
    Compare final metrics computed on two different data selections.

    The base implementation reports no differences; subclasses override it
    to produce distance or difference scores between the two selections.

    Args:
        source: Final metrics from the source data selection.
        target: Final metrics from the target data selection.

    Returns:
        A dictionary of distance or difference scores (empty by default).
    """
    return {}

compute_features(batch: pa.RecordBatch, prev_features: dict[str, pa.Array]) -> dict[str, pa.Array]

Transform a raw data batch into features.

Parameters:

Name Type Description Default
batch RecordBatch

The input pyarrow RecordBatch.

required
prev_features dict[str, Array]

Features already computed by preceding processors.

required

Returns:

Type Description
dict[str, Array]

A dictionary mapping feature names to pyarrow Arrays.

Source code in packages/dqm-ml-core/src/dqm_ml_core/api/data_processor.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def compute_features(self, batch: pa.RecordBatch, prev_features: dict[str, pa.Array]) -> dict[str, pa.Array]:
    """
    Transform a raw data batch into features.

    The base implementation passes through the required raw columns: a column
    already produced by a preceding processor is skipped, and a column absent
    from the batch is skipped with a warning.

    Args:
        batch: The input pyarrow RecordBatch.
        prev_features: Features already computed by preceding processors.

    Returns:
        A dictionary mapping feature names to pyarrow Arrays.
    """
    available = batch.schema.names
    extracted = {}

    for column in self.needed_columns():
        # Already materialized by an upstream processor -- do not duplicate.
        if column in prev_features:
            continue

        if column not in available:
            # Best-effort behavior: warn and move on instead of failing.
            logger.warning(f"[{self.name}] column '{column}' not found in batch")
            continue

        extracted[column] = batch.column(column)

    return extracted

generated_features() -> list[str]

Return the list of columns generated by this processor during feature extraction.

Returns:

Type Description
list[str]

A list of feature names.

Source code in packages/dqm-ml-core/src/dqm_ml_core/api/data_processor.py
72
73
74
75
76
77
78
79
80
81
def generated_features(self) -> list[str]:
    """
    Return the list of columns generated by this processor during feature extraction.

    Returns:
        A list of feature names; empty when the processor declares no
        ``output_features`` mapping.
    """
    return list(getattr(self, "output_features", {}).values())

generated_metrics() -> list[str]

Return the names of the final metrics produced by this processor.

Returns:

Type Description
list[str]

A list of metric names.

Source code in packages/dqm-ml-core/src/dqm_ml_core/api/data_processor.py
83
84
85
86
87
88
89
90
91
92
def generated_metrics(self) -> list[str]:
    """
    Return the names of the final metrics produced by this processor.

    Returns:
        A list of metric names; empty when the processor declares no
        ``output_metrics`` mapping.
    """
    return list(getattr(self, "output_metrics", {}).values())

needed_columns() -> list[str]

Return the list of raw input columns required for feature extraction.

Returns:

Type Description
list[str]

A list of column names.

Source code in packages/dqm-ml-core/src/dqm_ml_core/api/data_processor.py
63
64
65
66
67
68
69
70
def needed_columns(self) -> list[str]:
    """
    Return the list of raw input columns required for feature extraction.

    Returns:
        A list of column names; empty when none are configured.
    """
    declared = getattr(self, "input_columns", [])
    return declared