Source code for NekUpload.utils.hdf5_reader

from __future__ import annotations
from typing import Type, Any
import h5py
import os

from .exceptions import HDF5ReaderException


[docs]
class HDF5Reader():
    def __init__(self,filename: str, mode: str="r") -> None:
        self.filename = filename
        self.mode = mode
        self.file: h5py.File | None = None
    
    def __enter__(self) -> HDF5Reader:
        try:
            self.file = h5py.File(self.filename,self.mode)
            return self #Return HDF5Reader instance
        except Exception as e:
            raise e
        
    def __exit__(self, exc_type: Type[BaseException] | None, exc_val: BaseException | None, exc_tb: Type[BaseException] | None) -> None:
        if self.file:
            self.file.close()
            self.file = None
    

[docs]
    def get_keys(self) -> dict[str, str]:
        """
        Recursively retrieves all keys in the HDF5 file and categorizes them as 'GROUP', 'DATASET', or 'ATTRIBUTE'.
        
        Returns:
            dict[str, str]: A dictionary mapping keys (HDF5 paths) to their types.
        """
        if not self.file:
            msg = f"{self.filename} HDF5 file is not open."
            raise HDF5ReaderException(msg)

        key_dict = {}

        def get_keys_helper(group: h5py.Group, prefix: str = "") -> None:
            """Recursive function to traverse HDF5 groups and datasets."""
            for key in group.keys():
                full_key = f"{prefix}/{key}".lstrip("/")  # Ensure proper formatting
                
                obj = group[key]
                if isinstance(obj, h5py.Group):
                    key_dict[full_key] = "GROUP"
                    get_keys_helper(obj, full_key)  # Recurse into sub-groups
                elif isinstance(obj, h5py.Dataset):
                    key_dict[full_key] = "DATASET"

                # Add attributes (only at current level, not recursive)
                for attr_name in obj.attrs.keys():
                    attr_key = f"{full_key}/@{attr_name}"
                    key_dict[attr_key] = "ATTRIBUTE"

        get_keys_helper(self.file)
        return key_dict

    

[docs]
    def get_dataset(self, dataset_key: str) -> h5py.Dataset | None:
        """
        Reads a dataset from the HDF5 file.

        Args:
            dataset_key (str): The name of the dataset to read.

        Returns:
            Optional[h5py.Dataset]: The dataset object if found, otherwise None.
        """
        if not self.file:
            msg = f"{self.filename} HDF5 file is not open."
            raise HDF5ReaderException(msg)

        try:
            dataset = self.file[dataset_key]
            if isinstance(dataset, h5py.Dataset):
                return dataset
            else:
                msg = f"{dataset_key} is not a dataset."
                raise HDF5ReaderException(msg)
        except KeyError:
            msg = f"Dataset {dataset_key} not found in the file."
            raise HDF5ReaderException(msg)



[docs]
    def get_attributes(self, key: str) -> dict[str, Any]:
        """
        Retrieves attributes for a given key in the HDF5 file.

        Args:
            key (str): The key for which to retrieve attributes.

        Returns:
            dict[str, Any]: A dictionary of attribute names and their values.
        """
        if not self.file:
            msg = f"{self.filename} HDF5 file is not open."
            raise HDF5ReaderException(msg)

        try:
            obj = self.file[key]
            return {attr_name: obj.attrs[attr_name] for attr_name in obj.attrs.keys()}
        except KeyError:
            msg = f"Key {key} not found in the file."
            raise HDF5ReaderException(msg)



[docs]
    def dump_to_plain_file(self, target_file_name: str) -> None:
        """Convert HDF5 contents to a formatted plain-text representation."""
        if not self.file:
            raise HDF5ReaderException("HDF5 file is not open.")
        
        output_lines = self._dump_group(self.file)

        with open(target_file_name, "w") as f:
            f.write("\n".join(output_lines))
        print(f"HDF5 content saved to {target_file_name}")



[docs]
    def summary(self) -> dict[str, str]:
        if not self.file:
            raise HDF5ReaderException("HDF5 file is not open.")

        summary_dict = {
            "File Name": self.filename,
            "File Size": f"{os.path.getsize(self.filename)} bytes",
            "Groups": 0,
            "Datasets": 0,
            "Attributes": 0
        }

        def summary_helper(group: h5py.Group) -> None:
            """Recursive function to count groups, datasets, and attributes."""
            for key, obj in group.items():
                if isinstance(obj, h5py.Group):
                    summary_dict["Groups"] += 1
                    summary_helper(obj)  # Recurse into sub-groups
                elif isinstance(obj, h5py.Dataset):
                    summary_dict["Datasets"] += 1
                
                # Count attributes for both groups and datasets
                summary_dict["Attributes"] += len(obj.attrs)

        summary_helper(self.file)
        return summary_dict



[docs]
    def get_dtype(self, dataset_path: str) -> str | None:
        """
        Retrieves the data type of a dataset in the HDF5 file.

        Args:
            dataset_path (str): The path to the dataset.

        Returns:
            Optional[str]: The data type of the dataset if found, otherwise None.
        """
        if not self.file:
            msg = f"{self.filename} HDF5 file is not open."
            raise HDF5ReaderException(msg)

        try:
            dataset = self.file[dataset_path]
            if isinstance(dataset, h5py.Dataset):
                return str(dataset.dtype)
            else:
                msg = f"{dataset_path} is not a dataset."
                raise HDF5ReaderException(msg)
        except KeyError:
            msg = f"Dataset {dataset_path} not found in the file."
            raise HDF5ReaderException(msg)



[docs]
    def get_shape(self, dataset_path: str) -> tuple[int,...] | None:
        """
        Retrieves the shape of a dataset in the HDF5 file.

        Args:
            dataset_path (str): The path to the dataset.

        Returns:
            Optional[tuple]: The shape of the dataset if found, otherwise None.
        """
        if not self.file:
            msg = f"{self.filename} HDF5 file is not open."
            raise HDF5ReaderException(msg)

        try:
            dataset = self.file[dataset_path]
            if isinstance(dataset, h5py.Dataset):
                return dataset.shape
            else:
                msg = f"{dataset_path} is not a dataset."
                raise HDF5ReaderException(msg)
        except KeyError:
            msg = f"Dataset {dataset_path} not found in the file."
            raise HDF5ReaderException(msg)


    def _dump_group(self, group: h5py.Group, indent: int = 0) -> list[str]:
        """Recursively process an HDF5 group and return its content as a string."""
        lines = []
        indent_str = "    " * indent

        # Group name
        lines.append(f"{indent_str} Group: {group.name}")
        if group.attrs:
            lines.append(f"{indent_str} Attributes:")
            for key, value in group.attrs.items():
                lines.append(f"{indent_str}    - {key}: {value}")

        # Iterate through datasets and groups
        for name, item in group.items():
            if isinstance(item, h5py.Group):
                lines.extend(self._dump_group(item, indent + 1))  # Recursive call
            elif isinstance(item, h5py.Dataset):
                lines.append(f"{indent_str} Dataset: {name}")
                if item.attrs:
                    lines.append(f"{indent_str} Attributes:")
                    for key, value in item.attrs.items():
                        lines.append(f"{indent_str}      - {key}: {value}")
                lines.append(f"{indent_str} Shape: {item.shape}")
                lines.append(f"{indent_str} Dtype: {item.dtype}")
                lines.append(f"{indent_str} Data: {item[...]}")

        return lines