# DataPrepare/utils/HYS_FileReader.py

from pathlib import Path
from typing import Union

import numpy as np
import pandas as pd

# Optional dependency: try to import Polars; HAS_POLARS records whether it is available.
try:
    import polars as pl
    HAS_POLARS = True
except ImportError:
    HAS_POLARS = False


def read_signal_txt(path: Union[str, Path]) -> np.ndarray:
    """
    Read a txt file and return its first column as a float numpy array.

    The file is parsed as header-less CSV. Polars is used when it is
    installed, otherwise pandas. Both backends return float64 values.

    Args:
        path (str | Path): Path to the txt file.

    Returns:
        np.ndarray: The first column of the txt file as float64 values.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    if HAS_POLARS:
        # infer_schema_length=0 turns schema inference off, so every column
        # comes back as strings. Convert explicitly so the result has the same
        # dtype as the pandas branch instead of leaking a string/object array.
        df = pl.read_csv(path, has_header=False, infer_schema_length=0)
        return np.asarray(df[:, 0].to_numpy(), dtype=np.float64)

    df = pd.read_csv(path, header=None, dtype=float)
    return df.iloc[:, 0].to_numpy()


def _print_label_statistics(df: pd.DataFrame) -> None:
    """
    Print a per-event-type summary of the labelling state of a label table.

    Column conventions used by the label CSV:
        * ``isLabeled`` == 1 marks a labelled row, 0 an unlabelled row.
        * ``Event type`` is filled for events exported from PSG and is NaN
          for events that were added manually.
        * ``score``: 1 = significant event, 2 = disturbed event,
          3 = non-significant event that should be deleted.
        * ``correct_EventsType`` holds the confirmed event type.

    One row is printed for each of Hypopnea, Central apnea, Obstructive apnea
    and Mixed apnea, followed by a totals row. Every row has the form
    Total / PSG / Manual / Deleted / Unlabeled.

    Args:
        df (pd.DataFrame): Label table containing the columns listed above.
    """
    # Columns of the label file:
    # Index, Event type, Stage, Time, Epoch, Date, Duration, HR bef.,
    # HR extr., HR delta, O2 bef., O2 min., O2 delta, Body Position,
    # Validation, Start, End, score, remark, correct_Start, correct_End,
    # correct_EventsType, isLabeled
    psg_type = df["Event type"]
    confirmed_type = df["correct_EventsType"]

    is_labeled = df["isLabeled"] == 1
    is_unlabeled = df["isLabeled"] == 0
    is_manual = psg_type.isna()
    is_deleted = df["score"] == 3
    not_deleted = df["score"] != 3

    def count(mask: pd.Series) -> int:
        # Plain int keeps the ``:4d`` format spec independent of numpy types.
        return int(mask.sum())

    # (row label as printed, value used in the event-type columns)
    rows = (
        ("Hypopnea:", "Hypopnea"),
        ("Central apnea:", "Central apnea"),
        ("Obstructive ap:", "Obstructive apnea"),
        ("Mixed apnea:", "Mixed apnea"),
    )

    print("Event Statistics:")
    # Fixed-width output: Total / PSG / Manual / Deleted / Unlabeled
    print("Type          Total / PSG / Manual / Deleted / Unlabeled")
    for label, event_name in rows:
        is_type = confirmed_type == event_name
        print(
            f"{label:<16}"
            f"{count(is_type & not_deleted):4d} / "
            f"{count(psg_type == event_name):4d} / "
            f"{count(is_manual & is_type):4d} / "
            f"{count(is_deleted & is_type):4d} / "
            f"{count(is_unlabeled & is_type):4d}"
        )

    # Totals use the same definitions as the per-type rows: manual events are
    # the rows without a PSG event type, and unlabelled events are the rows
    # with isLabeled == 0. Deriving them by subtraction produced zero or
    # negative counts.
    print(
        f"{'Total events:':<16}"
        f"{count(is_labeled & not_deleted):4d} / "
        f"{count(psg_type.notna()):4d} / "
        f"{count(is_manual):4d} / "
        f"{count(is_deleted):4d} / "
        f"{count(is_unlabeled):4d}"
    )


def read_label_csv(path: Union[str, Path], verbose: bool = True) -> pd.DataFrame:
    """
    Read a GBK-encoded label CSV file and return it as a pandas DataFrame.

    The ``Start`` and ``End`` columns are converted to integers in the
    returned frame. When ``verbose`` is true, the row count and a table of
    event statistics are printed. The statistics are only computed in that
    case.

    Args:
        path (str | Path): Path to the CSV file.
        verbose (bool): Whether to print the row count and event statistics.

    Returns:
        pd.DataFrame: The content of the CSV file as a pandas DataFrame.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    # Read directly with pandas. The file contains Chinese text, so the
    # encoding is given explicitly.
    df = pd.read_csv(path, encoding="gbk")
    if verbose:
        print(f"Label file read from {path}, number of rows: {len(df)}")
        _print_label_statistics(df)

    df["Start"] = df["Start"].astype(int)
    df["End"] = df["End"].astype(int)
    return df


def read_disable_excel(path: Union[str, Path]) -> pd.DataFrame:
    """
    Load an Excel sheet into a pandas DataFrame with integer key columns.

    The ``id``, ``start`` and ``end`` columns are converted to integers
    before the frame is returned.

    Args:
        path (str | Path): Path to the Excel file.

    Returns:
        pd.DataFrame: The content of the Excel file as a pandas DataFrame.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    excel_path = Path(path)
    if not excel_path.exists():
        raise FileNotFoundError(f"File not found: {excel_path}")

    # Read directly with pandas, then normalise the integer columns in order.
    table = pd.read_excel(excel_path)
    for column in ("id", "start", "end"):
        table[column] = table[column].astype(int)
    return table