DataPrepare/utils/HYS_FileReader.py

172 lines
7.7 KiB
Python

from pathlib import Path
from typing import Union
import numpy as np
import pandas as pd
# Try to import Polars (optional dependency; pandas is used when it is missing)
try:
import polars as pl
HAS_POLARS = True
except ImportError:
HAS_POLARS = False
def read_signal_txt(path: Union[str, Path]) -> np.ndarray:
    """
    Read a txt file and return its first column as a float numpy array.

    The file is parsed as header-less CSV. Polars is used when it is
    installed (``HAS_POLARS``), otherwise pandas. Both backends return the
    same dtype (float64).

    Args:
        path (str | Path): Path to the txt file.
    Returns:
        np.ndarray: The first column of the txt file as a float64 array.
    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    if HAS_POLARS:
        # infer_schema_length=0 makes Polars read every column as text, so
        # the values must be converted explicitly; otherwise this branch
        # would hand back strings while the pandas branch returns floats.
        df = pl.read_csv(path, has_header=False, infer_schema_length=0)
        return df[:, 0].to_numpy().astype(np.float64)

    df = pd.read_csv(path, header=None, dtype=float)
    return df.iloc[:, 0].to_numpy()
def _print_label_statistics(df: pd.DataFrame) -> None:
    """Print per-type event counts and per-score counts for a label table."""
    # Column meanings (from the labelling convention used by this file):
    #   isLabeled == 1      -> the row has been labelled; -1 is counted as unlabelled
    #   "Event type" set    -> event exported from PSG
    #   "Event type" NaN    -> event created manually
    #   score == 1          -> significant event
    #   score == 2          -> event affected by interference
    #   score == 3          -> non-significant event that should be deleted
    #   correct_EventsType  -> event type after confirmation
    event_types = (
        ("Hyp", "Hypopnea"),
        ("CSA", "Central apnea"),
        ("OSA", "Obstructive apnea"),
        ("MSA", "Mixed apnea"),
    )

    # Touch every required column before printing anything, so a missing
    # column fails before a partial table is emitted.
    psg_type = df["Event type"]
    confirmed_type = df["correct_EventsType"]
    score = df["score"]
    label_flag = df["isLabeled"]

    is_manual = psg_type.isna()
    is_deleted = score == 3
    is_kept = score != 3
    is_unlabeled = label_flag == -1

    def count(mask) -> int:
        return int(mask.sum())

    def header(titles) -> str:
        return "Type " + " / ".join(f"{title:^8s}" for title in titles)

    def row(tag, values) -> str:
        return f"{tag}: " + " / ".join(f"{value:^8d}" for value in values)

    total_kept = count((label_flag == 1) & is_kept)

    # Table 1: Total / From PSG / Manual / Deleted / Unlabeled
    print("Event Statistics:")
    print(header(("Total", "From PSG", "Manual", "Deleted", "Unlabeled")))
    for tag, type_name in event_types:
        is_confirmed = confirmed_type == type_name
        is_psg = psg_type == type_name
        print(row(tag, (
            count(is_confirmed & is_kept),
            count(is_psg),
            count(is_manual & is_confirmed),
            count(is_deleted & is_confirmed),
            count(is_unlabeled & is_psg),
        )))
    print(row("All", (
        total_kept,
        count(psg_type.notna()),
        count(is_manual),
        count(is_deleted),
        count(is_unlabeled),
    )))

    # Table 2: Total / Score 1 / Score 2 / Score 3
    print("Score Statistics (only for non-deleted events and manual created events):")
    print(header(("Total", "Score 1", "Score 2", "Score 3")))
    for tag, type_name in event_types:
        is_confirmed = confirmed_type == type_name
        print(row(tag, (
            count(is_confirmed & is_kept),
            *(count(is_confirmed & (score == level)) for level in (1, 2, 3)),
        )))
    print(row("All", (
        total_kept,
        *(count(score == level) for level in (1, 2, 3)),
    )))


def read_label_csv(path: Union[str, Path], verbose: bool = True) -> pd.DataFrame:
    """
    Read a label CSV file and return it as a pandas DataFrame.

    The file is decoded as GBK because it contains Chinese text. The
    ``Start`` and ``End`` columns are cast to ``int`` before returning.

    When ``verbose`` is true, the row count and two summary tables are
    printed. The tables need the columns ``Event type``, ``score``,
    ``correct_EventsType`` and ``isLabeled``; they are only computed when
    they are going to be printed.

    Expected columns of the label file:
        Index, Event type, Stage, Time, Epoch, Date, Duration, HR bef.,
        HR extr., HR delta, O2 bef., O2 min., O2 delta, Body Position,
        Validation, Start, End, score, remark, correct_Start, correct_End,
        correct_EventsType, isLabeled

    Args:
        path (str | Path): Path to the CSV file.
        verbose (bool): Print the row count and event/score statistics.
    Returns:
        pd.DataFrame: The content of the CSV file as a pandas DataFrame.
    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    # Read directly with pandas; the file contains Chinese text, so the
    # encoding is given explicitly.
    df = pd.read_csv(path, encoding="gbk")

    if verbose:
        print(f"Label file read from {path}, number of rows: {len(df)}")
        _print_label_statistics(df)

    df["Start"] = df["Start"].astype(int)
    df["End"] = df["End"].astype(int)
    return df
def read_disable_excel(path: Union[str, Path]) -> pd.DataFrame:
    """
    Load an Excel workbook into a pandas DataFrame.

    The ``id``, ``start`` and ``end`` columns are converted to ``int``
    before the table is returned.

    Args:
        path (str | Path): Location of the Excel file.
    Returns:
        pd.DataFrame: The sheet contents with integer ``id``/``start``/``end``.
    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    excel_path = Path(path)
    if not excel_path.exists():
        raise FileNotFoundError(f"File not found: {excel_path}")

    # Read directly with pandas.
    table = pd.read_excel(excel_path)
    for column in ("id", "start", "end"):
        table[column] = table[column].astype(int)
    return table