From d2ed6787d48cceac8a72440332b1dd212857d203 Mon Sep 17 00:00:00 2001
From: marques
Date: Sun, 12 Oct 2025 18:42:29 +0800
Subject: [PATCH] Add utility functions for signal processing and configuration management

---
 HYS_process.py                 | 71 ++++++++++++++++++++++-
 dataset_config/HYS_config.yaml | 13 +++++
 utils/HYS_FileReader.py        | 93 +++++++++++++++++++++++++++++-
 utils/__init__.py              |  2 +
 utils/operation_tools.py       | 71 +++++++++++++++++++++++-
 5 files changed, 246 insertions(+), 4 deletions(-)
 create mode 100644 dataset_config/HYS_config.yaml

diff --git a/HYS_process.py b/HYS_process.py
index 1f07192..f3de6d8 100644
--- a/HYS_process.py
+++ b/HYS_process.py
@@ -12,10 +12,77 @@
 提供数据处理前后的可视化对比,帮助理解数据变化
 绘制多条可用性趋势图,展示数据的可用区间、体动区间、低幅值区间等
 
-
+todo: use masks to screen out unusable intervals
 
 
 # 低幅值区间规则标定与剔除
 # 高幅值连续体动规则标定与剔除
 # 手动标定不可用区间提剔除
-"""
\ No newline at end of file
+"""
+
+from pathlib import Path
+from typing import Union
+import utils
+import numpy as np
+
+
+def process_one_signal(samp_id):
+    """Load the signal, labels and manual disable mask of one sample.
+
+    Reads the module-level ``org_signal_root_path``, ``label_root_path``
+    and ``all_samp_disable_df`` that the ``__main__`` block assigns, and
+    prints a short summary of what was loaded.
+
+    Args:
+        samp_id: Sample ID; names the per-sample sub-directory and
+            selects the matching ``id`` rows of the disable table.
+
+    Raises:
+        FileNotFoundError: If no signal file or no label file is found.
+    """
+    signal_path = list((org_signal_root_path / f"{samp_id}").glob("OrgBCG_Sync_*.txt"))
+    if not signal_path:
+        raise FileNotFoundError(f"OrgBCG_Sync file not found for sample ID: {samp_id}")
+    signal_path = signal_path[0]
+    print(f"Processing OrgBCG_Sync signal file: {signal_path}")
+
+    # glob() returns a generator, which is always truthy; materialize it
+    # so the emptiness check can actually fire.
+    label_path = list((label_root_path / f"{samp_id}").glob("SA Label_corrected.csv"))
+    if not label_path:
+        raise FileNotFoundError(f"Label_corrected file not found for sample ID: {samp_id}")
+    label_path = label_path[0]
+    print(f"Processing Label_corrected file: {label_path}")
+
+    signal_data = utils.read_signal_txt(signal_path)
+    signal_length = len(signal_data)
+    print(f"signal_length: {signal_length}")
+    # The sampling rate is the last "_"-separated token of the file stem.
+    signal_fs = int(signal_path.stem.split("_")[-1])
+    print(f"signal_fs: {signal_fs}")
+    signal_second = signal_length // signal_fs
+    print(f"signal_second: {signal_second}")
+
+    # Not used further yet; the call also prints the label statistics.
+    label_data = utils.read_label_csv(label_path)
+
+    manual_disable_mask = utils.generate_disable_mask(
+        signal_second, all_samp_disable_df[all_samp_disable_df["id"] == samp_id]
+    )
+    print(f"disable_mask_shape: {manual_disable_mask.shape}, num_disable: {np.sum(manual_disable_mask == 0)}")
+
+
+if __name__ == '__main__':
+    yaml_path = Path("./dataset_config/HYS_config.yaml")
+    disable_df_path = Path("./排除区间.xlsx")
+
+    select_ids, root_path = utils.load_dataset_info(yaml_path)
+    print(f"select_ids: {select_ids}")
+    print(f"root_path: {root_path}")
+
+    org_signal_root_path = root_path / "OrgBCG_Aligned"
+    label_root_path = root_path / "Label"
+
+    all_samp_disable_df = utils.read_disable_excel(disable_df_path)
+
+    process_one_signal(select_ids[0])
diff --git a/dataset_config/HYS_config.yaml b/dataset_config/HYS_config.yaml
new file mode 100644
index 0000000..d30264f
--- /dev/null
+++ b/dataset_config/HYS_config.yaml
@@ -0,0 +1,13 @@
+select_id:
+  - 1302
+  - 286
+  - 950
+  - 220
+  - 229
+  - 541
+  - 582
+  - 670
+  - 684
+  - 960
+
+root_path: /mnt/disk_wd/marques_dataset/DataCombine2023/HYS
\ No newline at end of file
diff --git a/utils/HYS_FileReader.py b/utils/HYS_FileReader.py
index 82e4584..d7c477a 100644
--- a/utils/HYS_FileReader.py
+++ b/utils/HYS_FileReader.py
@@ -34,7 +34,7 @@ def read_signal_txt(path: Union[str, Path]) -> np.ndarray:
     return df.iloc[:, 0].to_numpy()
 
 
-def read_laebl_csv(path: Union[str, Path]) -> pd.DataFrame:
+def read_label_csv(path: Union[str, Path], verbose=True) -> pd.DataFrame:
     """
     Read a CSV file and return it as a pandas DataFrame.
 
@@ -49,6 +49,97 @@ def read_laebl_csv(path: Union[str, Path]) -> pd.DataFrame:
 
     # 直接用pandas读取 包含中文 故指定编码
     df = pd.read_csv(path, encoding="gbk")
+    if verbose:
+        print(f"Label file read from {path}, number of rows: {len(df)}")
+
+    # Labelling statistics.
+    # isLabeled == 1 marks a row as labelled.
+    # Rows with an "Event type" value are events exported from PSG.
+    # Rows whose "Event type" is NaN are manually labelled events.
+    # score: 1 = significant event, 2 = disturbed event,
+    #        3 = non-significant event that should be deleted.
+    # The confirmed event type is stored in correct_EventsType.
+    # Output: one row each for hypopnea, central, obstructive, mixed
+    # and the total, formatted as Total / PSG / Manual / Deleted / Unlabeled.
+    # Columns:
+    # Index Event type Stage Time Epoch Date Duration HR bef. HR extr. HR delta O2 bef. O2 min. O2 delta Body Position Validation Start End score remark correct_Start correct_End correct_EventsType isLabeled
+    # Event type:
+    # Hypopnea
+    # Central apnea
+    # Obstructive apnea
+    # Mixed apnea
+
+    num_labeled = np.sum(df["isLabeled"] == 1)
+    num_psg_events = np.sum(df["Event type"].notna())
+    num_manual_events = num_labeled - num_psg_events
+    num_deleted = np.sum(df["score"] == 3)
+
+    # Event totals: labelled rows that are not scored for deletion.
+    num_total = np.sum((df["isLabeled"] == 1) & (df["score"] != 3))
+    # Count unlabelled rows directly, the same way the per-type counts
+    # below do; ``num_total - num_labeled`` could never be positive.
+    num_unlabeled = np.sum(df["isLabeled"] == 0)
+
+    num_psg_hyp = np.sum(df["Event type"] == "Hypopnea")
+    num_psg_csa = np.sum(df["Event type"] == "Central apnea")
+    num_psg_osa = np.sum(df["Event type"] == "Obstructive apnea")
+    num_psg_msa = np.sum(df["Event type"] == "Mixed apnea")
+
+    num_hyp = np.sum((df["correct_EventsType"] == "Hypopnea") & (df["score"] != 3))
+    num_csa = np.sum((df["correct_EventsType"] == "Central apnea") & (df["score"] != 3))
+    num_osa = np.sum((df["correct_EventsType"] == "Obstructive apnea") & (df["score"] != 3))
+    num_msa = np.sum((df["correct_EventsType"] == "Mixed apnea") & (df["score"] != 3))
+
+    num_manual_hyp = np.sum((df["Event type"].isna()) & (df["correct_EventsType"] == "Hypopnea"))
+    num_manual_csa = np.sum((df["Event type"].isna()) & (df["correct_EventsType"] == "Central apnea"))
+    num_manual_osa = np.sum((df["Event type"].isna()) & (df["correct_EventsType"] == "Obstructive apnea"))
+    num_manual_msa = np.sum((df["Event type"].isna()) & (df["correct_EventsType"] == "Mixed apnea"))
+
+    num_deleted_hyp = np.sum((df["score"] == 3) & (df["correct_EventsType"] == "Hypopnea"))
+    num_deleted_csa = np.sum((df["score"] == 3) & (df["correct_EventsType"] == "Central apnea"))
+    num_deleted_osa = np.sum((df["score"] == 3) & (df["correct_EventsType"] == "Obstructive apnea"))
+    num_deleted_msa = np.sum((df["score"] == 3) & (df["correct_EventsType"] == "Mixed apnea"))
+
+    num_unlabeled_hyp = np.sum((df["isLabeled"] == 0) & (df["correct_EventsType"] == "Hypopnea"))
+    num_unlabeled_csa = np.sum((df["isLabeled"] == 0) & (df["correct_EventsType"] == "Central apnea"))
+    num_unlabeled_osa = np.sum((df["isLabeled"] == 0) & (df["correct_EventsType"] == "Obstructive apnea"))
+    num_unlabeled_msa = np.sum((df["isLabeled"] == 0) & (df["correct_EventsType"] == "Mixed apnea"))
+
+    if verbose:
+        print("Event Statistics:")
+        # Formatted output with fixed-width fields: Total / PSG / Manual / Deleted / Unlabeled
+        print("Type Total / PSG / Manual / Deleted / Unlabeled")
+        print(f"Hypopnea: {num_hyp:4d} / {num_psg_hyp:4d} / {num_manual_hyp:4d} / {num_deleted_hyp:4d} / {num_unlabeled_hyp:4d}")
+        print(f"Central apnea: {num_csa:4d} / {num_psg_csa:4d} / {num_manual_csa:4d} / {num_deleted_csa:4d} / {num_unlabeled_csa:4d}")
+        print(f"Obstructive ap: {num_osa:4d} / {num_psg_osa:4d} / {num_manual_osa:4d} / {num_deleted_osa:4d} / {num_unlabeled_osa:4d}")
+        print(f"Mixed apnea: {num_msa:4d} / {num_psg_msa:4d} / {num_manual_msa:4d} / {num_deleted_msa:4d} / {num_unlabeled_msa:4d}")
+        print(f"Total events: {num_total:4d} / {num_psg_events:4d} / {num_manual_events:4d} / {num_deleted:4d} / {num_unlabeled:4d}")
+
     df["Start"] = df["Start"].astype(int)
     df["End"] = df["End"].astype(int)
+    return df
+
+
+def read_disable_excel(path: Union[str, Path]) -> pd.DataFrame:
+    """
+    Read an Excel file and return it as a pandas DataFrame.
+
+    The ``id``, ``start`` and ``end`` columns are cast to ``int``.
+
+    Args:
+        path (str | Path): Path to the Excel file.
+    Returns:
+        pd.DataFrame: The content of the Excel file as a pandas DataFrame.
+    Raises:
+        FileNotFoundError: If ``path`` does not exist.
+    """
+    path = Path(path)
+    if not path.exists():
+        raise FileNotFoundError(f"File not found: {path}")
+
+    # Read directly with pandas.
+    df = pd.read_excel(path)
+    df["id"] = df["id"].astype(int)
+    df["start"] = df["start"].astype(int)
+    df["end"] = df["end"].astype(int)
     return df
\ No newline at end of file
diff --git a/utils/__init__.py b/utils/__init__.py
index e69de29..d2c0727 100644
--- a/utils/__init__.py
+++ b/utils/__init__.py
@@ -0,0 +1,2 @@
+from utils.HYS_FileReader import read_label_csv, read_signal_txt, read_disable_excel
+from utils.operation_tools import load_dataset_info, generate_disable_mask
\ No newline at end of file
diff --git a/utils/operation_tools.py b/utils/operation_tools.py
index bcc5062..a4118b9 100644
--- a/utils/operation_tools.py
+++ b/utils/operation_tools.py
@@ -4,7 +4,7 @@ from pathlib import Path
 import numpy as np
 import pandas as pd
 from matplotlib import pyplot as plt
-
+import yaml
 plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
 plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号
 
@@ -252,5 +252,74 @@ def calculate_by_slide_windows(func, signal_data, calc_mask, sampling_rate=100,
     return values_nan, values
 
 
+def load_dataset_info(yaml_path):
+    """Load the selected sample IDs and the dataset root path from a YAML file.
+
+    Args:
+        yaml_path: Path of the YAML file; its ``select_id`` and
+            ``root_path`` keys are read.
+
+    Returns:
+        tuple: ``(select_ids, data_path)`` -- the ``select_id`` value
+        (``[]`` when absent or empty) and ``root_path`` wrapped in a
+        ``Path``.
+
+    Raises:
+        ValueError: If ``root_path`` is missing or empty.
+    """
+    with open(yaml_path, 'r', encoding='utf-8') as f:
+        # safe_load returns None for an empty document.
+        config = yaml.safe_load(f) or {}
+
+    select_ids = config.get('select_id') or []
+    root_path = config.get('root_path', None)
+    if not root_path:
+        raise ValueError(f"'root_path' is missing in config: {yaml_path}")
+    data_path = Path(root_path)
+    return select_ids, data_path
+
+
+def generate_disable_mask(signal_second: int, disable_df) -> np.ndarray:
+    """Build a mask that is 0 inside the disabled intervals and 1 elsewhere.
+
+    Args:
+        signal_second: Length of the mask.
+        disable_df: Table with ``start`` and ``end`` columns; every row
+            clears ``mask[start:end]`` (``end`` exclusive).
+
+    Returns:
+        np.ndarray: Integer array of length ``signal_second``.
+    """
+    disable_mask = np.ones(signal_second, dtype=int)
+
+    # Read the columns directly: iterrows() upcasts mixed-dtype rows, which
+    # can turn the bounds into floats that are not valid slice indices.
+    for start, end in zip(disable_df["start"], disable_df["end"]):
+        # Clamp at 0 so a negative bound cannot wrap around to the array tail.
+        disable_mask[max(int(start), 0):max(int(end), 0)] = 0
+    return disable_mask
+
+
+def generate_event_mask(signal_second: int, event_df) -> np.ndarray:
+    """Build a mask that is 1 inside the event intervals and 0 elsewhere.
+
+    NOTE(review): read_label_csv casts ``Start``/``End`` (capitalized);
+    confirm which table is meant to be passed here.
+
+    Args:
+        signal_second: Length of the mask.
+        event_df: Table with ``start`` and ``end`` columns; every row
+            sets ``mask[start:end]`` (``end`` exclusive).
+
+    Returns:
+        np.ndarray: Integer array of length ``signal_second``.
+    """
+    event_mask = np.zeros(signal_second, dtype=int)
+
+    # Same column-wise iteration and clamping as generate_disable_mask.
+    for start, end in zip(event_df["start"], event_df["end"]):
+        event_mask[max(int(start), 0):max(int(end), 0)] = 1
+    return event_mask
+
 
 