From d2ed6787d48cceac8a72440332b1dd212857d203 Mon Sep 17 00:00:00 2001
From: marques <gitea@marques22.com>
Date: Sun, 12 Oct 2025 18:42:29 +0800
Subject: [PATCH] Add utility functions for signal processing and configuration
 management

---
 HYS_process.py                 | 59 +++++++++++++++++++++-
 dataset_config/HYS_config.yaml | 13 +++++
 utils/HYS_FileReader.py        | 90 +++++++++++++++++++++++++++++++++-
 utils/__init__.py              |  2 +
 utils/operation_tools.py       | 31 +++++++++++-
 5 files changed, 191 insertions(+), 4 deletions(-)
 create mode 100644 dataset_config/HYS_config.yaml

diff --git a/HYS_process.py b/HYS_process.py
index 1f07192..f3de6d8 100644
--- a/HYS_process.py
+++ b/HYS_process.py
@@ -12,10 +12,65 @@
     提供数据处理前后的可视化对比，帮助理解数据变化
     绘制多条可用性趋势图，展示数据的可用区间、体动区间、低幅值区间等
 
-
+todo: 使用mask 屏蔽无用区间
 
 
 # 低幅值区间规则标定与剔除
 # 高幅值连续体动规则标定与剔除
 # 手动标定不可用区间提剔除
-"""
\ No newline at end of file
+"""
+
+from pathlib import Path
+from typing import Union
+import utils
+import numpy as np
+
+
+
+
+def process_one_signal(samp_id):
+    """Load one sample's signal and labels and build its manual disable mask.
+
+    Relies on the ``__main__`` globals ``org_signal_root_path``, ``label_root_path``
+    and ``all_samp_disable_df``; raises FileNotFoundError if a file is missing.
+    """
+    signal_path = list((org_signal_root_path / f"{samp_id}").glob("OrgBCG_Sync_*.txt"))
+    if not signal_path:
+        raise FileNotFoundError(f"OrgBCG_Sync file not found for sample ID: {samp_id}")
+    signal_path = signal_path[0]
+    print(f"Processing OrgBCG_Sync signal file: {signal_path}")
+
+    # Path.glob() returns a generator, which is always truthy, so list it before the check.
+    label_path = list((label_root_path / f"{samp_id}").glob("SA Label_corrected.csv"))
+    if not label_path:
+        raise FileNotFoundError(f"Label_corrected file not found for sample ID: {samp_id}")
+    label_path = label_path[0]
+    print(f"Processing Label_corrected file: {label_path}")
+
+    signal_data = utils.read_signal_txt(signal_path)
+    signal_length = len(signal_data)
+    print(f"signal_length: {signal_length}")
+    signal_fs = int(signal_path.stem.split("_")[-1])
+    print(f"signal_fs: {signal_fs}")
+    signal_second = signal_length // signal_fs
+    print(f"signal_second: {signal_second}")
+
+    label_data = utils.read_label_csv(label_path)  # not used yet; the call prints the event statistics
+    manual_disable_mask = utils.generate_disable_mask(signal_second, all_samp_disable_df[all_samp_disable_df["id"] == samp_id])
+    print(f"disable_mask_shape: {manual_disable_mask.shape}, num_disable: {np.sum(manual_disable_mask == 0)}")
+
+
+if __name__ == '__main__':
+    # Config file and manual exclusion sheet, both resolved against the working directory.
+    yaml_path = Path("./dataset_config/HYS_config.yaml")
+    disable_df_path = Path("./排除区间.xlsx")
+    select_ids, root_path = utils.load_dataset_info(yaml_path)
+    print(f"select_ids: {select_ids}", f"root_path: {root_path}", sep="\n")
+
+    # process_one_signal() reads the following three names as module globals.
+    label_root_path = root_path / "Label"
+    org_signal_root_path = root_path / "OrgBCG_Aligned"
+    all_samp_disable_df = utils.read_disable_excel(disable_df_path)
+
+    # Only the first configured sample is processed here.
+    process_one_signal(select_ids[0])
diff --git a/dataset_config/HYS_config.yaml b/dataset_config/HYS_config.yaml
new file mode 100644
index 0000000..d30264f
--- /dev/null
+++ b/dataset_config/HYS_config.yaml
@@ -0,0 +1,13 @@
+select_id:
+  - 1302
+  - 286
+  - 950
+  - 220
+  - 229
+  - 541
+  - 582
+  - 670
+  - 684
+  - 960
+
+root_path: /mnt/disk_wd/marques_dataset/DataCombine2023/HYS
\ No newline at end of file
diff --git a/utils/HYS_FileReader.py b/utils/HYS_FileReader.py
index 82e4584..d7c477a 100644
--- a/utils/HYS_FileReader.py
+++ b/utils/HYS_FileReader.py
@@ -34,7 +34,7 @@ def read_signal_txt(path: Union[str, Path]) -> np.ndarray:
         return df.iloc[:, 0].to_numpy()
 
 
-def read_laebl_csv(path: Union[str, Path]) -> pd.DataFrame:
+def read_label_csv(path: Union[str, Path], verbose=True) -> pd.DataFrame:
     """
     Read a CSV file and return it as a pandas DataFrame.
 
@@ -49,6 +49,94 @@ def read_laebl_csv(path: Union[str, Path]) -> pd.DataFrame:
 
     # 直接用pandas读取 包含中文 故指定编码
     df = pd.read_csv(path, encoding="gbk")
+    if verbose:
+        print(f"Label file read from {path}, number of rows: {len(df)}")
+
+    # Labeling statistics.
+    # isLabeled=1 marks a labeled row.
+    # Rows with an "Event type" value are events exported from PSG.
+    # Rows whose "Event type" is NaN are manually added events.
+    # score=1: significant event, score=2: disturbed event, score=3: non-significant event to be deleted.
+    # The confirmed event type is stored in correct_EventsType.
+    # Report one row each for all events, hypopnea, central, obstructive and mixed, as total / from PSG / manual / deleted / unlabeled.
+    # Columns:
+    # Index, Event type, Stage, Time, Epoch, Date, Duration, HR bef., HR extr., HR delta, O2 bef., O2 min., O2 delta, Body Position, Validation, Start, End, score, remark, correct_Start, correct_End, correct_EventsType, isLabeled
+    # Event type:
+    # Hypopnea
+    # Central apnea
+    # Obstructive apnea
+    # Mixed apnea
+
+    num_labeled = np.sum(df["isLabeled"] == 1)
+    num_psg_events = np.sum(df["Event type"].notna())
+    num_manual_events = num_labeled - num_psg_events
+    num_deleted = np.sum(df["score"] == 3)
+    # Totals row: kept events are the labeled rows that are not scored 3 (deleted).
+    num_total = np.sum((df["isLabeled"] == 1) & (df["score"] != 3))
+    # Count unlabeled rows directly, like the per-type counters; num_total - num_labeled is never positive.
+    num_unlabeled = np.sum(df["isLabeled"] == 0)
+
+    num_psg_hyp = np.sum(df["Event type"] == "Hypopnea")
+    num_psg_csa = np.sum(df["Event type"] == "Central apnea")
+    num_psg_osa = np.sum(df["Event type"] == "Obstructive apnea")
+    num_psg_msa = np.sum(df["Event type"] == "Mixed apnea")
+
+    num_hyp = np.sum((df["correct_EventsType"] == "Hypopnea") & (df["score"] != 3))
+    num_csa = np.sum((df["correct_EventsType"] == "Central apnea")  & (df["score"] != 3))
+    num_osa = np.sum((df["correct_EventsType"] == "Obstructive apnea")  & (df["score"] != 3))
+    num_msa = np.sum((df["correct_EventsType"] == "Mixed apnea")  & (df["score"] != 3))
+
+    num_manual_hyp = np.sum((df["Event type"].isna()) & (df["correct_EventsType"] == "Hypopnea"))
+    num_manual_csa = np.sum((df["Event type"].isna()) & (df["correct_EventsType"] == "Central apnea"))
+    num_manual_osa = np.sum((df["Event type"].isna()) & (df["correct_EventsType"] == "Obstructive apnea"))
+    num_manual_msa = np.sum((df["Event type"].isna()) & (df["correct_EventsType"] == "Mixed apnea"))
+
+    num_deleted_hyp = np.sum((df["score"] == 3) & (df["correct_EventsType"] == "Hypopnea"))
+    num_deleted_csa = np.sum((df["score"] == 3) & (df["correct_EventsType"] == "Central apnea"))
+    num_deleted_osa = np.sum((df["score"] == 3) & (df["correct_EventsType"] == "Obstructive apnea"))
+    num_deleted_msa = np.sum((df["score"] == 3) & (df["correct_EventsType"] == "Mixed apnea"))
+
+    num_unlabeled_hyp = np.sum((df["isLabeled"] == 0) & (df["correct_EventsType"] == "Hypopnea"))
+    num_unlabeled_csa = np.sum((df["isLabeled"] == 0) & (df["correct_EventsType"] == "Central apnea"))
+    num_unlabeled_osa = np.sum((df["isLabeled"] == 0) & (df["correct_EventsType"] == "Obstructive apnea"))
+    num_unlabeled_msa = np.sum((df["isLabeled"] == 0) & (df["correct_EventsType"] == "Mixed apnea"))
+
+
+
+    if verbose:
+        print("Event Statistics:")
+        # 格式化输出 总计/来自PSG/手动/删除/未标注 指定宽度
+        print("Type          Total / PSG / Manual / Deleted / Unlabeled")
+        print(f"Hypopnea:       {num_hyp:4d} / {num_psg_hyp:4d} / {num_manual_hyp:4d} / {num_deleted_hyp:4d} / {num_unlabeled_hyp:4d}")
+        print(f"Central apnea:  {num_csa:4d} / {num_psg_csa:4d} / {num_manual_csa:4d} / {num_deleted_csa:4d} / {num_unlabeled_csa:4d}")
+        print(f"Obstructive ap: {num_osa:4d} / {num_psg_osa:4d} / {num_manual_osa:4d} / {num_deleted_osa:4d} / {num_unlabeled_osa:4d}")
+        print(f"Mixed apnea:    {num_msa:4d} / {num_psg_msa:4d} / {num_manual_msa:4d} / {num_deleted_msa:4d} / {num_unlabeled_msa:4d}")
+        print(f"Total events:   {num_total:4d} / {num_psg_events:4d} / {num_manual_events:4d} / {num_deleted:4d} / {num_unlabeled:4d}")
+
+
+
+
     df["Start"] = df["Start"].astype(int)
     df["End"] = df["End"].astype(int)
+    return df
+
+
+def read_disable_excel(path: Union[str, Path]) -> pd.DataFrame:
+    """
+    Load the disable-interval sheet from an Excel file.
+
+    Args:
+        path (str | Path): Location of the Excel file.
+    Returns:
+        pd.DataFrame: Sheet content with "id", "start" and "end" cast to int.
+    Raises:
+        FileNotFoundError: If ``path`` does not exist.
+    """
+    path = Path(path)
+    if not path.exists():
+        raise FileNotFoundError(f"File not found: {path}")
+
+    df = pd.read_excel(path)
+    # Cast the three interval columns in one call; other columns stay untouched.
+    df = df.astype({"id": int, "start": int, "end": int})
     return df
\ No newline at end of file
diff --git a/utils/__init__.py b/utils/__init__.py
index e69de29..d2c0727 100644
--- a/utils/__init__.py
+++ b/utils/__init__.py
@@ -0,0 +1,2 @@
+from utils.HYS_FileReader import read_label_csv, read_signal_txt, read_disable_excel
+from utils.operation_tools import load_dataset_info, generate_disable_mask
\ No newline at end of file
diff --git a/utils/operation_tools.py b/utils/operation_tools.py
index bcc5062..a4118b9 100644
--- a/utils/operation_tools.py
+++ b/utils/operation_tools.py
@@ -4,7 +4,7 @@ from pathlib import Path
 import numpy as np
 import pandas as pd
 from matplotlib import pyplot as plt
-
+import yaml
 
 plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
 plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号
@@ -252,5 +252,34 @@ def calculate_by_slide_windows(func, signal_data, calc_mask, sampling_rate=100,
     return values_nan, values
 
 
+def load_dataset_info(yaml_path):
+    """Load ``(select_ids, root_path)`` from a YAML config; raise ValueError if ``root_path`` is unset."""
+    with open(yaml_path, 'r', encoding='utf-8') as f:
+        config = yaml.safe_load(f) or {}  # an empty YAML document parses to None
+    root_path = config.get('root_path')
+    if root_path is None:  # fail clearly instead of letting Path(None) raise a TypeError
+        raise ValueError(f"'root_path' is not set in {yaml_path}")
+    return (config.get('select_id') or []), Path(root_path)
+
+
+def generate_disable_mask(signal_second: int, disable_df) -> np.ndarray:
+    """Return an int mask of length ``signal_second``: 0 inside every ``[start, end)`` row of ``disable_df``, 1 elsewhere."""
+    disable_mask = np.ones(signal_second, dtype=int)
+
+    for _, row in disable_df.iterrows():
+        # iterrows() upcasts ints to float when the frame also has float columns; slice bounds must be ints.
+        disable_mask[int(row["start"]):int(row["end"])] = 0
+    return disable_mask
+
+
+def generate_event_mask(signal_second: int, event_df) -> np.ndarray:
+    """Return an int mask of length ``signal_second``: 1 inside every ``[start, end)`` row of ``event_df``, 0 elsewhere."""
+    event_mask = np.zeros(signal_second, dtype=int)
+    # NOTE(review): frames from read_label_csv use "Start"/"End"; confirm the caller passes lowercase columns.
+    for _, row in event_df.iterrows():
+        # iterrows() upcasts ints to float when the frame also has float columns; slice bounds must be ints.
+        event_mask[int(row["start"]):int(row["end"])] = 1
+    return event_mask
+