Add utility functions for signal processing and configuration management

This commit is contained in:
marques 2025-10-12 18:42:29 +08:00
parent 805f1dc7f8
commit d2ed6787d4
5 changed files with 191 additions and 4 deletions

View File

@ -12,10 +12,65 @@
提供数据处理前后的可视化对比帮助理解数据变化 提供数据处理前后的可视化对比帮助理解数据变化
绘制多条可用性趋势图展示数据的可用区间体动区间低幅值区间等 绘制多条可用性趋势图展示数据的可用区间体动区间低幅值区间等
todo: 使用mask 屏蔽无用区间
# 低幅值区间规则标定与剔除 # 低幅值区间规则标定与剔除
# 高幅值连续体动规则标定与剔除 # 高幅值连续体动规则标定与剔除
# 手动标定不可用区间剔除 # 手动标定不可用区间剔除
""" """
from pathlib import Path
from typing import Union
import utils
import numpy as np
def process_one_signal(samp_id):
    """Load one sample's raw signal, corrected labels and manual exclusion mask.

    Reads the module-level names ``org_signal_root_path``, ``label_root_path``
    and ``all_samp_disable_df``; the ``__main__`` section assigns them before
    calling this function.

    Args:
        samp_id: Sample identifier. It names the per-sample sub-directory under
            both root paths and is matched against the ``id`` column of the
            exclusion table.

    Raises:
        FileNotFoundError: If no ``OrgBCG_Sync_*.txt`` signal file or no
            ``SA Label_corrected.csv`` label file exists for the sample.
    """
    signal_paths = list((org_signal_root_path / f"{samp_id}").glob("OrgBCG_Sync_*.txt"))
    if not signal_paths:
        raise FileNotFoundError(f"OrgBCG_Sync file not found for sample ID: {samp_id}")
    signal_path = signal_paths[0]
    print(f"Processing OrgBCG_Sync signal file: {signal_path}")

    # Path.glob() returns a generator, which is always truthy. Materialise it
    # first so a missing label file raises FileNotFoundError, not IndexError.
    label_paths = list((label_root_path / f"{samp_id}").glob("SA Label_corrected.csv"))
    if not label_paths:
        raise FileNotFoundError(f"Label_corrected file not found for sample ID: {samp_id}")
    label_path = label_paths[0]
    print(f"Processing Label_corrected file: {label_path}")

    signal_data = utils.read_signal_txt(signal_path)
    signal_length = len(signal_data)
    print(f"signal_length: {signal_length}")

    # The sampling rate is the last underscore-separated field of the file stem.
    signal_fs = int(signal_path.stem.split("_")[-1])
    print(f"signal_fs: {signal_fs}")

    # Whole seconds only; a trailing partial second is dropped.
    signal_second = signal_length // signal_fs
    print(f"signal_second: {signal_second}")

    # The result is not used further here yet; the call is kept because the
    # reader prints its label statistics when verbose (its default).
    label_data = utils.read_label_csv(label_path)

    # One mask entry per second; 0 marks a manually excluded second.
    manual_disable_mask = utils.generate_disable_mask(
        signal_second,
        all_samp_disable_df[all_samp_disable_df["id"] == samp_id],
    )
    print(f"disable_mask_shape: {manual_disable_mask.shape}, num_disable: {np.sum(manual_disable_mask == 0)}")
if __name__ == '__main__':
    # Dataset configuration and the spreadsheet of manually excluded intervals.
    yaml_path = Path("./dataset_config/HYS_config.yaml")
    disable_df_path = Path("./排除区间.xlsx")

    select_ids, root_path = utils.load_dataset_info(yaml_path)
    print(f"select_ids: {select_ids}")
    print(f"root_path: {root_path}")

    # The loader falls back to an empty list when 'select_id' is absent, so
    # fail with a clear message instead of an IndexError further down.
    if not select_ids:
        raise ValueError(f"No sample IDs listed under 'select_id' in {yaml_path}")

    # process_one_signal() reads the next three names as module globals, so
    # they must be assigned before it is called.
    org_signal_root_path = root_path / "OrgBCG_Aligned"
    label_root_path = root_path / "Label"
    all_samp_disable_df = utils.read_disable_excel(disable_df_path)

    # Only the first configured sample is processed.
    process_one_signal(select_ids[0])

View File

@ -0,0 +1,13 @@
# Sample IDs selected for processing; the dataset loader reads this key as
# the list of IDs and the processing script currently uses the first entry.
select_id:
- 1302
- 286
- 950
- 220
- 229
- 541
- 582
- 670
- 684
- 960
# Root directory of the dataset; the processing script looks for the
# "OrgBCG_Aligned" and "Label" sub-directories beneath it.
root_path: /mnt/disk_wd/marques_dataset/DataCombine2023/HYS

View File

@ -34,7 +34,7 @@ def read_signal_txt(path: Union[str, Path]) -> np.ndarray:
return df.iloc[:, 0].to_numpy() return df.iloc[:, 0].to_numpy()
def read_laebl_csv(path: Union[str, Path]) -> pd.DataFrame: def read_label_csv(path: Union[str, Path], verbose=True) -> pd.DataFrame:
""" """
Read a CSV file and return it as a pandas DataFrame. Read a CSV file and return it as a pandas DataFrame.
@ -49,6 +49,94 @@ def read_laebl_csv(path: Union[str, Path]) -> pd.DataFrame:
# 直接用pandas读取 包含中文 故指定编码 # 直接用pandas读取 包含中文 故指定编码
df = pd.read_csv(path, encoding="gbk") df = pd.read_csv(path, encoding="gbk")
if verbose:
print(f"Label file read from {path}, number of rows: {len(df)}")
# 统计打标情况
# isLabeled=1 表示已打标
# Event type 有值的为PSG导出的事件
# Event type 为nan的为手动打标的事件
# score=1 显著事件, score=2 为受干扰事件 score=3 为非显著应删除事件
# 确认后的事件在correct_EventsType
# 输出事件信息 按照总计事件、低通气、中枢性、阻塞性、混合型按行输出 格式为 总计/来自PSG/手动/删除/未标注
# Columns:
# Index Event type Stage Time Epoch Date Duration HR bef. HR extr. HR delta O2 bef. O2 min. O2 delta Body Position Validation Start End score remark correct_Start correct_End correct_EventsType isLabeled
# Event type:
# Hypopnea
# Central apnea
# Obstructive apnea
# Mixed apnea
num_labeled = np.sum(df["isLabeled"] == 1)
num_psg_events = np.sum(df["Event type"].notna())
num_manual_events = num_labeled - num_psg_events
num_deleted = np.sum(df["score"] == 3)
# 统计事件
num_total = np.sum((df["isLabeled"] == 1) & (df["score"] != 3))
num_unlabeled = num_total - num_labeled
num_psg_hyp = np.sum(df["Event type"] == "Hypopnea")
num_psg_csa = np.sum(df["Event type"] == "Central apnea")
num_psg_osa = np.sum(df["Event type"] == "Obstructive apnea")
num_psg_msa = np.sum(df["Event type"] == "Mixed apnea")
num_hyp = np.sum((df["correct_EventsType"] == "Hypopnea") & (df["score"] != 3))
num_csa = np.sum((df["correct_EventsType"] == "Central apnea") & (df["score"] != 3))
num_osa = np.sum((df["correct_EventsType"] == "Obstructive apnea") & (df["score"] != 3))
num_msa = np.sum((df["correct_EventsType"] == "Mixed apnea") & (df["score"] != 3))
num_manual_hyp = np.sum((df["Event type"].isna()) & (df["correct_EventsType"] == "Hypopnea"))
num_manual_csa = np.sum((df["Event type"].isna()) & (df["correct_EventsType"] == "Central apnea"))
num_manual_osa = np.sum((df["Event type"].isna()) & (df["correct_EventsType"] == "Obstructive apnea"))
num_manual_msa = np.sum((df["Event type"].isna()) & (df["correct_EventsType"] == "Mixed apnea"))
num_deleted_hyp = np.sum((df["score"] == 3) & (df["correct_EventsType"] == "Hypopnea"))
num_deleted_csa = np.sum((df["score"] == 3) & (df["correct_EventsType"] == "Central apnea"))
num_deleted_osa = np.sum((df["score"] == 3) & (df["correct_EventsType"] == "Obstructive apnea"))
num_deleted_msa = np.sum((df["score"] == 3) & (df["correct_EventsType"] == "Mixed apnea"))
num_unlabeled_hyp = np.sum((df["isLabeled"] == 0) & (df["correct_EventsType"] == "Hypopnea"))
num_unlabeled_csa = np.sum((df["isLabeled"] == 0) & (df["correct_EventsType"] == "Central apnea"))
num_unlabeled_osa = np.sum((df["isLabeled"] == 0) & (df["correct_EventsType"] == "Obstructive apnea"))
num_unlabeled_msa = np.sum((df["isLabeled"] == 0) & (df["correct_EventsType"] == "Mixed apnea"))
if verbose:
print("Event Statistics:")
# 格式化输出 总计/来自PSG/手动/删除/未标注 指定宽度
print("Type Total / PSG / Manual / Deleted / Unlabeled")
print(f"Hypopnea: {num_hyp:4d} / {num_psg_hyp:4d} / {num_manual_hyp:4d} / {num_deleted_hyp:4d} / {num_unlabeled_hyp:4d}")
print(f"Central apnea: {num_csa:4d} / {num_psg_csa:4d} / {num_manual_csa:4d} / {num_deleted_csa:4d} / {num_unlabeled_csa:4d}")
print(f"Obstructive ap: {num_osa:4d} / {num_psg_osa:4d} / {num_manual_osa:4d} / {num_deleted_osa:4d} / {num_unlabeled_osa:4d}")
print(f"Mixed apnea: {num_msa:4d} / {num_psg_msa:4d} / {num_manual_msa:4d} / {num_deleted_msa:4d} / {num_unlabeled_msa:4d}")
print(f"Total events: {num_total:4d} / {num_psg_events:4d} / {num_manual_events:4d} / {num_deleted:4d} / {num_unlabeled:4d}")
df["Start"] = df["Start"].astype(int) df["Start"] = df["Start"].astype(int)
df["End"] = df["End"].astype(int) df["End"] = df["End"].astype(int)
return df
def read_disable_excel(path: Union[str, Path]) -> pd.DataFrame:
    """
    Load the exclusion-interval spreadsheet into a pandas DataFrame.

    The ``id``, ``start`` and ``end`` columns are converted to integers.

    Args:
        path (str | Path): Location of the Excel file.

    Returns:
        pd.DataFrame: The spreadsheet content with integer ``id``, ``start``
        and ``end`` columns.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    excel_path = Path(path)
    if not excel_path.exists():
        raise FileNotFoundError(f"File not found: {excel_path}")

    # Read straight through pandas, then normalise the integer columns.
    table = pd.read_excel(excel_path)
    for column in ("id", "start", "end"):
        table[column] = table[column].astype(int)
    return table

View File

@ -0,0 +1,2 @@
from utils.HYS_FileReader import read_label_csv, read_signal_txt, read_disable_excel
from utils.operation_tools import load_dataset_info, generate_disable_mask

View File

@ -4,7 +4,7 @@ from pathlib import Path
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
import yaml
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
@ -252,5 +252,34 @@ def calculate_by_slide_windows(func, signal_data, calc_mask, sampling_rate=100,
return values_nan, values return values_nan, values
def load_dataset_info(yaml_path):
    """Load the selected sample IDs and the dataset root from a YAML config.

    Args:
        yaml_path (str | Path): Path to the YAML configuration file. Its top
            level must be a mapping holding ``root_path`` and, optionally,
            ``select_id``.

    Returns:
        tuple: ``(select_ids, data_path)``. ``select_ids`` is the value of
        ``select_id``, or an empty list when the key is absent or null.
        ``data_path`` is ``root_path`` wrapped in a ``Path``.

    Raises:
        ValueError: If the file's top level is not a mapping, or if
            ``root_path`` is missing or null.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # safe_load yields None for an empty document and a list or scalar for
    # other top-level shapes; only a mapping can be queried with .get().
    if not isinstance(config, dict):
        raise ValueError(f"Expected a YAML mapping at the top level of {yaml_path}")

    root_path = config.get('root_path')
    if root_path is None:
        # Path(None) would raise a TypeError that does not name the config key.
        raise ValueError(f"'root_path' is missing from {yaml_path}")

    # A key present with a null value returns None from .get(); normalise it
    # so callers always receive a list.
    select_ids = config.get('select_id')
    if select_ids is None:
        select_ids = []

    return select_ids, Path(root_path)
def generate_disable_mask(signal_second: int, disable_df) -> np.ndarray:
    """Build a per-second mask with excluded intervals set to 0.

    Args:
        signal_second (int): Length of the mask.
        disable_df: Table with ``start`` and ``end`` columns. Each row clears
            the half-open index range ``[start, end)``.

    Returns:
        np.ndarray: Integer array of length ``signal_second``, 1 everywhere
        except the listed intervals, which are 0.
    """
    disable_mask = np.ones(signal_second, dtype=int)
    # Read the two columns directly: iterrows() upcasts a row to a common
    # dtype, so any float column would turn start/end into floats, which are
    # not valid slice indices.
    for start, end in zip(disable_df["start"], disable_df["end"]):
        # Clamp to the mask so a negative bound cannot wrap around and clear
        # seconds at the far end of the recording.
        lo = min(max(int(start), 0), signal_second)
        hi = min(max(int(end), 0), signal_second)
        disable_mask[lo:hi] = 0
    return disable_mask
def generate_event_mask(signal_second: int, event_df) -> np.ndarray:
    """Build a per-second mask with event intervals set to 1.

    Args:
        signal_second (int): Length of the mask.
        event_df: Table with ``start`` and ``end`` columns. Each row marks the
            half-open index range ``[start, end)``.

    Returns:
        np.ndarray: Integer array of length ``signal_second``, 0 everywhere
        except the listed intervals, which are 1.
    """
    event_mask = np.zeros(signal_second, dtype=int)
    # Read the two columns directly: iterrows() upcasts a row to a common
    # dtype, so any float column would turn start/end into floats, which are
    # not valid slice indices.
    for start, end in zip(event_df["start"], event_df["end"]):
        # Clamp to the mask so a negative bound cannot wrap around and mark
        # seconds at the far end of the recording.
        lo = min(max(int(start), 0), signal_second)
        hi = min(max(int(end), 0), signal_second)
        event_mask[lo:hi] = 1
    return event_mask