# DataPrepare/event_mask_process/HYS_PSG_process.py

"""
本脚本完成对呼研所数据的处理,包含以下功能:
1. 数据读取与预处理
从传入路径中,进行数据和标签的读取,并进行初步的预处理
预处理包括为数据进行滤波、去噪等操作
2. 数据清洗与异常值处理
3. 输出清晰后的统计信息
4. 数据保存
将处理后的数据保存到指定路径,便于后续使用
主要是保存切分后的数据位置和标签
5. 可视化
提供数据处理前后的可视化对比,帮助理解数据变化
绘制多条可用性趋势图,展示数据的可用区间、体动区间、低幅值区间等
# 低幅值区间规则标定与剔除
# 高幅值连续体动规则标定与剔除
# 手动标定不可用区间提剔除
"""
import sys
from pathlib import Path
# Make the project root importable so sibling modules (utils, draw_tools,
# signal_method) resolve when this script is run directly.
sys.path.append(str(Path(__file__).resolve().parent.parent))
project_root_path = Path(__file__).resolve().parent.parent
import shutil
import draw_tools
import utils
import numpy as np
import signal_method
import os
# NOTE(review): hard-coded X11 display — presumably for SSH X forwarding so
# plotting works on a remote session; confirm this matches the deploy host.
os.environ['DISPLAY'] = "localhost:10.0"
def resolve_sample_file(sample_dir: Path, prefix: str, suffix=".txt", prefer_tokens=("Sync", "RoughCut")) -> Path:
    """Locate a single data file in *sample_dir* whose name starts with *prefix*.

    Candidates matching ``{prefix}*{suffix}`` are collected in sorted order.
    Files carrying one of *prefer_tokens* in their name (as ``_Token_`` or
    ``_Token.``) take priority, in token order. Within any tier the
    lexicographically first file wins; a warning is printed when the choice
    is ambiguous.

    Raises:
        FileNotFoundError: when no file matches the pattern (the message
            lists the directory's contents to ease debugging).
    """
    matches = sorted(sample_dir.glob(f"{prefix}*{suffix}"))
    if not matches:
        listing = (
            ", ".join(sorted(entry.name for entry in sample_dir.iterdir()))
            if sample_dir.exists()
            else "<sample dir missing>"
        )
        raise FileNotFoundError(
            f"{prefix} file not found in {sample_dir}. "
            f"searched pattern: {prefix}*{suffix}. available: {listing}"
        )
    for token in prefer_tokens:
        markers = (f"_{token}_", f"_{token}.")
        tier = [entry for entry in matches if any(mark in entry.name for mark in markers)]
        if not tier:
            continue
        if len(tier) > 1:
            print(f"Warning!!! multiple preferred files found for {prefix}: {tier}")
        return tier[0]
    if len(matches) > 1:
        print(f"Warning!!! multiple files found for {prefix}: {matches}")
    return matches[0]
def get_signal_duration_second(signal_path: Path) -> int:
    """Return the whole-second duration of a one-value-per-line signal file.

    The sampling rate is encoded as the trailing ``_<fs>`` token of the file
    stem (e.g. ``Effort Tho_Sync_100.txt`` -> 100 Hz). Duration is the line
    count floor-divided by fs, so any partial trailing second is dropped.
    """
    fs = int(signal_path.stem.rsplit("_", 1)[-1])
    with signal_path.open("r", encoding="utf-8", errors="ignore") as fh:
        n_lines = sum(1 for _ in fh)
    return n_lines // fs
def process_one_signal(samp_id, show: bool = False) -> None:
    """Process one sample: resolve its signal/label files, build per-second
    masks, and save the results under ``save_path / samp_id``.

    Relies on module-level globals set in ``__main__``:
    ``org_signal_root_path``, ``label_root_path``, ``save_path``.

    Args:
        samp_id: sample identifier; used as the sub-directory name on both
            the input and output sides.
        show: accepted for API symmetry but currently unused in this body —
            presumably meant to toggle plotting; confirm before relying on it.
    """
    sample_dir = org_signal_root_path / f"{samp_id}"
    label_dir = label_root_path / f"{samp_id}"
    # Resolve every required signal/label file inside the sample directory.
    tho_signal_path = resolve_sample_file(sample_dir, "Effort Tho")
    abd_signal_path = resolve_sample_file(sample_dir, "Effort Abd")
    flowp_signal_path = resolve_sample_file(sample_dir, "Flow P")
    flowt_signal_path = resolve_sample_file(sample_dir, "Flow T")
    spo2_signal_path = resolve_sample_file(sample_dir, "SpO2")
    stage_signal_path = resolve_sample_file(sample_dir, "5_class")
    label_path = resolve_sample_file(label_dir, "SA Label", suffix=".csv")
    print(f"Processing Effort Tho signal file: {tho_signal_path}")
    print(f"Processing Effort Abd signal file: {abd_signal_path}")
    print(f"Processing Flow P signal file: {flowp_signal_path}")
    print(f"Processing Flow T signal file: {flowt_signal_path}")
    print(f"Processing SpO2 signal file: {spo2_signal_path}")
    print(f"Processing 5_class signal file: {stage_signal_path}")
    print(f"Processing SA Label file: {label_path}")
    #
    # # Save the processed data and labels
    save_samp_path = save_path / f"{samp_id}"
    save_samp_path.mkdir(parents=True, exist_ok=True)
    # Durations of each channel; the shortest one bounds the usable length
    # so all masks stay aligned across channels.
    signal_seconds = {
        "Effort Tho": get_signal_duration_second(tho_signal_path),
        "Effort Abd": get_signal_duration_second(abd_signal_path),
        "Flow P": get_signal_duration_second(flowp_signal_path),
        "Flow T": get_signal_duration_second(flowt_signal_path),
        "SpO2": get_signal_duration_second(spo2_signal_path),
        "5_class": get_signal_duration_second(stage_signal_path),
    }
    common_second = min(signal_seconds.values())
    print(f"Sample {samp_id} signal seconds: {signal_seconds}")
    print(f"Sample {samp_id} common_second: {common_second}")
    # # # Read the signal data
    stage_data_raw, stage_length, stage_fs, stage_second = utils.read_signal_txt(stage_signal_path, dtype=str, verbose=True)
    #
    # # Preprocessing and filtering (currently disabled)
    # tho_data, tho_data_filt, tho_fs = signal_method.psg_effort_filter(conf=conf, effort_data_raw=tho_data_raw, effort_fs=tho_fs)
    # abd_data, abd_data_filt, abd_fs = signal_method.psg_effort_filter(conf=conf, effort_data_raw=abd_data_raw, effort_fs=abd_fs)
    # flowp_data, flowp_data_filt, flowp_fs = signal_method.psg_effort_filter(conf=conf, effort_data_raw=flowp_data_raw, effort_fs=flowp_fs)
    # flowt_data, flowt_data_filt, flowt_fs = signal_method.psg_effort_filter(conf=conf, effort_data_raw=flowt_data_raw, effort_fs=flowt_fs)
    # Downsampling (currently disabled)
    # old_tho_fs = tho_fs
    # tho_fs = conf["effort"]["downsample_fs"]
    # tho_data_filt = utils.downsample_signal_fast(original_signal=tho_data_filt, original_fs=old_tho_fs, target_fs=tho_fs)
    # old_abd_fs = abd_fs
    # abd_fs = conf["effort"]["downsample_fs"]
    # abd_data_filt = utils.downsample_signal_fast(original_signal=abd_data_filt, original_fs=old_abd_fs, target_fs=abd_fs)
    # old_flowp_fs = flowp_fs
    # flowp_fs = conf["effort"]["downsample_fs"]
    # flowp_data_filt = utils.downsample_signal_fast(original_signal=flowp_data_filt, original_fs=old_flowp_fs, target_fs=flowp_fs)
    # old_flowt_fs = flowt_fs
    # flowt_fs = conf["effort"]["downsample_fs"]
    # flowt_data_filt = utils.downsample_signal_fast(original_signal=flowt_data_filt, original_fs=old_flowt_fs, target_fs=flowt_fs)
    # SpO2 is not downsampled
    # spo2_data_filt = spo2_data_raw
    # spo2_fs = spo2_fs
    label_data = utils.read_raw_psg_label(path=label_path)
    event_mask, score_mask = utils.generate_event_mask(signal_second=common_second, event_df=label_data, use_correct=False, with_score=False)
    # Binarize: positions where event_mask > 0 become 1, everything else 0
    # (overwrites the score_mask returned above, which was requested with
    # with_score=False).
    score_mask = np.where(event_mask > 0, 1, 0)
    # Derive unusable intervals from the sleep stages
    wake_mask = utils.get_wake_mask(stage_data_raw)
    # Drop wake intervals shorter than 60 seconds
    wake_mask = utils.remove_short_durations(wake_mask, time_points=np.arange(len(wake_mask) * stage_fs), min_duration_sec=60)
    # Merge wake intervals separated by short gaps
    # NOTE(review): the original comment said gaps shorter than 120 s, but the
    # code passes max_gap_sec=60 — confirm which value is intended.
    wake_mask = utils.merge_short_gaps(wake_mask, time_points=np.arange(len(wake_mask) * stage_fs), max_gap_sec=60)
    disable_label = wake_mask[:common_second]
    # Copy the event file to the save path
    sa_label_save_name = f"{samp_id}_" + label_path.name
    shutil.copyfile(label_path, save_samp_path / sa_label_save_name)
    #
    # Build a table with one row per second: second index plus the SA labels.
    # The Resp_*/BCG_* columns are placeholders (all zeros) to be filled by
    # later rule-based labeling steps.
    save_dict = {
        "Second": np.arange(common_second),
        "SA_Label": event_mask,
        "SA_Score": score_mask,
        "Disable_Label": disable_label,
        "Resp_LowAmp_Label": np.zeros_like(event_mask),
        "Resp_Movement_Label": np.zeros_like(event_mask),
        "Resp_AmpChange_Label": np.zeros_like(event_mask),
        "BCG_LowAmp_Label": np.zeros_like(event_mask),
        "BCG_Movement_Label": np.zeros_like(event_mask),
        "BCG_AmpChange_Label": np.zeros_like(event_mask)
    }
    mask_label_save_name = f"{samp_id}_Processed_Labels.csv"
    utils.save_process_label(save_path=save_samp_path / mask_label_save_name, save_dict=save_dict)
if __name__ == '__main__':
    # Load the dataset configuration (input root, output root, sample IDs).
    yaml_path = project_root_path / "dataset_config/HYS_PSG_config.yaml"
    # Spreadsheet of manually excluded intervals (currently disabled):
    # disable_df_path = project_root_path / "排除区间.xlsx"
    #
    conf = utils.load_dataset_conf(yaml_path)
    root_path = Path(conf["root_path"])
    save_path = Path(conf["mask_save_path"])
    select_ids = conf["select_ids"]
    #
    print(f"select_ids: {select_ids}")
    print(f"root_path: {root_path}")
    print(f"save_path: {save_path}")
    #
    # Signals and labels both live under the aligned-PSG directory.
    org_signal_root_path = root_path / "PSG_Aligned"
    label_root_path = root_path / "PSG_Aligned"
    #
    # all_samp_disable_df = utils.read_disable_excel(disable_df_path)
    #
    # Single-sample debug run with plotting (disabled):
    # process_one_signal(select_ids[0], show=True)
    # #
    for samp_id in select_ids:
        print(f"Processing sample ID: {samp_id}")
        process_one_signal(samp_id, show=False)
        print(f"Finished processing sample ID: {samp_id}\n\n")
    pass