From 8209b13e087b06cae28bdadc300652d16c5000f1 Mon Sep 17 00:00:00 2001
From: andrew
Date: Thu, 6 Oct 2022 23:08:35 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=AE=9E=E9=AA=8C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                             |   2 +
 exam/000/main.py                       |  35 +-
 exam/001/main.py                       |  35 +-
 exam/002/main.py                       |  35 +-
 exam/003/main.py                       |  35 +-
 exam/003/test_save_result.py           |   4 +-
 exam/004/generate_label_14.py          | 561 +++++++++++++++++++++++++
 exam/004/load_dataset.py               | 178 ++++++++
 exam/004/main.py                       | 288 +++++++++++++
 exam/004/model/Hybrid_Net001.py        |  97 +++++
 exam/004/my_augment.py                 |  51 +++
 exam/004/settings.yaml                 |  42 ++
 exam/004/test_save_result.py           | 454 ++++++++++++++++++++
 exam/004/utils/Draw_ConfusionMatrix.py |  46 ++
 exam/004/utils/Preprocessing.py        | 181 ++++++++
 exam/004/utils/calc_metrics.py         |  84 ++++
 exam/005/generate_label_14.py          | 561 +++++++++++++++++++++++++
 exam/005/load_dataset.py               | 178 ++++++++
 exam/005/main.py                       | 284 +++++++++++++
 exam/005/model/Hybrid_Net001.py        |  98 +++++
 exam/005/my_augment.py                 |  51 +++
 exam/005/settings.yaml                 |  42 ++
 exam/005/test_save_result.py           | 454 ++++++++++++++++++++
 exam/005/utils/Draw_ConfusionMatrix.py |  46 ++
 exam/005/utils/Preprocessing.py        | 181 ++++++++
 exam/005/utils/calc_metrics.py         |  84 ++++
 exam/011/main.py                       |  35 +-
 exam/011/settings.yaml                 |   4 +-
 exam/012/main.py                       |  35 +-
 exam/012/settings.yaml                 |   4 +-
 exam/013/main.py                       |  35 +-
 exam/013/settings.yaml                 |   6 +-
 exam/试验记录.txt                      |  98 ++++-
 33 files changed, 4294 insertions(+), 30 deletions(-)
 create mode 100644 exam/004/generate_label_14.py
 create mode 100644 exam/004/load_dataset.py
 create mode 100644 exam/004/main.py
 create mode 100644 exam/004/model/Hybrid_Net001.py
 create mode 100644 exam/004/my_augment.py
 create mode 100644 exam/004/settings.yaml
 create mode 100644 exam/004/test_save_result.py
 create mode 100644 exam/004/utils/Draw_ConfusionMatrix.py
 create mode 100644 exam/004/utils/Preprocessing.py
 create mode 100644 exam/004/utils/calc_metrics.py
 create mode 100644 exam/005/generate_label_14.py
 create mode 100644 exam/005/load_dataset.py
 create mode 100644 exam/005/main.py
 create mode 100644 exam/005/model/Hybrid_Net001.py
 create mode 100644 exam/005/my_augment.py
 create mode 100644 exam/005/settings.yaml
 create mode 100644 exam/005/test_save_result.py
 create mode 100644 exam/005/utils/Draw_ConfusionMatrix.py
 create mode 100644 exam/005/utils/Preprocessing.py
 create mode 100644 exam/005/utils/calc_metrics.py

diff --git a/.gitignore b/.gitignore
index f8b73e7..ddb800a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -138,3 +138,5 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
+# custom
+.idea/
diff --git a/exam/000/main.py b/exam/000/main.py
index a04cb6c..ccb3609 100644
--- a/exam/000/main.py
+++ b/exam/000/main.py
@@ -121,6 +121,8 @@ def model_train(model, train_loader, optimizer, scheduler, loss_func, training_s
 
     train_loss /= len(train_loader)
     calc_metrics.compute()
+    logger.info("")
+    logger.info("--------------------------------------")
     logger.info(training_state)
     logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr))
     calc_metrics.reset()
@@ -162,6 +164,34 @@ def model_valid(model, valid_loader, wdir, loss_func):
     calc_metrics.reset()
 
 
+def model_test(model, test_loader, loss_func):
+    model.eval()
+    test_loss = 0.0
+    for resp, stft, labels in test_loader:
+        resp = resp.float().cuda() if gpu else resp.float()
+        stft = stft.float().cuda() if gpu else stft.float()
+        labels = labels.cuda() if gpu else
labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + if __name__ == '__main__': try: @@ -191,7 +221,7 @@ if __name__ == '__main__': model_net.cuda() k_folds = 5 - kfold = KFold(n_splits=k_folds, shuffle=True) + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) logger.info('--------------------------------') for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): @@ -213,9 +243,11 @@ if __name__ == '__main__': train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) # 重新初始化模型 del model_net @@ -251,5 +283,6 @@ if __name__ == '__main__': model_train(model_net, train_loader, optimizer, scheduler, loss_function, f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) if wandb is not None: calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/001/main.py b/exam/001/main.py index a04cb6c..ccb3609 100644 --- a/exam/001/main.py +++ b/exam/001/main.py @@ -121,6 +121,8 @@ def model_train(model, train_loader, optimizer, scheduler, loss_func, training_s train_loss /= len(train_loader) calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") logger.info(training_state) logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) calc_metrics.reset() @@ -162,6 +164,34 @@ def model_valid(model, valid_loader, wdir, loss_func): calc_metrics.reset() +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, stft, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + 
logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + if __name__ == '__main__': try: @@ -191,7 +221,7 @@ if __name__ == '__main__': model_net.cuda() k_folds = 5 - kfold = KFold(n_splits=k_folds, shuffle=True) + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) logger.info('--------------------------------') for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): @@ -213,9 +243,11 @@ if __name__ == '__main__': train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) # 重新初始化模型 del model_net @@ -251,5 +283,6 @@ if __name__ == '__main__': model_train(model_net, train_loader, optimizer, scheduler, loss_function, f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) if wandb is not None: calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/002/main.py b/exam/002/main.py index a04cb6c..ccb3609 100644 --- a/exam/002/main.py +++ b/exam/002/main.py @@ -121,6 +121,8 @@ def model_train(model, train_loader, optimizer, scheduler, loss_func, training_s train_loss /= len(train_loader) calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") logger.info(training_state) logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) calc_metrics.reset() @@ -162,6 +164,34 @@ def model_valid(model, valid_loader, wdir, loss_func): calc_metrics.reset() +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, stft, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + if __name__ == '__main__': try: @@ -191,7 +221,7 @@ if __name__ == '__main__': model_net.cuda() k_folds = 5 - kfold = KFold(n_splits=k_folds, shuffle=True) + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) logger.info('--------------------------------') for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): @@ -213,9 +243,11 @@ if __name__ == '__main__': train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) valid_dataset = 
ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) # 重新初始化模型 del model_net @@ -251,5 +283,6 @@ if __name__ == '__main__': model_train(model_net, train_loader, optimizer, scheduler, loss_function, f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) if wandb is not None: calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/003/main.py b/exam/003/main.py index a04cb6c..ccb3609 100644 --- a/exam/003/main.py +++ b/exam/003/main.py @@ -121,6 +121,8 @@ def model_train(model, train_loader, optimizer, scheduler, loss_func, training_s train_loss /= len(train_loader) calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") logger.info(training_state) logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) calc_metrics.reset() @@ -162,6 +164,34 @@ def model_valid(model, valid_loader, wdir, loss_func): calc_metrics.reset() +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, stft, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + if __name__ == '__main__': try: @@ -191,7 +221,7 @@ if __name__ == '__main__': model_net.cuda() k_folds = 5 - kfold = KFold(n_splits=k_folds, shuffle=True) + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) logger.info('--------------------------------') for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): @@ -213,9 +243,11 @@ if __name__ == '__main__': train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) # 重新初始化模型 del model_net @@ -251,5 +283,6 @@ if __name__ == '__main__': model_train(model_net, train_loader, optimizer, scheduler, loss_function, 
f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) if wandb is not None: calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/003/test_save_result.py b/exam/003/test_save_result.py index 784ffb8..ae722a0 100644 --- a/exam/003/test_save_result.py +++ b/exam/003/test_save_result.py @@ -446,9 +446,9 @@ def segment_to_event(df_segment, dataset_type): # shap_values = explainer.shap_values() if __name__ == '__main__': - all_output_path = list(exam_path.rglob("KFold_0")) + all_output_path = list(exam_path.rglob("KFold_*")) for exam_index, test_exam_path in enumerate(all_output_path): # test_exam_path = exam_path / test_exam_path set_environment(exam_index) - # test_and_analysis_and_visual(dataset_type="test") + test_and_analysis_and_visual(dataset_type="test") test_and_analysis_and_visual(dataset_type="all_test") diff --git a/exam/004/generate_label_14.py b/exam/004/generate_label_14.py new file mode 100644 index 0000000..cc5f96d --- /dev/null +++ b/exam/004/generate_label_14.py @@ -0,0 +1,561 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:generate_label_11.0.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/05 +""" +# 14.0 +# 手动均衡数量 + +# 13.0 +# 限制选择部分数据集,先做测试 + +# 12.0 +# 置不可用事件的片段为上限,不可用片段设置为背景,不记录事件 + +# 10.0 +# 使用提出质量差的信号 + +# 9.0 +# 增加 最新的质量标签 未使用 + +# 8.0 +# 生成 除低通气所有事件标签 + +# 尝试过步进两秒 会造成不足两秒的数据被抛弃,造成较多误判,但是可以考虑囊括这部分 +# 采用 30秒数据 移动 1秒 将所有呼吸暂停标注为1 低通气为0 正常为0 + +# 预处理操作 为 50Hz陷波滤波器去工频 外加 20Hz的低通滤波器 这个20Hz要看BCG信号的频谱范围 + +# 先提剔除极端值 +# 数值大于最高基准线或最低基准线 +# type1: average:1800 low:1200 high:2400 +# type2: average:2400 low:1800 high:3000 + +# 过多片段会造成平均值偏移 +# TODO +# 加入体动标签,计算除体动外的平均值 + +# 最后降采为100hz + +import time +import logging +import numpy as np +import pandas as pd +from pathlib import Path + +from datetime import datetime + +import yaml +from pathos import multiprocessing +from tqdm import tqdm + +# 数据集 和 标签 位置 +bcg_numpy_data_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/") +bcg_label_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_label_0616/") + +# BCG 记录开始时间 +bcg_start_time = np.loadtxt(Path(r"/home/marques/code/marques/apnea/dataset/start_time.csv"), delimiter=', ', + dtype=object) +bcg_start_time = dict(zip(bcg_start_time[:, 0], bcg_start_time[:, 1])) + +# 读取每个数据集路径 +all_numpy_dataset = list(bcg_numpy_data_path.rglob("*.npy")) +all_numpy_dataset.sort() + +# 划分后的数据集保存路径 +# dataset_save_path = Path(r"/home/marques/code/marques/apnea/dataset/dataset/dataset0623_300_30_30/") +dataset_save_path = Path(r"./dataset/") +dataset_save_path.mkdir(exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) +fh = logging.FileHandler(dataset_save_path / (realtime + ".log"), mode='w') +fh.setLevel(logging.NOTSET) +# fh.setFormatter(logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(ch) + +# all_label = [] +# 输出统计数据标题栏 +# logger.info("sampNo".center(8) + +# "hpy_num".center(8) + "hpy_time".center(10) + +# "csa_num".center(8) + "csa_time".center(10) + +# "osa_num".center(8) + "osa_time".center(10) + +# 
"msa_num".center(8) + "msa_time".center(10) +# ) + +logger.info("sampNo".center(8) + ',' + + "train_num".center(10) + ',' + "train_P".center(10) + ',' + "train_N".center(10) + ',' + + "valid_num".center(10) + ',' + "valid_P".center(10) + ',' + "valid_N".center(10) + ',' + + "test_num".center(10) + ',' + "test_P".center(10) + ',' + "test_N".center(10) + ',' + + "train_eve".center(10) + ',' + "valid_eve".center(10) + ',' + "test_eve".center(10) + ) + +base_random_seed = 42 + +window_second = 30 +step_second = 1 + +valid_ratio = 0.1 +test_ratio = 0.1 + +normal_event_quality_label = 0 +# valid_ratio = 5000 +# test_ratio = 10000 + +assert ((valid_ratio + test_ratio) < 1 and 0 < valid_ratio < 1 and 0 < test_ratio < 1) or ( + valid_ratio > 1 and valid_ratio > 1), "验证集与测试集输入应同时为比例或数量" + +# dataset sampNo for test +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) +select_dataset = hyp["select_sampno"] + +# 需要置成0的片段,前面不一定补零,还有可能上万 +disable_segment = { + '221': [[0, 10000]], + '670': [[0, 20000]], + '683': [[0, 20000]], + '704': [[0, 26000]], + '726': [[0, 20000]], + '736': [[0, 47000]], + '933': [[0, 773560]], + '935': [[0, 26600]], + '952': [[0, 17000]], + '955': [[0, 78000]], + '961': [[0, 107000]], + '962': [[0, 15100]], + '966': [[0, 13120]], + '967': [[0, 44000]], + '1006': [[0, 60000]], + '1009': [[0, 1000]], + '1010': [[0, 49000]], + '1296': [[0, 27000]], + '1300': [[0, 33800]], + '1301': [[0, 14000]], + '1302': [[0, 5600]], + '1374': [[0, 1000]], + '1478': [[0, 998000]], + +} + + +# 生成数据集主函数 +def generate_label(No, dataset_path): + """ + :param dataset_path: 数据集路径 + :return: + """ + + # 获取数据编号 + sampNo = dataset_path.stem.split("samp")[0] + # 标签路径 + label_path = bcg_label_path / f"export{sampNo}_all.csv" + + if not label_path.exists(): + raise FileNotFoundError(f"{label_path} not exist") + + if not dataset_path.exists(): + raise Exception(f"{dataset_path} not exists") + + # 加载数据集 + select_numpy = np.load(dataset_path) + select_numpy_len = len(select_numpy) + + # 开头不合理片段置零 + if sampNo in disable_segment.keys(): + for sp, ep in disable_segment[sampNo]: + select_numpy[sp:ep] = 0 + + # 剔除质量差信号 + if sampNo == "670": + select_numpy = select_numpy[:17195 * 100] + + # 获取前面补了多少0 + not_zero_point = 0 + for num in select_numpy: + if num > 10: + break + not_zero_point += 1 + not_zero_point //= 100 + + # 读取标签 + label_csv = pd.read_csv(label_path, encoding='gbk') + label_csv["new_start"] = label_csv["new_start"].astype("int") + label_csv["new_end"] = label_csv["new_end"].astype("int") + label_csv["Duration"] = label_csv["Duration"].astype("int") + label_csv["new_label"] = label_csv["new_label"].fillna("2") + label_csv["new_label"] = label_csv["new_label"].astype("int") + # 剔除质量不好的样本 + # drop_csv = label_csv[ + # (label_csv["Event type"].isin(["Central apnea", "Obstructive apnea"])) & (label_csv["new_label"] == 2)] + # label_csv = label_csv.drop(drop_csv.index) + + # 事件片段与背景片段, 每个背景长度均为设定窗长 + segment_labels = [] + negative_labels = [] + hpy_num = csa_num = osa_num = msa_num = 0 + hpy_time = csa_time = osa_time = msa_time = 0 + + # 遍历全部事件并统计 + for i in range(len(label_csv)): + # 进行LabelEncoder + label = label_csv.iloc[i, :] + # 如果事件在补零片段,则不添加到事件列表 + if label["new_end"] < not_zero_point: + continue + + if sampNo == "670" and label["new_start"] > 17195: + continue + + if label["new_end"] - label["new_start"] < 10: + continue + + # 将事件添加到事件列表 + if label["Event type"] == "Hypopnea": + label_type = 1 + hpy_num += 1 + hpy_time += label["new_end"] - label["new_start"] + + # 将低通气添加到背景 
好像不用专门加入到负样本事件中? + # negative_labels.append( + # [sampNo, i, label_type, normal_event_quality_label, label["new_start"], label["new_end"]]) + continue + elif label["Event type"] == "Central apnea": + label_type = 2 + csa_num += 1 + csa_time += label["new_end"] - label["new_start"] + elif label["Event type"] == "Obstructive apnea": + label_type = 3 + osa_num += 1 + osa_time += label["new_end"] - label["new_start"] + # MSA 认为是OSA + elif label["Event type"] == "Mixed apnea": + label_type = 3 + msa_num += 1 + msa_time += label["new_end"] - label["new_start"] + else: + continue + # label_type = 0 + if label["new_end"] - label["new_start"] > label["Duration"] + 20: + print(sampNo, label) + + # 格式为 样本编号 第几个事件 标签 开始事件 结束事件 + segment_labels.append([sampNo, i, label_type, label["new_label"], label["new_start"], label["new_end"]]) + + # logger.info(sampNo.center(8) + + # str(hpy_num).center(8) + str(hpy_time).center(10) + + # str(csa_num).center(8) + str(csa_time).center(10) + + # str(osa_num).center(8) + str(osa_time).center(10) + + # str(msa_num).center(8) + str(msa_time).center(10)) + + # 设置随机树种子 + random_seed = base_random_seed + int(sampNo) + + # 对于无事件的样本,直接将所有的片段 + if len(segment_labels) == 0: + # 剔除补零片段(把开始点移动到补零结束) + normal_SP = not_zero_point + # 开头至少满足一个窗长 + if normal_SP < window_second: + normal_SP = window_second + + # 结束点为样本总长 除以 采样率 + normal_EP = select_numpy_len // 100 + label_type = 0 + # 正常时间编号为秒数 除以 30,即epoch + negative_labels += [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, + SP1 + window_second] for SP1 in + range(normal_SP - window_second + step_second, normal_EP - window_second + step_second, + window_second)] + + # 对于有事件的样本 + # 遍历事件,获取事件之间的背景片段 + for index in range(len(segment_labels) + 1): + # 前一个事件的结尾 与 下一事件开头即为背景 + # 对于开头的无事件片段,则设定开始事件为0 + if index == 0: + normal_SP = 0 + # 非开头片段,开始点为上一个事件的结尾 + else: + # 加一秒 作为 缓冲 + normal_SP = segment_labels[index - 1][-1] + 1 + + # 最后一个事件则取到样本片段结尾 + if index == len(segment_labels): + normal_EP = select_numpy_len // 100 - window_second + # 否则结束事件取 本事件开头 + else: + # 减一秒 作为 缓冲 + normal_EP = segment_labels[index][-2] - 1 + + # 剔除包含开头补零的片段 + if normal_EP < not_zero_point: + continue + + # 剔除开头不足30s的正常片段 + if normal_SP < window_second: + continue + label_type = 0 + + # 将背景事件按照滑窗距离逐个加入到背景事件中 + temp_1 = [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, SP1 + window_second] + for SP1 in range(normal_SP - window_second, normal_EP - window_second, window_second)] + + negative_labels += temp_1 + + train_label, valid_label, test_label = [], [], [] + # assert (valid_ratio + test_ratio) < 1 <= len(segment_labels), f"{sampNo}训练集与测试集数量应小于总数据集数量{len(segment_labels)}" + + # 对于测试数据全部直接保存 + if int(sampNo) in select_dataset: + event_label = np.zeros(select_numpy_len // 100) + quality_label = np.zeros(select_numpy_len // 100) + # 用于存储事件标签 + for PN, segmentNo, label_type, new_label, SP, EP in segment_labels: + event_label[SP:EP] = label_type + + test_label = [] + # 剔除补零片段 + normal_SP = not_zero_point + if normal_SP < window_second: + normal_SP = window_second + normal_EP = select_numpy_len // 100 + + # 分成指定窗长的滑窗片段 + test_label += [ + [sampNo, SP1 // 30, int(event_label[SP1 + window_second - step_second]), + int(quality_label[SP1 + window_second - step_second]), + SP1, SP1 + window_second] for SP1 in range(normal_SP - window_second + step_second, + normal_EP - window_second + step_second, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + 
str(0).center(10) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center(10) + + ',' + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + str(len(segment_labels)).center(10) + ) + + df2.loc[No] = [sampNo, + str(0), str(0), str(0), + str(0), str(0), str(0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(0), str(0), str(len(segment_labels))] + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_all_{window_second}s_sa_test2_label.npy", + # np.array(test_label)) + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_all_label.csv", + index=False) + + train_label, valid_label, test_label = [], [], [] + # 对于训练与验证集样本 + if True: + # 打乱片段顺序 + np.random.seed(random_seed) + np.random.shuffle(segment_labels) + np.random.shuffle(negative_labels) + + # 获取训练集、验证集、测试集分到事件个数 + if 0 < valid_ratio < 1: + train_segment_num = int(len(segment_labels) * (1 - valid_ratio - test_ratio)) + valid_segment_num = int(len(segment_labels) * (1 - test_ratio)) + else: + train_segment_num = len(segment_labels) - valid_ratio - test_ratio + valid_segment_num = valid_ratio + + # 分别将各事件切分为30s, 步进1秒的片段 + for index in range(train_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + train_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(train_segment_num, valid_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + valid_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(valid_segment_num, len(segment_labels)): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + test_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second * step_second)] + + # 计算片段和事件个数 + train_num, valid_num, test_num = len(train_label), len(valid_label), len(test_label) + train_eve, valid_eve, test_eve = train_segment_num, (valid_segment_num - train_segment_num), ( + len(segment_labels) - valid_segment_num) + + # 数据集补偿 + # if train_num < 300: + # train_num = 300 - train_num + # + # if valid_num < 300: + # valid_num = 300 - valid_num + # + # if test_num < 300: + # test_num = 300 - test_num + + # 获取训练集、验证集、测试集分到背景个数 + # if 0 < valid_ratio < 1: + # train_eve2 = int(len(negative_labels) * (1 - valid_ratio - test_ratio)) + # valid_eve2 = int(len(negative_labels) * valid_ratio) + # else: + # train_eve2 = len(negative_labels) - valid_ratio - test_ratio + # valid_eve2 = valid_ratio + # + # test_eve2 = len(negative_labels) - train_eve2 - valid_eve2 + # 直接补充到足够个数的背景事件 + train_eve2 = max(train_eve, 300) + valid_eve2 = max(valid_eve, 40) + test_eve2 = max(test_eve, 40) + + # 强制背景数量 + # train_eve2 = train_eve + # valid_eve2 = valid_eve + # test_eve2 = test_eve + + # 
添加背景事件数量 + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[:train_eve2]: + label_type = int(label_type) + train_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2: train_eve2 + valid_eve2]: + label_type = int(label_type) + valid_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2 + valid_eve2:train_eve2 + valid_eve2 + test_eve2]: + label_type = int(label_type) + test_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(normal_SP, normal_EP, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(len(train_label)).center(10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(len(valid_label)).center(10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(train_eve).center(10) + ',' + str(valid_eve).center(10) + ',' + str(test_eve).center(10) + ) + + df2.loc[No] = [sampNo.center(8), + str(len(train_label)), + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0), + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0), + str(len(valid_label)), + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0), + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(train_eve), str(valid_eve), str(test_eve).center(10)] + + def label_check(label_list): + temp_list = [] + for sampNo, index, label_type, new_label, SP, EP in label_list: + if EP - SP < window_second: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if SP < 0: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, SP, EP, len(select_numpy[SP * 100:EP * 100])) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + for j in temp_list: + label_list.remove(j) + + label_check(train_label) + label_check(valid_label) + label_check(test_label) + for sampNo, index, label_type, new_label, SP, EP in train_label: + if EP - SP < window_second: + print(sampNo, index, label_type, new_label, SP, EP) + + if SP < 0: + print(sampNo, index, label_type, new_label, SP, EP) + + if len(select_numpy[SP * 
100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, new_label, SP, EP, len(select_numpy[SP * 100:EP * 100])) + + df1 = pd.DataFrame(data=train_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.csv", + index=False) + + df1 = pd.DataFrame(data=valid_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.csv", + index=False) + + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.csv", index=False) + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.npy", + # np.array(train_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.npy", + # np.array(valid_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.npy", + # np.array(test_label)) + + +if __name__ == '__main__': + # pool = multiprocessing.Pool(processes=44) + # pool.map(generate_label, list(all_numpy_dataset)) + # pool.close() + # pool.join() + + df2 = pd.DataFrame(data=None, + columns=["sampNo", + "train_num", "train_P", "train_N", + "valid_num", "valid_P", "valid_N", + "test_num", "test_P", "test_N", + "train_eve", "valid_eve", "test_eve"]) + + temp = [] + for one_dataset in all_numpy_dataset: + if int(one_dataset.stem.split("samp")[0]) in [*select_dataset]: + temp.append(one_dataset) + # for one_dataset in temp: + # all_numpy_dataset.remove(one_dataset) + + for No, one_dataset in enumerate(temp): + generate_label(No, one_dataset) + + df2.to_csv(dataset_save_path / (realtime + ".csv"), index=False) + # generate_label(all_numpy_dataset[0]) diff --git a/exam/004/load_dataset.py b/exam/004/load_dataset.py new file mode 100644 index 0000000..99806f8 --- /dev/null +++ b/exam/004/load_dataset.py @@ -0,0 +1,178 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:load_dataset.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2021/12/03 +""" +import sys +from pathlib import Path +import pandas as pd +import numpy as np +import torch.utils.data +from torch.utils.data import Dataset +from tqdm import tqdm +from utils.Preprocessing import BCG_Operation + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 +""" +1. 
读取方法 +# 无论是否提前切分,均提前转成npy格式 +# 1.1 提前预处理,切分好后生成npy,直接载入切分好的片段 内存占用多 读取简单 +使用此方法: 1.2 提前预处理,载入整夜数据,切分好后生成csv或xls,根据片段读取 内存占用少 读取较为复杂 +""" + +datasets = {} + + +# 减少重复读取 +def read_dataset(data_path, augment=None): + data_path = Path(data_path) + try: + f = [] + if data_path.is_dir(): + dataset_list = list(data_path.rglob("*.npy")) + dataset_list.sort() + f += dataset_list + elif data_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{data_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {data_path}: {e} \n') + + print("loading dataset") + for i in tqdm(f): + select_dataset = np.load(i) + select_dataset = preprocessing.Butterworth(select_dataset, "lowpass", low_cut=20, order=3) + if augment is not None: + select_dataset = augment(select_dataset) + datasets[i.name.split("samp")[0]] = select_dataset + + +# 用第二种方法读取 +class ApneaDataset(Dataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + self.data_path = data_path + self.label_path = label_path + self.segment_augment = segment_augment + self.labels = None + self.dataset_type = dataset_type + self.select_sampNo = select_sampno + + # self._getAllData() + self._getAllLabels() + + def __getitem__(self, index): + # PN patience number + # SP/EP start point, end point + # temp_label.append([sampNo, label[-1], i, hpy_num, csa_num, osa_num, mean_low, flow_low]) + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1)) + else: + raise Exception(f'dataset read failure!') + + def count_SA(self): + return sum(self.labels[:, 3] > 1) + + def __len__(self): + return len(self.labels) + + def _getAllLabels(self): + label_path = Path(self.label_path) + if not label_path.exists(): + raise Exception(f'{self.label_path} does not exist') + + try: + f = [] + if label_path.is_dir(): + if self.dataset_type == "train": + label_list = list(label_path.rglob("*_train_label.csv")) + elif self.dataset_type == "valid": + label_list = list(label_path.rglob("*_valid_label.csv")) + elif self.dataset_type == "test": + label_list = list(label_path.glob("*_sa_test_label.csv")) + # label_list = list(label_path.rglob("*_test_label.npy")) + elif self.dataset_type == "all_test": + label_list = list(label_path.rglob("*_sa_all_label.csv")) + else: + raise ValueError("self.dataset type error") + # label_list = list(label_path.rglob("*_label.npy")) + label_list.sort() + f += label_list + elif label_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{self.label_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {self.label_path}: {e} \n') + print("loading labels") + for i in tqdm(f): + if int(i.name.split("_")[0]) not in self.select_sampNo: + continue + + if self.labels is None: + self.labels = pd.read_csv(i).to_numpy(dtype=int) + else: + labels = pd.read_csv(i).to_numpy(dtype=int) + if len(labels) > 0: + self.labels = np.concatenate((self.labels, labels)) + # self.labels = self.labels[:10000] + print(f"{self.dataset_type} length is {len(self.labels)}") + + +class TestApneaDataset(ApneaDataset): + def __init__(self, data_path, label_path, dataset_type, select_sampno, segment_augment=None): + 
super(TestApneaDataset, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + select_sampno=select_sampno, + segment_augment=segment_augment + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + segment = datasets[str(PN)][int(SP) * 100:int(EP) * 100].copy() + if self.segment_augment is not None: + segment = self.segment_augment(segment) + return segment, int(float(label_type) > 1), PN, segmentNo, SP, EP + else: + raise Exception(f'dataset read failure!') + + +class TestApneaDataset2(ApneaDataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + super(TestApneaDataset2, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + segment_augment=segment_augment, + select_sampno=select_sampno + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1), PN, segmentNo, label_type, new_label, SP, EP) + else: + raise Exception(f'dataset read failure!') + + +if __name__ == '__main__': + pass diff --git a/exam/004/main.py b/exam/004/main.py new file mode 100644 index 0000000..5c11001 --- /dev/null +++ b/exam/004/main.py @@ -0,0 +1,288 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@time:2021/10/15 +""" +import os + +import yaml +import logging +from pathlib import Path + +import time +from torch.nn import functional as F +from torch.utils.data import DataLoader +import torch.cuda +from tqdm import tqdm +from torchinfo import summary +from load_dataset import ApneaDataset, read_dataset + +from torch import nn +from utils.calc_metrics import CALC_METRICS +from sklearn.model_selection import KFold +from model.Hybrid_Net001 import HYBRIDNET001 +# from utils.LossFunction import Foca1lLoss +from my_augment import my_augment, my_segment_augment +# 加载配置 +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) + +os.environ["CUDA_VISIBLE_DEVICES"] = hyp["GPU"] +os.environ["WANDB_MODE"] = "dryrun" + +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) + +# 读取地址参数 +data_path = hyp["Path"]["dataset"] +label_path = hyp["Path"]["label"] + +save_dir = Path(hyp["Path"]["save"]) / (Path(hyp["Path"]["save"]).name + "_" + realtime) +save_dir.mkdir(parents=True, exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +fh = logging.FileHandler(save_dir / (realtime + ".log"), mode='a') +fh.setLevel(logging.NOTSET) +fh.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) +logging.getLogger('matplotlib.font_manager').disabled = True +logger.info("------------------------------------") +logger.info('hyper_parameters: ' + ', '.join(f'{k}={v}\n' for k, v in hyp.items())) + +# 备份配置 +with open(save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + +# Hyper-parameters +gpu = torch.cuda.is_available() +epochs = hyp["epoch"] +lr = hyp["lr"] +nc = hyp["nc"] 
+bs = hyp["batch_size"] +worker = hyp["number_worker"] +select_sampno = hyp["select_sampno"] + +read_dataset(data_path, augment=my_augment) +calc_metrics = CALC_METRICS(nc) + + +# 开始训练 +# 训练 +def model_train(model, train_loader, optimizer, scheduler, loss_func, training_state): + model.train() + train_loss = 0.0 + optimizer.zero_grad() + + pbar = tqdm(enumerate(train_loader), total=len(train_loader), ncols=80) + pbar.set_description(training_state) + for i, (resp, stft, labels) in pbar: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + # 强行归一化数据 + # segments = F.normalize(segments) + # print(segments.size()) + # 减去平均值 + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # 一维卷积在最后一位上卷积 所以输入CNN应为【batch_size, embedding size, sequence size】 + # 所以输入为【batch_size, 1, 3000】 3000 = 30秒 * 100Hz + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + + loss = loss_func(out, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + # 余弦退火传入变量 + # scheduler.step(epoch + i / len(train_loader.dataset)) + # 自适应调整传入变量 + scheduler.step(loss) + + loss_value = loss.item() + train_loss += loss_value + # cur_lr = optimizer.param_groups[-1]['lr'] + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + # if i % 20 == 0: + # pbar.write(calc_metrics.get_matrix(loss=loss_value, cur_lr=cur_lr, epoch=epoch)) + + cur_lr = optimizer.param_groups[-1]['lr'] + train_loss /= len(train_loader) + + calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") + logger.info(training_state) + logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) + calc_metrics.reset() + + +def model_valid(model, valid_loader, wdir, loss_func): + model.eval() + valid_loss = 0.0 + for resp, stft, labels in valid_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + valid_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + valid_loss /= len(valid_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=valid_loss, epoch=epoch, epoch_type="valid")) + global best_f1 + valid_f1 = calc_metrics.metrics[-1].compute() + if valid_f1 > best_f1: + best_f1 = valid_f1 + torch.save(model.state_dict(), wdir / f"best_{epoch}_{str(round(float(valid_f1), 3))}.pt") + torch.save(model.state_dict(), wdir / f"best.pt") + if wandb is not None: + wandb.run.summary["best_f1"] = valid_f1 + calc_metrics.reset() + + +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, stft, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + 
with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + +if __name__ == '__main__': + + try: + import wandb + except ImportError: + wandb = None + prefix = 'wandb: ' + logger.info(f"{prefix}Install Weights & Biases logger with 'pip install wandb'") + + if wandb is not None and wandb.run is None: + wandb_run = wandb.init( + config=hyp, + name=save_dir.stem, + project=hyp["project"], + notes=hyp["Note"], + tags=hyp["tags"], + entity=hyp["entity"], + ) + exam_name = Path("./").absolute().name + + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + summary(model_net, [(1, 120, 1), (1, 1, 121, 26)]) + + time.sleep(3) + if gpu: + model_net.cuda() + + k_folds = 5 + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) + + logger.info('--------------------------------') + for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): + logger.info(f'Start FOLD {fold} / {k_folds}----------------------') + train_set = [select_sampno[i] for i in train_ids] + test_set = [select_sampno[i] for i in test_ids] + logger.info(f'Train_Set:{train_set}') + logger.info(f'Independent_Test_Set:{test_set}') + + sub_save_dir = save_dir / f"KFold_{fold}" + sub_save_dir.mkdir(exist_ok=True, parents=True) + wdir = sub_save_dir / "weights" + wdir.mkdir(exist_ok=True, parents=True) + + hyp["train_set"] = train_set + hyp["test_set"] = test_set + with open(sub_save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + + train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) + valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) + + train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) + valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + + # 重新初始化模型 + del model_net + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + if gpu: + model_net.cuda() + + logger.info(f"Weight is {[1, (len(train_dataset) - train_dataset.count_SA()) / train_dataset.count_SA()]}") + # 损失函数与优化器 + loss_function = nn.CrossEntropyLoss( + weight=torch.Tensor([1, (len(train_dataset) - train_dataset.count_SA()) / train_dataset.count_SA()]).cuda()) + + # loss_func = nn.BCEWithLogitsLoss() + # loss_func = FocalLoss(class_num=nc, alpha=0.75, size_average="sum") + + # momentum + # nesterov 牛顿动量 + # weight_decay L2正则 + # optimizer = torch.optim.SGD(model_net.parameters(), lr=lr, momentum=0.9, nesterov=True, weight_decay=1e-6) + + optimizer = torch.optim.Adam(model_net.parameters(), lr=lr) + # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=int(hyp["T_max"]), + + scheduler = 
torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, + patience=2836, min_lr=1e-8, + verbose=True) + + # 参数记录 + best_f1 = 0 + + for epoch in range(epochs): + model_train(model_net, train_loader, optimizer, scheduler, loss_function, + f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") + model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) + if wandb is not None: + calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/004/model/Hybrid_Net001.py b/exam/004/model/Hybrid_Net001.py new file mode 100644 index 0000000..412ed32 --- /dev/null +++ b/exam/004/model/Hybrid_Net001.py @@ -0,0 +1,97 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:andrew +@file:Hybrid_Net001.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/30 +""" + +import os + +import torch +from torch import nn +from torchinfo import summary +from torch import cat + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" + +# 输入时长 +WHOLE_SEGMENT_SECOND = 30 + +# 呼吸采样率 +RESPIRATORY_FRE = 4 + +# BCG 时频图大小 +BCG_GRAPH_SIZE = (26, 121) + + +class HYBRIDNET001(nn.Module): + def __init__(self, num_classes=2, init_weights=True): + super(HYBRIDNET001, self).__init__() + + self.lstm = nn.LSTM(input_size=1, + hidden_size=16, + num_layers=1, + bidirectional=True, + batch_first=True) + + self.right = nn.Sequential( + nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(3, 3), + stride=(1, 1), padding=(1, 1)), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=(2, 2), padding=1), + nn.BatchNorm2d(16), + + nn.Conv2d(in_channels=16, out_channels=32, kernel_size=(3, 3), + stride=(1, 1), padding=(1, 1)), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=(2, 2), padding=1), + nn.BatchNorm2d(32), + + nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3, 3), + stride=(1, 1), padding=(1, 1)), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=(2, 2), padding=1), + nn.BatchNorm2d(32) + + + + ) + + self.classifier = nn.Sequential( + # nn.Dropout(p=0.5), + nn.Linear((120 * 32 + 32 * 16 * 4), 512), + nn.ReLU(inplace=True), + nn.Linear(512, num_classes), + ) + + if init_weights: + self.initialize_weights() + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') # 何教授方法 + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) # 正态分布赋值 + nn.init.constant_(m.bias, 0) + + def forward(self, x1, x2): + x1, (_, _) = self.lstm(x1) + x2 = self.right(x2) + # print(x1.shape) + # print(x2.shape) + x1 = torch.flatten(x1, start_dim=1) + x2 = torch.flatten(x2, start_dim=1) + x = torch.cat((x1, x2), dim=1) + x = self.classifier(x) + return x + + +if __name__ == '__main__': + model = HYBRIDNET001(2).cuda() + summary(model, [(32, 120, 1), (32, 1, 121, 26)]) diff --git a/exam/004/my_augment.py b/exam/004/my_augment.py new file mode 100644 index 0000000..6d7e891 --- /dev/null +++ b/exam/004/my_augment.py @@ -0,0 +1,51 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:my_augment.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/07/26 +""" +from utils.Preprocessing import BCG_Operation +import numpy as np +from scipy.signal import stft + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 + + +def my_augment(dataset): + dataset 
= dataset - dataset.mean() + dataset = preprocessing.Iirnotch(dataset) + dataset = preprocessing.Butterworth(dataset, "lowpass", low_cut=20, order=6) + dataset_low = preprocessing.Butterworth(dataset, "lowpass", low_cut=0.7, order=6) + dataset_high = preprocessing.Butterworth(dataset, "highpass", high_cut=1, order=6) + dataset = {"low": dataset_low, + "high": dataset_high} + return dataset + + +def get_stft(x, fs, n): + print(len(x)) + f, t, amp = stft(x, fs, nperseg=n) + z = np.abs(amp.copy()) + return f, t, z + + +def my_segment_augment(dataset, SP, EP): + dataset_low = dataset["low"][int(SP) * 100:int(EP) * 100].copy() + dataset_high = dataset["high"][int(SP) * 100:int(EP) * 100].copy() + + dataset_low = dataset_low[::25] + dataset_low = dataset_low.reshape(dataset_low.shape[0], 1) + + _, _, dataset_high = stft(dataset_high, 100, nperseg=50) + dataset_high = dataset_high.astype(np.float).T + dataset_high = dataset_high.reshape(1, dataset_high.shape[0], dataset_high.shape[1]) + + return dataset_low, dataset_high + + +if __name__ == '__main__': + pass diff --git a/exam/004/settings.yaml b/exam/004/settings.yaml new file mode 100644 index 0000000..697a7ed --- /dev/null +++ b/exam/004/settings.yaml @@ -0,0 +1,42 @@ +# environment config +GPU: "1" + + +# dataset config +Path: + dataset: /home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/ + label: ./dataset/ + save: ./output/ + +batch_size: 256 +number_worker: 0 +model_name: HYBRIDNET001 +select_sampno: + - 282 + - 726 + - 229 + - 582 + - 286 + - 966 + - 1000 + - 1004 + - 1006 + - 1009 + +# train hyperparameters config +epoch: 50 +lr: 0.00001 +nc: 1 + +# wandb config +entity: "marques" +project: "Sleep_Apnea_HYBRID00X" +Note: "HYBRID001 NOBD STFT RESP DUAL" +tags: ["ReduceLROnPlateau", "RESP LSTM", "STFT CNN"] + + +# "CW":class_weight + + +# "CosineAnnealingLR" +# "ReduceLROnPlateau" \ No newline at end of file diff --git a/exam/004/test_save_result.py b/exam/004/test_save_result.py new file mode 100644 index 0000000..ae722a0 --- /dev/null +++ b/exam/004/test_save_result.py @@ -0,0 +1,454 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:test_analysis.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/21 +""" + +import os +import sys + +import pandas as pd +import torch.cuda +import numpy as np +import yaml +from matplotlib import pyplot as plt +from tqdm import tqdm +from pathlib import Path +from torch.nn import functional as F +from torch.utils.data import DataLoader +from load_dataset import TestApneaDataset2, read_dataset +from utils.Draw_ConfusionMatrix import draw_confusionMatrix +from torch import nn +from utils.calc_metrics import CALC_METRICS +from my_augment import my_augment, my_segment_augment +from model.Hybrid_Net001 import HYBRIDNET001 + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +exam_path = Path("./output/") + +# 置信率阈值 +thresh = 0.5 +# 间隔最小距离 +thresh_event_interval = 2 +# 最小事件长度 +thresh_event_length = 8 + +# +event_thresh = 1 + +severity_path = Path(r"/home/marques/code/marques/apnea/dataset/loc_first_csa.xlsx") +severity_label = {"all": "none"} +severity_df = pd.read_excel(severity_path) +for one_data in severity_df.index: + one_data = severity_df.loc[one_data] + severity_label[str(one_data["数据编号"])] = one_data["程度"] + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" +gpu = torch.cuda.is_available() + +num_classes = 1 +calc_metrics = CALC_METRICS(num_classes) + +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps 
+data_path = hyp["Path"]["dataset"] +read_dataset(data_path, augment=my_augment) +del hyp + +# 默认取最新的文件夹 +all_output_path, output_path, segments_results_save_path, events_results_save_path, = [None, ] * 4 +my_augment, model_path, label_path, data_path, model, model_name = [None, ] * 6 +train_set, test_set = None, None +loss_func = nn.CrossEntropyLoss() + +columns = ["sampNo", "segmentNo", "label_type", "new_label", "SP", "EP", "pred"] +columns2 = ["sampNo", "severity", "origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", + "acc", "recall", "spec", "pre", "NPV", "F1score", "support"] + + +def set_environment(i): + global output_path, segments_results_save_path, events_results_save_path, model_path, label_path, data_path, \ + model, model_name, train_set, test_set + + output_path = all_output_path[i] + print(output_path) + segments_results_save_path = (output_path / "segments_results") + segments_results_save_path.mkdir(exist_ok=True) + events_results_save_path = (output_path / "events_results") + events_results_save_path.mkdir(exist_ok=True) + + # 加载配置 + with open(output_path / "settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps + data_path = hyp["Path"]["dataset"] + label_path = hyp["Path"]["label"] + train_set = hyp["train_set"] + test_set = hyp["test_set"] + + model_path = output_path / "weights" / "best.pt" + model = eval(hyp["model_name"])() + model_name = hyp["model_name"] + model.load_state_dict(torch.load(model_path)) + model.cuda() + model.eval() + + +def test_and_analysis_and_visual(dataset_type): + if dataset_type == "test": + sampNo = train_set + elif dataset_type == "all_test": + sampNo = test_set + test_dataset = TestApneaDataset2(data_path, label_path, select_sampno=sampNo, dataset_type=dataset_type, + segment_augment=my_segment_augment) + test_loader = DataLoader(test_dataset, batch_size=128, pin_memory=True, num_workers=0) + + test_loss = 0.0 + + df_segment = pd.DataFrame(columns=columns) + + for one in tqdm(test_loader, total=len(test_loader)): + resp, stft, labels = one[:3] + other_info = one[3:] + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + out = model(resp, stft) + + loss = loss_func(out, labels) + + test_loss += loss.item() + + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + + calc_metrics.update(out.cpu(), labels.cpu()) + # one[0] = list(one[0].cpu().numpy()) + # one[1] = list(one[1].cpu().numpy()) + # one = one[1:] + # out = out.view(1, -1).cpu().numpy().tolist() + # one += out + # result_record += [i for i in list(np.array(one, dtype=object).transpose(1, 0))] + + one2 = np.array([i.cpu().numpy() for i in (other_info + [out.squeeze()])]) + one2 = one2.transpose((1, 0)) + df = pd.DataFrame(data=one2, columns=columns) + df_segment = df_segment.append(df, ignore_index=True) + + test_loss /= len(test_loader) + calc_metrics.compute() + print(calc_metrics.get_matrix(loss=test_loss, epoch=0, epoch_type="test")) + calc_metrics.reset() + + df_segment["thresh_label"] = 1 * (df_segment["label_type"] > event_thresh).copy() + df_segment["thresh_Pred"] = 1 * (df_segment["pred"] > thresh).copy() + df_segment["pred"] = df_segment["pred"].copy().apply(lambda x: round(x, 3)) + + # 片段级分析 + df_segment_metrics = analysis_results(df_segment, segments_results_save_path, dataset_type) + + # 绘制混淆矩阵 + # 每个样本都绘制一份 + 
confusionMatrix(df_segment_metrics, segments_results_save_path, dataset_type) + # 绘制柱状图 + + # 事件级分析 + # 对于inner_test 每个编号就是一个事件 + # 而对于整晚的independence_test,需要另行计算 + df_all_event = segment_to_event(df_segment, dataset_type) + df_event_metrics = analysis_results(df_all_event, events_results_save_path, dataset_type, is_event=True) + confusionMatrix(df_event_metrics, events_results_save_path, dataset_type) + + # 剔除质量不好的样本 + df_bad_segment = df_segment[ + (df_segment["label_type"].isin([2, 3])) & (df_segment["new_label"] == 2)] + df_select_segment = df_segment.drop(df_bad_segment.index) + df_select_segment_metrics = analysis_results(df_select_segment, segments_results_save_path / "remove_2", + dataset_type) + df_select_event = segment_to_event(df_select_segment, dataset_type) + df_event_metrics = analysis_results(df_select_event, events_results_save_path / "remove_2", dataset_type, + is_event=True) + + +def analysis_results(df_result, base_path, dataset_type, is_event=False): + if df_result.empty: + print(base_path, dataset_type, "is_empty") + return None + + (base_path / dataset_type).mkdir(exist_ok=True, parents=True) + + all_sampNo = df_result["sampNo"].unique() + df_metrics = pd.DataFrame(columns=columns2) + df_metrics.loc[0] = 0 + df_metrics.loc[0]["sampNo"] = dataset_type + + for index, sampNo in enumerate(all_sampNo): + df = df_result[df_result["sampNo"] == sampNo] + df.to_csv( + base_path / dataset_type / + f"{int(sampNo)}_{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_result.csv", + index=False) + + df_metrics.loc[index + 1] = np.NAN + df_metrics.loc[index + 1]["sampNo"] = str(int(sampNo)) + df_metrics.loc[index + 1]["support"] = df.shape[0] + df_metrics.loc[index + 1]["severity"] = severity_label[str(int(sampNo))] + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # continue + # else: + df_metrics.loc[index + 1]["origin_P"] = df[df["thresh_label"] == 1].shape[0] + df_metrics.loc[index + 1]["origin_N"] = df[df["thresh_label"] == 0].shape[0] + df_metrics.loc[index + 1]["pred_P"] = df[df["thresh_Pred"] == 1].shape[0] + df_metrics.loc[index + 1]["pred_N"] = df[df["thresh_Pred"] == 0].shape[0] + df_metrics.loc[index + 1]["T"] = df[df["thresh_Pred"] == df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["F"] = df[df["thresh_Pred"] != df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["TP"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["FP"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["TN"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + df_metrics.loc[index + 1]["FN"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + + df_metrics.loc[0]["origin_P"] += df_metrics.loc[index + 1]["origin_P"] + df_metrics.loc[0]["origin_N"] += df_metrics.loc[index + 1]["origin_N"] + df_metrics.loc[0]["pred_P"] += df_metrics.loc[index + 1]["pred_P"] + df_metrics.loc[0]["pred_N"] += df_metrics.loc[index + 1]["pred_N"] + df_metrics.loc[0]["T"] += df_metrics.loc[index + 1]["T"] + df_metrics.loc[0]["F"] += df_metrics.loc[index + 1]["F"] + df_metrics.loc[0]["TP"] += df_metrics.loc[index + 1]["TP"] + df_metrics.loc[0]["FP"] += df_metrics.loc[index + 1]["FP"] + df_metrics.loc[0]["TN"] += df_metrics.loc[index + 1]["TN"] + df_metrics.loc[0]["FN"] += df_metrics.loc[index + 1]["FN"] + df_metrics.loc[0]["support"] += df_metrics.loc[index + 
1]["support"] + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[index + 1][col] = df_metrics.loc[index + 1][col] if df_metrics.loc[index + 1][ + col] != 0 else np.NAN + + df_metrics.loc[index + 1]["acc"] = df_metrics.iloc[index + 1]["T"] / df_metrics.iloc[index + 1]["support"] + df_metrics.loc[index + 1]["recall"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["origin_P"] + df_metrics.loc[index + 1]["spec"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["origin_N"] + df_metrics.loc[index + 1]["pre"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["pred_P"] + df_metrics.loc[index + 1]["NPV"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["pred_N"] + df_metrics.loc[index + 1]["F1score"] = 2 * df_metrics.iloc[index + 1]["recall"] * df_metrics.iloc[index + 1][ + "pre"] / (df_metrics.iloc[index + 1]["recall"] + df_metrics.iloc[index + 1]["pre"]) + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[index + 1][col] = 0 if pd.isna(df_metrics.loc[index + 1][col]) else \ + df_metrics.loc[index + 1][col] + df_metrics.loc[index + 1][col] = round(df_metrics.loc[index + 1][col], 3) + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # return None + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[0][col] = df_metrics.loc[0][col] if df_metrics.loc[0][col] != 0 else np.NAN + + df_metrics.loc[0]["acc"] = df_metrics.iloc[0]["T"] / df_metrics.iloc[0]["support"] + df_metrics.loc[0]["recall"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["origin_P"] + df_metrics.loc[0]["spec"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["origin_N"] + df_metrics.loc[0]["pre"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["pred_P"] + df_metrics.loc[0]["NPV"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["pred_N"] + df_metrics.loc[0]["F1score"] = 2 * df_metrics.iloc[0]["recall"] * df_metrics.iloc[0]["pre"] / ( + df_metrics.iloc[0]["recall"] + df_metrics.iloc[0]["pre"]) + for col in ["TP", "TN", "FP", "FN", "acc", "recall", "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[0][col] = 0 if pd.isna(df_metrics.loc[0][col]) else df_metrics.loc[0][col] + df_metrics.loc[0][col] = round(df_metrics.loc[0][col], 3) + + # 在inner_test中根据 分严重程度绘制 + if dataset_type == "test": + all_severity = ["正常", "轻度", "中度", "重度"] + for index, severity in enumerate(all_severity): + df_event = df_metrics[df_metrics["severity"] == severity] + df_temp = pd.DataFrame(columns=columns2) + df_temp.loc[0] = 0 + df_temp.loc[0]["sampNo"] = severity + df_temp.loc[0]["severity"] = str(index + 1) + + df_temp.loc[0]["origin_P"] += df_event["origin_P"].sum() + df_temp.loc[0]["origin_N"] += df_event["origin_N"].sum() + df_temp.loc[0]["pred_P"] += df_event["pred_P"].sum() + df_temp.loc[0]["pred_N"] += df_event["pred_N"].sum() + df_temp.loc[0]["T"] += df_event["T"].sum() + df_temp.loc[0]["F"] += df_event["F"].sum() + df_temp.loc[0]["TP"] += df_event["TP"].sum() + df_temp.loc[0]["FP"] += df_event["FP"].sum() + df_temp.loc[0]["TN"] += df_event["TN"].sum() + df_temp.loc[0]["FN"] += df_event["FN"].sum() + df_temp.loc[0]["support"] += df_event["support"].sum() + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_temp.loc[0][col] = df_temp.loc[0][col] if df_temp.loc[0][col] != 0 else np.NAN + + 
df_temp.loc[0]["acc"] = df_temp.iloc[0]["T"] / df_temp.iloc[0]["support"] + df_temp.loc[0]["recall"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["origin_P"] + df_temp.loc[0]["spec"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["origin_N"] + df_temp.loc[0]["pre"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["pred_P"] + df_temp.loc[0]["NPV"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["pred_N"] + df_temp.loc[0]["F1score"] = 2 * df_temp.iloc[0]["recall"] * df_temp.iloc[0]["pre"] / ( + df_temp.iloc[0]["recall"] + df_temp.iloc[0]["pre"]) + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_temp.loc[0][col] = 0 if pd.isna(df_temp.loc[0][col]) else df_temp.loc[0][col] + df_temp.loc[0][col] = round(df_temp.loc[0][col], 3) + + df_metrics = df_metrics.append(df_temp, ignore_index=True) + + df_backup = df_metrics + df_metrics = df_metrics.astype("str") + df_metrics = df_metrics.sort_values("severity") + df_metrics.to_csv(base_path / dataset_type / + f"{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_all_metrics.csv", + index=False, encoding="gbk") + + return df_backup + + +def confusionMatrix(df_analysis, base_path, dataset_type): + if df_analysis is None: + print(base_path, dataset_type, "is None") + return + + if df_analysis.empty: + print(base_path, dataset_type, "is_empty") + return + classes = ["normal", "SA"] + (base_path / dataset_type / "confusionMatrix").mkdir(exist_ok=True, parents=True) + for one_samp in df_analysis.index: + one_samp = df_analysis.loc[one_samp] + cm = np.array([[one_samp["TN"], one_samp["FP"]], [one_samp["FN"], one_samp["TP"]]]) + draw_confusionMatrix(cm, classes=classes, title=str(one_samp["severity"]) + " " + one_samp["sampNo"], + save_path=base_path / dataset_type / "confusionMatrix" / f"{one_samp['sampNo']}.jpg") + + +def segment_to_event(df_segment, dataset_type): + df_all_event = pd.DataFrame(columns=columns) + all_sampNo = df_segment["sampNo"].unique() + + if dataset_type == "test": + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + all_segments_no = df["segmentNo"].unique() + + for index_se, segment_No in enumerate(all_segments_no): + df_temp = df[df["segmentNo"] == segment_No].copy() + SP = df_temp.iloc[0]["EP"] + EP = df_temp.iloc[-1]["EP"] + 1 + df_event.loc[index_se] = [int(sampNo), segment_No, df_temp.iloc[0]["label_type"], + df_temp.iloc[0]["new_label"], SP, EP, 0] + + thresh_Pred = df_temp["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + for i in value_ar: + if i > thresh_event_length: + df_event.iloc[index_se]["pred"] = 1 + + # df_event.to_csv(events_results / dataset_type / f"{int(sampNo)}_event_results.csv", index=False, + # encoding="gbk") + df_all_event = df_all_event.append(df_event, ignore_index=True) + else: + for index, sampNo in enumerate(all_sampNo): + df_event = 
pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + thresh_Pred = df["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + + for value_index, value in enumerate(value_ar): + SP = index_ar[value_index] + EP = index_ar[value_index] + count_ar[value_index] + # TP, FP + if value > thresh_event_length: + + # label_type = 1 if thresh_Pred2[SP:EP].sum() > 0 else 0 + label_type = df["label_type"][SP:EP].max() + new_label = df["new_label"][SP:EP].max() + df_event = df_event.append(pd.DataFrame([[int(sampNo), SP // 30, label_type, new_label, + SP, EP, thresh_Pred2[SP]]], columns=columns), + ignore_index=True) + if value > 30: + print([int(sampNo), SP // 30, label_type, new_label, SP, EP, thresh_Pred2[SP]]) + # 长度不够 + else: + df["thresh_Pred"][SP:EP] = 0 + + # 对负样本进行统计 + for segment_no in df["segmentNo"].unique(): + df_temp = df[df["segmentNo"] == segment_no] + if df_temp["thresh_Pred"].sum() > 0: + continue + + df_event = df_event.append(pd.DataFrame( + [[int(sampNo), segment_no, df_temp["label_type"].max(), df_temp["new_label"].max(), segment_no * 30, + (segment_no + 1) * 30, 0]], columns=columns), + ignore_index=True) + + df_all_event = df_all_event.append(df_event, ignore_index=True) + + df_temp = df_all_event.loc[:, ["label_type", "pred"]] + df_all_event["thresh_label"] = 1 * (df_temp["label_type"] > event_thresh) + df_all_event["thresh_Pred"] = 1 * (df_temp["pred"] > thresh) + return df_all_event + + +# 分sampNo保存结果,并不重合地可视化 +# inner_test + +# 分sampNo将与标签不一致的另行保存,并不重合地可视化 + +# import shap +# explainer = shap.TreeExplainer() +# shap_values = explainer.shap_values() + +if __name__ == '__main__': + all_output_path = list(exam_path.rglob("KFold_*")) + for exam_index, test_exam_path in enumerate(all_output_path): + # test_exam_path = exam_path / test_exam_path + set_environment(exam_index) + test_and_analysis_and_visual(dataset_type="test") + test_and_analysis_and_visual(dataset_type="all_test") diff --git a/exam/004/utils/Draw_ConfusionMatrix.py b/exam/004/utils/Draw_ConfusionMatrix.py new file mode 100644 index 0000000..591af60 --- /dev/null +++ b/exam/004/utils/Draw_ConfusionMatrix.py @@ -0,0 +1,46 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:Draw_ConfusionMatrix.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/08/10 +""" +import numpy as np +from matplotlib import pyplot as plt + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 + + +def draw_confusionMatrix(cm, classes, title, save_path, cmap=plt.cm.Blues): + fig_cm, ax = plt.subplots(figsize=(8, 8), dpi=120) + im = ax.imshow(cm, interpolation='nearest', cmap=cmap) + ax.figure.colorbar(im, ax=ax) + ax.set(xticks=np.arange(cm.shape[1]), + yticks=np.arange(cm.shape[0]), + xticklabels=classes, yticklabels=classes, + title=title, + ylabel='True label', + xlabel='Predicted label') + ax.set_ylim(len(classes) - 0.5, -0.5) + 
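+    # Pinning the y-limits works around the matplotlib 3.1.x regression that
+    # crops the top and bottom rows of imshow-based heatmaps in half.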
+ # Rotate the tick labels and set their alignment. + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") + normalize = False + fmt = '.2f' if normalize else 'd' + thresh = cm.max() * 0.8 + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(cm[i, j], fmt), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black") + fig_cm.tight_layout() + fig_cm.savefig(save_path) + plt.close() + # + + +if __name__ == '__main__': + pass diff --git a/exam/004/utils/Preprocessing.py b/exam/004/utils/Preprocessing.py new file mode 100644 index 0000000..e3b60c4 --- /dev/null +++ b/exam/004/utils/Preprocessing.py @@ -0,0 +1,181 @@ +# encoding:utf-8 + +""" +@ date: 2020-09-16 +@ author: jingxian +@ illustration: Pre-processing +""" + + +import sys +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import pywt +from scipy import signal +from scipy import fftpack + + +def Dilate(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.min(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def Eorde(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.max(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def fin_turn(data, peak): + if len(data) == 0 or len(peak) == 0: return peak + return_peak = [] + for p in peak: + minx, maxx = max(0, p - 100), min(len(data), p + 100) + return_peak.append(minx + np.argmax(data[minx: maxx])) + return return_peak + + +class BCG_Operation(): + def __init__(self, sample_rate=1000): + self.sample_rate = sample_rate + + def down_sample(self, data=None, down_radio=10): + if data is None: + raise ValueError("data is None, please given an real value!") + data = data[:len(data) // down_radio * down_radio].reshape(-1, down_radio)[:, 0] + self.sample_rate = self.sample_rate / down_radio + return data + + def Splitwin(self, data=None, len_win=None, coverage=1.0, calculate_to_end=False): + """ + 分窗 + :param len_win: length of window + :return: signal windows + """ + if (len_win is None) or (data is None): + raise ValueError("length of window or data is None, please given an real value!") + else: + length = len_win * self.sample_rate # number point of a window + # step of split windows + step = length * coverage + start = 0 + Splitdata = [] + while (len(data) - start >= length): + Splitdata.append(data[int(start):int(start + length)]) + start += step + if calculate_to_end and (len(data) - start > 2000): + remain = len(data) - start + start = start - step + step = int(remain / 2000) + start = start + step * 2000 + Splitdata.append(data[int(start):int(start + length)]) + return np.array(Splitdata), step + elif calculate_to_end: + return np.array(Splitdata), 0 + else: + return np.array(Splitdata) + + def Butterworth(self, data, type, low_cut=0.0, high_cut=0.0, order=10): + """ + :param type: Type of Butter. filter, lowpass, bandpass, ... 
+ :param lowcut: Low cutoff frequency + :param highcut: High cutoff frequency + :param order: Order of filter + :return: Signal after filtering + """ + if type == "lowpass": # 低通滤波处理 + b, a = signal.butter(order, low_cut / (self.sample_rate * 0.5), btype='lowpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "bandpass": # 带通滤波处理 + low = low_cut / (self.sample_rate * 0.5) + high = high_cut / (self.sample_rate * 0.5) + b, a = signal.butter(order, [low, high], btype='bandpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "highpass": # 高通滤波处理 + b, a = signal.butter(order, high_cut / (self.sample_rate * 0.5), btype='highpass') + return signal.filtfilt(b, a, np.array(data)) + else: # 警告,滤波器类型必须有 + raise ValueError("Please choose a type of fliter") + + def MorphologicalFilter(self, data=None, M=200, get_bre=False): + """ + :param data: Input signal + :param M: Length of structural element + :return: Signal after filter + """ + if not data.any(): + raise ValueError("The input data is None, please given real value data") + g = np.ones(M) + Data_pre = np.insert(data, 0, np.zeros(M)) + Data_pre = np.insert(Data_pre, -1, np.zeros(M)) + # Opening: 腐蚀 + 膨胀 + out1 = Eorde(Data_pre, len(Data_pre), g, M) + out2 = Dilate(out1, len(out1), g, M) + out2 = np.insert(out2, 0, np.zeros(M - 2)) + # Closing: 膨胀 + 腐蚀 + out5 = Dilate(Data_pre, len(Data_pre), g, M) + out6 = Eorde(out5, len(out5), g, M) + out6 = np.insert(out6, 0, np.zeros(M - 2)) + + baseline = (out2 + out6) / 2 + # -------------------------保留剩余价值------------------------ + data_filtered = Data_pre[:len(baseline)] - baseline + data_filtered = data_filtered[M: M + len(data)] + baseline = baseline[M:] + data_filtered[-1] = data_filtered[-2] = data_filtered[-3] + baseline[-1] = baseline[-2] = baseline[-3] + if get_bre: + return data_filtered, baseline + else: + return data_filtered + + def Iirnotch(self, data=None, cut_fre=50, quality=3): + """陷波器""" + b, a = signal.iirnotch(cut_fre / (self.sample_rate * 0.5), quality) + return signal.filtfilt(b, a, np.array(data)) + + def ChebyFilter(self, data, rp=1, type=None, low_cut=0, high_cut=0, order=10): + """ + 切比雪夫滤波器 + :param data: Input signal + :param rp: The maximum ripple allowed + :param type: 'lowpass', 'bandpass, 'highpass' + :param low_cut: Low cut-off fre + :param high_cut: High cut-off fre + :param order: The order of filter + :return: Signal after filter + """ + if type == 'lowpass': + b, a = signal.cheby1(order, rp, low_cut, btype='lowpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif type == 'bandpass': + b, a = signal.cheby1(order, rp, [low_cut, high_cut], btype='bandpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif type == 'highpass': + b, a = signal.cheby1(order, rp, high_cut, btype='highpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + else: + raise ValueError("The type of filter is None, please given the real value!") + + def Envelope(self, data): + """取信号包络""" + if len(data) <= 1: raise ValueError("Wrong input data") + hx = fftpack.hilbert(data) + return np.sqrt(hx ** 2, data ** 2) + + def wavelet_trans(self, data,c_level=['aaa','aad'], wavelet='db4', mode='symmetric',maxlevel=10): + wp = pywt.WaveletPacket(data=data, wavelet=wavelet, mode=mode, maxlevel=maxlevel) + new_wp = pywt.WaveletPacket(data=None, wavelet=wavelet, mode=mode) + for c in c_level : + new_wp[c] = wp[c] + return new_wp.reconstruct() + + # def em_decomposition(self, data): + # from pyhht.emd import 
EMD + # return EMD(data).decompose() + diff --git a/exam/004/utils/calc_metrics.py b/exam/004/utils/calc_metrics.py new file mode 100644 index 0000000..dcf78cc --- /dev/null +++ b/exam/004/utils/calc_metrics.py @@ -0,0 +1,84 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:calc_metrics.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/12 +""" +import torch +import torchmetrics + + +class CALC_METRICS: + metrics = [] + nc = 0 + + def __init__(self, nc): + self.nc = nc + self.metrics.append(torchmetrics.Accuracy(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Recall(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Precision(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Specificity(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.F1Score(average="none", num_classes=nc, multiclass=False)) + self.valid_result = self.train_result = None + + def update(self, pred, target): + for part1 in self.metrics: + part1.update(pred.cpu(), target.cpu()) + + def compute(self): + result = [] + for part1 in self.metrics: + result.append(part1.compute()) + + def reset(self): + for part1 in self.metrics: + part1.reset() + + def get_matrix(self, loss=None, cur_lr=None, epoch=None, epoch_type=None): + temp_result = [] + for j in self.metrics: + compute_result = (j.compute().cpu().numpy() * 100).tolist() + temp_result.append(compute_result) + + if epoch_type == "train": + self.train_result = [loss] + temp_result + elif epoch_type == "valid": + self.valid_result = [loss] + temp_result + else: + pass + + a = "" + a += f"{epoch_type} epoch: {str(epoch)} loss: {str(loss)} lr: {str(cur_lr)} \n" + a += " " * 8 + "Acc".center(8) + "Rec".center(8) + "Pre".center(8) + "Spe".center(8) + "F1".center(8) + "\n" + a += "all".center(8) + "".join([str(round(float(i), 2)).center(8) for i in temp_result]) + "\n" + return a + + def wandb_log(self, wandb=None, cur_lr=None): + if wandb is None: + return + + keyword = ["Accuracy", "Recall", "Precision", "Specificity", "F1Score"] + dict_key = [] + for epoch_type in ["train", "valid"]: + dict_key.append(epoch_type + "/" + "loss") + for i in keyword: + dict_key.append(epoch_type + "/" + i) + + log_dict = dict(zip(dict_key, self.train_result + self.valid_result)) + log_dict["lr"] = cur_lr + wandb.log(log_dict) + + +if __name__ == '__main__': + # pred = [[0.1], [0.2], [0.3], [0.4], [0.5], [0.6], [0.7], [0.8], [0.9], [1.0]] + # true = [[0], [0], [1], [0], [0], [0], [0], [0], [0], [1]] + pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] + true = [0, 0, 1, 0, 0, 0, 0, 0, 0, 1] + pred = torch.tensor(pred).cuda() + true = torch.tensor(true).cuda() + calc_metrics = CALC_METRICS(1) + calc_metrics.update(pred, true) + print(calc_metrics.get_matrix()) diff --git a/exam/005/generate_label_14.py b/exam/005/generate_label_14.py new file mode 100644 index 0000000..cc5f96d --- /dev/null +++ b/exam/005/generate_label_14.py @@ -0,0 +1,561 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:generate_label_11.0.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/05 +""" +# 14.0 +# 手动均衡数量 + +# 13.0 +# 限制选择部分数据集,先做测试 + +# 12.0 +# 置不可用事件的片段为上限,不可用片段设置为背景,不记录事件 + +# 10.0 +# 使用提出质量差的信号 + +# 9.0 +# 增加 最新的质量标签 未使用 + +# 8.0 +# 生成 除低通气所有事件标签 + +# 尝试过步进两秒 会造成不足两秒的数据被抛弃,造成较多误判,但是可以考虑囊括这部分 +# 采用 30秒数据 移动 1秒 将所有呼吸暂停标注为1 低通气为0 正常为0 + +# 预处理操作 为 50Hz陷波滤波器去工频 外加 
20Hz的低通滤波器 这个20Hz要看BCG信号的频谱范围 + +# 先提剔除极端值 +# 数值大于最高基准线或最低基准线 +# type1: average:1800 low:1200 high:2400 +# type2: average:2400 low:1800 high:3000 + +# 过多片段会造成平均值偏移 +# TODO +# 加入体动标签,计算除体动外的平均值 + +# 最后降采为100hz + +import time +import logging +import numpy as np +import pandas as pd +from pathlib import Path + +from datetime import datetime + +import yaml +from pathos import multiprocessing +from tqdm import tqdm + +# 数据集 和 标签 位置 +bcg_numpy_data_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/") +bcg_label_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_label_0616/") + +# BCG 记录开始时间 +bcg_start_time = np.loadtxt(Path(r"/home/marques/code/marques/apnea/dataset/start_time.csv"), delimiter=', ', + dtype=object) +bcg_start_time = dict(zip(bcg_start_time[:, 0], bcg_start_time[:, 1])) + +# 读取每个数据集路径 +all_numpy_dataset = list(bcg_numpy_data_path.rglob("*.npy")) +all_numpy_dataset.sort() + +# 划分后的数据集保存路径 +# dataset_save_path = Path(r"/home/marques/code/marques/apnea/dataset/dataset/dataset0623_300_30_30/") +dataset_save_path = Path(r"./dataset/") +dataset_save_path.mkdir(exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) +fh = logging.FileHandler(dataset_save_path / (realtime + ".log"), mode='w') +fh.setLevel(logging.NOTSET) +# fh.setFormatter(logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(ch) + +# all_label = [] +# 输出统计数据标题栏 +# logger.info("sampNo".center(8) + +# "hpy_num".center(8) + "hpy_time".center(10) + +# "csa_num".center(8) + "csa_time".center(10) + +# "osa_num".center(8) + "osa_time".center(10) + +# "msa_num".center(8) + "msa_time".center(10) +# ) + +logger.info("sampNo".center(8) + ',' + + "train_num".center(10) + ',' + "train_P".center(10) + ',' + "train_N".center(10) + ',' + + "valid_num".center(10) + ',' + "valid_P".center(10) + ',' + "valid_N".center(10) + ',' + + "test_num".center(10) + ',' + "test_P".center(10) + ',' + "test_N".center(10) + ',' + + "train_eve".center(10) + ',' + "valid_eve".center(10) + ',' + "test_eve".center(10) + ) + +base_random_seed = 42 + +window_second = 30 +step_second = 1 + +valid_ratio = 0.1 +test_ratio = 0.1 + +normal_event_quality_label = 0 +# valid_ratio = 5000 +# test_ratio = 10000 + +assert ((valid_ratio + test_ratio) < 1 and 0 < valid_ratio < 1 and 0 < test_ratio < 1) or ( + valid_ratio > 1 and valid_ratio > 1), "验证集与测试集输入应同时为比例或数量" + +# dataset sampNo for test +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) +select_dataset = hyp["select_sampno"] + +# 需要置成0的片段,前面不一定补零,还有可能上万 +disable_segment = { + '221': [[0, 10000]], + '670': [[0, 20000]], + '683': [[0, 20000]], + '704': [[0, 26000]], + '726': [[0, 20000]], + '736': [[0, 47000]], + '933': [[0, 773560]], + '935': [[0, 26600]], + '952': [[0, 17000]], + '955': [[0, 78000]], + '961': [[0, 107000]], + '962': [[0, 15100]], + '966': [[0, 13120]], + '967': [[0, 44000]], + '1006': [[0, 60000]], + '1009': [[0, 1000]], + '1010': [[0, 49000]], + '1296': [[0, 27000]], + '1300': [[0, 33800]], + '1301': [[0, 14000]], + '1302': [[0, 5600]], + '1374': [[0, 1000]], + '1478': [[0, 998000]], + +} + + +# 生成数据集主函数 +def generate_label(No, dataset_path): + """ + :param dataset_path: 数据集路径 + :return: + 
""" + + # 获取数据编号 + sampNo = dataset_path.stem.split("samp")[0] + # 标签路径 + label_path = bcg_label_path / f"export{sampNo}_all.csv" + + if not label_path.exists(): + raise FileNotFoundError(f"{label_path} not exist") + + if not dataset_path.exists(): + raise Exception(f"{dataset_path} not exists") + + # 加载数据集 + select_numpy = np.load(dataset_path) + select_numpy_len = len(select_numpy) + + # 开头不合理片段置零 + if sampNo in disable_segment.keys(): + for sp, ep in disable_segment[sampNo]: + select_numpy[sp:ep] = 0 + + # 剔除质量差信号 + if sampNo == "670": + select_numpy = select_numpy[:17195 * 100] + + # 获取前面补了多少0 + not_zero_point = 0 + for num in select_numpy: + if num > 10: + break + not_zero_point += 1 + not_zero_point //= 100 + + # 读取标签 + label_csv = pd.read_csv(label_path, encoding='gbk') + label_csv["new_start"] = label_csv["new_start"].astype("int") + label_csv["new_end"] = label_csv["new_end"].astype("int") + label_csv["Duration"] = label_csv["Duration"].astype("int") + label_csv["new_label"] = label_csv["new_label"].fillna("2") + label_csv["new_label"] = label_csv["new_label"].astype("int") + # 剔除质量不好的样本 + # drop_csv = label_csv[ + # (label_csv["Event type"].isin(["Central apnea", "Obstructive apnea"])) & (label_csv["new_label"] == 2)] + # label_csv = label_csv.drop(drop_csv.index) + + # 事件片段与背景片段, 每个背景长度均为设定窗长 + segment_labels = [] + negative_labels = [] + hpy_num = csa_num = osa_num = msa_num = 0 + hpy_time = csa_time = osa_time = msa_time = 0 + + # 遍历全部事件并统计 + for i in range(len(label_csv)): + # 进行LabelEncoder + label = label_csv.iloc[i, :] + # 如果事件在补零片段,则不添加到事件列表 + if label["new_end"] < not_zero_point: + continue + + if sampNo == "670" and label["new_start"] > 17195: + continue + + if label["new_end"] - label["new_start"] < 10: + continue + + # 将事件添加到事件列表 + if label["Event type"] == "Hypopnea": + label_type = 1 + hpy_num += 1 + hpy_time += label["new_end"] - label["new_start"] + + # 将低通气添加到背景 好像不用专门加入到负样本事件中? 
+ # negative_labels.append( + # [sampNo, i, label_type, normal_event_quality_label, label["new_start"], label["new_end"]]) + continue + elif label["Event type"] == "Central apnea": + label_type = 2 + csa_num += 1 + csa_time += label["new_end"] - label["new_start"] + elif label["Event type"] == "Obstructive apnea": + label_type = 3 + osa_num += 1 + osa_time += label["new_end"] - label["new_start"] + # MSA 认为是OSA + elif label["Event type"] == "Mixed apnea": + label_type = 3 + msa_num += 1 + msa_time += label["new_end"] - label["new_start"] + else: + continue + # label_type = 0 + if label["new_end"] - label["new_start"] > label["Duration"] + 20: + print(sampNo, label) + + # 格式为 样本编号 第几个事件 标签 开始事件 结束事件 + segment_labels.append([sampNo, i, label_type, label["new_label"], label["new_start"], label["new_end"]]) + + # logger.info(sampNo.center(8) + + # str(hpy_num).center(8) + str(hpy_time).center(10) + + # str(csa_num).center(8) + str(csa_time).center(10) + + # str(osa_num).center(8) + str(osa_time).center(10) + + # str(msa_num).center(8) + str(msa_time).center(10)) + + # 设置随机树种子 + random_seed = base_random_seed + int(sampNo) + + # 对于无事件的样本,直接将所有的片段 + if len(segment_labels) == 0: + # 剔除补零片段(把开始点移动到补零结束) + normal_SP = not_zero_point + # 开头至少满足一个窗长 + if normal_SP < window_second: + normal_SP = window_second + + # 结束点为样本总长 除以 采样率 + normal_EP = select_numpy_len // 100 + label_type = 0 + # 正常时间编号为秒数 除以 30,即epoch + negative_labels += [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, + SP1 + window_second] for SP1 in + range(normal_SP - window_second + step_second, normal_EP - window_second + step_second, + window_second)] + + # 对于有事件的样本 + # 遍历事件,获取事件之间的背景片段 + for index in range(len(segment_labels) + 1): + # 前一个事件的结尾 与 下一事件开头即为背景 + # 对于开头的无事件片段,则设定开始事件为0 + if index == 0: + normal_SP = 0 + # 非开头片段,开始点为上一个事件的结尾 + else: + # 加一秒 作为 缓冲 + normal_SP = segment_labels[index - 1][-1] + 1 + + # 最后一个事件则取到样本片段结尾 + if index == len(segment_labels): + normal_EP = select_numpy_len // 100 - window_second + # 否则结束事件取 本事件开头 + else: + # 减一秒 作为 缓冲 + normal_EP = segment_labels[index][-2] - 1 + + # 剔除包含开头补零的片段 + if normal_EP < not_zero_point: + continue + + # 剔除开头不足30s的正常片段 + if normal_SP < window_second: + continue + label_type = 0 + + # 将背景事件按照滑窗距离逐个加入到背景事件中 + temp_1 = [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, SP1 + window_second] + for SP1 in range(normal_SP - window_second, normal_EP - window_second, window_second)] + + negative_labels += temp_1 + + train_label, valid_label, test_label = [], [], [] + # assert (valid_ratio + test_ratio) < 1 <= len(segment_labels), f"{sampNo}训练集与测试集数量应小于总数据集数量{len(segment_labels)}" + + # 对于测试数据全部直接保存 + if int(sampNo) in select_dataset: + event_label = np.zeros(select_numpy_len // 100) + quality_label = np.zeros(select_numpy_len // 100) + # 用于存储事件标签 + for PN, segmentNo, label_type, new_label, SP, EP in segment_labels: + event_label[SP:EP] = label_type + + test_label = [] + # 剔除补零片段 + normal_SP = not_zero_point + if normal_SP < window_second: + normal_SP = window_second + normal_EP = select_numpy_len // 100 + + # 分成指定窗长的滑窗片段 + test_label += [ + [sampNo, SP1 // 30, int(event_label[SP1 + window_second - step_second]), + int(quality_label[SP1 + window_second - step_second]), + SP1, SP1 + window_second] for SP1 in range(normal_SP - window_second + step_second, + normal_EP - window_second + step_second, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + 
',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center(10) + + ',' + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + str(len(segment_labels)).center(10) + ) + + df2.loc[No] = [sampNo, + str(0), str(0), str(0), + str(0), str(0), str(0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(0), str(0), str(len(segment_labels))] + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_all_{window_second}s_sa_test2_label.npy", + # np.array(test_label)) + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_all_label.csv", + index=False) + + train_label, valid_label, test_label = [], [], [] + # 对于训练与验证集样本 + if True: + # 打乱片段顺序 + np.random.seed(random_seed) + np.random.shuffle(segment_labels) + np.random.shuffle(negative_labels) + + # 获取训练集、验证集、测试集分到事件个数 + if 0 < valid_ratio < 1: + train_segment_num = int(len(segment_labels) * (1 - valid_ratio - test_ratio)) + valid_segment_num = int(len(segment_labels) * (1 - test_ratio)) + else: + train_segment_num = len(segment_labels) - valid_ratio - test_ratio + valid_segment_num = valid_ratio + + # 分别将各事件切分为30s, 步进1秒的片段 + for index in range(train_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + train_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(train_segment_num, valid_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + valid_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(valid_segment_num, len(segment_labels)): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + test_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second * step_second)] + + # 计算片段和事件个数 + train_num, valid_num, test_num = len(train_label), len(valid_label), len(test_label) + train_eve, valid_eve, test_eve = train_segment_num, (valid_segment_num - train_segment_num), ( + len(segment_labels) - valid_segment_num) + + # 数据集补偿 + # if train_num < 300: + # train_num = 300 - train_num + # + # if valid_num < 300: + # valid_num = 300 - valid_num + # + # if test_num < 300: + # test_num = 300 - test_num + + # 获取训练集、验证集、测试集分到背景个数 + # if 0 < valid_ratio < 1: + # train_eve2 = int(len(negative_labels) * (1 - valid_ratio - test_ratio)) + # valid_eve2 = int(len(negative_labels) * valid_ratio) + # else: + # train_eve2 = len(negative_labels) - valid_ratio - test_ratio + # valid_eve2 = valid_ratio + # + # test_eve2 = len(negative_labels) - train_eve2 - valid_eve2 + # 直接补充到足够个数的背景事件 + train_eve2 = max(train_eve, 300) + valid_eve2 = max(valid_eve, 40) + test_eve2 = max(test_eve, 40) + + # 强制背景数量 + # train_eve2 = train_eve + # valid_eve2 = valid_eve + # test_eve2 = test_eve + + # 添加背景事件数量 + for sampNo, 
index, label_type, new_label, normal_SP, normal_EP in negative_labels[:train_eve2]: + label_type = int(label_type) + train_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2: train_eve2 + valid_eve2]: + label_type = int(label_type) + valid_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2 + valid_eve2:train_eve2 + valid_eve2 + test_eve2]: + label_type = int(label_type) + test_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(normal_SP, normal_EP, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(len(train_label)).center(10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(len(valid_label)).center(10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(train_eve).center(10) + ',' + str(valid_eve).center(10) + ',' + str(test_eve).center(10) + ) + + df2.loc[No] = [sampNo.center(8), + str(len(train_label)), + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0), + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0), + str(len(valid_label)), + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0), + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(train_eve), str(valid_eve), str(test_eve).center(10)] + + def label_check(label_list): + temp_list = [] + for sampNo, index, label_type, new_label, SP, EP in label_list: + if EP - SP < window_second: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if SP < 0: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, SP, EP, len(select_numpy[SP * 100:EP * 100])) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + for j in temp_list: + label_list.remove(j) + + label_check(train_label) + label_check(valid_label) + label_check(test_label) + for sampNo, index, label_type, new_label, SP, EP in train_label: + if EP - SP < window_second: + print(sampNo, index, label_type, new_label, SP, EP) + + if SP < 0: + print(sampNo, index, label_type, new_label, SP, EP) + + if len(select_numpy[SP * 100:EP * 100]) != 
window_second * 100: + print(sampNo, index, label_type, new_label, SP, EP, len(select_numpy[SP * 100:EP * 100])) + + df1 = pd.DataFrame(data=train_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.csv", + index=False) + + df1 = pd.DataFrame(data=valid_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.csv", + index=False) + + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.csv", index=False) + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.npy", + # np.array(train_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.npy", + # np.array(valid_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.npy", + # np.array(test_label)) + + +if __name__ == '__main__': + # pool = multiprocessing.Pool(processes=44) + # pool.map(generate_label, list(all_numpy_dataset)) + # pool.close() + # pool.join() + + df2 = pd.DataFrame(data=None, + columns=["sampNo", + "train_num", "train_P", "train_N", + "valid_num", "valid_P", "valid_N", + "test_num", "test_P", "test_N", + "train_eve", "valid_eve", "test_eve"]) + + temp = [] + for one_dataset in all_numpy_dataset: + if int(one_dataset.stem.split("samp")[0]) in [*select_dataset]: + temp.append(one_dataset) + # for one_dataset in temp: + # all_numpy_dataset.remove(one_dataset) + + for No, one_dataset in enumerate(temp): + generate_label(No, one_dataset) + + df2.to_csv(dataset_save_path / (realtime + ".csv"), index=False) + # generate_label(all_numpy_dataset[0]) diff --git a/exam/005/load_dataset.py b/exam/005/load_dataset.py new file mode 100644 index 0000000..99806f8 --- /dev/null +++ b/exam/005/load_dataset.py @@ -0,0 +1,178 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:load_dataset.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2021/12/03 +""" +import sys +from pathlib import Path +import pandas as pd +import numpy as np +import torch.utils.data +from torch.utils.data import Dataset +from tqdm import tqdm +from utils.Preprocessing import BCG_Operation + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 +""" +1. 
读取方法 +# 无论是否提前切分,均提前转成npy格式 +# 1.1 提前预处理,切分好后生成npy,直接载入切分好的片段 内存占用多 读取简单 +使用此方法: 1.2 提前预处理,载入整夜数据,切分好后生成csv或xls,根据片段读取 内存占用少 读取较为复杂 +""" + +datasets = {} + + +# 减少重复读取 +def read_dataset(data_path, augment=None): + data_path = Path(data_path) + try: + f = [] + if data_path.is_dir(): + dataset_list = list(data_path.rglob("*.npy")) + dataset_list.sort() + f += dataset_list + elif data_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{data_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {data_path}: {e} \n') + + print("loading dataset") + for i in tqdm(f): + select_dataset = np.load(i) + select_dataset = preprocessing.Butterworth(select_dataset, "lowpass", low_cut=20, order=3) + if augment is not None: + select_dataset = augment(select_dataset) + datasets[i.name.split("samp")[0]] = select_dataset + + +# 用第二种方法读取 +class ApneaDataset(Dataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + self.data_path = data_path + self.label_path = label_path + self.segment_augment = segment_augment + self.labels = None + self.dataset_type = dataset_type + self.select_sampNo = select_sampno + + # self._getAllData() + self._getAllLabels() + + def __getitem__(self, index): + # PN patience number + # SP/EP start point, end point + # temp_label.append([sampNo, label[-1], i, hpy_num, csa_num, osa_num, mean_low, flow_low]) + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1)) + else: + raise Exception(f'dataset read failure!') + + def count_SA(self): + return sum(self.labels[:, 3] > 1) + + def __len__(self): + return len(self.labels) + + def _getAllLabels(self): + label_path = Path(self.label_path) + if not label_path.exists(): + raise Exception(f'{self.label_path} does not exist') + + try: + f = [] + if label_path.is_dir(): + if self.dataset_type == "train": + label_list = list(label_path.rglob("*_train_label.csv")) + elif self.dataset_type == "valid": + label_list = list(label_path.rglob("*_valid_label.csv")) + elif self.dataset_type == "test": + label_list = list(label_path.glob("*_sa_test_label.csv")) + # label_list = list(label_path.rglob("*_test_label.npy")) + elif self.dataset_type == "all_test": + label_list = list(label_path.rglob("*_sa_all_label.csv")) + else: + raise ValueError("self.dataset type error") + # label_list = list(label_path.rglob("*_label.npy")) + label_list.sort() + f += label_list + elif label_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{self.label_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {self.label_path}: {e} \n') + print("loading labels") + for i in tqdm(f): + if int(i.name.split("_")[0]) not in self.select_sampNo: + continue + + if self.labels is None: + self.labels = pd.read_csv(i).to_numpy(dtype=int) + else: + labels = pd.read_csv(i).to_numpy(dtype=int) + if len(labels) > 0: + self.labels = np.concatenate((self.labels, labels)) + # self.labels = self.labels[:10000] + print(f"{self.dataset_type} length is {len(self.labels)}") + + +class TestApneaDataset(ApneaDataset): + def __init__(self, data_path, label_path, dataset_type, select_sampno, segment_augment=None): + 
super(TestApneaDataset, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + select_sampno=select_sampno, + segment_augment=segment_augment + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + segment = datasets[str(PN)][int(SP) * 100:int(EP) * 100].copy() + if self.segment_augment is not None: + segment = self.segment_augment(segment) + return segment, int(float(label_type) > 1), PN, segmentNo, SP, EP + else: + raise Exception(f'dataset read failure!') + + +class TestApneaDataset2(ApneaDataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + super(TestApneaDataset2, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + segment_augment=segment_augment, + select_sampno=select_sampno + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1), PN, segmentNo, label_type, new_label, SP, EP) + else: + raise Exception(f'dataset read failure!') + + +if __name__ == '__main__': + pass diff --git a/exam/005/main.py b/exam/005/main.py new file mode 100644 index 0000000..fead3da --- /dev/null +++ b/exam/005/main.py @@ -0,0 +1,284 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@time:2021/10/15 +""" +import os + +import yaml +import logging +from pathlib import Path + +import time +from torch.nn import functional as F +from torch.utils.data import DataLoader +import torch.cuda +from tqdm import tqdm +from torchinfo import summary +from load_dataset import ApneaDataset, read_dataset + +from torch import nn +from utils.calc_metrics import CALC_METRICS +from sklearn.model_selection import KFold +from model.Hybrid_Net001 import HYBRIDNET001 +# from utils.LossFunction import Foca1lLoss +from my_augment import my_augment, my_segment_augment +# 加载配置 +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) + +os.environ["CUDA_VISIBLE_DEVICES"] = hyp["GPU"] +os.environ["WANDB_MODE"] = "dryrun" + +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) + +# 读取地址参数 +data_path = hyp["Path"]["dataset"] +label_path = hyp["Path"]["label"] + +save_dir = Path(hyp["Path"]["save"]) / (Path(hyp["Path"]["save"]).name + "_" + realtime) +save_dir.mkdir(parents=True, exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +fh = logging.FileHandler(save_dir / (realtime + ".log"), mode='a') +fh.setLevel(logging.NOTSET) +fh.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) +logging.getLogger('matplotlib.font_manager').disabled = True +logger.info("------------------------------------") +logger.info('hyper_parameters: ' + ', '.join(f'{k}={v}\n' for k, v in hyp.items())) + +# 备份配置 +with open(save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + +# Hyper-parameters +gpu = torch.cuda.is_available() +epochs = hyp["epoch"] +lr = hyp["lr"] +nc = hyp["nc"] 
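+# nc = 1: HYBRIDNET001 ends in a single sigmoid unit, so the model outputs one
+# probability per window and the loss used below is a class-weighted BCELoss
+# with weight = (negative windows / positive windows) of the training split.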
+bs = hyp["batch_size"] +worker = hyp["number_worker"] +select_sampno = hyp["select_sampno"] + +read_dataset(data_path, augment=my_augment) +calc_metrics = CALC_METRICS(nc) + + +# 开始训练 +# 训练 +def model_train(model, train_loader, optimizer, scheduler, loss_func, training_state): + model.train() + train_loss = 0.0 + optimizer.zero_grad() + + pbar = tqdm(enumerate(train_loader), total=len(train_loader), ncols=80) + pbar.set_description(training_state) + for i, (resp, stft, labels) in pbar: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + + # 所以输入为【batch_size, 1, 3000】 3000 = 30秒 * 100Hz + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = out.squeeze(-1) + loss = loss_func(out, labels.float()) + optimizer.zero_grad() + loss.backward() + optimizer.step() + # 余弦退火传入变量 + # scheduler.step(epoch + i / len(train_loader.dataset)) + # 自适应调整传入变量 + scheduler.step(loss) + + loss_value = loss.item() + train_loss += loss_value + # cur_lr = optimizer.param_groups[-1]['lr'] + # labels = torch.unsqueeze(labels, dim=1) + # out = F.softmax(out, dim=1) + # out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + # if i % 20 == 0: + # pbar.write(calc_metrics.get_matrix(loss=loss_value, cur_lr=cur_lr, epoch=epoch)) + + cur_lr = optimizer.param_groups[-1]['lr'] + train_loss /= len(train_loader) + + calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") + logger.info(training_state) + logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) + calc_metrics.reset() + + +def model_valid(model, valid_loader, wdir, loss_func): + model.eval() + valid_loss = 0.0 + for resp, stft, labels in valid_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = out.squeeze(-1) + # out = F.softmax(out, dim=1) + loss = loss_func(out, labels.float()) + + valid_loss += loss.item() + # labels = torch.unsqueeze(labels, dim=1) + # out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + valid_loss /= len(valid_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=valid_loss, epoch=epoch, epoch_type="valid")) + global best_f1 + valid_f1 = calc_metrics.metrics[-1].compute() + if valid_f1 > best_f1: + best_f1 = valid_f1 + torch.save(model.state_dict(), wdir / f"best_{epoch}_{str(round(float(valid_f1), 3))}.pt") + torch.save(model.state_dict(), wdir / f"best.pt") + if wandb is not None: + wandb.run.summary["best_f1"] = valid_f1 + calc_metrics.reset() + + +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, stft, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = 
segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = out.squeeze(-1) + # out = F.softmax(out, dim=1) + loss = loss_func(out, labels.float()) + + test_loss += loss.item() + # labels = torch.unsqueeze(labels, dim=1) + # out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + +if __name__ == '__main__': + + try: + import wandb + except ImportError: + wandb = None + prefix = 'wandb: ' + logger.info(f"{prefix}Install Weights & Biases logger with 'pip install wandb'") + + if wandb is not None and wandb.run is None: + wandb_run = wandb.init( + config=hyp, + name=save_dir.stem, + project=hyp["project"], + notes=hyp["Note"], + tags=hyp["tags"], + entity=hyp["entity"], + ) + exam_name = Path("./").absolute().name + + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + summary(model_net, [(1, 120, 1), (1, 1, 121, 26)]) + + time.sleep(3) + if gpu: + model_net.cuda() + + k_folds = 5 + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) + + logger.info('--------------------------------') + for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): + logger.info(f'Start FOLD {fold} / {k_folds}----------------------') + train_set = [select_sampno[i] for i in train_ids] + test_set = [select_sampno[i] for i in test_ids] + logger.info(f'Train_Set:{train_set}') + logger.info(f'Independent_Test_Set:{test_set}') + + sub_save_dir = save_dir / f"KFold_{fold}" + sub_save_dir.mkdir(exist_ok=True, parents=True) + wdir = sub_save_dir / "weights" + wdir.mkdir(exist_ok=True, parents=True) + + hyp["train_set"] = train_set + hyp["test_set"] = test_set + with open(sub_save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + + train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) + valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) + + train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) + valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + + # 重新初始化模型 + del model_net + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + if gpu: + model_net.cuda() + + logger.info(f"Weight is {[(len(train_dataset) - train_dataset.count_SA()) / train_dataset.count_SA()]}") + # 损失函数与优化器 + loss_function = nn.BCELoss( + weight=torch.Tensor([(len(train_dataset) - train_dataset.count_SA()) / train_dataset.count_SA()]).cuda()) + + # loss_func = nn.BCEWithLogitsLoss() + # loss_func = FocalLoss(class_num=nc, alpha=0.75, size_average="sum") + + # momentum + # nesterov 牛顿动量 + # weight_decay L2正则 + # optimizer = torch.optim.SGD(model_net.parameters(), lr=lr, momentum=0.9, nesterov=True, weight_decay=1e-6) + + optimizer = torch.optim.Adam(model_net.parameters(), lr=lr) + # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=int(hyp["T_max"]), + + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, + patience=2836, min_lr=1e-8, + verbose=True) + + # 参数记录 + best_f1 = 0 + + for epoch in range(epochs): + model_train(model_net, train_loader, 
optimizer, scheduler, loss_function, + f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") + model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) + if wandb is not None: + calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/005/model/Hybrid_Net001.py b/exam/005/model/Hybrid_Net001.py new file mode 100644 index 0000000..980ad4a --- /dev/null +++ b/exam/005/model/Hybrid_Net001.py @@ -0,0 +1,98 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:andrew +@file:Hybrid_Net001.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/30 +""" + +import os + +import torch +from torch import nn +from torchinfo import summary +from torch import cat + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" + +# 输入时长 +WHOLE_SEGMENT_SECOND = 30 + +# 呼吸采样率 +RESPIRATORY_FRE = 4 + +# BCG 时频图大小 +BCG_GRAPH_SIZE = (26, 121) + + +class HYBRIDNET001(nn.Module): + def __init__(self, num_classes=1, init_weights=True): + super(HYBRIDNET001, self).__init__() + + self.lstm = nn.LSTM(input_size=1, + hidden_size=16, + num_layers=1, + bidirectional=True, + batch_first=True) + + self.right = nn.Sequential( + nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(3, 3), + stride=(1, 1), padding=(1, 1)), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=(2, 2), padding=1), + nn.BatchNorm2d(16), + + nn.Conv2d(in_channels=16, out_channels=32, kernel_size=(3, 3), + stride=(1, 1), padding=(1, 1)), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=(2, 2), padding=1), + nn.BatchNorm2d(32), + + nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3, 3), + stride=(1, 1), padding=(1, 1)), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=(2, 2), padding=1), + nn.BatchNorm2d(32) + + + + ) + + self.classifier = nn.Sequential( + # nn.Dropout(p=0.5), + nn.Linear((120 * 32 + 32 * 16 * 4), 512), + nn.ReLU(inplace=True), + nn.Linear(512, num_classes), + nn.Sigmoid() + ) + + if init_weights: + self.initialize_weights() + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') # 何教授方法 + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) # 正态分布赋值 + nn.init.constant_(m.bias, 0) + + def forward(self, x1, x2): + x1, (_, _) = self.lstm(x1) + x2 = self.right(x2) + # print(x1.shape) + # print(x2.shape) + x1 = torch.flatten(x1, start_dim=1) + x2 = torch.flatten(x2, start_dim=1) + x = torch.cat((x1, x2), dim=1) + x = self.classifier(x) + return x + + +if __name__ == '__main__': + model = HYBRIDNET001(1).cuda() + summary(model, [(32, 120, 1), (32, 1, 121, 26)]) diff --git a/exam/005/my_augment.py b/exam/005/my_augment.py new file mode 100644 index 0000000..6d7e891 --- /dev/null +++ b/exam/005/my_augment.py @@ -0,0 +1,51 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:my_augment.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/07/26 +""" +from utils.Preprocessing import BCG_Operation +import numpy as np +from scipy.signal import stft + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 + + +def my_augment(dataset): + dataset = dataset - dataset.mean() + dataset = preprocessing.Iirnotch(dataset) + dataset = preprocessing.Butterworth(dataset, "lowpass", low_cut=20, order=6) + dataset_low = preprocessing.Butterworth(dataset, 
"lowpass", low_cut=0.7, order=6) + dataset_high = preprocessing.Butterworth(dataset, "highpass", high_cut=1, order=6) + dataset = {"low": dataset_low, + "high": dataset_high} + return dataset + + +def get_stft(x, fs, n): + print(len(x)) + f, t, amp = stft(x, fs, nperseg=n) + z = np.abs(amp.copy()) + return f, t, z + + +def my_segment_augment(dataset, SP, EP): + dataset_low = dataset["low"][int(SP) * 100:int(EP) * 100].copy() + dataset_high = dataset["high"][int(SP) * 100:int(EP) * 100].copy() + + dataset_low = dataset_low[::25] + dataset_low = dataset_low.reshape(dataset_low.shape[0], 1) + + _, _, dataset_high = stft(dataset_high, 100, nperseg=50) + dataset_high = dataset_high.astype(np.float).T + dataset_high = dataset_high.reshape(1, dataset_high.shape[0], dataset_high.shape[1]) + + return dataset_low, dataset_high + + +if __name__ == '__main__': + pass diff --git a/exam/005/settings.yaml b/exam/005/settings.yaml new file mode 100644 index 0000000..697a7ed --- /dev/null +++ b/exam/005/settings.yaml @@ -0,0 +1,42 @@ +# environment config +GPU: "1" + + +# dataset config +Path: + dataset: /home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/ + label: ./dataset/ + save: ./output/ + +batch_size: 256 +number_worker: 0 +model_name: HYBRIDNET001 +select_sampno: + - 282 + - 726 + - 229 + - 582 + - 286 + - 966 + - 1000 + - 1004 + - 1006 + - 1009 + +# train hyperparameters config +epoch: 50 +lr: 0.00001 +nc: 1 + +# wandb config +entity: "marques" +project: "Sleep_Apnea_HYBRID00X" +Note: "HYBRID001 NOBD STFT RESP DUAL" +tags: ["ReduceLROnPlateau", "RESP LSTM", "STFT CNN"] + + +# "CW":class_weight + + +# "CosineAnnealingLR" +# "ReduceLROnPlateau" \ No newline at end of file diff --git a/exam/005/test_save_result.py b/exam/005/test_save_result.py new file mode 100644 index 0000000..ae722a0 --- /dev/null +++ b/exam/005/test_save_result.py @@ -0,0 +1,454 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:test_analysis.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/21 +""" + +import os +import sys + +import pandas as pd +import torch.cuda +import numpy as np +import yaml +from matplotlib import pyplot as plt +from tqdm import tqdm +from pathlib import Path +from torch.nn import functional as F +from torch.utils.data import DataLoader +from load_dataset import TestApneaDataset2, read_dataset +from utils.Draw_ConfusionMatrix import draw_confusionMatrix +from torch import nn +from utils.calc_metrics import CALC_METRICS +from my_augment import my_augment, my_segment_augment +from model.Hybrid_Net001 import HYBRIDNET001 + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +exam_path = Path("./output/") + +# 置信率阈值 +thresh = 0.5 +# 间隔最小距离 +thresh_event_interval = 2 +# 最小事件长度 +thresh_event_length = 8 + +# +event_thresh = 1 + +severity_path = Path(r"/home/marques/code/marques/apnea/dataset/loc_first_csa.xlsx") +severity_label = {"all": "none"} +severity_df = pd.read_excel(severity_path) +for one_data in severity_df.index: + one_data = severity_df.loc[one_data] + severity_label[str(one_data["数据编号"])] = one_data["程度"] + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" +gpu = torch.cuda.is_available() + +num_classes = 1 +calc_metrics = CALC_METRICS(num_classes) + +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps +data_path = hyp["Path"]["dataset"] +read_dataset(data_path, augment=my_augment) +del hyp + +# 默认取最新的文件夹 +all_output_path, output_path, segments_results_save_path, events_results_save_path, = [None, ] 
* 4 +my_augment, model_path, label_path, data_path, model, model_name = [None, ] * 6 +train_set, test_set = None, None +loss_func = nn.CrossEntropyLoss() + +columns = ["sampNo", "segmentNo", "label_type", "new_label", "SP", "EP", "pred"] +columns2 = ["sampNo", "severity", "origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", + "acc", "recall", "spec", "pre", "NPV", "F1score", "support"] + + +def set_environment(i): + global output_path, segments_results_save_path, events_results_save_path, model_path, label_path, data_path, \ + model, model_name, train_set, test_set + + output_path = all_output_path[i] + print(output_path) + segments_results_save_path = (output_path / "segments_results") + segments_results_save_path.mkdir(exist_ok=True) + events_results_save_path = (output_path / "events_results") + events_results_save_path.mkdir(exist_ok=True) + + # 加载配置 + with open(output_path / "settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps + data_path = hyp["Path"]["dataset"] + label_path = hyp["Path"]["label"] + train_set = hyp["train_set"] + test_set = hyp["test_set"] + + model_path = output_path / "weights" / "best.pt" + model = eval(hyp["model_name"])() + model_name = hyp["model_name"] + model.load_state_dict(torch.load(model_path)) + model.cuda() + model.eval() + + +def test_and_analysis_and_visual(dataset_type): + if dataset_type == "test": + sampNo = train_set + elif dataset_type == "all_test": + sampNo = test_set + test_dataset = TestApneaDataset2(data_path, label_path, select_sampno=sampNo, dataset_type=dataset_type, + segment_augment=my_segment_augment) + test_loader = DataLoader(test_dataset, batch_size=128, pin_memory=True, num_workers=0) + + test_loss = 0.0 + + df_segment = pd.DataFrame(columns=columns) + + for one in tqdm(test_loader, total=len(test_loader)): + resp, stft, labels = one[:3] + other_info = one[3:] + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + out = model(resp, stft) + + loss = loss_func(out, labels) + + test_loss += loss.item() + + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + + calc_metrics.update(out.cpu(), labels.cpu()) + # one[0] = list(one[0].cpu().numpy()) + # one[1] = list(one[1].cpu().numpy()) + # one = one[1:] + # out = out.view(1, -1).cpu().numpy().tolist() + # one += out + # result_record += [i for i in list(np.array(one, dtype=object).transpose(1, 0))] + + one2 = np.array([i.cpu().numpy() for i in (other_info + [out.squeeze()])]) + one2 = one2.transpose((1, 0)) + df = pd.DataFrame(data=one2, columns=columns) + df_segment = df_segment.append(df, ignore_index=True) + + test_loss /= len(test_loader) + calc_metrics.compute() + print(calc_metrics.get_matrix(loss=test_loss, epoch=0, epoch_type="test")) + calc_metrics.reset() + + df_segment["thresh_label"] = 1 * (df_segment["label_type"] > event_thresh).copy() + df_segment["thresh_Pred"] = 1 * (df_segment["pred"] > thresh).copy() + df_segment["pred"] = df_segment["pred"].copy().apply(lambda x: round(x, 3)) + + # 片段级分析 + df_segment_metrics = analysis_results(df_segment, segments_results_save_path, dataset_type) + + # 绘制混淆矩阵 + # 每个样本都绘制一份 + confusionMatrix(df_segment_metrics, segments_results_save_path, dataset_type) + # 绘制柱状图 + + # 事件级分析 + # 对于inner_test 每个编号就是一个事件 + # 而对于整晚的independence_test,需要另行计算 + df_all_event = segment_to_event(df_segment, dataset_type) + 
df_event_metrics = analysis_results(df_all_event, events_results_save_path, dataset_type, is_event=True) + confusionMatrix(df_event_metrics, events_results_save_path, dataset_type) + + # 剔除质量不好的样本 + df_bad_segment = df_segment[ + (df_segment["label_type"].isin([2, 3])) & (df_segment["new_label"] == 2)] + df_select_segment = df_segment.drop(df_bad_segment.index) + df_select_segment_metrics = analysis_results(df_select_segment, segments_results_save_path / "remove_2", + dataset_type) + df_select_event = segment_to_event(df_select_segment, dataset_type) + df_event_metrics = analysis_results(df_select_event, events_results_save_path / "remove_2", dataset_type, + is_event=True) + + +def analysis_results(df_result, base_path, dataset_type, is_event=False): + if df_result.empty: + print(base_path, dataset_type, "is_empty") + return None + + (base_path / dataset_type).mkdir(exist_ok=True, parents=True) + + all_sampNo = df_result["sampNo"].unique() + df_metrics = pd.DataFrame(columns=columns2) + df_metrics.loc[0] = 0 + df_metrics.loc[0]["sampNo"] = dataset_type + + for index, sampNo in enumerate(all_sampNo): + df = df_result[df_result["sampNo"] == sampNo] + df.to_csv( + base_path / dataset_type / + f"{int(sampNo)}_{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_result.csv", + index=False) + + df_metrics.loc[index + 1] = np.NAN + df_metrics.loc[index + 1]["sampNo"] = str(int(sampNo)) + df_metrics.loc[index + 1]["support"] = df.shape[0] + df_metrics.loc[index + 1]["severity"] = severity_label[str(int(sampNo))] + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # continue + # else: + df_metrics.loc[index + 1]["origin_P"] = df[df["thresh_label"] == 1].shape[0] + df_metrics.loc[index + 1]["origin_N"] = df[df["thresh_label"] == 0].shape[0] + df_metrics.loc[index + 1]["pred_P"] = df[df["thresh_Pred"] == 1].shape[0] + df_metrics.loc[index + 1]["pred_N"] = df[df["thresh_Pred"] == 0].shape[0] + df_metrics.loc[index + 1]["T"] = df[df["thresh_Pred"] == df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["F"] = df[df["thresh_Pred"] != df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["TP"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["FP"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["TN"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + df_metrics.loc[index + 1]["FN"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + + df_metrics.loc[0]["origin_P"] += df_metrics.loc[index + 1]["origin_P"] + df_metrics.loc[0]["origin_N"] += df_metrics.loc[index + 1]["origin_N"] + df_metrics.loc[0]["pred_P"] += df_metrics.loc[index + 1]["pred_P"] + df_metrics.loc[0]["pred_N"] += df_metrics.loc[index + 1]["pred_N"] + df_metrics.loc[0]["T"] += df_metrics.loc[index + 1]["T"] + df_metrics.loc[0]["F"] += df_metrics.loc[index + 1]["F"] + df_metrics.loc[0]["TP"] += df_metrics.loc[index + 1]["TP"] + df_metrics.loc[0]["FP"] += df_metrics.loc[index + 1]["FP"] + df_metrics.loc[0]["TN"] += df_metrics.loc[index + 1]["TN"] + df_metrics.loc[0]["FN"] += df_metrics.loc[index + 1]["FN"] + df_metrics.loc[0]["support"] += df_metrics.loc[index + 1]["support"] + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[index + 1][col] = df_metrics.loc[index + 1][col] if df_metrics.loc[index + 1][ + col] != 0 else 
np.NAN + + df_metrics.loc[index + 1]["acc"] = df_metrics.iloc[index + 1]["T"] / df_metrics.iloc[index + 1]["support"] + df_metrics.loc[index + 1]["recall"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["origin_P"] + df_metrics.loc[index + 1]["spec"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["origin_N"] + df_metrics.loc[index + 1]["pre"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["pred_P"] + df_metrics.loc[index + 1]["NPV"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["pred_N"] + df_metrics.loc[index + 1]["F1score"] = 2 * df_metrics.iloc[index + 1]["recall"] * df_metrics.iloc[index + 1][ + "pre"] / (df_metrics.iloc[index + 1]["recall"] + df_metrics.iloc[index + 1]["pre"]) + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[index + 1][col] = 0 if pd.isna(df_metrics.loc[index + 1][col]) else \ + df_metrics.loc[index + 1][col] + df_metrics.loc[index + 1][col] = round(df_metrics.loc[index + 1][col], 3) + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # return None + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[0][col] = df_metrics.loc[0][col] if df_metrics.loc[0][col] != 0 else np.NAN + + df_metrics.loc[0]["acc"] = df_metrics.iloc[0]["T"] / df_metrics.iloc[0]["support"] + df_metrics.loc[0]["recall"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["origin_P"] + df_metrics.loc[0]["spec"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["origin_N"] + df_metrics.loc[0]["pre"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["pred_P"] + df_metrics.loc[0]["NPV"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["pred_N"] + df_metrics.loc[0]["F1score"] = 2 * df_metrics.iloc[0]["recall"] * df_metrics.iloc[0]["pre"] / ( + df_metrics.iloc[0]["recall"] + df_metrics.iloc[0]["pre"]) + for col in ["TP", "TN", "FP", "FN", "acc", "recall", "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[0][col] = 0 if pd.isna(df_metrics.loc[0][col]) else df_metrics.loc[0][col] + df_metrics.loc[0][col] = round(df_metrics.loc[0][col], 3) + + # 在inner_test中根据 分严重程度绘制 + if dataset_type == "test": + all_severity = ["正常", "轻度", "中度", "重度"] + for index, severity in enumerate(all_severity): + df_event = df_metrics[df_metrics["severity"] == severity] + df_temp = pd.DataFrame(columns=columns2) + df_temp.loc[0] = 0 + df_temp.loc[0]["sampNo"] = severity + df_temp.loc[0]["severity"] = str(index + 1) + + df_temp.loc[0]["origin_P"] += df_event["origin_P"].sum() + df_temp.loc[0]["origin_N"] += df_event["origin_N"].sum() + df_temp.loc[0]["pred_P"] += df_event["pred_P"].sum() + df_temp.loc[0]["pred_N"] += df_event["pred_N"].sum() + df_temp.loc[0]["T"] += df_event["T"].sum() + df_temp.loc[0]["F"] += df_event["F"].sum() + df_temp.loc[0]["TP"] += df_event["TP"].sum() + df_temp.loc[0]["FP"] += df_event["FP"].sum() + df_temp.loc[0]["TN"] += df_event["TN"].sum() + df_temp.loc[0]["FN"] += df_event["FN"].sum() + df_temp.loc[0]["support"] += df_event["support"].sum() + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_temp.loc[0][col] = df_temp.loc[0][col] if df_temp.loc[0][col] != 0 else np.NAN + + df_temp.loc[0]["acc"] = df_temp.iloc[0]["T"] / df_temp.iloc[0]["support"] + df_temp.loc[0]["recall"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["origin_P"] + df_temp.loc[0]["spec"] = df_temp.iloc[0]["TN"] / 
df_temp.iloc[0]["origin_N"] + df_temp.loc[0]["pre"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["pred_P"] + df_temp.loc[0]["NPV"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["pred_N"] + df_temp.loc[0]["F1score"] = 2 * df_temp.iloc[0]["recall"] * df_temp.iloc[0]["pre"] / ( + df_temp.iloc[0]["recall"] + df_temp.iloc[0]["pre"]) + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_temp.loc[0][col] = 0 if pd.isna(df_temp.loc[0][col]) else df_temp.loc[0][col] + df_temp.loc[0][col] = round(df_temp.loc[0][col], 3) + + df_metrics = df_metrics.append(df_temp, ignore_index=True) + + df_backup = df_metrics + df_metrics = df_metrics.astype("str") + df_metrics = df_metrics.sort_values("severity") + df_metrics.to_csv(base_path / dataset_type / + f"{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_all_metrics.csv", + index=False, encoding="gbk") + + return df_backup + + +def confusionMatrix(df_analysis, base_path, dataset_type): + if df_analysis is None: + print(base_path, dataset_type, "is None") + return + + if df_analysis.empty: + print(base_path, dataset_type, "is_empty") + return + classes = ["normal", "SA"] + (base_path / dataset_type / "confusionMatrix").mkdir(exist_ok=True, parents=True) + for one_samp in df_analysis.index: + one_samp = df_analysis.loc[one_samp] + cm = np.array([[one_samp["TN"], one_samp["FP"]], [one_samp["FN"], one_samp["TP"]]]) + draw_confusionMatrix(cm, classes=classes, title=str(one_samp["severity"]) + " " + one_samp["sampNo"], + save_path=base_path / dataset_type / "confusionMatrix" / f"{one_samp['sampNo']}.jpg") + + +def segment_to_event(df_segment, dataset_type): + df_all_event = pd.DataFrame(columns=columns) + all_sampNo = df_segment["sampNo"].unique() + + if dataset_type == "test": + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + all_segments_no = df["segmentNo"].unique() + + for index_se, segment_No in enumerate(all_segments_no): + df_temp = df[df["segmentNo"] == segment_No].copy() + SP = df_temp.iloc[0]["EP"] + EP = df_temp.iloc[-1]["EP"] + 1 + df_event.loc[index_se] = [int(sampNo), segment_No, df_temp.iloc[0]["label_type"], + df_temp.iloc[0]["new_label"], SP, EP, 0] + + thresh_Pred = df_temp["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + for i in value_ar: + if i > thresh_event_length: + df_event.iloc[index_se]["pred"] = 1 + + # df_event.to_csv(events_results / dataset_type / f"{int(sampNo)}_event_results.csv", index=False, + # encoding="gbk") + df_all_event = df_all_event.append(df_event, ignore_index=True) + else: + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + thresh_Pred 
= df["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + + for value_index, value in enumerate(value_ar): + SP = index_ar[value_index] + EP = index_ar[value_index] + count_ar[value_index] + # TP, FP + if value > thresh_event_length: + + # label_type = 1 if thresh_Pred2[SP:EP].sum() > 0 else 0 + label_type = df["label_type"][SP:EP].max() + new_label = df["new_label"][SP:EP].max() + df_event = df_event.append(pd.DataFrame([[int(sampNo), SP // 30, label_type, new_label, + SP, EP, thresh_Pred2[SP]]], columns=columns), + ignore_index=True) + if value > 30: + print([int(sampNo), SP // 30, label_type, new_label, SP, EP, thresh_Pred2[SP]]) + # 长度不够 + else: + df["thresh_Pred"][SP:EP] = 0 + + # 对负样本进行统计 + for segment_no in df["segmentNo"].unique(): + df_temp = df[df["segmentNo"] == segment_no] + if df_temp["thresh_Pred"].sum() > 0: + continue + + df_event = df_event.append(pd.DataFrame( + [[int(sampNo), segment_no, df_temp["label_type"].max(), df_temp["new_label"].max(), segment_no * 30, + (segment_no + 1) * 30, 0]], columns=columns), + ignore_index=True) + + df_all_event = df_all_event.append(df_event, ignore_index=True) + + df_temp = df_all_event.loc[:, ["label_type", "pred"]] + df_all_event["thresh_label"] = 1 * (df_temp["label_type"] > event_thresh) + df_all_event["thresh_Pred"] = 1 * (df_temp["pred"] > thresh) + return df_all_event + + +# 分sampNo保存结果,并不重合地可视化 +# inner_test + +# 分sampNo将与标签不一致的另行保存,并不重合地可视化 + +# import shap +# explainer = shap.TreeExplainer() +# shap_values = explainer.shap_values() + +if __name__ == '__main__': + all_output_path = list(exam_path.rglob("KFold_*")) + for exam_index, test_exam_path in enumerate(all_output_path): + # test_exam_path = exam_path / test_exam_path + set_environment(exam_index) + test_and_analysis_and_visual(dataset_type="test") + test_and_analysis_and_visual(dataset_type="all_test") diff --git a/exam/005/utils/Draw_ConfusionMatrix.py b/exam/005/utils/Draw_ConfusionMatrix.py new file mode 100644 index 0000000..591af60 --- /dev/null +++ b/exam/005/utils/Draw_ConfusionMatrix.py @@ -0,0 +1,46 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:Draw_ConfusionMatrix.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/08/10 +""" +import numpy as np +from matplotlib import pyplot as plt + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 + + +def draw_confusionMatrix(cm, classes, title, save_path, cmap=plt.cm.Blues): + fig_cm, ax = plt.subplots(figsize=(8, 8), dpi=120) + im = ax.imshow(cm, interpolation='nearest', cmap=cmap) + ax.figure.colorbar(im, ax=ax) + ax.set(xticks=np.arange(cm.shape[1]), + yticks=np.arange(cm.shape[0]), + xticklabels=classes, yticklabels=classes, + title=title, + ylabel='True label', + xlabel='Predicted label') + ax.set_ylim(len(classes) - 0.5, -0.5) + + # Rotate the tick labels and set their alignment. 
+ plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") + normalize = False + fmt = '.2f' if normalize else 'd' + thresh = cm.max() * 0.8 + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(cm[i, j], fmt), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black") + fig_cm.tight_layout() + fig_cm.savefig(save_path) + plt.close() + # + + +if __name__ == '__main__': + pass diff --git a/exam/005/utils/Preprocessing.py b/exam/005/utils/Preprocessing.py new file mode 100644 index 0000000..e3b60c4 --- /dev/null +++ b/exam/005/utils/Preprocessing.py @@ -0,0 +1,181 @@ +# encoding:utf-8 + +""" +@ date: 2020-09-16 +@ author: jingxian +@ illustration: Pre-processing +""" + + +import sys +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import pywt +from scipy import signal +from scipy import fftpack + + +def Dilate(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.min(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def Eorde(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.max(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def fin_turn(data, peak): + if len(data) == 0 or len(peak) == 0: return peak + return_peak = [] + for p in peak: + minx, maxx = max(0, p - 100), min(len(data), p + 100) + return_peak.append(minx + np.argmax(data[minx: maxx])) + return return_peak + + +class BCG_Operation(): + def __init__(self, sample_rate=1000): + self.sample_rate = sample_rate + + def down_sample(self, data=None, down_radio=10): + if data is None: + raise ValueError("data is None, please given an real value!") + data = data[:len(data) // down_radio * down_radio].reshape(-1, down_radio)[:, 0] + self.sample_rate = self.sample_rate / down_radio + return data + + def Splitwin(self, data=None, len_win=None, coverage=1.0, calculate_to_end=False): + """ + 分窗 + :param len_win: length of window + :return: signal windows + """ + if (len_win is None) or (data is None): + raise ValueError("length of window or data is None, please given an real value!") + else: + length = len_win * self.sample_rate # number point of a window + # step of split windows + step = length * coverage + start = 0 + Splitdata = [] + while (len(data) - start >= length): + Splitdata.append(data[int(start):int(start + length)]) + start += step + if calculate_to_end and (len(data) - start > 2000): + remain = len(data) - start + start = start - step + step = int(remain / 2000) + start = start + step * 2000 + Splitdata.append(data[int(start):int(start + length)]) + return np.array(Splitdata), step + elif calculate_to_end: + return np.array(Splitdata), 0 + else: + return np.array(Splitdata) + + def Butterworth(self, data, type, low_cut=0.0, high_cut=0.0, order=10): + """ + :param type: Type of Butter. filter, lowpass, bandpass, ... 
+ :param lowcut: Low cutoff frequency + :param highcut: High cutoff frequency + :param order: Order of filter + :return: Signal after filtering + """ + if type == "lowpass": # 低通滤波处理 + b, a = signal.butter(order, low_cut / (self.sample_rate * 0.5), btype='lowpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "bandpass": # 带通滤波处理 + low = low_cut / (self.sample_rate * 0.5) + high = high_cut / (self.sample_rate * 0.5) + b, a = signal.butter(order, [low, high], btype='bandpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "highpass": # 高通滤波处理 + b, a = signal.butter(order, high_cut / (self.sample_rate * 0.5), btype='highpass') + return signal.filtfilt(b, a, np.array(data)) + else: # 警告,滤波器类型必须有 + raise ValueError("Please choose a type of fliter") + + def MorphologicalFilter(self, data=None, M=200, get_bre=False): + """ + :param data: Input signal + :param M: Length of structural element + :return: Signal after filter + """ + if not data.any(): + raise ValueError("The input data is None, please given real value data") + g = np.ones(M) + Data_pre = np.insert(data, 0, np.zeros(M)) + Data_pre = np.insert(Data_pre, -1, np.zeros(M)) + # Opening: 腐蚀 + 膨胀 + out1 = Eorde(Data_pre, len(Data_pre), g, M) + out2 = Dilate(out1, len(out1), g, M) + out2 = np.insert(out2, 0, np.zeros(M - 2)) + # Closing: 膨胀 + 腐蚀 + out5 = Dilate(Data_pre, len(Data_pre), g, M) + out6 = Eorde(out5, len(out5), g, M) + out6 = np.insert(out6, 0, np.zeros(M - 2)) + + baseline = (out2 + out6) / 2 + # -------------------------保留剩余价值------------------------ + data_filtered = Data_pre[:len(baseline)] - baseline + data_filtered = data_filtered[M: M + len(data)] + baseline = baseline[M:] + data_filtered[-1] = data_filtered[-2] = data_filtered[-3] + baseline[-1] = baseline[-2] = baseline[-3] + if get_bre: + return data_filtered, baseline + else: + return data_filtered + + def Iirnotch(self, data=None, cut_fre=50, quality=3): + """陷波器""" + b, a = signal.iirnotch(cut_fre / (self.sample_rate * 0.5), quality) + return signal.filtfilt(b, a, np.array(data)) + + def ChebyFilter(self, data, rp=1, type=None, low_cut=0, high_cut=0, order=10): + """ + 切比雪夫滤波器 + :param data: Input signal + :param rp: The maximum ripple allowed + :param type: 'lowpass', 'bandpass, 'highpass' + :param low_cut: Low cut-off fre + :param high_cut: High cut-off fre + :param order: The order of filter + :return: Signal after filter + """ + if type == 'lowpass': + b, a = signal.cheby1(order, rp, low_cut, btype='lowpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif type == 'bandpass': + b, a = signal.cheby1(order, rp, [low_cut, high_cut], btype='bandpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif type == 'highpass': + b, a = signal.cheby1(order, rp, high_cut, btype='highpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + else: + raise ValueError("The type of filter is None, please given the real value!") + + def Envelope(self, data): + """取信号包络""" + if len(data) <= 1: raise ValueError("Wrong input data") + hx = fftpack.hilbert(data) + return np.sqrt(hx ** 2, data ** 2) + + def wavelet_trans(self, data,c_level=['aaa','aad'], wavelet='db4', mode='symmetric',maxlevel=10): + wp = pywt.WaveletPacket(data=data, wavelet=wavelet, mode=mode, maxlevel=maxlevel) + new_wp = pywt.WaveletPacket(data=None, wavelet=wavelet, mode=mode) + for c in c_level : + new_wp[c] = wp[c] + return new_wp.reconstruct() + + # def em_decomposition(self, data): + # from pyhht.emd import 
EMD + # return EMD(data).decompose() + diff --git a/exam/005/utils/calc_metrics.py b/exam/005/utils/calc_metrics.py new file mode 100644 index 0000000..dcf78cc --- /dev/null +++ b/exam/005/utils/calc_metrics.py @@ -0,0 +1,84 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:calc_metrics.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/12 +""" +import torch +import torchmetrics + + +class CALC_METRICS: + metrics = [] + nc = 0 + + def __init__(self, nc): + self.nc = nc + self.metrics.append(torchmetrics.Accuracy(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Recall(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Precision(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Specificity(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.F1Score(average="none", num_classes=nc, multiclass=False)) + self.valid_result = self.train_result = None + + def update(self, pred, target): + for part1 in self.metrics: + part1.update(pred.cpu(), target.cpu()) + + def compute(self): + result = [] + for part1 in self.metrics: + result.append(part1.compute()) + + def reset(self): + for part1 in self.metrics: + part1.reset() + + def get_matrix(self, loss=None, cur_lr=None, epoch=None, epoch_type=None): + temp_result = [] + for j in self.metrics: + compute_result = (j.compute().cpu().numpy() * 100).tolist() + temp_result.append(compute_result) + + if epoch_type == "train": + self.train_result = [loss] + temp_result + elif epoch_type == "valid": + self.valid_result = [loss] + temp_result + else: + pass + + a = "" + a += f"{epoch_type} epoch: {str(epoch)} loss: {str(loss)} lr: {str(cur_lr)} \n" + a += " " * 8 + "Acc".center(8) + "Rec".center(8) + "Pre".center(8) + "Spe".center(8) + "F1".center(8) + "\n" + a += "all".center(8) + "".join([str(round(float(i), 2)).center(8) for i in temp_result]) + "\n" + return a + + def wandb_log(self, wandb=None, cur_lr=None): + if wandb is None: + return + + keyword = ["Accuracy", "Recall", "Precision", "Specificity", "F1Score"] + dict_key = [] + for epoch_type in ["train", "valid"]: + dict_key.append(epoch_type + "/" + "loss") + for i in keyword: + dict_key.append(epoch_type + "/" + i) + + log_dict = dict(zip(dict_key, self.train_result + self.valid_result)) + log_dict["lr"] = cur_lr + wandb.log(log_dict) + + +if __name__ == '__main__': + # pred = [[0.1], [0.2], [0.3], [0.4], [0.5], [0.6], [0.7], [0.8], [0.9], [1.0]] + # true = [[0], [0], [1], [0], [0], [0], [0], [0], [0], [1]] + pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] + true = [0, 0, 1, 0, 0, 0, 0, 0, 0, 1] + pred = torch.tensor(pred).cuda() + true = torch.tensor(true).cuda() + calc_metrics = CALC_METRICS(1) + calc_metrics.update(pred, true) + print(calc_metrics.get_matrix()) diff --git a/exam/011/main.py b/exam/011/main.py index a04cb6c..ccb3609 100644 --- a/exam/011/main.py +++ b/exam/011/main.py @@ -121,6 +121,8 @@ def model_train(model, train_loader, optimizer, scheduler, loss_func, training_s train_loss /= len(train_loader) calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") logger.info(training_state) logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) calc_metrics.reset() @@ -162,6 +164,34 @@ def model_valid(model, valid_loader, wdir, loss_func): calc_metrics.reset() +def model_test(model, test_loader, loss_func): 
+ model.eval() + test_loss = 0.0 + for resp, stft, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + if __name__ == '__main__': try: @@ -191,7 +221,7 @@ if __name__ == '__main__': model_net.cuda() k_folds = 5 - kfold = KFold(n_splits=k_folds, shuffle=True) + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) logger.info('--------------------------------') for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): @@ -213,9 +243,11 @@ if __name__ == '__main__': train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) # 重新初始化模型 del model_net @@ -251,5 +283,6 @@ if __name__ == '__main__': model_train(model_net, train_loader, optimizer, scheduler, loss_function, f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) if wandb is not None: calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/011/settings.yaml b/exam/011/settings.yaml index 2a460c9..022f35c 100644 --- a/exam/011/settings.yaml +++ b/exam/011/settings.yaml @@ -24,8 +24,8 @@ select_sampno: - 1009 # train hyperparameters config -epoch: 20 -lr: 0.0001 +epoch: 50 +lr: 0.00001 nc: 1 # wandb config diff --git a/exam/012/main.py b/exam/012/main.py index a04cb6c..ccb3609 100644 --- a/exam/012/main.py +++ b/exam/012/main.py @@ -121,6 +121,8 @@ def model_train(model, train_loader, optimizer, scheduler, loss_func, training_s train_loss /= len(train_loader) calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") logger.info(training_state) logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) calc_metrics.reset() @@ -162,6 +164,34 @@ def model_valid(model, valid_loader, wdir, loss_func): calc_metrics.reset() +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, stft, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - 
torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + if __name__ == '__main__': try: @@ -191,7 +221,7 @@ if __name__ == '__main__': model_net.cuda() k_folds = 5 - kfold = KFold(n_splits=k_folds, shuffle=True) + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) logger.info('--------------------------------') for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): @@ -213,9 +243,11 @@ if __name__ == '__main__': train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) # 重新初始化模型 del model_net @@ -251,5 +283,6 @@ if __name__ == '__main__': model_train(model_net, train_loader, optimizer, scheduler, loss_function, f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) if wandb is not None: calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/012/settings.yaml b/exam/012/settings.yaml index 2a460c9..022f35c 100644 --- a/exam/012/settings.yaml +++ b/exam/012/settings.yaml @@ -24,8 +24,8 @@ select_sampno: - 1009 # train hyperparameters config -epoch: 20 -lr: 0.0001 +epoch: 50 +lr: 0.00001 nc: 1 # wandb config diff --git a/exam/013/main.py b/exam/013/main.py index a04cb6c..ccb3609 100644 --- a/exam/013/main.py +++ b/exam/013/main.py @@ -121,6 +121,8 @@ def model_train(model, train_loader, optimizer, scheduler, loss_func, training_s train_loss /= len(train_loader) calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") logger.info(training_state) logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) calc_metrics.reset() @@ -162,6 +164,34 @@ def model_valid(model, valid_loader, wdir, loss_func): calc_metrics.reset() +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, stft, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, 
dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + if __name__ == '__main__': try: @@ -191,7 +221,7 @@ if __name__ == '__main__': model_net.cuda() k_folds = 5 - kfold = KFold(n_splits=k_folds, shuffle=True) + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) logger.info('--------------------------------') for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): @@ -213,9 +243,11 @@ if __name__ == '__main__': train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) # 重新初始化模型 del model_net @@ -251,5 +283,6 @@ if __name__ == '__main__': model_train(model_net, train_loader, optimizer, scheduler, loss_function, f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) if wandb is not None: calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/013/settings.yaml b/exam/013/settings.yaml index 2a460c9..697a7ed 100644 --- a/exam/013/settings.yaml +++ b/exam/013/settings.yaml @@ -1,5 +1,5 @@ # environment config -GPU: "0" +GPU: "1" # dataset config @@ -24,8 +24,8 @@ select_sampno: - 1009 # train hyperparameters config -epoch: 20 -lr: 0.0001 +epoch: 50 +lr: 0.00001 nc: 1 # wandb config diff --git a/exam/试验记录.txt b/exam/试验记录.txt index 4a65a57..bb88feb 100644 --- a/exam/试验记录.txt +++ b/exam/试验记录.txt @@ -4,13 +4,12 @@ # 初探学习率 ------------------------------------------------- 000 -学习率 1e-4 epoch 20 可以完成任务 +学习率 1e-4 epoch 20 过早(epoch 6)过拟合 001 -学习率 1e-5 epoch 50 和000结果相近,所以选择000参数 - +学习率 1e-5 epoch 50 可以 002 -学习率 1e-6 epoch 50 收敛过慢 +学习率 1e-6 epoch 50 比较合适,就是太慢了 # 主要影响F1的是precision ------------------------------------------------- @@ -20,7 +19,23 @@ 数据集减去平均值外,没有其他的预处理方法 -------------------------------------------------------- 003 -学习率 1e-5 从目前结果看用整晚数据集也行,不过还没有加入潮式呼吸数据 +学习率 1e-5 epoch 50 +-------------------------------------------------------- + +-------------------------------------------------------- +非整晚数据集训练 1:3.5,class_weight 解决不平衡问题, +数据集减去平均值外,没有其他的预处理方法 保持正样本权重为1改成保持负样本权重为1 +-------------------------------------------------------- +004 +学习率 1e-5 epoch 50 +-------------------------------------------------------- + +-------------------------------------------------------- +非整晚数据集训练 1:3.5,class_weight 解决不平衡问题, +数据集减去平均值外,没有其他的预处理方法 改成单输出 +-------------------------------------------------------- +005 +学习率 1e-5 epoch 50 -------------------------------------------------------- @@ -31,7 +46,7 @@ 数据集减去平均值外,削顶600,归一化 -------------------------------------------------------- 011 -学习率 1e-5 +学习率 1e-5 epoch 50 -------------------------------------------------------- -------------------------------------------------------- @@ -39,7 +54,7 @@ 数据集减去平均值外,使用整体混叠信号Z-score整理数据集 
-------------------------------------------------------- 012 -学习率 1e-4 EPOCH 20 +学习率 1e-5 epoch 50 -------------------------------------------------------- @@ -48,34 +63,89 @@ 数据集减去平均值外,使用每个片段的混叠信号Z-score整理数据集 -------------------------------------------------------- 013 -学习率 1e-4 EPOCH 20 +学习率 1e-5 epoch 50 -------------------------------------------------------- - - -------------------------------------------------------- 非整晚数据集训练 1:3.5,class_weight 解决不平衡问题, 数据集减去平均值外,使用大体动检测后缩放 -------------------------------------------------------- 014 -学习率 1e-4 EPOCH 20 +学习率 1e-5 EPOCH 20 TODO -------------------------------------------------------- -------------------------------------------------------- -非整晚数据集训练 1:3.5,仅用质量好的数据训练,class_weight 解决不平衡问题, +非整晚数据集训练 1:3.5,class_weight 解决不平衡问题, 数据集减去平均值外,选上述最好的处理方法 +# 修改卷积层的激活函数 -------------------------------------------------------- 021 -学习率 1e-4 这个不实现先 +学习率 1e-5 EPOCH 20 TODO -------------------------------------------------------- +-------------------------------------------------------- +非整晚数据集训练 1:3.5,class_weight 解决不平衡问题, +数据集减去平均值外,选上述最好的处理方法 +# 引入残差块 +-------------------------------------------------------- +022 +学习率 1e-5 EPOCH 20 TODO +-------------------------------------------------------- + + +-------------------------------------------------------- +非整晚数据集训练 1:3.5,class_weight 解决不平衡问题, +数据集减去平均值外,选上述最好的处理方法 +# 修改卷积层的激活函数 RESP用CNN(RESNET?) STFT用LSTM +-------------------------------------------------------- +023 +学习率 1e-5 EPOCH 20 TODO +-------------------------------------------------------- + + +-------------------------------------------------------- +非整晚数据集训练 1:3.5,class_weight 解决不平衡问题, +数据集减去平均值外,选上述最好的处理方法 +# 修改卷积层的激活函数 RESP用CNN(RESNET?) STFT用LSTM 对两者输出的结果再用CNN +-------------------------------------------------------- +024 +学习率 1e-5 EPOCH 20 TODO +-------------------------------------------------------- + + + +-------------------------------------------------------- +非整晚数据集训练 1:3.5,class_weight 解决不平衡问题, +数据集减去平均值外,选上述最好的处理方法 +# 修改卷积层的激活函数 RESP用CNN(RESNET?) STFT用LSTM 对两者输出的结果再用CNN +-------------------------------------------------------- +025 +学习率 1e-5 EPOCH 20 TODO +-------------------------------------------------------- + + +## +TODO: 质量好的信号结果分析;严重程度的分析 + + -------------------------------------------------------- **全量**非整晚数据集训练 1:3.5,class_weight 解决不平衡问题, 数据集减去平均值外,选上述最好的处理方法 -------------------------------------------------------- -022 +031 学习率 1e-4 +-------------------------------------------------------- + + + + +-------------------------------------------------------- +非整晚数据集训练 1:3.5,仅用质量好的数据训练,class_weight 解决不平衡问题, +数据集减去平均值外,选上述最好的处理方法 +-------------------------------------------------------- +0xx +学习率 1e-4 这个不实现先 -------------------------------------------------------- \ No newline at end of file
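
A note on the input shapes used throughout exam/004 and exam/005: each 30 s segment is sampled at 100 Hz, the respiration branch keeps every 25th sample of the low-pass signal (3000/25 = 120 points), and the STFT branch uses nperseg=50, giving 50//2 + 1 = 26 frequency bins and 3000/25 + 1 = 121 frames. That is where the (1, 120, 1) and (1, 1, 121, 26) inputs passed to summary() and the BCG_GRAPH_SIZE = (26, 121) constant come from. The sketch below only checks those shapes on random data standing in for a real filtered segment; note that my_segment_augment casts the complex STFT to float through the deprecated np.float alias, while this sketch takes np.abs as get_stft does.

import numpy as np
from scipy.signal import stft

fs = 100                              # BCG sample rate in Hz
segment = np.random.randn(30 * fs)    # stand-in for one filtered 30 s segment

# Respiration branch: every 25th sample of the low-pass signal -> (120, 1)
resp = segment[::25].reshape(-1, 1)

# STFT branch: nperseg=50 -> 26 frequency bins x 121 frames for 3000 samples
_, _, z = stft(segment, fs, nperseg=50)
mag = np.abs(z).T                     # (121, 26), like the transposed dataset_high in my_segment_augment
spec = mag.reshape(1, *mag.shape)     # (1, 121, 26), the CNN input expected by HYBRIDNET001

print(resp.shape)   # (120, 1)
print(spec.shape)   # (1, 121, 26)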
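
On the class_weight idea mentioned in the experiment log: exam/005/main.py passes a single scalar, (N_neg / N_pos), as the weight argument of nn.BCELoss. In PyTorch that argument rescales each batch element, so a one-element tensor broadcast over the batch scales positive and negative samples by the same factor. If the intent is to up-weight the positive (apnea) class, one option is to build the weight per batch from the labels, as in the hedged sketch below; this illustrates the idea and is not the repository's implementation. nn.BCEWithLogitsLoss(pos_weight=...) on pre-sigmoid outputs, already mentioned in a comment in main.py, is the built-in alternative.

import torch
from torch import nn

# Hypothetical class ratio, e.g. (len(train_dataset) - count_SA) / count_SA as computed in main.py
pos_weight_value = 3.5

bce = nn.BCELoss(reduction="none")

def class_weighted_bce(out, labels):
    # Up-weight positive samples only; negatives keep weight 1.0
    loss = bce(out, labels.float())
    weights = torch.where(labels > 0,
                          torch.full_like(loss, pos_weight_value),
                          torch.ones_like(loss))
    return (weights * loss).mean()

# Toy check with outputs that have already passed through the model's Sigmoid
out = torch.tensor([0.9, 0.2, 0.7, 0.1])
labels = torch.tensor([1, 0, 1, 0])
print(class_weighted_bce(out, labels))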
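
The segment-to-event step in exam/005/test_save_result.py (segment_to_event) first dilates each positive prediction forward by thresh_event_interval - 1 samples to bridge short gaps, then run-length encodes the sequence and keeps only runs of ones longer than thresh_event_length. The following self-contained sketch reproduces that logic on a hypothetical prediction array so the indexing trick (change points via np.concatenate and np.diff) is easier to follow.

import numpy as np

# Hypothetical per-second binary predictions for one recording (1 = predicted apnea)
thresh_Pred = np.array([0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0])

thresh_event_interval = 2   # gaps shorter than this are bridged (as in the script)
thresh_event_length = 8     # minimum run length kept as an event

# Step 1: dilate each positive forward by (thresh_event_interval - 1) samples
thresh_Pred2 = thresh_Pred.copy()
for i, p in enumerate(thresh_Pred):
    if p == 1:
        for interval in range(1, thresh_event_interval):
            if i + interval < thresh_Pred.size:
                thresh_Pred2[i + interval] = 1

# Step 2: run-length encode and keep only sufficiently long runs of ones
same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True]))
index_ar = np.where(same_ar)[0]                      # run start indices plus end sentinel
count_ar = np.diff(index_ar)                         # run lengths
value_ar = thresh_Pred2[same_ar[:-1]] * count_ar     # run length for 1-runs, 0 otherwise

for k, value in enumerate(value_ar):
    if value > thresh_event_length:
        SP, EP = index_ar[k], index_ar[k] + count_ar[k]
        print(f"event kept: [{SP}, {EP})")           # -> the bridged 12-sample run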