diff --git a/exam/002/my_augment.py b/exam/002/my_augment.py index 6d7e891..a7e57e4 100644 --- a/exam/002/my_augment.py +++ b/exam/002/my_augment.py @@ -21,6 +21,7 @@ def my_augment(dataset): dataset = preprocessing.Butterworth(dataset, "lowpass", low_cut=20, order=6) dataset_low = preprocessing.Butterworth(dataset, "lowpass", low_cut=0.7, order=6) dataset_high = preprocessing.Butterworth(dataset, "highpass", high_cut=1, order=6) + dataset = {"low": dataset_low, "high": dataset_high} return dataset diff --git a/exam/005/test_save_result.py b/exam/005/test_save_result.py index f66b9d5..37fc145 100644 --- a/exam/005/test_save_result.py +++ b/exam/005/test_save_result.py @@ -63,7 +63,8 @@ del hyp all_output_path, output_path, segments_results_save_path, events_results_save_path, = [None, ] * 4 my_augment, model_path, label_path, data_path, model, model_name = [None, ] * 6 train_set, test_set = None, None -loss_func = nn.CrossEntropyLoss() +# loss_func = nn.CrossEntropyLoss() +loss_func = nn.BCELoss() columns = ["sampNo", "segmentNo", "label_type", "new_label", "SP", "EP", "pred"] columns2 = ["sampNo", "severity", "origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", @@ -141,14 +142,14 @@ def test_and_analysis_and_visual(dataset_type): labels = labels.cuda() if gpu else labels with torch.no_grad(): out = model(resp, stft) - - loss = loss_func(out, labels) + out = out.squeeze(-1) + loss = loss_func(out, labels.float()) test_loss += loss.item() - labels = torch.unsqueeze(labels, dim=1) - out = F.softmax(out, dim=1) - out = torch.unsqueeze(out[:, 1], dim=1) + # labels = torch.unsqueeze(labels, dim=1) + # out = F.softmax(out, dim=1) + # out = torch.unsqueeze(out[:, 1], dim=1) calc_metrics.update(out.cpu(), labels.cpu()) # one[0] = list(one[0].cpu().numpy()) diff --git a/exam/014/generate_label_14.py b/exam/014/generate_label_14.py new file mode 100644 index 0000000..cc5f96d --- /dev/null +++ b/exam/014/generate_label_14.py @@ -0,0 +1,561 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:generate_label_11.0.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/05 +""" +# 14.0 +# 手动均衡数量 + +# 13.0 +# 限制选择部分数据集,先做测试 + +# 12.0 +# 置不可用事件的片段为上限,不可用片段设置为背景,不记录事件 + +# 10.0 +# 使用提出质量差的信号 + +# 9.0 +# 增加 最新的质量标签 未使用 + +# 8.0 +# 生成 除低通气所有事件标签 + +# 尝试过步进两秒 会造成不足两秒的数据被抛弃,造成较多误判,但是可以考虑囊括这部分 +# 采用 30秒数据 移动 1秒 将所有呼吸暂停标注为1 低通气为0 正常为0 + +# 预处理操作 为 50Hz陷波滤波器去工频 外加 20Hz的低通滤波器 这个20Hz要看BCG信号的频谱范围 + +# 先提剔除极端值 +# 数值大于最高基准线或最低基准线 +# type1: average:1800 low:1200 high:2400 +# type2: average:2400 low:1800 high:3000 + +# 过多片段会造成平均值偏移 +# TODO +# 加入体动标签,计算除体动外的平均值 + +# 最后降采为100hz + +import time +import logging +import numpy as np +import pandas as pd +from pathlib import Path + +from datetime import datetime + +import yaml +from pathos import multiprocessing +from tqdm import tqdm + +# 数据集 和 标签 位置 +bcg_numpy_data_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/") +bcg_label_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_label_0616/") + +# BCG 记录开始时间 +bcg_start_time = np.loadtxt(Path(r"/home/marques/code/marques/apnea/dataset/start_time.csv"), delimiter=', ', + dtype=object) +bcg_start_time = dict(zip(bcg_start_time[:, 0], bcg_start_time[:, 1])) + +# 读取每个数据集路径 +all_numpy_dataset = list(bcg_numpy_data_path.rglob("*.npy")) +all_numpy_dataset.sort() + +# 划分后的数据集保存路径 +# dataset_save_path = Path(r"/home/marques/code/marques/apnea/dataset/dataset/dataset0623_300_30_30/") +dataset_save_path = Path(r"./dataset/") 
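The exam/005 change above swaps the two-logit CrossEntropyLoss/softmax path for a single-output `nn.BCELoss`. Below is a minimal sketch of the shape and dtype contract that switch relies on, using a random stand-in tensor instead of the real `model(resp, stft)` output; it assumes the model's final layer already applies a sigmoid, since `nn.BCELoss` expects probabilities in [0, 1] (otherwise `nn.BCEWithLogitsLoss` would be the safer choice).

```python
# Sketch of the BCELoss contract used in the exam/005 change (not the project's model).
# BCELoss wants probabilities and float targets of the same shape, so the (N, 1)
# model output is squeezed to (N,) and the integer labels are cast to float.
import torch
from torch import nn

loss_func = nn.BCELoss()

out = torch.sigmoid(torch.randn(8, 1))   # stand-in for model(resp, stft); assumes a sigmoid head
labels = torch.randint(0, 2, (8,))       # integer class labels {0, 1}

loss = loss_func(out.squeeze(-1), labels.float())
print(loss.item())
```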
+dataset_save_path.mkdir(exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) +fh = logging.FileHandler(dataset_save_path / (realtime + ".log"), mode='w') +fh.setLevel(logging.NOTSET) +# fh.setFormatter(logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(ch) + +# all_label = [] +# 输出统计数据标题栏 +# logger.info("sampNo".center(8) + +# "hpy_num".center(8) + "hpy_time".center(10) + +# "csa_num".center(8) + "csa_time".center(10) + +# "osa_num".center(8) + "osa_time".center(10) + +# "msa_num".center(8) + "msa_time".center(10) +# ) + +logger.info("sampNo".center(8) + ',' + + "train_num".center(10) + ',' + "train_P".center(10) + ',' + "train_N".center(10) + ',' + + "valid_num".center(10) + ',' + "valid_P".center(10) + ',' + "valid_N".center(10) + ',' + + "test_num".center(10) + ',' + "test_P".center(10) + ',' + "test_N".center(10) + ',' + + "train_eve".center(10) + ',' + "valid_eve".center(10) + ',' + "test_eve".center(10) + ) + +base_random_seed = 42 + +window_second = 30 +step_second = 1 + +valid_ratio = 0.1 +test_ratio = 0.1 + +normal_event_quality_label = 0 +# valid_ratio = 5000 +# test_ratio = 10000 + +assert ((valid_ratio + test_ratio) < 1 and 0 < valid_ratio < 1 and 0 < test_ratio < 1) or ( + valid_ratio > 1 and valid_ratio > 1), "验证集与测试集输入应同时为比例或数量" + +# dataset sampNo for test +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) +select_dataset = hyp["select_sampno"] + +# 需要置成0的片段,前面不一定补零,还有可能上万 +disable_segment = { + '221': [[0, 10000]], + '670': [[0, 20000]], + '683': [[0, 20000]], + '704': [[0, 26000]], + '726': [[0, 20000]], + '736': [[0, 47000]], + '933': [[0, 773560]], + '935': [[0, 26600]], + '952': [[0, 17000]], + '955': [[0, 78000]], + '961': [[0, 107000]], + '962': [[0, 15100]], + '966': [[0, 13120]], + '967': [[0, 44000]], + '1006': [[0, 60000]], + '1009': [[0, 1000]], + '1010': [[0, 49000]], + '1296': [[0, 27000]], + '1300': [[0, 33800]], + '1301': [[0, 14000]], + '1302': [[0, 5600]], + '1374': [[0, 1000]], + '1478': [[0, 998000]], + +} + + +# 生成数据集主函数 +def generate_label(No, dataset_path): + """ + :param dataset_path: 数据集路径 + :return: + """ + + # 获取数据编号 + sampNo = dataset_path.stem.split("samp")[0] + # 标签路径 + label_path = bcg_label_path / f"export{sampNo}_all.csv" + + if not label_path.exists(): + raise FileNotFoundError(f"{label_path} not exist") + + if not dataset_path.exists(): + raise Exception(f"{dataset_path} not exists") + + # 加载数据集 + select_numpy = np.load(dataset_path) + select_numpy_len = len(select_numpy) + + # 开头不合理片段置零 + if sampNo in disable_segment.keys(): + for sp, ep in disable_segment[sampNo]: + select_numpy[sp:ep] = 0 + + # 剔除质量差信号 + if sampNo == "670": + select_numpy = select_numpy[:17195 * 100] + + # 获取前面补了多少0 + not_zero_point = 0 + for num in select_numpy: + if num > 10: + break + not_zero_point += 1 + not_zero_point //= 100 + + # 读取标签 + label_csv = pd.read_csv(label_path, encoding='gbk') + label_csv["new_start"] = label_csv["new_start"].astype("int") + label_csv["new_end"] = label_csv["new_end"].astype("int") + label_csv["Duration"] = label_csv["Duration"].astype("int") + label_csv["new_label"] = label_csv["new_label"].fillna("2") + label_csv["new_label"] = 
label_csv["new_label"].astype("int") + # 剔除质量不好的样本 + # drop_csv = label_csv[ + # (label_csv["Event type"].isin(["Central apnea", "Obstructive apnea"])) & (label_csv["new_label"] == 2)] + # label_csv = label_csv.drop(drop_csv.index) + + # 事件片段与背景片段, 每个背景长度均为设定窗长 + segment_labels = [] + negative_labels = [] + hpy_num = csa_num = osa_num = msa_num = 0 + hpy_time = csa_time = osa_time = msa_time = 0 + + # 遍历全部事件并统计 + for i in range(len(label_csv)): + # 进行LabelEncoder + label = label_csv.iloc[i, :] + # 如果事件在补零片段,则不添加到事件列表 + if label["new_end"] < not_zero_point: + continue + + if sampNo == "670" and label["new_start"] > 17195: + continue + + if label["new_end"] - label["new_start"] < 10: + continue + + # 将事件添加到事件列表 + if label["Event type"] == "Hypopnea": + label_type = 1 + hpy_num += 1 + hpy_time += label["new_end"] - label["new_start"] + + # 将低通气添加到背景 好像不用专门加入到负样本事件中? + # negative_labels.append( + # [sampNo, i, label_type, normal_event_quality_label, label["new_start"], label["new_end"]]) + continue + elif label["Event type"] == "Central apnea": + label_type = 2 + csa_num += 1 + csa_time += label["new_end"] - label["new_start"] + elif label["Event type"] == "Obstructive apnea": + label_type = 3 + osa_num += 1 + osa_time += label["new_end"] - label["new_start"] + # MSA 认为是OSA + elif label["Event type"] == "Mixed apnea": + label_type = 3 + msa_num += 1 + msa_time += label["new_end"] - label["new_start"] + else: + continue + # label_type = 0 + if label["new_end"] - label["new_start"] > label["Duration"] + 20: + print(sampNo, label) + + # 格式为 样本编号 第几个事件 标签 开始事件 结束事件 + segment_labels.append([sampNo, i, label_type, label["new_label"], label["new_start"], label["new_end"]]) + + # logger.info(sampNo.center(8) + + # str(hpy_num).center(8) + str(hpy_time).center(10) + + # str(csa_num).center(8) + str(csa_time).center(10) + + # str(osa_num).center(8) + str(osa_time).center(10) + + # str(msa_num).center(8) + str(msa_time).center(10)) + + # 设置随机树种子 + random_seed = base_random_seed + int(sampNo) + + # 对于无事件的样本,直接将所有的片段 + if len(segment_labels) == 0: + # 剔除补零片段(把开始点移动到补零结束) + normal_SP = not_zero_point + # 开头至少满足一个窗长 + if normal_SP < window_second: + normal_SP = window_second + + # 结束点为样本总长 除以 采样率 + normal_EP = select_numpy_len // 100 + label_type = 0 + # 正常时间编号为秒数 除以 30,即epoch + negative_labels += [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, + SP1 + window_second] for SP1 in + range(normal_SP - window_second + step_second, normal_EP - window_second + step_second, + window_second)] + + # 对于有事件的样本 + # 遍历事件,获取事件之间的背景片段 + for index in range(len(segment_labels) + 1): + # 前一个事件的结尾 与 下一事件开头即为背景 + # 对于开头的无事件片段,则设定开始事件为0 + if index == 0: + normal_SP = 0 + # 非开头片段,开始点为上一个事件的结尾 + else: + # 加一秒 作为 缓冲 + normal_SP = segment_labels[index - 1][-1] + 1 + + # 最后一个事件则取到样本片段结尾 + if index == len(segment_labels): + normal_EP = select_numpy_len // 100 - window_second + # 否则结束事件取 本事件开头 + else: + # 减一秒 作为 缓冲 + normal_EP = segment_labels[index][-2] - 1 + + # 剔除包含开头补零的片段 + if normal_EP < not_zero_point: + continue + + # 剔除开头不足30s的正常片段 + if normal_SP < window_second: + continue + label_type = 0 + + # 将背景事件按照滑窗距离逐个加入到背景事件中 + temp_1 = [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, SP1 + window_second] + for SP1 in range(normal_SP - window_second, normal_EP - window_second, window_second)] + + negative_labels += temp_1 + + train_label, valid_label, test_label = [], [], [] + # assert (valid_ratio + test_ratio) < 1 <= len(segment_labels), 
f"{sampNo}训练集与测试集数量应小于总数据集数量{len(segment_labels)}" + + # 对于测试数据全部直接保存 + if int(sampNo) in select_dataset: + event_label = np.zeros(select_numpy_len // 100) + quality_label = np.zeros(select_numpy_len // 100) + # 用于存储事件标签 + for PN, segmentNo, label_type, new_label, SP, EP in segment_labels: + event_label[SP:EP] = label_type + + test_label = [] + # 剔除补零片段 + normal_SP = not_zero_point + if normal_SP < window_second: + normal_SP = window_second + normal_EP = select_numpy_len // 100 + + # 分成指定窗长的滑窗片段 + test_label += [ + [sampNo, SP1 // 30, int(event_label[SP1 + window_second - step_second]), + int(quality_label[SP1 + window_second - step_second]), + SP1, SP1 + window_second] for SP1 in range(normal_SP - window_second + step_second, + normal_EP - window_second + step_second, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center(10) + + ',' + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + str(len(segment_labels)).center(10) + ) + + df2.loc[No] = [sampNo, + str(0), str(0), str(0), + str(0), str(0), str(0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(0), str(0), str(len(segment_labels))] + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_all_{window_second}s_sa_test2_label.npy", + # np.array(test_label)) + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_all_label.csv", + index=False) + + train_label, valid_label, test_label = [], [], [] + # 对于训练与验证集样本 + if True: + # 打乱片段顺序 + np.random.seed(random_seed) + np.random.shuffle(segment_labels) + np.random.shuffle(negative_labels) + + # 获取训练集、验证集、测试集分到事件个数 + if 0 < valid_ratio < 1: + train_segment_num = int(len(segment_labels) * (1 - valid_ratio - test_ratio)) + valid_segment_num = int(len(segment_labels) * (1 - test_ratio)) + else: + train_segment_num = len(segment_labels) - valid_ratio - test_ratio + valid_segment_num = valid_ratio + + # 分别将各事件切分为30s, 步进1秒的片段 + for index in range(train_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + train_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(train_segment_num, valid_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + valid_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(valid_segment_num, len(segment_labels)): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + test_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second * step_second)] + + # 计算片段和事件个数 + train_num, valid_num, test_num = len(train_label), len(valid_label), len(test_label) + train_eve, 
valid_eve, test_eve = train_segment_num, (valid_segment_num - train_segment_num), ( + len(segment_labels) - valid_segment_num) + + # 数据集补偿 + # if train_num < 300: + # train_num = 300 - train_num + # + # if valid_num < 300: + # valid_num = 300 - valid_num + # + # if test_num < 300: + # test_num = 300 - test_num + + # 获取训练集、验证集、测试集分到背景个数 + # if 0 < valid_ratio < 1: + # train_eve2 = int(len(negative_labels) * (1 - valid_ratio - test_ratio)) + # valid_eve2 = int(len(negative_labels) * valid_ratio) + # else: + # train_eve2 = len(negative_labels) - valid_ratio - test_ratio + # valid_eve2 = valid_ratio + # + # test_eve2 = len(negative_labels) - train_eve2 - valid_eve2 + # 直接补充到足够个数的背景事件 + train_eve2 = max(train_eve, 300) + valid_eve2 = max(valid_eve, 40) + test_eve2 = max(test_eve, 40) + + # 强制背景数量 + # train_eve2 = train_eve + # valid_eve2 = valid_eve + # test_eve2 = test_eve + + # 添加背景事件数量 + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[:train_eve2]: + label_type = int(label_type) + train_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2: train_eve2 + valid_eve2]: + label_type = int(label_type) + valid_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2 + valid_eve2:train_eve2 + valid_eve2 + test_eve2]: + label_type = int(label_type) + test_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(normal_SP, normal_EP, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(len(train_label)).center(10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(len(valid_label)).center(10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(train_eve).center(10) + ',' + str(valid_eve).center(10) + ',' + str(test_eve).center(10) + ) + + df2.loc[No] = [sampNo.center(8), + str(len(train_label)), + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0), + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0), + str(len(valid_label)), + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0), + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(train_eve), str(valid_eve), str(test_eve).center(10)] + + def label_check(label_list): + temp_list = [] + for sampNo, index, label_type, 
new_label, SP, EP in label_list: + if EP - SP < window_second: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if SP < 0: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, SP, EP, len(select_numpy[SP * 100:EP * 100])) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + for j in temp_list: + label_list.remove(j) + + label_check(train_label) + label_check(valid_label) + label_check(test_label) + for sampNo, index, label_type, new_label, SP, EP in train_label: + if EP - SP < window_second: + print(sampNo, index, label_type, new_label, SP, EP) + + if SP < 0: + print(sampNo, index, label_type, new_label, SP, EP) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, new_label, SP, EP, len(select_numpy[SP * 100:EP * 100])) + + df1 = pd.DataFrame(data=train_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.csv", + index=False) + + df1 = pd.DataFrame(data=valid_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.csv", + index=False) + + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.csv", index=False) + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.npy", + # np.array(train_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.npy", + # np.array(valid_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.npy", + # np.array(test_label)) + + +if __name__ == '__main__': + # pool = multiprocessing.Pool(processes=44) + # pool.map(generate_label, list(all_numpy_dataset)) + # pool.close() + # pool.join() + + df2 = pd.DataFrame(data=None, + columns=["sampNo", + "train_num", "train_P", "train_N", + "valid_num", "valid_P", "valid_N", + "test_num", "test_P", "test_N", + "train_eve", "valid_eve", "test_eve"]) + + temp = [] + for one_dataset in all_numpy_dataset: + if int(one_dataset.stem.split("samp")[0]) in [*select_dataset]: + temp.append(one_dataset) + # for one_dataset in temp: + # all_numpy_dataset.remove(one_dataset) + + for No, one_dataset in enumerate(temp): + generate_label(No, one_dataset) + + df2.to_csv(dataset_save_path / (realtime + ".csv"), index=False) + # generate_label(all_numpy_dataset[0]) diff --git a/exam/014/load_dataset.py b/exam/014/load_dataset.py new file mode 100644 index 0000000..99806f8 --- /dev/null +++ b/exam/014/load_dataset.py @@ -0,0 +1,178 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:load_dataset.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2021/12/03 +""" +import sys +from pathlib import Path +import pandas as pd +import numpy as np +import torch.utils.data +from torch.utils.data import Dataset +from tqdm import tqdm +from utils.Preprocessing import BCG_Operation + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 +""" +1. 
读取方法 +# 无论是否提前切分,均提前转成npy格式 +# 1.1 提前预处理,切分好后生成npy,直接载入切分好的片段 内存占用多 读取简单 +使用此方法: 1.2 提前预处理,载入整夜数据,切分好后生成csv或xls,根据片段读取 内存占用少 读取较为复杂 +""" + +datasets = {} + + +# 减少重复读取 +def read_dataset(data_path, augment=None): + data_path = Path(data_path) + try: + f = [] + if data_path.is_dir(): + dataset_list = list(data_path.rglob("*.npy")) + dataset_list.sort() + f += dataset_list + elif data_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{data_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {data_path}: {e} \n') + + print("loading dataset") + for i in tqdm(f): + select_dataset = np.load(i) + select_dataset = preprocessing.Butterworth(select_dataset, "lowpass", low_cut=20, order=3) + if augment is not None: + select_dataset = augment(select_dataset) + datasets[i.name.split("samp")[0]] = select_dataset + + +# 用第二种方法读取 +class ApneaDataset(Dataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + self.data_path = data_path + self.label_path = label_path + self.segment_augment = segment_augment + self.labels = None + self.dataset_type = dataset_type + self.select_sampNo = select_sampno + + # self._getAllData() + self._getAllLabels() + + def __getitem__(self, index): + # PN patience number + # SP/EP start point, end point + # temp_label.append([sampNo, label[-1], i, hpy_num, csa_num, osa_num, mean_low, flow_low]) + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1)) + else: + raise Exception(f'dataset read failure!') + + def count_SA(self): + return sum(self.labels[:, 3] > 1) + + def __len__(self): + return len(self.labels) + + def _getAllLabels(self): + label_path = Path(self.label_path) + if not label_path.exists(): + raise Exception(f'{self.label_path} does not exist') + + try: + f = [] + if label_path.is_dir(): + if self.dataset_type == "train": + label_list = list(label_path.rglob("*_train_label.csv")) + elif self.dataset_type == "valid": + label_list = list(label_path.rglob("*_valid_label.csv")) + elif self.dataset_type == "test": + label_list = list(label_path.glob("*_sa_test_label.csv")) + # label_list = list(label_path.rglob("*_test_label.npy")) + elif self.dataset_type == "all_test": + label_list = list(label_path.rglob("*_sa_all_label.csv")) + else: + raise ValueError("self.dataset type error") + # label_list = list(label_path.rglob("*_label.npy")) + label_list.sort() + f += label_list + elif label_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{self.label_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {self.label_path}: {e} \n') + print("loading labels") + for i in tqdm(f): + if int(i.name.split("_")[0]) not in self.select_sampNo: + continue + + if self.labels is None: + self.labels = pd.read_csv(i).to_numpy(dtype=int) + else: + labels = pd.read_csv(i).to_numpy(dtype=int) + if len(labels) > 0: + self.labels = np.concatenate((self.labels, labels)) + # self.labels = self.labels[:10000] + print(f"{self.dataset_type} length is {len(self.labels)}") + + +class TestApneaDataset(ApneaDataset): + def __init__(self, data_path, label_path, dataset_type, select_sampno, segment_augment=None): + 
super(TestApneaDataset, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + select_sampno=select_sampno, + segment_augment=segment_augment + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + segment = datasets[str(PN)][int(SP) * 100:int(EP) * 100].copy() + if self.segment_augment is not None: + segment = self.segment_augment(segment) + return segment, int(float(label_type) > 1), PN, segmentNo, SP, EP + else: + raise Exception(f'dataset read failure!') + + +class TestApneaDataset2(ApneaDataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + super(TestApneaDataset2, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + segment_augment=segment_augment, + select_sampno=select_sampno + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1), PN, segmentNo, label_type, new_label, SP, EP) + else: + raise Exception(f'dataset read failure!') + + +if __name__ == '__main__': + pass diff --git a/exam/014/main.py b/exam/014/main.py new file mode 100644 index 0000000..ccb3609 --- /dev/null +++ b/exam/014/main.py @@ -0,0 +1,288 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@time:2021/10/15 +""" +import os + +import yaml +import logging +from pathlib import Path + +import time +from torch.nn import functional as F +from torch.utils.data import DataLoader +import torch.cuda +from tqdm import tqdm +from torchinfo import summary +from load_dataset import ApneaDataset, read_dataset + +from torch import nn +from utils.calc_metrics import CALC_METRICS +from sklearn.model_selection import KFold +from model.Hybrid_Net001 import HYBRIDNET001 +# from utils.LossFunction import Foca1lLoss +from my_augment import my_augment, my_segment_augment +# 加载配置 +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) + +os.environ["CUDA_VISIBLE_DEVICES"] = hyp["GPU"] +os.environ["WANDB_MODE"] = "dryrun" + +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) + +# 读取地址参数 +data_path = hyp["Path"]["dataset"] +label_path = hyp["Path"]["label"] + +save_dir = Path(hyp["Path"]["save"]) / (Path(hyp["Path"]["save"]).name + "_" + realtime) +save_dir.mkdir(parents=True, exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +fh = logging.FileHandler(save_dir / (realtime + ".log"), mode='a') +fh.setLevel(logging.NOTSET) +fh.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) +logging.getLogger('matplotlib.font_manager').disabled = True +logger.info("------------------------------------") +logger.info('hyper_parameters: ' + ', '.join(f'{k}={v}\n' for k, v in hyp.items())) + +# 备份配置 +with open(save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + +# Hyper-parameters +gpu = torch.cuda.is_available() +epochs = hyp["epoch"] +lr = hyp["lr"] +nc = hyp["nc"] 
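For context on the new `load_dataset.py` above: each whole-night recording is read once into the module-level `datasets` dict (keyed by sample number), and 30 s windows are only sliced out inside `__getitem__` using the SP/EP columns of the generated label CSVs. The following is a stripped-down sketch of that slice-on-demand pattern with toy data; the array length, label rows, and the `get_item` helper are made up for illustration, and the real code additionally applies `segment_augment` to every slice.

```python
# Slice-on-demand sketch of the caching pattern in load_dataset.py (toy data only).
# Label rows mirror the CSV columns: [sampNo, index, label_type, new_label, SP, EP].
import numpy as np

FS = 100                                                      # 100 Hz, as in the pipeline
datasets = {"282": np.random.randn(8 * 3600 * FS)}            # hypothetical 8 h recording
labels = np.array([[282, 0, 3, 0, 120, 150],                  # one OSA window (label_type 3)
                   [282, 10004, 0, 0, 300, 330]], dtype=int)  # one background window

def get_item(index):
    PN, segmentNo, label_type, new_label, SP, EP = labels[index]
    segment = datasets[str(PN)][SP * FS:EP * FS]              # 30 s * 100 Hz = 3000 samples
    return segment, int(label_type > 1)                       # apnea events are label_type 2/3

seg, y = get_item(0)
print(seg.shape, y)   # (3000,) 1
```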
+bs = hyp["batch_size"] +worker = hyp["number_worker"] +select_sampno = hyp["select_sampno"] + +read_dataset(data_path, augment=my_augment) +calc_metrics = CALC_METRICS(nc) + + +# 开始训练 +# 训练 +def model_train(model, train_loader, optimizer, scheduler, loss_func, training_state): + model.train() + train_loss = 0.0 + optimizer.zero_grad() + + pbar = tqdm(enumerate(train_loader), total=len(train_loader), ncols=80) + pbar.set_description(training_state) + for i, (resp, stft, labels) in pbar: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + # 强行归一化数据 + # segments = F.normalize(segments) + # print(segments.size()) + # 减去平均值 + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # 一维卷积在最后一位上卷积 所以输入CNN应为【batch_size, embedding size, sequence size】 + # 所以输入为【batch_size, 1, 3000】 3000 = 30秒 * 100Hz + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + + loss = loss_func(out, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + # 余弦退火传入变量 + # scheduler.step(epoch + i / len(train_loader.dataset)) + # 自适应调整传入变量 + scheduler.step(loss) + + loss_value = loss.item() + train_loss += loss_value + # cur_lr = optimizer.param_groups[-1]['lr'] + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + # if i % 20 == 0: + # pbar.write(calc_metrics.get_matrix(loss=loss_value, cur_lr=cur_lr, epoch=epoch)) + + cur_lr = optimizer.param_groups[-1]['lr'] + train_loss /= len(train_loader) + + calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") + logger.info(training_state) + logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) + calc_metrics.reset() + + +def model_valid(model, valid_loader, wdir, loss_func): + model.eval() + valid_loss = 0.0 + for resp, stft, labels in valid_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + valid_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + valid_loss /= len(valid_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=valid_loss, epoch=epoch, epoch_type="valid")) + global best_f1 + valid_f1 = calc_metrics.metrics[-1].compute() + if valid_f1 > best_f1: + best_f1 = valid_f1 + torch.save(model.state_dict(), wdir / f"best_{epoch}_{str(round(float(valid_f1), 3))}.pt") + torch.save(model.state_dict(), wdir / f"best.pt") + if wandb is not None: + wandb.run.summary["best_f1"] = valid_f1 + calc_metrics.reset() + + +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, stft, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + 
with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + +if __name__ == '__main__': + + try: + import wandb + except ImportError: + wandb = None + prefix = 'wandb: ' + logger.info(f"{prefix}Install Weights & Biases logger with 'pip install wandb'") + + if wandb is not None and wandb.run is None: + wandb_run = wandb.init( + config=hyp, + name=save_dir.stem, + project=hyp["project"], + notes=hyp["Note"], + tags=hyp["tags"], + entity=hyp["entity"], + ) + exam_name = Path("./").absolute().name + + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + summary(model_net, [(1, 120, 1), (1, 1, 121, 26)]) + + time.sleep(3) + if gpu: + model_net.cuda() + + k_folds = 5 + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) + + logger.info('--------------------------------') + for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): + logger.info(f'Start FOLD {fold} / {k_folds}----------------------') + train_set = [select_sampno[i] for i in train_ids] + test_set = [select_sampno[i] for i in test_ids] + logger.info(f'Train_Set:{train_set}') + logger.info(f'Independent_Test_Set:{test_set}') + + sub_save_dir = save_dir / f"KFold_{fold}" + sub_save_dir.mkdir(exist_ok=True, parents=True) + wdir = sub_save_dir / "weights" + wdir.mkdir(exist_ok=True, parents=True) + + hyp["train_set"] = train_set + hyp["test_set"] = test_set + with open(sub_save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + + train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) + valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) + + train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) + valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + + # 重新初始化模型 + del model_net + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + if gpu: + model_net.cuda() + + logger.info(f"Weight is {[train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]}") + # 损失函数与优化器 + loss_function = nn.CrossEntropyLoss( + weight=torch.Tensor([train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]).cuda()) + + # loss_func = nn.BCEWithLogitsLoss() + # loss_func = FocalLoss(class_num=nc, alpha=0.75, size_average="sum") + + # momentum + # nesterov 牛顿动量 + # weight_decay L2正则 + # optimizer = torch.optim.SGD(model_net.parameters(), lr=lr, momentum=0.9, nesterov=True, weight_decay=1e-6) + + optimizer = torch.optim.Adam(model_net.parameters(), lr=lr) + # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=int(hyp["T_max"]), + + scheduler = 
torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, + patience=2836, min_lr=1e-8, + verbose=True) + + # 参数记录 + best_f1 = 0 + + for epoch in range(epochs): + model_train(model_net, train_loader, optimizer, scheduler, loss_function, + f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") + model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) + if wandb is not None: + calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/model/Hybrid_Net001.py b/exam/014/model/Hybrid_Net001.py similarity index 100% rename from model/Hybrid_Net001.py rename to exam/014/model/Hybrid_Net001.py diff --git a/exam/014/my_augment.py b/exam/014/my_augment.py new file mode 100644 index 0000000..3e715c1 --- /dev/null +++ b/exam/014/my_augment.py @@ -0,0 +1,82 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:my_augment.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/07/26 +""" +from utils.Preprocessing import BCG_Operation +import numpy as np +from scipy.signal import stft +from utils.signal_available import high_check + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 + + +def my_process(data, movement): + normal_segments = [[0, movement[0][0]], [movement[-1][-1], len(data)]] + for index in range(len(movement) - 1): + normal_segments.append([movement[index][-1], movement[index + 1][0]]) + + for index in range(len(normal_segments)): + SP, EP = normal_segments[index] + if SP == EP: + # print(index, SP, EP) + continue + std = data[SP:EP].std() + data[SP:EP][data[SP:EP] > std * 1.8] = std * 1.8 + data[SP:EP][data[SP:EP] < std * -1.8] = std * 1.8 + + data[SP:EP] /= std * 1.8 + + for index in range(len(movement)): + SP, EP = movement[index] + data[SP:EP] /= data[SP:EP].max() + + return data + + +def my_augment(dataset): + movement = high_check(dataset) + dataset = my_process(dataset, movement) + dataset = preprocessing.Iirnotch(dataset) + dataset = preprocessing.Butterworth(dataset, "lowpass", low_cut=20, order=6) + + dataset_low = preprocessing.Butterworth(dataset, "lowpass", low_cut=0.7, order=6) + dataset_high = preprocessing.Butterworth(dataset, "highpass", high_cut=1, order=6) + + dataset = {"low": dataset_low, + "high": dataset_high} + return dataset + + +def get_stft(x, fs, n): + print(len(x)) + f, t, amp = stft(x, fs, nperseg=n) + z = np.abs(amp.copy()) + return f, t, z + + +def my_segment_augment(dataset, SP, EP): + dataset_low = dataset["low"][int(SP) * 100:int(EP) * 100].copy() + dataset_high = dataset["high"][int(SP) * 100:int(EP) * 100].copy() + + # Z-SCORE + dataset_low = (dataset_low - dataset_low.mean()) / dataset_low.std() + dataset_high = (dataset_high - dataset_high.mean()) / dataset_high.std() + + dataset_low = dataset_low[::25] + dataset_low = dataset_low.reshape(dataset_low.shape[0], 1) + + _, _, dataset_high = stft(dataset_high, 100, nperseg=50) + dataset_high = dataset_high.astype(np.float).T + dataset_high = dataset_high.reshape(1, dataset_high.shape[0], dataset_high.shape[1]) + + return dataset_low, dataset_high + + +if __name__ == '__main__': + pass diff --git a/exam/014/settings.yaml b/exam/014/settings.yaml new file mode 100644 index 0000000..697a7ed --- /dev/null +++ b/exam/014/settings.yaml @@ -0,0 +1,42 @@ +# environment config +GPU: "1" + + +# dataset config +Path: + dataset: /home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/ + label: ./dataset/ + save: ./output/ + +batch_size: 256 
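As a cross-check on `my_segment_augment` above: decimating the low-pass branch of a 30 s, 100 Hz window by 25 yields a (120, 1) respiration sequence, and `scipy.signal.stft` with `nperseg=50` on the high-pass branch yields 26 frequency bins by 121 frames, i.e. a (1, 121, 26) map — consistent with the `summary(model_net, [(1, 120, 1), (1, 1, 121, 26)])` call in `main.py`. A small shape sketch with synthetic data follows; the magnitude is taken here for clarity, whereas the code above casts the complex STFT to float directly.

```python
# Shape sketch for the two model inputs produced by my_segment_augment
# (synthetic 30 s window at 100 Hz; not the real BCG data).
import numpy as np
from scipy.signal import stft

fs, win_sec = 100, 30
low = np.random.randn(win_sec * fs)     # stand-in for the 0.7 Hz low-pass (respiration) branch
high = np.random.randn(win_sec * fs)    # stand-in for the 1 Hz high-pass (BCG) branch

resp = low[::25].reshape(-1, 1)         # 3000 / 25 = 120 samples -> (120, 1)

f, t, zxx = stft(high, fs, nperseg=50)  # 26 frequency bins, 121 frames for a 3000-sample input
spec = np.abs(zxx).T.reshape(1, len(t), len(f))

print(resp.shape, spec.shape)           # (120, 1) (1, 121, 26)
```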
+number_worker: 0 +model_name: HYBRIDNET001 +select_sampno: + - 282 + - 726 + - 229 + - 582 + - 286 + - 966 + - 1000 + - 1004 + - 1006 + - 1009 + +# train hyperparameters config +epoch: 50 +lr: 0.00001 +nc: 1 + +# wandb config +entity: "marques" +project: "Sleep_Apnea_HYBRID00X" +Note: "HYBRID001 NOBD STFT RESP DUAL" +tags: ["ReduceLROnPlateau", "RESP LSTM", "STFT CNN"] + + +# "CW":class_weight + + +# "CosineAnnealingLR" +# "ReduceLROnPlateau" \ No newline at end of file diff --git a/exam/014/test_save_result.py b/exam/014/test_save_result.py new file mode 100644 index 0000000..f66b9d5 --- /dev/null +++ b/exam/014/test_save_result.py @@ -0,0 +1,478 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:test_analysis.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/21 +""" +import logging +import os +import sys + +import pandas as pd +import torch.cuda +import numpy as np +import yaml +from matplotlib import pyplot as plt +from tqdm import tqdm +from pathlib import Path +from torch.nn import functional as F +from torch.utils.data import DataLoader +from load_dataset import TestApneaDataset2, read_dataset +from utils.Draw_ConfusionMatrix import draw_confusionMatrix +from torch import nn +from utils.calc_metrics import CALC_METRICS +from my_augment import my_augment, my_segment_augment +from model.Hybrid_Net001 import HYBRIDNET001 + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +exam_path = Path("./output/") + +# 置信率阈值 +thresh = 0.5 +# 间隔最小距离 +thresh_event_interval = 2 +# 最小事件长度 +thresh_event_length = 8 + +# +event_thresh = 1 + +severity_path = Path(r"/home/marques/code/marques/apnea/dataset/loc_first_csa.xlsx") +severity_label = {"all": "none"} +severity_df = pd.read_excel(severity_path) +for one_data in severity_df.index: + one_data = severity_df.loc[one_data] + severity_label[str(one_data["数据编号"])] = one_data["程度"] + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" +gpu = torch.cuda.is_available() + +num_classes = 1 +calc_metrics = CALC_METRICS(num_classes) + +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps +data_path = hyp["Path"]["dataset"] +read_dataset(data_path, augment=my_augment) +del hyp + +# 默认取最新的文件夹 +all_output_path, output_path, segments_results_save_path, events_results_save_path, = [None, ] * 4 +my_augment, model_path, label_path, data_path, model, model_name = [None, ] * 6 +train_set, test_set = None, None +loss_func = nn.CrossEntropyLoss() + +columns = ["sampNo", "segmentNo", "label_type", "new_label", "SP", "EP", "pred"] +columns2 = ["sampNo", "severity", "origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", + "acc", "recall", "spec", "pre", "NPV", "F1score", "support"] + +logging.getLogger('matplotlib.font_manager').disabled = True +logging.getLogger('matplotlib.ticker').disabled = True +logger = logging.getLogger() +logger.setLevel(logging.INFO) +ch = logging.StreamHandler() +ch.setLevel(logging.INFO) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) + +if (exam_path / "test.log").exists(): + (exam_path / "test.log").unlink() +fh = logging.FileHandler(exam_path / "test.log", mode='a') +fh.setLevel(logging.INFO) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) +logger.info("------------------------------------") + + +def set_environment(i): + global output_path, segments_results_save_path, events_results_save_path, model_path, label_path, data_path, \ + model, model_name, train_set, 
test_set + + output_path = all_output_path[i] + logger.info(output_path) + segments_results_save_path = (output_path / "segments_results") + segments_results_save_path.mkdir(exist_ok=True) + events_results_save_path = (output_path / "events_results") + events_results_save_path.mkdir(exist_ok=True) + + # 加载配置 + with open(output_path / "settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps + data_path = hyp["Path"]["dataset"] + label_path = hyp["Path"]["label"] + train_set = hyp["train_set"] + test_set = hyp["test_set"] + + model_path = output_path / "weights" / "best.pt" + model = eval(hyp["model_name"])() + model_name = hyp["model_name"] + model.load_state_dict(torch.load(model_path)) + model.cuda() + model.eval() + + +def test_and_analysis_and_visual(dataset_type): + if dataset_type == "test": + sampNo = train_set + elif dataset_type == "all_test": + sampNo = test_set + else: + sampNo = None + logger.info("出错了") + + exam_name = Path("./").absolute().name + + test_dataset = TestApneaDataset2(data_path, label_path, select_sampno=sampNo, dataset_type=dataset_type, + segment_augment=my_segment_augment) + test_loader = DataLoader(test_dataset, batch_size=128, pin_memory=True, num_workers=0) + + test_loss = 0.0 + + df_segment = pd.DataFrame(columns=columns) + + for one in tqdm(test_loader, total=len(test_loader)): + resp, stft, labels = one[:3] + other_info = one[3:] + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + out = model(resp, stft) + + loss = loss_func(out, labels) + + test_loss += loss.item() + + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + + calc_metrics.update(out.cpu(), labels.cpu()) + # one[0] = list(one[0].cpu().numpy()) + # one[1] = list(one[1].cpu().numpy()) + # one = one[1:] + # out = out.view(1, -1).cpu().numpy().tolist() + # one += out + # result_record += [i for i in list(np.array(one, dtype=object).transpose(1, 0))] + + one2 = np.array([i.cpu().numpy() for i in (other_info + [out.squeeze()])]) + one2 = one2.transpose((1, 0)) + df = pd.DataFrame(data=one2, columns=columns) + df_segment = df_segment.append(df, ignore_index=True) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(f"EXAM_NAME: {exam_name} SampNO: {sampNo}") + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=0, epoch_type="test")) + calc_metrics.reset() + + df_segment["thresh_label"] = 1 * (df_segment["label_type"] > event_thresh).copy() + df_segment["thresh_Pred"] = 1 * (df_segment["pred"] > thresh).copy() + df_segment["pred"] = df_segment["pred"].copy().apply(lambda x: round(x, 3)) + + # 片段级分析 + df_segment_metrics = analysis_results(df_segment, segments_results_save_path, dataset_type) + + # 绘制混淆矩阵 + # 每个样本都绘制一份 + confusionMatrix(df_segment_metrics, segments_results_save_path, dataset_type) + # 绘制柱状图 + + # 事件级分析 + # 对于inner_test 每个编号就是一个事件 + # 而对于整晚的independence_test,需要另行计算 + df_all_event = segment_to_event(df_segment, dataset_type) + df_event_metrics = analysis_results(df_all_event, events_results_save_path, dataset_type, is_event=True) + confusionMatrix(df_event_metrics, events_results_save_path, dataset_type) + + # 剔除质量不好的样本 + df_bad_segment = df_segment[ + (df_segment["label_type"].isin([2, 3])) & (df_segment["new_label"] == 2)] + df_select_segment = df_segment.drop(df_bad_segment.index) + df_select_segment_metrics = analysis_results(df_select_segment, 
segments_results_save_path / "remove_2", + dataset_type) + df_select_event = segment_to_event(df_select_segment, dataset_type) + df_event_metrics = analysis_results(df_select_event, events_results_save_path / "remove_2", dataset_type, + is_event=True) + + +def analysis_results(df_result, base_path, dataset_type, is_event=False): + if df_result.empty: + logger.info(base_path, dataset_type, "is_empty") + return None + + (base_path / dataset_type).mkdir(exist_ok=True, parents=True) + + all_sampNo = df_result["sampNo"].unique() + df_metrics = pd.DataFrame(columns=columns2) + df_metrics.loc[0] = 0 + df_metrics.loc[0]["sampNo"] = dataset_type + + for index, sampNo in enumerate(all_sampNo): + df = df_result[df_result["sampNo"] == sampNo] + df.to_csv( + base_path / dataset_type / + f"{int(sampNo)}_{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_result.csv", + index=False) + + df_metrics.loc[index + 1] = np.NAN + df_metrics.loc[index + 1]["sampNo"] = str(int(sampNo)) + df_metrics.loc[index + 1]["support"] = df.shape[0] + df_metrics.loc[index + 1]["severity"] = severity_label[str(int(sampNo))] + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # continue + # else: + df_metrics.loc[index + 1]["origin_P"] = df[df["thresh_label"] == 1].shape[0] + df_metrics.loc[index + 1]["origin_N"] = df[df["thresh_label"] == 0].shape[0] + df_metrics.loc[index + 1]["pred_P"] = df[df["thresh_Pred"] == 1].shape[0] + df_metrics.loc[index + 1]["pred_N"] = df[df["thresh_Pred"] == 0].shape[0] + df_metrics.loc[index + 1]["T"] = df[df["thresh_Pred"] == df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["F"] = df[df["thresh_Pred"] != df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["TP"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["FP"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["TN"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + df_metrics.loc[index + 1]["FN"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + + df_metrics.loc[0]["origin_P"] += df_metrics.loc[index + 1]["origin_P"] + df_metrics.loc[0]["origin_N"] += df_metrics.loc[index + 1]["origin_N"] + df_metrics.loc[0]["pred_P"] += df_metrics.loc[index + 1]["pred_P"] + df_metrics.loc[0]["pred_N"] += df_metrics.loc[index + 1]["pred_N"] + df_metrics.loc[0]["T"] += df_metrics.loc[index + 1]["T"] + df_metrics.loc[0]["F"] += df_metrics.loc[index + 1]["F"] + df_metrics.loc[0]["TP"] += df_metrics.loc[index + 1]["TP"] + df_metrics.loc[0]["FP"] += df_metrics.loc[index + 1]["FP"] + df_metrics.loc[0]["TN"] += df_metrics.loc[index + 1]["TN"] + df_metrics.loc[0]["FN"] += df_metrics.loc[index + 1]["FN"] + df_metrics.loc[0]["support"] += df_metrics.loc[index + 1]["support"] + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[index + 1][col] = df_metrics.loc[index + 1][col] if df_metrics.loc[index + 1][ + col] != 0 else np.NAN + + df_metrics.loc[index + 1]["acc"] = df_metrics.iloc[index + 1]["T"] / df_metrics.iloc[index + 1]["support"] + df_metrics.loc[index + 1]["recall"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["origin_P"] + df_metrics.loc[index + 1]["spec"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["origin_N"] + df_metrics.loc[index + 1]["pre"] = df_metrics.iloc[index + 1]["TP"] / 
df_metrics.iloc[index + 1]["pred_P"] + df_metrics.loc[index + 1]["NPV"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["pred_N"] + df_metrics.loc[index + 1]["F1score"] = 2 * df_metrics.iloc[index + 1]["recall"] * df_metrics.iloc[index + 1][ + "pre"] / (df_metrics.iloc[index + 1]["recall"] + df_metrics.iloc[index + 1]["pre"]) + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[index + 1][col] = 0 if pd.isna(df_metrics.loc[index + 1][col]) else \ + df_metrics.loc[index + 1][col] + df_metrics.loc[index + 1][col] = round(df_metrics.loc[index + 1][col], 3) + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # return None + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[0][col] = df_metrics.loc[0][col] if df_metrics.loc[0][col] != 0 else np.NAN + + df_metrics.loc[0]["acc"] = df_metrics.iloc[0]["T"] / df_metrics.iloc[0]["support"] + df_metrics.loc[0]["recall"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["origin_P"] + df_metrics.loc[0]["spec"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["origin_N"] + df_metrics.loc[0]["pre"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["pred_P"] + df_metrics.loc[0]["NPV"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["pred_N"] + df_metrics.loc[0]["F1score"] = 2 * df_metrics.iloc[0]["recall"] * df_metrics.iloc[0]["pre"] / ( + df_metrics.iloc[0]["recall"] + df_metrics.iloc[0]["pre"]) + for col in ["TP", "TN", "FP", "FN", "acc", "recall", "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[0][col] = 0 if pd.isna(df_metrics.loc[0][col]) else df_metrics.loc[0][col] + df_metrics.loc[0][col] = round(df_metrics.loc[0][col], 3) + + # 在inner_test中根据 分严重程度绘制 + if dataset_type == "test": + all_severity = ["正常", "轻度", "中度", "重度"] + for index, severity in enumerate(all_severity): + df_event = df_metrics[df_metrics["severity"] == severity] + df_temp = pd.DataFrame(columns=columns2) + df_temp.loc[0] = 0 + df_temp.loc[0]["sampNo"] = severity + df_temp.loc[0]["severity"] = str(index + 1) + + df_temp.loc[0]["origin_P"] += df_event["origin_P"].sum() + df_temp.loc[0]["origin_N"] += df_event["origin_N"].sum() + df_temp.loc[0]["pred_P"] += df_event["pred_P"].sum() + df_temp.loc[0]["pred_N"] += df_event["pred_N"].sum() + df_temp.loc[0]["T"] += df_event["T"].sum() + df_temp.loc[0]["F"] += df_event["F"].sum() + df_temp.loc[0]["TP"] += df_event["TP"].sum() + df_temp.loc[0]["FP"] += df_event["FP"].sum() + df_temp.loc[0]["TN"] += df_event["TN"].sum() + df_temp.loc[0]["FN"] += df_event["FN"].sum() + df_temp.loc[0]["support"] += df_event["support"].sum() + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_temp.loc[0][col] = df_temp.loc[0][col] if df_temp.loc[0][col] != 0 else np.NAN + + df_temp.loc[0]["acc"] = df_temp.iloc[0]["T"] / df_temp.iloc[0]["support"] + df_temp.loc[0]["recall"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["origin_P"] + df_temp.loc[0]["spec"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["origin_N"] + df_temp.loc[0]["pre"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["pred_P"] + df_temp.loc[0]["NPV"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["pred_N"] + df_temp.loc[0]["F1score"] = 2 * df_temp.iloc[0]["recall"] * df_temp.iloc[0]["pre"] / ( + df_temp.iloc[0]["recall"] + df_temp.iloc[0]["pre"]) + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + 
"spec", "pre", "NPV", "F1score"]: + df_temp.loc[0][col] = 0 if pd.isna(df_temp.loc[0][col]) else df_temp.loc[0][col] + df_temp.loc[0][col] = round(df_temp.loc[0][col], 3) + + df_metrics = df_metrics.append(df_temp, ignore_index=True) + + df_backup = df_metrics + df_metrics = df_metrics.astype("str") + df_metrics = df_metrics.sort_values("severity") + df_metrics.to_csv(base_path / dataset_type / + f"{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_all_metrics.csv", + index=False, encoding="gbk") + + return df_backup + + +def confusionMatrix(df_analysis, base_path, dataset_type): + if df_analysis is None: + logger.info(base_path, dataset_type, "is None") + return + + if df_analysis.empty: + logger.info(base_path, dataset_type, "is_empty") + return + classes = ["normal", "SA"] + (base_path / dataset_type / "confusionMatrix").mkdir(exist_ok=True, parents=True) + for one_samp in df_analysis.index: + one_samp = df_analysis.loc[one_samp] + cm = np.array([[one_samp["TN"], one_samp["FP"]], [one_samp["FN"], one_samp["TP"]]]) + draw_confusionMatrix(cm, classes=classes, title=str(one_samp["severity"]) + " " + one_samp["sampNo"], + save_path=base_path / dataset_type / "confusionMatrix" / f"{one_samp['sampNo']}.jpg") + + +def segment_to_event(df_segment, dataset_type): + df_all_event = pd.DataFrame(columns=columns) + all_sampNo = df_segment["sampNo"].unique() + + if dataset_type == "test": + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + all_segments_no = df["segmentNo"].unique() + + for index_se, segment_No in enumerate(all_segments_no): + df_temp = df[df["segmentNo"] == segment_No].copy() + SP = df_temp.iloc[0]["EP"] + EP = df_temp.iloc[-1]["EP"] + 1 + df_event.loc[index_se] = [int(sampNo), segment_No, df_temp.iloc[0]["label_type"], + df_temp.iloc[0]["new_label"], SP, EP, 0] + + thresh_Pred = df_temp["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + for i in value_ar: + if i > thresh_event_length: + df_event.iloc[index_se]["pred"] = 1 + + # df_event.to_csv(events_results / dataset_type / f"{int(sampNo)}_event_results.csv", index=False, + # encoding="gbk") + df_all_event = df_all_event.append(df_event, ignore_index=True) + else: + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + thresh_Pred = df["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + 
index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + + for value_index, value in enumerate(value_ar): + SP = index_ar[value_index] + EP = index_ar[value_index] + count_ar[value_index] + # TP, FP + if value > thresh_event_length: + + # label_type = 1 if thresh_Pred2[SP:EP].sum() > 0 else 0 + label_type = df["label_type"][SP:EP].max() + new_label = df["new_label"][SP:EP].max() + df_event = df_event.append(pd.DataFrame([[int(sampNo), SP // 30, label_type, new_label, + SP, EP, thresh_Pred2[SP]]], columns=columns), + ignore_index=True) + # if value > 30: + # logger.info([int(sampNo), SP // 30, label_type, new_label, SP, EP, thresh_Pred2[SP]]) + # 长度不够 + else: + df["thresh_Pred"][SP:EP] = 0 + + # 对负样本进行统计 + for segment_no in df["segmentNo"].unique(): + df_temp = df[df["segmentNo"] == segment_no] + if df_temp["thresh_Pred"].sum() > 0: + continue + + df_event = df_event.append(pd.DataFrame( + [[int(sampNo), segment_no, df_temp["label_type"].max(), df_temp["new_label"].max(), segment_no * 30, + (segment_no + 1) * 30, 0]], columns=columns), + ignore_index=True) + + df_all_event = df_all_event.append(df_event, ignore_index=True) + + df_temp = df_all_event.loc[:, ["label_type", "pred"]] + df_all_event["thresh_label"] = 1 * (df_temp["label_type"] > event_thresh) + df_all_event["thresh_Pred"] = 1 * (df_temp["pred"] > thresh) + return df_all_event + + +# 分sampNo保存结果,并不重合地可视化 +# inner_test + +# 分sampNo将与标签不一致的另行保存,并不重合地可视化 + +# import shap +# explainer = shap.TreeExplainer() +# shap_values = explainer.shap_values() + +if __name__ == '__main__': + all_output_path = list(exam_path.rglob("KFold_*")) + for exam_index, test_exam_path in enumerate(all_output_path): + # test_exam_path = exam_path / test_exam_path + set_environment(exam_index) + test_and_analysis_and_visual(dataset_type="test") + test_and_analysis_and_visual(dataset_type="all_test") diff --git a/exam/014/utils/Draw_ConfusionMatrix.py b/exam/014/utils/Draw_ConfusionMatrix.py new file mode 100644 index 0000000..591af60 --- /dev/null +++ b/exam/014/utils/Draw_ConfusionMatrix.py @@ -0,0 +1,46 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:Draw_ConfusionMatrix.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/08/10 +""" +import numpy as np +from matplotlib import pyplot as plt + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 + + +def draw_confusionMatrix(cm, classes, title, save_path, cmap=plt.cm.Blues): + fig_cm, ax = plt.subplots(figsize=(8, 8), dpi=120) + im = ax.imshow(cm, interpolation='nearest', cmap=cmap) + ax.figure.colorbar(im, ax=ax) + ax.set(xticks=np.arange(cm.shape[1]), + yticks=np.arange(cm.shape[0]), + xticklabels=classes, yticklabels=classes, + title=title, + ylabel='True label', + xlabel='Predicted label') + ax.set_ylim(len(classes) - 0.5, -0.5) + + # Rotate the tick labels and set their alignment. 
+ plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") + normalize = False + fmt = '.2f' if normalize else 'd' + thresh = cm.max() * 0.8 + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(cm[i, j], fmt), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black") + fig_cm.tight_layout() + fig_cm.savefig(save_path) + plt.close() + # + + +if __name__ == '__main__': + pass diff --git a/exam/014/utils/Preprocessing.py b/exam/014/utils/Preprocessing.py new file mode 100644 index 0000000..e3b60c4 --- /dev/null +++ b/exam/014/utils/Preprocessing.py @@ -0,0 +1,181 @@ +# encoding:utf-8 + +""" +@ date: 2020-09-16 +@ author: jingxian +@ illustration: Pre-processing +""" + + +import sys +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import pywt +from scipy import signal +from scipy import fftpack + + +def Dilate(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.min(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def Eorde(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.max(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def fin_turn(data, peak): + if len(data) == 0 or len(peak) == 0: return peak + return_peak = [] + for p in peak: + minx, maxx = max(0, p - 100), min(len(data), p + 100) + return_peak.append(minx + np.argmax(data[minx: maxx])) + return return_peak + + +class BCG_Operation(): + def __init__(self, sample_rate=1000): + self.sample_rate = sample_rate + + def down_sample(self, data=None, down_radio=10): + if data is None: + raise ValueError("data is None, please given an real value!") + data = data[:len(data) // down_radio * down_radio].reshape(-1, down_radio)[:, 0] + self.sample_rate = self.sample_rate / down_radio + return data + + def Splitwin(self, data=None, len_win=None, coverage=1.0, calculate_to_end=False): + """ + 分窗 + :param len_win: length of window + :return: signal windows + """ + if (len_win is None) or (data is None): + raise ValueError("length of window or data is None, please given an real value!") + else: + length = len_win * self.sample_rate # number point of a window + # step of split windows + step = length * coverage + start = 0 + Splitdata = [] + while (len(data) - start >= length): + Splitdata.append(data[int(start):int(start + length)]) + start += step + if calculate_to_end and (len(data) - start > 2000): + remain = len(data) - start + start = start - step + step = int(remain / 2000) + start = start + step * 2000 + Splitdata.append(data[int(start):int(start + length)]) + return np.array(Splitdata), step + elif calculate_to_end: + return np.array(Splitdata), 0 + else: + return np.array(Splitdata) + + def Butterworth(self, data, type, low_cut=0.0, high_cut=0.0, order=10): + """ + :param type: Type of Butter. filter, lowpass, bandpass, ... 
+ :param lowcut: Low cutoff frequency + :param highcut: High cutoff frequency + :param order: Order of filter + :return: Signal after filtering + """ + if type == "lowpass": # 低通滤波处理 + b, a = signal.butter(order, low_cut / (self.sample_rate * 0.5), btype='lowpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "bandpass": # 带通滤波处理 + low = low_cut / (self.sample_rate * 0.5) + high = high_cut / (self.sample_rate * 0.5) + b, a = signal.butter(order, [low, high], btype='bandpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "highpass": # 高通滤波处理 + b, a = signal.butter(order, high_cut / (self.sample_rate * 0.5), btype='highpass') + return signal.filtfilt(b, a, np.array(data)) + else: # 警告,滤波器类型必须有 + raise ValueError("Please choose a type of fliter") + + def MorphologicalFilter(self, data=None, M=200, get_bre=False): + """ + :param data: Input signal + :param M: Length of structural element + :return: Signal after filter + """ + if not data.any(): + raise ValueError("The input data is None, please given real value data") + g = np.ones(M) + Data_pre = np.insert(data, 0, np.zeros(M)) + Data_pre = np.insert(Data_pre, -1, np.zeros(M)) + # Opening: 腐蚀 + 膨胀 + out1 = Eorde(Data_pre, len(Data_pre), g, M) + out2 = Dilate(out1, len(out1), g, M) + out2 = np.insert(out2, 0, np.zeros(M - 2)) + # Closing: 膨胀 + 腐蚀 + out5 = Dilate(Data_pre, len(Data_pre), g, M) + out6 = Eorde(out5, len(out5), g, M) + out6 = np.insert(out6, 0, np.zeros(M - 2)) + + baseline = (out2 + out6) / 2 + # -------------------------保留剩余价值------------------------ + data_filtered = Data_pre[:len(baseline)] - baseline + data_filtered = data_filtered[M: M + len(data)] + baseline = baseline[M:] + data_filtered[-1] = data_filtered[-2] = data_filtered[-3] + baseline[-1] = baseline[-2] = baseline[-3] + if get_bre: + return data_filtered, baseline + else: + return data_filtered + + def Iirnotch(self, data=None, cut_fre=50, quality=3): + """陷波器""" + b, a = signal.iirnotch(cut_fre / (self.sample_rate * 0.5), quality) + return signal.filtfilt(b, a, np.array(data)) + + def ChebyFilter(self, data, rp=1, type=None, low_cut=0, high_cut=0, order=10): + """ + 切比雪夫滤波器 + :param data: Input signal + :param rp: The maximum ripple allowed + :param type: 'lowpass', 'bandpass, 'highpass' + :param low_cut: Low cut-off fre + :param high_cut: High cut-off fre + :param order: The order of filter + :return: Signal after filter + """ + if type == 'lowpass': + b, a = signal.cheby1(order, rp, low_cut, btype='lowpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif type == 'bandpass': + b, a = signal.cheby1(order, rp, [low_cut, high_cut], btype='bandpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif type == 'highpass': + b, a = signal.cheby1(order, rp, high_cut, btype='highpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + else: + raise ValueError("The type of filter is None, please given the real value!") + + def Envelope(self, data): + """取信号包络""" + if len(data) <= 1: raise ValueError("Wrong input data") + hx = fftpack.hilbert(data) + return np.sqrt(hx ** 2, data ** 2) + + def wavelet_trans(self, data,c_level=['aaa','aad'], wavelet='db4', mode='symmetric',maxlevel=10): + wp = pywt.WaveletPacket(data=data, wavelet=wavelet, mode=mode, maxlevel=maxlevel) + new_wp = pywt.WaveletPacket(data=None, wavelet=wavelet, mode=mode) + for c in c_level : + new_wp[c] = wp[c] + return new_wp.reconstruct() + + # def em_decomposition(self, data): + # from pyhht.emd import 
EMD + # return EMD(data).decompose() + diff --git a/exam/014/utils/calc_metrics.py b/exam/014/utils/calc_metrics.py new file mode 100644 index 0000000..dcf78cc --- /dev/null +++ b/exam/014/utils/calc_metrics.py @@ -0,0 +1,84 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:calc_metrics.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/12 +""" +import torch +import torchmetrics + + +class CALC_METRICS: + metrics = [] + nc = 0 + + def __init__(self, nc): + self.nc = nc + self.metrics.append(torchmetrics.Accuracy(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Recall(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Precision(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Specificity(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.F1Score(average="none", num_classes=nc, multiclass=False)) + self.valid_result = self.train_result = None + + def update(self, pred, target): + for part1 in self.metrics: + part1.update(pred.cpu(), target.cpu()) + + def compute(self): + result = [] + for part1 in self.metrics: + result.append(part1.compute()) + + def reset(self): + for part1 in self.metrics: + part1.reset() + + def get_matrix(self, loss=None, cur_lr=None, epoch=None, epoch_type=None): + temp_result = [] + for j in self.metrics: + compute_result = (j.compute().cpu().numpy() * 100).tolist() + temp_result.append(compute_result) + + if epoch_type == "train": + self.train_result = [loss] + temp_result + elif epoch_type == "valid": + self.valid_result = [loss] + temp_result + else: + pass + + a = "" + a += f"{epoch_type} epoch: {str(epoch)} loss: {str(loss)} lr: {str(cur_lr)} \n" + a += " " * 8 + "Acc".center(8) + "Rec".center(8) + "Pre".center(8) + "Spe".center(8) + "F1".center(8) + "\n" + a += "all".center(8) + "".join([str(round(float(i), 2)).center(8) for i in temp_result]) + "\n" + return a + + def wandb_log(self, wandb=None, cur_lr=None): + if wandb is None: + return + + keyword = ["Accuracy", "Recall", "Precision", "Specificity", "F1Score"] + dict_key = [] + for epoch_type in ["train", "valid"]: + dict_key.append(epoch_type + "/" + "loss") + for i in keyword: + dict_key.append(epoch_type + "/" + i) + + log_dict = dict(zip(dict_key, self.train_result + self.valid_result)) + log_dict["lr"] = cur_lr + wandb.log(log_dict) + + +if __name__ == '__main__': + # pred = [[0.1], [0.2], [0.3], [0.4], [0.5], [0.6], [0.7], [0.8], [0.9], [1.0]] + # true = [[0], [0], [1], [0], [0], [0], [0], [0], [0], [1]] + pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] + true = [0, 0, 1, 0, 0, 0, 0, 0, 0, 1] + pred = torch.tensor(pred).cuda() + true = torch.tensor(true).cuda() + calc_metrics = CALC_METRICS(1) + calc_metrics.update(pred, true) + print(calc_metrics.get_matrix()) diff --git a/exam/021/generate_label_14.py b/exam/021/generate_label_14.py new file mode 100644 index 0000000..cc5f96d --- /dev/null +++ b/exam/021/generate_label_14.py @@ -0,0 +1,561 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:generate_label_11.0.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/05 +""" +# 14.0 +# 手动均衡数量 + +# 13.0 +# 限制选择部分数据集,先做测试 + +# 12.0 +# 置不可用事件的片段为上限,不可用片段设置为背景,不记录事件 + +# 10.0 +# 使用提出质量差的信号 + +# 9.0 +# 增加 最新的质量标签 未使用 + +# 8.0 +# 生成 除低通气所有事件标签 + +# 尝试过步进两秒 会造成不足两秒的数据被抛弃,造成较多误判,但是可以考虑囊括这部分 +# 采用 30秒数据 移动 1秒 将所有呼吸暂停标注为1 低通气为0 正常为0 + +# 预处理操作 为 50Hz陷波滤波器去工频 外加 
20Hz的低通滤波器 这个20Hz要看BCG信号的频谱范围 + +# 先提剔除极端值 +# 数值大于最高基准线或最低基准线 +# type1: average:1800 low:1200 high:2400 +# type2: average:2400 low:1800 high:3000 + +# 过多片段会造成平均值偏移 +# TODO +# 加入体动标签,计算除体动外的平均值 + +# 最后降采为100hz + +import time +import logging +import numpy as np +import pandas as pd +from pathlib import Path + +from datetime import datetime + +import yaml +from pathos import multiprocessing +from tqdm import tqdm + +# 数据集 和 标签 位置 +bcg_numpy_data_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/") +bcg_label_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_label_0616/") + +# BCG 记录开始时间 +bcg_start_time = np.loadtxt(Path(r"/home/marques/code/marques/apnea/dataset/start_time.csv"), delimiter=', ', + dtype=object) +bcg_start_time = dict(zip(bcg_start_time[:, 0], bcg_start_time[:, 1])) + +# 读取每个数据集路径 +all_numpy_dataset = list(bcg_numpy_data_path.rglob("*.npy")) +all_numpy_dataset.sort() + +# 划分后的数据集保存路径 +# dataset_save_path = Path(r"/home/marques/code/marques/apnea/dataset/dataset/dataset0623_300_30_30/") +dataset_save_path = Path(r"./dataset/") +dataset_save_path.mkdir(exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) +fh = logging.FileHandler(dataset_save_path / (realtime + ".log"), mode='w') +fh.setLevel(logging.NOTSET) +# fh.setFormatter(logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(ch) + +# all_label = [] +# 输出统计数据标题栏 +# logger.info("sampNo".center(8) + +# "hpy_num".center(8) + "hpy_time".center(10) + +# "csa_num".center(8) + "csa_time".center(10) + +# "osa_num".center(8) + "osa_time".center(10) + +# "msa_num".center(8) + "msa_time".center(10) +# ) + +logger.info("sampNo".center(8) + ',' + + "train_num".center(10) + ',' + "train_P".center(10) + ',' + "train_N".center(10) + ',' + + "valid_num".center(10) + ',' + "valid_P".center(10) + ',' + "valid_N".center(10) + ',' + + "test_num".center(10) + ',' + "test_P".center(10) + ',' + "test_N".center(10) + ',' + + "train_eve".center(10) + ',' + "valid_eve".center(10) + ',' + "test_eve".center(10) + ) + +base_random_seed = 42 + +window_second = 30 +step_second = 1 + +valid_ratio = 0.1 +test_ratio = 0.1 + +normal_event_quality_label = 0 +# valid_ratio = 5000 +# test_ratio = 10000 + +assert ((valid_ratio + test_ratio) < 1 and 0 < valid_ratio < 1 and 0 < test_ratio < 1) or ( + valid_ratio > 1 and valid_ratio > 1), "验证集与测试集输入应同时为比例或数量" + +# dataset sampNo for test +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) +select_dataset = hyp["select_sampno"] + +# 需要置成0的片段,前面不一定补零,还有可能上万 +disable_segment = { + '221': [[0, 10000]], + '670': [[0, 20000]], + '683': [[0, 20000]], + '704': [[0, 26000]], + '726': [[0, 20000]], + '736': [[0, 47000]], + '933': [[0, 773560]], + '935': [[0, 26600]], + '952': [[0, 17000]], + '955': [[0, 78000]], + '961': [[0, 107000]], + '962': [[0, 15100]], + '966': [[0, 13120]], + '967': [[0, 44000]], + '1006': [[0, 60000]], + '1009': [[0, 1000]], + '1010': [[0, 49000]], + '1296': [[0, 27000]], + '1300': [[0, 33800]], + '1301': [[0, 14000]], + '1302': [[0, 5600]], + '1374': [[0, 1000]], + '1478': [[0, 998000]], + +} + + +# 生成数据集主函数 +def generate_label(No, dataset_path): + """ + :param dataset_path: 数据集路径 + :return: + 
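+        None; as a side effect the function writes per-sample label CSVs
+        ({sampNo}_{step_second}s_focal_{window_second}s_sa_all/train/valid/test_label.csv,
+        columns: sampNo, index, label_type, new_label, SP, EP) into dataset_save_path
+        and fills row No of the global summary table df2.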
""" + + # 获取数据编号 + sampNo = dataset_path.stem.split("samp")[0] + # 标签路径 + label_path = bcg_label_path / f"export{sampNo}_all.csv" + + if not label_path.exists(): + raise FileNotFoundError(f"{label_path} not exist") + + if not dataset_path.exists(): + raise Exception(f"{dataset_path} not exists") + + # 加载数据集 + select_numpy = np.load(dataset_path) + select_numpy_len = len(select_numpy) + + # 开头不合理片段置零 + if sampNo in disable_segment.keys(): + for sp, ep in disable_segment[sampNo]: + select_numpy[sp:ep] = 0 + + # 剔除质量差信号 + if sampNo == "670": + select_numpy = select_numpy[:17195 * 100] + + # 获取前面补了多少0 + not_zero_point = 0 + for num in select_numpy: + if num > 10: + break + not_zero_point += 1 + not_zero_point //= 100 + + # 读取标签 + label_csv = pd.read_csv(label_path, encoding='gbk') + label_csv["new_start"] = label_csv["new_start"].astype("int") + label_csv["new_end"] = label_csv["new_end"].astype("int") + label_csv["Duration"] = label_csv["Duration"].astype("int") + label_csv["new_label"] = label_csv["new_label"].fillna("2") + label_csv["new_label"] = label_csv["new_label"].astype("int") + # 剔除质量不好的样本 + # drop_csv = label_csv[ + # (label_csv["Event type"].isin(["Central apnea", "Obstructive apnea"])) & (label_csv["new_label"] == 2)] + # label_csv = label_csv.drop(drop_csv.index) + + # 事件片段与背景片段, 每个背景长度均为设定窗长 + segment_labels = [] + negative_labels = [] + hpy_num = csa_num = osa_num = msa_num = 0 + hpy_time = csa_time = osa_time = msa_time = 0 + + # 遍历全部事件并统计 + for i in range(len(label_csv)): + # 进行LabelEncoder + label = label_csv.iloc[i, :] + # 如果事件在补零片段,则不添加到事件列表 + if label["new_end"] < not_zero_point: + continue + + if sampNo == "670" and label["new_start"] > 17195: + continue + + if label["new_end"] - label["new_start"] < 10: + continue + + # 将事件添加到事件列表 + if label["Event type"] == "Hypopnea": + label_type = 1 + hpy_num += 1 + hpy_time += label["new_end"] - label["new_start"] + + # 将低通气添加到背景 好像不用专门加入到负样本事件中? 
+ # negative_labels.append( + # [sampNo, i, label_type, normal_event_quality_label, label["new_start"], label["new_end"]]) + continue + elif label["Event type"] == "Central apnea": + label_type = 2 + csa_num += 1 + csa_time += label["new_end"] - label["new_start"] + elif label["Event type"] == "Obstructive apnea": + label_type = 3 + osa_num += 1 + osa_time += label["new_end"] - label["new_start"] + # MSA 认为是OSA + elif label["Event type"] == "Mixed apnea": + label_type = 3 + msa_num += 1 + msa_time += label["new_end"] - label["new_start"] + else: + continue + # label_type = 0 + if label["new_end"] - label["new_start"] > label["Duration"] + 20: + print(sampNo, label) + + # 格式为 样本编号 第几个事件 标签 开始事件 结束事件 + segment_labels.append([sampNo, i, label_type, label["new_label"], label["new_start"], label["new_end"]]) + + # logger.info(sampNo.center(8) + + # str(hpy_num).center(8) + str(hpy_time).center(10) + + # str(csa_num).center(8) + str(csa_time).center(10) + + # str(osa_num).center(8) + str(osa_time).center(10) + + # str(msa_num).center(8) + str(msa_time).center(10)) + + # 设置随机树种子 + random_seed = base_random_seed + int(sampNo) + + # 对于无事件的样本,直接将所有的片段 + if len(segment_labels) == 0: + # 剔除补零片段(把开始点移动到补零结束) + normal_SP = not_zero_point + # 开头至少满足一个窗长 + if normal_SP < window_second: + normal_SP = window_second + + # 结束点为样本总长 除以 采样率 + normal_EP = select_numpy_len // 100 + label_type = 0 + # 正常时间编号为秒数 除以 30,即epoch + negative_labels += [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, + SP1 + window_second] for SP1 in + range(normal_SP - window_second + step_second, normal_EP - window_second + step_second, + window_second)] + + # 对于有事件的样本 + # 遍历事件,获取事件之间的背景片段 + for index in range(len(segment_labels) + 1): + # 前一个事件的结尾 与 下一事件开头即为背景 + # 对于开头的无事件片段,则设定开始事件为0 + if index == 0: + normal_SP = 0 + # 非开头片段,开始点为上一个事件的结尾 + else: + # 加一秒 作为 缓冲 + normal_SP = segment_labels[index - 1][-1] + 1 + + # 最后一个事件则取到样本片段结尾 + if index == len(segment_labels): + normal_EP = select_numpy_len // 100 - window_second + # 否则结束事件取 本事件开头 + else: + # 减一秒 作为 缓冲 + normal_EP = segment_labels[index][-2] - 1 + + # 剔除包含开头补零的片段 + if normal_EP < not_zero_point: + continue + + # 剔除开头不足30s的正常片段 + if normal_SP < window_second: + continue + label_type = 0 + + # 将背景事件按照滑窗距离逐个加入到背景事件中 + temp_1 = [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, SP1 + window_second] + for SP1 in range(normal_SP - window_second, normal_EP - window_second, window_second)] + + negative_labels += temp_1 + + train_label, valid_label, test_label = [], [], [] + # assert (valid_ratio + test_ratio) < 1 <= len(segment_labels), f"{sampNo}训练集与测试集数量应小于总数据集数量{len(segment_labels)}" + + # 对于测试数据全部直接保存 + if int(sampNo) in select_dataset: + event_label = np.zeros(select_numpy_len // 100) + quality_label = np.zeros(select_numpy_len // 100) + # 用于存储事件标签 + for PN, segmentNo, label_type, new_label, SP, EP in segment_labels: + event_label[SP:EP] = label_type + + test_label = [] + # 剔除补零片段 + normal_SP = not_zero_point + if normal_SP < window_second: + normal_SP = window_second + normal_EP = select_numpy_len // 100 + + # 分成指定窗长的滑窗片段 + test_label += [ + [sampNo, SP1 // 30, int(event_label[SP1 + window_second - step_second]), + int(quality_label[SP1 + window_second - step_second]), + SP1, SP1 + window_second] for SP1 in range(normal_SP - window_second + step_second, + normal_EP - window_second + step_second, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + 
',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center(10) + + ',' + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + str(len(segment_labels)).center(10) + ) + + df2.loc[No] = [sampNo, + str(0), str(0), str(0), + str(0), str(0), str(0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(0), str(0), str(len(segment_labels))] + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_all_{window_second}s_sa_test2_label.npy", + # np.array(test_label)) + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_all_label.csv", + index=False) + + train_label, valid_label, test_label = [], [], [] + # 对于训练与验证集样本 + if True: + # 打乱片段顺序 + np.random.seed(random_seed) + np.random.shuffle(segment_labels) + np.random.shuffle(negative_labels) + + # 获取训练集、验证集、测试集分到事件个数 + if 0 < valid_ratio < 1: + train_segment_num = int(len(segment_labels) * (1 - valid_ratio - test_ratio)) + valid_segment_num = int(len(segment_labels) * (1 - test_ratio)) + else: + train_segment_num = len(segment_labels) - valid_ratio - test_ratio + valid_segment_num = valid_ratio + + # 分别将各事件切分为30s, 步进1秒的片段 + for index in range(train_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + train_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(train_segment_num, valid_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + valid_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(valid_segment_num, len(segment_labels)): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + test_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second * step_second)] + + # 计算片段和事件个数 + train_num, valid_num, test_num = len(train_label), len(valid_label), len(test_label) + train_eve, valid_eve, test_eve = train_segment_num, (valid_segment_num - train_segment_num), ( + len(segment_labels) - valid_segment_num) + + # 数据集补偿 + # if train_num < 300: + # train_num = 300 - train_num + # + # if valid_num < 300: + # valid_num = 300 - valid_num + # + # if test_num < 300: + # test_num = 300 - test_num + + # 获取训练集、验证集、测试集分到背景个数 + # if 0 < valid_ratio < 1: + # train_eve2 = int(len(negative_labels) * (1 - valid_ratio - test_ratio)) + # valid_eve2 = int(len(negative_labels) * valid_ratio) + # else: + # train_eve2 = len(negative_labels) - valid_ratio - test_ratio + # valid_eve2 = valid_ratio + # + # test_eve2 = len(negative_labels) - train_eve2 - valid_eve2 + # 直接补充到足够个数的背景事件 + train_eve2 = max(train_eve, 300) + valid_eve2 = max(valid_eve, 40) + test_eve2 = max(test_eve, 40) + + # 强制背景数量 + # train_eve2 = train_eve + # valid_eve2 = valid_eve + # test_eve2 = test_eve + + # 添加背景事件数量 + for sampNo, 
index, label_type, new_label, normal_SP, normal_EP in negative_labels[:train_eve2]: + label_type = int(label_type) + train_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2: train_eve2 + valid_eve2]: + label_type = int(label_type) + valid_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2 + valid_eve2:train_eve2 + valid_eve2 + test_eve2]: + label_type = int(label_type) + test_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(normal_SP, normal_EP, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(len(train_label)).center(10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(len(valid_label)).center(10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(train_eve).center(10) + ',' + str(valid_eve).center(10) + ',' + str(test_eve).center(10) + ) + + df2.loc[No] = [sampNo.center(8), + str(len(train_label)), + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0), + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0), + str(len(valid_label)), + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0), + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(train_eve), str(valid_eve), str(test_eve).center(10)] + + def label_check(label_list): + temp_list = [] + for sampNo, index, label_type, new_label, SP, EP in label_list: + if EP - SP < window_second: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if SP < 0: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, SP, EP, len(select_numpy[SP * 100:EP * 100])) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + for j in temp_list: + label_list.remove(j) + + label_check(train_label) + label_check(valid_label) + label_check(test_label) + for sampNo, index, label_type, new_label, SP, EP in train_label: + if EP - SP < window_second: + print(sampNo, index, label_type, new_label, SP, EP) + + if SP < 0: + print(sampNo, index, label_type, new_label, SP, EP) + + if len(select_numpy[SP * 100:EP * 100]) != 
window_second * 100: + print(sampNo, index, label_type, new_label, SP, EP, len(select_numpy[SP * 100:EP * 100])) + + df1 = pd.DataFrame(data=train_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.csv", + index=False) + + df1 = pd.DataFrame(data=valid_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.csv", + index=False) + + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.csv", index=False) + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.npy", + # np.array(train_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.npy", + # np.array(valid_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.npy", + # np.array(test_label)) + + +if __name__ == '__main__': + # pool = multiprocessing.Pool(processes=44) + # pool.map(generate_label, list(all_numpy_dataset)) + # pool.close() + # pool.join() + + df2 = pd.DataFrame(data=None, + columns=["sampNo", + "train_num", "train_P", "train_N", + "valid_num", "valid_P", "valid_N", + "test_num", "test_P", "test_N", + "train_eve", "valid_eve", "test_eve"]) + + temp = [] + for one_dataset in all_numpy_dataset: + if int(one_dataset.stem.split("samp")[0]) in [*select_dataset]: + temp.append(one_dataset) + # for one_dataset in temp: + # all_numpy_dataset.remove(one_dataset) + + for No, one_dataset in enumerate(temp): + generate_label(No, one_dataset) + + df2.to_csv(dataset_save_path / (realtime + ".csv"), index=False) + # generate_label(all_numpy_dataset[0]) diff --git a/exam/021/load_dataset.py b/exam/021/load_dataset.py new file mode 100644 index 0000000..99806f8 --- /dev/null +++ b/exam/021/load_dataset.py @@ -0,0 +1,178 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:load_dataset.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2021/12/03 +""" +import sys +from pathlib import Path +import pandas as pd +import numpy as np +import torch.utils.data +from torch.utils.data import Dataset +from tqdm import tqdm +from utils.Preprocessing import BCG_Operation + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 +""" +1. 
读取方法 +# 无论是否提前切分,均提前转成npy格式 +# 1.1 提前预处理,切分好后生成npy,直接载入切分好的片段 内存占用多 读取简单 +使用此方法: 1.2 提前预处理,载入整夜数据,切分好后生成csv或xls,根据片段读取 内存占用少 读取较为复杂 +""" + +datasets = {} + + +# 减少重复读取 +def read_dataset(data_path, augment=None): + data_path = Path(data_path) + try: + f = [] + if data_path.is_dir(): + dataset_list = list(data_path.rglob("*.npy")) + dataset_list.sort() + f += dataset_list + elif data_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{data_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {data_path}: {e} \n') + + print("loading dataset") + for i in tqdm(f): + select_dataset = np.load(i) + select_dataset = preprocessing.Butterworth(select_dataset, "lowpass", low_cut=20, order=3) + if augment is not None: + select_dataset = augment(select_dataset) + datasets[i.name.split("samp")[0]] = select_dataset + + +# 用第二种方法读取 +class ApneaDataset(Dataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + self.data_path = data_path + self.label_path = label_path + self.segment_augment = segment_augment + self.labels = None + self.dataset_type = dataset_type + self.select_sampNo = select_sampno + + # self._getAllData() + self._getAllLabels() + + def __getitem__(self, index): + # PN patience number + # SP/EP start point, end point + # temp_label.append([sampNo, label[-1], i, hpy_num, csa_num, osa_num, mean_low, flow_low]) + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1)) + else: + raise Exception(f'dataset read failure!') + + def count_SA(self): + return sum(self.labels[:, 3] > 1) + + def __len__(self): + return len(self.labels) + + def _getAllLabels(self): + label_path = Path(self.label_path) + if not label_path.exists(): + raise Exception(f'{self.label_path} does not exist') + + try: + f = [] + if label_path.is_dir(): + if self.dataset_type == "train": + label_list = list(label_path.rglob("*_train_label.csv")) + elif self.dataset_type == "valid": + label_list = list(label_path.rglob("*_valid_label.csv")) + elif self.dataset_type == "test": + label_list = list(label_path.glob("*_sa_test_label.csv")) + # label_list = list(label_path.rglob("*_test_label.npy")) + elif self.dataset_type == "all_test": + label_list = list(label_path.rglob("*_sa_all_label.csv")) + else: + raise ValueError("self.dataset type error") + # label_list = list(label_path.rglob("*_label.npy")) + label_list.sort() + f += label_list + elif label_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{self.label_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {self.label_path}: {e} \n') + print("loading labels") + for i in tqdm(f): + if int(i.name.split("_")[0]) not in self.select_sampNo: + continue + + if self.labels is None: + self.labels = pd.read_csv(i).to_numpy(dtype=int) + else: + labels = pd.read_csv(i).to_numpy(dtype=int) + if len(labels) > 0: + self.labels = np.concatenate((self.labels, labels)) + # self.labels = self.labels[:10000] + print(f"{self.dataset_type} length is {len(self.labels)}") + + +class TestApneaDataset(ApneaDataset): + def __init__(self, data_path, label_path, dataset_type, select_sampno, segment_augment=None): + 
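+        # Same construction as ApneaDataset; only __getitem__ differs: this variant indexes
+        # the cached recording directly and returns (segment, binary label, PN, segmentNo,
+        # SP, EP), while TestApneaDataset2 below passes (dataset, SP, EP) to segment_augment
+        # and additionally returns label_type and new_label (test_save_result.py uses the
+        # latter).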
super(TestApneaDataset, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + select_sampno=select_sampno, + segment_augment=segment_augment + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + segment = datasets[str(PN)][int(SP) * 100:int(EP) * 100].copy() + if self.segment_augment is not None: + segment = self.segment_augment(segment) + return segment, int(float(label_type) > 1), PN, segmentNo, SP, EP + else: + raise Exception(f'dataset read failure!') + + +class TestApneaDataset2(ApneaDataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + super(TestApneaDataset2, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + segment_augment=segment_augment, + select_sampno=select_sampno + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1), PN, segmentNo, label_type, new_label, SP, EP) + else: + raise Exception(f'dataset read failure!') + + +if __name__ == '__main__': + pass diff --git a/exam/021/main.py b/exam/021/main.py new file mode 100644 index 0000000..2a2b950 --- /dev/null +++ b/exam/021/main.py @@ -0,0 +1,288 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@time:2021/10/15 +""" +import os + +import yaml +import logging +from pathlib import Path + +import time +from torch.nn import functional as F +from torch.utils.data import DataLoader +import torch.cuda +from tqdm import tqdm +from torchinfo import summary +from load_dataset import ApneaDataset, read_dataset + +from torch import nn +from utils.calc_metrics import CALC_METRICS +from sklearn.model_selection import KFold +from model.Hybrid_Net002 import HYBRIDNET002 +# from utils.LossFunction import Foca1lLoss +from my_augment import my_augment, my_segment_augment +# 加载配置 +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) + +os.environ["CUDA_VISIBLE_DEVICES"] = hyp["GPU"] +os.environ["WANDB_MODE"] = "dryrun" + +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) + +# 读取地址参数 +data_path = hyp["Path"]["dataset"] +label_path = hyp["Path"]["label"] + +save_dir = Path(hyp["Path"]["save"]) / (Path(hyp["Path"]["save"]).name + "_" + realtime) +save_dir.mkdir(parents=True, exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +fh = logging.FileHandler(save_dir / (realtime + ".log"), mode='a') +fh.setLevel(logging.NOTSET) +fh.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) +logging.getLogger('matplotlib.font_manager').disabled = True +logger.info("------------------------------------") +logger.info('hyper_parameters: ' + ', '.join(f'{k}={v}\n' for k, v in hyp.items())) + +# 备份配置 +with open(save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + +# Hyper-parameters +gpu = torch.cuda.is_available() +epochs = hyp["epoch"] +lr = hyp["lr"] +nc = hyp["nc"] 
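+# nc comes from settings.yaml (1 here) and is forwarded to CALC_METRICS below, whose
+# torchmetrics objects are created with multiclass=False for the binary apnea/normal labels.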
+bs = hyp["batch_size"] +worker = hyp["number_worker"] +select_sampno = hyp["select_sampno"] + +read_dataset(data_path, augment=my_augment) +calc_metrics = CALC_METRICS(nc) + + +# 开始训练 +# 训练 +def model_train(model, train_loader, optimizer, scheduler, loss_func, training_state): + model.train() + train_loss = 0.0 + optimizer.zero_grad() + + pbar = tqdm(enumerate(train_loader), total=len(train_loader), ncols=80) + pbar.set_description(training_state) + for i, (resp, stft, labels) in pbar: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + # 强行归一化数据 + # segments = F.normalize(segments) + # print(segments.size()) + # 减去平均值 + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # 一维卷积在最后一位上卷积 所以输入CNN应为【batch_size, embedding size, sequence size】 + # 所以输入为【batch_size, 1, 3000】 3000 = 30秒 * 100Hz + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + + loss = loss_func(out, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + # 余弦退火传入变量 + # scheduler.step(epoch + i / len(train_loader.dataset)) + # 自适应调整传入变量 + scheduler.step(loss) + + loss_value = loss.item() + train_loss += loss_value + # cur_lr = optimizer.param_groups[-1]['lr'] + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + # if i % 20 == 0: + # pbar.write(calc_metrics.get_matrix(loss=loss_value, cur_lr=cur_lr, epoch=epoch)) + + cur_lr = optimizer.param_groups[-1]['lr'] + train_loss /= len(train_loader) + + calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") + logger.info(training_state) + logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) + calc_metrics.reset() + + +def model_valid(model, valid_loader, wdir, loss_func): + model.eval() + valid_loss = 0.0 + for resp, stft, labels in valid_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + valid_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + valid_loss /= len(valid_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=valid_loss, epoch=epoch, epoch_type="valid")) + global best_f1 + valid_f1 = calc_metrics.metrics[-1].compute() + if valid_f1 > best_f1: + best_f1 = valid_f1 + torch.save(model.state_dict(), wdir / f"best_{epoch}_{str(round(float(valid_f1), 3))}.pt") + torch.save(model.state_dict(), wdir / f"best.pt") + if wandb is not None: + wandb.run.summary["best_f1"] = valid_f1 + calc_metrics.reset() + + +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, stft, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + 
with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + +if __name__ == '__main__': + + try: + import wandb + except ImportError: + wandb = None + prefix = 'wandb: ' + logger.info(f"{prefix}Install Weights & Biases logger with 'pip install wandb'") + + if wandb is not None and wandb.run is None: + wandb_run = wandb.init( + config=hyp, + name=save_dir.stem, + project=hyp["project"], + notes=hyp["Note"], + tags=hyp["tags"], + entity=hyp["entity"], + ) + exam_name = Path("./").absolute().name + + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + summary(model_net, [(1, 300, 1), (1, 1, 121, 26)]) + + time.sleep(3) + if gpu: + model_net.cuda() + + k_folds = 5 + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) + + logger.info('--------------------------------') + for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): + logger.info(f'Start FOLD {fold} / {k_folds}----------------------') + train_set = [select_sampno[i] for i in train_ids] + test_set = [select_sampno[i] for i in test_ids] + logger.info(f'Train_Set:{train_set}') + logger.info(f'Independent_Test_Set:{test_set}') + + sub_save_dir = save_dir / f"KFold_{fold}" + sub_save_dir.mkdir(exist_ok=True, parents=True) + wdir = sub_save_dir / "weights" + wdir.mkdir(exist_ok=True, parents=True) + + hyp["train_set"] = train_set + hyp["test_set"] = test_set + with open(sub_save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + + train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) + valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) + + train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) + valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + + # 重新初始化模型 + del model_net + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + if gpu: + model_net.cuda() + + logger.info(f"Weight is {[train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]}") + # 损失函数与优化器 + loss_function = nn.CrossEntropyLoss( + weight=torch.Tensor([train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]).cuda()) + + # loss_func = nn.BCEWithLogitsLoss() + # loss_func = FocalLoss(class_num=nc, alpha=0.75, size_average="sum") + + # momentum + # nesterov 牛顿动量 + # weight_decay L2正则 + # optimizer = torch.optim.SGD(model_net.parameters(), lr=lr, momentum=0.9, nesterov=True, weight_decay=1e-6) + + optimizer = torch.optim.Adam(model_net.parameters(), lr=lr) + # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=int(hyp["T_max"]), + + scheduler = 
torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, + patience=2836, min_lr=1e-8, + verbose=True) + + # 参数记录 + best_f1 = 0 + + for epoch in range(epochs): + model_train(model_net, train_loader, optimizer, scheduler, loss_function, + f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") + model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) + if wandb is not None: + calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/021/model/Hybrid_Net002.py b/exam/021/model/Hybrid_Net002.py new file mode 100644 index 0000000..55040ac --- /dev/null +++ b/exam/021/model/Hybrid_Net002.py @@ -0,0 +1,103 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:andrew +@file:Hybrid_Net001.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/30 +""" + +import os + +import torch +from torch import nn +from torchinfo import summary +from torch import cat + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" + +# 修改激活函数 +# 提高呼吸采样率 + +# 输入时长 +WHOLE_SEGMENT_SECOND = 30 + +# 呼吸采样率 +RESPIRATORY_FRE = 10 + +# BCG 时频图大小 +BCG_GRAPH_SIZE = (26, 121) + + +class HYBRIDNET002(nn.Module): + def __init__(self, num_classes=2, init_weights=True): + super(HYBRIDNET002, self).__init__() + + self.lstm = nn.LSTM(input_size=5, + hidden_size=16, + num_layers=1, + bidirectional=True, + batch_first=True) + + self.right = nn.Sequential( + nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(3, 3), + stride=(1, 1), padding=(1, 1)), + nn.GELU(), + + nn.MaxPool2d(kernel_size=3, stride=(2, 2), padding=1), + nn.BatchNorm2d(16), + + nn.Conv2d(in_channels=16, out_channels=32, kernel_size=(3, 3), + stride=(1, 1), padding=(1, 1)), + nn.GELU(), + nn.MaxPool2d(kernel_size=3, stride=(2, 2), padding=1), + nn.BatchNorm2d(32), + + nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3, 3), + stride=(1, 1), padding=(1, 1)), + nn.GELU(), + nn.MaxPool2d(kernel_size=3, stride=(2, 2), padding=1), + nn.BatchNorm2d(32) + + + + ) + + self.classifier = nn.Sequential( + # nn.Dropout(p=0.5), + nn.Linear(3968, 512), + nn.GELU(), + nn.Linear(512, num_classes), + ) + + if init_weights: + self.initialize_weights() + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') # 何教授方法 + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) # 正态分布赋值 + nn.init.constant_(m.bias, 0) + + def forward(self, x1, x2): + x1 = x1.view(x1.shape[0], -1, 5) + x1, (_, _) = self.lstm(x1) + x2 = self.right(x2) + # print(x1.shape) + # print(x2.shape) + x1 = torch.flatten(x1, start_dim=1) + x2 = torch.flatten(x2, start_dim=1) + x = torch.cat((x1, x2), dim=1) + # print(x.shape) + x = self.classifier(x) + return x + + +if __name__ == '__main__': + model = HYBRIDNET002().cuda() + summary(model, [(32, 30 * RESPIRATORY_FRE, 1), (32, 1, 121, 26)]) diff --git a/exam/021/my_augment.py b/exam/021/my_augment.py new file mode 100644 index 0000000..c4cf524 --- /dev/null +++ b/exam/021/my_augment.py @@ -0,0 +1,68 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:my_augment.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/07/26 +""" +from utils.Preprocessing import BCG_Operation +import numpy as np +from scipy.signal import stft + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 + + +def 
my_augment(dataset): + if 1000 < dataset.mean() < 2000: + dataset[dataset == 0] = 1825 + dataset[dataset < 1200] = 1225 + dataset[dataset > 2400] = 2425 + elif 2000 < dataset.mean() < 3000: + dataset[dataset == 0] = 2375 + dataset[dataset < 1775] = 1775 + dataset[dataset > 2975] = 2975 + dataset -= 550 + else: + print("something wrong") + + # select_dataset = preprocessing.Butterworth(select_dataset, "lowpass", low_cut=0.7, order=3) + + dataset -= dataset.mean() + dataset = dataset.dot(1 / 600) + + dataset = preprocessing.Iirnotch(dataset) + dataset = preprocessing.Butterworth(dataset, "lowpass", low_cut=20, order=6) + dataset_low = preprocessing.Butterworth(dataset, "lowpass", low_cut=0.7, order=6) + dataset_high = preprocessing.Butterworth(dataset, "highpass", high_cut=1, order=6) + + dataset = {"low": dataset_low, + "high": dataset_high} + return dataset + + +def get_stft(x, fs, n): + print(len(x)) + f, t, amp = stft(x, fs, nperseg=n) + z = np.abs(amp.copy()) + return f, t, z + + +def my_segment_augment(dataset, SP, EP): + dataset_low = dataset["low"][int(SP) * 100:int(EP) * 100].copy() + dataset_high = dataset["high"][int(SP) * 100:int(EP) * 100].copy() + + dataset_low = dataset_low[::10] + dataset_low = dataset_low.reshape(dataset_low.shape[0], 1) + + _, _, dataset_high = stft(dataset_high, 100, nperseg=50) + dataset_high = dataset_high.astype(np.float).T + dataset_high = dataset_high.reshape(1, dataset_high.shape[0], dataset_high.shape[1]) + + return dataset_low, dataset_high + + +if __name__ == '__main__': + pass diff --git a/exam/021/settings.yaml b/exam/021/settings.yaml new file mode 100644 index 0000000..b51088d --- /dev/null +++ b/exam/021/settings.yaml @@ -0,0 +1,42 @@ +# environment config +GPU: "0" + + +# dataset config +Path: + dataset: /home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/ + label: ./dataset/ + save: ./output/ + +batch_size: 256 +number_worker: 0 +model_name: HYBRIDNET002 +select_sampno: + - 282 + - 726 + - 229 + - 582 + - 286 + - 966 + - 1000 + - 1004 + - 1006 + - 1009 + +# train hyperparameters config +epoch: 50 +lr: 0.00001 +nc: 1 + +# wandb config +entity: "marques" +project: "Sleep_Apnea_HYBRID00X" +Note: "HYBRID002 NOBD STFT RESP DUAL" +tags: ["ReduceLROnPlateau", "RESP LSTM", "STFT CNN"] + + +# "CW":class_weight + + +# "CosineAnnealingLR" +# "ReduceLROnPlateau" \ No newline at end of file diff --git a/exam/021/test_save_result.py b/exam/021/test_save_result.py new file mode 100644 index 0000000..63605d3 --- /dev/null +++ b/exam/021/test_save_result.py @@ -0,0 +1,478 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:test_analysis.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/21 +""" +import logging +import os +import sys + +import pandas as pd +import torch.cuda +import numpy as np +import yaml +from matplotlib import pyplot as plt +from tqdm import tqdm +from pathlib import Path +from torch.nn import functional as F +from torch.utils.data import DataLoader +from load_dataset import TestApneaDataset2, read_dataset +from utils.Draw_ConfusionMatrix import draw_confusionMatrix +from torch import nn +from utils.calc_metrics import CALC_METRICS +from my_augment import my_augment, my_segment_augment +from model.Hybrid_Net002 import HYBRIDNET002 + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +exam_path = Path("./output/") + +# 置信率阈值 +thresh = 0.5 +# 间隔最小距离 +thresh_event_interval = 2 +# 最小事件长度 +thresh_event_length = 8 + +# +event_thresh = 1 + +severity_path = 
Path(r"/home/marques/code/marques/apnea/dataset/loc_first_csa.xlsx") +severity_label = {"all": "none"} +severity_df = pd.read_excel(severity_path) +for one_data in severity_df.index: + one_data = severity_df.loc[one_data] + severity_label[str(one_data["数据编号"])] = one_data["程度"] + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" +gpu = torch.cuda.is_available() + +num_classes = 1 +calc_metrics = CALC_METRICS(num_classes) + +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps +data_path = hyp["Path"]["dataset"] +read_dataset(data_path, augment=my_augment) +del hyp + +# 默认取最新的文件夹 +all_output_path, output_path, segments_results_save_path, events_results_save_path, = [None, ] * 4 +my_augment, model_path, label_path, data_path, model, model_name = [None, ] * 6 +train_set, test_set = None, None +loss_func = nn.CrossEntropyLoss() + +columns = ["sampNo", "segmentNo", "label_type", "new_label", "SP", "EP", "pred"] +columns2 = ["sampNo", "severity", "origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", + "acc", "recall", "spec", "pre", "NPV", "F1score", "support"] + +logging.getLogger('matplotlib.font_manager').disabled = True +logging.getLogger('matplotlib.ticker').disabled = True +logger = logging.getLogger() +logger.setLevel(logging.INFO) +ch = logging.StreamHandler() +ch.setLevel(logging.INFO) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) + +if (exam_path / "test.log").exists(): + (exam_path / "test.log").unlink() +fh = logging.FileHandler(exam_path / "test.log", mode='a') +fh.setLevel(logging.INFO) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) +logger.info("------------------------------------") + + +def set_environment(i): + global output_path, segments_results_save_path, events_results_save_path, model_path, label_path, data_path, \ + model, model_name, train_set, test_set + + output_path = all_output_path[i] + logger.info(output_path) + segments_results_save_path = (output_path / "segments_results") + segments_results_save_path.mkdir(exist_ok=True) + events_results_save_path = (output_path / "events_results") + events_results_save_path.mkdir(exist_ok=True) + + # 加载配置 + with open(output_path / "settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps + data_path = hyp["Path"]["dataset"] + label_path = hyp["Path"]["label"] + train_set = hyp["train_set"] + test_set = hyp["test_set"] + + model_path = output_path / "weights" / "best.pt" + model = eval(hyp["model_name"])() + model_name = hyp["model_name"] + model.load_state_dict(torch.load(model_path)) + model.cuda() + model.eval() + + +def test_and_analysis_and_visual(dataset_type): + if dataset_type == "test": + sampNo = train_set + elif dataset_type == "all_test": + sampNo = test_set + else: + sampNo = None + logger.info("出错了") + + exam_name = Path("./").absolute().name + + test_dataset = TestApneaDataset2(data_path, label_path, select_sampno=sampNo, dataset_type=dataset_type, + segment_augment=my_segment_augment) + test_loader = DataLoader(test_dataset, batch_size=128, pin_memory=True, num_workers=0) + + test_loss = 0.0 + + df_segment = pd.DataFrame(columns=columns) + + for one in tqdm(test_loader, total=len(test_loader)): + resp, stft, labels = one[:3] + other_info = one[3:] + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + out = model(resp, stft) + + loss = loss_func(out, 
labels) + + test_loss += loss.item() + + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + + calc_metrics.update(out.cpu(), labels.cpu()) + # one[0] = list(one[0].cpu().numpy()) + # one[1] = list(one[1].cpu().numpy()) + # one = one[1:] + # out = out.view(1, -1).cpu().numpy().tolist() + # one += out + # result_record += [i for i in list(np.array(one, dtype=object).transpose(1, 0))] + + one2 = np.array([i.cpu().numpy() for i in (other_info + [out.squeeze()])]) + one2 = one2.transpose((1, 0)) + df = pd.DataFrame(data=one2, columns=columns) + df_segment = df_segment.append(df, ignore_index=True) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(f"EXAM_NAME: {exam_name} SampNO: {sampNo}") + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=0, epoch_type="test")) + calc_metrics.reset() + + df_segment["thresh_label"] = 1 * (df_segment["label_type"] > event_thresh).copy() + df_segment["thresh_Pred"] = 1 * (df_segment["pred"] > thresh).copy() + df_segment["pred"] = df_segment["pred"].copy().apply(lambda x: round(x, 3)) + + # 片段级分析 + df_segment_metrics = analysis_results(df_segment, segments_results_save_path, dataset_type) + + # 绘制混淆矩阵 + # 每个样本都绘制一份 + confusionMatrix(df_segment_metrics, segments_results_save_path, dataset_type) + # 绘制柱状图 + + # 事件级分析 + # 对于inner_test 每个编号就是一个事件 + # 而对于整晚的independence_test,需要另行计算 + df_all_event = segment_to_event(df_segment, dataset_type) + df_event_metrics = analysis_results(df_all_event, events_results_save_path, dataset_type, is_event=True) + confusionMatrix(df_event_metrics, events_results_save_path, dataset_type) + + # 剔除质量不好的样本 + df_bad_segment = df_segment[ + (df_segment["label_type"].isin([2, 3])) & (df_segment["new_label"] == 2)] + df_select_segment = df_segment.drop(df_bad_segment.index) + df_select_segment_metrics = analysis_results(df_select_segment, segments_results_save_path / "remove_2", + dataset_type) + df_select_event = segment_to_event(df_select_segment, dataset_type) + df_event_metrics = analysis_results(df_select_event, events_results_save_path / "remove_2", dataset_type, + is_event=True) + + +def analysis_results(df_result, base_path, dataset_type, is_event=False): + if df_result.empty: + logger.info(base_path, dataset_type, "is_empty") + return None + + (base_path / dataset_type).mkdir(exist_ok=True, parents=True) + + all_sampNo = df_result["sampNo"].unique() + df_metrics = pd.DataFrame(columns=columns2) + df_metrics.loc[0] = 0 + df_metrics.loc[0]["sampNo"] = dataset_type + + for index, sampNo in enumerate(all_sampNo): + df = df_result[df_result["sampNo"] == sampNo] + df.to_csv( + base_path / dataset_type / + f"{int(sampNo)}_{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_result.csv", + index=False) + + df_metrics.loc[index + 1] = np.NAN + df_metrics.loc[index + 1]["sampNo"] = str(int(sampNo)) + df_metrics.loc[index + 1]["support"] = df.shape[0] + df_metrics.loc[index + 1]["severity"] = severity_label[str(int(sampNo))] + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # continue + # else: + df_metrics.loc[index + 1]["origin_P"] = df[df["thresh_label"] == 1].shape[0] + df_metrics.loc[index + 1]["origin_N"] = df[df["thresh_label"] == 0].shape[0] + df_metrics.loc[index + 1]["pred_P"] = df[df["thresh_Pred"] == 1].shape[0] + df_metrics.loc[index + 1]["pred_N"] = df[df["thresh_Pred"] == 0].shape[0] + df_metrics.loc[index + 1]["T"] = df[df["thresh_Pred"] == df["thresh_label"]].shape[0] + 
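+        # T/F/TP/FP/TN/FN count agreement between the binarised prediction (pred > thresh)
+        # and the binarised label (label_type > event_thresh); row 0 of df_metrics
+        # accumulates these totals across all sampNo before its summary metrics are
+        # computed after the loop.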
df_metrics.loc[index + 1]["F"] = df[df["thresh_Pred"] != df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["TP"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["FP"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["TN"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + df_metrics.loc[index + 1]["FN"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + + df_metrics.loc[0]["origin_P"] += df_metrics.loc[index + 1]["origin_P"] + df_metrics.loc[0]["origin_N"] += df_metrics.loc[index + 1]["origin_N"] + df_metrics.loc[0]["pred_P"] += df_metrics.loc[index + 1]["pred_P"] + df_metrics.loc[0]["pred_N"] += df_metrics.loc[index + 1]["pred_N"] + df_metrics.loc[0]["T"] += df_metrics.loc[index + 1]["T"] + df_metrics.loc[0]["F"] += df_metrics.loc[index + 1]["F"] + df_metrics.loc[0]["TP"] += df_metrics.loc[index + 1]["TP"] + df_metrics.loc[0]["FP"] += df_metrics.loc[index + 1]["FP"] + df_metrics.loc[0]["TN"] += df_metrics.loc[index + 1]["TN"] + df_metrics.loc[0]["FN"] += df_metrics.loc[index + 1]["FN"] + df_metrics.loc[0]["support"] += df_metrics.loc[index + 1]["support"] + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[index + 1][col] = df_metrics.loc[index + 1][col] if df_metrics.loc[index + 1][ + col] != 0 else np.NAN + + df_metrics.loc[index + 1]["acc"] = df_metrics.iloc[index + 1]["T"] / df_metrics.iloc[index + 1]["support"] + df_metrics.loc[index + 1]["recall"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["origin_P"] + df_metrics.loc[index + 1]["spec"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["origin_N"] + df_metrics.loc[index + 1]["pre"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["pred_P"] + df_metrics.loc[index + 1]["NPV"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["pred_N"] + df_metrics.loc[index + 1]["F1score"] = 2 * df_metrics.iloc[index + 1]["recall"] * df_metrics.iloc[index + 1][ + "pre"] / (df_metrics.iloc[index + 1]["recall"] + df_metrics.iloc[index + 1]["pre"]) + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[index + 1][col] = 0 if pd.isna(df_metrics.loc[index + 1][col]) else \ + df_metrics.loc[index + 1][col] + df_metrics.loc[index + 1][col] = round(df_metrics.loc[index + 1][col], 3) + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # return None + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[0][col] = df_metrics.loc[0][col] if df_metrics.loc[0][col] != 0 else np.NAN + + df_metrics.loc[0]["acc"] = df_metrics.iloc[0]["T"] / df_metrics.iloc[0]["support"] + df_metrics.loc[0]["recall"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["origin_P"] + df_metrics.loc[0]["spec"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["origin_N"] + df_metrics.loc[0]["pre"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["pred_P"] + df_metrics.loc[0]["NPV"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["pred_N"] + df_metrics.loc[0]["F1score"] = 2 * df_metrics.iloc[0]["recall"] * df_metrics.iloc[0]["pre"] / ( + df_metrics.iloc[0]["recall"] + df_metrics.iloc[0]["pre"]) + for col in ["TP", "TN", "FP", "FN", "acc", "recall", "spec", "pre", 
"NPV", "F1score"]: + df_metrics.loc[0][col] = 0 if pd.isna(df_metrics.loc[0][col]) else df_metrics.loc[0][col] + df_metrics.loc[0][col] = round(df_metrics.loc[0][col], 3) + + # 在inner_test中根据 分严重程度绘制 + if dataset_type == "test": + all_severity = ["正常", "轻度", "中度", "重度"] + for index, severity in enumerate(all_severity): + df_event = df_metrics[df_metrics["severity"] == severity] + df_temp = pd.DataFrame(columns=columns2) + df_temp.loc[0] = 0 + df_temp.loc[0]["sampNo"] = severity + df_temp.loc[0]["severity"] = str(index + 1) + + df_temp.loc[0]["origin_P"] += df_event["origin_P"].sum() + df_temp.loc[0]["origin_N"] += df_event["origin_N"].sum() + df_temp.loc[0]["pred_P"] += df_event["pred_P"].sum() + df_temp.loc[0]["pred_N"] += df_event["pred_N"].sum() + df_temp.loc[0]["T"] += df_event["T"].sum() + df_temp.loc[0]["F"] += df_event["F"].sum() + df_temp.loc[0]["TP"] += df_event["TP"].sum() + df_temp.loc[0]["FP"] += df_event["FP"].sum() + df_temp.loc[0]["TN"] += df_event["TN"].sum() + df_temp.loc[0]["FN"] += df_event["FN"].sum() + df_temp.loc[0]["support"] += df_event["support"].sum() + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_temp.loc[0][col] = df_temp.loc[0][col] if df_temp.loc[0][col] != 0 else np.NAN + + df_temp.loc[0]["acc"] = df_temp.iloc[0]["T"] / df_temp.iloc[0]["support"] + df_temp.loc[0]["recall"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["origin_P"] + df_temp.loc[0]["spec"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["origin_N"] + df_temp.loc[0]["pre"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["pred_P"] + df_temp.loc[0]["NPV"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["pred_N"] + df_temp.loc[0]["F1score"] = 2 * df_temp.iloc[0]["recall"] * df_temp.iloc[0]["pre"] / ( + df_temp.iloc[0]["recall"] + df_temp.iloc[0]["pre"]) + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_temp.loc[0][col] = 0 if pd.isna(df_temp.loc[0][col]) else df_temp.loc[0][col] + df_temp.loc[0][col] = round(df_temp.loc[0][col], 3) + + df_metrics = df_metrics.append(df_temp, ignore_index=True) + + df_backup = df_metrics + df_metrics = df_metrics.astype("str") + df_metrics = df_metrics.sort_values("severity") + df_metrics.to_csv(base_path / dataset_type / + f"{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_all_metrics.csv", + index=False, encoding="gbk") + + return df_backup + + +def confusionMatrix(df_analysis, base_path, dataset_type): + if df_analysis is None: + logger.info(base_path, dataset_type, "is None") + return + + if df_analysis.empty: + logger.info(base_path, dataset_type, "is_empty") + return + classes = ["normal", "SA"] + (base_path / dataset_type / "confusionMatrix").mkdir(exist_ok=True, parents=True) + for one_samp in df_analysis.index: + one_samp = df_analysis.loc[one_samp] + cm = np.array([[one_samp["TN"], one_samp["FP"]], [one_samp["FN"], one_samp["TP"]]]) + draw_confusionMatrix(cm, classes=classes, title=str(one_samp["severity"]) + " " + one_samp["sampNo"], + save_path=base_path / dataset_type / "confusionMatrix" / f"{one_samp['sampNo']}.jpg") + + +def segment_to_event(df_segment, dataset_type): + df_all_event = pd.DataFrame(columns=columns) + all_sampNo = df_segment["sampNo"].unique() + + if dataset_type == "test": + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + 
df["thresh_Pred"] = 1 * (df["pred"] > thresh) + all_segments_no = df["segmentNo"].unique() + + for index_se, segment_No in enumerate(all_segments_no): + df_temp = df[df["segmentNo"] == segment_No].copy() + SP = df_temp.iloc[0]["EP"] + EP = df_temp.iloc[-1]["EP"] + 1 + df_event.loc[index_se] = [int(sampNo), segment_No, df_temp.iloc[0]["label_type"], + df_temp.iloc[0]["new_label"], SP, EP, 0] + + thresh_Pred = df_temp["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + for i in value_ar: + if i > thresh_event_length: + df_event.iloc[index_se]["pred"] = 1 + + # df_event.to_csv(events_results / dataset_type / f"{int(sampNo)}_event_results.csv", index=False, + # encoding="gbk") + df_all_event = df_all_event.append(df_event, ignore_index=True) + else: + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + thresh_Pred = df["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + + for value_index, value in enumerate(value_ar): + SP = index_ar[value_index] + EP = index_ar[value_index] + count_ar[value_index] + # TP, FP + if value > thresh_event_length: + + # label_type = 1 if thresh_Pred2[SP:EP].sum() > 0 else 0 + label_type = df["label_type"][SP:EP].max() + new_label = df["new_label"][SP:EP].max() + df_event = df_event.append(pd.DataFrame([[int(sampNo), SP // 30, label_type, new_label, + SP, EP, thresh_Pred2[SP]]], columns=columns), + ignore_index=True) + # if value > 30: + # logger.info([int(sampNo), SP // 30, label_type, new_label, SP, EP, thresh_Pred2[SP]]) + # 长度不够 + else: + df["thresh_Pred"][SP:EP] = 0 + + # 对负样本进行统计 + for segment_no in df["segmentNo"].unique(): + df_temp = df[df["segmentNo"] == segment_no] + if df_temp["thresh_Pred"].sum() > 0: + continue + + df_event = df_event.append(pd.DataFrame( + [[int(sampNo), segment_no, df_temp["label_type"].max(), df_temp["new_label"].max(), segment_no * 30, + (segment_no + 1) * 30, 0]], columns=columns), + ignore_index=True) + + df_all_event = df_all_event.append(df_event, ignore_index=True) + + df_temp = df_all_event.loc[:, ["label_type", "pred"]] + df_all_event["thresh_label"] = 1 * (df_temp["label_type"] > event_thresh) + df_all_event["thresh_Pred"] = 1 * (df_temp["pred"] > thresh) + return df_all_event + + +# 分sampNo保存结果,并不重合地可视化 +# inner_test + +# 分sampNo将与标签不一致的另行保存,并不重合地可视化 + +# import shap +# explainer = shap.TreeExplainer() +# shap_values = explainer.shap_values() + +if __name__ == '__main__': + 
all_output_path = list(exam_path.rglob("KFold_*")) + for exam_index, test_exam_path in enumerate(all_output_path): + # test_exam_path = exam_path / test_exam_path + set_environment(exam_index) + test_and_analysis_and_visual(dataset_type="test") + test_and_analysis_and_visual(dataset_type="all_test") diff --git a/exam/021/utils/Draw_ConfusionMatrix.py b/exam/021/utils/Draw_ConfusionMatrix.py new file mode 100644 index 0000000..591af60 --- /dev/null +++ b/exam/021/utils/Draw_ConfusionMatrix.py @@ -0,0 +1,46 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:Draw_ConfusionMatrix.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/08/10 +""" +import numpy as np +from matplotlib import pyplot as plt + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 + + +def draw_confusionMatrix(cm, classes, title, save_path, cmap=plt.cm.Blues): + fig_cm, ax = plt.subplots(figsize=(8, 8), dpi=120) + im = ax.imshow(cm, interpolation='nearest', cmap=cmap) + ax.figure.colorbar(im, ax=ax) + ax.set(xticks=np.arange(cm.shape[1]), + yticks=np.arange(cm.shape[0]), + xticklabels=classes, yticklabels=classes, + title=title, + ylabel='True label', + xlabel='Predicted label') + ax.set_ylim(len(classes) - 0.5, -0.5) + + # Rotate the tick labels and set their alignment. + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") + normalize = False + fmt = '.2f' if normalize else 'd' + thresh = cm.max() * 0.8 + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(cm[i, j], fmt), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black") + fig_cm.tight_layout() + fig_cm.savefig(save_path) + plt.close() + # + + +if __name__ == '__main__': + pass diff --git a/exam/021/utils/Preprocessing.py b/exam/021/utils/Preprocessing.py new file mode 100644 index 0000000..e3b60c4 --- /dev/null +++ b/exam/021/utils/Preprocessing.py @@ -0,0 +1,181 @@ +# encoding:utf-8 + +""" +@ date: 2020-09-16 +@ author: jingxian +@ illustration: Pre-processing +""" + + +import sys +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import pywt +from scipy import signal +from scipy import fftpack + + +def Dilate(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.min(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def Eorde(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.max(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def fin_turn(data, peak): + if len(data) == 0 or len(peak) == 0: return peak + return_peak = [] + for p in peak: + minx, maxx = max(0, p - 100), min(len(data), p + 100) + return_peak.append(minx + np.argmax(data[minx: maxx])) + return return_peak + + +class BCG_Operation(): + def __init__(self, sample_rate=1000): + self.sample_rate = sample_rate + + def down_sample(self, data=None, down_radio=10): + if data is None: + raise ValueError("data is None, please given an real value!") + data = data[:len(data) // down_radio * down_radio].reshape(-1, down_radio)[:, 0] + self.sample_rate = self.sample_rate / down_radio + return data + + def Splitwin(self, data=None, len_win=None, coverage=1.0, calculate_to_end=False): + """ + 分窗 + :param len_win: length of window + :return: signal windows + """ + if (len_win is None) or (data is None): + raise ValueError("length of window or data is 
None, please given an real value!") + else: + length = len_win * self.sample_rate # number point of a window + # step of split windows + step = length * coverage + start = 0 + Splitdata = [] + while (len(data) - start >= length): + Splitdata.append(data[int(start):int(start + length)]) + start += step + if calculate_to_end and (len(data) - start > 2000): + remain = len(data) - start + start = start - step + step = int(remain / 2000) + start = start + step * 2000 + Splitdata.append(data[int(start):int(start + length)]) + return np.array(Splitdata), step + elif calculate_to_end: + return np.array(Splitdata), 0 + else: + return np.array(Splitdata) + + def Butterworth(self, data, type, low_cut=0.0, high_cut=0.0, order=10): + """ + :param type: Type of Butter. filter, lowpass, bandpass, ... + :param lowcut: Low cutoff frequency + :param highcut: High cutoff frequency + :param order: Order of filter + :return: Signal after filtering + """ + if type == "lowpass": # 低通滤波处理 + b, a = signal.butter(order, low_cut / (self.sample_rate * 0.5), btype='lowpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "bandpass": # 带通滤波处理 + low = low_cut / (self.sample_rate * 0.5) + high = high_cut / (self.sample_rate * 0.5) + b, a = signal.butter(order, [low, high], btype='bandpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "highpass": # 高通滤波处理 + b, a = signal.butter(order, high_cut / (self.sample_rate * 0.5), btype='highpass') + return signal.filtfilt(b, a, np.array(data)) + else: # 警告,滤波器类型必须有 + raise ValueError("Please choose a type of fliter") + + def MorphologicalFilter(self, data=None, M=200, get_bre=False): + """ + :param data: Input signal + :param M: Length of structural element + :return: Signal after filter + """ + if not data.any(): + raise ValueError("The input data is None, please given real value data") + g = np.ones(M) + Data_pre = np.insert(data, 0, np.zeros(M)) + Data_pre = np.insert(Data_pre, -1, np.zeros(M)) + # Opening: 腐蚀 + 膨胀 + out1 = Eorde(Data_pre, len(Data_pre), g, M) + out2 = Dilate(out1, len(out1), g, M) + out2 = np.insert(out2, 0, np.zeros(M - 2)) + # Closing: 膨胀 + 腐蚀 + out5 = Dilate(Data_pre, len(Data_pre), g, M) + out6 = Eorde(out5, len(out5), g, M) + out6 = np.insert(out6, 0, np.zeros(M - 2)) + + baseline = (out2 + out6) / 2 + # -------------------------保留剩余价值------------------------ + data_filtered = Data_pre[:len(baseline)] - baseline + data_filtered = data_filtered[M: M + len(data)] + baseline = baseline[M:] + data_filtered[-1] = data_filtered[-2] = data_filtered[-3] + baseline[-1] = baseline[-2] = baseline[-3] + if get_bre: + return data_filtered, baseline + else: + return data_filtered + + def Iirnotch(self, data=None, cut_fre=50, quality=3): + """陷波器""" + b, a = signal.iirnotch(cut_fre / (self.sample_rate * 0.5), quality) + return signal.filtfilt(b, a, np.array(data)) + + def ChebyFilter(self, data, rp=1, type=None, low_cut=0, high_cut=0, order=10): + """ + 切比雪夫滤波器 + :param data: Input signal + :param rp: The maximum ripple allowed + :param type: 'lowpass', 'bandpass, 'highpass' + :param low_cut: Low cut-off fre + :param high_cut: High cut-off fre + :param order: The order of filter + :return: Signal after filter + """ + if type == 'lowpass': + b, a = signal.cheby1(order, rp, low_cut, btype='lowpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif type == 'bandpass': + b, a = signal.cheby1(order, rp, [low_cut, high_cut], btype='bandpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif 
type == 'highpass': + b, a = signal.cheby1(order, rp, high_cut, btype='highpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + else: + raise ValueError("The type of filter is None, please given the real value!") + + def Envelope(self, data): + """取信号包络""" + if len(data) <= 1: raise ValueError("Wrong input data") + hx = fftpack.hilbert(data) + return np.sqrt(hx ** 2, data ** 2) + + def wavelet_trans(self, data,c_level=['aaa','aad'], wavelet='db4', mode='symmetric',maxlevel=10): + wp = pywt.WaveletPacket(data=data, wavelet=wavelet, mode=mode, maxlevel=maxlevel) + new_wp = pywt.WaveletPacket(data=None, wavelet=wavelet, mode=mode) + for c in c_level : + new_wp[c] = wp[c] + return new_wp.reconstruct() + + # def em_decomposition(self, data): + # from pyhht.emd import EMD + # return EMD(data).decompose() + diff --git a/exam/021/utils/calc_metrics.py b/exam/021/utils/calc_metrics.py new file mode 100644 index 0000000..dcf78cc --- /dev/null +++ b/exam/021/utils/calc_metrics.py @@ -0,0 +1,84 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:calc_metrics.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/12 +""" +import torch +import torchmetrics + + +class CALC_METRICS: + metrics = [] + nc = 0 + + def __init__(self, nc): + self.nc = nc + self.metrics.append(torchmetrics.Accuracy(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Recall(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Precision(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Specificity(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.F1Score(average="none", num_classes=nc, multiclass=False)) + self.valid_result = self.train_result = None + + def update(self, pred, target): + for part1 in self.metrics: + part1.update(pred.cpu(), target.cpu()) + + def compute(self): + result = [] + for part1 in self.metrics: + result.append(part1.compute()) + + def reset(self): + for part1 in self.metrics: + part1.reset() + + def get_matrix(self, loss=None, cur_lr=None, epoch=None, epoch_type=None): + temp_result = [] + for j in self.metrics: + compute_result = (j.compute().cpu().numpy() * 100).tolist() + temp_result.append(compute_result) + + if epoch_type == "train": + self.train_result = [loss] + temp_result + elif epoch_type == "valid": + self.valid_result = [loss] + temp_result + else: + pass + + a = "" + a += f"{epoch_type} epoch: {str(epoch)} loss: {str(loss)} lr: {str(cur_lr)} \n" + a += " " * 8 + "Acc".center(8) + "Rec".center(8) + "Pre".center(8) + "Spe".center(8) + "F1".center(8) + "\n" + a += "all".center(8) + "".join([str(round(float(i), 2)).center(8) for i in temp_result]) + "\n" + return a + + def wandb_log(self, wandb=None, cur_lr=None): + if wandb is None: + return + + keyword = ["Accuracy", "Recall", "Precision", "Specificity", "F1Score"] + dict_key = [] + for epoch_type in ["train", "valid"]: + dict_key.append(epoch_type + "/" + "loss") + for i in keyword: + dict_key.append(epoch_type + "/" + i) + + log_dict = dict(zip(dict_key, self.train_result + self.valid_result)) + log_dict["lr"] = cur_lr + wandb.log(log_dict) + + +if __name__ == '__main__': + # pred = [[0.1], [0.2], [0.3], [0.4], [0.5], [0.6], [0.7], [0.8], [0.9], [1.0]] + # true = [[0], [0], [1], [0], [0], [0], [0], [0], [0], [1]] + pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] + true = [0, 0, 1, 0, 0, 0, 0, 0, 0, 1] + pred = 
torch.tensor(pred).cuda() + true = torch.tensor(true).cuda() + calc_metrics = CALC_METRICS(1) + calc_metrics.update(pred, true) + print(calc_metrics.get_matrix()) diff --git a/exam/022/generate_label_14.py b/exam/022/generate_label_14.py new file mode 100644 index 0000000..cc5f96d --- /dev/null +++ b/exam/022/generate_label_14.py @@ -0,0 +1,561 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:generate_label_11.0.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/05 +""" +# 14.0 +# 手动均衡数量 + +# 13.0 +# 限制选择部分数据集,先做测试 + +# 12.0 +# 置不可用事件的片段为上限,不可用片段设置为背景,不记录事件 + +# 10.0 +# 使用提出质量差的信号 + +# 9.0 +# 增加 最新的质量标签 未使用 + +# 8.0 +# 生成 除低通气所有事件标签 + +# 尝试过步进两秒 会造成不足两秒的数据被抛弃,造成较多误判,但是可以考虑囊括这部分 +# 采用 30秒数据 移动 1秒 将所有呼吸暂停标注为1 低通气为0 正常为0 + +# 预处理操作 为 50Hz陷波滤波器去工频 外加 20Hz的低通滤波器 这个20Hz要看BCG信号的频谱范围 + +# 先提剔除极端值 +# 数值大于最高基准线或最低基准线 +# type1: average:1800 low:1200 high:2400 +# type2: average:2400 low:1800 high:3000 + +# 过多片段会造成平均值偏移 +# TODO +# 加入体动标签,计算除体动外的平均值 + +# 最后降采为100hz + +import time +import logging +import numpy as np +import pandas as pd +from pathlib import Path + +from datetime import datetime + +import yaml +from pathos import multiprocessing +from tqdm import tqdm + +# 数据集 和 标签 位置 +bcg_numpy_data_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/") +bcg_label_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_label_0616/") + +# BCG 记录开始时间 +bcg_start_time = np.loadtxt(Path(r"/home/marques/code/marques/apnea/dataset/start_time.csv"), delimiter=', ', + dtype=object) +bcg_start_time = dict(zip(bcg_start_time[:, 0], bcg_start_time[:, 1])) + +# 读取每个数据集路径 +all_numpy_dataset = list(bcg_numpy_data_path.rglob("*.npy")) +all_numpy_dataset.sort() + +# 划分后的数据集保存路径 +# dataset_save_path = Path(r"/home/marques/code/marques/apnea/dataset/dataset/dataset0623_300_30_30/") +dataset_save_path = Path(r"./dataset/") +dataset_save_path.mkdir(exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) +fh = logging.FileHandler(dataset_save_path / (realtime + ".log"), mode='w') +fh.setLevel(logging.NOTSET) +# fh.setFormatter(logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(ch) + +# all_label = [] +# 输出统计数据标题栏 +# logger.info("sampNo".center(8) + +# "hpy_num".center(8) + "hpy_time".center(10) + +# "csa_num".center(8) + "csa_time".center(10) + +# "osa_num".center(8) + "osa_time".center(10) + +# "msa_num".center(8) + "msa_time".center(10) +# ) + +logger.info("sampNo".center(8) + ',' + + "train_num".center(10) + ',' + "train_P".center(10) + ',' + "train_N".center(10) + ',' + + "valid_num".center(10) + ',' + "valid_P".center(10) + ',' + "valid_N".center(10) + ',' + + "test_num".center(10) + ',' + "test_P".center(10) + ',' + "test_N".center(10) + ',' + + "train_eve".center(10) + ',' + "valid_eve".center(10) + ',' + "test_eve".center(10) + ) + +base_random_seed = 42 + +window_second = 30 +step_second = 1 + +valid_ratio = 0.1 +test_ratio = 0.1 + +normal_event_quality_label = 0 +# valid_ratio = 5000 +# test_ratio = 10000 + +assert ((valid_ratio + test_ratio) < 1 and 0 < valid_ratio < 1 and 0 < test_ratio < 1) or ( + valid_ratio > 1 and valid_ratio > 1), "验证集与测试集输入应同时为比例或数量" + +# dataset sampNo for test +with 
open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) +select_dataset = hyp["select_sampno"] + +# 需要置成0的片段,前面不一定补零,还有可能上万 +disable_segment = { + '221': [[0, 10000]], + '670': [[0, 20000]], + '683': [[0, 20000]], + '704': [[0, 26000]], + '726': [[0, 20000]], + '736': [[0, 47000]], + '933': [[0, 773560]], + '935': [[0, 26600]], + '952': [[0, 17000]], + '955': [[0, 78000]], + '961': [[0, 107000]], + '962': [[0, 15100]], + '966': [[0, 13120]], + '967': [[0, 44000]], + '1006': [[0, 60000]], + '1009': [[0, 1000]], + '1010': [[0, 49000]], + '1296': [[0, 27000]], + '1300': [[0, 33800]], + '1301': [[0, 14000]], + '1302': [[0, 5600]], + '1374': [[0, 1000]], + '1478': [[0, 998000]], + +} + + +# 生成数据集主函数 +def generate_label(No, dataset_path): + """ + :param dataset_path: 数据集路径 + :return: + """ + + # 获取数据编号 + sampNo = dataset_path.stem.split("samp")[0] + # 标签路径 + label_path = bcg_label_path / f"export{sampNo}_all.csv" + + if not label_path.exists(): + raise FileNotFoundError(f"{label_path} not exist") + + if not dataset_path.exists(): + raise Exception(f"{dataset_path} not exists") + + # 加载数据集 + select_numpy = np.load(dataset_path) + select_numpy_len = len(select_numpy) + + # 开头不合理片段置零 + if sampNo in disable_segment.keys(): + for sp, ep in disable_segment[sampNo]: + select_numpy[sp:ep] = 0 + + # 剔除质量差信号 + if sampNo == "670": + select_numpy = select_numpy[:17195 * 100] + + # 获取前面补了多少0 + not_zero_point = 0 + for num in select_numpy: + if num > 10: + break + not_zero_point += 1 + not_zero_point //= 100 + + # 读取标签 + label_csv = pd.read_csv(label_path, encoding='gbk') + label_csv["new_start"] = label_csv["new_start"].astype("int") + label_csv["new_end"] = label_csv["new_end"].astype("int") + label_csv["Duration"] = label_csv["Duration"].astype("int") + label_csv["new_label"] = label_csv["new_label"].fillna("2") + label_csv["new_label"] = label_csv["new_label"].astype("int") + # 剔除质量不好的样本 + # drop_csv = label_csv[ + # (label_csv["Event type"].isin(["Central apnea", "Obstructive apnea"])) & (label_csv["new_label"] == 2)] + # label_csv = label_csv.drop(drop_csv.index) + + # 事件片段与背景片段, 每个背景长度均为设定窗长 + segment_labels = [] + negative_labels = [] + hpy_num = csa_num = osa_num = msa_num = 0 + hpy_time = csa_time = osa_time = msa_time = 0 + + # 遍历全部事件并统计 + for i in range(len(label_csv)): + # 进行LabelEncoder + label = label_csv.iloc[i, :] + # 如果事件在补零片段,则不添加到事件列表 + if label["new_end"] < not_zero_point: + continue + + if sampNo == "670" and label["new_start"] > 17195: + continue + + if label["new_end"] - label["new_start"] < 10: + continue + + # 将事件添加到事件列表 + if label["Event type"] == "Hypopnea": + label_type = 1 + hpy_num += 1 + hpy_time += label["new_end"] - label["new_start"] + + # 将低通气添加到背景 好像不用专门加入到负样本事件中? 
+ # negative_labels.append( + # [sampNo, i, label_type, normal_event_quality_label, label["new_start"], label["new_end"]]) + continue + elif label["Event type"] == "Central apnea": + label_type = 2 + csa_num += 1 + csa_time += label["new_end"] - label["new_start"] + elif label["Event type"] == "Obstructive apnea": + label_type = 3 + osa_num += 1 + osa_time += label["new_end"] - label["new_start"] + # MSA 认为是OSA + elif label["Event type"] == "Mixed apnea": + label_type = 3 + msa_num += 1 + msa_time += label["new_end"] - label["new_start"] + else: + continue + # label_type = 0 + if label["new_end"] - label["new_start"] > label["Duration"] + 20: + print(sampNo, label) + + # 格式为 样本编号 第几个事件 标签 开始事件 结束事件 + segment_labels.append([sampNo, i, label_type, label["new_label"], label["new_start"], label["new_end"]]) + + # logger.info(sampNo.center(8) + + # str(hpy_num).center(8) + str(hpy_time).center(10) + + # str(csa_num).center(8) + str(csa_time).center(10) + + # str(osa_num).center(8) + str(osa_time).center(10) + + # str(msa_num).center(8) + str(msa_time).center(10)) + + # 设置随机树种子 + random_seed = base_random_seed + int(sampNo) + + # 对于无事件的样本,直接将所有的片段 + if len(segment_labels) == 0: + # 剔除补零片段(把开始点移动到补零结束) + normal_SP = not_zero_point + # 开头至少满足一个窗长 + if normal_SP < window_second: + normal_SP = window_second + + # 结束点为样本总长 除以 采样率 + normal_EP = select_numpy_len // 100 + label_type = 0 + # 正常时间编号为秒数 除以 30,即epoch + negative_labels += [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, + SP1 + window_second] for SP1 in + range(normal_SP - window_second + step_second, normal_EP - window_second + step_second, + window_second)] + + # 对于有事件的样本 + # 遍历事件,获取事件之间的背景片段 + for index in range(len(segment_labels) + 1): + # 前一个事件的结尾 与 下一事件开头即为背景 + # 对于开头的无事件片段,则设定开始事件为0 + if index == 0: + normal_SP = 0 + # 非开头片段,开始点为上一个事件的结尾 + else: + # 加一秒 作为 缓冲 + normal_SP = segment_labels[index - 1][-1] + 1 + + # 最后一个事件则取到样本片段结尾 + if index == len(segment_labels): + normal_EP = select_numpy_len // 100 - window_second + # 否则结束事件取 本事件开头 + else: + # 减一秒 作为 缓冲 + normal_EP = segment_labels[index][-2] - 1 + + # 剔除包含开头补零的片段 + if normal_EP < not_zero_point: + continue + + # 剔除开头不足30s的正常片段 + if normal_SP < window_second: + continue + label_type = 0 + + # 将背景事件按照滑窗距离逐个加入到背景事件中 + temp_1 = [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, SP1 + window_second] + for SP1 in range(normal_SP - window_second, normal_EP - window_second, window_second)] + + negative_labels += temp_1 + + train_label, valid_label, test_label = [], [], [] + # assert (valid_ratio + test_ratio) < 1 <= len(segment_labels), f"{sampNo}训练集与测试集数量应小于总数据集数量{len(segment_labels)}" + + # 对于测试数据全部直接保存 + if int(sampNo) in select_dataset: + event_label = np.zeros(select_numpy_len // 100) + quality_label = np.zeros(select_numpy_len // 100) + # 用于存储事件标签 + for PN, segmentNo, label_type, new_label, SP, EP in segment_labels: + event_label[SP:EP] = label_type + + test_label = [] + # 剔除补零片段 + normal_SP = not_zero_point + if normal_SP < window_second: + normal_SP = window_second + normal_EP = select_numpy_len // 100 + + # 分成指定窗长的滑窗片段 + test_label += [ + [sampNo, SP1 // 30, int(event_label[SP1 + window_second - step_second]), + int(quality_label[SP1 + window_second - step_second]), + SP1, SP1 + window_second] for SP1 in range(normal_SP - window_second + step_second, + normal_EP - window_second + step_second, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + 
',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center(10) + + ',' + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + str(len(segment_labels)).center(10) + ) + + df2.loc[No] = [sampNo, + str(0), str(0), str(0), + str(0), str(0), str(0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(0), str(0), str(len(segment_labels))] + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_all_{window_second}s_sa_test2_label.npy", + # np.array(test_label)) + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_all_label.csv", + index=False) + + train_label, valid_label, test_label = [], [], [] + # 对于训练与验证集样本 + if True: + # 打乱片段顺序 + np.random.seed(random_seed) + np.random.shuffle(segment_labels) + np.random.shuffle(negative_labels) + + # 获取训练集、验证集、测试集分到事件个数 + if 0 < valid_ratio < 1: + train_segment_num = int(len(segment_labels) * (1 - valid_ratio - test_ratio)) + valid_segment_num = int(len(segment_labels) * (1 - test_ratio)) + else: + train_segment_num = len(segment_labels) - valid_ratio - test_ratio + valid_segment_num = valid_ratio + + # 分别将各事件切分为30s, 步进1秒的片段 + for index in range(train_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + train_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(train_segment_num, valid_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + valid_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(valid_segment_num, len(segment_labels)): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + test_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second * step_second)] + + # 计算片段和事件个数 + train_num, valid_num, test_num = len(train_label), len(valid_label), len(test_label) + train_eve, valid_eve, test_eve = train_segment_num, (valid_segment_num - train_segment_num), ( + len(segment_labels) - valid_segment_num) + + # 数据集补偿 + # if train_num < 300: + # train_num = 300 - train_num + # + # if valid_num < 300: + # valid_num = 300 - valid_num + # + # if test_num < 300: + # test_num = 300 - test_num + + # 获取训练集、验证集、测试集分到背景个数 + # if 0 < valid_ratio < 1: + # train_eve2 = int(len(negative_labels) * (1 - valid_ratio - test_ratio)) + # valid_eve2 = int(len(negative_labels) * valid_ratio) + # else: + # train_eve2 = len(negative_labels) - valid_ratio - test_ratio + # valid_eve2 = valid_ratio + # + # test_eve2 = len(negative_labels) - train_eve2 - valid_eve2 + # 直接补充到足够个数的背景事件 + train_eve2 = max(train_eve, 300) + valid_eve2 = max(valid_eve, 40) + test_eve2 = max(test_eve, 40) + + # 强制背景数量 + # train_eve2 = train_eve + # valid_eve2 = valid_eve + # test_eve2 = test_eve + + # 添加背景事件数量 + for sampNo, 
index, label_type, new_label, normal_SP, normal_EP in negative_labels[:train_eve2]: + label_type = int(label_type) + train_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2: train_eve2 + valid_eve2]: + label_type = int(label_type) + valid_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2 + valid_eve2:train_eve2 + valid_eve2 + test_eve2]: + label_type = int(label_type) + test_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(normal_SP, normal_EP, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(len(train_label)).center(10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(len(valid_label)).center(10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(train_eve).center(10) + ',' + str(valid_eve).center(10) + ',' + str(test_eve).center(10) + ) + + df2.loc[No] = [sampNo.center(8), + str(len(train_label)), + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0), + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0), + str(len(valid_label)), + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0), + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(train_eve), str(valid_eve), str(test_eve).center(10)] + + def label_check(label_list): + temp_list = [] + for sampNo, index, label_type, new_label, SP, EP in label_list: + if EP - SP < window_second: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if SP < 0: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, SP, EP, len(select_numpy[SP * 100:EP * 100])) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + for j in temp_list: + label_list.remove(j) + + label_check(train_label) + label_check(valid_label) + label_check(test_label) + for sampNo, index, label_type, new_label, SP, EP in train_label: + if EP - SP < window_second: + print(sampNo, index, label_type, new_label, SP, EP) + + if SP < 0: + print(sampNo, index, label_type, new_label, SP, EP) + + if len(select_numpy[SP * 100:EP * 100]) != 
window_second * 100: + print(sampNo, index, label_type, new_label, SP, EP, len(select_numpy[SP * 100:EP * 100])) + + df1 = pd.DataFrame(data=train_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.csv", + index=False) + + df1 = pd.DataFrame(data=valid_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.csv", + index=False) + + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.csv", index=False) + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.npy", + # np.array(train_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.npy", + # np.array(valid_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.npy", + # np.array(test_label)) + + +if __name__ == '__main__': + # pool = multiprocessing.Pool(processes=44) + # pool.map(generate_label, list(all_numpy_dataset)) + # pool.close() + # pool.join() + + df2 = pd.DataFrame(data=None, + columns=["sampNo", + "train_num", "train_P", "train_N", + "valid_num", "valid_P", "valid_N", + "test_num", "test_P", "test_N", + "train_eve", "valid_eve", "test_eve"]) + + temp = [] + for one_dataset in all_numpy_dataset: + if int(one_dataset.stem.split("samp")[0]) in [*select_dataset]: + temp.append(one_dataset) + # for one_dataset in temp: + # all_numpy_dataset.remove(one_dataset) + + for No, one_dataset in enumerate(temp): + generate_label(No, one_dataset) + + df2.to_csv(dataset_save_path / (realtime + ".csv"), index=False) + # generate_label(all_numpy_dataset[0]) diff --git a/exam/022/load_dataset.py b/exam/022/load_dataset.py new file mode 100644 index 0000000..99806f8 --- /dev/null +++ b/exam/022/load_dataset.py @@ -0,0 +1,178 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:load_dataset.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2021/12/03 +""" +import sys +from pathlib import Path +import pandas as pd +import numpy as np +import torch.utils.data +from torch.utils.data import Dataset +from tqdm import tqdm +from utils.Preprocessing import BCG_Operation + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 +""" +1. 
读取方法 +# 无论是否提前切分,均提前转成npy格式 +# 1.1 提前预处理,切分好后生成npy,直接载入切分好的片段 内存占用多 读取简单 +使用此方法: 1.2 提前预处理,载入整夜数据,切分好后生成csv或xls,根据片段读取 内存占用少 读取较为复杂 +""" + +datasets = {} + + +# 减少重复读取 +def read_dataset(data_path, augment=None): + data_path = Path(data_path) + try: + f = [] + if data_path.is_dir(): + dataset_list = list(data_path.rglob("*.npy")) + dataset_list.sort() + f += dataset_list + elif data_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{data_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {data_path}: {e} \n') + + print("loading dataset") + for i in tqdm(f): + select_dataset = np.load(i) + select_dataset = preprocessing.Butterworth(select_dataset, "lowpass", low_cut=20, order=3) + if augment is not None: + select_dataset = augment(select_dataset) + datasets[i.name.split("samp")[0]] = select_dataset + + +# 用第二种方法读取 +class ApneaDataset(Dataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + self.data_path = data_path + self.label_path = label_path + self.segment_augment = segment_augment + self.labels = None + self.dataset_type = dataset_type + self.select_sampNo = select_sampno + + # self._getAllData() + self._getAllLabels() + + def __getitem__(self, index): + # PN patience number + # SP/EP start point, end point + # temp_label.append([sampNo, label[-1], i, hpy_num, csa_num, osa_num, mean_low, flow_low]) + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1)) + else: + raise Exception(f'dataset read failure!') + + def count_SA(self): + return sum(self.labels[:, 3] > 1) + + def __len__(self): + return len(self.labels) + + def _getAllLabels(self): + label_path = Path(self.label_path) + if not label_path.exists(): + raise Exception(f'{self.label_path} does not exist') + + try: + f = [] + if label_path.is_dir(): + if self.dataset_type == "train": + label_list = list(label_path.rglob("*_train_label.csv")) + elif self.dataset_type == "valid": + label_list = list(label_path.rglob("*_valid_label.csv")) + elif self.dataset_type == "test": + label_list = list(label_path.glob("*_sa_test_label.csv")) + # label_list = list(label_path.rglob("*_test_label.npy")) + elif self.dataset_type == "all_test": + label_list = list(label_path.rglob("*_sa_all_label.csv")) + else: + raise ValueError("self.dataset type error") + # label_list = list(label_path.rglob("*_label.npy")) + label_list.sort() + f += label_list + elif label_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{self.label_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {self.label_path}: {e} \n') + print("loading labels") + for i in tqdm(f): + if int(i.name.split("_")[0]) not in self.select_sampNo: + continue + + if self.labels is None: + self.labels = pd.read_csv(i).to_numpy(dtype=int) + else: + labels = pd.read_csv(i).to_numpy(dtype=int) + if len(labels) > 0: + self.labels = np.concatenate((self.labels, labels)) + # self.labels = self.labels[:10000] + print(f"{self.dataset_type} length is {len(self.labels)}") + + +class TestApneaDataset(ApneaDataset): + def __init__(self, data_path, label_path, dataset_type, select_sampno, segment_augment=None): + 
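"""Thin wrapper around ApneaDataset whose items also carry per-segment
+ metadata (sampNo, segmentNo, SP, EP) so predictions can be mapped back to the
+ source recordings; test_save_result.py uses the TestApneaDataset2 variant
+ below, which additionally returns label_type and new_label.""" + 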
super(TestApneaDataset, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + select_sampno=select_sampno, + segment_augment=segment_augment + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + segment = datasets[str(PN)][int(SP) * 100:int(EP) * 100].copy() + if self.segment_augment is not None: + segment = self.segment_augment(segment) + return segment, int(float(label_type) > 1), PN, segmentNo, SP, EP + else: + raise Exception(f'dataset read failure!') + + +class TestApneaDataset2(ApneaDataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + super(TestApneaDataset2, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + segment_augment=segment_augment, + select_sampno=select_sampno + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1), PN, segmentNo, label_type, new_label, SP, EP) + else: + raise Exception(f'dataset read failure!') + + +if __name__ == '__main__': + pass diff --git a/exam/022/main.py b/exam/022/main.py new file mode 100644 index 0000000..dffa980 --- /dev/null +++ b/exam/022/main.py @@ -0,0 +1,288 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@time:2021/10/15 +""" +import os + +import yaml +import logging +from pathlib import Path + +import time +from torch.nn import functional as F +from torch.utils.data import DataLoader +import torch.cuda +from tqdm import tqdm +from torchinfo import summary +from load_dataset import ApneaDataset, read_dataset + +from torch import nn +from utils.calc_metrics import CALC_METRICS +from sklearn.model_selection import KFold +from model.Hybrid_Net003 import HYBRIDNET003 +# from utils.LossFunction import Foca1lLoss +from my_augment import my_augment, my_segment_augment +# 加载配置 +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) + +os.environ["CUDA_VISIBLE_DEVICES"] = hyp["GPU"] +os.environ["WANDB_MODE"] = "dryrun" + +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) + +# 读取地址参数 +data_path = hyp["Path"]["dataset"] +label_path = hyp["Path"]["label"] + +save_dir = Path(hyp["Path"]["save"]) / (Path(hyp["Path"]["save"]).name + "_" + realtime) +save_dir.mkdir(parents=True, exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +fh = logging.FileHandler(save_dir / (realtime + ".log"), mode='a') +fh.setLevel(logging.NOTSET) +fh.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) +logging.getLogger('matplotlib.font_manager').disabled = True +logger.info("------------------------------------") +logger.info('hyper_parameters: ' + ', '.join(f'{k}={v}\n' for k, v in hyp.items())) + +# 备份配置 +with open(save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + +# Hyper-parameters +gpu = torch.cuda.is_available() +epochs = hyp["epoch"] +lr = hyp["lr"] +nc = hyp["nc"] 
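+# The keys above/below mirror ./settings.yaml for this exam (epoch: 50,
+# lr: 0.00001, nc: 1, batch_size: 256, number_worker: 0, select_sampno: the ten
+# subject IDs used for the 5-fold cross-validation further down).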
+bs = hyp["batch_size"] +worker = hyp["number_worker"] +select_sampno = hyp["select_sampno"] + +read_dataset(data_path, augment=my_augment) +calc_metrics = CALC_METRICS(nc) + + +# 开始训练 +# 训练 +def model_train(model, train_loader, optimizer, scheduler, loss_func, training_state): + model.train() + train_loss = 0.0 + optimizer.zero_grad() + + pbar = tqdm(enumerate(train_loader), total=len(train_loader), ncols=80) + pbar.set_description(training_state) + for i, (resp, stft, labels) in pbar: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + # 强行归一化数据 + # segments = F.normalize(segments) + # print(segments.size()) + # 减去平均值 + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # 一维卷积在最后一位上卷积 所以输入CNN应为【batch_size, embedding size, sequence size】 + # 所以输入为【batch_size, 1, 3000】 3000 = 30秒 * 100Hz + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + + loss = loss_func(out, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + # 余弦退火传入变量 + # scheduler.step(epoch + i / len(train_loader.dataset)) + # 自适应调整传入变量 + scheduler.step(loss) + + loss_value = loss.item() + train_loss += loss_value + # cur_lr = optimizer.param_groups[-1]['lr'] + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + # if i % 20 == 0: + # pbar.write(calc_metrics.get_matrix(loss=loss_value, cur_lr=cur_lr, epoch=epoch)) + + cur_lr = optimizer.param_groups[-1]['lr'] + train_loss /= len(train_loader) + + calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") + logger.info(training_state) + logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) + calc_metrics.reset() + + +def model_valid(model, valid_loader, wdir, loss_func): + model.eval() + valid_loss = 0.0 + for resp, stft, labels in valid_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + valid_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + valid_loss /= len(valid_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=valid_loss, epoch=epoch, epoch_type="valid")) + global best_f1 + valid_f1 = calc_metrics.metrics[-1].compute() + if valid_f1 > best_f1: + best_f1 = valid_f1 + torch.save(model.state_dict(), wdir / f"best_{epoch}_{str(round(float(valid_f1), 3))}.pt") + torch.save(model.state_dict(), wdir / f"best.pt") + if wandb is not None: + wandb.run.summary["best_f1"] = valid_f1 + calc_metrics.reset() + + +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, stft, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + 
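# Evaluation pattern (same as model_valid above): forward pass under no_grad,
+ # softmax over the two class outputs, then keep only the positive-class
+ # probability out[:, 1] so the torchmetrics objects (created with
+ # multiclass=False) receive a single score per sample. + 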
with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + +if __name__ == '__main__': + + try: + import wandb + except ImportError: + wandb = None + prefix = 'wandb: ' + logger.info(f"{prefix}Install Weights & Biases logger with 'pip install wandb'") + + if wandb is not None and wandb.run is None: + wandb_run = wandb.init( + config=hyp, + name=save_dir.stem, + project=hyp["project"], + notes=hyp["Note"], + tags=hyp["tags"], + entity=hyp["entity"], + ) + exam_name = Path("./").absolute().name + + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + summary(model_net, [(32, 1, 300), (32, 121, 26)]) + + time.sleep(3) + if gpu: + model_net.cuda() + + k_folds = 5 + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) + + logger.info('--------------------------------') + for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): + logger.info(f'Start FOLD {fold} / {k_folds}----------------------') + train_set = [select_sampno[i] for i in train_ids] + test_set = [select_sampno[i] for i in test_ids] + logger.info(f'Train_Set:{train_set}') + logger.info(f'Independent_Test_Set:{test_set}') + + sub_save_dir = save_dir / f"KFold_{fold}" + sub_save_dir.mkdir(exist_ok=True, parents=True) + wdir = sub_save_dir / "weights" + wdir.mkdir(exist_ok=True, parents=True) + + hyp["train_set"] = train_set + hyp["test_set"] = test_set + with open(sub_save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + + train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) + valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) + + train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) + valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + + # 重新初始化模型 + del model_net + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + if gpu: + model_net.cuda() + + logger.info(f"Weight is {[train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]}") + # 损失函数与优化器 + loss_function = nn.CrossEntropyLoss( + weight=torch.Tensor([train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]).cuda()) + + # loss_func = nn.BCEWithLogitsLoss() + # loss_func = FocalLoss(class_num=nc, alpha=0.75, size_average="sum") + + # momentum + # nesterov 牛顿动量 + # weight_decay L2正则 + # optimizer = torch.optim.SGD(model_net.parameters(), lr=lr, momentum=0.9, nesterov=True, weight_decay=1e-6) + + optimizer = torch.optim.Adam(model_net.parameters(), lr=lr) + # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=int(hyp["T_max"]), + + scheduler = 
torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, + patience=2836, min_lr=1e-8, + verbose=True) + + # 参数记录 + best_f1 = 0 + + for epoch in range(epochs): + model_train(model_net, train_loader, optimizer, scheduler, loss_function, + f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") + model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) + if wandb is not None: + calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/022/model/Hybrid_Net003.py b/exam/022/model/Hybrid_Net003.py new file mode 100644 index 0000000..b28130d --- /dev/null +++ b/exam/022/model/Hybrid_Net003.py @@ -0,0 +1,101 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:andrew +@file:Hybrid_Net003.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/30 +""" + +import os + +import torch +from torch import nn +from torchinfo import summary +from torch import cat + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" + +# 修改激活函数 +# 提高呼吸采样率 + +# 输入时长 +WHOLE_SEGMENT_SECOND = 30 + +# 呼吸采样率 +RESPIRATORY_FRE = 10 + +# BCG 时频图大小 +BCG_GRAPH_SIZE = (26, 121) + + +class HYBRIDNET003(nn.Module): + def __init__(self, num_classes=2, init_weights=True): + super(HYBRIDNET003, self).__init__() + + self.lstm = nn.LSTM(input_size=26, + hidden_size=16, + num_layers=1, + bidirectional=True, + batch_first=True) + + self.right = nn.Sequential( + nn.Conv1d(in_channels=1, out_channels=8, kernel_size=20, + stride=2, padding=10), + nn.GELU(), + + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(8), + + nn.Conv1d(in_channels=8, out_channels=16, kernel_size=3, + stride=1, padding=1), + nn.GELU(), + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(16), + + nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, + stride=1, padding=1), + nn.GELU(), + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(32) + ) + + self.classifier = nn.Sequential( + # nn.Dropout(p=0.5), + nn.Linear(4480, 512), + nn.GELU(), + nn.Linear(512, num_classes), + ) + + if init_weights: + self.initialize_weights() + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') # 何教授方法 + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) # 正态分布赋值 + nn.init.constant_(m.bias, 0) + + def forward(self, x1, x2): + # x1 = x1.view(x1.shape[0], -1, 5) + x1 = self.right(x1) + x2, (_, _) = self.lstm(x2) + + # print(x1.shape) + # print(x2.shape) + x1 = torch.flatten(x1, start_dim=1) + x2 = torch.flatten(x2, start_dim=1) + x = torch.cat((x1, x2), dim=1) + # print(x.shape) + x = self.classifier(x) + return x + + +if __name__ == '__main__': + model = HYBRIDNET003().cuda() + summary(model, [(32, 1, 30 * RESPIRATORY_FRE), (32, 121, 26)]) diff --git a/exam/022/my_augment.py b/exam/022/my_augment.py new file mode 100644 index 0000000..7bb4018 --- /dev/null +++ b/exam/022/my_augment.py @@ -0,0 +1,52 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:my_augment.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/07/26 +""" +from utils.Preprocessing import BCG_Operation +import numpy as np +from scipy.signal import stft + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 + + +def my_augment(dataset): + dataset -= dataset.mean() + dataset = 
preprocessing.Iirnotch(dataset) + dataset = preprocessing.Butterworth(dataset, "lowpass", low_cut=20, order=6) + dataset_low = preprocessing.Butterworth(dataset, "lowpass", low_cut=0.7, order=6) + dataset_high = preprocessing.Butterworth(dataset, "highpass", high_cut=1, order=6) + + dataset = {"low": dataset_low, + "high": dataset_high} + return dataset + + +def get_stft(x, fs, n): + print(len(x)) + f, t, amp = stft(x, fs, nperseg=n) + z = np.abs(amp.copy()) + return f, t, z + + +def my_segment_augment(dataset, SP, EP): + dataset_low = dataset["low"][int(SP) * 100:int(EP) * 100].copy() + dataset_high = dataset["high"][int(SP) * 100:int(EP) * 100].copy() + + dataset_low = dataset_low[::10] + dataset_low = dataset_low.reshape(1, dataset_low.shape[0]) + + _, _, dataset_high = stft(dataset_high, 100, nperseg=50) + dataset_high = dataset_high.astype(np.float).T + dataset_high = dataset_high.reshape(dataset_high.shape[0], dataset_high.shape[1]) + + return dataset_low, dataset_high + + +if __name__ == '__main__': + pass diff --git a/exam/022/settings.yaml b/exam/022/settings.yaml new file mode 100644 index 0000000..871a1d1 --- /dev/null +++ b/exam/022/settings.yaml @@ -0,0 +1,42 @@ +# environment config +GPU: "0" + + +# dataset config +Path: + dataset: /home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/ + label: ./dataset/ + save: ./output/ + +batch_size: 256 +number_worker: 0 +model_name: HYBRIDNET003 +select_sampno: + - 282 + - 726 + - 229 + - 582 + - 286 + - 966 + - 1000 + - 1004 + - 1006 + - 1009 + +# train hyperparameters config +epoch: 50 +lr: 0.00001 +nc: 1 + +# wandb config +entity: "marques" +project: "Sleep_Apnea_HYBRID00X" +Note: "HYBRID003 BD STFT RESP DUAL" +tags: ["ReduceLROnPlateau", "RESP CNN", "STFT LSTM"] + + +# "CW":class_weight + + +# "CosineAnnealingLR" +# "ReduceLROnPlateau" \ No newline at end of file diff --git a/exam/022/test_save_result.py b/exam/022/test_save_result.py new file mode 100644 index 0000000..07a1d2f --- /dev/null +++ b/exam/022/test_save_result.py @@ -0,0 +1,478 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:test_analysis.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/21 +""" +import logging +import os +import sys + +import pandas as pd +import torch.cuda +import numpy as np +import yaml +from matplotlib import pyplot as plt +from tqdm import tqdm +from pathlib import Path +from torch.nn import functional as F +from torch.utils.data import DataLoader +from load_dataset import TestApneaDataset2, read_dataset +from utils.Draw_ConfusionMatrix import draw_confusionMatrix +from torch import nn +from utils.calc_metrics import CALC_METRICS +from my_augment import my_augment, my_segment_augment +from model.Hybrid_Net003 import HYBRIDNET003 + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +exam_path = Path("./output/") + +# 置信率阈值 +thresh = 0.5 +# 间隔最小距离 +thresh_event_interval = 2 +# 最小事件长度 +thresh_event_length = 8 + +# +event_thresh = 1 + +severity_path = Path(r"/home/marques/code/marques/apnea/dataset/loc_first_csa.xlsx") +severity_label = {"all": "none"} +severity_df = pd.read_excel(severity_path) +for one_data in severity_df.index: + one_data = severity_df.loc[one_data] + severity_label[str(one_data["数据编号"])] = one_data["程度"] + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" +gpu = torch.cuda.is_available() + +num_classes = 1 +calc_metrics = CALC_METRICS(num_classes) + +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps +data_path = 
hyp["Path"]["dataset"] +read_dataset(data_path, augment=my_augment) +del hyp + +# 默认取最新的文件夹 +all_output_path, output_path, segments_results_save_path, events_results_save_path, = [None, ] * 4 +my_augment, model_path, label_path, data_path, model, model_name = [None, ] * 6 +train_set, test_set = None, None +loss_func = nn.CrossEntropyLoss() + +columns = ["sampNo", "segmentNo", "label_type", "new_label", "SP", "EP", "pred"] +columns2 = ["sampNo", "severity", "origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", + "acc", "recall", "spec", "pre", "NPV", "F1score", "support"] + +logging.getLogger('matplotlib.font_manager').disabled = True +logging.getLogger('matplotlib.ticker').disabled = True +logger = logging.getLogger() +logger.setLevel(logging.INFO) +ch = logging.StreamHandler() +ch.setLevel(logging.INFO) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) + +if (exam_path / "test.log").exists(): + (exam_path / "test.log").unlink() +fh = logging.FileHandler(exam_path / "test.log", mode='a') +fh.setLevel(logging.INFO) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) +logger.info("------------------------------------") + + +def set_environment(i): + global output_path, segments_results_save_path, events_results_save_path, model_path, label_path, data_path, \ + model, model_name, train_set, test_set + + output_path = all_output_path[i] + logger.info(output_path) + segments_results_save_path = (output_path / "segments_results") + segments_results_save_path.mkdir(exist_ok=True) + events_results_save_path = (output_path / "events_results") + events_results_save_path.mkdir(exist_ok=True) + + # 加载配置 + with open(output_path / "settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps + data_path = hyp["Path"]["dataset"] + label_path = hyp["Path"]["label"] + train_set = hyp["train_set"] + test_set = hyp["test_set"] + + model_path = output_path / "weights" / "best.pt" + model = eval(hyp["model_name"])() + model_name = hyp["model_name"] + model.load_state_dict(torch.load(model_path)) + model.cuda() + model.eval() + + +def test_and_analysis_and_visual(dataset_type): + if dataset_type == "test": + sampNo = train_set + elif dataset_type == "all_test": + sampNo = test_set + else: + sampNo = None + logger.info("出错了") + + exam_name = Path("./").absolute().name + + test_dataset = TestApneaDataset2(data_path, label_path, select_sampno=sampNo, dataset_type=dataset_type, + segment_augment=my_segment_augment) + test_loader = DataLoader(test_dataset, batch_size=128, pin_memory=True, num_workers=0) + + test_loss = 0.0 + + df_segment = pd.DataFrame(columns=columns) + + for one in tqdm(test_loader, total=len(test_loader)): + resp, stft, labels = one[:3] + other_info = one[3:] + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + out = model(resp, stft) + + loss = loss_func(out, labels) + + test_loss += loss.item() + + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + + calc_metrics.update(out.cpu(), labels.cpu()) + # one[0] = list(one[0].cpu().numpy()) + # one[1] = list(one[1].cpu().numpy()) + # one = one[1:] + # out = out.view(1, -1).cpu().numpy().tolist() + # one += out + # result_record += [i for i in list(np.array(one, dtype=object).transpose(1, 0))] + + one2 = np.array([i.cpu().numpy() for i in (other_info + 
[out.squeeze()])]) + one2 = one2.transpose((1, 0)) + df = pd.DataFrame(data=one2, columns=columns) + df_segment = df_segment.append(df, ignore_index=True) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(f"EXAM_NAME: {exam_name} SampNO: {sampNo}") + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=0, epoch_type="test")) + calc_metrics.reset() + + df_segment["thresh_label"] = 1 * (df_segment["label_type"] > event_thresh).copy() + df_segment["thresh_Pred"] = 1 * (df_segment["pred"] > thresh).copy() + df_segment["pred"] = df_segment["pred"].copy().apply(lambda x: round(x, 3)) + + # 片段级分析 + df_segment_metrics = analysis_results(df_segment, segments_results_save_path, dataset_type) + + # 绘制混淆矩阵 + # 每个样本都绘制一份 + confusionMatrix(df_segment_metrics, segments_results_save_path, dataset_type) + # 绘制柱状图 + + # 事件级分析 + # 对于inner_test 每个编号就是一个事件 + # 而对于整晚的independence_test,需要另行计算 + df_all_event = segment_to_event(df_segment, dataset_type) + df_event_metrics = analysis_results(df_all_event, events_results_save_path, dataset_type, is_event=True) + confusionMatrix(df_event_metrics, events_results_save_path, dataset_type) + + # 剔除质量不好的样本 + df_bad_segment = df_segment[ + (df_segment["label_type"].isin([2, 3])) & (df_segment["new_label"] == 2)] + df_select_segment = df_segment.drop(df_bad_segment.index) + df_select_segment_metrics = analysis_results(df_select_segment, segments_results_save_path / "remove_2", + dataset_type) + df_select_event = segment_to_event(df_select_segment, dataset_type) + df_event_metrics = analysis_results(df_select_event, events_results_save_path / "remove_2", dataset_type, + is_event=True) + + +def analysis_results(df_result, base_path, dataset_type, is_event=False): + if df_result.empty: + logger.info(base_path, dataset_type, "is_empty") + return None + + (base_path / dataset_type).mkdir(exist_ok=True, parents=True) + + all_sampNo = df_result["sampNo"].unique() + df_metrics = pd.DataFrame(columns=columns2) + df_metrics.loc[0] = 0 + df_metrics.loc[0]["sampNo"] = dataset_type + + for index, sampNo in enumerate(all_sampNo): + df = df_result[df_result["sampNo"] == sampNo] + df.to_csv( + base_path / dataset_type / + f"{int(sampNo)}_{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_result.csv", + index=False) + + df_metrics.loc[index + 1] = np.NAN + df_metrics.loc[index + 1]["sampNo"] = str(int(sampNo)) + df_metrics.loc[index + 1]["support"] = df.shape[0] + df_metrics.loc[index + 1]["severity"] = severity_label[str(int(sampNo))] + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # continue + # else: + df_metrics.loc[index + 1]["origin_P"] = df[df["thresh_label"] == 1].shape[0] + df_metrics.loc[index + 1]["origin_N"] = df[df["thresh_label"] == 0].shape[0] + df_metrics.loc[index + 1]["pred_P"] = df[df["thresh_Pred"] == 1].shape[0] + df_metrics.loc[index + 1]["pred_N"] = df[df["thresh_Pred"] == 0].shape[0] + df_metrics.loc[index + 1]["T"] = df[df["thresh_Pred"] == df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["F"] = df[df["thresh_Pred"] != df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["TP"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["FP"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["TN"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + df_metrics.loc[index + 1]["FN"] = \ + df[(df["thresh_Pred"] != 
df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + + df_metrics.loc[0]["origin_P"] += df_metrics.loc[index + 1]["origin_P"] + df_metrics.loc[0]["origin_N"] += df_metrics.loc[index + 1]["origin_N"] + df_metrics.loc[0]["pred_P"] += df_metrics.loc[index + 1]["pred_P"] + df_metrics.loc[0]["pred_N"] += df_metrics.loc[index + 1]["pred_N"] + df_metrics.loc[0]["T"] += df_metrics.loc[index + 1]["T"] + df_metrics.loc[0]["F"] += df_metrics.loc[index + 1]["F"] + df_metrics.loc[0]["TP"] += df_metrics.loc[index + 1]["TP"] + df_metrics.loc[0]["FP"] += df_metrics.loc[index + 1]["FP"] + df_metrics.loc[0]["TN"] += df_metrics.loc[index + 1]["TN"] + df_metrics.loc[0]["FN"] += df_metrics.loc[index + 1]["FN"] + df_metrics.loc[0]["support"] += df_metrics.loc[index + 1]["support"] + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[index + 1][col] = df_metrics.loc[index + 1][col] if df_metrics.loc[index + 1][ + col] != 0 else np.NAN + + df_metrics.loc[index + 1]["acc"] = df_metrics.iloc[index + 1]["T"] / df_metrics.iloc[index + 1]["support"] + df_metrics.loc[index + 1]["recall"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["origin_P"] + df_metrics.loc[index + 1]["spec"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["origin_N"] + df_metrics.loc[index + 1]["pre"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["pred_P"] + df_metrics.loc[index + 1]["NPV"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["pred_N"] + df_metrics.loc[index + 1]["F1score"] = 2 * df_metrics.iloc[index + 1]["recall"] * df_metrics.iloc[index + 1][ + "pre"] / (df_metrics.iloc[index + 1]["recall"] + df_metrics.iloc[index + 1]["pre"]) + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[index + 1][col] = 0 if pd.isna(df_metrics.loc[index + 1][col]) else \ + df_metrics.loc[index + 1][col] + df_metrics.loc[index + 1][col] = round(df_metrics.loc[index + 1][col], 3) + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # return None + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[0][col] = df_metrics.loc[0][col] if df_metrics.loc[0][col] != 0 else np.NAN + + df_metrics.loc[0]["acc"] = df_metrics.iloc[0]["T"] / df_metrics.iloc[0]["support"] + df_metrics.loc[0]["recall"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["origin_P"] + df_metrics.loc[0]["spec"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["origin_N"] + df_metrics.loc[0]["pre"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["pred_P"] + df_metrics.loc[0]["NPV"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["pred_N"] + df_metrics.loc[0]["F1score"] = 2 * df_metrics.iloc[0]["recall"] * df_metrics.iloc[0]["pre"] / ( + df_metrics.iloc[0]["recall"] + df_metrics.iloc[0]["pre"]) + for col in ["TP", "TN", "FP", "FN", "acc", "recall", "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[0][col] = 0 if pd.isna(df_metrics.loc[0][col]) else df_metrics.loc[0][col] + df_metrics.loc[0][col] = round(df_metrics.loc[0][col], 3) + + # 在inner_test中根据 分严重程度绘制 + if dataset_type == "test": + all_severity = ["正常", "轻度", "中度", "重度"] + for index, severity in enumerate(all_severity): + df_event = df_metrics[df_metrics["severity"] == severity] + df_temp = pd.DataFrame(columns=columns2) + df_temp.loc[0] = 0 + df_temp.loc[0]["sampNo"] = severity + df_temp.loc[0]["severity"] = 
str(index + 1) + + df_temp.loc[0]["origin_P"] += df_event["origin_P"].sum() + df_temp.loc[0]["origin_N"] += df_event["origin_N"].sum() + df_temp.loc[0]["pred_P"] += df_event["pred_P"].sum() + df_temp.loc[0]["pred_N"] += df_event["pred_N"].sum() + df_temp.loc[0]["T"] += df_event["T"].sum() + df_temp.loc[0]["F"] += df_event["F"].sum() + df_temp.loc[0]["TP"] += df_event["TP"].sum() + df_temp.loc[0]["FP"] += df_event["FP"].sum() + df_temp.loc[0]["TN"] += df_event["TN"].sum() + df_temp.loc[0]["FN"] += df_event["FN"].sum() + df_temp.loc[0]["support"] += df_event["support"].sum() + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_temp.loc[0][col] = df_temp.loc[0][col] if df_temp.loc[0][col] != 0 else np.NAN + + df_temp.loc[0]["acc"] = df_temp.iloc[0]["T"] / df_temp.iloc[0]["support"] + df_temp.loc[0]["recall"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["origin_P"] + df_temp.loc[0]["spec"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["origin_N"] + df_temp.loc[0]["pre"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["pred_P"] + df_temp.loc[0]["NPV"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["pred_N"] + df_temp.loc[0]["F1score"] = 2 * df_temp.iloc[0]["recall"] * df_temp.iloc[0]["pre"] / ( + df_temp.iloc[0]["recall"] + df_temp.iloc[0]["pre"]) + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_temp.loc[0][col] = 0 if pd.isna(df_temp.loc[0][col]) else df_temp.loc[0][col] + df_temp.loc[0][col] = round(df_temp.loc[0][col], 3) + + df_metrics = df_metrics.append(df_temp, ignore_index=True) + + df_backup = df_metrics + df_metrics = df_metrics.astype("str") + df_metrics = df_metrics.sort_values("severity") + df_metrics.to_csv(base_path / dataset_type / + f"{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_all_metrics.csv", + index=False, encoding="gbk") + + return df_backup + + +def confusionMatrix(df_analysis, base_path, dataset_type): + if df_analysis is None: + logger.info(base_path, dataset_type, "is None") + return + + if df_analysis.empty: + logger.info(base_path, dataset_type, "is_empty") + return + classes = ["normal", "SA"] + (base_path / dataset_type / "confusionMatrix").mkdir(exist_ok=True, parents=True) + for one_samp in df_analysis.index: + one_samp = df_analysis.loc[one_samp] + cm = np.array([[one_samp["TN"], one_samp["FP"]], [one_samp["FN"], one_samp["TP"]]]) + draw_confusionMatrix(cm, classes=classes, title=str(one_samp["severity"]) + " " + one_samp["sampNo"], + save_path=base_path / dataset_type / "confusionMatrix" / f"{one_samp['sampNo']}.jpg") + + +def segment_to_event(df_segment, dataset_type): + df_all_event = pd.DataFrame(columns=columns) + all_sampNo = df_segment["sampNo"].unique() + + if dataset_type == "test": + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + all_segments_no = df["segmentNo"].unique() + + for index_se, segment_No in enumerate(all_segments_no): + df_temp = df[df["segmentNo"] == segment_No].copy() + SP = df_temp.iloc[0]["EP"] + EP = df_temp.iloc[-1]["EP"] + 1 + df_event.loc[index_se] = [int(sampNo), segment_No, df_temp.iloc[0]["label_type"], + df_temp.iloc[0]["new_label"], SP, EP, 0] + + thresh_Pred = df_temp["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + + # 扩充 + for index_pred, pred in 
enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + for i in value_ar: + if i > thresh_event_length: + df_event.iloc[index_se]["pred"] = 1 + + # df_event.to_csv(events_results / dataset_type / f"{int(sampNo)}_event_results.csv", index=False, + # encoding="gbk") + df_all_event = df_all_event.append(df_event, ignore_index=True) + else: + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + thresh_Pred = df["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + + for value_index, value in enumerate(value_ar): + SP = index_ar[value_index] + EP = index_ar[value_index] + count_ar[value_index] + # TP, FP + if value > thresh_event_length: + + # label_type = 1 if thresh_Pred2[SP:EP].sum() > 0 else 0 + label_type = df["label_type"][SP:EP].max() + new_label = df["new_label"][SP:EP].max() + df_event = df_event.append(pd.DataFrame([[int(sampNo), SP // 30, label_type, new_label, + SP, EP, thresh_Pred2[SP]]], columns=columns), + ignore_index=True) + # if value > 30: + # logger.info([int(sampNo), SP // 30, label_type, new_label, SP, EP, thresh_Pred2[SP]]) + # 长度不够 + else: + df["thresh_Pred"][SP:EP] = 0 + + # 对负样本进行统计 + for segment_no in df["segmentNo"].unique(): + df_temp = df[df["segmentNo"] == segment_no] + if df_temp["thresh_Pred"].sum() > 0: + continue + + df_event = df_event.append(pd.DataFrame( + [[int(sampNo), segment_no, df_temp["label_type"].max(), df_temp["new_label"].max(), segment_no * 30, + (segment_no + 1) * 30, 0]], columns=columns), + ignore_index=True) + + df_all_event = df_all_event.append(df_event, ignore_index=True) + + df_temp = df_all_event.loc[:, ["label_type", "pred"]] + df_all_event["thresh_label"] = 1 * (df_temp["label_type"] > event_thresh) + df_all_event["thresh_Pred"] = 1 * (df_temp["pred"] > thresh) + return df_all_event + + +# 分sampNo保存结果,并不重合地可视化 +# inner_test + +# 分sampNo将与标签不一致的另行保存,并不重合地可视化 + +# import shap +# explainer = shap.TreeExplainer() +# shap_values = explainer.shap_values() + +if __name__ == '__main__': + all_output_path = list(exam_path.rglob("KFold_*")) + for exam_index, test_exam_path in enumerate(all_output_path): + # test_exam_path = exam_path / test_exam_path + set_environment(exam_index) + test_and_analysis_and_visual(dataset_type="test") + test_and_analysis_and_visual(dataset_type="all_test") diff --git a/exam/022/utils/Draw_ConfusionMatrix.py b/exam/022/utils/Draw_ConfusionMatrix.py new file mode 100644 index 0000000..591af60 --- /dev/null +++ b/exam/022/utils/Draw_ConfusionMatrix.py @@ -0,0 +1,46 @@ 
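The segment_to_event() logic above turns per-second predictions into events in three steps: threshold the probabilities, bridge short gaps by dilating each positive second forward by thresh_event_interval - 1 seconds, then run-length encode the result and keep only positive runs longer than thresh_event_length. A minimal standalone sketch of that post-processing (illustrative function and variable names; thresholds copied from the constants defined near the top of test_save_result.py; not part of the patch itself):

import numpy as np

thresh_event_interval = 2   # gaps shorter than this many seconds get bridged
thresh_event_length = 8     # positive runs must be longer than this to become events

def merge_predictions(per_second_pred):
    pred = np.asarray(per_second_pred)
    dilated = pred.copy()
    # dilation: every positive second also marks the following (interval - 1) seconds
    for idx, p in enumerate(pred):
        if p == 1:
            for offset in range(1, thresh_event_interval):
                if idx + offset < pred.size:
                    dilated[idx + offset] = 1
    # run-length encoding: boundaries of equal-valued runs
    boundaries = np.concatenate(([True], dilated[:-1] != dilated[1:], [True]))
    starts = np.where(boundaries)[0]
    run_lengths = np.diff(starts)
    run_values = dilated[boundaries[:-1]]
    events = []
    for sp, length, value in zip(starts[:-1], run_lengths, run_values):
        if value == 1 and length > thresh_event_length:
            events.append((int(sp), int(sp + length)))  # [start second, end second)
    return events

# Two positive stretches separated by a one-second gap are bridged and kept as one event:
print(merge_predictions([0] * 5 + [1] * 4 + [0] + [1] * 5 + [0] * 5))  # -> [(5, 16)]

With these settings an isolated positive second is discarded, while a one-second dip inside a long positive stretch no longer splits it into two events.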
+#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:Draw_ConfusionMatrix.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/08/10 +""" +import numpy as np +from matplotlib import pyplot as plt + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 + + +def draw_confusionMatrix(cm, classes, title, save_path, cmap=plt.cm.Blues): + fig_cm, ax = plt.subplots(figsize=(8, 8), dpi=120) + im = ax.imshow(cm, interpolation='nearest', cmap=cmap) + ax.figure.colorbar(im, ax=ax) + ax.set(xticks=np.arange(cm.shape[1]), + yticks=np.arange(cm.shape[0]), + xticklabels=classes, yticklabels=classes, + title=title, + ylabel='True label', + xlabel='Predicted label') + ax.set_ylim(len(classes) - 0.5, -0.5) + + # Rotate the tick labels and set their alignment. + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") + normalize = False + fmt = '.2f' if normalize else 'd' + thresh = cm.max() * 0.8 + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(cm[i, j], fmt), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black") + fig_cm.tight_layout() + fig_cm.savefig(save_path) + plt.close() + # + + +if __name__ == '__main__': + pass diff --git a/exam/022/utils/Preprocessing.py b/exam/022/utils/Preprocessing.py new file mode 100644 index 0000000..e3b60c4 --- /dev/null +++ b/exam/022/utils/Preprocessing.py @@ -0,0 +1,181 @@ +# encoding:utf-8 + +""" +@ date: 2020-09-16 +@ author: jingxian +@ illustration: Pre-processing +""" + + +import sys +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import pywt +from scipy import signal +from scipy import fftpack + + +def Dilate(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.min(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def Eorde(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.max(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def fin_turn(data, peak): + if len(data) == 0 or len(peak) == 0: return peak + return_peak = [] + for p in peak: + minx, maxx = max(0, p - 100), min(len(data), p + 100) + return_peak.append(minx + np.argmax(data[minx: maxx])) + return return_peak + + +class BCG_Operation(): + def __init__(self, sample_rate=1000): + self.sample_rate = sample_rate + + def down_sample(self, data=None, down_radio=10): + if data is None: + raise ValueError("data is None, please given an real value!") + data = data[:len(data) // down_radio * down_radio].reshape(-1, down_radio)[:, 0] + self.sample_rate = self.sample_rate / down_radio + return data + + def Splitwin(self, data=None, len_win=None, coverage=1.0, calculate_to_end=False): + """ + 分窗 + :param len_win: length of window + :return: signal windows + """ + if (len_win is None) or (data is None): + raise ValueError("length of window or data is None, please given an real value!") + else: + length = len_win * self.sample_rate # number point of a window + # step of split windows + step = length * coverage + start = 0 + Splitdata = [] + while (len(data) - start >= length): + Splitdata.append(data[int(start):int(start + length)]) + start += step + if calculate_to_end and (len(data) - start > 2000): + remain = len(data) - start + start = start - step + step = int(remain / 2000) + start = start + step * 2000 + Splitdata.append(data[int(start):int(start + 
length)])
+                return np.array(Splitdata), step
+            elif calculate_to_end:
+                return np.array(Splitdata), 0
+            else:
+                return np.array(Splitdata)
+
+    def Butterworth(self, data, type, low_cut=0.0, high_cut=0.0, order=10):
+        """
+        :param type: Type of Butter. filter, lowpass, bandpass, ...
+        :param low_cut: Low cutoff frequency
+        :param high_cut: High cutoff frequency
+        :param order: Order of filter
+        :return: Signal after filtering
+        """
+        if type == "lowpass":  # lowpass filtering
+            b, a = signal.butter(order, low_cut / (self.sample_rate * 0.5), btype='lowpass')
+            return signal.filtfilt(b, a, np.array(data))
+        elif type == "bandpass":  # bandpass filtering
+            low = low_cut / (self.sample_rate * 0.5)
+            high = high_cut / (self.sample_rate * 0.5)
+            b, a = signal.butter(order, [low, high], btype='bandpass')
+            return signal.filtfilt(b, a, np.array(data))
+        elif type == "highpass":  # highpass filtering
+            b, a = signal.butter(order, high_cut / (self.sample_rate * 0.5), btype='highpass')
+            return signal.filtfilt(b, a, np.array(data))
+        else:  # a filter type must be given
+            raise ValueError("Please choose a type of filter")
+
+    def MorphologicalFilter(self, data=None, M=200, get_bre=False):
+        """
+        :param data: Input signal
+        :param M: Length of structural element
+        :return: Signal after filter
+        """
+        if not data.any():
+            raise ValueError("The input data is empty, please give real-valued data")
+        g = np.ones(M)
+        Data_pre = np.insert(data, 0, np.zeros(M))
+        Data_pre = np.insert(Data_pre, -1, np.zeros(M))
+        # Opening: erosion + dilation
+        out1 = Eorde(Data_pre, len(Data_pre), g, M)
+        out2 = Dilate(out1, len(out1), g, M)
+        out2 = np.insert(out2, 0, np.zeros(M - 2))
+        # Closing: dilation + erosion
+        out5 = Dilate(Data_pre, len(Data_pre), g, M)
+        out6 = Eorde(out5, len(out5), g, M)
+        out6 = np.insert(out6, 0, np.zeros(M - 2))
+
+        baseline = (out2 + out6) / 2
+        # ------------------------- keep the residual: signal minus baseline ------------------------
+        data_filtered = Data_pre[:len(baseline)] - baseline
+        data_filtered = data_filtered[M: M + len(data)]
+        baseline = baseline[M:]
+        data_filtered[-1] = data_filtered[-2] = data_filtered[-3]
+        baseline[-1] = baseline[-2] = baseline[-3]
+        if get_bre:
+            return data_filtered, baseline
+        else:
+            return data_filtered
+
+    def Iirnotch(self, data=None, cut_fre=50, quality=3):
+        """Notch filter"""
+        b, a = signal.iirnotch(cut_fre / (self.sample_rate * 0.5), quality)
+        return signal.filtfilt(b, a, np.array(data))
+
+    def ChebyFilter(self, data, rp=1, type=None, low_cut=0, high_cut=0, order=10):
+        """
+        Chebyshev type-I filter
+        :param data: Input signal
+        :param rp: The maximum ripple allowed
+        :param type: 'lowpass', 'bandpass', 'highpass'
+        :param low_cut: Low cut-off frequency
+        :param high_cut: High cut-off frequency
+        :param order: The order of filter
+        :return: Signal after filter
+        """
+        if type == 'lowpass':
+            b, a = signal.cheby1(order, rp, low_cut, btype='lowpass', fs=self.sample_rate)
+            return signal.filtfilt(b, a, np.array(data))
+        elif type == 'bandpass':
+            b, a = signal.cheby1(order, rp, [low_cut, high_cut], btype='bandpass', fs=self.sample_rate)
+            return signal.filtfilt(b, a, np.array(data))
+        elif type == 'highpass':
+            b, a = signal.cheby1(order, rp, high_cut, btype='highpass', fs=self.sample_rate)
+            return signal.filtfilt(b, a, np.array(data))
+        else:
+            raise ValueError("The type of filter is None, please give a valid value!")
+
+    def Envelope(self, data):
+        """Signal envelope"""
+        if len(data) <= 1:
+            raise ValueError("Wrong input data")
+        hx = fftpack.hilbert(data)
+        # envelope of the analytic signal: sqrt(data^2 + H(data)^2)
+        return np.sqrt(hx ** 2 + data ** 2)
+
+    def wavelet_trans(self, data, c_level=['aaa', 'aad'], wavelet='db4', mode='symmetric', maxlevel=10):
+        wp = 
pywt.WaveletPacket(data=data, wavelet=wavelet, mode=mode, maxlevel=maxlevel) + new_wp = pywt.WaveletPacket(data=None, wavelet=wavelet, mode=mode) + for c in c_level : + new_wp[c] = wp[c] + return new_wp.reconstruct() + + # def em_decomposition(self, data): + # from pyhht.emd import EMD + # return EMD(data).decompose() + diff --git a/exam/022/utils/calc_metrics.py b/exam/022/utils/calc_metrics.py new file mode 100644 index 0000000..dcf78cc --- /dev/null +++ b/exam/022/utils/calc_metrics.py @@ -0,0 +1,84 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:calc_metrics.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/12 +""" +import torch +import torchmetrics + + +class CALC_METRICS: + metrics = [] + nc = 0 + + def __init__(self, nc): + self.nc = nc + self.metrics.append(torchmetrics.Accuracy(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Recall(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Precision(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Specificity(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.F1Score(average="none", num_classes=nc, multiclass=False)) + self.valid_result = self.train_result = None + + def update(self, pred, target): + for part1 in self.metrics: + part1.update(pred.cpu(), target.cpu()) + + def compute(self): + result = [] + for part1 in self.metrics: + result.append(part1.compute()) + + def reset(self): + for part1 in self.metrics: + part1.reset() + + def get_matrix(self, loss=None, cur_lr=None, epoch=None, epoch_type=None): + temp_result = [] + for j in self.metrics: + compute_result = (j.compute().cpu().numpy() * 100).tolist() + temp_result.append(compute_result) + + if epoch_type == "train": + self.train_result = [loss] + temp_result + elif epoch_type == "valid": + self.valid_result = [loss] + temp_result + else: + pass + + a = "" + a += f"{epoch_type} epoch: {str(epoch)} loss: {str(loss)} lr: {str(cur_lr)} \n" + a += " " * 8 + "Acc".center(8) + "Rec".center(8) + "Pre".center(8) + "Spe".center(8) + "F1".center(8) + "\n" + a += "all".center(8) + "".join([str(round(float(i), 2)).center(8) for i in temp_result]) + "\n" + return a + + def wandb_log(self, wandb=None, cur_lr=None): + if wandb is None: + return + + keyword = ["Accuracy", "Recall", "Precision", "Specificity", "F1Score"] + dict_key = [] + for epoch_type in ["train", "valid"]: + dict_key.append(epoch_type + "/" + "loss") + for i in keyword: + dict_key.append(epoch_type + "/" + i) + + log_dict = dict(zip(dict_key, self.train_result + self.valid_result)) + log_dict["lr"] = cur_lr + wandb.log(log_dict) + + +if __name__ == '__main__': + # pred = [[0.1], [0.2], [0.3], [0.4], [0.5], [0.6], [0.7], [0.8], [0.9], [1.0]] + # true = [[0], [0], [1], [0], [0], [0], [0], [0], [0], [1]] + pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] + true = [0, 0, 1, 0, 0, 0, 0, 0, 0, 1] + pred = torch.tensor(pred).cuda() + true = torch.tensor(true).cuda() + calc_metrics = CALC_METRICS(1) + calc_metrics.update(pred, true) + print(calc_metrics.get_matrix()) diff --git a/exam/023/generate_label_14.py b/exam/023/generate_label_14.py new file mode 100644 index 0000000..cc5f96d --- /dev/null +++ b/exam/023/generate_label_14.py @@ -0,0 +1,561 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:generate_label_11.0.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/05 +""" 
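# Note on utils/calc_metrics.py above: `metrics = []` there is a class attribute, and
# __init__ appends to it through self.metrics, so every CALC_METRICS instance shares
# (and keeps growing) the same list. The scripts in this patch only build one instance
# per process, so it works as used, but assigning self.metrics = [] inside __init__
# would make the class safe to instantiate more than once.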
+# 14.0 +# 手动均衡数量 + +# 13.0 +# 限制选择部分数据集,先做测试 + +# 12.0 +# 置不可用事件的片段为上限,不可用片段设置为背景,不记录事件 + +# 10.0 +# 使用提出质量差的信号 + +# 9.0 +# 增加 最新的质量标签 未使用 + +# 8.0 +# 生成 除低通气所有事件标签 + +# 尝试过步进两秒 会造成不足两秒的数据被抛弃,造成较多误判,但是可以考虑囊括这部分 +# 采用 30秒数据 移动 1秒 将所有呼吸暂停标注为1 低通气为0 正常为0 + +# 预处理操作 为 50Hz陷波滤波器去工频 外加 20Hz的低通滤波器 这个20Hz要看BCG信号的频谱范围 + +# 先提剔除极端值 +# 数值大于最高基准线或最低基准线 +# type1: average:1800 low:1200 high:2400 +# type2: average:2400 low:1800 high:3000 + +# 过多片段会造成平均值偏移 +# TODO +# 加入体动标签,计算除体动外的平均值 + +# 最后降采为100hz + +import time +import logging +import numpy as np +import pandas as pd +from pathlib import Path + +from datetime import datetime + +import yaml +from pathos import multiprocessing +from tqdm import tqdm + +# 数据集 和 标签 位置 +bcg_numpy_data_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/") +bcg_label_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_label_0616/") + +# BCG 记录开始时间 +bcg_start_time = np.loadtxt(Path(r"/home/marques/code/marques/apnea/dataset/start_time.csv"), delimiter=', ', + dtype=object) +bcg_start_time = dict(zip(bcg_start_time[:, 0], bcg_start_time[:, 1])) + +# 读取每个数据集路径 +all_numpy_dataset = list(bcg_numpy_data_path.rglob("*.npy")) +all_numpy_dataset.sort() + +# 划分后的数据集保存路径 +# dataset_save_path = Path(r"/home/marques/code/marques/apnea/dataset/dataset/dataset0623_300_30_30/") +dataset_save_path = Path(r"./dataset/") +dataset_save_path.mkdir(exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) +fh = logging.FileHandler(dataset_save_path / (realtime + ".log"), mode='w') +fh.setLevel(logging.NOTSET) +# fh.setFormatter(logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(ch) + +# all_label = [] +# 输出统计数据标题栏 +# logger.info("sampNo".center(8) + +# "hpy_num".center(8) + "hpy_time".center(10) + +# "csa_num".center(8) + "csa_time".center(10) + +# "osa_num".center(8) + "osa_time".center(10) + +# "msa_num".center(8) + "msa_time".center(10) +# ) + +logger.info("sampNo".center(8) + ',' + + "train_num".center(10) + ',' + "train_P".center(10) + ',' + "train_N".center(10) + ',' + + "valid_num".center(10) + ',' + "valid_P".center(10) + ',' + "valid_N".center(10) + ',' + + "test_num".center(10) + ',' + "test_P".center(10) + ',' + "test_N".center(10) + ',' + + "train_eve".center(10) + ',' + "valid_eve".center(10) + ',' + "test_eve".center(10) + ) + +base_random_seed = 42 + +window_second = 30 +step_second = 1 + +valid_ratio = 0.1 +test_ratio = 0.1 + +normal_event_quality_label = 0 +# valid_ratio = 5000 +# test_ratio = 10000 + +assert ((valid_ratio + test_ratio) < 1 and 0 < valid_ratio < 1 and 0 < test_ratio < 1) or ( + valid_ratio > 1 and valid_ratio > 1), "验证集与测试集输入应同时为比例或数量" + +# dataset sampNo for test +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) +select_dataset = hyp["select_sampno"] + +# 需要置成0的片段,前面不一定补零,还有可能上万 +disable_segment = { + '221': [[0, 10000]], + '670': [[0, 20000]], + '683': [[0, 20000]], + '704': [[0, 26000]], + '726': [[0, 20000]], + '736': [[0, 47000]], + '933': [[0, 773560]], + '935': [[0, 26600]], + '952': [[0, 17000]], + '955': [[0, 78000]], + '961': [[0, 107000]], + '962': [[0, 15100]], + '966': [[0, 13120]], + '967': [[0, 44000]], + '1006': [[0, 60000]], + '1009': [[0, 
1000]], + '1010': [[0, 49000]], + '1296': [[0, 27000]], + '1300': [[0, 33800]], + '1301': [[0, 14000]], + '1302': [[0, 5600]], + '1374': [[0, 1000]], + '1478': [[0, 998000]], + +} + + +# 生成数据集主函数 +def generate_label(No, dataset_path): + """ + :param dataset_path: 数据集路径 + :return: + """ + + # 获取数据编号 + sampNo = dataset_path.stem.split("samp")[0] + # 标签路径 + label_path = bcg_label_path / f"export{sampNo}_all.csv" + + if not label_path.exists(): + raise FileNotFoundError(f"{label_path} not exist") + + if not dataset_path.exists(): + raise Exception(f"{dataset_path} not exists") + + # 加载数据集 + select_numpy = np.load(dataset_path) + select_numpy_len = len(select_numpy) + + # 开头不合理片段置零 + if sampNo in disable_segment.keys(): + for sp, ep in disable_segment[sampNo]: + select_numpy[sp:ep] = 0 + + # 剔除质量差信号 + if sampNo == "670": + select_numpy = select_numpy[:17195 * 100] + + # 获取前面补了多少0 + not_zero_point = 0 + for num in select_numpy: + if num > 10: + break + not_zero_point += 1 + not_zero_point //= 100 + + # 读取标签 + label_csv = pd.read_csv(label_path, encoding='gbk') + label_csv["new_start"] = label_csv["new_start"].astype("int") + label_csv["new_end"] = label_csv["new_end"].astype("int") + label_csv["Duration"] = label_csv["Duration"].astype("int") + label_csv["new_label"] = label_csv["new_label"].fillna("2") + label_csv["new_label"] = label_csv["new_label"].astype("int") + # 剔除质量不好的样本 + # drop_csv = label_csv[ + # (label_csv["Event type"].isin(["Central apnea", "Obstructive apnea"])) & (label_csv["new_label"] == 2)] + # label_csv = label_csv.drop(drop_csv.index) + + # 事件片段与背景片段, 每个背景长度均为设定窗长 + segment_labels = [] + negative_labels = [] + hpy_num = csa_num = osa_num = msa_num = 0 + hpy_time = csa_time = osa_time = msa_time = 0 + + # 遍历全部事件并统计 + for i in range(len(label_csv)): + # 进行LabelEncoder + label = label_csv.iloc[i, :] + # 如果事件在补零片段,则不添加到事件列表 + if label["new_end"] < not_zero_point: + continue + + if sampNo == "670" and label["new_start"] > 17195: + continue + + if label["new_end"] - label["new_start"] < 10: + continue + + # 将事件添加到事件列表 + if label["Event type"] == "Hypopnea": + label_type = 1 + hpy_num += 1 + hpy_time += label["new_end"] - label["new_start"] + + # 将低通气添加到背景 好像不用专门加入到负样本事件中? 
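            # For reference, the Event type -> label_type encoding applied by this loop
            # (an editorial summary of the branches, not new logic):
            #   Hypopnea          -> 1  (counted above but skipped, i.e. left as background)
            #   Central apnea     -> 2
            #   Obstructive apnea -> 3
            #   Mixed apnea       -> 3  (treated as OSA)
            #   anything else     -> skipped
            # Each kept event is appended below as
            # [sampNo, row index, label_type, new_label, new_start, new_end].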
+ # negative_labels.append( + # [sampNo, i, label_type, normal_event_quality_label, label["new_start"], label["new_end"]]) + continue + elif label["Event type"] == "Central apnea": + label_type = 2 + csa_num += 1 + csa_time += label["new_end"] - label["new_start"] + elif label["Event type"] == "Obstructive apnea": + label_type = 3 + osa_num += 1 + osa_time += label["new_end"] - label["new_start"] + # MSA 认为是OSA + elif label["Event type"] == "Mixed apnea": + label_type = 3 + msa_num += 1 + msa_time += label["new_end"] - label["new_start"] + else: + continue + # label_type = 0 + if label["new_end"] - label["new_start"] > label["Duration"] + 20: + print(sampNo, label) + + # 格式为 样本编号 第几个事件 标签 开始事件 结束事件 + segment_labels.append([sampNo, i, label_type, label["new_label"], label["new_start"], label["new_end"]]) + + # logger.info(sampNo.center(8) + + # str(hpy_num).center(8) + str(hpy_time).center(10) + + # str(csa_num).center(8) + str(csa_time).center(10) + + # str(osa_num).center(8) + str(osa_time).center(10) + + # str(msa_num).center(8) + str(msa_time).center(10)) + + # 设置随机树种子 + random_seed = base_random_seed + int(sampNo) + + # 对于无事件的样本,直接将所有的片段 + if len(segment_labels) == 0: + # 剔除补零片段(把开始点移动到补零结束) + normal_SP = not_zero_point + # 开头至少满足一个窗长 + if normal_SP < window_second: + normal_SP = window_second + + # 结束点为样本总长 除以 采样率 + normal_EP = select_numpy_len // 100 + label_type = 0 + # 正常时间编号为秒数 除以 30,即epoch + negative_labels += [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, + SP1 + window_second] for SP1 in + range(normal_SP - window_second + step_second, normal_EP - window_second + step_second, + window_second)] + + # 对于有事件的样本 + # 遍历事件,获取事件之间的背景片段 + for index in range(len(segment_labels) + 1): + # 前一个事件的结尾 与 下一事件开头即为背景 + # 对于开头的无事件片段,则设定开始事件为0 + if index == 0: + normal_SP = 0 + # 非开头片段,开始点为上一个事件的结尾 + else: + # 加一秒 作为 缓冲 + normal_SP = segment_labels[index - 1][-1] + 1 + + # 最后一个事件则取到样本片段结尾 + if index == len(segment_labels): + normal_EP = select_numpy_len // 100 - window_second + # 否则结束事件取 本事件开头 + else: + # 减一秒 作为 缓冲 + normal_EP = segment_labels[index][-2] - 1 + + # 剔除包含开头补零的片段 + if normal_EP < not_zero_point: + continue + + # 剔除开头不足30s的正常片段 + if normal_SP < window_second: + continue + label_type = 0 + + # 将背景事件按照滑窗距离逐个加入到背景事件中 + temp_1 = [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, SP1 + window_second] + for SP1 in range(normal_SP - window_second, normal_EP - window_second, window_second)] + + negative_labels += temp_1 + + train_label, valid_label, test_label = [], [], [] + # assert (valid_ratio + test_ratio) < 1 <= len(segment_labels), f"{sampNo}训练集与测试集数量应小于总数据集数量{len(segment_labels)}" + + # 对于测试数据全部直接保存 + if int(sampNo) in select_dataset: + event_label = np.zeros(select_numpy_len // 100) + quality_label = np.zeros(select_numpy_len // 100) + # 用于存储事件标签 + for PN, segmentNo, label_type, new_label, SP, EP in segment_labels: + event_label[SP:EP] = label_type + + test_label = [] + # 剔除补零片段 + normal_SP = not_zero_point + if normal_SP < window_second: + normal_SP = window_second + normal_EP = select_numpy_len // 100 + + # 分成指定窗长的滑窗片段 + test_label += [ + [sampNo, SP1 // 30, int(event_label[SP1 + window_second - step_second]), + int(quality_label[SP1 + window_second - step_second]), + SP1, SP1 + window_second] for SP1 in range(normal_SP - window_second + step_second, + normal_EP - window_second + step_second, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + 
',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center(10) + + ',' + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + str(len(segment_labels)).center(10) + ) + + df2.loc[No] = [sampNo, + str(0), str(0), str(0), + str(0), str(0), str(0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(0), str(0), str(len(segment_labels))] + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_all_{window_second}s_sa_test2_label.npy", + # np.array(test_label)) + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_all_label.csv", + index=False) + + train_label, valid_label, test_label = [], [], [] + # 对于训练与验证集样本 + if True: + # 打乱片段顺序 + np.random.seed(random_seed) + np.random.shuffle(segment_labels) + np.random.shuffle(negative_labels) + + # 获取训练集、验证集、测试集分到事件个数 + if 0 < valid_ratio < 1: + train_segment_num = int(len(segment_labels) * (1 - valid_ratio - test_ratio)) + valid_segment_num = int(len(segment_labels) * (1 - test_ratio)) + else: + train_segment_num = len(segment_labels) - valid_ratio - test_ratio + valid_segment_num = valid_ratio + + # 分别将各事件切分为30s, 步进1秒的片段 + for index in range(train_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + train_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(train_segment_num, valid_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + valid_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(valid_segment_num, len(segment_labels)): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + test_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second * step_second)] + + # 计算片段和事件个数 + train_num, valid_num, test_num = len(train_label), len(valid_label), len(test_label) + train_eve, valid_eve, test_eve = train_segment_num, (valid_segment_num - train_segment_num), ( + len(segment_labels) - valid_segment_num) + + # 数据集补偿 + # if train_num < 300: + # train_num = 300 - train_num + # + # if valid_num < 300: + # valid_num = 300 - valid_num + # + # if test_num < 300: + # test_num = 300 - test_num + + # 获取训练集、验证集、测试集分到背景个数 + # if 0 < valid_ratio < 1: + # train_eve2 = int(len(negative_labels) * (1 - valid_ratio - test_ratio)) + # valid_eve2 = int(len(negative_labels) * valid_ratio) + # else: + # train_eve2 = len(negative_labels) - valid_ratio - test_ratio + # valid_eve2 = valid_ratio + # + # test_eve2 = len(negative_labels) - train_eve2 - valid_eve2 + # 直接补充到足够个数的背景事件 + train_eve2 = max(train_eve, 300) + valid_eve2 = max(valid_eve, 40) + test_eve2 = max(test_eve, 40) + + # 强制背景数量 + # train_eve2 = train_eve + # valid_eve2 = valid_eve + # test_eve2 = test_eve + + # 添加背景事件数量 + for sampNo, 
index, label_type, new_label, normal_SP, normal_EP in negative_labels[:train_eve2]: + label_type = int(label_type) + train_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2: train_eve2 + valid_eve2]: + label_type = int(label_type) + valid_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2 + valid_eve2:train_eve2 + valid_eve2 + test_eve2]: + label_type = int(label_type) + test_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(normal_SP, normal_EP, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(len(train_label)).center(10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(len(valid_label)).center(10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(train_eve).center(10) + ',' + str(valid_eve).center(10) + ',' + str(test_eve).center(10) + ) + + df2.loc[No] = [sampNo.center(8), + str(len(train_label)), + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0), + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0), + str(len(valid_label)), + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0), + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(train_eve), str(valid_eve), str(test_eve).center(10)] + + def label_check(label_list): + temp_list = [] + for sampNo, index, label_type, new_label, SP, EP in label_list: + if EP - SP < window_second: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if SP < 0: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, SP, EP, len(select_numpy[SP * 100:EP * 100])) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + for j in temp_list: + label_list.remove(j) + + label_check(train_label) + label_check(valid_label) + label_check(test_label) + for sampNo, index, label_type, new_label, SP, EP in train_label: + if EP - SP < window_second: + print(sampNo, index, label_type, new_label, SP, EP) + + if SP < 0: + print(sampNo, index, label_type, new_label, SP, EP) + + if len(select_numpy[SP * 100:EP * 100]) != 
window_second * 100: + print(sampNo, index, label_type, new_label, SP, EP, len(select_numpy[SP * 100:EP * 100])) + + df1 = pd.DataFrame(data=train_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.csv", + index=False) + + df1 = pd.DataFrame(data=valid_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.csv", + index=False) + + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.csv", index=False) + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.npy", + # np.array(train_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.npy", + # np.array(valid_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.npy", + # np.array(test_label)) + + +if __name__ == '__main__': + # pool = multiprocessing.Pool(processes=44) + # pool.map(generate_label, list(all_numpy_dataset)) + # pool.close() + # pool.join() + + df2 = pd.DataFrame(data=None, + columns=["sampNo", + "train_num", "train_P", "train_N", + "valid_num", "valid_P", "valid_N", + "test_num", "test_P", "test_N", + "train_eve", "valid_eve", "test_eve"]) + + temp = [] + for one_dataset in all_numpy_dataset: + if int(one_dataset.stem.split("samp")[0]) in [*select_dataset]: + temp.append(one_dataset) + # for one_dataset in temp: + # all_numpy_dataset.remove(one_dataset) + + for No, one_dataset in enumerate(temp): + generate_label(No, one_dataset) + + df2.to_csv(dataset_save_path / (realtime + ".csv"), index=False) + # generate_label(all_numpy_dataset[0]) diff --git a/exam/023/load_dataset.py b/exam/023/load_dataset.py new file mode 100644 index 0000000..99806f8 --- /dev/null +++ b/exam/023/load_dataset.py @@ -0,0 +1,178 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:load_dataset.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2021/12/03 +""" +import sys +from pathlib import Path +import pandas as pd +import numpy as np +import torch.utils.data +from torch.utils.data import Dataset +from tqdm import tqdm +from utils.Preprocessing import BCG_Operation + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 +""" +1. 
读取方法 +# 无论是否提前切分,均提前转成npy格式 +# 1.1 提前预处理,切分好后生成npy,直接载入切分好的片段 内存占用多 读取简单 +使用此方法: 1.2 提前预处理,载入整夜数据,切分好后生成csv或xls,根据片段读取 内存占用少 读取较为复杂 +""" + +datasets = {} + + +# 减少重复读取 +def read_dataset(data_path, augment=None): + data_path = Path(data_path) + try: + f = [] + if data_path.is_dir(): + dataset_list = list(data_path.rglob("*.npy")) + dataset_list.sort() + f += dataset_list + elif data_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{data_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {data_path}: {e} \n') + + print("loading dataset") + for i in tqdm(f): + select_dataset = np.load(i) + select_dataset = preprocessing.Butterworth(select_dataset, "lowpass", low_cut=20, order=3) + if augment is not None: + select_dataset = augment(select_dataset) + datasets[i.name.split("samp")[0]] = select_dataset + + +# 用第二种方法读取 +class ApneaDataset(Dataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + self.data_path = data_path + self.label_path = label_path + self.segment_augment = segment_augment + self.labels = None + self.dataset_type = dataset_type + self.select_sampNo = select_sampno + + # self._getAllData() + self._getAllLabels() + + def __getitem__(self, index): + # PN patience number + # SP/EP start point, end point + # temp_label.append([sampNo, label[-1], i, hpy_num, csa_num, osa_num, mean_low, flow_low]) + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1)) + else: + raise Exception(f'dataset read failure!') + + def count_SA(self): + return sum(self.labels[:, 3] > 1) + + def __len__(self): + return len(self.labels) + + def _getAllLabels(self): + label_path = Path(self.label_path) + if not label_path.exists(): + raise Exception(f'{self.label_path} does not exist') + + try: + f = [] + if label_path.is_dir(): + if self.dataset_type == "train": + label_list = list(label_path.rglob("*_train_label.csv")) + elif self.dataset_type == "valid": + label_list = list(label_path.rglob("*_valid_label.csv")) + elif self.dataset_type == "test": + label_list = list(label_path.glob("*_sa_test_label.csv")) + # label_list = list(label_path.rglob("*_test_label.npy")) + elif self.dataset_type == "all_test": + label_list = list(label_path.rglob("*_sa_all_label.csv")) + else: + raise ValueError("self.dataset type error") + # label_list = list(label_path.rglob("*_label.npy")) + label_list.sort() + f += label_list + elif label_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{self.label_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {self.label_path}: {e} \n') + print("loading labels") + for i in tqdm(f): + if int(i.name.split("_")[0]) not in self.select_sampNo: + continue + + if self.labels is None: + self.labels = pd.read_csv(i).to_numpy(dtype=int) + else: + labels = pd.read_csv(i).to_numpy(dtype=int) + if len(labels) > 0: + self.labels = np.concatenate((self.labels, labels)) + # self.labels = self.labels[:10000] + print(f"{self.dataset_type} length is {len(self.labels)}") + + +class TestApneaDataset(ApneaDataset): + def __init__(self, data_path, label_path, dataset_type, select_sampno, segment_augment=None): + 
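        # Note: the label CSVs produced by generate_label_14.py carry six columns
        # ("sampNo", "index", "label_type", "new_label", "SP", "EP"), while __getitem__
        # of this class unpacks only five values, so it looks stale relative to the
        # current label format; TestApneaDataset2 below matches the six-column layout
        # and is the variant imported by test_save_result.py.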
super(TestApneaDataset, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + select_sampno=select_sampno, + segment_augment=segment_augment + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + segment = datasets[str(PN)][int(SP) * 100:int(EP) * 100].copy() + if self.segment_augment is not None: + segment = self.segment_augment(segment) + return segment, int(float(label_type) > 1), PN, segmentNo, SP, EP + else: + raise Exception(f'dataset read failure!') + + +class TestApneaDataset2(ApneaDataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + super(TestApneaDataset2, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + segment_augment=segment_augment, + select_sampno=select_sampno + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1), PN, segmentNo, label_type, new_label, SP, EP) + else: + raise Exception(f'dataset read failure!') + + +if __name__ == '__main__': + pass diff --git a/exam/023/main.py b/exam/023/main.py new file mode 100644 index 0000000..dffa980 --- /dev/null +++ b/exam/023/main.py @@ -0,0 +1,288 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@time:2021/10/15 +""" +import os + +import yaml +import logging +from pathlib import Path + +import time +from torch.nn import functional as F +from torch.utils.data import DataLoader +import torch.cuda +from tqdm import tqdm +from torchinfo import summary +from load_dataset import ApneaDataset, read_dataset + +from torch import nn +from utils.calc_metrics import CALC_METRICS +from sklearn.model_selection import KFold +from model.Hybrid_Net003 import HYBRIDNET003 +# from utils.LossFunction import Foca1lLoss +from my_augment import my_augment, my_segment_augment +# 加载配置 +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) + +os.environ["CUDA_VISIBLE_DEVICES"] = hyp["GPU"] +os.environ["WANDB_MODE"] = "dryrun" + +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) + +# 读取地址参数 +data_path = hyp["Path"]["dataset"] +label_path = hyp["Path"]["label"] + +save_dir = Path(hyp["Path"]["save"]) / (Path(hyp["Path"]["save"]).name + "_" + realtime) +save_dir.mkdir(parents=True, exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +fh = logging.FileHandler(save_dir / (realtime + ".log"), mode='a') +fh.setLevel(logging.NOTSET) +fh.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) +logging.getLogger('matplotlib.font_manager').disabled = True +logger.info("------------------------------------") +logger.info('hyper_parameters: ' + ', '.join(f'{k}={v}\n' for k, v in hyp.items())) + +# 备份配置 +with open(save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + +# Hyper-parameters +gpu = torch.cuda.is_available() +epochs = hyp["epoch"] +lr = hyp["lr"] +nc = hyp["nc"] 
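# Note: nc sizes the binary torchmetrics wrappers in utils/calc_metrics.py (they are
# built with multiclass=False); it is not the model's output width. HYBRIDNET003 still
# emits two logits, and the train/valid loops below take a softmax over them.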
+bs = hyp["batch_size"] +worker = hyp["number_worker"] +select_sampno = hyp["select_sampno"] + +read_dataset(data_path, augment=my_augment) +calc_metrics = CALC_METRICS(nc) + + +# 开始训练 +# 训练 +def model_train(model, train_loader, optimizer, scheduler, loss_func, training_state): + model.train() + train_loss = 0.0 + optimizer.zero_grad() + + pbar = tqdm(enumerate(train_loader), total=len(train_loader), ncols=80) + pbar.set_description(training_state) + for i, (resp, stft, labels) in pbar: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + # 强行归一化数据 + # segments = F.normalize(segments) + # print(segments.size()) + # 减去平均值 + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # 一维卷积在最后一位上卷积 所以输入CNN应为【batch_size, embedding size, sequence size】 + # 所以输入为【batch_size, 1, 3000】 3000 = 30秒 * 100Hz + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + + loss = loss_func(out, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + # 余弦退火传入变量 + # scheduler.step(epoch + i / len(train_loader.dataset)) + # 自适应调整传入变量 + scheduler.step(loss) + + loss_value = loss.item() + train_loss += loss_value + # cur_lr = optimizer.param_groups[-1]['lr'] + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + # if i % 20 == 0: + # pbar.write(calc_metrics.get_matrix(loss=loss_value, cur_lr=cur_lr, epoch=epoch)) + + cur_lr = optimizer.param_groups[-1]['lr'] + train_loss /= len(train_loader) + + calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") + logger.info(training_state) + logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) + calc_metrics.reset() + + +def model_valid(model, valid_loader, wdir, loss_func): + model.eval() + valid_loss = 0.0 + for resp, stft, labels in valid_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + valid_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + valid_loss /= len(valid_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=valid_loss, epoch=epoch, epoch_type="valid")) + global best_f1 + valid_f1 = calc_metrics.metrics[-1].compute() + if valid_f1 > best_f1: + best_f1 = valid_f1 + torch.save(model.state_dict(), wdir / f"best_{epoch}_{str(round(float(valid_f1), 3))}.pt") + torch.save(model.state_dict(), wdir / f"best.pt") + if wandb is not None: + wandb.run.summary["best_f1"] = valid_f1 + calc_metrics.reset() + + +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, stft, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + 
with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp, stft) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + +if __name__ == '__main__': + + try: + import wandb + except ImportError: + wandb = None + prefix = 'wandb: ' + logger.info(f"{prefix}Install Weights & Biases logger with 'pip install wandb'") + + if wandb is not None and wandb.run is None: + wandb_run = wandb.init( + config=hyp, + name=save_dir.stem, + project=hyp["project"], + notes=hyp["Note"], + tags=hyp["tags"], + entity=hyp["entity"], + ) + exam_name = Path("./").absolute().name + + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + summary(model_net, [(32, 1, 300), (32, 121, 26)]) + + time.sleep(3) + if gpu: + model_net.cuda() + + k_folds = 5 + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) + + logger.info('--------------------------------') + for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): + logger.info(f'Start FOLD {fold} / {k_folds}----------------------') + train_set = [select_sampno[i] for i in train_ids] + test_set = [select_sampno[i] for i in test_ids] + logger.info(f'Train_Set:{train_set}') + logger.info(f'Independent_Test_Set:{test_set}') + + sub_save_dir = save_dir / f"KFold_{fold}" + sub_save_dir.mkdir(exist_ok=True, parents=True) + wdir = sub_save_dir / "weights" + wdir.mkdir(exist_ok=True, parents=True) + + hyp["train_set"] = train_set + hyp["test_set"] = test_set + with open(sub_save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + + train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) + valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) + + train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) + valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + + # 重新初始化模型 + del model_net + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + if gpu: + model_net.cuda() + + logger.info(f"Weight is {[train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]}") + # 损失函数与优化器 + loss_function = nn.CrossEntropyLoss( + weight=torch.Tensor([train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]).cuda()) + + # loss_func = nn.BCEWithLogitsLoss() + # loss_func = FocalLoss(class_num=nc, alpha=0.75, size_average="sum") + + # momentum + # nesterov 牛顿动量 + # weight_decay L2正则 + # optimizer = torch.optim.SGD(model_net.parameters(), lr=lr, momentum=0.9, nesterov=True, weight_decay=1e-6) + + optimizer = torch.optim.Adam(model_net.parameters(), lr=lr) + # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=int(hyp["T_max"]), + + scheduler = 
torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, + patience=2836, min_lr=1e-8, + verbose=True) + + # 参数记录 + best_f1 = 0 + + for epoch in range(epochs): + model_train(model_net, train_loader, optimizer, scheduler, loss_function, + f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") + model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) + if wandb is not None: + calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/023/model/Hybrid_Net003.py b/exam/023/model/Hybrid_Net003.py new file mode 100644 index 0000000..b28130d --- /dev/null +++ b/exam/023/model/Hybrid_Net003.py @@ -0,0 +1,101 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:andrew +@file:Hybrid_Net003.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/30 +""" + +import os + +import torch +from torch import nn +from torchinfo import summary +from torch import cat + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" + +# 修改激活函数 +# 提高呼吸采样率 + +# 输入时长 +WHOLE_SEGMENT_SECOND = 30 + +# 呼吸采样率 +RESPIRATORY_FRE = 10 + +# BCG 时频图大小 +BCG_GRAPH_SIZE = (26, 121) + + +class HYBRIDNET003(nn.Module): + def __init__(self, num_classes=2, init_weights=True): + super(HYBRIDNET003, self).__init__() + + self.lstm = nn.LSTM(input_size=26, + hidden_size=16, + num_layers=1, + bidirectional=True, + batch_first=True) + + self.right = nn.Sequential( + nn.Conv1d(in_channels=1, out_channels=8, kernel_size=20, + stride=2, padding=10), + nn.GELU(), + + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(8), + + nn.Conv1d(in_channels=8, out_channels=16, kernel_size=3, + stride=1, padding=1), + nn.GELU(), + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(16), + + nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, + stride=1, padding=1), + nn.GELU(), + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(32) + ) + + self.classifier = nn.Sequential( + # nn.Dropout(p=0.5), + nn.Linear(4480, 512), + nn.GELU(), + nn.Linear(512, num_classes), + ) + + if init_weights: + self.initialize_weights() + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') # 何教授方法 + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) # 正态分布赋值 + nn.init.constant_(m.bias, 0) + + def forward(self, x1, x2): + # x1 = x1.view(x1.shape[0], -1, 5) + x1 = self.right(x1) + x2, (_, _) = self.lstm(x2) + + # print(x1.shape) + # print(x2.shape) + x1 = torch.flatten(x1, start_dim=1) + x2 = torch.flatten(x2, start_dim=1) + x = torch.cat((x1, x2), dim=1) + # print(x.shape) + x = self.classifier(x) + return x + + +if __name__ == '__main__': + model = HYBRIDNET003().cuda() + summary(model, [(32, 1, 30 * RESPIRATORY_FRE), (32, 121, 26)]) diff --git a/exam/023/my_augment.py b/exam/023/my_augment.py new file mode 100644 index 0000000..c9fc3ad --- /dev/null +++ b/exam/023/my_augment.py @@ -0,0 +1,68 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:my_augment.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/07/26 +""" +from utils.Preprocessing import BCG_Operation +import numpy as np +from scipy.signal import stft + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 + + +def my_augment(dataset): + if 1000 < dataset.mean() < 2000: + 
dataset[dataset == 0] = 1825 + dataset[dataset < 1200] = 1225 + dataset[dataset > 2400] = 2425 + elif 2000 < dataset.mean() < 3000: + dataset[dataset == 0] = 2375 + dataset[dataset < 1775] = 1775 + dataset[dataset > 2975] = 2975 + dataset -= 550 + else: + print("something wrong") + + # select_dataset = preprocessing.Butterworth(select_dataset, "lowpass", low_cut=0.7, order=3) + + dataset -= dataset.mean() + dataset = dataset.dot(1 / 600) + + dataset = preprocessing.Iirnotch(dataset) + dataset = preprocessing.Butterworth(dataset, "lowpass", low_cut=20, order=6) + dataset_low = preprocessing.Butterworth(dataset, "lowpass", low_cut=0.7, order=6) + dataset_high = preprocessing.Butterworth(dataset, "highpass", high_cut=1, order=6) + + dataset = {"low": dataset_low, + "high": dataset_high} + return dataset + + +def get_stft(x, fs, n): + print(len(x)) + f, t, amp = stft(x, fs, nperseg=n) + z = np.abs(amp.copy()) + return f, t, z + + +def my_segment_augment(dataset, SP, EP): + dataset_low = dataset["low"][int(SP) * 100:int(EP) * 100].copy() + dataset_high = dataset["high"][int(SP) * 100:int(EP) * 100].copy() + + dataset_low = dataset_low[::10] + dataset_low = dataset_low.reshape(1, dataset_low.shape[0]) + + _, _, dataset_high = stft(dataset_high, 100, nperseg=50) + dataset_high = dataset_high.astype(np.float).T + dataset_high = dataset_high.reshape(dataset_high.shape[0], dataset_high.shape[1]) + + return dataset_low, dataset_high + + +if __name__ == '__main__': + pass diff --git a/exam/023/settings.yaml b/exam/023/settings.yaml new file mode 100644 index 0000000..871a1d1 --- /dev/null +++ b/exam/023/settings.yaml @@ -0,0 +1,42 @@ +# environment config +GPU: "0" + + +# dataset config +Path: + dataset: /home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/ + label: ./dataset/ + save: ./output/ + +batch_size: 256 +number_worker: 0 +model_name: HYBRIDNET003 +select_sampno: + - 282 + - 726 + - 229 + - 582 + - 286 + - 966 + - 1000 + - 1004 + - 1006 + - 1009 + +# train hyperparameters config +epoch: 50 +lr: 0.00001 +nc: 1 + +# wandb config +entity: "marques" +project: "Sleep_Apnea_HYBRID00X" +Note: "HYBRID003 BD STFT RESP DUAL" +tags: ["ReduceLROnPlateau", "RESP CNN", "STFT LSTM"] + + +# "CW":class_weight + + +# "CosineAnnealingLR" +# "ReduceLROnPlateau" \ No newline at end of file diff --git a/exam/023/test_save_result.py b/exam/023/test_save_result.py new file mode 100644 index 0000000..07a1d2f --- /dev/null +++ b/exam/023/test_save_result.py @@ -0,0 +1,478 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:test_analysis.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/21 +""" +import logging +import os +import sys + +import pandas as pd +import torch.cuda +import numpy as np +import yaml +from matplotlib import pyplot as plt +from tqdm import tqdm +from pathlib import Path +from torch.nn import functional as F +from torch.utils.data import DataLoader +from load_dataset import TestApneaDataset2, read_dataset +from utils.Draw_ConfusionMatrix import draw_confusionMatrix +from torch import nn +from utils.calc_metrics import CALC_METRICS +from my_augment import my_augment, my_segment_augment +from model.Hybrid_Net003 import HYBRIDNET003 + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +exam_path = Path("./output/") + +# 置信率阈值 +thresh = 0.5 +# 间隔最小距离 +thresh_event_interval = 2 +# 最小事件长度 +thresh_event_length = 8 + +# +event_thresh = 1 + +severity_path = Path(r"/home/marques/code/marques/apnea/dataset/loc_first_csa.xlsx") 
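Editor's note (not part of the original patch): before the evaluation script continues, it may help to pin down the per-segment shapes that my_segment_augment in my_augment.py above produces and that HYBRIDNET003 consumes. The sketch below assumes the 100 Hz sampling rate and 30 s windows used throughout this exam; the variable names are illustrative only.

import numpy as np
from scipy.signal import stft

fs, win_sec = 100, 30
segment = np.random.randn(fs * win_sec)      # one 30 s BCG window, 3000 samples

resp = segment[::10].reshape(1, -1)          # respiration branch: (1, 300)
_, _, zxx = stft(segment, fs, nperseg=50)    # complex STFT, shape (26, 121)
spec = zxx.real.T                            # the patch's .astype(np.float) keeps only the real part -> (121, 26)
# (get_stft, by contrast, takes np.abs for a magnitude spectrogram; also note
# np.float was removed in NumPy 1.24, so float / np.float64 is the safer spelling)

assert resp.shape == (1, 300) and spec.shape == (121, 26)
# Batched by the DataLoader these become (N, 1, 300) and (N, 121, 26),
# matching summary(model_net, [(32, 1, 300), (32, 121, 26)]) in main.py.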
+severity_label = {"all": "none"} +severity_df = pd.read_excel(severity_path) +for one_data in severity_df.index: + one_data = severity_df.loc[one_data] + severity_label[str(one_data["数据编号"])] = one_data["程度"] + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" +gpu = torch.cuda.is_available() + +num_classes = 1 +calc_metrics = CALC_METRICS(num_classes) + +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps +data_path = hyp["Path"]["dataset"] +read_dataset(data_path, augment=my_augment) +del hyp + +# 默认取最新的文件夹 +all_output_path, output_path, segments_results_save_path, events_results_save_path, = [None, ] * 4 +my_augment, model_path, label_path, data_path, model, model_name = [None, ] * 6 +train_set, test_set = None, None +loss_func = nn.CrossEntropyLoss() + +columns = ["sampNo", "segmentNo", "label_type", "new_label", "SP", "EP", "pred"] +columns2 = ["sampNo", "severity", "origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", + "acc", "recall", "spec", "pre", "NPV", "F1score", "support"] + +logging.getLogger('matplotlib.font_manager').disabled = True +logging.getLogger('matplotlib.ticker').disabled = True +logger = logging.getLogger() +logger.setLevel(logging.INFO) +ch = logging.StreamHandler() +ch.setLevel(logging.INFO) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) + +if (exam_path / "test.log").exists(): + (exam_path / "test.log").unlink() +fh = logging.FileHandler(exam_path / "test.log", mode='a') +fh.setLevel(logging.INFO) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) +logger.info("------------------------------------") + + +def set_environment(i): + global output_path, segments_results_save_path, events_results_save_path, model_path, label_path, data_path, \ + model, model_name, train_set, test_set + + output_path = all_output_path[i] + logger.info(output_path) + segments_results_save_path = (output_path / "segments_results") + segments_results_save_path.mkdir(exist_ok=True) + events_results_save_path = (output_path / "events_results") + events_results_save_path.mkdir(exist_ok=True) + + # 加载配置 + with open(output_path / "settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps + data_path = hyp["Path"]["dataset"] + label_path = hyp["Path"]["label"] + train_set = hyp["train_set"] + test_set = hyp["test_set"] + + model_path = output_path / "weights" / "best.pt" + model = eval(hyp["model_name"])() + model_name = hyp["model_name"] + model.load_state_dict(torch.load(model_path)) + model.cuda() + model.eval() + + +def test_and_analysis_and_visual(dataset_type): + if dataset_type == "test": + sampNo = train_set + elif dataset_type == "all_test": + sampNo = test_set + else: + sampNo = None + logger.info("出错了") + + exam_name = Path("./").absolute().name + + test_dataset = TestApneaDataset2(data_path, label_path, select_sampno=sampNo, dataset_type=dataset_type, + segment_augment=my_segment_augment) + test_loader = DataLoader(test_dataset, batch_size=128, pin_memory=True, num_workers=0) + + test_loss = 0.0 + + df_segment = pd.DataFrame(columns=columns) + + for one in tqdm(test_loader, total=len(test_loader)): + resp, stft, labels = one[:3] + other_info = one[3:] + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + out = model(resp, stft) + + loss = loss_func(out, labels) + + test_loss += loss.item() + + labels = 
torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + + calc_metrics.update(out.cpu(), labels.cpu()) + # one[0] = list(one[0].cpu().numpy()) + # one[1] = list(one[1].cpu().numpy()) + # one = one[1:] + # out = out.view(1, -1).cpu().numpy().tolist() + # one += out + # result_record += [i for i in list(np.array(one, dtype=object).transpose(1, 0))] + + one2 = np.array([i.cpu().numpy() for i in (other_info + [out.squeeze()])]) + one2 = one2.transpose((1, 0)) + df = pd.DataFrame(data=one2, columns=columns) + df_segment = df_segment.append(df, ignore_index=True) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(f"EXAM_NAME: {exam_name} SampNO: {sampNo}") + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=0, epoch_type="test")) + calc_metrics.reset() + + df_segment["thresh_label"] = 1 * (df_segment["label_type"] > event_thresh).copy() + df_segment["thresh_Pred"] = 1 * (df_segment["pred"] > thresh).copy() + df_segment["pred"] = df_segment["pred"].copy().apply(lambda x: round(x, 3)) + + # 片段级分析 + df_segment_metrics = analysis_results(df_segment, segments_results_save_path, dataset_type) + + # 绘制混淆矩阵 + # 每个样本都绘制一份 + confusionMatrix(df_segment_metrics, segments_results_save_path, dataset_type) + # 绘制柱状图 + + # 事件级分析 + # 对于inner_test 每个编号就是一个事件 + # 而对于整晚的independence_test,需要另行计算 + df_all_event = segment_to_event(df_segment, dataset_type) + df_event_metrics = analysis_results(df_all_event, events_results_save_path, dataset_type, is_event=True) + confusionMatrix(df_event_metrics, events_results_save_path, dataset_type) + + # 剔除质量不好的样本 + df_bad_segment = df_segment[ + (df_segment["label_type"].isin([2, 3])) & (df_segment["new_label"] == 2)] + df_select_segment = df_segment.drop(df_bad_segment.index) + df_select_segment_metrics = analysis_results(df_select_segment, segments_results_save_path / "remove_2", + dataset_type) + df_select_event = segment_to_event(df_select_segment, dataset_type) + df_event_metrics = analysis_results(df_select_event, events_results_save_path / "remove_2", dataset_type, + is_event=True) + + +def analysis_results(df_result, base_path, dataset_type, is_event=False): + if df_result.empty: + logger.info(base_path, dataset_type, "is_empty") + return None + + (base_path / dataset_type).mkdir(exist_ok=True, parents=True) + + all_sampNo = df_result["sampNo"].unique() + df_metrics = pd.DataFrame(columns=columns2) + df_metrics.loc[0] = 0 + df_metrics.loc[0]["sampNo"] = dataset_type + + for index, sampNo in enumerate(all_sampNo): + df = df_result[df_result["sampNo"] == sampNo] + df.to_csv( + base_path / dataset_type / + f"{int(sampNo)}_{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_result.csv", + index=False) + + df_metrics.loc[index + 1] = np.NAN + df_metrics.loc[index + 1]["sampNo"] = str(int(sampNo)) + df_metrics.loc[index + 1]["support"] = df.shape[0] + df_metrics.loc[index + 1]["severity"] = severity_label[str(int(sampNo))] + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # continue + # else: + df_metrics.loc[index + 1]["origin_P"] = df[df["thresh_label"] == 1].shape[0] + df_metrics.loc[index + 1]["origin_N"] = df[df["thresh_label"] == 0].shape[0] + df_metrics.loc[index + 1]["pred_P"] = df[df["thresh_Pred"] == 1].shape[0] + df_metrics.loc[index + 1]["pred_N"] = df[df["thresh_Pred"] == 0].shape[0] + df_metrics.loc[index + 1]["T"] = df[df["thresh_Pred"] == df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["F"] = df[df["thresh_Pred"] != 
df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["TP"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["FP"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["TN"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + df_metrics.loc[index + 1]["FN"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + + df_metrics.loc[0]["origin_P"] += df_metrics.loc[index + 1]["origin_P"] + df_metrics.loc[0]["origin_N"] += df_metrics.loc[index + 1]["origin_N"] + df_metrics.loc[0]["pred_P"] += df_metrics.loc[index + 1]["pred_P"] + df_metrics.loc[0]["pred_N"] += df_metrics.loc[index + 1]["pred_N"] + df_metrics.loc[0]["T"] += df_metrics.loc[index + 1]["T"] + df_metrics.loc[0]["F"] += df_metrics.loc[index + 1]["F"] + df_metrics.loc[0]["TP"] += df_metrics.loc[index + 1]["TP"] + df_metrics.loc[0]["FP"] += df_metrics.loc[index + 1]["FP"] + df_metrics.loc[0]["TN"] += df_metrics.loc[index + 1]["TN"] + df_metrics.loc[0]["FN"] += df_metrics.loc[index + 1]["FN"] + df_metrics.loc[0]["support"] += df_metrics.loc[index + 1]["support"] + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[index + 1][col] = df_metrics.loc[index + 1][col] if df_metrics.loc[index + 1][ + col] != 0 else np.NAN + + df_metrics.loc[index + 1]["acc"] = df_metrics.iloc[index + 1]["T"] / df_metrics.iloc[index + 1]["support"] + df_metrics.loc[index + 1]["recall"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["origin_P"] + df_metrics.loc[index + 1]["spec"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["origin_N"] + df_metrics.loc[index + 1]["pre"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["pred_P"] + df_metrics.loc[index + 1]["NPV"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["pred_N"] + df_metrics.loc[index + 1]["F1score"] = 2 * df_metrics.iloc[index + 1]["recall"] * df_metrics.iloc[index + 1][ + "pre"] / (df_metrics.iloc[index + 1]["recall"] + df_metrics.iloc[index + 1]["pre"]) + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[index + 1][col] = 0 if pd.isna(df_metrics.loc[index + 1][col]) else \ + df_metrics.loc[index + 1][col] + df_metrics.loc[index + 1][col] = round(df_metrics.loc[index + 1][col], 3) + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # return None + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[0][col] = df_metrics.loc[0][col] if df_metrics.loc[0][col] != 0 else np.NAN + + df_metrics.loc[0]["acc"] = df_metrics.iloc[0]["T"] / df_metrics.iloc[0]["support"] + df_metrics.loc[0]["recall"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["origin_P"] + df_metrics.loc[0]["spec"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["origin_N"] + df_metrics.loc[0]["pre"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["pred_P"] + df_metrics.loc[0]["NPV"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["pred_N"] + df_metrics.loc[0]["F1score"] = 2 * df_metrics.iloc[0]["recall"] * df_metrics.iloc[0]["pre"] / ( + df_metrics.iloc[0]["recall"] + df_metrics.iloc[0]["pre"]) + for col in ["TP", "TN", "FP", "FN", "acc", "recall", "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[0][col] = 0 if 
pd.isna(df_metrics.loc[0][col]) else df_metrics.loc[0][col] + df_metrics.loc[0][col] = round(df_metrics.loc[0][col], 3) + + # 在inner_test中根据 分严重程度绘制 + if dataset_type == "test": + all_severity = ["正常", "轻度", "中度", "重度"] + for index, severity in enumerate(all_severity): + df_event = df_metrics[df_metrics["severity"] == severity] + df_temp = pd.DataFrame(columns=columns2) + df_temp.loc[0] = 0 + df_temp.loc[0]["sampNo"] = severity + df_temp.loc[0]["severity"] = str(index + 1) + + df_temp.loc[0]["origin_P"] += df_event["origin_P"].sum() + df_temp.loc[0]["origin_N"] += df_event["origin_N"].sum() + df_temp.loc[0]["pred_P"] += df_event["pred_P"].sum() + df_temp.loc[0]["pred_N"] += df_event["pred_N"].sum() + df_temp.loc[0]["T"] += df_event["T"].sum() + df_temp.loc[0]["F"] += df_event["F"].sum() + df_temp.loc[0]["TP"] += df_event["TP"].sum() + df_temp.loc[0]["FP"] += df_event["FP"].sum() + df_temp.loc[0]["TN"] += df_event["TN"].sum() + df_temp.loc[0]["FN"] += df_event["FN"].sum() + df_temp.loc[0]["support"] += df_event["support"].sum() + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_temp.loc[0][col] = df_temp.loc[0][col] if df_temp.loc[0][col] != 0 else np.NAN + + df_temp.loc[0]["acc"] = df_temp.iloc[0]["T"] / df_temp.iloc[0]["support"] + df_temp.loc[0]["recall"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["origin_P"] + df_temp.loc[0]["spec"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["origin_N"] + df_temp.loc[0]["pre"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["pred_P"] + df_temp.loc[0]["NPV"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["pred_N"] + df_temp.loc[0]["F1score"] = 2 * df_temp.iloc[0]["recall"] * df_temp.iloc[0]["pre"] / ( + df_temp.iloc[0]["recall"] + df_temp.iloc[0]["pre"]) + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_temp.loc[0][col] = 0 if pd.isna(df_temp.loc[0][col]) else df_temp.loc[0][col] + df_temp.loc[0][col] = round(df_temp.loc[0][col], 3) + + df_metrics = df_metrics.append(df_temp, ignore_index=True) + + df_backup = df_metrics + df_metrics = df_metrics.astype("str") + df_metrics = df_metrics.sort_values("severity") + df_metrics.to_csv(base_path / dataset_type / + f"{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_all_metrics.csv", + index=False, encoding="gbk") + + return df_backup + + +def confusionMatrix(df_analysis, base_path, dataset_type): + if df_analysis is None: + logger.info(base_path, dataset_type, "is None") + return + + if df_analysis.empty: + logger.info(base_path, dataset_type, "is_empty") + return + classes = ["normal", "SA"] + (base_path / dataset_type / "confusionMatrix").mkdir(exist_ok=True, parents=True) + for one_samp in df_analysis.index: + one_samp = df_analysis.loc[one_samp] + cm = np.array([[one_samp["TN"], one_samp["FP"]], [one_samp["FN"], one_samp["TP"]]]) + draw_confusionMatrix(cm, classes=classes, title=str(one_samp["severity"]) + " " + one_samp["sampNo"], + save_path=base_path / dataset_type / "confusionMatrix" / f"{one_samp['sampNo']}.jpg") + + +def segment_to_event(df_segment, dataset_type): + df_all_event = pd.DataFrame(columns=columns) + all_sampNo = df_segment["sampNo"].unique() + + if dataset_type == "test": + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + 
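Editor's note (sketch, not part of the patch): the loop below turns per-window predictions into events by first dilating each positive window forward by thresh_event_interval - 1 steps (gap filling) and then keeping only runs of positives longer than thresh_event_length. A condensed, self-contained restatement of that rule, with illustrative names and the thresholds from the header (gap=2, min_len=8):

import numpy as np

def windows_to_event(pred, min_len=8, gap=2):
    # dilate each original positive forward by (gap - 1) windows
    pred = np.asarray(pred).copy()
    for i in np.where(pred == 1)[0]:
        pred[i + 1:i + gap] = 1
    # run-length encode and test the longest positive run
    change = np.concatenate(([True], pred[:-1] != pred[1:], [True]))
    idx = np.where(change)[0]
    runs = np.diff(idx) * pred[idx[:-1]]
    return int(runs.max() > min_len)

# e.g. windows_to_event([0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0]) -> 1 once the
# single-window gap is filled; without gap filling the longest run is only 7.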
all_segments_no = df["segmentNo"].unique() + + for index_se, segment_No in enumerate(all_segments_no): + df_temp = df[df["segmentNo"] == segment_No].copy() + SP = df_temp.iloc[0]["EP"] + EP = df_temp.iloc[-1]["EP"] + 1 + df_event.loc[index_se] = [int(sampNo), segment_No, df_temp.iloc[0]["label_type"], + df_temp.iloc[0]["new_label"], SP, EP, 0] + + thresh_Pred = df_temp["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + for i in value_ar: + if i > thresh_event_length: + df_event.iloc[index_se]["pred"] = 1 + + # df_event.to_csv(events_results / dataset_type / f"{int(sampNo)}_event_results.csv", index=False, + # encoding="gbk") + df_all_event = df_all_event.append(df_event, ignore_index=True) + else: + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + thresh_Pred = df["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + + for value_index, value in enumerate(value_ar): + SP = index_ar[value_index] + EP = index_ar[value_index] + count_ar[value_index] + # TP, FP + if value > thresh_event_length: + + # label_type = 1 if thresh_Pred2[SP:EP].sum() > 0 else 0 + label_type = df["label_type"][SP:EP].max() + new_label = df["new_label"][SP:EP].max() + df_event = df_event.append(pd.DataFrame([[int(sampNo), SP // 30, label_type, new_label, + SP, EP, thresh_Pred2[SP]]], columns=columns), + ignore_index=True) + # if value > 30: + # logger.info([int(sampNo), SP // 30, label_type, new_label, SP, EP, thresh_Pred2[SP]]) + # 长度不够 + else: + df["thresh_Pred"][SP:EP] = 0 + + # 对负样本进行统计 + for segment_no in df["segmentNo"].unique(): + df_temp = df[df["segmentNo"] == segment_no] + if df_temp["thresh_Pred"].sum() > 0: + continue + + df_event = df_event.append(pd.DataFrame( + [[int(sampNo), segment_no, df_temp["label_type"].max(), df_temp["new_label"].max(), segment_no * 30, + (segment_no + 1) * 30, 0]], columns=columns), + ignore_index=True) + + df_all_event = df_all_event.append(df_event, ignore_index=True) + + df_temp = df_all_event.loc[:, ["label_type", "pred"]] + df_all_event["thresh_label"] = 1 * (df_temp["label_type"] > event_thresh) + df_all_event["thresh_Pred"] = 1 * (df_temp["pred"] > thresh) + return df_all_event + + +# 分sampNo保存结果,并不重合地可视化 +# inner_test + +# 分sampNo将与标签不一致的另行保存,并不重合地可视化 + +# import shap +# explainer = shap.TreeExplainer() +# shap_values = explainer.shap_values() + +if __name__ == '__main__': + all_output_path = list(exam_path.rglob("KFold_*")) + for 
exam_index, test_exam_path in enumerate(all_output_path): + # test_exam_path = exam_path / test_exam_path + set_environment(exam_index) + test_and_analysis_and_visual(dataset_type="test") + test_and_analysis_and_visual(dataset_type="all_test") diff --git a/exam/023/utils/Draw_ConfusionMatrix.py b/exam/023/utils/Draw_ConfusionMatrix.py new file mode 100644 index 0000000..591af60 --- /dev/null +++ b/exam/023/utils/Draw_ConfusionMatrix.py @@ -0,0 +1,46 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:Draw_ConfusionMatrix.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/08/10 +""" +import numpy as np +from matplotlib import pyplot as plt + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 + + +def draw_confusionMatrix(cm, classes, title, save_path, cmap=plt.cm.Blues): + fig_cm, ax = plt.subplots(figsize=(8, 8), dpi=120) + im = ax.imshow(cm, interpolation='nearest', cmap=cmap) + ax.figure.colorbar(im, ax=ax) + ax.set(xticks=np.arange(cm.shape[1]), + yticks=np.arange(cm.shape[0]), + xticklabels=classes, yticklabels=classes, + title=title, + ylabel='True label', + xlabel='Predicted label') + ax.set_ylim(len(classes) - 0.5, -0.5) + + # Rotate the tick labels and set their alignment. + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") + normalize = False + fmt = '.2f' if normalize else 'd' + thresh = cm.max() * 0.8 + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(cm[i, j], fmt), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black") + fig_cm.tight_layout() + fig_cm.savefig(save_path) + plt.close() + # + + +if __name__ == '__main__': + pass diff --git a/exam/023/utils/Preprocessing.py b/exam/023/utils/Preprocessing.py new file mode 100644 index 0000000..e3b60c4 --- /dev/null +++ b/exam/023/utils/Preprocessing.py @@ -0,0 +1,181 @@ +# encoding:utf-8 + +""" +@ date: 2020-09-16 +@ author: jingxian +@ illustration: Pre-processing +""" + + +import sys +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import pywt +from scipy import signal +from scipy import fftpack + + +def Dilate(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.min(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def Eorde(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.max(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def fin_turn(data, peak): + if len(data) == 0 or len(peak) == 0: return peak + return_peak = [] + for p in peak: + minx, maxx = max(0, p - 100), min(len(data), p + 100) + return_peak.append(minx + np.argmax(data[minx: maxx])) + return return_peak + + +class BCG_Operation(): + def __init__(self, sample_rate=1000): + self.sample_rate = sample_rate + + def down_sample(self, data=None, down_radio=10): + if data is None: + raise ValueError("data is None, please given an real value!") + data = data[:len(data) // down_radio * down_radio].reshape(-1, down_radio)[:, 0] + self.sample_rate = self.sample_rate / down_radio + return data + + def Splitwin(self, data=None, len_win=None, coverage=1.0, calculate_to_end=False): + """ + 分窗 + :param len_win: length of window + :return: signal windows + """ + if (len_win is None) or (data is None): + raise ValueError("length of window or data is None, please given an real value!") + else: + length = 
len_win * self.sample_rate # number point of a window + # step of split windows + step = length * coverage + start = 0 + Splitdata = [] + while (len(data) - start >= length): + Splitdata.append(data[int(start):int(start + length)]) + start += step + if calculate_to_end and (len(data) - start > 2000): + remain = len(data) - start + start = start - step + step = int(remain / 2000) + start = start + step * 2000 + Splitdata.append(data[int(start):int(start + length)]) + return np.array(Splitdata), step + elif calculate_to_end: + return np.array(Splitdata), 0 + else: + return np.array(Splitdata) + + def Butterworth(self, data, type, low_cut=0.0, high_cut=0.0, order=10): + """ + :param type: Type of Butter. filter, lowpass, bandpass, ... + :param lowcut: Low cutoff frequency + :param highcut: High cutoff frequency + :param order: Order of filter + :return: Signal after filtering + """ + if type == "lowpass": # 低通滤波处理 + b, a = signal.butter(order, low_cut / (self.sample_rate * 0.5), btype='lowpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "bandpass": # 带通滤波处理 + low = low_cut / (self.sample_rate * 0.5) + high = high_cut / (self.sample_rate * 0.5) + b, a = signal.butter(order, [low, high], btype='bandpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "highpass": # 高通滤波处理 + b, a = signal.butter(order, high_cut / (self.sample_rate * 0.5), btype='highpass') + return signal.filtfilt(b, a, np.array(data)) + else: # 警告,滤波器类型必须有 + raise ValueError("Please choose a type of fliter") + + def MorphologicalFilter(self, data=None, M=200, get_bre=False): + """ + :param data: Input signal + :param M: Length of structural element + :return: Signal after filter + """ + if not data.any(): + raise ValueError("The input data is None, please given real value data") + g = np.ones(M) + Data_pre = np.insert(data, 0, np.zeros(M)) + Data_pre = np.insert(Data_pre, -1, np.zeros(M)) + # Opening: 腐蚀 + 膨胀 + out1 = Eorde(Data_pre, len(Data_pre), g, M) + out2 = Dilate(out1, len(out1), g, M) + out2 = np.insert(out2, 0, np.zeros(M - 2)) + # Closing: 膨胀 + 腐蚀 + out5 = Dilate(Data_pre, len(Data_pre), g, M) + out6 = Eorde(out5, len(out5), g, M) + out6 = np.insert(out6, 0, np.zeros(M - 2)) + + baseline = (out2 + out6) / 2 + # -------------------------保留剩余价值------------------------ + data_filtered = Data_pre[:len(baseline)] - baseline + data_filtered = data_filtered[M: M + len(data)] + baseline = baseline[M:] + data_filtered[-1] = data_filtered[-2] = data_filtered[-3] + baseline[-1] = baseline[-2] = baseline[-3] + if get_bre: + return data_filtered, baseline + else: + return data_filtered + + def Iirnotch(self, data=None, cut_fre=50, quality=3): + """陷波器""" + b, a = signal.iirnotch(cut_fre / (self.sample_rate * 0.5), quality) + return signal.filtfilt(b, a, np.array(data)) + + def ChebyFilter(self, data, rp=1, type=None, low_cut=0, high_cut=0, order=10): + """ + 切比雪夫滤波器 + :param data: Input signal + :param rp: The maximum ripple allowed + :param type: 'lowpass', 'bandpass, 'highpass' + :param low_cut: Low cut-off fre + :param high_cut: High cut-off fre + :param order: The order of filter + :return: Signal after filter + """ + if type == 'lowpass': + b, a = signal.cheby1(order, rp, low_cut, btype='lowpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif type == 'bandpass': + b, a = signal.cheby1(order, rp, [low_cut, high_cut], btype='bandpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif type == 'highpass': + b, a = signal.cheby1(order, rp, 
high_cut, btype='highpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + else: + raise ValueError("The type of filter is None, please given the real value!") + + def Envelope(self, data): + """取信号包络""" + if len(data) <= 1: raise ValueError("Wrong input data") + hx = fftpack.hilbert(data) + return np.sqrt(hx ** 2, data ** 2) + + def wavelet_trans(self, data,c_level=['aaa','aad'], wavelet='db4', mode='symmetric',maxlevel=10): + wp = pywt.WaveletPacket(data=data, wavelet=wavelet, mode=mode, maxlevel=maxlevel) + new_wp = pywt.WaveletPacket(data=None, wavelet=wavelet, mode=mode) + for c in c_level : + new_wp[c] = wp[c] + return new_wp.reconstruct() + + # def em_decomposition(self, data): + # from pyhht.emd import EMD + # return EMD(data).decompose() + diff --git a/exam/023/utils/calc_metrics.py b/exam/023/utils/calc_metrics.py new file mode 100644 index 0000000..dcf78cc --- /dev/null +++ b/exam/023/utils/calc_metrics.py @@ -0,0 +1,84 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:calc_metrics.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/12 +""" +import torch +import torchmetrics + + +class CALC_METRICS: + metrics = [] + nc = 0 + + def __init__(self, nc): + self.nc = nc + self.metrics.append(torchmetrics.Accuracy(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Recall(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Precision(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Specificity(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.F1Score(average="none", num_classes=nc, multiclass=False)) + self.valid_result = self.train_result = None + + def update(self, pred, target): + for part1 in self.metrics: + part1.update(pred.cpu(), target.cpu()) + + def compute(self): + result = [] + for part1 in self.metrics: + result.append(part1.compute()) + + def reset(self): + for part1 in self.metrics: + part1.reset() + + def get_matrix(self, loss=None, cur_lr=None, epoch=None, epoch_type=None): + temp_result = [] + for j in self.metrics: + compute_result = (j.compute().cpu().numpy() * 100).tolist() + temp_result.append(compute_result) + + if epoch_type == "train": + self.train_result = [loss] + temp_result + elif epoch_type == "valid": + self.valid_result = [loss] + temp_result + else: + pass + + a = "" + a += f"{epoch_type} epoch: {str(epoch)} loss: {str(loss)} lr: {str(cur_lr)} \n" + a += " " * 8 + "Acc".center(8) + "Rec".center(8) + "Pre".center(8) + "Spe".center(8) + "F1".center(8) + "\n" + a += "all".center(8) + "".join([str(round(float(i), 2)).center(8) for i in temp_result]) + "\n" + return a + + def wandb_log(self, wandb=None, cur_lr=None): + if wandb is None: + return + + keyword = ["Accuracy", "Recall", "Precision", "Specificity", "F1Score"] + dict_key = [] + for epoch_type in ["train", "valid"]: + dict_key.append(epoch_type + "/" + "loss") + for i in keyword: + dict_key.append(epoch_type + "/" + i) + + log_dict = dict(zip(dict_key, self.train_result + self.valid_result)) + log_dict["lr"] = cur_lr + wandb.log(log_dict) + + +if __name__ == '__main__': + # pred = [[0.1], [0.2], [0.3], [0.4], [0.5], [0.6], [0.7], [0.8], [0.9], [1.0]] + # true = [[0], [0], [1], [0], [0], [0], [0], [0], [0], [1]] + pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] + true = [0, 0, 1, 0, 0, 0, 0, 0, 0, 1] + pred = torch.tensor(pred).cuda() + true = torch.tensor(true).cuda() + 
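Editor's note (not part of the patch): metrics = [] in CALC_METRICS is a class attribute, so every instance appends its five torchmetrics objects to the same shared list; constructing the class twice in one process would double the list and cross-update the metrics. A minimal instance-level sketch follows. It assumes torchmetrics >= 0.11, whose binary metrics take task="binary" in place of the num_classes / multiclass keywords used in this file; treat it as an illustration, not a drop-in replacement.

import torchmetrics


class BinaryMetricsSketch:
    """Instance-level variant of CALC_METRICS (illustrative only)."""

    def __init__(self):
        # task="binary" corresponds to the average="none", num_classes=1,
        # multiclass=False arguments used in the patch (pre-0.11 API).
        self.metrics = [torchmetrics.Accuracy(task="binary"),
                        torchmetrics.Recall(task="binary"),
                        torchmetrics.Precision(task="binary"),
                        torchmetrics.Specificity(task="binary"),
                        torchmetrics.F1Score(task="binary")]

    def update(self, pred, target):
        for m in self.metrics:
            m.update(pred.cpu(), target.cpu())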
calc_metrics = CALC_METRICS(1) + calc_metrics.update(pred, true) + print(calc_metrics.get_matrix()) diff --git a/exam/031/generate_label_14.py b/exam/031/generate_label_14.py new file mode 100644 index 0000000..cc5f96d --- /dev/null +++ b/exam/031/generate_label_14.py @@ -0,0 +1,561 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:generate_label_11.0.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/05 +""" +# 14.0 +# 手动均衡数量 + +# 13.0 +# 限制选择部分数据集,先做测试 + +# 12.0 +# 置不可用事件的片段为上限,不可用片段设置为背景,不记录事件 + +# 10.0 +# 使用提出质量差的信号 + +# 9.0 +# 增加 最新的质量标签 未使用 + +# 8.0 +# 生成 除低通气所有事件标签 + +# 尝试过步进两秒 会造成不足两秒的数据被抛弃,造成较多误判,但是可以考虑囊括这部分 +# 采用 30秒数据 移动 1秒 将所有呼吸暂停标注为1 低通气为0 正常为0 + +# 预处理操作 为 50Hz陷波滤波器去工频 外加 20Hz的低通滤波器 这个20Hz要看BCG信号的频谱范围 + +# 先提剔除极端值 +# 数值大于最高基准线或最低基准线 +# type1: average:1800 low:1200 high:2400 +# type2: average:2400 low:1800 high:3000 + +# 过多片段会造成平均值偏移 +# TODO +# 加入体动标签,计算除体动外的平均值 + +# 最后降采为100hz + +import time +import logging +import numpy as np +import pandas as pd +from pathlib import Path + +from datetime import datetime + +import yaml +from pathos import multiprocessing +from tqdm import tqdm + +# 数据集 和 标签 位置 +bcg_numpy_data_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/") +bcg_label_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_label_0616/") + +# BCG 记录开始时间 +bcg_start_time = np.loadtxt(Path(r"/home/marques/code/marques/apnea/dataset/start_time.csv"), delimiter=', ', + dtype=object) +bcg_start_time = dict(zip(bcg_start_time[:, 0], bcg_start_time[:, 1])) + +# 读取每个数据集路径 +all_numpy_dataset = list(bcg_numpy_data_path.rglob("*.npy")) +all_numpy_dataset.sort() + +# 划分后的数据集保存路径 +# dataset_save_path = Path(r"/home/marques/code/marques/apnea/dataset/dataset/dataset0623_300_30_30/") +dataset_save_path = Path(r"./dataset/") +dataset_save_path.mkdir(exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) +fh = logging.FileHandler(dataset_save_path / (realtime + ".log"), mode='w') +fh.setLevel(logging.NOTSET) +# fh.setFormatter(logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(ch) + +# all_label = [] +# 输出统计数据标题栏 +# logger.info("sampNo".center(8) + +# "hpy_num".center(8) + "hpy_time".center(10) + +# "csa_num".center(8) + "csa_time".center(10) + +# "osa_num".center(8) + "osa_time".center(10) + +# "msa_num".center(8) + "msa_time".center(10) +# ) + +logger.info("sampNo".center(8) + ',' + + "train_num".center(10) + ',' + "train_P".center(10) + ',' + "train_N".center(10) + ',' + + "valid_num".center(10) + ',' + "valid_P".center(10) + ',' + "valid_N".center(10) + ',' + + "test_num".center(10) + ',' + "test_P".center(10) + ',' + "test_N".center(10) + ',' + + "train_eve".center(10) + ',' + "valid_eve".center(10) + ',' + "test_eve".center(10) + ) + +base_random_seed = 42 + +window_second = 30 +step_second = 1 + +valid_ratio = 0.1 +test_ratio = 0.1 + +normal_event_quality_label = 0 +# valid_ratio = 5000 +# test_ratio = 10000 + +assert ((valid_ratio + test_ratio) < 1 and 0 < valid_ratio < 1 and 0 < test_ratio < 1) or ( + valid_ratio > 1 and valid_ratio > 1), "验证集与测试集输入应同时为比例或数量" + +# dataset sampNo for test +with open("./settings.yaml") as f: + hyp = yaml.load(f, 
Loader=yaml.SafeLoader) +select_dataset = hyp["select_sampno"] + +# 需要置成0的片段,前面不一定补零,还有可能上万 +disable_segment = { + '221': [[0, 10000]], + '670': [[0, 20000]], + '683': [[0, 20000]], + '704': [[0, 26000]], + '726': [[0, 20000]], + '736': [[0, 47000]], + '933': [[0, 773560]], + '935': [[0, 26600]], + '952': [[0, 17000]], + '955': [[0, 78000]], + '961': [[0, 107000]], + '962': [[0, 15100]], + '966': [[0, 13120]], + '967': [[0, 44000]], + '1006': [[0, 60000]], + '1009': [[0, 1000]], + '1010': [[0, 49000]], + '1296': [[0, 27000]], + '1300': [[0, 33800]], + '1301': [[0, 14000]], + '1302': [[0, 5600]], + '1374': [[0, 1000]], + '1478': [[0, 998000]], + +} + + +# 生成数据集主函数 +def generate_label(No, dataset_path): + """ + :param dataset_path: 数据集路径 + :return: + """ + + # 获取数据编号 + sampNo = dataset_path.stem.split("samp")[0] + # 标签路径 + label_path = bcg_label_path / f"export{sampNo}_all.csv" + + if not label_path.exists(): + raise FileNotFoundError(f"{label_path} not exist") + + if not dataset_path.exists(): + raise Exception(f"{dataset_path} not exists") + + # 加载数据集 + select_numpy = np.load(dataset_path) + select_numpy_len = len(select_numpy) + + # 开头不合理片段置零 + if sampNo in disable_segment.keys(): + for sp, ep in disable_segment[sampNo]: + select_numpy[sp:ep] = 0 + + # 剔除质量差信号 + if sampNo == "670": + select_numpy = select_numpy[:17195 * 100] + + # 获取前面补了多少0 + not_zero_point = 0 + for num in select_numpy: + if num > 10: + break + not_zero_point += 1 + not_zero_point //= 100 + + # 读取标签 + label_csv = pd.read_csv(label_path, encoding='gbk') + label_csv["new_start"] = label_csv["new_start"].astype("int") + label_csv["new_end"] = label_csv["new_end"].astype("int") + label_csv["Duration"] = label_csv["Duration"].astype("int") + label_csv["new_label"] = label_csv["new_label"].fillna("2") + label_csv["new_label"] = label_csv["new_label"].astype("int") + # 剔除质量不好的样本 + # drop_csv = label_csv[ + # (label_csv["Event type"].isin(["Central apnea", "Obstructive apnea"])) & (label_csv["new_label"] == 2)] + # label_csv = label_csv.drop(drop_csv.index) + + # 事件片段与背景片段, 每个背景长度均为设定窗长 + segment_labels = [] + negative_labels = [] + hpy_num = csa_num = osa_num = msa_num = 0 + hpy_time = csa_time = osa_time = msa_time = 0 + + # 遍历全部事件并统计 + for i in range(len(label_csv)): + # 进行LabelEncoder + label = label_csv.iloc[i, :] + # 如果事件在补零片段,则不添加到事件列表 + if label["new_end"] < not_zero_point: + continue + + if sampNo == "670" and label["new_start"] > 17195: + continue + + if label["new_end"] - label["new_start"] < 10: + continue + + # 将事件添加到事件列表 + if label["Event type"] == "Hypopnea": + label_type = 1 + hpy_num += 1 + hpy_time += label["new_end"] - label["new_start"] + + # 将低通气添加到背景 好像不用专门加入到负样本事件中? 
+ # negative_labels.append( + # [sampNo, i, label_type, normal_event_quality_label, label["new_start"], label["new_end"]]) + continue + elif label["Event type"] == "Central apnea": + label_type = 2 + csa_num += 1 + csa_time += label["new_end"] - label["new_start"] + elif label["Event type"] == "Obstructive apnea": + label_type = 3 + osa_num += 1 + osa_time += label["new_end"] - label["new_start"] + # MSA 认为是OSA + elif label["Event type"] == "Mixed apnea": + label_type = 3 + msa_num += 1 + msa_time += label["new_end"] - label["new_start"] + else: + continue + # label_type = 0 + if label["new_end"] - label["new_start"] > label["Duration"] + 20: + print(sampNo, label) + + # 格式为 样本编号 第几个事件 标签 开始事件 结束事件 + segment_labels.append([sampNo, i, label_type, label["new_label"], label["new_start"], label["new_end"]]) + + # logger.info(sampNo.center(8) + + # str(hpy_num).center(8) + str(hpy_time).center(10) + + # str(csa_num).center(8) + str(csa_time).center(10) + + # str(osa_num).center(8) + str(osa_time).center(10) + + # str(msa_num).center(8) + str(msa_time).center(10)) + + # 设置随机树种子 + random_seed = base_random_seed + int(sampNo) + + # 对于无事件的样本,直接将所有的片段 + if len(segment_labels) == 0: + # 剔除补零片段(把开始点移动到补零结束) + normal_SP = not_zero_point + # 开头至少满足一个窗长 + if normal_SP < window_second: + normal_SP = window_second + + # 结束点为样本总长 除以 采样率 + normal_EP = select_numpy_len // 100 + label_type = 0 + # 正常时间编号为秒数 除以 30,即epoch + negative_labels += [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, + SP1 + window_second] for SP1 in + range(normal_SP - window_second + step_second, normal_EP - window_second + step_second, + window_second)] + + # 对于有事件的样本 + # 遍历事件,获取事件之间的背景片段 + for index in range(len(segment_labels) + 1): + # 前一个事件的结尾 与 下一事件开头即为背景 + # 对于开头的无事件片段,则设定开始事件为0 + if index == 0: + normal_SP = 0 + # 非开头片段,开始点为上一个事件的结尾 + else: + # 加一秒 作为 缓冲 + normal_SP = segment_labels[index - 1][-1] + 1 + + # 最后一个事件则取到样本片段结尾 + if index == len(segment_labels): + normal_EP = select_numpy_len // 100 - window_second + # 否则结束事件取 本事件开头 + else: + # 减一秒 作为 缓冲 + normal_EP = segment_labels[index][-2] - 1 + + # 剔除包含开头补零的片段 + if normal_EP < not_zero_point: + continue + + # 剔除开头不足30s的正常片段 + if normal_SP < window_second: + continue + label_type = 0 + + # 将背景事件按照滑窗距离逐个加入到背景事件中 + temp_1 = [[sampNo, 10000 + normal_SP // 30, label_type, normal_event_quality_label, SP1, SP1 + window_second] + for SP1 in range(normal_SP - window_second, normal_EP - window_second, window_second)] + + negative_labels += temp_1 + + train_label, valid_label, test_label = [], [], [] + # assert (valid_ratio + test_ratio) < 1 <= len(segment_labels), f"{sampNo}训练集与测试集数量应小于总数据集数量{len(segment_labels)}" + + # 对于测试数据全部直接保存 + if int(sampNo) in select_dataset: + event_label = np.zeros(select_numpy_len // 100) + quality_label = np.zeros(select_numpy_len // 100) + # 用于存储事件标签 + for PN, segmentNo, label_type, new_label, SP, EP in segment_labels: + event_label[SP:EP] = label_type + + test_label = [] + # 剔除补零片段 + normal_SP = not_zero_point + if normal_SP < window_second: + normal_SP = window_second + normal_EP = select_numpy_len // 100 + + # 分成指定窗长的滑窗片段 + test_label += [ + [sampNo, SP1 // 30, int(event_label[SP1 + window_second - step_second]), + int(quality_label[SP1 + window_second - step_second]), + SP1, SP1 + window_second] for SP1 in range(normal_SP - window_second + step_second, + normal_EP - window_second + step_second, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + 
',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center(10) + + ',' + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + str(len(segment_labels)).center(10) + ) + + df2.loc[No] = [sampNo, + str(0), str(0), str(0), + str(0), str(0), str(0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(0), str(0), str(len(segment_labels))] + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_all_{window_second}s_sa_test2_label.npy", + # np.array(test_label)) + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_all_label.csv", + index=False) + + train_label, valid_label, test_label = [], [], [] + # 对于训练与验证集样本 + if True: + # 打乱片段顺序 + np.random.seed(random_seed) + np.random.shuffle(segment_labels) + np.random.shuffle(negative_labels) + + # 获取训练集、验证集、测试集分到事件个数 + if 0 < valid_ratio < 1: + train_segment_num = int(len(segment_labels) * (1 - valid_ratio - test_ratio)) + valid_segment_num = int(len(segment_labels) * (1 - test_ratio)) + else: + train_segment_num = len(segment_labels) - valid_ratio - test_ratio + valid_segment_num = valid_ratio + + # 分别将各事件切分为30s, 步进1秒的片段 + for index in range(train_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + train_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(train_segment_num, valid_segment_num): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + valid_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second, step_second)] + + for index in range(valid_segment_num, len(segment_labels)): + PN, segmentNo, label_type, new_label, SP, EP = segment_labels[index] + test_label += [[PN, segmentNo, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(SP - window_second, EP - window_second + step_second * step_second)] + + # 计算片段和事件个数 + train_num, valid_num, test_num = len(train_label), len(valid_label), len(test_label) + train_eve, valid_eve, test_eve = train_segment_num, (valid_segment_num - train_segment_num), ( + len(segment_labels) - valid_segment_num) + + # 数据集补偿 + # if train_num < 300: + # train_num = 300 - train_num + # + # if valid_num < 300: + # valid_num = 300 - valid_num + # + # if test_num < 300: + # test_num = 300 - test_num + + # 获取训练集、验证集、测试集分到背景个数 + # if 0 < valid_ratio < 1: + # train_eve2 = int(len(negative_labels) * (1 - valid_ratio - test_ratio)) + # valid_eve2 = int(len(negative_labels) * valid_ratio) + # else: + # train_eve2 = len(negative_labels) - valid_ratio - test_ratio + # valid_eve2 = valid_ratio + # + # test_eve2 = len(negative_labels) - train_eve2 - valid_eve2 + # 直接补充到足够个数的背景事件 + train_eve2 = max(train_eve, 300) + valid_eve2 = max(valid_eve, 40) + test_eve2 = max(test_eve, 40) + + # 强制背景数量 + # train_eve2 = train_eve + # valid_eve2 = valid_eve + # test_eve2 = test_eve + + # 添加背景事件数量 + for sampNo, 
index, label_type, new_label, normal_SP, normal_EP in negative_labels[:train_eve2]: + label_type = int(label_type) + train_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2: train_eve2 + valid_eve2]: + label_type = int(label_type) + valid_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 + in + range(normal_SP, normal_EP, step_second)] + + for sampNo, index, label_type, new_label, normal_SP, normal_EP in negative_labels[ + train_eve2 + valid_eve2:train_eve2 + valid_eve2 + test_eve2]: + label_type = int(label_type) + test_label += [[sampNo, 10000 + normal_SP // 30, label_type, new_label, SP1, SP1 + window_second] for SP1 in + range(normal_SP, normal_EP, step_second)] + + logger.info(sampNo.center(8) + ',' + + str(len(train_label)).center(10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(len(valid_label)).center(10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(train_eve).center(10) + ',' + str(valid_eve).center(10) + ',' + str(test_eve).center(10) + ) + + df2.loc[No] = [sampNo.center(8), + str(len(train_label)), + str(sum(np.array(train_label)[:, 2].astype(int) > 1) if len(train_label) != 0 else 0), + str(sum(np.array(train_label)[:, 2].astype(int) < 1) if len(train_label) != 0 else 0), + str(len(valid_label)), + str(sum(np.array(valid_label)[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0), + str(sum(np.array(valid_label)[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0), + str(len(test_label)), + str(sum(np.array(test_label)[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label)[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(train_eve), str(valid_eve), str(test_eve).center(10)] + + def label_check(label_list): + temp_list = [] + for sampNo, index, label_type, new_label, SP, EP in label_list: + if EP - SP < window_second: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if SP < 0: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, SP, EP, len(select_numpy[SP * 100:EP * 100])) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + for j in temp_list: + label_list.remove(j) + + label_check(train_label) + label_check(valid_label) + label_check(test_label) + for sampNo, index, label_type, new_label, SP, EP in train_label: + if EP - SP < window_second: + print(sampNo, index, label_type, new_label, SP, EP) + + if SP < 0: + print(sampNo, index, label_type, new_label, SP, EP) + + if len(select_numpy[SP * 100:EP * 100]) != 
window_second * 100: + print(sampNo, index, label_type, new_label, SP, EP, len(select_numpy[SP * 100:EP * 100])) + + df1 = pd.DataFrame(data=train_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.csv", + index=False) + + df1 = pd.DataFrame(data=valid_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.csv", + index=False) + + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.csv", index=False) + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.npy", + # np.array(train_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.npy", + # np.array(valid_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.npy", + # np.array(test_label)) + + +if __name__ == '__main__': + # pool = multiprocessing.Pool(processes=44) + # pool.map(generate_label, list(all_numpy_dataset)) + # pool.close() + # pool.join() + + df2 = pd.DataFrame(data=None, + columns=["sampNo", + "train_num", "train_P", "train_N", + "valid_num", "valid_P", "valid_N", + "test_num", "test_P", "test_N", + "train_eve", "valid_eve", "test_eve"]) + + temp = [] + for one_dataset in all_numpy_dataset: + if int(one_dataset.stem.split("samp")[0]) in [*select_dataset]: + temp.append(one_dataset) + # for one_dataset in temp: + # all_numpy_dataset.remove(one_dataset) + + for No, one_dataset in enumerate(temp): + generate_label(No, one_dataset) + + df2.to_csv(dataset_save_path / (realtime + ".csv"), index=False) + # generate_label(all_numpy_dataset[0]) diff --git a/exam/031/generate_label_A01.py b/exam/031/generate_label_A01.py new file mode 100644 index 0000000..f19b674 --- /dev/null +++ b/exam/031/generate_label_A01.py @@ -0,0 +1,486 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:generate_label_11.0.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/05 +""" +# A01 +# 以每三十秒为窗 步进十秒, 每个窗最后十秒内 超过五秒存在暂停则为正样本 + +# 14.0 +# 手动均衡数量 + +# 13.0 +# 限制选择部分数据集,先做测试 + +# 12.0 +# 置不可用事件的片段为上限,不可用片段设置为背景,不记录事件 + +# 10.0 +# 使用提出质量差的信号 + +# 9.0 +# 增加 最新的质量标签 未使用 + +# 8.0 +# 生成 除低通气所有事件标签 + +# 尝试过步进两秒 会造成不足两秒的数据被抛弃,造成较多误判,但是可以考虑囊括这部分 +# 采用 30秒数据 移动 1秒 将所有呼吸暂停标注为1 低通气为0 正常为0 + +# 预处理操作 为 50Hz陷波滤波器去工频 外加 20Hz的低通滤波器 这个20Hz要看BCG信号的频谱范围 + +# 先提剔除极端值 +# 数值大于最高基准线或最低基准线 +# type1: average:1800 low:1200 high:2400 +# type2: average:2400 low:1800 high:3000 + +# 过多片段会造成平均值偏移 +# TODO +# 加入体动标签,计算除体动外的平均值 + +# 最后降采为100hz + +import time +import logging +import numpy as np +import pandas as pd +from pathlib import Path + +from datetime import datetime + +import yaml +from pathos import multiprocessing +from tqdm import tqdm + +# 数据集 和 标签 位置 +bcg_numpy_data_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/") +bcg_label_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_label_0616/") + +# BCG 记录开始时间 +bcg_start_time = np.loadtxt(Path(r"/home/marques/code/marques/apnea/dataset/start_time.csv"), delimiter=', ', + dtype=object) +bcg_start_time = dict(zip(bcg_start_time[:, 0], bcg_start_time[:, 1])) + +# 读取每个数据集路径 +all_numpy_dataset = 
list(bcg_numpy_data_path.rglob("*.npy")) +all_numpy_dataset.sort() + +# 划分后的数据集保存路径 +# dataset_save_path = Path(r"/home/marques/code/marques/apnea/dataset/dataset/dataset0623_300_30_30/") +dataset_save_path = Path(r"./dataset/") +dataset_save_path.mkdir(exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) +fh = logging.FileHandler(dataset_save_path / (realtime + ".log"), mode='w') +fh.setLevel(logging.NOTSET) +# fh.setFormatter(logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(ch) + +# all_label = [] +# 输出统计数据标题栏 +# logger.info("sampNo".center(8) + +# "hpy_num".center(8) + "hpy_time".center(10) + +# "csa_num".center(8) + "csa_time".center(10) + +# "osa_num".center(8) + "osa_time".center(10) + +# "msa_num".center(8) + "msa_time".center(10) +# ) + +logger.info("sampNo".center(8) + ',' + + "train_num".center(10) + ',' + "train_P".center(10) + ',' + "train_N".center(10) + ',' + + "valid_num".center(10) + ',' + "valid_P".center(10) + ',' + "valid_N".center(10) + ',' + + "test_num".center(10) + ',' + "test_P".center(10) + ',' + "test_N".center(10) + ',' + + "train_eve".center(10) + ',' + "valid_eve".center(10) + ',' + "test_eve".center(10) + ) + +base_random_seed = 42 + +window_second = 30 +step_second = 10 +view_apnea_length = 10 +apnea_length_threshold = 5 +valid_ratio = 0.1 +test_ratio = 0.1 + +normal_event_quality_label = 0 +# valid_ratio = 5000 +# test_ratio = 10000 + +assert ((valid_ratio + test_ratio) < 1 and 0 < valid_ratio < 1 and 0 < test_ratio < 1) or ( + valid_ratio > 1 and valid_ratio > 1), "验证集与测试集输入应同时为比例或数量" + +# dataset sampNo for test +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) +select_dataset = hyp["select_sampno"] + +# 需要置成0的片段,前面不一定补零,还有可能上万 +disable_segment = { + '221': [[0, 10000]], + '670': [[0, 20000]], + '683': [[0, 20000]], + '704': [[0, 26000]], + '726': [[0, 20000]], + '736': [[0, 47000]], + '933': [[0, 773560]], + '935': [[0, 26600]], + '952': [[0, 17000]], + '955': [[0, 78000]], + '961': [[0, 107000]], + '962': [[0, 15100]], + '966': [[0, 13120]], + '967': [[0, 44000]], + '1006': [[0, 60000]], + '1009': [[0, 1000]], + '1010': [[0, 49000]], + '1296': [[0, 27000]], + '1300': [[0, 33800]], + '1301': [[0, 14000]], + '1302': [[0, 5600]], + '1374': [[0, 1000]], + '1478': [[0, 998000]], + +} + + +# 生成数据集主函数 +def generate_label(No, dataset_path): + """ + :param dataset_path: 数据集路径 + :return: + """ + + # 获取数据编号 + sampNo = dataset_path.stem.split("samp")[0] + # 标签路径 + label_path = bcg_label_path / f"export{sampNo}_all.csv" + + if not label_path.exists(): + raise FileNotFoundError(f"{label_path} not exist") + + if not dataset_path.exists(): + raise Exception(f"{dataset_path} not exists") + + # 加载数据集 + select_numpy = np.load(dataset_path) + + + # 开头不合理片段置零 + if sampNo in disable_segment.keys(): + for sp, ep in disable_segment[sampNo]: + select_numpy[sp:ep] = 0 + + # 剔除质量差信号 + if sampNo == "670": + select_numpy = select_numpy[:17195 * 100] + + # 获取前面补了多少0 + not_zero_point = 0 + for num in select_numpy: + if num > 10: + break + not_zero_point += 1 + not_zero_point //= 100 + + # 读取标签 + label_csv = pd.read_csv(label_path, encoding='gbk') + label_csv["new_label"] = 
label_csv["new_label"].fillna("2") + + label_csv["new_start"] = label_csv["new_start"].astype("int") + label_csv["new_end"] = label_csv["new_end"].astype("int") + label_csv["Duration"] = label_csv["Duration"].astype("int") + label_csv["new_label"] = label_csv["new_label"].astype("int") + # 剔除质量不好的样本 + # drop_csv = label_csv[ + # (label_csv["Event type"].isin(["Central apnea", "Obstructive apnea"])) & (label_csv["new_label"] == 2)] + # label_csv = label_csv.drop(drop_csv.index) + + # 事件片段与背景片段, 每个背景长度均为设定窗长 + segment_labels = [] + negative_labels = [] + hpy_num = csa_num = osa_num = msa_num = 0 + hpy_time = csa_time = osa_time = msa_time = 0 + select_numpy_len = len(select_numpy) // 100 + event_list = np.zeros(select_numpy_len) + quality_list = np.zeros(select_numpy_len) + + # 遍历全部事件并统计 + for i in range(len(label_csv)): + # 进行LabelEncoder + label = label_csv.iloc[i, :] + # 如果事件在补零片段,则不添加到事件列表 + if label["new_end"] < not_zero_point: + continue + + if sampNo == "670" and label["new_start"] > 17195: + continue + + if label["new_end"] - label["new_start"] < 10: + print(label.to_numpy()) + continue + + # 将事件添加到事件列表 + if label["Event type"] == "Hypopnea": + label_type = 1 + hpy_num += 1 + hpy_time += label["new_end"] - label["new_start"] + + # 将低通气添加到背景 好像不用专门加入到负样本事件中? + # negative_labels.append( + # [sampNo, i, label_type, normal_event_quality_label, label["new_start"], label["new_end"]]) + continue + elif label["Event type"] == "Central apnea": + label_type = 2 + csa_num += 1 + csa_time += label["new_end"] - label["new_start"] + elif label["Event type"] == "Obstructive apnea": + label_type = 3 + osa_num += 1 + osa_time += label["new_end"] - label["new_start"] + # MSA 认为是OSA + elif label["Event type"] == "Mixed apnea": + label_type = 3 + msa_num += 1 + msa_time += label["new_end"] - label["new_start"] + else: + continue + # label_type = 0 + if label["new_end"] - label["new_start"] > label["Duration"] + 20: + print(sampNo, label.to_numpy()) + + # 格式为 样本编号 第几个事件 标签 开始事件 结束事件 + event_list[label["new_start"]: label["new_end"]] = label_type + quality_list[label["new_start"]: label["new_end"]] = label["new_label"] + # segment_labels.append([sampNo, i, label_type, label["new_label"], label["new_start"], label["new_end"]]) + + # logger.info(sampNo.center(8) + + # str(hpy_num).center(8) + str(hpy_time).center(10) + + # str(csa_num).center(8) + str(csa_time).center(10) + + # str(osa_num).center(8) + str(osa_time).center(10) + + # str(msa_num).center(8) + str(msa_time).center(10)) + + # 设置随机树种子 + random_seed = base_random_seed + int(sampNo) + + for SP in range(not_zero_point, select_numpy_len - window_second, step_second): + start = SP + window_second - view_apnea_length + end = SP + window_second + i = SP // 30 + if sum((select_numpy[start:end] > 1)) >= apnea_length_threshold: + segment_labels.append( + [sampNo, i, event_list[start:end].max(), quality_list[start:end].max(), SP, SP + window_second]) + else: + negative_labels.append( + [sampNo, i, event_list[start:end].max(), quality_list[start:end].max(), SP, SP + window_second] + ) + + # 对于测试数据全部直接保存 + if int(sampNo) in select_dataset: + test_label = [] + # 分成指定窗长的滑窗片段 + test_label = segment_labels + negative_labels + + logger.info(sampNo.center(8) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) > 1) if len(test_label) != 0 else 
0).center(10) + + ',' + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + str(len(segment_labels)).center(10) + ) + + df2.loc[No] = [sampNo, + str(0), str(0), str(0), + str(0), str(0), str(0), + str(len(test_label)), + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(0), str(0), str(len(segment_labels))] + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_all_{window_second}s_sa_test2_label.npy", + # np.array(test_label)) + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_all_label.csv", + index=False) + + train_label, valid_label, test_label = [], [], [] + # 对于训练与验证集样本 + if True: + # 打乱片段顺序 + np.random.seed(random_seed) + np.random.shuffle(segment_labels) + np.random.shuffle(negative_labels) + + # 获取训练集、验证集、测试集分到事件个数 + if 0 < valid_ratio < 1: + train_segment_num = int(len(segment_labels) * (1 - valid_ratio - test_ratio)) + valid_segment_num = int(len(segment_labels) * (1 - test_ratio)) + else: + train_segment_num = len(segment_labels) - valid_ratio - test_ratio + valid_segment_num = valid_ratio + + train_label = segment_labels[:train_segment_num] + valid_label = segment_labels[train_segment_num:train_segment_num + valid_segment_num] + test_label = segment_labels[train_segment_num + valid_segment_num:] + + # 计算片段和事件个数 + train_num, valid_num, test_num = len(train_label), len(valid_label), len(test_label) + train_eve, valid_eve, test_eve = train_segment_num, (valid_segment_num - train_segment_num), ( + len(segment_labels) - valid_segment_num) + + # 数据集补偿 + # if train_num < 300: + # train_num = 300 - train_num + # + # if valid_num < 300: + # valid_num = 300 - valid_num + # + # if test_num < 300: + # test_num = 300 - test_num + + # 获取训练集、验证集、测试集分到背景个数 + if 0 < valid_ratio < 1: + train_eve2 = int(len(negative_labels) * (1 - valid_ratio - test_ratio)) + valid_eve2 = int(len(negative_labels) * valid_ratio) + else: + train_eve2 = len(negative_labels) - valid_ratio - test_ratio + valid_eve2 = valid_ratio + + test_eve2 = len(negative_labels) - train_eve2 - valid_eve2 + # # 直接补充到足够个数的背景事件 + # train_eve2 = max(train_eve, 300) + # valid_eve2 = max(valid_eve, 40) + # test_eve2 = max(test_eve, 40) + + # 强制背景数量 + # train_eve2 = train_eve + # valid_eve2 = valid_eve + # test_eve2 = test_eve + + # 添加背景事件数量 + train_label += negative_labels[:train_eve2] + valid_label += negative_labels[train_eve2: train_eve2 + valid_eve2] + test_label += negative_labels[train_eve2 + valid_eve2:] + + logger.info(sampNo.center(8) + ',' + + str(len(train_label)).center(10) + ',' + + str(sum(np.array(train_label).astype('float64')[:, 2].astype(int) > 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(train_label).astype('float64')[:, 2].astype(int) < 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(len(valid_label)).center(10) + ',' + + str(sum(np.array(valid_label).astype('float64')[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(valid_label).astype('float64')[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(len(test_label)).center(10) + ',' + + 
str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(train_eve).center(10) + ',' + str(valid_eve).center(10) + ',' + str(test_eve).center(10) + ) + + df2.loc[No] = [sampNo.center(8), + str(len(train_label)), + str(sum(np.array(train_label).astype('float64')[:, 2].astype(int) > 1) if len(train_label) != 0 else 0), + str(sum(np.array(train_label).astype('float64')[:, 2].astype(int) < 1) if len(train_label) != 0 else 0), + str(len(valid_label)), + str(sum(np.array(valid_label).astype('float64')[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0), + str(sum(np.array(valid_label).astype('float64')[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0), + str(len(test_label)), + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(train_eve), str(valid_eve), str(test_eve).center(10)] + + def label_check(label_list): + temp_list = [] + for sampNo, index, label_type, new_label, SP, EP in label_list: + if EP - SP < window_second: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if SP < 0: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, SP, EP, len(select_numpy[SP * 100:EP * 100])) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + for j in temp_list: + label_list.remove(j) + + label_check(train_label) + label_check(valid_label) + label_check(test_label) + for sampNo, index, label_type, new_label, SP, EP in train_label: + if EP - SP < window_second: + print(sampNo, index, label_type, new_label, SP, EP) + + if SP < 0: + print(sampNo, index, label_type, new_label, SP, EP) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, new_label, SP, EP, len(select_numpy[SP * 100:EP * 100])) + + df1 = pd.DataFrame(data=train_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.csv", + index=False) + + df1 = pd.DataFrame(data=valid_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.csv", + index=False) + + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.csv", index=False) + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.npy", + # np.array(train_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.npy", + # np.array(valid_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.npy", + # np.array(test_label)) + + +if __name__ == '__main__': + # pool = multiprocessing.Pool(processes=44) + # pool.map(generate_label, list(all_numpy_dataset)) + # pool.close() + # pool.join() + + df2 = pd.DataFrame(data=None, + columns=["sampNo", 
+ "train_num", "train_P", "train_N", + "valid_num", "valid_P", "valid_N", + "test_num", "test_P", "test_N", + "train_eve", "valid_eve", "test_eve"]) + + temp = [] + for one_dataset in all_numpy_dataset: + if int(one_dataset.stem.split("samp")[0]) in [*select_dataset]: + temp.append(one_dataset) + # for one_dataset in temp: + # all_numpy_dataset.remove(one_dataset) + + for No, one_dataset in enumerate(temp): + generate_label(No, one_dataset) + + df2.to_csv(dataset_save_path / (realtime + ".csv"), index=False) + # generate_label(all_numpy_dataset[0]) diff --git a/exam/031/load_dataset.py b/exam/031/load_dataset.py new file mode 100644 index 0000000..25e7603 --- /dev/null +++ b/exam/031/load_dataset.py @@ -0,0 +1,156 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:load_dataset.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2021/12/03 +""" +import sys +from pathlib import Path +import pandas as pd +import numpy as np +import torch.utils.data +from torch.utils.data import Dataset +from tqdm import tqdm +from utils.Preprocessing import BCG_Operation + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 +""" +1. 读取方法 +# 无论是否提前切分,均提前转成npy格式 +# 1.1 提前预处理,切分好后生成npy,直接载入切分好的片段 内存占用多 读取简单 +使用此方法: 1.2 提前预处理,载入整夜数据,切分好后生成csv或xls,根据片段读取 内存占用少 读取较为复杂 +""" + +datasets = {} + + +# 减少重复读取 +def read_dataset(data_path, augment=None): + data_path = Path(data_path) + try: + f = [] + if data_path.is_dir(): + dataset_list = list(data_path.rglob("*.npy")) + dataset_list.sort() + f += dataset_list + elif data_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{data_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {data_path}: {e} \n') + + print("loading dataset") + for i in tqdm(f): + select_dataset = np.load(i) + select_dataset = preprocessing.Butterworth(select_dataset, "lowpass", low_cut=20, order=3) + if augment is not None: + select_dataset = augment(select_dataset) + datasets[i.name.split("samp")[0]] = select_dataset + + +# 用第二种方法读取 +class ApneaDataset(Dataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + self.data_path = data_path + self.label_path = label_path + self.segment_augment = segment_augment + self.labels = None + self.dataset_type = dataset_type + self.select_sampNo = select_sampno + + # self._getAllData() + self._getAllLabels() + + def __getitem__(self, index): + # PN patience number + # SP/EP start point, end point + # temp_label.append([sampNo, label[-1], i, hpy_num, csa_num, osa_num, mean_low, flow_low]) + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + segment = datasets[str(PN)][SP*10:EP*10].copy() + segment = segment.reshape(-1, 5) + return segment, int(float(label_type) > 1) + else: + raise Exception(f'dataset read failure!') + + def count_SA(self): + return sum(self.labels[:, 3] > 1) + + def __len__(self): + return len(self.labels) + + def _getAllLabels(self): + label_path = Path(self.label_path) + if not label_path.exists(): + raise Exception(f'{self.label_path} does not exist') + + try: + f = [] + if label_path.is_dir(): + if self.dataset_type == "train": + label_list = list(label_path.rglob("*_train_label.csv")) + elif self.dataset_type == "valid": + label_list = list(label_path.rglob("*_valid_label.csv")) + elif self.dataset_type == "test": 
+ label_list = list(label_path.glob("*_sa_test_label.csv")) + # label_list = list(label_path.rglob("*_test_label.npy")) + elif self.dataset_type == "all_test": + label_list = list(label_path.rglob("*_sa_all_label.csv")) + else: + raise ValueError("self.dataset type error") + # label_list = list(label_path.rglob("*_label.npy")) + label_list.sort() + f += label_list + elif label_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{self.label_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {self.label_path}: {e} \n') + print("loading labels") + for i in tqdm(f): + if int(i.name.split("_")[0]) not in self.select_sampNo: + continue + + if self.labels is None: + self.labels = pd.read_csv(i).to_numpy(dtype=int) + else: + labels = pd.read_csv(i).to_numpy(dtype=int) + if len(labels) > 0: + self.labels = np.concatenate((self.labels, labels)) + # self.labels = self.labels[:10000] + print(f"{self.dataset_type} length is {len(self.labels)}") + + + +class TestApneaDataset2(ApneaDataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + super(TestApneaDataset2, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + segment_augment=segment_augment, + select_sampno=select_sampno + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + segment = datasets[str(PN)][SP*10:EP*10].copy() + segment = segment.reshape(-1, 5) + return segment, int(float(label_type) > 1), PN, segmentNo, label_type, new_label, SP, EP + else: + raise Exception(f'dataset read failure!') + + +if __name__ == '__main__': + pass diff --git a/exam/031/main.py b/exam/031/main.py new file mode 100644 index 0000000..b266b5d --- /dev/null +++ b/exam/031/main.py @@ -0,0 +1,289 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@time:2021/10/15 +""" +import os + +import yaml +import logging +from pathlib import Path + +import time +from torch.nn import functional as F +from torch.utils.data import DataLoader +import torch.cuda +from tqdm import tqdm +from torchinfo import summary +from load_dataset import ApneaDataset, read_dataset + +from torch import nn +from utils.calc_metrics import CALC_METRICS +from sklearn.model_selection import KFold +from model.Hybrid_Net010 import HYBRIDNET010 +# from utils.LossFunction import Foca1lLoss +from my_augment import my_augment, my_segment_augment + +# 加载配置 +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) + +os.environ["CUDA_VISIBLE_DEVICES"] = hyp["GPU"] +os.environ["WANDB_MODE"] = "dryrun" + +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) + +# 读取地址参数 +data_path = hyp["Path"]["dataset"] +label_path = hyp["Path"]["label"] + +save_dir = Path(hyp["Path"]["save"]) / (Path(hyp["Path"]["save"]).name + "_" + realtime) +save_dir.mkdir(parents=True, exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +fh = logging.FileHandler(save_dir / (realtime + ".log"), mode='a') +fh.setLevel(logging.NOTSET) +fh.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) 
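+# Shape contract between the data pipeline and the model, as far as it can be inferred
+# from this patch: read_dataset() caches each whole-night recording after my_augment(),
+# which low-pass filters the BCG and keeps every 10th sample (a 10 Hz respiration-band
+# signal). ApneaDataset.__getitem__ then slices [SP*10:EP*10] -- a 30 s window, i.e.
+# 300 samples -- and reshapes it to (60, 5), matching HYBRIDNET010's LSTM (input_size=5,
+# 60 time steps) and the summary(model_net, [(32, 60, 5)]) call below.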
+logging.getLogger('matplotlib.font_manager').disabled = True +logger.info("------------------------------------") +logger.info('hyper_parameters: ' + ', '.join(f'{k}={v}\n' for k, v in hyp.items())) + +# 备份配置 +with open(save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + +# Hyper-parameters +gpu = torch.cuda.is_available() +epochs = hyp["epoch"] +lr = hyp["lr"] +nc = hyp["nc"] +bs = hyp["batch_size"] +worker = hyp["number_worker"] +select_sampno = hyp["select_sampno"] + +read_dataset(data_path, augment=my_augment) +calc_metrics = CALC_METRICS(nc) + + +# 开始训练 +# 训练 +def model_train(model, train_loader, optimizer, scheduler, loss_func, training_state): + model.train() + train_loss = 0.0 + optimizer.zero_grad() + + pbar = tqdm(enumerate(train_loader), total=len(train_loader), ncols=80) + pbar.set_description(training_state) + for i, (resp, labels) in pbar: + resp = resp.float().cuda() if gpu else resp.float() + # stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + # 强行归一化数据 + # segments = F.normalize(segments) + # print(segments.size()) + # 减去平均值 + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # 一维卷积在最后一位上卷积 所以输入CNN应为【batch_size, embedding size, sequence size】 + # 所以输入为【batch_size, 1, 3000】 3000 = 30秒 * 100Hz + # segments = segments.view(len(segments), 1, -1) + + out = model(resp) + + loss = loss_func(out, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + # 余弦退火传入变量 + # scheduler.step(epoch + i / len(train_loader.dataset)) + # 自适应调整传入变量 + scheduler.step(loss) + + loss_value = loss.item() + train_loss += loss_value + # cur_lr = optimizer.param_groups[-1]['lr'] + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + # if i % 20 == 0: + # pbar.write(calc_metrics.get_matrix(loss=loss_value, cur_lr=cur_lr, epoch=epoch)) + + cur_lr = optimizer.param_groups[-1]['lr'] + train_loss /= len(train_loader) + + calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") + logger.info(training_state) + logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) + calc_metrics.reset() + + +def model_valid(model, valid_loader, wdir, loss_func): + model.eval() + valid_loss = 0.0 + for resp, labels in valid_loader: + resp = resp.float().cuda() if gpu else resp.float() + # stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + valid_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + valid_loss /= len(valid_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=valid_loss, epoch=epoch, epoch_type="valid")) + global best_f1 + valid_f1 = calc_metrics.metrics[-1].compute() + if valid_f1 > best_f1: + best_f1 = valid_f1 + torch.save(model.state_dict(), wdir / f"best_{epoch}_{str(round(float(valid_f1), 3))}.pt") + torch.save(model.state_dict(), 
wdir / f"best.pt") + if wandb is not None: + wandb.run.summary["best_f1"] = valid_f1 + calc_metrics.reset() + + +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + # stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + +if __name__ == '__main__': + + try: + import wandb + except ImportError: + wandb = None + prefix = 'wandb: ' + logger.info(f"{prefix}Install Weights & Biases logger with 'pip install wandb'") + + if wandb is not None and wandb.run is None: + wandb_run = wandb.init( + config=hyp, + name=save_dir.stem, + project=hyp["project"], + notes=hyp["Note"], + tags=hyp["tags"], + entity=hyp["entity"], + ) + exam_name = Path("./").absolute().name + + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + summary(model_net, [(32, 60, 5)]) + + time.sleep(3) + if gpu: + model_net.cuda() + + k_folds = 5 + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) + + logger.info('--------------------------------') + for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): + logger.info(f'Start FOLD {fold} / {k_folds}----------------------') + train_set = [select_sampno[i] for i in train_ids] + test_set = [select_sampno[i] for i in test_ids] + logger.info(f'Train_Set:{train_set}') + logger.info(f'Independent_Test_Set:{test_set}') + + sub_save_dir = save_dir / f"KFold_{fold}" + sub_save_dir.mkdir(exist_ok=True, parents=True) + wdir = sub_save_dir / "weights" + wdir.mkdir(exist_ok=True, parents=True) + + hyp["train_set"] = train_set + hyp["test_set"] = test_set + with open(sub_save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + + train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) + valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) + + train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) + valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + + # 重新初始化模型 + del model_net + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + if gpu: + model_net.cuda() + + logger.info(f"Weight is {[train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]}") + # 损失函数与优化器 + loss_function = nn.CrossEntropyLoss( + weight=torch.Tensor([train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]).cuda()) + + # loss_func = nn.BCEWithLogitsLoss() + # loss_func = FocalLoss(class_num=nc, alpha=0.75, size_average="sum") + + 
# momentum + # nesterov 牛顿动量 + # weight_decay L2正则 + # optimizer = torch.optim.SGD(model_net.parameters(), lr=lr, momentum=0.9, nesterov=True, weight_decay=1e-6) + + optimizer = torch.optim.Adam(model_net.parameters(), lr=lr) + # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=int(hyp["T_max"]), + + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, + patience=2836, min_lr=1e-8, + verbose=True) + + # 参数记录 + best_f1 = 0 + + for epoch in range(epochs): + model_train(model_net, train_loader, optimizer, scheduler, loss_function, + f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") + model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) + if wandb is not None: + calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/031/model/Hybrid_Net003.py b/exam/031/model/Hybrid_Net003.py new file mode 100644 index 0000000..b28130d --- /dev/null +++ b/exam/031/model/Hybrid_Net003.py @@ -0,0 +1,101 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:andrew +@file:Hybrid_Net003.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/30 +""" + +import os + +import torch +from torch import nn +from torchinfo import summary +from torch import cat + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" + +# 修改激活函数 +# 提高呼吸采样率 + +# 输入时长 +WHOLE_SEGMENT_SECOND = 30 + +# 呼吸采样率 +RESPIRATORY_FRE = 10 + +# BCG 时频图大小 +BCG_GRAPH_SIZE = (26, 121) + + +class HYBRIDNET003(nn.Module): + def __init__(self, num_classes=2, init_weights=True): + super(HYBRIDNET003, self).__init__() + + self.lstm = nn.LSTM(input_size=26, + hidden_size=16, + num_layers=1, + bidirectional=True, + batch_first=True) + + self.right = nn.Sequential( + nn.Conv1d(in_channels=1, out_channels=8, kernel_size=20, + stride=2, padding=10), + nn.GELU(), + + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(8), + + nn.Conv1d(in_channels=8, out_channels=16, kernel_size=3, + stride=1, padding=1), + nn.GELU(), + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(16), + + nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, + stride=1, padding=1), + nn.GELU(), + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(32) + ) + + self.classifier = nn.Sequential( + # nn.Dropout(p=0.5), + nn.Linear(4480, 512), + nn.GELU(), + nn.Linear(512, num_classes), + ) + + if init_weights: + self.initialize_weights() + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') # 何教授方法 + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) # 正态分布赋值 + nn.init.constant_(m.bias, 0) + + def forward(self, x1, x2): + # x1 = x1.view(x1.shape[0], -1, 5) + x1 = self.right(x1) + x2, (_, _) = self.lstm(x2) + + # print(x1.shape) + # print(x2.shape) + x1 = torch.flatten(x1, start_dim=1) + x2 = torch.flatten(x2, start_dim=1) + x = torch.cat((x1, x2), dim=1) + # print(x.shape) + x = self.classifier(x) + return x + + +if __name__ == '__main__': + model = HYBRIDNET003().cuda() + summary(model, [(32, 1, 30 * RESPIRATORY_FRE), (32, 121, 26)]) diff --git a/exam/031/model/Hybrid_Net010.py b/exam/031/model/Hybrid_Net010.py new file mode 100644 index 0000000..0f79a7d --- /dev/null +++ b/exam/031/model/Hybrid_Net010.py @@ -0,0 +1,76 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" 
+@author:andrew +@file:Hybrid_Net011.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/10/14 +""" + +import os + +import torch +from torch import nn +from torchinfo import summary +from torch import cat + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" + +# 修改激活函数 +# 提高呼吸采样率 + +# 输入时长 +WHOLE_SEGMENT_SECOND = 30 + +# 呼吸采样率 +RESPIRATORY_FRE = 10 + +# BCG 时频图大小 +BCG_GRAPH_SIZE = (26, 121) + + +class HYBRIDNET010(nn.Module): + def __init__(self, num_classes=2, init_weights=True): + super(HYBRIDNET010, self).__init__() + + self.lstm = nn.LSTM(input_size=5, + hidden_size=32, + num_layers=1, + bidirectional=True, + batch_first=True) + + self.classifier = nn.Sequential( + # nn.Dropout(p=0.5), + nn.Linear(64, 8), + nn.GELU(), + nn.Linear(8, num_classes), + ) + + if init_weights: + self.initialize_weights() + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') # 何教授方法 + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) # 正态分布赋值 + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x, (_, _) = self.lstm(x) + # print(x.shape) + x = x[:, -1] + + x = torch.flatten(x, start_dim=1) + # print(x.shape) + x = self.classifier(x) + return x + + +if __name__ == '__main__': + model = HYBRIDNET010().cuda() + summary(model, [(32, 60, 5)]) diff --git a/exam/031/my_augment.py b/exam/031/my_augment.py new file mode 100644 index 0000000..6b4453e --- /dev/null +++ b/exam/031/my_augment.py @@ -0,0 +1,54 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:my_augment.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/07/26 +""" +from utils.Preprocessing import BCG_Operation +import numpy as np +from scipy.signal import stft + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 + + +def my_augment(dataset): + dataset -= dataset.mean() + dataset = preprocessing.Iirnotch(dataset) + dataset = preprocessing.Butterworth(dataset, "lowpass", low_cut=20, order=6) + dataset_low = preprocessing.Butterworth(dataset, "lowpass", low_cut=0.5, order=4) + # dataset_high = preprocessing.Butterworth(dataset, "highpass", high_cut=1, order=6) + + dataset = dataset_low[::10] + # "high": dataset_high} + return dataset + + +def get_stft(x, fs, n): + print(len(x)) + f, t, amp = stft(x, fs, nperseg=n) + z = np.abs(amp.copy()) + return f, t, z + + +def my_segment_augment(dataset, SP, EP): + dataset_low = dataset["low"][int(SP) * 100:int(EP) * 100].copy() + # dataset_high = dataset["high"][int(SP) * 100:int(EP) * 100].copy() + + dataset_low = dataset_low[::10] + dataset_low = dataset_low.reshape(-1, 5) + + # _, _, dataset_high = stft(dataset_high, 100, nperseg=50) + # dataset_high = dataset_high.astype(np.float).T + # dataset_high = dataset_high.reshape(dataset_high.shape[0], dataset_high.shape[1]) + + # return dataset_low, dataset_high + + return [dataset_low] + + +if __name__ == '__main__': + pass diff --git a/exam/031/settings.yaml b/exam/031/settings.yaml new file mode 100644 index 0000000..14ea7ae --- /dev/null +++ b/exam/031/settings.yaml @@ -0,0 +1,77 @@ +# environment config +GPU: "0" + + +# dataset config +Path: + dataset: /home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/ + label: ./dataset/ + save: ./output/ + +batch_size: 256 +number_worker: 0 +model_name: HYBRIDNET010 +select_sampno: + - 88 + - 220 + - 221 + - 229 + - 282 + - 286 + - 541 + - 579 
+ - 582 + - 670 + - 671 + - 683 + - 703 + - 704 + - 726 + - 735 + - 736 + - 933 + - 935 + - 952 + - 954 + - 955 + - 960 + - 961 + - 962 + - 966 + - 967 + - 969 + - 971 + - 972 + - 1000 + - 1004 + - 1006 + - 1009 + - 1010 + - 1296 + - 1300 + - 1301 + - 1302 + - 1308 + - 1354 + - 1374 + - 1378 + - 1478 + + +# train hyperparameters config +epoch: 300 +lr: 0.01 +nc: 1 + +# wandb config +entity: "marques" +project: "Sleep_Apnea_HYBRID00X" +Note: "HYBRID010 RESP " +tags: ["ReduceLROnPlateau", "RESP LSTM"] + + +# "CW":class_weight + + +# "CosineAnnealingLR" +# "ReduceLROnPlateau" \ No newline at end of file diff --git a/exam/031/test_save_result.py b/exam/031/test_save_result.py new file mode 100644 index 0000000..07a1d2f --- /dev/null +++ b/exam/031/test_save_result.py @@ -0,0 +1,478 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:test_analysis.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/21 +""" +import logging +import os +import sys + +import pandas as pd +import torch.cuda +import numpy as np +import yaml +from matplotlib import pyplot as plt +from tqdm import tqdm +from pathlib import Path +from torch.nn import functional as F +from torch.utils.data import DataLoader +from load_dataset import TestApneaDataset2, read_dataset +from utils.Draw_ConfusionMatrix import draw_confusionMatrix +from torch import nn +from utils.calc_metrics import CALC_METRICS +from my_augment import my_augment, my_segment_augment +from model.Hybrid_Net003 import HYBRIDNET003 + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +exam_path = Path("./output/") + +# 置信率阈值 +thresh = 0.5 +# 间隔最小距离 +thresh_event_interval = 2 +# 最小事件长度 +thresh_event_length = 8 + +# +event_thresh = 1 + +severity_path = Path(r"/home/marques/code/marques/apnea/dataset/loc_first_csa.xlsx") +severity_label = {"all": "none"} +severity_df = pd.read_excel(severity_path) +for one_data in severity_df.index: + one_data = severity_df.loc[one_data] + severity_label[str(one_data["数据编号"])] = one_data["程度"] + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" +gpu = torch.cuda.is_available() + +num_classes = 1 +calc_metrics = CALC_METRICS(num_classes) + +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps +data_path = hyp["Path"]["dataset"] +read_dataset(data_path, augment=my_augment) +del hyp + +# 默认取最新的文件夹 +all_output_path, output_path, segments_results_save_path, events_results_save_path, = [None, ] * 4 +my_augment, model_path, label_path, data_path, model, model_name = [None, ] * 6 +train_set, test_set = None, None +loss_func = nn.CrossEntropyLoss() + +columns = ["sampNo", "segmentNo", "label_type", "new_label", "SP", "EP", "pred"] +columns2 = ["sampNo", "severity", "origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", + "acc", "recall", "spec", "pre", "NPV", "F1score", "support"] + +logging.getLogger('matplotlib.font_manager').disabled = True +logging.getLogger('matplotlib.ticker').disabled = True +logger = logging.getLogger() +logger.setLevel(logging.INFO) +ch = logging.StreamHandler() +ch.setLevel(logging.INFO) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) + +if (exam_path / "test.log").exists(): + (exam_path / "test.log").unlink() +fh = logging.FileHandler(exam_path / "test.log", mode='a') +fh.setLevel(logging.INFO) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) +logger.info("------------------------------------") + + +def set_environment(i): + global 
output_path, segments_results_save_path, events_results_save_path, model_path, label_path, data_path, \ + model, model_name, train_set, test_set + + output_path = all_output_path[i] + logger.info(output_path) + segments_results_save_path = (output_path / "segments_results") + segments_results_save_path.mkdir(exist_ok=True) + events_results_save_path = (output_path / "events_results") + events_results_save_path.mkdir(exist_ok=True) + + # 加载配置 + with open(output_path / "settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps + data_path = hyp["Path"]["dataset"] + label_path = hyp["Path"]["label"] + train_set = hyp["train_set"] + test_set = hyp["test_set"] + + model_path = output_path / "weights" / "best.pt" + model = eval(hyp["model_name"])() + model_name = hyp["model_name"] + model.load_state_dict(torch.load(model_path)) + model.cuda() + model.eval() + + +def test_and_analysis_and_visual(dataset_type): + if dataset_type == "test": + sampNo = train_set + elif dataset_type == "all_test": + sampNo = test_set + else: + sampNo = None + logger.info("出错了") + + exam_name = Path("./").absolute().name + + test_dataset = TestApneaDataset2(data_path, label_path, select_sampno=sampNo, dataset_type=dataset_type, + segment_augment=my_segment_augment) + test_loader = DataLoader(test_dataset, batch_size=128, pin_memory=True, num_workers=0) + + test_loss = 0.0 + + df_segment = pd.DataFrame(columns=columns) + + for one in tqdm(test_loader, total=len(test_loader)): + resp, stft, labels = one[:3] + other_info = one[3:] + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + out = model(resp, stft) + + loss = loss_func(out, labels) + + test_loss += loss.item() + + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + + calc_metrics.update(out.cpu(), labels.cpu()) + # one[0] = list(one[0].cpu().numpy()) + # one[1] = list(one[1].cpu().numpy()) + # one = one[1:] + # out = out.view(1, -1).cpu().numpy().tolist() + # one += out + # result_record += [i for i in list(np.array(one, dtype=object).transpose(1, 0))] + + one2 = np.array([i.cpu().numpy() for i in (other_info + [out.squeeze()])]) + one2 = one2.transpose((1, 0)) + df = pd.DataFrame(data=one2, columns=columns) + df_segment = df_segment.append(df, ignore_index=True) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(f"EXAM_NAME: {exam_name} SampNO: {sampNo}") + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=0, epoch_type="test")) + calc_metrics.reset() + + df_segment["thresh_label"] = 1 * (df_segment["label_type"] > event_thresh).copy() + df_segment["thresh_Pred"] = 1 * (df_segment["pred"] > thresh).copy() + df_segment["pred"] = df_segment["pred"].copy().apply(lambda x: round(x, 3)) + + # 片段级分析 + df_segment_metrics = analysis_results(df_segment, segments_results_save_path, dataset_type) + + # 绘制混淆矩阵 + # 每个样本都绘制一份 + confusionMatrix(df_segment_metrics, segments_results_save_path, dataset_type) + # 绘制柱状图 + + # 事件级分析 + # 对于inner_test 每个编号就是一个事件 + # 而对于整晚的independence_test,需要另行计算 + df_all_event = segment_to_event(df_segment, dataset_type) + df_event_metrics = analysis_results(df_all_event, events_results_save_path, dataset_type, is_event=True) + confusionMatrix(df_event_metrics, events_results_save_path, dataset_type) + + # 剔除质量不好的样本 + df_bad_segment = df_segment[ + (df_segment["label_type"].isin([2, 3])) & (df_segment["new_label"] == 
2)] + df_select_segment = df_segment.drop(df_bad_segment.index) + df_select_segment_metrics = analysis_results(df_select_segment, segments_results_save_path / "remove_2", + dataset_type) + df_select_event = segment_to_event(df_select_segment, dataset_type) + df_event_metrics = analysis_results(df_select_event, events_results_save_path / "remove_2", dataset_type, + is_event=True) + + +def analysis_results(df_result, base_path, dataset_type, is_event=False): + if df_result.empty: + logger.info(base_path, dataset_type, "is_empty") + return None + + (base_path / dataset_type).mkdir(exist_ok=True, parents=True) + + all_sampNo = df_result["sampNo"].unique() + df_metrics = pd.DataFrame(columns=columns2) + df_metrics.loc[0] = 0 + df_metrics.loc[0]["sampNo"] = dataset_type + + for index, sampNo in enumerate(all_sampNo): + df = df_result[df_result["sampNo"] == sampNo] + df.to_csv( + base_path / dataset_type / + f"{int(sampNo)}_{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_result.csv", + index=False) + + df_metrics.loc[index + 1] = np.NAN + df_metrics.loc[index + 1]["sampNo"] = str(int(sampNo)) + df_metrics.loc[index + 1]["support"] = df.shape[0] + df_metrics.loc[index + 1]["severity"] = severity_label[str(int(sampNo))] + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # continue + # else: + df_metrics.loc[index + 1]["origin_P"] = df[df["thresh_label"] == 1].shape[0] + df_metrics.loc[index + 1]["origin_N"] = df[df["thresh_label"] == 0].shape[0] + df_metrics.loc[index + 1]["pred_P"] = df[df["thresh_Pred"] == 1].shape[0] + df_metrics.loc[index + 1]["pred_N"] = df[df["thresh_Pred"] == 0].shape[0] + df_metrics.loc[index + 1]["T"] = df[df["thresh_Pred"] == df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["F"] = df[df["thresh_Pred"] != df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["TP"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["FP"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["TN"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + df_metrics.loc[index + 1]["FN"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + + df_metrics.loc[0]["origin_P"] += df_metrics.loc[index + 1]["origin_P"] + df_metrics.loc[0]["origin_N"] += df_metrics.loc[index + 1]["origin_N"] + df_metrics.loc[0]["pred_P"] += df_metrics.loc[index + 1]["pred_P"] + df_metrics.loc[0]["pred_N"] += df_metrics.loc[index + 1]["pred_N"] + df_metrics.loc[0]["T"] += df_metrics.loc[index + 1]["T"] + df_metrics.loc[0]["F"] += df_metrics.loc[index + 1]["F"] + df_metrics.loc[0]["TP"] += df_metrics.loc[index + 1]["TP"] + df_metrics.loc[0]["FP"] += df_metrics.loc[index + 1]["FP"] + df_metrics.loc[0]["TN"] += df_metrics.loc[index + 1]["TN"] + df_metrics.loc[0]["FN"] += df_metrics.loc[index + 1]["FN"] + df_metrics.loc[0]["support"] += df_metrics.loc[index + 1]["support"] + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[index + 1][col] = df_metrics.loc[index + 1][col] if df_metrics.loc[index + 1][ + col] != 0 else np.NAN + + df_metrics.loc[index + 1]["acc"] = df_metrics.iloc[index + 1]["T"] / df_metrics.iloc[index + 1]["support"] + df_metrics.loc[index + 1]["recall"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["origin_P"] + df_metrics.loc[index + 1]["spec"] = df_metrics.iloc[index + 
1]["TN"] / df_metrics.iloc[index + 1]["origin_N"] + df_metrics.loc[index + 1]["pre"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["pred_P"] + df_metrics.loc[index + 1]["NPV"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["pred_N"] + df_metrics.loc[index + 1]["F1score"] = 2 * df_metrics.iloc[index + 1]["recall"] * df_metrics.iloc[index + 1][ + "pre"] / (df_metrics.iloc[index + 1]["recall"] + df_metrics.iloc[index + 1]["pre"]) + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[index + 1][col] = 0 if pd.isna(df_metrics.loc[index + 1][col]) else \ + df_metrics.loc[index + 1][col] + df_metrics.loc[index + 1][col] = round(df_metrics.loc[index + 1][col], 3) + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # return None + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[0][col] = df_metrics.loc[0][col] if df_metrics.loc[0][col] != 0 else np.NAN + + df_metrics.loc[0]["acc"] = df_metrics.iloc[0]["T"] / df_metrics.iloc[0]["support"] + df_metrics.loc[0]["recall"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["origin_P"] + df_metrics.loc[0]["spec"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["origin_N"] + df_metrics.loc[0]["pre"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["pred_P"] + df_metrics.loc[0]["NPV"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["pred_N"] + df_metrics.loc[0]["F1score"] = 2 * df_metrics.iloc[0]["recall"] * df_metrics.iloc[0]["pre"] / ( + df_metrics.iloc[0]["recall"] + df_metrics.iloc[0]["pre"]) + for col in ["TP", "TN", "FP", "FN", "acc", "recall", "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[0][col] = 0 if pd.isna(df_metrics.loc[0][col]) else df_metrics.loc[0][col] + df_metrics.loc[0][col] = round(df_metrics.loc[0][col], 3) + + # 在inner_test中根据 分严重程度绘制 + if dataset_type == "test": + all_severity = ["正常", "轻度", "中度", "重度"] + for index, severity in enumerate(all_severity): + df_event = df_metrics[df_metrics["severity"] == severity] + df_temp = pd.DataFrame(columns=columns2) + df_temp.loc[0] = 0 + df_temp.loc[0]["sampNo"] = severity + df_temp.loc[0]["severity"] = str(index + 1) + + df_temp.loc[0]["origin_P"] += df_event["origin_P"].sum() + df_temp.loc[0]["origin_N"] += df_event["origin_N"].sum() + df_temp.loc[0]["pred_P"] += df_event["pred_P"].sum() + df_temp.loc[0]["pred_N"] += df_event["pred_N"].sum() + df_temp.loc[0]["T"] += df_event["T"].sum() + df_temp.loc[0]["F"] += df_event["F"].sum() + df_temp.loc[0]["TP"] += df_event["TP"].sum() + df_temp.loc[0]["FP"] += df_event["FP"].sum() + df_temp.loc[0]["TN"] += df_event["TN"].sum() + df_temp.loc[0]["FN"] += df_event["FN"].sum() + df_temp.loc[0]["support"] += df_event["support"].sum() + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_temp.loc[0][col] = df_temp.loc[0][col] if df_temp.loc[0][col] != 0 else np.NAN + + df_temp.loc[0]["acc"] = df_temp.iloc[0]["T"] / df_temp.iloc[0]["support"] + df_temp.loc[0]["recall"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["origin_P"] + df_temp.loc[0]["spec"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["origin_N"] + df_temp.loc[0]["pre"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["pred_P"] + df_temp.loc[0]["NPV"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["pred_N"] + df_temp.loc[0]["F1score"] = 2 * df_temp.iloc[0]["recall"] * df_temp.iloc[0]["pre"] / ( + df_temp.iloc[0]["recall"] + 
df_temp.iloc[0]["pre"]) + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_temp.loc[0][col] = 0 if pd.isna(df_temp.loc[0][col]) else df_temp.loc[0][col] + df_temp.loc[0][col] = round(df_temp.loc[0][col], 3) + + df_metrics = df_metrics.append(df_temp, ignore_index=True) + + df_backup = df_metrics + df_metrics = df_metrics.astype("str") + df_metrics = df_metrics.sort_values("severity") + df_metrics.to_csv(base_path / dataset_type / + f"{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_all_metrics.csv", + index=False, encoding="gbk") + + return df_backup + + +def confusionMatrix(df_analysis, base_path, dataset_type): + if df_analysis is None: + logger.info(base_path, dataset_type, "is None") + return + + if df_analysis.empty: + logger.info(base_path, dataset_type, "is_empty") + return + classes = ["normal", "SA"] + (base_path / dataset_type / "confusionMatrix").mkdir(exist_ok=True, parents=True) + for one_samp in df_analysis.index: + one_samp = df_analysis.loc[one_samp] + cm = np.array([[one_samp["TN"], one_samp["FP"]], [one_samp["FN"], one_samp["TP"]]]) + draw_confusionMatrix(cm, classes=classes, title=str(one_samp["severity"]) + " " + one_samp["sampNo"], + save_path=base_path / dataset_type / "confusionMatrix" / f"{one_samp['sampNo']}.jpg") + + +def segment_to_event(df_segment, dataset_type): + df_all_event = pd.DataFrame(columns=columns) + all_sampNo = df_segment["sampNo"].unique() + + if dataset_type == "test": + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + all_segments_no = df["segmentNo"].unique() + + for index_se, segment_No in enumerate(all_segments_no): + df_temp = df[df["segmentNo"] == segment_No].copy() + SP = df_temp.iloc[0]["EP"] + EP = df_temp.iloc[-1]["EP"] + 1 + df_event.loc[index_se] = [int(sampNo), segment_No, df_temp.iloc[0]["label_type"], + df_temp.iloc[0]["new_label"], SP, EP, 0] + + thresh_Pred = df_temp["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + for i in value_ar: + if i > thresh_event_length: + df_event.iloc[index_se]["pred"] = 1 + + # df_event.to_csv(events_results / dataset_type / f"{int(sampNo)}_event_results.csv", index=False, + # encoding="gbk") + df_all_event = df_all_event.append(df_event, ignore_index=True) + else: + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + thresh_Pred = df["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred 
+ interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + + for value_index, value in enumerate(value_ar): + SP = index_ar[value_index] + EP = index_ar[value_index] + count_ar[value_index] + # TP, FP + if value > thresh_event_length: + + # label_type = 1 if thresh_Pred2[SP:EP].sum() > 0 else 0 + label_type = df["label_type"][SP:EP].max() + new_label = df["new_label"][SP:EP].max() + df_event = df_event.append(pd.DataFrame([[int(sampNo), SP // 30, label_type, new_label, + SP, EP, thresh_Pred2[SP]]], columns=columns), + ignore_index=True) + # if value > 30: + # logger.info([int(sampNo), SP // 30, label_type, new_label, SP, EP, thresh_Pred2[SP]]) + # 长度不够 + else: + df["thresh_Pred"][SP:EP] = 0 + + # 对负样本进行统计 + for segment_no in df["segmentNo"].unique(): + df_temp = df[df["segmentNo"] == segment_no] + if df_temp["thresh_Pred"].sum() > 0: + continue + + df_event = df_event.append(pd.DataFrame( + [[int(sampNo), segment_no, df_temp["label_type"].max(), df_temp["new_label"].max(), segment_no * 30, + (segment_no + 1) * 30, 0]], columns=columns), + ignore_index=True) + + df_all_event = df_all_event.append(df_event, ignore_index=True) + + df_temp = df_all_event.loc[:, ["label_type", "pred"]] + df_all_event["thresh_label"] = 1 * (df_temp["label_type"] > event_thresh) + df_all_event["thresh_Pred"] = 1 * (df_temp["pred"] > thresh) + return df_all_event + + +# 分sampNo保存结果,并不重合地可视化 +# inner_test + +# 分sampNo将与标签不一致的另行保存,并不重合地可视化 + +# import shap +# explainer = shap.TreeExplainer() +# shap_values = explainer.shap_values() + +if __name__ == '__main__': + all_output_path = list(exam_path.rglob("KFold_*")) + for exam_index, test_exam_path in enumerate(all_output_path): + # test_exam_path = exam_path / test_exam_path + set_environment(exam_index) + test_and_analysis_and_visual(dataset_type="test") + test_and_analysis_and_visual(dataset_type="all_test") diff --git a/exam/031/utils/Draw_ConfusionMatrix.py b/exam/031/utils/Draw_ConfusionMatrix.py new file mode 100644 index 0000000..591af60 --- /dev/null +++ b/exam/031/utils/Draw_ConfusionMatrix.py @@ -0,0 +1,46 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:Draw_ConfusionMatrix.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/08/10 +""" +import numpy as np +from matplotlib import pyplot as plt + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 + + +def draw_confusionMatrix(cm, classes, title, save_path, cmap=plt.cm.Blues): + fig_cm, ax = plt.subplots(figsize=(8, 8), dpi=120) + im = ax.imshow(cm, interpolation='nearest', cmap=cmap) + ax.figure.colorbar(im, ax=ax) + ax.set(xticks=np.arange(cm.shape[1]), + yticks=np.arange(cm.shape[0]), + xticklabels=classes, yticklabels=classes, + title=title, + ylabel='True label', + xlabel='Predicted label') + ax.set_ylim(len(classes) - 0.5, -0.5) + + # Rotate the tick labels and set their alignment. 
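+    # (cm is expected as [[TN, FP], [FN, TP]], as assembled by confusionMatrix() in
+    # test_save_result.py, so row/column 0 corresponds to the "normal" class.)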
+ plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") + normalize = False + fmt = '.2f' if normalize else 'd' + thresh = cm.max() * 0.8 + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(cm[i, j], fmt), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black") + fig_cm.tight_layout() + fig_cm.savefig(save_path) + plt.close() + # + + +if __name__ == '__main__': + pass diff --git a/exam/031/utils/Preprocessing.py b/exam/031/utils/Preprocessing.py new file mode 100644 index 0000000..e3b60c4 --- /dev/null +++ b/exam/031/utils/Preprocessing.py @@ -0,0 +1,181 @@ +# encoding:utf-8 + +""" +@ date: 2020-09-16 +@ author: jingxian +@ illustration: Pre-processing +""" + + +import sys +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import pywt +from scipy import signal +from scipy import fftpack + + +def Dilate(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.min(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def Eorde(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.max(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def fin_turn(data, peak): + if len(data) == 0 or len(peak) == 0: return peak + return_peak = [] + for p in peak: + minx, maxx = max(0, p - 100), min(len(data), p + 100) + return_peak.append(minx + np.argmax(data[minx: maxx])) + return return_peak + + +class BCG_Operation(): + def __init__(self, sample_rate=1000): + self.sample_rate = sample_rate + + def down_sample(self, data=None, down_radio=10): + if data is None: + raise ValueError("data is None, please given an real value!") + data = data[:len(data) // down_radio * down_radio].reshape(-1, down_radio)[:, 0] + self.sample_rate = self.sample_rate / down_radio + return data + + def Splitwin(self, data=None, len_win=None, coverage=1.0, calculate_to_end=False): + """ + 分窗 + :param len_win: length of window + :return: signal windows + """ + if (len_win is None) or (data is None): + raise ValueError("length of window or data is None, please given an real value!") + else: + length = len_win * self.sample_rate # number point of a window + # step of split windows + step = length * coverage + start = 0 + Splitdata = [] + while (len(data) - start >= length): + Splitdata.append(data[int(start):int(start + length)]) + start += step + if calculate_to_end and (len(data) - start > 2000): + remain = len(data) - start + start = start - step + step = int(remain / 2000) + start = start + step * 2000 + Splitdata.append(data[int(start):int(start + length)]) + return np.array(Splitdata), step + elif calculate_to_end: + return np.array(Splitdata), 0 + else: + return np.array(Splitdata) + + def Butterworth(self, data, type, low_cut=0.0, high_cut=0.0, order=10): + """ + :param type: Type of Butter. filter, lowpass, bandpass, ... 
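        Cut-off frequencies are given in Hz; low_cut / high_cut are divided by the
        Nyquist frequency (self.sample_rate * 0.5) before being passed to scipy.signal.butter.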
+ :param lowcut: Low cutoff frequency + :param highcut: High cutoff frequency + :param order: Order of filter + :return: Signal after filtering + """ + if type == "lowpass": # 低通滤波处理 + b, a = signal.butter(order, low_cut / (self.sample_rate * 0.5), btype='lowpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "bandpass": # 带通滤波处理 + low = low_cut / (self.sample_rate * 0.5) + high = high_cut / (self.sample_rate * 0.5) + b, a = signal.butter(order, [low, high], btype='bandpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "highpass": # 高通滤波处理 + b, a = signal.butter(order, high_cut / (self.sample_rate * 0.5), btype='highpass') + return signal.filtfilt(b, a, np.array(data)) + else: # 警告,滤波器类型必须有 + raise ValueError("Please choose a type of fliter") + + def MorphologicalFilter(self, data=None, M=200, get_bre=False): + """ + :param data: Input signal + :param M: Length of structural element + :return: Signal after filter + """ + if not data.any(): + raise ValueError("The input data is None, please given real value data") + g = np.ones(M) + Data_pre = np.insert(data, 0, np.zeros(M)) + Data_pre = np.insert(Data_pre, -1, np.zeros(M)) + # Opening: 腐蚀 + 膨胀 + out1 = Eorde(Data_pre, len(Data_pre), g, M) + out2 = Dilate(out1, len(out1), g, M) + out2 = np.insert(out2, 0, np.zeros(M - 2)) + # Closing: 膨胀 + 腐蚀 + out5 = Dilate(Data_pre, len(Data_pre), g, M) + out6 = Eorde(out5, len(out5), g, M) + out6 = np.insert(out6, 0, np.zeros(M - 2)) + + baseline = (out2 + out6) / 2 + # -------------------------保留剩余价值------------------------ + data_filtered = Data_pre[:len(baseline)] - baseline + data_filtered = data_filtered[M: M + len(data)] + baseline = baseline[M:] + data_filtered[-1] = data_filtered[-2] = data_filtered[-3] + baseline[-1] = baseline[-2] = baseline[-3] + if get_bre: + return data_filtered, baseline + else: + return data_filtered + + def Iirnotch(self, data=None, cut_fre=50, quality=3): + """陷波器""" + b, a = signal.iirnotch(cut_fre / (self.sample_rate * 0.5), quality) + return signal.filtfilt(b, a, np.array(data)) + + def ChebyFilter(self, data, rp=1, type=None, low_cut=0, high_cut=0, order=10): + """ + 切比雪夫滤波器 + :param data: Input signal + :param rp: The maximum ripple allowed + :param type: 'lowpass', 'bandpass, 'highpass' + :param low_cut: Low cut-off fre + :param high_cut: High cut-off fre + :param order: The order of filter + :return: Signal after filter + """ + if type == 'lowpass': + b, a = signal.cheby1(order, rp, low_cut, btype='lowpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif type == 'bandpass': + b, a = signal.cheby1(order, rp, [low_cut, high_cut], btype='bandpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif type == 'highpass': + b, a = signal.cheby1(order, rp, high_cut, btype='highpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + else: + raise ValueError("The type of filter is None, please given the real value!") + + def Envelope(self, data): + """取信号包络""" + if len(data) <= 1: raise ValueError("Wrong input data") + hx = fftpack.hilbert(data) + return np.sqrt(hx ** 2, data ** 2) + + def wavelet_trans(self, data,c_level=['aaa','aad'], wavelet='db4', mode='symmetric',maxlevel=10): + wp = pywt.WaveletPacket(data=data, wavelet=wavelet, mode=mode, maxlevel=maxlevel) + new_wp = pywt.WaveletPacket(data=None, wavelet=wavelet, mode=mode) + for c in c_level : + new_wp[c] = wp[c] + return new_wp.reconstruct() + + # def em_decomposition(self, data): + # from pyhht.emd import 
EMD + # return EMD(data).decompose() + diff --git a/exam/031/utils/calc_metrics.py b/exam/031/utils/calc_metrics.py new file mode 100644 index 0000000..dcf78cc --- /dev/null +++ b/exam/031/utils/calc_metrics.py @@ -0,0 +1,84 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:calc_metrics.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/12 +""" +import torch +import torchmetrics + + +class CALC_METRICS: + metrics = [] + nc = 0 + + def __init__(self, nc): + self.nc = nc + self.metrics.append(torchmetrics.Accuracy(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Recall(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Precision(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Specificity(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.F1Score(average="none", num_classes=nc, multiclass=False)) + self.valid_result = self.train_result = None + + def update(self, pred, target): + for part1 in self.metrics: + part1.update(pred.cpu(), target.cpu()) + + def compute(self): + result = [] + for part1 in self.metrics: + result.append(part1.compute()) + + def reset(self): + for part1 in self.metrics: + part1.reset() + + def get_matrix(self, loss=None, cur_lr=None, epoch=None, epoch_type=None): + temp_result = [] + for j in self.metrics: + compute_result = (j.compute().cpu().numpy() * 100).tolist() + temp_result.append(compute_result) + + if epoch_type == "train": + self.train_result = [loss] + temp_result + elif epoch_type == "valid": + self.valid_result = [loss] + temp_result + else: + pass + + a = "" + a += f"{epoch_type} epoch: {str(epoch)} loss: {str(loss)} lr: {str(cur_lr)} \n" + a += " " * 8 + "Acc".center(8) + "Rec".center(8) + "Pre".center(8) + "Spe".center(8) + "F1".center(8) + "\n" + a += "all".center(8) + "".join([str(round(float(i), 2)).center(8) for i in temp_result]) + "\n" + return a + + def wandb_log(self, wandb=None, cur_lr=None): + if wandb is None: + return + + keyword = ["Accuracy", "Recall", "Precision", "Specificity", "F1Score"] + dict_key = [] + for epoch_type in ["train", "valid"]: + dict_key.append(epoch_type + "/" + "loss") + for i in keyword: + dict_key.append(epoch_type + "/" + i) + + log_dict = dict(zip(dict_key, self.train_result + self.valid_result)) + log_dict["lr"] = cur_lr + wandb.log(log_dict) + + +if __name__ == '__main__': + # pred = [[0.1], [0.2], [0.3], [0.4], [0.5], [0.6], [0.7], [0.8], [0.9], [1.0]] + # true = [[0], [0], [1], [0], [0], [0], [0], [0], [0], [1]] + pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] + true = [0, 0, 1, 0, 0, 0, 0, 0, 0, 1] + pred = torch.tensor(pred).cuda() + true = torch.tensor(true).cuda() + calc_metrics = CALC_METRICS(1) + calc_metrics.update(pred, true) + print(calc_metrics.get_matrix()) diff --git a/exam/032/generate_label_A01.py b/exam/032/generate_label_A01.py new file mode 100644 index 0000000..f19b674 --- /dev/null +++ b/exam/032/generate_label_A01.py @@ -0,0 +1,486 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:generate_label_11.0.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/05 +""" +# A01 +# 以每三十秒为窗 步进十秒, 每个窗最后十秒内 超过五秒存在暂停则为正样本 + +# 14.0 +# 手动均衡数量 + +# 13.0 +# 限制选择部分数据集,先做测试 + +# 12.0 +# 置不可用事件的片段为上限,不可用片段设置为背景,不记录事件 + +# 10.0 +# 使用提出质量差的信号 + +# 9.0 +# 增加 最新的质量标签 未使用 + +# 8.0 +# 生成 除低通气所有事件标签 + +# 尝试过步进两秒 会造成不足两秒的数据被抛弃,造成较多误判,但是可以考虑囊括这部分 +# 采用 30秒数据 移动 1秒 
将所有呼吸暂停标注为1 低通气为0 正常为0 + +# 预处理操作 为 50Hz陷波滤波器去工频 外加 20Hz的低通滤波器 这个20Hz要看BCG信号的频谱范围 + +# 先提剔除极端值 +# 数值大于最高基准线或最低基准线 +# type1: average:1800 low:1200 high:2400 +# type2: average:2400 low:1800 high:3000 + +# 过多片段会造成平均值偏移 +# TODO +# 加入体动标签,计算除体动外的平均值 + +# 最后降采为100hz + +import time +import logging +import numpy as np +import pandas as pd +from pathlib import Path + +from datetime import datetime + +import yaml +from pathos import multiprocessing +from tqdm import tqdm + +# 数据集 和 标签 位置 +bcg_numpy_data_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/") +bcg_label_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_label_0616/") + +# BCG 记录开始时间 +bcg_start_time = np.loadtxt(Path(r"/home/marques/code/marques/apnea/dataset/start_time.csv"), delimiter=', ', + dtype=object) +bcg_start_time = dict(zip(bcg_start_time[:, 0], bcg_start_time[:, 1])) + +# 读取每个数据集路径 +all_numpy_dataset = list(bcg_numpy_data_path.rglob("*.npy")) +all_numpy_dataset.sort() + +# 划分后的数据集保存路径 +# dataset_save_path = Path(r"/home/marques/code/marques/apnea/dataset/dataset/dataset0623_300_30_30/") +dataset_save_path = Path(r"./dataset/") +dataset_save_path.mkdir(exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) +fh = logging.FileHandler(dataset_save_path / (realtime + ".log"), mode='w') +fh.setLevel(logging.NOTSET) +# fh.setFormatter(logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(ch) + +# all_label = [] +# 输出统计数据标题栏 +# logger.info("sampNo".center(8) + +# "hpy_num".center(8) + "hpy_time".center(10) + +# "csa_num".center(8) + "csa_time".center(10) + +# "osa_num".center(8) + "osa_time".center(10) + +# "msa_num".center(8) + "msa_time".center(10) +# ) + +logger.info("sampNo".center(8) + ',' + + "train_num".center(10) + ',' + "train_P".center(10) + ',' + "train_N".center(10) + ',' + + "valid_num".center(10) + ',' + "valid_P".center(10) + ',' + "valid_N".center(10) + ',' + + "test_num".center(10) + ',' + "test_P".center(10) + ',' + "test_N".center(10) + ',' + + "train_eve".center(10) + ',' + "valid_eve".center(10) + ',' + "test_eve".center(10) + ) + +base_random_seed = 42 + +window_second = 30 +step_second = 10 +view_apnea_length = 10 +apnea_length_threshold = 5 +valid_ratio = 0.1 +test_ratio = 0.1 + +normal_event_quality_label = 0 +# valid_ratio = 5000 +# test_ratio = 10000 + +assert ((valid_ratio + test_ratio) < 1 and 0 < valid_ratio < 1 and 0 < test_ratio < 1) or ( + valid_ratio > 1 and valid_ratio > 1), "验证集与测试集输入应同时为比例或数量" + +# dataset sampNo for test +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) +select_dataset = hyp["select_sampno"] + +# 需要置成0的片段,前面不一定补零,还有可能上万 +disable_segment = { + '221': [[0, 10000]], + '670': [[0, 20000]], + '683': [[0, 20000]], + '704': [[0, 26000]], + '726': [[0, 20000]], + '736': [[0, 47000]], + '933': [[0, 773560]], + '935': [[0, 26600]], + '952': [[0, 17000]], + '955': [[0, 78000]], + '961': [[0, 107000]], + '962': [[0, 15100]], + '966': [[0, 13120]], + '967': [[0, 44000]], + '1006': [[0, 60000]], + '1009': [[0, 1000]], + '1010': [[0, 49000]], + '1296': [[0, 27000]], + '1300': [[0, 33800]], + '1301': [[0, 14000]], + '1302': [[0, 5600]], + '1374': [[0, 1000]], + '1478': [[0, 998000]], + +} + 
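# ---------------------------------------------------------------------------------------
# Illustrative sketch (editorial, not from the original code): the header comments above
# describe the A01 rule — slide a 30 s window in 10 s steps and mark a window positive
# when at least 5 s of its final 10 s overlap an apnea event. The helper below applies
# that rule to a per-second vector of event codes (0 = background, 2/3 = apnea), the same
# representation as event_list inside generate_label(); label_windows and demo_events are
# hypothetical names. Note that the threshold test inside generate_label() is written
# against select_numpy (the raw signal), whereas the rule above refers to the per-second
# event codes used here.
import numpy as np


def label_windows(event_list, window_second=30, step_second=10,
                  view_apnea_length=10, apnea_length_threshold=5):
    labels = []
    for SP in range(0, len(event_list) - window_second, step_second):
        # last `view_apnea_length` seconds of the window
        tail = event_list[SP + window_second - view_apnea_length: SP + window_second]
        is_apnea = int(np.sum(tail > 1) >= apnea_length_threshold)  # codes 2/3 count as apnea
        labels.append((SP, SP + window_second, is_apnea))
    return labels


# Example: two minutes of background with a single 15 s central apnea starting at t = 40 s.
demo_events = np.zeros(120)
demo_events[40:55] = 2
print(label_windows(demo_events))  # only windows whose final 10 s cover >= 5 s of the event come out positive
# ---------------------------------------------------------------------------------------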
+ +# 生成数据集主函数 +def generate_label(No, dataset_path): + """ + :param dataset_path: 数据集路径 + :return: + """ + + # 获取数据编号 + sampNo = dataset_path.stem.split("samp")[0] + # 标签路径 + label_path = bcg_label_path / f"export{sampNo}_all.csv" + + if not label_path.exists(): + raise FileNotFoundError(f"{label_path} not exist") + + if not dataset_path.exists(): + raise Exception(f"{dataset_path} not exists") + + # 加载数据集 + select_numpy = np.load(dataset_path) + + + # 开头不合理片段置零 + if sampNo in disable_segment.keys(): + for sp, ep in disable_segment[sampNo]: + select_numpy[sp:ep] = 0 + + # 剔除质量差信号 + if sampNo == "670": + select_numpy = select_numpy[:17195 * 100] + + # 获取前面补了多少0 + not_zero_point = 0 + for num in select_numpy: + if num > 10: + break + not_zero_point += 1 + not_zero_point //= 100 + + # 读取标签 + label_csv = pd.read_csv(label_path, encoding='gbk') + label_csv["new_label"] = label_csv["new_label"].fillna("2") + + label_csv["new_start"] = label_csv["new_start"].astype("int") + label_csv["new_end"] = label_csv["new_end"].astype("int") + label_csv["Duration"] = label_csv["Duration"].astype("int") + label_csv["new_label"] = label_csv["new_label"].astype("int") + # 剔除质量不好的样本 + # drop_csv = label_csv[ + # (label_csv["Event type"].isin(["Central apnea", "Obstructive apnea"])) & (label_csv["new_label"] == 2)] + # label_csv = label_csv.drop(drop_csv.index) + + # 事件片段与背景片段, 每个背景长度均为设定窗长 + segment_labels = [] + negative_labels = [] + hpy_num = csa_num = osa_num = msa_num = 0 + hpy_time = csa_time = osa_time = msa_time = 0 + select_numpy_len = len(select_numpy) // 100 + event_list = np.zeros(select_numpy_len) + quality_list = np.zeros(select_numpy_len) + + # 遍历全部事件并统计 + for i in range(len(label_csv)): + # 进行LabelEncoder + label = label_csv.iloc[i, :] + # 如果事件在补零片段,则不添加到事件列表 + if label["new_end"] < not_zero_point: + continue + + if sampNo == "670" and label["new_start"] > 17195: + continue + + if label["new_end"] - label["new_start"] < 10: + print(label.to_numpy()) + continue + + # 将事件添加到事件列表 + if label["Event type"] == "Hypopnea": + label_type = 1 + hpy_num += 1 + hpy_time += label["new_end"] - label["new_start"] + + # 将低通气添加到背景 好像不用专门加入到负样本事件中? 
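            # NOTE (editorial): label codes assigned in this loop — Hypopnea -> 1 and is kept
            # as background (hence the `continue` below), Central apnea -> 2, Obstructive
            # apnea -> 3, Mixed apnea folded into 3. The loaders later binarise with
            # label_type > 1, so only the apnea codes become positive samples.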
+ # negative_labels.append( + # [sampNo, i, label_type, normal_event_quality_label, label["new_start"], label["new_end"]]) + continue + elif label["Event type"] == "Central apnea": + label_type = 2 + csa_num += 1 + csa_time += label["new_end"] - label["new_start"] + elif label["Event type"] == "Obstructive apnea": + label_type = 3 + osa_num += 1 + osa_time += label["new_end"] - label["new_start"] + # MSA 认为是OSA + elif label["Event type"] == "Mixed apnea": + label_type = 3 + msa_num += 1 + msa_time += label["new_end"] - label["new_start"] + else: + continue + # label_type = 0 + if label["new_end"] - label["new_start"] > label["Duration"] + 20: + print(sampNo, label.to_numpy()) + + # 格式为 样本编号 第几个事件 标签 开始事件 结束事件 + event_list[label["new_start"]: label["new_end"]] = label_type + quality_list[label["new_start"]: label["new_end"]] = label["new_label"] + # segment_labels.append([sampNo, i, label_type, label["new_label"], label["new_start"], label["new_end"]]) + + # logger.info(sampNo.center(8) + + # str(hpy_num).center(8) + str(hpy_time).center(10) + + # str(csa_num).center(8) + str(csa_time).center(10) + + # str(osa_num).center(8) + str(osa_time).center(10) + + # str(msa_num).center(8) + str(msa_time).center(10)) + + # 设置随机树种子 + random_seed = base_random_seed + int(sampNo) + + for SP in range(not_zero_point, select_numpy_len - window_second, step_second): + start = SP + window_second - view_apnea_length + end = SP + window_second + i = SP // 30 + if sum((select_numpy[start:end] > 1)) >= apnea_length_threshold: + segment_labels.append( + [sampNo, i, event_list[start:end].max(), quality_list[start:end].max(), SP, SP + window_second]) + else: + negative_labels.append( + [sampNo, i, event_list[start:end].max(), quality_list[start:end].max(), SP, SP + window_second] + ) + + # 对于测试数据全部直接保存 + if int(sampNo) in select_dataset: + test_label = [] + # 分成指定窗长的滑窗片段 + test_label = segment_labels + negative_labels + + logger.info(sampNo.center(8) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center(10) + + ',' + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + str(len(segment_labels)).center(10) + ) + + df2.loc[No] = [sampNo, + str(0), str(0), str(0), + str(0), str(0), str(0), + str(len(test_label)), + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(0), str(0), str(len(segment_labels))] + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_all_{window_second}s_sa_test2_label.npy", + # np.array(test_label)) + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_all_label.csv", + index=False) + + train_label, valid_label, test_label = [], [], [] + # 对于训练与验证集样本 + if True: + # 打乱片段顺序 + np.random.seed(random_seed) + np.random.shuffle(segment_labels) + np.random.shuffle(negative_labels) + + # 获取训练集、验证集、测试集分到事件个数 + if 0 < valid_ratio < 1: + train_segment_num = int(len(segment_labels) * (1 - valid_ratio - test_ratio)) + 
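            # (Editorial note: with ratio inputs, valid_segment_num is computed on the next
            #  line as a cumulative index, int(len * (1 - test_ratio)), but the slices below
            #  use it as a count, so with the default 0.1/0.1 split test_label receives no
            #  positive segments; int(len(segment_labels) * valid_ratio) appears to be the
            #  intended value.)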
valid_segment_num = int(len(segment_labels) * (1 - test_ratio)) + else: + train_segment_num = len(segment_labels) - valid_ratio - test_ratio + valid_segment_num = valid_ratio + + train_label = segment_labels[:train_segment_num] + valid_label = segment_labels[train_segment_num:train_segment_num + valid_segment_num] + test_label = segment_labels[train_segment_num + valid_segment_num:] + + # 计算片段和事件个数 + train_num, valid_num, test_num = len(train_label), len(valid_label), len(test_label) + train_eve, valid_eve, test_eve = train_segment_num, (valid_segment_num - train_segment_num), ( + len(segment_labels) - valid_segment_num) + + # 数据集补偿 + # if train_num < 300: + # train_num = 300 - train_num + # + # if valid_num < 300: + # valid_num = 300 - valid_num + # + # if test_num < 300: + # test_num = 300 - test_num + + # 获取训练集、验证集、测试集分到背景个数 + if 0 < valid_ratio < 1: + train_eve2 = int(len(negative_labels) * (1 - valid_ratio - test_ratio)) + valid_eve2 = int(len(negative_labels) * valid_ratio) + else: + train_eve2 = len(negative_labels) - valid_ratio - test_ratio + valid_eve2 = valid_ratio + + test_eve2 = len(negative_labels) - train_eve2 - valid_eve2 + # # 直接补充到足够个数的背景事件 + # train_eve2 = max(train_eve, 300) + # valid_eve2 = max(valid_eve, 40) + # test_eve2 = max(test_eve, 40) + + # 强制背景数量 + # train_eve2 = train_eve + # valid_eve2 = valid_eve + # test_eve2 = test_eve + + # 添加背景事件数量 + train_label += negative_labels[:train_eve2] + valid_label += negative_labels[train_eve2: train_eve2 + valid_eve2] + test_label += negative_labels[train_eve2 + valid_eve2:] + + logger.info(sampNo.center(8) + ',' + + str(len(train_label)).center(10) + ',' + + str(sum(np.array(train_label).astype('float64')[:, 2].astype(int) > 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(train_label).astype('float64')[:, 2].astype(int) < 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(len(valid_label)).center(10) + ',' + + str(sum(np.array(valid_label).astype('float64')[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(valid_label).astype('float64')[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(train_eve).center(10) + ',' + str(valid_eve).center(10) + ',' + str(test_eve).center(10) + ) + + df2.loc[No] = [sampNo.center(8), + str(len(train_label)), + str(sum(np.array(train_label).astype('float64')[:, 2].astype(int) > 1) if len(train_label) != 0 else 0), + str(sum(np.array(train_label).astype('float64')[:, 2].astype(int) < 1) if len(train_label) != 0 else 0), + str(len(valid_label)), + str(sum(np.array(valid_label).astype('float64')[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0), + str(sum(np.array(valid_label).astype('float64')[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0), + str(len(test_label)), + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(train_eve), str(valid_eve), str(test_eve).center(10)] + + def label_check(label_list): + temp_list = [] + for sampNo, index, label_type, new_label, SP, EP in label_list: + if EP - SP < window_second: + 
print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if SP < 0: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, SP, EP, len(select_numpy[SP * 100:EP * 100])) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + for j in temp_list: + label_list.remove(j) + + label_check(train_label) + label_check(valid_label) + label_check(test_label) + for sampNo, index, label_type, new_label, SP, EP in train_label: + if EP - SP < window_second: + print(sampNo, index, label_type, new_label, SP, EP) + + if SP < 0: + print(sampNo, index, label_type, new_label, SP, EP) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, new_label, SP, EP, len(select_numpy[SP * 100:EP * 100])) + + df1 = pd.DataFrame(data=train_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.csv", + index=False) + + df1 = pd.DataFrame(data=valid_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.csv", + index=False) + + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.csv", index=False) + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.npy", + # np.array(train_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.npy", + # np.array(valid_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.npy", + # np.array(test_label)) + + +if __name__ == '__main__': + # pool = multiprocessing.Pool(processes=44) + # pool.map(generate_label, list(all_numpy_dataset)) + # pool.close() + # pool.join() + + df2 = pd.DataFrame(data=None, + columns=["sampNo", + "train_num", "train_P", "train_N", + "valid_num", "valid_P", "valid_N", + "test_num", "test_P", "test_N", + "train_eve", "valid_eve", "test_eve"]) + + temp = [] + for one_dataset in all_numpy_dataset: + if int(one_dataset.stem.split("samp")[0]) in [*select_dataset]: + temp.append(one_dataset) + # for one_dataset in temp: + # all_numpy_dataset.remove(one_dataset) + + for No, one_dataset in enumerate(temp): + generate_label(No, one_dataset) + + df2.to_csv(dataset_save_path / (realtime + ".csv"), index=False) + # generate_label(all_numpy_dataset[0]) diff --git a/exam/032/load_dataset.py b/exam/032/load_dataset.py new file mode 100644 index 0000000..83ed5ef --- /dev/null +++ b/exam/032/load_dataset.py @@ -0,0 +1,178 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:load_dataset.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2021/12/03 +""" +import sys +from pathlib import Path +import pandas as pd +import numpy as np +import torch.utils.data +from torch.utils.data import Dataset +from tqdm import tqdm +from utils.Preprocessing import BCG_Operation + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 +""" +1. 
读取方法 +# 无论是否提前切分,均提前转成npy格式 +# 1.1 提前预处理,切分好后生成npy,直接载入切分好的片段 内存占用多 读取简单 +使用此方法: 1.2 提前预处理,载入整夜数据,切分好后生成csv或xls,根据片段读取 内存占用少 读取较为复杂 +""" + +datasets = {} + + +# 减少重复读取 +def read_dataset(data_path, augment=None): + data_path = Path(data_path) + try: + f = [] + if data_path.is_dir(): + dataset_list = list(data_path.rglob("*.npy")) + dataset_list.sort() + f += dataset_list + elif data_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{data_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {data_path}: {e} \n') + + print("loading dataset") + for i in tqdm(f): + select_dataset = np.load(i) + select_dataset = preprocessing.Butterworth(select_dataset, "lowpass", low_cut=20, order=3) + if augment is not None: + select_dataset = augment(select_dataset) + datasets[i.name.split("samp")[0]] = select_dataset + + +# 用第二种方法读取 +class ApneaDataset(Dataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + self.data_path = data_path + self.label_path = label_path + self.segment_augment = segment_augment + self.labels = None + self.dataset_type = dataset_type + self.select_sampNo = select_sampno + + # self._getAllData() + self._getAllLabels() + + def __getitem__(self, index): + # PN patience number + # SP/EP start point, end point + # temp_label.append([sampNo, label[-1], i, hpy_num, csa_num, osa_num, mean_low, flow_low]) + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + segment = datasets[str(PN)][SP*10:EP*10].copy() + segment = segment.reshape(-1, 1) + return segment, int(float(label_type) > 1) + else: + raise Exception(f'dataset read failure!') + + def count_SA(self): + return sum(self.labels[:, 3] > 1) + + def __len__(self): + return len(self.labels) + + def _getAllLabels(self): + label_path = Path(self.label_path) + if not label_path.exists(): + raise Exception(f'{self.label_path} does not exist') + + try: + f = [] + if label_path.is_dir(): + if self.dataset_type == "train": + label_list = list(label_path.rglob("*_train_label.csv")) + elif self.dataset_type == "valid": + label_list = list(label_path.rglob("*_valid_label.csv")) + elif self.dataset_type == "test": + label_list = list(label_path.glob("*_sa_test_label.csv")) + # label_list = list(label_path.rglob("*_test_label.npy")) + elif self.dataset_type == "all_test": + label_list = list(label_path.rglob("*_sa_all_label.csv")) + else: + raise ValueError("self.dataset type error") + # label_list = list(label_path.rglob("*_label.npy")) + label_list.sort() + f += label_list + elif label_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{self.label_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {self.label_path}: {e} \n') + print("loading labels") + for i in tqdm(f): + if int(i.name.split("_")[0]) not in self.select_sampNo: + continue + + if self.labels is None: + self.labels = pd.read_csv(i).to_numpy(dtype=int) + else: + labels = pd.read_csv(i).to_numpy(dtype=int) + if len(labels) > 0: + self.labels = np.concatenate((self.labels, labels)) + # self.labels = self.labels[:10000] + print(f"{self.dataset_type} length is {len(self.labels)}") + + +class TestApneaDataset(ApneaDataset): + def __init__(self, data_path, label_path, dataset_type, select_sampno, segment_augment=None): + 
super(TestApneaDataset, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + select_sampno=select_sampno, + segment_augment=segment_augment + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + segment = datasets[str(PN)][int(SP) * 100:int(EP) * 100].copy() + if self.segment_augment is not None: + segment = self.segment_augment(segment) + return segment, int(float(label_type) > 1), PN, segmentNo, SP, EP + else: + raise Exception(f'dataset read failure!') + + +class TestApneaDataset2(ApneaDataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + super(TestApneaDataset2, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + segment_augment=segment_augment, + select_sampno=select_sampno + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + segment = datasets[str(PN)][SP*10:EP*10].copy() + segment = segment.reshape(-1, 1) + return segment, int(float(label_type) > 1), PN, segmentNo, label_type, new_label, SP, EP + else: + raise Exception(f'dataset read failure!') + + +if __name__ == '__main__': + pass diff --git a/exam/032/main.py b/exam/032/main.py new file mode 100644 index 0000000..2e19646 --- /dev/null +++ b/exam/032/main.py @@ -0,0 +1,289 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@time:2021/10/15 +""" +import os + +import yaml +import logging +from pathlib import Path + +import time +from torch.nn import functional as F +from torch.utils.data import DataLoader +import torch.cuda +from tqdm import tqdm +from torchinfo import summary +from load_dataset import ApneaDataset, read_dataset + +from torch import nn +from utils.calc_metrics import CALC_METRICS +from sklearn.model_selection import KFold +from model.Hybrid_Net011 import HYBRIDNET011 +# from utils.LossFunction import Foca1lLoss +from my_augment import my_augment, my_segment_augment + +# 加载配置 +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) + +os.environ["CUDA_VISIBLE_DEVICES"] = hyp["GPU"] +os.environ["WANDB_MODE"] = "dryrun" + +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) + +# 读取地址参数 +data_path = hyp["Path"]["dataset"] +label_path = hyp["Path"]["label"] + +save_dir = Path(hyp["Path"]["save"]) / (Path(hyp["Path"]["save"]).name + "_" + realtime) +save_dir.mkdir(parents=True, exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +fh = logging.FileHandler(save_dir / (realtime + ".log"), mode='a') +fh.setLevel(logging.NOTSET) +fh.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) +logging.getLogger('matplotlib.font_manager').disabled = True +logger.info("------------------------------------") +logger.info('hyper_parameters: ' + ', '.join(f'{k}={v}\n' for k, v in hyp.items())) + +# 备份配置 +with open(save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + +# Hyper-parameters +gpu = torch.cuda.is_available() +epochs = hyp["epoch"] +lr = hyp["lr"] +nc = 
hyp["nc"] +bs = hyp["batch_size"] +worker = hyp["number_worker"] +select_sampno = hyp["select_sampno"] + +read_dataset(data_path, augment=my_augment) +calc_metrics = CALC_METRICS(nc) + + +# 开始训练 +# 训练 +def model_train(model, train_loader, optimizer, scheduler, loss_func, training_state): + model.train() + train_loss = 0.0 + optimizer.zero_grad() + + pbar = tqdm(enumerate(train_loader), total=len(train_loader), ncols=80) + pbar.set_description(training_state) + for i, (resp, labels) in pbar: + resp = resp.float().cuda() if gpu else resp.float() + # stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + # 强行归一化数据 + # segments = F.normalize(segments) + # print(segments.size()) + # 减去平均值 + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # 一维卷积在最后一位上卷积 所以输入CNN应为【batch_size, embedding size, sequence size】 + # 所以输入为【batch_size, 1, 3000】 3000 = 30秒 * 100Hz + # segments = segments.view(len(segments), 1, -1) + + out = model(resp) + + loss = loss_func(out, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + # 余弦退火传入变量 + # scheduler.step(epoch + i / len(train_loader.dataset)) + # 自适应调整传入变量 + scheduler.step(loss) + + loss_value = loss.item() + train_loss += loss_value + # cur_lr = optimizer.param_groups[-1]['lr'] + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + # if i % 20 == 0: + # pbar.write(calc_metrics.get_matrix(loss=loss_value, cur_lr=cur_lr, epoch=epoch)) + + cur_lr = optimizer.param_groups[-1]['lr'] + train_loss /= len(train_loader) + + calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") + logger.info(training_state) + logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) + calc_metrics.reset() + + +def model_valid(model, valid_loader, wdir, loss_func): + model.eval() + valid_loss = 0.0 + for resp, labels in valid_loader: + resp = resp.float().cuda() if gpu else resp.float() + # stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + valid_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + valid_loss /= len(valid_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=valid_loss, epoch=epoch, epoch_type="valid")) + global best_f1 + valid_f1 = calc_metrics.metrics[-1].compute() + if valid_f1 > best_f1: + best_f1 = valid_f1 + torch.save(model.state_dict(), wdir / f"best_{epoch}_{str(round(float(valid_f1), 3))}.pt") + torch.save(model.state_dict(), wdir / f"best.pt") + if wandb is not None: + wandb.run.summary["best_f1"] = valid_f1 + calc_metrics.reset() + + +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + # stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with 
torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + +if __name__ == '__main__': + + try: + import wandb + except ImportError: + wandb = None + prefix = 'wandb: ' + logger.info(f"{prefix}Install Weights & Biases logger with 'pip install wandb'") + + if wandb is not None and wandb.run is None: + wandb_run = wandb.init( + config=hyp, + name=save_dir.stem, + project=hyp["project"], + notes=hyp["Note"], + tags=hyp["tags"], + entity=hyp["entity"], + ) + exam_name = Path("./").absolute().name + + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + summary(model_net, [(32, 300, 1)]) + + time.sleep(3) + if gpu: + model_net.cuda() + + k_folds = 5 + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) + + logger.info('--------------------------------') + for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): + logger.info(f'Start FOLD {fold} / {k_folds}----------------------') + train_set = [select_sampno[i] for i in train_ids] + test_set = [select_sampno[i] for i in test_ids] + logger.info(f'Train_Set:{train_set}') + logger.info(f'Independent_Test_Set:{test_set}') + + sub_save_dir = save_dir / f"KFold_{fold}" + sub_save_dir.mkdir(exist_ok=True, parents=True) + wdir = sub_save_dir / "weights" + wdir.mkdir(exist_ok=True, parents=True) + + hyp["train_set"] = train_set + hyp["test_set"] = test_set + with open(sub_save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + + train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) + valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) + + train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) + valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + + # 重新初始化模型 + del model_net + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + if gpu: + model_net.cuda() + + logger.info(f"Weight is {[train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]}") + # 损失函数与优化器 + loss_function = nn.CrossEntropyLoss( + weight=torch.Tensor([train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]).cuda()) + + # loss_func = nn.BCEWithLogitsLoss() + # loss_func = FocalLoss(class_num=nc, alpha=0.75, size_average="sum") + + # momentum + # nesterov 牛顿动量 + # weight_decay L2正则 + # optimizer = torch.optim.SGD(model_net.parameters(), lr=lr, momentum=0.9, nesterov=True, weight_decay=1e-6) + + optimizer = torch.optim.Adam(model_net.parameters(), lr=lr) + # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=int(hyp["T_max"]), + + scheduler = 
torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, + patience=2836, min_lr=1e-8, + verbose=True) + + # 参数记录 + best_f1 = 0 + + for epoch in range(epochs): + model_train(model_net, train_loader, optimizer, scheduler, loss_function, + f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") + model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) + if wandb is not None: + calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/032/model/Hybrid_Net003.py b/exam/032/model/Hybrid_Net003.py new file mode 100644 index 0000000..b28130d --- /dev/null +++ b/exam/032/model/Hybrid_Net003.py @@ -0,0 +1,101 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:andrew +@file:Hybrid_Net003.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/30 +""" + +import os + +import torch +from torch import nn +from torchinfo import summary +from torch import cat + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" + +# 修改激活函数 +# 提高呼吸采样率 + +# 输入时长 +WHOLE_SEGMENT_SECOND = 30 + +# 呼吸采样率 +RESPIRATORY_FRE = 10 + +# BCG 时频图大小 +BCG_GRAPH_SIZE = (26, 121) + + +class HYBRIDNET003(nn.Module): + def __init__(self, num_classes=2, init_weights=True): + super(HYBRIDNET003, self).__init__() + + self.lstm = nn.LSTM(input_size=26, + hidden_size=16, + num_layers=1, + bidirectional=True, + batch_first=True) + + self.right = nn.Sequential( + nn.Conv1d(in_channels=1, out_channels=8, kernel_size=20, + stride=2, padding=10), + nn.GELU(), + + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(8), + + nn.Conv1d(in_channels=8, out_channels=16, kernel_size=3, + stride=1, padding=1), + nn.GELU(), + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(16), + + nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, + stride=1, padding=1), + nn.GELU(), + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(32) + ) + + self.classifier = nn.Sequential( + # nn.Dropout(p=0.5), + nn.Linear(4480, 512), + nn.GELU(), + nn.Linear(512, num_classes), + ) + + if init_weights: + self.initialize_weights() + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') # 何教授方法 + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) # 正态分布赋值 + nn.init.constant_(m.bias, 0) + + def forward(self, x1, x2): + # x1 = x1.view(x1.shape[0], -1, 5) + x1 = self.right(x1) + x2, (_, _) = self.lstm(x2) + + # print(x1.shape) + # print(x2.shape) + x1 = torch.flatten(x1, start_dim=1) + x2 = torch.flatten(x2, start_dim=1) + x = torch.cat((x1, x2), dim=1) + # print(x.shape) + x = self.classifier(x) + return x + + +if __name__ == '__main__': + model = HYBRIDNET003().cuda() + summary(model, [(32, 1, 30 * RESPIRATORY_FRE), (32, 121, 26)]) diff --git a/exam/032/model/Hybrid_Net011.py b/exam/032/model/Hybrid_Net011.py new file mode 100644 index 0000000..2010f0c --- /dev/null +++ b/exam/032/model/Hybrid_Net011.py @@ -0,0 +1,76 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:andrew +@file:Hybrid_Net011.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/10/14 +""" + +import os + +import torch +from torch import nn +from torchinfo import summary +from torch import cat + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" + +# 修改激活函数 +# 提高呼吸采样率 + +# 输入时长 +WHOLE_SEGMENT_SECOND = 30 + +# 
呼吸采样率 +RESPIRATORY_FRE = 10 + +# BCG 时频图大小 +BCG_GRAPH_SIZE = (26, 121) + + +class HYBRIDNET011(nn.Module): + def __init__(self, num_classes=2, init_weights=True): + super(HYBRIDNET011, self).__init__() + + self.lstm = nn.LSTM(input_size=1, + hidden_size=32, + num_layers=1, + bidirectional=True, + batch_first=True) + + self.classifier = nn.Sequential( + # nn.Dropout(p=0.5), + nn.Linear(64, 8), + nn.GELU(), + nn.Linear(8, num_classes), + ) + + if init_weights: + self.initialize_weights() + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') # 何教授方法 + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) # 正态分布赋值 + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x, (_, _) = self.lstm(x) + # print(x.shape) + x = x[:, -1] + + x = torch.flatten(x, start_dim=1) + # print(x.shape) + x = self.classifier(x) + return x + + +if __name__ == '__main__': + model = HYBRIDNET011().cuda() + summary(model, [(32, 300, 1)]) diff --git a/exam/032/my_augment.py b/exam/032/my_augment.py new file mode 100644 index 0000000..067e35a --- /dev/null +++ b/exam/032/my_augment.py @@ -0,0 +1,54 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:my_augment.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/07/26 +""" +from utils.Preprocessing import BCG_Operation +import numpy as np +from scipy.signal import stft + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 + + +def my_augment(dataset): + dataset -= dataset.mean() + dataset = preprocessing.Iirnotch(dataset) + dataset = preprocessing.Butterworth(dataset, "lowpass", low_cut=20, order=6) + dataset_low = preprocessing.Butterworth(dataset, "lowpass", low_cut=0.5, order=4) + # dataset_high = preprocessing.Butterworth(dataset, "highpass", high_cut=1, order=6) + + dataset = dataset_low[::10] + # "high": dataset_high} + return dataset + + +def get_stft(x, fs, n): + print(len(x)) + f, t, amp = stft(x, fs, nperseg=n) + z = np.abs(amp.copy()) + return f, t, z + + +def my_segment_augment(dataset, SP, EP): + dataset_low = dataset["low"][int(SP) * 100:int(EP) * 100].copy() + # dataset_high = dataset["high"][int(SP) * 100:int(EP) * 100].copy() + + dataset_low = dataset_low[::10] + dataset_low = dataset_low.reshape(-1, 1) + + # _, _, dataset_high = stft(dataset_high, 100, nperseg=50) + # dataset_high = dataset_high.astype(np.float).T + # dataset_high = dataset_high.reshape(dataset_high.shape[0], dataset_high.shape[1]) + + # return dataset_low, dataset_high + + return dataset_low + + +if __name__ == '__main__': + pass diff --git a/exam/032/settings.yaml b/exam/032/settings.yaml new file mode 100644 index 0000000..91d9649 --- /dev/null +++ b/exam/032/settings.yaml @@ -0,0 +1,77 @@ +# environment config +GPU: "0" + + +# dataset config +Path: + dataset: /home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/ + label: ./dataset/ + save: ./output/ + +batch_size: 256 +number_worker: 0 +model_name: HYBRIDNET011 +select_sampno: + - 88 + - 220 + - 221 + - 229 + - 282 + - 286 + - 541 + - 579 + - 582 + - 670 + - 671 + - 683 + - 703 + - 704 + - 726 + - 735 + - 736 + - 933 + - 935 + - 952 + - 954 + - 955 + - 960 + - 961 + - 962 + - 966 + - 967 + - 969 + - 971 + - 972 + - 1000 + - 1004 + - 1006 + - 1009 + - 1010 + - 1296 + - 1300 + - 1301 + - 1302 + - 1308 + - 1354 + - 1374 + - 1378 + - 1478 + + +# train hyperparameters config 
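# nc is the class count handed to the torchmetrics wrappers in calc_metrics (binary task -> 1);
# the model head itself still outputs two logits and is trained with a weighted CrossEntropyLoss.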
+epoch: 300 +lr: 0.01 +nc: 1 + +# wandb config +entity: "marques" +project: "Sleep_Apnea_HYBRID00X" +Note: "HYBRID010 RESP " +tags: ["ReduceLROnPlateau", "RESP LSTM"] + + +# "CW":class_weight + + +# "CosineAnnealingLR" +# "ReduceLROnPlateau" \ No newline at end of file diff --git a/exam/032/test_save_result.py b/exam/032/test_save_result.py new file mode 100644 index 0000000..07a1d2f --- /dev/null +++ b/exam/032/test_save_result.py @@ -0,0 +1,478 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:test_analysis.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/21 +""" +import logging +import os +import sys + +import pandas as pd +import torch.cuda +import numpy as np +import yaml +from matplotlib import pyplot as plt +from tqdm import tqdm +from pathlib import Path +from torch.nn import functional as F +from torch.utils.data import DataLoader +from load_dataset import TestApneaDataset2, read_dataset +from utils.Draw_ConfusionMatrix import draw_confusionMatrix +from torch import nn +from utils.calc_metrics import CALC_METRICS +from my_augment import my_augment, my_segment_augment +from model.Hybrid_Net003 import HYBRIDNET003 + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +exam_path = Path("./output/") + +# 置信率阈值 +thresh = 0.5 +# 间隔最小距离 +thresh_event_interval = 2 +# 最小事件长度 +thresh_event_length = 8 + +# +event_thresh = 1 + +severity_path = Path(r"/home/marques/code/marques/apnea/dataset/loc_first_csa.xlsx") +severity_label = {"all": "none"} +severity_df = pd.read_excel(severity_path) +for one_data in severity_df.index: + one_data = severity_df.loc[one_data] + severity_label[str(one_data["数据编号"])] = one_data["程度"] + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" +gpu = torch.cuda.is_available() + +num_classes = 1 +calc_metrics = CALC_METRICS(num_classes) + +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps +data_path = hyp["Path"]["dataset"] +read_dataset(data_path, augment=my_augment) +del hyp + +# 默认取最新的文件夹 +all_output_path, output_path, segments_results_save_path, events_results_save_path, = [None, ] * 4 +my_augment, model_path, label_path, data_path, model, model_name = [None, ] * 6 +train_set, test_set = None, None +loss_func = nn.CrossEntropyLoss() + +columns = ["sampNo", "segmentNo", "label_type", "new_label", "SP", "EP", "pred"] +columns2 = ["sampNo", "severity", "origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", + "acc", "recall", "spec", "pre", "NPV", "F1score", "support"] + +logging.getLogger('matplotlib.font_manager').disabled = True +logging.getLogger('matplotlib.ticker').disabled = True +logger = logging.getLogger() +logger.setLevel(logging.INFO) +ch = logging.StreamHandler() +ch.setLevel(logging.INFO) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) + +if (exam_path / "test.log").exists(): + (exam_path / "test.log").unlink() +fh = logging.FileHandler(exam_path / "test.log", mode='a') +fh.setLevel(logging.INFO) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) +logger.info("------------------------------------") + + +def set_environment(i): + global output_path, segments_results_save_path, events_results_save_path, model_path, label_path, data_path, \ + model, model_name, train_set, test_set + + output_path = all_output_path[i] + logger.info(output_path) + segments_results_save_path = (output_path / "segments_results") + segments_results_save_path.mkdir(exist_ok=True) + 
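    # (Editorial note: the remainder of set_environment re-reads this fold's settings.yaml to
    #  recover data/label paths and the train/test split, then loads weights/best.pt and puts
    #  the model in eval mode before testing.)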
events_results_save_path = (output_path / "events_results") + events_results_save_path.mkdir(exist_ok=True) + + # 加载配置 + with open(output_path / "settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps + data_path = hyp["Path"]["dataset"] + label_path = hyp["Path"]["label"] + train_set = hyp["train_set"] + test_set = hyp["test_set"] + + model_path = output_path / "weights" / "best.pt" + model = eval(hyp["model_name"])() + model_name = hyp["model_name"] + model.load_state_dict(torch.load(model_path)) + model.cuda() + model.eval() + + +def test_and_analysis_and_visual(dataset_type): + if dataset_type == "test": + sampNo = train_set + elif dataset_type == "all_test": + sampNo = test_set + else: + sampNo = None + logger.info("出错了") + + exam_name = Path("./").absolute().name + + test_dataset = TestApneaDataset2(data_path, label_path, select_sampno=sampNo, dataset_type=dataset_type, + segment_augment=my_segment_augment) + test_loader = DataLoader(test_dataset, batch_size=128, pin_memory=True, num_workers=0) + + test_loss = 0.0 + + df_segment = pd.DataFrame(columns=columns) + + for one in tqdm(test_loader, total=len(test_loader)): + resp, stft, labels = one[:3] + other_info = one[3:] + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + out = model(resp, stft) + + loss = loss_func(out, labels) + + test_loss += loss.item() + + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + + calc_metrics.update(out.cpu(), labels.cpu()) + # one[0] = list(one[0].cpu().numpy()) + # one[1] = list(one[1].cpu().numpy()) + # one = one[1:] + # out = out.view(1, -1).cpu().numpy().tolist() + # one += out + # result_record += [i for i in list(np.array(one, dtype=object).transpose(1, 0))] + + one2 = np.array([i.cpu().numpy() for i in (other_info + [out.squeeze()])]) + one2 = one2.transpose((1, 0)) + df = pd.DataFrame(data=one2, columns=columns) + df_segment = df_segment.append(df, ignore_index=True) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(f"EXAM_NAME: {exam_name} SampNO: {sampNo}") + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=0, epoch_type="test")) + calc_metrics.reset() + + df_segment["thresh_label"] = 1 * (df_segment["label_type"] > event_thresh).copy() + df_segment["thresh_Pred"] = 1 * (df_segment["pred"] > thresh).copy() + df_segment["pred"] = df_segment["pred"].copy().apply(lambda x: round(x, 3)) + + # 片段级分析 + df_segment_metrics = analysis_results(df_segment, segments_results_save_path, dataset_type) + + # 绘制混淆矩阵 + # 每个样本都绘制一份 + confusionMatrix(df_segment_metrics, segments_results_save_path, dataset_type) + # 绘制柱状图 + + # 事件级分析 + # 对于inner_test 每个编号就是一个事件 + # 而对于整晚的independence_test,需要另行计算 + df_all_event = segment_to_event(df_segment, dataset_type) + df_event_metrics = analysis_results(df_all_event, events_results_save_path, dataset_type, is_event=True) + confusionMatrix(df_event_metrics, events_results_save_path, dataset_type) + + # 剔除质量不好的样本 + df_bad_segment = df_segment[ + (df_segment["label_type"].isin([2, 3])) & (df_segment["new_label"] == 2)] + df_select_segment = df_segment.drop(df_bad_segment.index) + df_select_segment_metrics = analysis_results(df_select_segment, segments_results_save_path / "remove_2", + dataset_type) + df_select_event = segment_to_event(df_select_segment, dataset_type) + df_event_metrics = analysis_results(df_select_event, 
events_results_save_path / "remove_2", dataset_type, + is_event=True) + + +def analysis_results(df_result, base_path, dataset_type, is_event=False): + if df_result.empty: + logger.info(base_path, dataset_type, "is_empty") + return None + + (base_path / dataset_type).mkdir(exist_ok=True, parents=True) + + all_sampNo = df_result["sampNo"].unique() + df_metrics = pd.DataFrame(columns=columns2) + df_metrics.loc[0] = 0 + df_metrics.loc[0]["sampNo"] = dataset_type + + for index, sampNo in enumerate(all_sampNo): + df = df_result[df_result["sampNo"] == sampNo] + df.to_csv( + base_path / dataset_type / + f"{int(sampNo)}_{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_result.csv", + index=False) + + df_metrics.loc[index + 1] = np.NAN + df_metrics.loc[index + 1]["sampNo"] = str(int(sampNo)) + df_metrics.loc[index + 1]["support"] = df.shape[0] + df_metrics.loc[index + 1]["severity"] = severity_label[str(int(sampNo))] + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # continue + # else: + df_metrics.loc[index + 1]["origin_P"] = df[df["thresh_label"] == 1].shape[0] + df_metrics.loc[index + 1]["origin_N"] = df[df["thresh_label"] == 0].shape[0] + df_metrics.loc[index + 1]["pred_P"] = df[df["thresh_Pred"] == 1].shape[0] + df_metrics.loc[index + 1]["pred_N"] = df[df["thresh_Pred"] == 0].shape[0] + df_metrics.loc[index + 1]["T"] = df[df["thresh_Pred"] == df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["F"] = df[df["thresh_Pred"] != df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["TP"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["FP"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["TN"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + df_metrics.loc[index + 1]["FN"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + + df_metrics.loc[0]["origin_P"] += df_metrics.loc[index + 1]["origin_P"] + df_metrics.loc[0]["origin_N"] += df_metrics.loc[index + 1]["origin_N"] + df_metrics.loc[0]["pred_P"] += df_metrics.loc[index + 1]["pred_P"] + df_metrics.loc[0]["pred_N"] += df_metrics.loc[index + 1]["pred_N"] + df_metrics.loc[0]["T"] += df_metrics.loc[index + 1]["T"] + df_metrics.loc[0]["F"] += df_metrics.loc[index + 1]["F"] + df_metrics.loc[0]["TP"] += df_metrics.loc[index + 1]["TP"] + df_metrics.loc[0]["FP"] += df_metrics.loc[index + 1]["FP"] + df_metrics.loc[0]["TN"] += df_metrics.loc[index + 1]["TN"] + df_metrics.loc[0]["FN"] += df_metrics.loc[index + 1]["FN"] + df_metrics.loc[0]["support"] += df_metrics.loc[index + 1]["support"] + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[index + 1][col] = df_metrics.loc[index + 1][col] if df_metrics.loc[index + 1][ + col] != 0 else np.NAN + + df_metrics.loc[index + 1]["acc"] = df_metrics.iloc[index + 1]["T"] / df_metrics.iloc[index + 1]["support"] + df_metrics.loc[index + 1]["recall"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["origin_P"] + df_metrics.loc[index + 1]["spec"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["origin_N"] + df_metrics.loc[index + 1]["pre"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["pred_P"] + df_metrics.loc[index + 1]["NPV"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["pred_N"] + df_metrics.loc[index + 1]["F1score"] = 2 * 
df_metrics.iloc[index + 1]["recall"] * df_metrics.iloc[index + 1][ + "pre"] / (df_metrics.iloc[index + 1]["recall"] + df_metrics.iloc[index + 1]["pre"]) + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[index + 1][col] = 0 if pd.isna(df_metrics.loc[index + 1][col]) else \ + df_metrics.loc[index + 1][col] + df_metrics.loc[index + 1][col] = round(df_metrics.loc[index + 1][col], 3) + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # return None + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[0][col] = df_metrics.loc[0][col] if df_metrics.loc[0][col] != 0 else np.NAN + + df_metrics.loc[0]["acc"] = df_metrics.iloc[0]["T"] / df_metrics.iloc[0]["support"] + df_metrics.loc[0]["recall"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["origin_P"] + df_metrics.loc[0]["spec"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["origin_N"] + df_metrics.loc[0]["pre"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["pred_P"] + df_metrics.loc[0]["NPV"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["pred_N"] + df_metrics.loc[0]["F1score"] = 2 * df_metrics.iloc[0]["recall"] * df_metrics.iloc[0]["pre"] / ( + df_metrics.iloc[0]["recall"] + df_metrics.iloc[0]["pre"]) + for col in ["TP", "TN", "FP", "FN", "acc", "recall", "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[0][col] = 0 if pd.isna(df_metrics.loc[0][col]) else df_metrics.loc[0][col] + df_metrics.loc[0][col] = round(df_metrics.loc[0][col], 3) + + # 在inner_test中根据 分严重程度绘制 + if dataset_type == "test": + all_severity = ["正常", "轻度", "中度", "重度"] + for index, severity in enumerate(all_severity): + df_event = df_metrics[df_metrics["severity"] == severity] + df_temp = pd.DataFrame(columns=columns2) + df_temp.loc[0] = 0 + df_temp.loc[0]["sampNo"] = severity + df_temp.loc[0]["severity"] = str(index + 1) + + df_temp.loc[0]["origin_P"] += df_event["origin_P"].sum() + df_temp.loc[0]["origin_N"] += df_event["origin_N"].sum() + df_temp.loc[0]["pred_P"] += df_event["pred_P"].sum() + df_temp.loc[0]["pred_N"] += df_event["pred_N"].sum() + df_temp.loc[0]["T"] += df_event["T"].sum() + df_temp.loc[0]["F"] += df_event["F"].sum() + df_temp.loc[0]["TP"] += df_event["TP"].sum() + df_temp.loc[0]["FP"] += df_event["FP"].sum() + df_temp.loc[0]["TN"] += df_event["TN"].sum() + df_temp.loc[0]["FN"] += df_event["FN"].sum() + df_temp.loc[0]["support"] += df_event["support"].sum() + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_temp.loc[0][col] = df_temp.loc[0][col] if df_temp.loc[0][col] != 0 else np.NAN + + df_temp.loc[0]["acc"] = df_temp.iloc[0]["T"] / df_temp.iloc[0]["support"] + df_temp.loc[0]["recall"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["origin_P"] + df_temp.loc[0]["spec"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["origin_N"] + df_temp.loc[0]["pre"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["pred_P"] + df_temp.loc[0]["NPV"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["pred_N"] + df_temp.loc[0]["F1score"] = 2 * df_temp.iloc[0]["recall"] * df_temp.iloc[0]["pre"] / ( + df_temp.iloc[0]["recall"] + df_temp.iloc[0]["pre"]) + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_temp.loc[0][col] = 0 if pd.isna(df_temp.loc[0][col]) else df_temp.loc[0][col] + df_temp.loc[0][col] = round(df_temp.loc[0][col], 3) + + df_metrics = 
df_metrics.append(df_temp, ignore_index=True) + + df_backup = df_metrics + df_metrics = df_metrics.astype("str") + df_metrics = df_metrics.sort_values("severity") + df_metrics.to_csv(base_path / dataset_type / + f"{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_all_metrics.csv", + index=False, encoding="gbk") + + return df_backup + + +def confusionMatrix(df_analysis, base_path, dataset_type): + if df_analysis is None: + logger.info(base_path, dataset_type, "is None") + return + + if df_analysis.empty: + logger.info(base_path, dataset_type, "is_empty") + return + classes = ["normal", "SA"] + (base_path / dataset_type / "confusionMatrix").mkdir(exist_ok=True, parents=True) + for one_samp in df_analysis.index: + one_samp = df_analysis.loc[one_samp] + cm = np.array([[one_samp["TN"], one_samp["FP"]], [one_samp["FN"], one_samp["TP"]]]) + draw_confusionMatrix(cm, classes=classes, title=str(one_samp["severity"]) + " " + one_samp["sampNo"], + save_path=base_path / dataset_type / "confusionMatrix" / f"{one_samp['sampNo']}.jpg") + + +def segment_to_event(df_segment, dataset_type): + df_all_event = pd.DataFrame(columns=columns) + all_sampNo = df_segment["sampNo"].unique() + + if dataset_type == "test": + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + all_segments_no = df["segmentNo"].unique() + + for index_se, segment_No in enumerate(all_segments_no): + df_temp = df[df["segmentNo"] == segment_No].copy() + SP = df_temp.iloc[0]["EP"] + EP = df_temp.iloc[-1]["EP"] + 1 + df_event.loc[index_se] = [int(sampNo), segment_No, df_temp.iloc[0]["label_type"], + df_temp.iloc[0]["new_label"], SP, EP, 0] + + thresh_Pred = df_temp["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + for i in value_ar: + if i > thresh_event_length: + df_event.iloc[index_se]["pred"] = 1 + + # df_event.to_csv(events_results / dataset_type / f"{int(sampNo)}_event_results.csv", index=False, + # encoding="gbk") + df_all_event = df_all_event.append(df_event, ignore_index=True) + else: + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + thresh_Pred = df["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + + for value_index, value in enumerate(value_ar): + SP = index_ar[value_index] 
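The boundary-mask / np.diff idiom used here to turn per-window predictions into events is compact but easy to misread, so here is a self-contained sketch of the same idea on a toy vector (the function name and example are illustrative, not part of the patch). In the patch, positive predictions are first dilated by up to thresh_event_interval - 1 windows so that short gaps merge, and only runs longer than thresh_event_length are kept as events.

import numpy as np

def positive_runs(pred, min_len):
    """Return (start, end) index pairs of runs of 1s in `pred` longer than `min_len`."""
    pred = np.asarray(pred)
    # True wherever a new run begins, with sentinels at both ends
    change = np.concatenate(([True], pred[:-1] != pred[1:], [True]))
    starts = np.where(change)[0]       # start index of every run (0-runs and 1-runs alike)
    lengths = np.diff(starts)          # length of every run
    values = pred[starts[:-1]]         # the value carried by each run
    return [(s, s + n) for s, n, v in zip(starts[:-1], lengths, values) if v == 1 and n > min_len]

# positive_runs([0, 1, 1, 1, 0, 0, 1], min_len=2)  ->  [(1, 4)]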
+ EP = index_ar[value_index] + count_ar[value_index] + # TP, FP + if value > thresh_event_length: + + # label_type = 1 if thresh_Pred2[SP:EP].sum() > 0 else 0 + label_type = df["label_type"][SP:EP].max() + new_label = df["new_label"][SP:EP].max() + df_event = df_event.append(pd.DataFrame([[int(sampNo), SP // 30, label_type, new_label, + SP, EP, thresh_Pred2[SP]]], columns=columns), + ignore_index=True) + # if value > 30: + # logger.info([int(sampNo), SP // 30, label_type, new_label, SP, EP, thresh_Pred2[SP]]) + # 长度不够 + else: + df["thresh_Pred"][SP:EP] = 0 + + # 对负样本进行统计 + for segment_no in df["segmentNo"].unique(): + df_temp = df[df["segmentNo"] == segment_no] + if df_temp["thresh_Pred"].sum() > 0: + continue + + df_event = df_event.append(pd.DataFrame( + [[int(sampNo), segment_no, df_temp["label_type"].max(), df_temp["new_label"].max(), segment_no * 30, + (segment_no + 1) * 30, 0]], columns=columns), + ignore_index=True) + + df_all_event = df_all_event.append(df_event, ignore_index=True) + + df_temp = df_all_event.loc[:, ["label_type", "pred"]] + df_all_event["thresh_label"] = 1 * (df_temp["label_type"] > event_thresh) + df_all_event["thresh_Pred"] = 1 * (df_temp["pred"] > thresh) + return df_all_event + + +# 分sampNo保存结果,并不重合地可视化 +# inner_test + +# 分sampNo将与标签不一致的另行保存,并不重合地可视化 + +# import shap +# explainer = shap.TreeExplainer() +# shap_values = explainer.shap_values() + +if __name__ == '__main__': + all_output_path = list(exam_path.rglob("KFold_*")) + for exam_index, test_exam_path in enumerate(all_output_path): + # test_exam_path = exam_path / test_exam_path + set_environment(exam_index) + test_and_analysis_and_visual(dataset_type="test") + test_and_analysis_and_visual(dataset_type="all_test") diff --git a/exam/032/utils/Draw_ConfusionMatrix.py b/exam/032/utils/Draw_ConfusionMatrix.py new file mode 100644 index 0000000..591af60 --- /dev/null +++ b/exam/032/utils/Draw_ConfusionMatrix.py @@ -0,0 +1,46 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:Draw_ConfusionMatrix.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/08/10 +""" +import numpy as np +from matplotlib import pyplot as plt + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 + + +def draw_confusionMatrix(cm, classes, title, save_path, cmap=plt.cm.Blues): + fig_cm, ax = plt.subplots(figsize=(8, 8), dpi=120) + im = ax.imshow(cm, interpolation='nearest', cmap=cmap) + ax.figure.colorbar(im, ax=ax) + ax.set(xticks=np.arange(cm.shape[1]), + yticks=np.arange(cm.shape[0]), + xticklabels=classes, yticklabels=classes, + title=title, + ylabel='True label', + xlabel='Predicted label') + ax.set_ylim(len(classes) - 0.5, -0.5) + + # Rotate the tick labels and set their alignment. 
+ plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") + normalize = False + fmt = '.2f' if normalize else 'd' + thresh = cm.max() * 0.8 + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(cm[i, j], fmt), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black") + fig_cm.tight_layout() + fig_cm.savefig(save_path) + plt.close() + # + + +if __name__ == '__main__': + pass diff --git a/exam/032/utils/Preprocessing.py b/exam/032/utils/Preprocessing.py new file mode 100644 index 0000000..e3b60c4 --- /dev/null +++ b/exam/032/utils/Preprocessing.py @@ -0,0 +1,181 @@ +# encoding:utf-8 + +""" +@ date: 2020-09-16 +@ author: jingxian +@ illustration: Pre-processing +""" + + +import sys +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import pywt +from scipy import signal +from scipy import fftpack + + +def Dilate(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.min(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def Eorde(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.max(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def fin_turn(data, peak): + if len(data) == 0 or len(peak) == 0: return peak + return_peak = [] + for p in peak: + minx, maxx = max(0, p - 100), min(len(data), p + 100) + return_peak.append(minx + np.argmax(data[minx: maxx])) + return return_peak + + +class BCG_Operation(): + def __init__(self, sample_rate=1000): + self.sample_rate = sample_rate + + def down_sample(self, data=None, down_radio=10): + if data is None: + raise ValueError("data is None, please given an real value!") + data = data[:len(data) // down_radio * down_radio].reshape(-1, down_radio)[:, 0] + self.sample_rate = self.sample_rate / down_radio + return data + + def Splitwin(self, data=None, len_win=None, coverage=1.0, calculate_to_end=False): + """ + 分窗 + :param len_win: length of window + :return: signal windows + """ + if (len_win is None) or (data is None): + raise ValueError("length of window or data is None, please given an real value!") + else: + length = len_win * self.sample_rate # number point of a window + # step of split windows + step = length * coverage + start = 0 + Splitdata = [] + while (len(data) - start >= length): + Splitdata.append(data[int(start):int(start + length)]) + start += step + if calculate_to_end and (len(data) - start > 2000): + remain = len(data) - start + start = start - step + step = int(remain / 2000) + start = start + step * 2000 + Splitdata.append(data[int(start):int(start + length)]) + return np.array(Splitdata), step + elif calculate_to_end: + return np.array(Splitdata), 0 + else: + return np.array(Splitdata) + + def Butterworth(self, data, type, low_cut=0.0, high_cut=0.0, order=10): + """ + :param type: Type of Butter. filter, lowpass, bandpass, ... 
+ :param lowcut: Low cutoff frequency + :param highcut: High cutoff frequency + :param order: Order of filter + :return: Signal after filtering + """ + if type == "lowpass": # 低通滤波处理 + b, a = signal.butter(order, low_cut / (self.sample_rate * 0.5), btype='lowpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "bandpass": # 带通滤波处理 + low = low_cut / (self.sample_rate * 0.5) + high = high_cut / (self.sample_rate * 0.5) + b, a = signal.butter(order, [low, high], btype='bandpass') + return signal.filtfilt(b, a, np.array(data)) + elif type == "highpass": # 高通滤波处理 + b, a = signal.butter(order, high_cut / (self.sample_rate * 0.5), btype='highpass') + return signal.filtfilt(b, a, np.array(data)) + else: # 警告,滤波器类型必须有 + raise ValueError("Please choose a type of fliter") + + def MorphologicalFilter(self, data=None, M=200, get_bre=False): + """ + :param data: Input signal + :param M: Length of structural element + :return: Signal after filter + """ + if not data.any(): + raise ValueError("The input data is None, please given real value data") + g = np.ones(M) + Data_pre = np.insert(data, 0, np.zeros(M)) + Data_pre = np.insert(Data_pre, -1, np.zeros(M)) + # Opening: 腐蚀 + 膨胀 + out1 = Eorde(Data_pre, len(Data_pre), g, M) + out2 = Dilate(out1, len(out1), g, M) + out2 = np.insert(out2, 0, np.zeros(M - 2)) + # Closing: 膨胀 + 腐蚀 + out5 = Dilate(Data_pre, len(Data_pre), g, M) + out6 = Eorde(out5, len(out5), g, M) + out6 = np.insert(out6, 0, np.zeros(M - 2)) + + baseline = (out2 + out6) / 2 + # -------------------------保留剩余价值------------------------ + data_filtered = Data_pre[:len(baseline)] - baseline + data_filtered = data_filtered[M: M + len(data)] + baseline = baseline[M:] + data_filtered[-1] = data_filtered[-2] = data_filtered[-3] + baseline[-1] = baseline[-2] = baseline[-3] + if get_bre: + return data_filtered, baseline + else: + return data_filtered + + def Iirnotch(self, data=None, cut_fre=50, quality=3): + """陷波器""" + b, a = signal.iirnotch(cut_fre / (self.sample_rate * 0.5), quality) + return signal.filtfilt(b, a, np.array(data)) + + def ChebyFilter(self, data, rp=1, type=None, low_cut=0, high_cut=0, order=10): + """ + 切比雪夫滤波器 + :param data: Input signal + :param rp: The maximum ripple allowed + :param type: 'lowpass', 'bandpass, 'highpass' + :param low_cut: Low cut-off fre + :param high_cut: High cut-off fre + :param order: The order of filter + :return: Signal after filter + """ + if type == 'lowpass': + b, a = signal.cheby1(order, rp, low_cut, btype='lowpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif type == 'bandpass': + b, a = signal.cheby1(order, rp, [low_cut, high_cut], btype='bandpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + elif type == 'highpass': + b, a = signal.cheby1(order, rp, high_cut, btype='highpass', fs=self.sample_rate) + return signal.filtfilt(b, a, np.array(data)) + else: + raise ValueError("The type of filter is None, please given the real value!") + + def Envelope(self, data): + """取信号包络""" + if len(data) <= 1: raise ValueError("Wrong input data") + hx = fftpack.hilbert(data) + return np.sqrt(hx ** 2, data ** 2) + + def wavelet_trans(self, data,c_level=['aaa','aad'], wavelet='db4', mode='symmetric',maxlevel=10): + wp = pywt.WaveletPacket(data=data, wavelet=wavelet, mode=mode, maxlevel=maxlevel) + new_wp = pywt.WaveletPacket(data=None, wavelet=wavelet, mode=mode) + for c in c_level : + new_wp[c] = wp[c] + return new_wp.reconstruct() + + # def em_decomposition(self, data): + # from pyhht.emd import 
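A short usage sketch for the BCG_Operation helpers above; the cut-off values are illustrative only, and the class normalises them by the Nyquist frequency (sample_rate * 0.5) while filtfilt gives zero-phase output. One caveat: in Envelope(), np.sqrt(hx ** 2, data ** 2) passes the second array as the ufunc's out= argument, so the function effectively returns |hilbert(x)|; the conventional analytic envelope would be np.sqrt(hx ** 2 + data ** 2).

import numpy as np
from utils.Preprocessing import BCG_Operation

op = BCG_Operation(sample_rate=100)
x = np.random.randn(30 * 100)                              # 30 s of a 100 Hz signal
x = op.Iirnotch(x, cut_fre=50)                             # 50 Hz notch for mains interference
x = op.Butterworth(x, "lowpass", low_cut=20, order=6)      # keep the BCG band
resp = op.Butterworth(x, "lowpass", low_cut=0.5, order=4)  # respiration-band component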
EMD + # return EMD(data).decompose() + diff --git a/exam/032/utils/calc_metrics.py b/exam/032/utils/calc_metrics.py new file mode 100644 index 0000000..dcf78cc --- /dev/null +++ b/exam/032/utils/calc_metrics.py @@ -0,0 +1,84 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:calc_metrics.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/12 +""" +import torch +import torchmetrics + + +class CALC_METRICS: + metrics = [] + nc = 0 + + def __init__(self, nc): + self.nc = nc + self.metrics.append(torchmetrics.Accuracy(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Recall(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Precision(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Specificity(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.F1Score(average="none", num_classes=nc, multiclass=False)) + self.valid_result = self.train_result = None + + def update(self, pred, target): + for part1 in self.metrics: + part1.update(pred.cpu(), target.cpu()) + + def compute(self): + result = [] + for part1 in self.metrics: + result.append(part1.compute()) + + def reset(self): + for part1 in self.metrics: + part1.reset() + + def get_matrix(self, loss=None, cur_lr=None, epoch=None, epoch_type=None): + temp_result = [] + for j in self.metrics: + compute_result = (j.compute().cpu().numpy() * 100).tolist() + temp_result.append(compute_result) + + if epoch_type == "train": + self.train_result = [loss] + temp_result + elif epoch_type == "valid": + self.valid_result = [loss] + temp_result + else: + pass + + a = "" + a += f"{epoch_type} epoch: {str(epoch)} loss: {str(loss)} lr: {str(cur_lr)} \n" + a += " " * 8 + "Acc".center(8) + "Rec".center(8) + "Pre".center(8) + "Spe".center(8) + "F1".center(8) + "\n" + a += "all".center(8) + "".join([str(round(float(i), 2)).center(8) for i in temp_result]) + "\n" + return a + + def wandb_log(self, wandb=None, cur_lr=None): + if wandb is None: + return + + keyword = ["Accuracy", "Recall", "Precision", "Specificity", "F1Score"] + dict_key = [] + for epoch_type in ["train", "valid"]: + dict_key.append(epoch_type + "/" + "loss") + for i in keyword: + dict_key.append(epoch_type + "/" + i) + + log_dict = dict(zip(dict_key, self.train_result + self.valid_result)) + log_dict["lr"] = cur_lr + wandb.log(log_dict) + + +if __name__ == '__main__': + # pred = [[0.1], [0.2], [0.3], [0.4], [0.5], [0.6], [0.7], [0.8], [0.9], [1.0]] + # true = [[0], [0], [1], [0], [0], [0], [0], [0], [0], [1]] + pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] + true = [0, 0, 1, 0, 0, 0, 0, 0, 0, 1] + pred = torch.tensor(pred).cuda() + true = torch.tensor(true).cuda() + calc_metrics = CALC_METRICS(1) + calc_metrics.update(pred, true) + print(calc_metrics.get_matrix()) diff --git a/exam/033/generate_label_A01.py b/exam/033/generate_label_A01.py new file mode 100644 index 0000000..f19b674 --- /dev/null +++ b/exam/033/generate_label_A01.py @@ -0,0 +1,486 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:generate_label_11.0.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/09/05 +""" +# A01 +# 以每三十秒为窗 步进十秒, 每个窗最后十秒内 超过五秒存在暂停则为正样本 + +# 14.0 +# 手动均衡数量 + +# 13.0 +# 限制选择部分数据集,先做测试 + +# 12.0 +# 置不可用事件的片段为上限,不可用片段设置为背景,不记录事件 + +# 10.0 +# 使用提出质量差的信号 + +# 9.0 +# 增加 最新的质量标签 未使用 + +# 8.0 +# 生成 除低通气所有事件标签 + +# 尝试过步进两秒 会造成不足两秒的数据被抛弃,造成较多误判,但是可以考虑囊括这部分 +# 采用 30秒数据 移动 1秒 
将所有呼吸暂停标注为1 低通气为0 正常为0 + +# 预处理操作 为 50Hz陷波滤波器去工频 外加 20Hz的低通滤波器 这个20Hz要看BCG信号的频谱范围 + +# 先提剔除极端值 +# 数值大于最高基准线或最低基准线 +# type1: average:1800 low:1200 high:2400 +# type2: average:2400 low:1800 high:3000 + +# 过多片段会造成平均值偏移 +# TODO +# 加入体动标签,计算除体动外的平均值 + +# 最后降采为100hz + +import time +import logging +import numpy as np +import pandas as pd +from pathlib import Path + +from datetime import datetime + +import yaml +from pathos import multiprocessing +from tqdm import tqdm + +# 数据集 和 标签 位置 +bcg_numpy_data_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/") +bcg_label_path = Path(r"/home/marques/code/marques/apnea/dataset/BCG_label_0616/") + +# BCG 记录开始时间 +bcg_start_time = np.loadtxt(Path(r"/home/marques/code/marques/apnea/dataset/start_time.csv"), delimiter=', ', + dtype=object) +bcg_start_time = dict(zip(bcg_start_time[:, 0], bcg_start_time[:, 1])) + +# 读取每个数据集路径 +all_numpy_dataset = list(bcg_numpy_data_path.rglob("*.npy")) +all_numpy_dataset.sort() + +# 划分后的数据集保存路径 +# dataset_save_path = Path(r"/home/marques/code/marques/apnea/dataset/dataset/dataset0623_300_30_30/") +dataset_save_path = Path(r"./dataset/") +dataset_save_path.mkdir(exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) +fh = logging.FileHandler(dataset_save_path / (realtime + ".log"), mode='w') +fh.setLevel(logging.NOTSET) +# fh.setFormatter(logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(ch) + +# all_label = [] +# 输出统计数据标题栏 +# logger.info("sampNo".center(8) + +# "hpy_num".center(8) + "hpy_time".center(10) + +# "csa_num".center(8) + "csa_time".center(10) + +# "osa_num".center(8) + "osa_time".center(10) + +# "msa_num".center(8) + "msa_time".center(10) +# ) + +logger.info("sampNo".center(8) + ',' + + "train_num".center(10) + ',' + "train_P".center(10) + ',' + "train_N".center(10) + ',' + + "valid_num".center(10) + ',' + "valid_P".center(10) + ',' + "valid_N".center(10) + ',' + + "test_num".center(10) + ',' + "test_P".center(10) + ',' + "test_N".center(10) + ',' + + "train_eve".center(10) + ',' + "valid_eve".center(10) + ',' + "test_eve".center(10) + ) + +base_random_seed = 42 + +window_second = 30 +step_second = 10 +view_apnea_length = 10 +apnea_length_threshold = 5 +valid_ratio = 0.1 +test_ratio = 0.1 + +normal_event_quality_label = 0 +# valid_ratio = 5000 +# test_ratio = 10000 + +assert ((valid_ratio + test_ratio) < 1 and 0 < valid_ratio < 1 and 0 < test_ratio < 1) or ( + valid_ratio > 1 and valid_ratio > 1), "验证集与测试集输入应同时为比例或数量" + +# dataset sampNo for test +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) +select_dataset = hyp["select_sampno"] + +# 需要置成0的片段,前面不一定补零,还有可能上万 +disable_segment = { + '221': [[0, 10000]], + '670': [[0, 20000]], + '683': [[0, 20000]], + '704': [[0, 26000]], + '726': [[0, 20000]], + '736': [[0, 47000]], + '933': [[0, 773560]], + '935': [[0, 26600]], + '952': [[0, 17000]], + '955': [[0, 78000]], + '961': [[0, 107000]], + '962': [[0, 15100]], + '966': [[0, 13120]], + '967': [[0, 44000]], + '1006': [[0, 60000]], + '1009': [[0, 1000]], + '1010': [[0, 49000]], + '1296': [[0, 27000]], + '1300': [[0, 33800]], + '1301': [[0, 14000]], + '1302': [[0, 5600]], + '1374': [[0, 1000]], + '1478': [[0, 998000]], + +} + 
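The A01 rule from the header comments (30 s window, 10 s step, positive when at least apnea_length_threshold seconds of the final view_apnea_length seconds are apnea) can be stated directly on the per-second event vector that generate_label builds below; this sketch is illustrative only. Note that the window loop in generate_label applies the test to select_numpy[start:end] using second-based indices, while the per-second event_list is presumably the array that check is meant to read.

import numpy as np

def window_is_positive(event_per_sec, sp, window_second=30,
                       view_apnea_length=10, apnea_length_threshold=5):
    """True when at least `apnea_length_threshold` of the last `view_apnea_length` seconds
    of the window starting at second `sp` carry an apnea code (2 = CSA, 3 = OSA/MSA, i.e. > 1)."""
    tail = event_per_sec[sp + window_second - view_apnea_length: sp + window_second]
    return np.sum(tail > 1) >= apnea_length_threshold

# events = np.zeros(3600); events[100:130] = 3   # a 30 s OSA starting at second 100
# [window_is_positive(events, sp) for sp in (80, 90, 100, 110)]  ->  [True, True, True, False]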
+ +# 生成数据集主函数 +def generate_label(No, dataset_path): + """ + :param dataset_path: 数据集路径 + :return: + """ + + # 获取数据编号 + sampNo = dataset_path.stem.split("samp")[0] + # 标签路径 + label_path = bcg_label_path / f"export{sampNo}_all.csv" + + if not label_path.exists(): + raise FileNotFoundError(f"{label_path} not exist") + + if not dataset_path.exists(): + raise Exception(f"{dataset_path} not exists") + + # 加载数据集 + select_numpy = np.load(dataset_path) + + + # 开头不合理片段置零 + if sampNo in disable_segment.keys(): + for sp, ep in disable_segment[sampNo]: + select_numpy[sp:ep] = 0 + + # 剔除质量差信号 + if sampNo == "670": + select_numpy = select_numpy[:17195 * 100] + + # 获取前面补了多少0 + not_zero_point = 0 + for num in select_numpy: + if num > 10: + break + not_zero_point += 1 + not_zero_point //= 100 + + # 读取标签 + label_csv = pd.read_csv(label_path, encoding='gbk') + label_csv["new_label"] = label_csv["new_label"].fillna("2") + + label_csv["new_start"] = label_csv["new_start"].astype("int") + label_csv["new_end"] = label_csv["new_end"].astype("int") + label_csv["Duration"] = label_csv["Duration"].astype("int") + label_csv["new_label"] = label_csv["new_label"].astype("int") + # 剔除质量不好的样本 + # drop_csv = label_csv[ + # (label_csv["Event type"].isin(["Central apnea", "Obstructive apnea"])) & (label_csv["new_label"] == 2)] + # label_csv = label_csv.drop(drop_csv.index) + + # 事件片段与背景片段, 每个背景长度均为设定窗长 + segment_labels = [] + negative_labels = [] + hpy_num = csa_num = osa_num = msa_num = 0 + hpy_time = csa_time = osa_time = msa_time = 0 + select_numpy_len = len(select_numpy) // 100 + event_list = np.zeros(select_numpy_len) + quality_list = np.zeros(select_numpy_len) + + # 遍历全部事件并统计 + for i in range(len(label_csv)): + # 进行LabelEncoder + label = label_csv.iloc[i, :] + # 如果事件在补零片段,则不添加到事件列表 + if label["new_end"] < not_zero_point: + continue + + if sampNo == "670" and label["new_start"] > 17195: + continue + + if label["new_end"] - label["new_start"] < 10: + print(label.to_numpy()) + continue + + # 将事件添加到事件列表 + if label["Event type"] == "Hypopnea": + label_type = 1 + hpy_num += 1 + hpy_time += label["new_end"] - label["new_start"] + + # 将低通气添加到背景 好像不用专门加入到负样本事件中? 
+ # negative_labels.append( + # [sampNo, i, label_type, normal_event_quality_label, label["new_start"], label["new_end"]]) + continue + elif label["Event type"] == "Central apnea": + label_type = 2 + csa_num += 1 + csa_time += label["new_end"] - label["new_start"] + elif label["Event type"] == "Obstructive apnea": + label_type = 3 + osa_num += 1 + osa_time += label["new_end"] - label["new_start"] + # MSA 认为是OSA + elif label["Event type"] == "Mixed apnea": + label_type = 3 + msa_num += 1 + msa_time += label["new_end"] - label["new_start"] + else: + continue + # label_type = 0 + if label["new_end"] - label["new_start"] > label["Duration"] + 20: + print(sampNo, label.to_numpy()) + + # 格式为 样本编号 第几个事件 标签 开始事件 结束事件 + event_list[label["new_start"]: label["new_end"]] = label_type + quality_list[label["new_start"]: label["new_end"]] = label["new_label"] + # segment_labels.append([sampNo, i, label_type, label["new_label"], label["new_start"], label["new_end"]]) + + # logger.info(sampNo.center(8) + + # str(hpy_num).center(8) + str(hpy_time).center(10) + + # str(csa_num).center(8) + str(csa_time).center(10) + + # str(osa_num).center(8) + str(osa_time).center(10) + + # str(msa_num).center(8) + str(msa_time).center(10)) + + # 设置随机树种子 + random_seed = base_random_seed + int(sampNo) + + for SP in range(not_zero_point, select_numpy_len - window_second, step_second): + start = SP + window_second - view_apnea_length + end = SP + window_second + i = SP // 30 + if sum((select_numpy[start:end] > 1)) >= apnea_length_threshold: + segment_labels.append( + [sampNo, i, event_list[start:end].max(), quality_list[start:end].max(), SP, SP + window_second]) + else: + negative_labels.append( + [sampNo, i, event_list[start:end].max(), quality_list[start:end].max(), SP, SP + window_second] + ) + + # 对于测试数据全部直接保存 + if int(sampNo) in select_dataset: + test_label = [] + # 分成指定窗长的滑窗片段 + test_label = segment_labels + negative_labels + + logger.info(sampNo.center(8) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(0).center(10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center(10) + + ',' + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + str(0).center(10) + ',' + str(0).center(10) + ',' + str(len(segment_labels)).center(10) + ) + + df2.loc[No] = [sampNo, + str(0), str(0), str(0), + str(0), str(0), str(0), + str(len(test_label)), + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(0), str(0), str(len(segment_labels))] + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_all_{window_second}s_sa_test2_label.npy", + # np.array(test_label)) + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_all_label.csv", + index=False) + + train_label, valid_label, test_label = [], [], [] + # 对于训练与验证集样本 + if True: + # 打乱片段顺序 + np.random.seed(random_seed) + np.random.shuffle(segment_labels) + np.random.shuffle(negative_labels) + + # 获取训练集、验证集、测试集分到事件个数 + if 0 < valid_ratio < 1: + train_segment_num = int(len(segment_labels) * (1 - valid_ratio - test_ratio)) + 
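One detail worth flagging in this split: valid_segment_num just below is computed as a cumulative cut point, int(len(segment_labels) * (1 - test_ratio)), yet it is then added to train_segment_num when slicing, so the validation slice can overrun the list and the held-out event slice comes out empty. A count-based convention that keeps the slices and the *_eve tallies consistent might look like this (sketch only, reusing the patch's names):

n = len(segment_labels)
train_n = int(n * (1 - valid_ratio - test_ratio))
valid_n = int(n * valid_ratio)
train_events = segment_labels[:train_n]
valid_events = segment_labels[train_n:train_n + valid_n]
test_events = segment_labels[train_n + valid_n:]
# and, consistently: train_eve, valid_eve, test_eve = train_n, valid_n, n - train_n - valid_n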
valid_segment_num = int(len(segment_labels) * (1 - test_ratio)) + else: + train_segment_num = len(segment_labels) - valid_ratio - test_ratio + valid_segment_num = valid_ratio + + train_label = segment_labels[:train_segment_num] + valid_label = segment_labels[train_segment_num:train_segment_num + valid_segment_num] + test_label = segment_labels[train_segment_num + valid_segment_num:] + + # 计算片段和事件个数 + train_num, valid_num, test_num = len(train_label), len(valid_label), len(test_label) + train_eve, valid_eve, test_eve = train_segment_num, (valid_segment_num - train_segment_num), ( + len(segment_labels) - valid_segment_num) + + # 数据集补偿 + # if train_num < 300: + # train_num = 300 - train_num + # + # if valid_num < 300: + # valid_num = 300 - valid_num + # + # if test_num < 300: + # test_num = 300 - test_num + + # 获取训练集、验证集、测试集分到背景个数 + if 0 < valid_ratio < 1: + train_eve2 = int(len(negative_labels) * (1 - valid_ratio - test_ratio)) + valid_eve2 = int(len(negative_labels) * valid_ratio) + else: + train_eve2 = len(negative_labels) - valid_ratio - test_ratio + valid_eve2 = valid_ratio + + test_eve2 = len(negative_labels) - train_eve2 - valid_eve2 + # # 直接补充到足够个数的背景事件 + # train_eve2 = max(train_eve, 300) + # valid_eve2 = max(valid_eve, 40) + # test_eve2 = max(test_eve, 40) + + # 强制背景数量 + # train_eve2 = train_eve + # valid_eve2 = valid_eve + # test_eve2 = test_eve + + # 添加背景事件数量 + train_label += negative_labels[:train_eve2] + valid_label += negative_labels[train_eve2: train_eve2 + valid_eve2] + test_label += negative_labels[train_eve2 + valid_eve2:] + + logger.info(sampNo.center(8) + ',' + + str(len(train_label)).center(10) + ',' + + str(sum(np.array(train_label).astype('float64')[:, 2].astype(int) > 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(train_label).astype('float64')[:, 2].astype(int) < 1) if len(train_label) != 0 else 0).center( + 10) + ',' + + str(len(valid_label)).center(10) + ',' + + str(sum(np.array(valid_label).astype('float64')[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(valid_label).astype('float64')[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0).center( + 10) + ',' + + str(len(test_label)).center(10) + ',' + + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) > 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) < 1) if len(test_label) != 0 else 0).center( + 10) + ',' + + str(train_eve).center(10) + ',' + str(valid_eve).center(10) + ',' + str(test_eve).center(10) + ) + + df2.loc[No] = [sampNo.center(8), + str(len(train_label)), + str(sum(np.array(train_label).astype('float64')[:, 2].astype(int) > 1) if len(train_label) != 0 else 0), + str(sum(np.array(train_label).astype('float64')[:, 2].astype(int) < 1) if len(train_label) != 0 else 0), + str(len(valid_label)), + str(sum(np.array(valid_label).astype('float64')[:, 2].astype(int) > 1) if len(valid_label) != 0 else 0), + str(sum(np.array(valid_label).astype('float64')[:, 2].astype(int) < 1) if len(valid_label) != 0 else 0), + str(len(test_label)), + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) > 1) if len(test_label) != 0 else 0), + str(sum(np.array(test_label).astype('float64')[:, 2].astype(int) < 1) if len(test_label) != 0 else 0), + str(train_eve), str(valid_eve), str(test_eve).center(10)] + + def label_check(label_list): + temp_list = [] + for sampNo, index, label_type, new_label, SP, EP in label_list: + if EP - SP < window_second: + 
print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if SP < 0: + print(sampNo, index, label_type, SP, EP) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, SP, EP, len(select_numpy[SP * 100:EP * 100])) + temp_list.append([sampNo, index, label_type, new_label, SP, EP]) + + for j in temp_list: + label_list.remove(j) + + label_check(train_label) + label_check(valid_label) + label_check(test_label) + for sampNo, index, label_type, new_label, SP, EP in train_label: + if EP - SP < window_second: + print(sampNo, index, label_type, new_label, SP, EP) + + if SP < 0: + print(sampNo, index, label_type, new_label, SP, EP) + + if len(select_numpy[SP * 100:EP * 100]) != window_second * 100: + print(sampNo, index, label_type, new_label, SP, EP, len(select_numpy[SP * 100:EP * 100])) + + df1 = pd.DataFrame(data=train_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.csv", + index=False) + + df1 = pd.DataFrame(data=valid_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.csv", + index=False) + + df1 = pd.DataFrame(data=test_label, + columns=["sampNo", "index", "label_type", "new_label", "SP", "EP"]) + df1.to_csv(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.csv", index=False) + + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_train_label.npy", + # np.array(train_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_valid_label.npy", + # np.array(valid_label)) + # np.save(dataset_save_path / f"{sampNo}_{step_second}s_focal_{window_second}s_sa_test_label.npy", + # np.array(test_label)) + + +if __name__ == '__main__': + # pool = multiprocessing.Pool(processes=44) + # pool.map(generate_label, list(all_numpy_dataset)) + # pool.close() + # pool.join() + + df2 = pd.DataFrame(data=None, + columns=["sampNo", + "train_num", "train_P", "train_N", + "valid_num", "valid_P", "valid_N", + "test_num", "test_P", "test_N", + "train_eve", "valid_eve", "test_eve"]) + + temp = [] + for one_dataset in all_numpy_dataset: + if int(one_dataset.stem.split("samp")[0]) in [*select_dataset]: + temp.append(one_dataset) + # for one_dataset in temp: + # all_numpy_dataset.remove(one_dataset) + + for No, one_dataset in enumerate(temp): + generate_label(No, one_dataset) + + df2.to_csv(dataset_save_path / (realtime + ".csv"), index=False) + # generate_label(all_numpy_dataset[0]) diff --git a/exam/033/load_dataset.py b/exam/033/load_dataset.py new file mode 100644 index 0000000..99806f8 --- /dev/null +++ b/exam/033/load_dataset.py @@ -0,0 +1,178 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:load_dataset.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2021/12/03 +""" +import sys +from pathlib import Path +import pandas as pd +import numpy as np +import torch.utils.data +from torch.utils.data import Dataset +from tqdm import tqdm +from utils.Preprocessing import BCG_Operation + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 +""" +1. 
读取方法 +# 无论是否提前切分,均提前转成npy格式 +# 1.1 提前预处理,切分好后生成npy,直接载入切分好的片段 内存占用多 读取简单 +使用此方法: 1.2 提前预处理,载入整夜数据,切分好后生成csv或xls,根据片段读取 内存占用少 读取较为复杂 +""" + +datasets = {} + + +# 减少重复读取 +def read_dataset(data_path, augment=None): + data_path = Path(data_path) + try: + f = [] + if data_path.is_dir(): + dataset_list = list(data_path.rglob("*.npy")) + dataset_list.sort() + f += dataset_list + elif data_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{data_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {data_path}: {e} \n') + + print("loading dataset") + for i in tqdm(f): + select_dataset = np.load(i) + select_dataset = preprocessing.Butterworth(select_dataset, "lowpass", low_cut=20, order=3) + if augment is not None: + select_dataset = augment(select_dataset) + datasets[i.name.split("samp")[0]] = select_dataset + + +# 用第二种方法读取 +class ApneaDataset(Dataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + self.data_path = data_path + self.label_path = label_path + self.segment_augment = segment_augment + self.labels = None + self.dataset_type = dataset_type + self.select_sampNo = select_sampno + + # self._getAllData() + self._getAllLabels() + + def __getitem__(self, index): + # PN patience number + # SP/EP start point, end point + # temp_label.append([sampNo, label[-1], i, hpy_num, csa_num, osa_num, mean_low, flow_low]) + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1)) + else: + raise Exception(f'dataset read failure!') + + def count_SA(self): + return sum(self.labels[:, 3] > 1) + + def __len__(self): + return len(self.labels) + + def _getAllLabels(self): + label_path = Path(self.label_path) + if not label_path.exists(): + raise Exception(f'{self.label_path} does not exist') + + try: + f = [] + if label_path.is_dir(): + if self.dataset_type == "train": + label_list = list(label_path.rglob("*_train_label.csv")) + elif self.dataset_type == "valid": + label_list = list(label_path.rglob("*_valid_label.csv")) + elif self.dataset_type == "test": + label_list = list(label_path.glob("*_sa_test_label.csv")) + # label_list = list(label_path.rglob("*_test_label.npy")) + elif self.dataset_type == "all_test": + label_list = list(label_path.rglob("*_sa_all_label.csv")) + else: + raise ValueError("self.dataset type error") + # label_list = list(label_path.rglob("*_label.npy")) + label_list.sort() + f += label_list + elif label_path.is_file(): + raise Exception(f'dataset path should be a dir') + else: + raise Exception(f'{self.label_path} does not exist') + except Exception as e: + raise Exception(f'Error loading data from {self.label_path}: {e} \n') + print("loading labels") + for i in tqdm(f): + if int(i.name.split("_")[0]) not in self.select_sampNo: + continue + + if self.labels is None: + self.labels = pd.read_csv(i).to_numpy(dtype=int) + else: + labels = pd.read_csv(i).to_numpy(dtype=int) + if len(labels) > 0: + self.labels = np.concatenate((self.labels, labels)) + # self.labels = self.labels[:10000] + print(f"{self.dataset_type} length is {len(self.labels)}") + + +class TestApneaDataset(ApneaDataset): + def __init__(self, data_path, label_path, dataset_type, select_sampno, segment_augment=None): + 
super(TestApneaDataset, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + select_sampno=select_sampno, + segment_augment=segment_augment + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + segment = datasets[str(PN)][int(SP) * 100:int(EP) * 100].copy() + if self.segment_augment is not None: + segment = self.segment_augment(segment) + return segment, int(float(label_type) > 1), PN, segmentNo, SP, EP + else: + raise Exception(f'dataset read failure!') + + +class TestApneaDataset2(ApneaDataset): + def __init__(self, data_path, label_path, select_sampno, dataset_type, segment_augment=None): + super(TestApneaDataset2, self).__init__( + data_path=data_path, + label_path=label_path, + dataset_type=dataset_type, + segment_augment=segment_augment, + select_sampno=select_sampno + ) + + def __getitem__(self, index): + PN, segmentNo, label_type, new_label, SP, EP = self.labels[index] + # PN, label, SP, EP, hpy_num, csa_num, osa_num, mean_low, flow_low = self.labels[index] + + if isinstance(datasets, dict): + dataset = datasets[str(PN)] + segment = self.segment_augment(dataset, SP, EP) + return (*segment, int(float(label_type) > 1), PN, segmentNo, label_type, new_label, SP, EP) + else: + raise Exception(f'dataset read failure!') + + +if __name__ == '__main__': + pass diff --git a/exam/033/main.py b/exam/033/main.py new file mode 100644 index 0000000..eab38c1 --- /dev/null +++ b/exam/033/main.py @@ -0,0 +1,289 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@time:2021/10/15 +""" +import os + +import yaml +import logging +from pathlib import Path + +import time +from torch.nn import functional as F +from torch.utils.data import DataLoader +import torch.cuda +from tqdm import tqdm +from torchinfo import summary +from load_dataset import ApneaDataset, read_dataset + +from torch import nn +from utils.calc_metrics import CALC_METRICS +from sklearn.model_selection import KFold +from model.Hybrid_Net010 import HYBRIDNET010 +# from utils.LossFunction import Foca1lLoss +from my_augment import my_augment, my_segment_augment + +# 加载配置 +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) + +os.environ["CUDA_VISIBLE_DEVICES"] = hyp["GPU"] +os.environ["WANDB_MODE"] = "dryrun" + +realtime = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) + +# 读取地址参数 +data_path = hyp["Path"]["dataset"] +label_path = hyp["Path"]["label"] + +save_dir = Path(hyp["Path"]["save"]) / (Path(hyp["Path"]["save"]).name + "_" + realtime) +save_dir.mkdir(parents=True, exist_ok=True) + +# 设置日志 +logger = logging.getLogger() +logger.setLevel(logging.NOTSET) +fh = logging.FileHandler(save_dir / (realtime + ".log"), mode='a') +fh.setLevel(logging.NOTSET) +fh.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.NOTSET) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) +logging.getLogger('matplotlib.font_manager').disabled = True +logger.info("------------------------------------") +logger.info('hyper_parameters: ' + ', '.join(f'{k}={v}\n' for k, v in hyp.items())) + +# 备份配置 +with open(save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + +# Hyper-parameters +gpu = torch.cuda.is_available() +epochs = hyp["epoch"] +lr = hyp["lr"] +nc = hyp["nc"] 
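Before the training loops below, a quick look at what one batch from ApneaDataset yields in this exam may help; the sample numbers and batch size are illustrative, and the shapes follow my_segment_augment, which keeps only the low-passed respiration channel and downsamples the 30 s window from 100 Hz to 10 Hz.

# Illustrative sketch: 88 and 220 are two of the sampNos listed in settings.yaml, and
# read_dataset(data_path, augment=my_augment) must already have been run (as done just below).
demo_set = ApneaDataset(data_path, label_path, [88, 220], "train", my_segment_augment)
demo_loader = DataLoader(demo_set, batch_size=32, shuffle=True)
resp, labels = next(iter(demo_loader))
# resp.shape -> (32, 300, 1): 30 s * 10 Hz, one channel, matching summary(model_net, [(32, 300, 1)])
# labels     -> 0/1 tensor, where 1 means label_type > 1 (CSA / OSA / MSA)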
+bs = hyp["batch_size"] +worker = hyp["number_worker"] +select_sampno = hyp["select_sampno"] + +read_dataset(data_path, augment=my_augment) +calc_metrics = CALC_METRICS(nc) + + +# 开始训练 +# 训练 +def model_train(model, train_loader, optimizer, scheduler, loss_func, training_state): + model.train() + train_loss = 0.0 + optimizer.zero_grad() + + pbar = tqdm(enumerate(train_loader), total=len(train_loader), ncols=80) + pbar.set_description(training_state) + for i, (resp, labels) in pbar: + resp = resp.float().cuda() if gpu else resp.float() + # stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + # 强行归一化数据 + # segments = F.normalize(segments) + # print(segments.size()) + # 减去平均值 + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # 一维卷积在最后一位上卷积 所以输入CNN应为【batch_size, embedding size, sequence size】 + # 所以输入为【batch_size, 1, 3000】 3000 = 30秒 * 100Hz + # segments = segments.view(len(segments), 1, -1) + + out = model(resp) + + loss = loss_func(out, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + # 余弦退火传入变量 + # scheduler.step(epoch + i / len(train_loader.dataset)) + # 自适应调整传入变量 + scheduler.step(loss) + + loss_value = loss.item() + train_loss += loss_value + # cur_lr = optimizer.param_groups[-1]['lr'] + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + # if i % 20 == 0: + # pbar.write(calc_metrics.get_matrix(loss=loss_value, cur_lr=cur_lr, epoch=epoch)) + + cur_lr = optimizer.param_groups[-1]['lr'] + train_loss /= len(train_loader) + + calc_metrics.compute() + logger.info("") + logger.info("--------------------------------------") + logger.info(training_state) + logger.info(calc_metrics.get_matrix(loss=train_loss, epoch=epoch, epoch_type="train", cur_lr=cur_lr)) + calc_metrics.reset() + + +def model_valid(model, valid_loader, wdir, loss_func): + model.eval() + valid_loss = 0.0 + for resp, labels in valid_loader: + resp = resp.float().cuda() if gpu else resp.float() + # stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + # segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + valid_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + valid_loss /= len(valid_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=valid_loss, epoch=epoch, epoch_type="valid")) + global best_f1 + valid_f1 = calc_metrics.metrics[-1].compute() + if valid_f1 > best_f1: + best_f1 = valid_f1 + torch.save(model.state_dict(), wdir / f"best_{epoch}_{str(round(float(valid_f1), 3))}.pt") + torch.save(model.state_dict(), wdir / f"best.pt") + if wandb is not None: + wandb.run.summary["best_f1"] = valid_f1 + calc_metrics.reset() + + +def model_test(model, test_loader, loss_func): + model.eval() + test_loss = 0.0 + for resp, labels in test_loader: + resp = resp.float().cuda() if gpu else resp.float() + # stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + 
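A small consistency note for model_valid above and model_test here: both apply F.softmax to the network output before handing it to loss_func, but nn.CrossEntropyLoss expects raw logits (it applies log-softmax internally), so the validation and test losses end up on a different scale from the training loss; the metric update is unaffected because it uses the softmaxed out[:, 1] either way. Computing the loss on the logits and softmaxing only for the metrics would keep the scales comparable, e.g.:

out = model(resp)
loss = loss_func(out, labels)        # CrossEntropyLoss on raw logits
out = F.softmax(out, dim=1)          # probabilities only for the metric update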
# segments = F.normalize(segments) + # segments = segments - torch.mean(segments, dim=1).view(-1, 1) + # segments = F.normalize(segments - torch.mean(segments, dim=1).view(-1, 1)) + # segments = segments.view(len(segments), 1, -1) + + out = model(resp) + out = F.softmax(out, dim=1) + loss = loss_func(out, labels) + + test_loss += loss.item() + labels = torch.unsqueeze(labels, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + calc_metrics.update(out.cpu(), labels.cpu()) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=epoch, epoch_type="test")) + calc_metrics.reset() + + +if __name__ == '__main__': + + try: + import wandb + except ImportError: + wandb = None + prefix = 'wandb: ' + logger.info(f"{prefix}Install Weights & Biases logger with 'pip install wandb'") + + if wandb is not None and wandb.run is None: + wandb_run = wandb.init( + config=hyp, + name=save_dir.stem, + project=hyp["project"], + notes=hyp["Note"], + tags=hyp["tags"], + entity=hyp["entity"], + ) + exam_name = Path("./").absolute().name + + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + summary(model_net, [(32, 300, 1)]) + + time.sleep(3) + if gpu: + model_net.cuda() + + k_folds = 5 + kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42) + + logger.info('--------------------------------') + for fold, (train_ids, test_ids) in enumerate(kfold.split(select_sampno)): + logger.info(f'Start FOLD {fold} / {k_folds}----------------------') + train_set = [select_sampno[i] for i in train_ids] + test_set = [select_sampno[i] for i in test_ids] + logger.info(f'Train_Set:{train_set}') + logger.info(f'Independent_Test_Set:{test_set}') + + sub_save_dir = save_dir / f"KFold_{fold}" + sub_save_dir.mkdir(exist_ok=True, parents=True) + wdir = sub_save_dir / "weights" + wdir.mkdir(exist_ok=True, parents=True) + + hyp["train_set"] = train_set + hyp["test_set"] = test_set + with open(sub_save_dir / 'settings.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + + train_dataset = ApneaDataset(data_path, label_path, train_set, "train", my_segment_augment) + valid_dataset = ApneaDataset(data_path, label_path, train_set, "valid", my_segment_augment) + test_dataset = ApneaDataset(data_path, label_path, train_set, "test", my_segment_augment) + + train_loader = DataLoader(train_dataset, batch_size=bs, pin_memory=True, num_workers=worker, shuffle=True) + valid_loader = DataLoader(valid_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + test_loader = DataLoader(test_dataset, batch_size=bs, pin_memory=True, num_workers=worker) + + # 重新初始化模型 + del model_net + model_net = eval(hyp["model_name"])() + model_net.initialize_weights() + if gpu: + model_net.cuda() + + logger.info(f"Weight is {[train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]}") + # 损失函数与优化器 + loss_function = nn.CrossEntropyLoss( + weight=torch.Tensor([train_dataset.count_SA() / (len(train_dataset) - train_dataset.count_SA()), 1]).cuda()) + + # loss_func = nn.BCEWithLogitsLoss() + # loss_func = FocalLoss(class_num=nc, alpha=0.75, size_average="sum") + + # momentum + # nesterov 牛顿动量 + # weight_decay L2正则 + # optimizer = torch.optim.SGD(model_net.parameters(), lr=lr, momentum=0.9, nesterov=True, weight_decay=1e-6) + + optimizer = torch.optim.Adam(model_net.parameters(), lr=lr) + # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=int(hyp["T_max"]), + + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', 
factor=0.5, + patience=2836, min_lr=1e-8, + verbose=True) + + # 参数记录 + best_f1 = 0 + + for epoch in range(epochs): + model_train(model_net, train_loader, optimizer, scheduler, loss_function, + f"EXAM:{exam_name} FOLD:{fold}/{k_folds} EPOCH:{epoch}/{epochs}") + model_valid(model_net, valid_loader, wdir, loss_function) + model_test(model_net, test_loader, loss_function) + if wandb is not None: + calc_metrics.wandb_log(wandb=wandb, cur_lr=optimizer.param_groups[-1]['lr']) diff --git a/exam/033/model/Hybrid_Net011.py b/exam/033/model/Hybrid_Net011.py new file mode 100644 index 0000000..6a6bbbf --- /dev/null +++ b/exam/033/model/Hybrid_Net011.py @@ -0,0 +1,91 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:andrew +@file:Hybrid_Net011.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/10/14 +""" + +import os + +import torch +from torch import nn +from torchinfo import summary +from torch import cat + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" + +# 修改激活函数 +# 提高呼吸采样率 + +# 输入时长 +WHOLE_SEGMENT_SECOND = 30 + +# 呼吸采样率 +RESPIRATORY_FRE = 10 + +# BCG 时频图大小 +BCG_GRAPH_SIZE = (26, 121) + + +class HYBRIDNET010(nn.Module): + def __init__(self, num_classes=2, init_weights=True): + super(HYBRIDNET010, self).__init__() + + self.right = nn.Sequential( + nn.Conv1d(in_channels=1, out_channels=8, kernel_size=20, + stride=2, padding=10), + nn.GELU(), + + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(8), + + nn.Conv1d(in_channels=8, out_channels=16, kernel_size=3, + stride=1, padding=1), + nn.GELU(), + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(16), + + nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, + stride=1, padding=1), + nn.GELU(), + nn.MaxPool1d(kernel_size=3, stride=2, padding=1), + nn.BatchNorm1d(32) + ) + + self.classifier = nn.Sequential( + # nn.Dropout(p=0.5), + nn.Linear(64, 8), + nn.GELU(), + nn.Linear(8, num_classes), + ) + + if init_weights: + self.initialize_weights() + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') # 何教授方法 + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) # 正态分布赋值 + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x, (_, _) = self.lstm(x) + # print(x.shape) + x = x[:, -1] + + x = torch.flatten(x, start_dim=1) + # print(x.shape) + x = self.classifier(x) + return x + + +if __name__ == '__main__': + model = HYBRIDNET010().cuda() + summary(model, [(32, 300, 1)]) diff --git a/exam/033/my_augment.py b/exam/033/my_augment.py new file mode 100644 index 0000000..dbb694d --- /dev/null +++ b/exam/033/my_augment.py @@ -0,0 +1,54 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:my_augment.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/07/26 +""" +from utils.Preprocessing import BCG_Operation +import numpy as np +from scipy.signal import stft + +preprocessing = BCG_Operation() +preprocessing.sample_rate = 100 + + +def my_augment(dataset): + dataset -= dataset.mean() + dataset = preprocessing.Iirnotch(dataset) + dataset = preprocessing.Butterworth(dataset, "lowpass", low_cut=20, order=6) + dataset_low = preprocessing.Butterworth(dataset, "lowpass", low_cut=0.5, order=4) + # dataset_high = preprocessing.Butterworth(dataset, "highpass", high_cut=1, order=6) + + dataset = {"low": dataset_low} + # "high": dataset_high} + return dataset + + +def 
get_stft(x, fs, n): + print(len(x)) + f, t, amp = stft(x, fs, nperseg=n) + z = np.abs(amp.copy()) + return f, t, z + + +def my_segment_augment(dataset, SP, EP): + dataset_low = dataset["low"][int(SP) * 100:int(EP) * 100].copy() + # dataset_high = dataset["high"][int(SP) * 100:int(EP) * 100].copy() + + dataset_low = dataset_low[::10] + dataset_low = dataset_low.reshape(-1, 1) + + # _, _, dataset_high = stft(dataset_high, 100, nperseg=50) + # dataset_high = dataset_high.astype(np.float).T + # dataset_high = dataset_high.reshape(dataset_high.shape[0], dataset_high.shape[1]) + + # return dataset_low, dataset_high + + return [dataset_low] + + +if __name__ == '__main__': + pass diff --git a/exam/033/settings.yaml b/exam/033/settings.yaml new file mode 100644 index 0000000..14ea7ae --- /dev/null +++ b/exam/033/settings.yaml @@ -0,0 +1,77 @@ +# environment config +GPU: "0" + + +# dataset config +Path: + dataset: /home/marques/code/marques/apnea/dataset/BCG_100hz_lowpass50/ + label: ./dataset/ + save: ./output/ + +batch_size: 256 +number_worker: 0 +model_name: HYBRIDNET010 +select_sampno: + - 88 + - 220 + - 221 + - 229 + - 282 + - 286 + - 541 + - 579 + - 582 + - 670 + - 671 + - 683 + - 703 + - 704 + - 726 + - 735 + - 736 + - 933 + - 935 + - 952 + - 954 + - 955 + - 960 + - 961 + - 962 + - 966 + - 967 + - 969 + - 971 + - 972 + - 1000 + - 1004 + - 1006 + - 1009 + - 1010 + - 1296 + - 1300 + - 1301 + - 1302 + - 1308 + - 1354 + - 1374 + - 1378 + - 1478 + + +# train hyperparameters config +epoch: 300 +lr: 0.01 +nc: 1 + +# wandb config +entity: "marques" +project: "Sleep_Apnea_HYBRID00X" +Note: "HYBRID010 RESP " +tags: ["ReduceLROnPlateau", "RESP LSTM"] + + +# "CW":class_weight + + +# "CosineAnnealingLR" +# "ReduceLROnPlateau" \ No newline at end of file diff --git a/exam/033/test_save_result.py b/exam/033/test_save_result.py new file mode 100644 index 0000000..07a1d2f --- /dev/null +++ b/exam/033/test_save_result.py @@ -0,0 +1,478 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:test_analysis.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/21 +""" +import logging +import os +import sys + +import pandas as pd +import torch.cuda +import numpy as np +import yaml +from matplotlib import pyplot as plt +from tqdm import tqdm +from pathlib import Path +from torch.nn import functional as F +from torch.utils.data import DataLoader +from load_dataset import TestApneaDataset2, read_dataset +from utils.Draw_ConfusionMatrix import draw_confusionMatrix +from torch import nn +from utils.calc_metrics import CALC_METRICS +from my_augment import my_augment, my_segment_augment +from model.Hybrid_Net003 import HYBRIDNET003 + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +exam_path = Path("./output/") + +# 置信率阈值 +thresh = 0.5 +# 间隔最小距离 +thresh_event_interval = 2 +# 最小事件长度 +thresh_event_length = 8 + +# +event_thresh = 1 + +severity_path = Path(r"/home/marques/code/marques/apnea/dataset/loc_first_csa.xlsx") +severity_label = {"all": "none"} +severity_df = pd.read_excel(severity_path) +for one_data in severity_df.index: + one_data = severity_df.loc[one_data] + severity_label[str(one_data["数据编号"])] = one_data["程度"] + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" +gpu = torch.cuda.is_available() + +num_classes = 1 +calc_metrics = CALC_METRICS(num_classes) + +with open("./settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps +data_path = hyp["Path"]["dataset"] +read_dataset(data_path, augment=my_augment) +del hyp + +# 默认取最新的文件夹 
+all_output_path, output_path, segments_results_save_path, events_results_save_path, = [None, ] * 4 +my_augment, model_path, label_path, data_path, model, model_name = [None, ] * 6 +train_set, test_set = None, None +loss_func = nn.CrossEntropyLoss() + +columns = ["sampNo", "segmentNo", "label_type", "new_label", "SP", "EP", "pred"] +columns2 = ["sampNo", "severity", "origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", + "acc", "recall", "spec", "pre", "NPV", "F1score", "support"] + +logging.getLogger('matplotlib.font_manager').disabled = True +logging.getLogger('matplotlib.ticker').disabled = True +logger = logging.getLogger() +logger.setLevel(logging.INFO) +ch = logging.StreamHandler() +ch.setLevel(logging.INFO) +ch.setFormatter(logging.Formatter("%(asctime)s: %(message)s")) +logger.addHandler(ch) + +if (exam_path / "test.log").exists(): + (exam_path / "test.log").unlink() +fh = logging.FileHandler(exam_path / "test.log", mode='a') +fh.setLevel(logging.INFO) +fh.setFormatter(logging.Formatter("%(message)s")) +logger.addHandler(fh) +logger.info("------------------------------------") + + +def set_environment(i): + global output_path, segments_results_save_path, events_results_save_path, model_path, label_path, data_path, \ + model, model_name, train_set, test_set + + output_path = all_output_path[i] + logger.info(output_path) + segments_results_save_path = (output_path / "segments_results") + segments_results_save_path.mkdir(exist_ok=True) + events_results_save_path = (output_path / "events_results") + events_results_save_path.mkdir(exist_ok=True) + + # 加载配置 + with open(output_path / "settings.yaml") as f: + hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps + data_path = hyp["Path"]["dataset"] + label_path = hyp["Path"]["label"] + train_set = hyp["train_set"] + test_set = hyp["test_set"] + + model_path = output_path / "weights" / "best.pt" + model = eval(hyp["model_name"])() + model_name = hyp["model_name"] + model.load_state_dict(torch.load(model_path)) + model.cuda() + model.eval() + + +def test_and_analysis_and_visual(dataset_type): + if dataset_type == "test": + sampNo = train_set + elif dataset_type == "all_test": + sampNo = test_set + else: + sampNo = None + logger.info("出错了") + + exam_name = Path("./").absolute().name + + test_dataset = TestApneaDataset2(data_path, label_path, select_sampno=sampNo, dataset_type=dataset_type, + segment_augment=my_segment_augment) + test_loader = DataLoader(test_dataset, batch_size=128, pin_memory=True, num_workers=0) + + test_loss = 0.0 + + df_segment = pd.DataFrame(columns=columns) + + for one in tqdm(test_loader, total=len(test_loader)): + resp, stft, labels = one[:3] + other_info = one[3:] + resp = resp.float().cuda() if gpu else resp.float() + stft = stft.float().cuda() if gpu else stft.float() + labels = labels.cuda() if gpu else labels + with torch.no_grad(): + out = model(resp, stft) + + loss = loss_func(out, labels) + + test_loss += loss.item() + + labels = torch.unsqueeze(labels, dim=1) + out = F.softmax(out, dim=1) + out = torch.unsqueeze(out[:, 1], dim=1) + + calc_metrics.update(out.cpu(), labels.cpu()) + # one[0] = list(one[0].cpu().numpy()) + # one[1] = list(one[1].cpu().numpy()) + # one = one[1:] + # out = out.view(1, -1).cpu().numpy().tolist() + # one += out + # result_record += [i for i in list(np.array(one, dtype=object).transpose(1, 0))] + + one2 = np.array([i.cpu().numpy() for i in (other_info + [out.squeeze()])]) + one2 = one2.transpose((1, 0)) + df = pd.DataFrame(data=one2, columns=columns) + 
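# In the loop above, `other_info` carries the first six fields of `columns` (sampNo,
# segmentNo, label_type, new_label, SP, EP) from the test loader, and `out.squeeze()`
# supplies the positive-class probability, so each row appended to df_segment below
# is one scored segment window.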
df_segment = df_segment.append(df, ignore_index=True) + + test_loss /= len(test_loader) + calc_metrics.compute() + logger.info(f"EXAM_NAME: {exam_name} SampNO: {sampNo}") + logger.info(calc_metrics.get_matrix(loss=test_loss, epoch=0, epoch_type="test")) + calc_metrics.reset() + + df_segment["thresh_label"] = 1 * (df_segment["label_type"] > event_thresh).copy() + df_segment["thresh_Pred"] = 1 * (df_segment["pred"] > thresh).copy() + df_segment["pred"] = df_segment["pred"].copy().apply(lambda x: round(x, 3)) + + # 片段级分析 + df_segment_metrics = analysis_results(df_segment, segments_results_save_path, dataset_type) + + # 绘制混淆矩阵 + # 每个样本都绘制一份 + confusionMatrix(df_segment_metrics, segments_results_save_path, dataset_type) + # 绘制柱状图 + + # 事件级分析 + # 对于inner_test 每个编号就是一个事件 + # 而对于整晚的independence_test,需要另行计算 + df_all_event = segment_to_event(df_segment, dataset_type) + df_event_metrics = analysis_results(df_all_event, events_results_save_path, dataset_type, is_event=True) + confusionMatrix(df_event_metrics, events_results_save_path, dataset_type) + + # 剔除质量不好的样本 + df_bad_segment = df_segment[ + (df_segment["label_type"].isin([2, 3])) & (df_segment["new_label"] == 2)] + df_select_segment = df_segment.drop(df_bad_segment.index) + df_select_segment_metrics = analysis_results(df_select_segment, segments_results_save_path / "remove_2", + dataset_type) + df_select_event = segment_to_event(df_select_segment, dataset_type) + df_event_metrics = analysis_results(df_select_event, events_results_save_path / "remove_2", dataset_type, + is_event=True) + + +def analysis_results(df_result, base_path, dataset_type, is_event=False): + if df_result.empty: + logger.info(base_path, dataset_type, "is_empty") + return None + + (base_path / dataset_type).mkdir(exist_ok=True, parents=True) + + all_sampNo = df_result["sampNo"].unique() + df_metrics = pd.DataFrame(columns=columns2) + df_metrics.loc[0] = 0 + df_metrics.loc[0]["sampNo"] = dataset_type + + for index, sampNo in enumerate(all_sampNo): + df = df_result[df_result["sampNo"] == sampNo] + df.to_csv( + base_path / dataset_type / + f"{int(sampNo)}_{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_result.csv", + index=False) + + df_metrics.loc[index + 1] = np.NAN + df_metrics.loc[index + 1]["sampNo"] = str(int(sampNo)) + df_metrics.loc[index + 1]["support"] = df.shape[0] + df_metrics.loc[index + 1]["severity"] = severity_label[str(int(sampNo))] + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # continue + # else: + df_metrics.loc[index + 1]["origin_P"] = df[df["thresh_label"] == 1].shape[0] + df_metrics.loc[index + 1]["origin_N"] = df[df["thresh_label"] == 0].shape[0] + df_metrics.loc[index + 1]["pred_P"] = df[df["thresh_Pred"] == 1].shape[0] + df_metrics.loc[index + 1]["pred_N"] = df[df["thresh_Pred"] == 0].shape[0] + df_metrics.loc[index + 1]["T"] = df[df["thresh_Pred"] == df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["F"] = df[df["thresh_Pred"] != df["thresh_label"]].shape[0] + df_metrics.loc[index + 1]["TP"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["FP"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 1)].shape[0] + df_metrics.loc[index + 1]["TN"] = \ + df[(df["thresh_Pred"] == df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + df_metrics.loc[index + 1]["FN"] = \ + df[(df["thresh_Pred"] != df["thresh_label"]) & (df["thresh_Pred"] == 0)].shape[0] + + df_metrics.loc[0]["origin_P"] += 
df_metrics.loc[index + 1]["origin_P"] + df_metrics.loc[0]["origin_N"] += df_metrics.loc[index + 1]["origin_N"] + df_metrics.loc[0]["pred_P"] += df_metrics.loc[index + 1]["pred_P"] + df_metrics.loc[0]["pred_N"] += df_metrics.loc[index + 1]["pred_N"] + df_metrics.loc[0]["T"] += df_metrics.loc[index + 1]["T"] + df_metrics.loc[0]["F"] += df_metrics.loc[index + 1]["F"] + df_metrics.loc[0]["TP"] += df_metrics.loc[index + 1]["TP"] + df_metrics.loc[0]["FP"] += df_metrics.loc[index + 1]["FP"] + df_metrics.loc[0]["TN"] += df_metrics.loc[index + 1]["TN"] + df_metrics.loc[0]["FN"] += df_metrics.loc[index + 1]["FN"] + df_metrics.loc[0]["support"] += df_metrics.loc[index + 1]["support"] + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[index + 1][col] = df_metrics.loc[index + 1][col] if df_metrics.loc[index + 1][ + col] != 0 else np.NAN + + df_metrics.loc[index + 1]["acc"] = df_metrics.iloc[index + 1]["T"] / df_metrics.iloc[index + 1]["support"] + df_metrics.loc[index + 1]["recall"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["origin_P"] + df_metrics.loc[index + 1]["spec"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["origin_N"] + df_metrics.loc[index + 1]["pre"] = df_metrics.iloc[index + 1]["TP"] / df_metrics.iloc[index + 1]["pred_P"] + df_metrics.loc[index + 1]["NPV"] = df_metrics.iloc[index + 1]["TN"] / df_metrics.iloc[index + 1]["pred_N"] + df_metrics.loc[index + 1]["F1score"] = 2 * df_metrics.iloc[index + 1]["recall"] * df_metrics.iloc[index + 1][ + "pre"] / (df_metrics.iloc[index + 1]["recall"] + df_metrics.iloc[index + 1]["pre"]) + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[index + 1][col] = 0 if pd.isna(df_metrics.loc[index + 1][col]) else \ + df_metrics.loc[index + 1][col] + df_metrics.loc[index + 1][col] = round(df_metrics.loc[index + 1][col], 3) + + # if dataset_type == "independence_test" or dataset_type == "train_all_test": + # return None + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_metrics.loc[0][col] = df_metrics.loc[0][col] if df_metrics.loc[0][col] != 0 else np.NAN + + df_metrics.loc[0]["acc"] = df_metrics.iloc[0]["T"] / df_metrics.iloc[0]["support"] + df_metrics.loc[0]["recall"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["origin_P"] + df_metrics.loc[0]["spec"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["origin_N"] + df_metrics.loc[0]["pre"] = df_metrics.iloc[0]["TP"] / df_metrics.iloc[0]["pred_P"] + df_metrics.loc[0]["NPV"] = df_metrics.iloc[0]["TN"] / df_metrics.iloc[0]["pred_N"] + df_metrics.loc[0]["F1score"] = 2 * df_metrics.iloc[0]["recall"] * df_metrics.iloc[0]["pre"] / ( + df_metrics.iloc[0]["recall"] + df_metrics.iloc[0]["pre"]) + for col in ["TP", "TN", "FP", "FN", "acc", "recall", "spec", "pre", "NPV", "F1score"]: + df_metrics.loc[0][col] = 0 if pd.isna(df_metrics.loc[0][col]) else df_metrics.loc[0][col] + df_metrics.loc[0][col] = round(df_metrics.loc[0][col], 3) + + # 在inner_test中根据 分严重程度绘制 + if dataset_type == "test": + all_severity = ["正常", "轻度", "中度", "重度"] + for index, severity in enumerate(all_severity): + df_event = df_metrics[df_metrics["severity"] == severity] + df_temp = pd.DataFrame(columns=columns2) + df_temp.loc[0] = 0 + df_temp.loc[0]["sampNo"] = severity + df_temp.loc[0]["severity"] = str(index + 1) + + df_temp.loc[0]["origin_P"] += df_event["origin_P"].sum() + 
df_temp.loc[0]["origin_N"] += df_event["origin_N"].sum() + df_temp.loc[0]["pred_P"] += df_event["pred_P"].sum() + df_temp.loc[0]["pred_N"] += df_event["pred_N"].sum() + df_temp.loc[0]["T"] += df_event["T"].sum() + df_temp.loc[0]["F"] += df_event["F"].sum() + df_temp.loc[0]["TP"] += df_event["TP"].sum() + df_temp.loc[0]["FP"] += df_event["FP"].sum() + df_temp.loc[0]["TN"] += df_event["TN"].sum() + df_temp.loc[0]["FN"] += df_event["FN"].sum() + df_temp.loc[0]["support"] += df_event["support"].sum() + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN"]: + df_temp.loc[0][col] = df_temp.loc[0][col] if df_temp.loc[0][col] != 0 else np.NAN + + df_temp.loc[0]["acc"] = df_temp.iloc[0]["T"] / df_temp.iloc[0]["support"] + df_temp.loc[0]["recall"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["origin_P"] + df_temp.loc[0]["spec"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["origin_N"] + df_temp.loc[0]["pre"] = df_temp.iloc[0]["TP"] / df_temp.iloc[0]["pred_P"] + df_temp.loc[0]["NPV"] = df_temp.iloc[0]["TN"] / df_temp.iloc[0]["pred_N"] + df_temp.loc[0]["F1score"] = 2 * df_temp.iloc[0]["recall"] * df_temp.iloc[0]["pre"] / ( + df_temp.iloc[0]["recall"] + df_temp.iloc[0]["pre"]) + + for col in ["origin_P", "origin_N", "pred_P", "pred_N", "T", "F", "TP", "TN", "FP", "FN", "acc", "recall", + "spec", "pre", "NPV", "F1score"]: + df_temp.loc[0][col] = 0 if pd.isna(df_temp.loc[0][col]) else df_temp.loc[0][col] + df_temp.loc[0][col] = round(df_temp.loc[0][col], 3) + + df_metrics = df_metrics.append(df_temp, ignore_index=True) + + df_backup = df_metrics + df_metrics = df_metrics.astype("str") + df_metrics = df_metrics.sort_values("severity") + df_metrics.to_csv(base_path / dataset_type / + f"{model_name}_{dataset_type}_{'segment' if not is_event else 'event'}_all_metrics.csv", + index=False, encoding="gbk") + + return df_backup + + +def confusionMatrix(df_analysis, base_path, dataset_type): + if df_analysis is None: + logger.info(base_path, dataset_type, "is None") + return + + if df_analysis.empty: + logger.info(base_path, dataset_type, "is_empty") + return + classes = ["normal", "SA"] + (base_path / dataset_type / "confusionMatrix").mkdir(exist_ok=True, parents=True) + for one_samp in df_analysis.index: + one_samp = df_analysis.loc[one_samp] + cm = np.array([[one_samp["TN"], one_samp["FP"]], [one_samp["FN"], one_samp["TP"]]]) + draw_confusionMatrix(cm, classes=classes, title=str(one_samp["severity"]) + " " + one_samp["sampNo"], + save_path=base_path / dataset_type / "confusionMatrix" / f"{one_samp['sampNo']}.jpg") + + +def segment_to_event(df_segment, dataset_type): + df_all_event = pd.DataFrame(columns=columns) + all_sampNo = df_segment["sampNo"].unique() + + if dataset_type == "test": + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + all_segments_no = df["segmentNo"].unique() + + for index_se, segment_No in enumerate(all_segments_no): + df_temp = df[df["segmentNo"] == segment_No].copy() + SP = df_temp.iloc[0]["EP"] + EP = df_temp.iloc[-1]["EP"] + 1 + df_event.loc[index_se] = [int(sampNo), segment_No, df_temp.iloc[0]["label_type"], + df_temp.iloc[0]["new_label"], SP, EP, 0] + + thresh_Pred = df_temp["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, 
thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + for i in value_ar: + if i > thresh_event_length: + df_event.iloc[index_se]["pred"] = 1 + + # df_event.to_csv(events_results / dataset_type / f"{int(sampNo)}_event_results.csv", index=False, + # encoding="gbk") + df_all_event = df_all_event.append(df_event, ignore_index=True) + else: + for index, sampNo in enumerate(all_sampNo): + df_event = pd.DataFrame(columns=columns) + df = df_segment[df_segment["sampNo"] == sampNo].copy() + df["thresh_label"] = 1 * (df["label_type"] > event_thresh) + df["thresh_Pred"] = 1 * (df["pred"] > thresh) + thresh_Pred = df["thresh_Pred"].values + thresh_Pred2 = thresh_Pred.copy() + # 扩充 + for index_pred, pred in enumerate(thresh_Pred): + if pred == 0: + continue + + for interval in range(1, thresh_event_interval): + if pred == 1 and index_pred + interval < thresh_Pred.size: + thresh_Pred2[index_pred + interval] = 1 + else: + continue + + # 判断 + same_ar = np.concatenate(([True], thresh_Pred2[:-1] != thresh_Pred2[1:], [True])) + index_ar = np.where(same_ar)[0] + count_ar = np.diff(index_ar) + value_ar = thresh_Pred2[same_ar[:-1]] * count_ar + + for value_index, value in enumerate(value_ar): + SP = index_ar[value_index] + EP = index_ar[value_index] + count_ar[value_index] + # TP, FP + if value > thresh_event_length: + + # label_type = 1 if thresh_Pred2[SP:EP].sum() > 0 else 0 + label_type = df["label_type"][SP:EP].max() + new_label = df["new_label"][SP:EP].max() + df_event = df_event.append(pd.DataFrame([[int(sampNo), SP // 30, label_type, new_label, + SP, EP, thresh_Pred2[SP]]], columns=columns), + ignore_index=True) + # if value > 30: + # logger.info([int(sampNo), SP // 30, label_type, new_label, SP, EP, thresh_Pred2[SP]]) + # 长度不够 + else: + df["thresh_Pred"][SP:EP] = 0 + + # 对负样本进行统计 + for segment_no in df["segmentNo"].unique(): + df_temp = df[df["segmentNo"] == segment_no] + if df_temp["thresh_Pred"].sum() > 0: + continue + + df_event = df_event.append(pd.DataFrame( + [[int(sampNo), segment_no, df_temp["label_type"].max(), df_temp["new_label"].max(), segment_no * 30, + (segment_no + 1) * 30, 0]], columns=columns), + ignore_index=True) + + df_all_event = df_all_event.append(df_event, ignore_index=True) + + df_temp = df_all_event.loc[:, ["label_type", "pred"]] + df_all_event["thresh_label"] = 1 * (df_temp["label_type"] > event_thresh) + df_all_event["thresh_Pred"] = 1 * (df_temp["pred"] > thresh) + return df_all_event + + +# 分sampNo保存结果,并不重合地可视化 +# inner_test + +# 分sampNo将与标签不一致的另行保存,并不重合地可视化 + +# import shap +# explainer = shap.TreeExplainer() +# shap_values = explainer.shap_values() + +if __name__ == '__main__': + all_output_path = list(exam_path.rglob("KFold_*")) + for exam_index, test_exam_path in enumerate(all_output_path): + # test_exam_path = exam_path / test_exam_path + set_environment(exam_index) + test_and_analysis_and_visual(dataset_type="test") + test_and_analysis_and_visual(dataset_type="all_test") diff --git a/exam/033/utils/Draw_ConfusionMatrix.py b/exam/033/utils/Draw_ConfusionMatrix.py new file mode 100644 index 0000000..591af60 --- /dev/null +++ b/exam/033/utils/Draw_ConfusionMatrix.py @@ -0,0 +1,46 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques 
+@file:Draw_ConfusionMatrix.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/08/10 +""" +import numpy as np +from matplotlib import pyplot as plt + +plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 + + +def draw_confusionMatrix(cm, classes, title, save_path, cmap=plt.cm.Blues): + fig_cm, ax = plt.subplots(figsize=(8, 8), dpi=120) + im = ax.imshow(cm, interpolation='nearest', cmap=cmap) + ax.figure.colorbar(im, ax=ax) + ax.set(xticks=np.arange(cm.shape[1]), + yticks=np.arange(cm.shape[0]), + xticklabels=classes, yticklabels=classes, + title=title, + ylabel='True label', + xlabel='Predicted label') + ax.set_ylim(len(classes) - 0.5, -0.5) + + # Rotate the tick labels and set their alignment. + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") + normalize = False + fmt = '.2f' if normalize else 'd' + thresh = cm.max() * 0.8 + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(cm[i, j], fmt), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black") + fig_cm.tight_layout() + fig_cm.savefig(save_path) + plt.close() + # + + +if __name__ == '__main__': + pass diff --git a/exam/033/utils/Preprocessing.py b/exam/033/utils/Preprocessing.py new file mode 100644 index 0000000..e3b60c4 --- /dev/null +++ b/exam/033/utils/Preprocessing.py @@ -0,0 +1,181 @@ +# encoding:utf-8 + +""" +@ date: 2020-09-16 +@ author: jingxian +@ illustration: Pre-processing +""" + + +import sys +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import pywt +from scipy import signal +from scipy import fftpack + + +def Dilate(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.min(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def Eorde(x, N, g, M): + returndata = np.array([]) + for num in range(N - M + 1): + returndata = np.append(returndata, np.max(np.array(x[num:num + M]) - np.array(g))) + return returndata + + +def fin_turn(data, peak): + if len(data) == 0 or len(peak) == 0: return peak + return_peak = [] + for p in peak: + minx, maxx = max(0, p - 100), min(len(data), p + 100) + return_peak.append(minx + np.argmax(data[minx: maxx])) + return return_peak + + +class BCG_Operation(): + def __init__(self, sample_rate=1000): + self.sample_rate = sample_rate + + def down_sample(self, data=None, down_radio=10): + if data is None: + raise ValueError("data is None, please given an real value!") + data = data[:len(data) // down_radio * down_radio].reshape(-1, down_radio)[:, 0] + self.sample_rate = self.sample_rate / down_radio + return data + + def Splitwin(self, data=None, len_win=None, coverage=1.0, calculate_to_end=False): + """ + 分窗 + :param len_win: length of window + :return: signal windows + """ + if (len_win is None) or (data is None): + raise ValueError("length of window or data is None, please given an real value!") + else: + length = len_win * self.sample_rate # number point of a window + # step of split windows + step = length * coverage + start = 0 + Splitdata = [] + while (len(data) - start >= length): + Splitdata.append(data[int(start):int(start + length)]) + start += step + if calculate_to_end and (len(data) - start > 2000): + remain = len(data) - start + start = start - step + step = int(remain / 2000) + start = start + step * 2000 + Splitdata.append(data[int(start):int(start + length)]) + return np.array(Splitdata), step + elif 
calculate_to_end:
+            return np.array(Splitdata), 0
+        else:
+            return np.array(Splitdata)
+
+    def Butterworth(self, data, type, low_cut=0.0, high_cut=0.0, order=10):
+        """
+        :param type: Type of Butterworth filter: lowpass, bandpass, highpass
+        :param low_cut: Low cutoff frequency
+        :param high_cut: High cutoff frequency
+        :param order: Order of filter
+        :return: Signal after filtering
+        """
+        if type == "lowpass":  # 低通滤波处理
+            b, a = signal.butter(order, low_cut / (self.sample_rate * 0.5), btype='lowpass')
+            return signal.filtfilt(b, a, np.array(data))
+        elif type == "bandpass":  # 带通滤波处理
+            low = low_cut / (self.sample_rate * 0.5)
+            high = high_cut / (self.sample_rate * 0.5)
+            b, a = signal.butter(order, [low, high], btype='bandpass')
+            return signal.filtfilt(b, a, np.array(data))
+        elif type == "highpass":  # 高通滤波处理
+            b, a = signal.butter(order, high_cut / (self.sample_rate * 0.5), btype='highpass')
+            return signal.filtfilt(b, a, np.array(data))
+        else:  # 警告,滤波器类型必须有
+            raise ValueError("Please choose a type of filter")
+
+    def MorphologicalFilter(self, data=None, M=200, get_bre=False):
+        """
+        :param data: Input signal
+        :param M: Length of structural element
+        :return: Signal after filter
+        """
+        if not data.any():
+            raise ValueError("The input data is None, please give real-valued data")
+        g = np.ones(M)
+        Data_pre = np.insert(data, 0, np.zeros(M))
+        Data_pre = np.insert(Data_pre, -1, np.zeros(M))
+        # Opening: 腐蚀 + 膨胀
+        out1 = Eorde(Data_pre, len(Data_pre), g, M)
+        out2 = Dilate(out1, len(out1), g, M)
+        out2 = np.insert(out2, 0, np.zeros(M - 2))
+        # Closing: 膨胀 + 腐蚀
+        out5 = Dilate(Data_pre, len(Data_pre), g, M)
+        out6 = Eorde(out5, len(out5), g, M)
+        out6 = np.insert(out6, 0, np.zeros(M - 2))
+
+        baseline = (out2 + out6) / 2
+        # -------------------------保留剩余价值------------------------
+        data_filtered = Data_pre[:len(baseline)] - baseline
+        data_filtered = data_filtered[M: M + len(data)]
+        baseline = baseline[M:]
+        data_filtered[-1] = data_filtered[-2] = data_filtered[-3]
+        baseline[-1] = baseline[-2] = baseline[-3]
+        if get_bre:
+            return data_filtered, baseline
+        else:
+            return data_filtered
+
+    def Iirnotch(self, data=None, cut_fre=50, quality=3):
+        """陷波器"""
+        b, a = signal.iirnotch(cut_fre / (self.sample_rate * 0.5), quality)
+        return signal.filtfilt(b, a, np.array(data))
+
+    def ChebyFilter(self, data, rp=1, type=None, low_cut=0, high_cut=0, order=10):
+        """
+        切比雪夫滤波器
+        :param data: Input signal
+        :param rp: The maximum ripple allowed
+        :param type: 'lowpass', 'bandpass', 'highpass'
+        :param low_cut: Low cut-off fre
+        :param high_cut: High cut-off fre
+        :param order: The order of filter
+        :return: Signal after filter
+        """
+        if type == 'lowpass':
+            b, a = signal.cheby1(order, rp, low_cut, btype='lowpass', fs=self.sample_rate)
+            return signal.filtfilt(b, a, np.array(data))
+        elif type == 'bandpass':
+            b, a = signal.cheby1(order, rp, [low_cut, high_cut], btype='bandpass', fs=self.sample_rate)
+            return signal.filtfilt(b, a, np.array(data))
+        elif type == 'highpass':
+            b, a = signal.cheby1(order, rp, high_cut, btype='highpass', fs=self.sample_rate)
+            return signal.filtfilt(b, a, np.array(data))
+        else:
+            raise ValueError("The type of filter is None, please give the real value!")
+
+    def Envelope(self, data):
+        """取信号包络"""
+        if len(data) <= 1: raise ValueError("Wrong input data")
+        hx = fftpack.hilbert(data)
+        return np.sqrt(hx ** 2 + data ** 2)
+
+    def wavelet_trans(self, data, c_level=['aaa', 'aad'], wavelet='db4', mode='symmetric', maxlevel=10):
+        wp = pywt.WaveletPacket(data=data, wavelet=wavelet, 
mode=mode, maxlevel=maxlevel) + new_wp = pywt.WaveletPacket(data=None, wavelet=wavelet, mode=mode) + for c in c_level : + new_wp[c] = wp[c] + return new_wp.reconstruct() + + # def em_decomposition(self, data): + # from pyhht.emd import EMD + # return EMD(data).decompose() + diff --git a/exam/033/utils/calc_metrics.py b/exam/033/utils/calc_metrics.py new file mode 100644 index 0000000..dcf78cc --- /dev/null +++ b/exam/033/utils/calc_metrics.py @@ -0,0 +1,84 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:Marques +@file:calc_metrics.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/02/12 +""" +import torch +import torchmetrics + + +class CALC_METRICS: + metrics = [] + nc = 0 + + def __init__(self, nc): + self.nc = nc + self.metrics.append(torchmetrics.Accuracy(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Recall(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Precision(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.Specificity(average="none", num_classes=nc, multiclass=False)) + self.metrics.append(torchmetrics.F1Score(average="none", num_classes=nc, multiclass=False)) + self.valid_result = self.train_result = None + + def update(self, pred, target): + for part1 in self.metrics: + part1.update(pred.cpu(), target.cpu()) + + def compute(self): + result = [] + for part1 in self.metrics: + result.append(part1.compute()) + + def reset(self): + for part1 in self.metrics: + part1.reset() + + def get_matrix(self, loss=None, cur_lr=None, epoch=None, epoch_type=None): + temp_result = [] + for j in self.metrics: + compute_result = (j.compute().cpu().numpy() * 100).tolist() + temp_result.append(compute_result) + + if epoch_type == "train": + self.train_result = [loss] + temp_result + elif epoch_type == "valid": + self.valid_result = [loss] + temp_result + else: + pass + + a = "" + a += f"{epoch_type} epoch: {str(epoch)} loss: {str(loss)} lr: {str(cur_lr)} \n" + a += " " * 8 + "Acc".center(8) + "Rec".center(8) + "Pre".center(8) + "Spe".center(8) + "F1".center(8) + "\n" + a += "all".center(8) + "".join([str(round(float(i), 2)).center(8) for i in temp_result]) + "\n" + return a + + def wandb_log(self, wandb=None, cur_lr=None): + if wandb is None: + return + + keyword = ["Accuracy", "Recall", "Precision", "Specificity", "F1Score"] + dict_key = [] + for epoch_type in ["train", "valid"]: + dict_key.append(epoch_type + "/" + "loss") + for i in keyword: + dict_key.append(epoch_type + "/" + i) + + log_dict = dict(zip(dict_key, self.train_result + self.valid_result)) + log_dict["lr"] = cur_lr + wandb.log(log_dict) + + +if __name__ == '__main__': + # pred = [[0.1], [0.2], [0.3], [0.4], [0.5], [0.6], [0.7], [0.8], [0.9], [1.0]] + # true = [[0], [0], [1], [0], [0], [0], [0], [0], [0], [1]] + pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] + true = [0, 0, 1, 0, 0, 0, 0, 0, 0, 1] + pred = torch.tensor(pred).cuda() + true = torch.tensor(true).cuda() + calc_metrics = CALC_METRICS(1) + calc_metrics.update(pred, true) + print(calc_metrics.get_matrix()) diff --git a/exam/试验记录.txt b/exam/试验记录.txt index b2f60f2..614e0b5 100644 --- a/exam/试验记录.txt +++ b/exam/试验记录.txt @@ -65,34 +65,7 @@ 数据集减去平均值外,使用每个片段的混叠信号Z-score整理数据集 -------------------------------------------------------- 013 -BEST TEST 学习率 1e-5 epoch 50 - Acc Rec Pre Spe F1 - all 84.56 85.53 65.01 84.22 73.87 - -EXAM_NAME: 013 SampNO: [582, 1000] -test epoch: 0 loss: 1.1915464037836785 lr: None - Acc Rec Pre 
Spe F1 - all 60.04 11.88 4.92 66.96 6.96 - -EXAM_NAME: 013 SampNO: [286, 1009] -test epoch: 0 loss: 0.6172940751263513 lr: None - Acc Rec Pre Spe F1 - all 77.71 61.15 62.02 84.55 61.58 - -EXAM_NAME: 013 SampNO: [282, 966] -test epoch: 0 loss: 1.210226387001914 lr: None - Acc Rec Pre Spe F1 - all 39.09 95.56 0.23 39.01 0.46 - -EXAM_NAME: 013 SampNO: [726, 1006] -test epoch: 0 loss: 0.603616701004974 lr: None - Acc Rec Pre Spe F1 - all 73.5 56.9 41.81 78.1 48.21 - -EXAM_NAME: 013 SampNO: [229, 1004] -test epoch: 0 loss: 0.3528211629552555 lr: None - Acc Rec Pre Spe F1 - all 84.68 17.63 4.82 87.15 7.56 +学习率 1e-5 epoch 50 -------------------------------------------------------- @@ -101,37 +74,37 @@ test epoch: 0 loss: 0.3528211629552555 lr: None 数据集减去平均值外,使用大体动检测后缩放 -------------------------------------------------------- 014 -学习率 1e-5 EPOCH 20 TODO +学习率 1e-5 EPOCH 50 -------------------------------------------------------- -------------------------------------------------------- 非整晚数据集训练 1:3.5,class_weight 解决不平衡问题, -数据集减去平均值外,选上述最好的处理方法 -# 修改卷积层的激活函数 +数据集减去平均值外,选上述最好的处理方法[无处理] +# 修改卷积层的激活函数, 将呼吸的采样率从提升至10Hz,并每0.5秒输入到LSTM -------------------------------------------------------- 021 -学习率 1e-5 EPOCH 20 TODO +学习率 1e-5 EPOCH 50 -------------------------------------------------------- -------------------------------------------------------- 非整晚数据集训练 1:3.5,class_weight 解决不平衡问题, -数据集减去平均值外,选上述最好的处理方法 -# 引入残差块 +数据集减去平均值外,无预处理 +# 修改卷积层的激活函数 RESP用CNN STFT用LSTM -------------------------------------------------------- 022 -学习率 1e-5 EPOCH 20 TODO +学习率 1e-5 EPOCH 50 -------------------------------------------------------- -------------------------------------------------------- 非整晚数据集训练 1:3.5,class_weight 解决不平衡问题, -数据集减去平均值外,选上述最好的处理方法 -# 修改卷积层的激活函数 RESP用CNN(RESNET?) 
STFT用LSTM +数据集减去平均值外,削顶600,归一化 +# 修改卷积层的激活函数 RESP用CNN STFT用LSTM -------------------------------------------------------- 023 -学习率 1e-5 EPOCH 20 TODO +学习率 1e-5 EPOCH 50 -------------------------------------------------------- @@ -161,15 +134,38 @@ TODO: 质量好的信号结果分析;严重程度的分析 -------------------------------------------------------- -**全量**非整晚数据集训练 1:3.5,class_weight 解决不平衡问题, +**全量**整晚数据集训练 1:10,class_weight 解决不平衡问题, 数据集减去平均值外,选上述最好的处理方法 -------------------------------------------------------- 031 -学习率 1e-4 +学习率 1e-2 epoch 300 +呼吸10Hz 纯BI-LSTM +300个点,折成60,5 -------------------------------------------------------- +-------------------------------------------------------- +**全量**整晚数据集训练 1:10,class_weight 解决不平衡问题, +数据集减去平均值外,选上述最好的处理方法 +-------------------------------------------------------- +032 +学习率 1e-2 epoch 300 +呼吸10Hz 纯BI-LSTM +300个点,保持300, 1 +-------------------------------------------------------- + +-------------------------------------------------------- +**全量**整晚数据集训练 1:10,class_weight 解决不平衡问题, +数据集减去平均值外,选上述最好的处理方法 +-------------------------------------------------------- +033 +学习率 1e-2 epoch 300 +呼吸10Hz 两层BI-LSTM +300个点,保持300, 1 +-------------------------------------------------------- + + -------------------------------------------------------- 非整晚数据集训练 1:3.5,仅用质量好的数据训练,class_weight 解决不平衡问题, diff --git a/utils/Draw_Result.py b/utils/Draw_Result.py new file mode 100644 index 0000000..7de88d6 --- /dev/null +++ b/utils/Draw_Result.py @@ -0,0 +1,120 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +""" +@author:andrew +@file:Draw_Result.py +@email:admin@marques22.com +@email:2021022362@m.scnu.edu.cn +@time:2022/10/07 +""" +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import patchworklib as pw +# import seaborn as sns +import yaml +from plotnine import * + +# from matplotlib import pyplot as plt +# +# plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 +# plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 +from tqdm import tqdm + + +base_path = Path("../exam") +save_path = Path("../output/20221007") +save_path.mkdir(exist_ok=True, parents=True) +# 比较对象 +all_exam_path = list(base_path.iterdir()) +compare_path = [ + # '001', + '002', + # '003', + # '004', + '005', + '011', + '012', + '013', +] + + +# +# compare_path = [ +# '001', +# '002', +# '003', +# '004', +# ] + +def load_data(): + columns = ["exam_name", "Fold_index", "sampNo", "severity", "origin_P", "origin_N", "pred_P", "pred_N", "T", + "F", "TP", "TN", "FP", "FN", "acc", "recall", "spec", "pre", "NPV", "F1score", "support"] + df_event_all_test_result = pd.DataFrame(columns=columns) + df_segment_all_test_result = pd.DataFrame(columns=columns) + df_event_test_result = pd.DataFrame(columns=columns) + df_segment_test_result = pd.DataFrame(columns=columns) + + for index, exam_name in enumerate(compare_path): + exam_path = base_path / exam_name / 'output' + # all_sub_exam_path = list(exam_path.glob(f'*output*')) + + exam_path = list(exam_path.glob(f'output_*'))[0] + print(exam_path) + + sub_fold_path = list(exam_path.glob("KFold_*")) + sub_fold_path.sort() + + for sub_index, sub_exam_path in enumerate(sub_fold_path): + segment_result_path = sub_exam_path / "segments_results" + segment_test_result_path = segment_result_path / "test" + segment_all_test_result_path = segment_result_path / "all_test" + + event_result_path = sub_exam_path / "events_results" + event_test_result_path = event_result_path / "test" + event_all_test_result_path = event_result_path / "all_test" + + for 
sub_samp in segment_test_result_path.glob("*_test_segment_all_metrics.csv"): + df_temp = pd.read_csv(sub_samp, encoding="gbk") + + df_segment_test_result.loc[len(df_segment_test_result)] = [exam_name, sub_exam_path.name, + *df_temp.iloc[0].tolist()] + for i in range(5, len(df_temp)): + df_segment_test_result.loc[len(df_segment_test_result)] = [exam_name, sub_exam_path.name, + *df_temp.iloc[i].tolist()] + + for sub_samp in segment_all_test_result_path.glob("*_all_test_segment_all_metrics.csv"): + df_temp = pd.read_csv(sub_samp, encoding="gbk") + + for i in range(0, len(df_temp)): + df_segment_all_test_result.loc[len(df_segment_all_test_result)] = [exam_name, sub_exam_path.name, + *df_temp.iloc[i].tolist()] + + for sub_samp in event_test_result_path.glob("*_test_event_all_metrics.csv"): + df_temp = pd.read_csv(sub_samp, encoding="gbk") + + df_event_test_result.loc[len(df_event_test_result)] = [exam_name, sub_exam_path.name, + *df_temp.iloc[0].tolist()] + for i in range(5, len(df_temp)): + df_event_test_result.loc[len(df_event_test_result)] = [exam_name, sub_exam_path.name, + *df_temp.iloc[i].tolist()] + + for sub_samp in event_all_test_result_path.glob("*_all_test_event_all_metrics.csv"): + df_temp = pd.read_csv(sub_samp, encoding="gbk") + + for i in range(0, len(df_temp)): + df_event_all_test_result.loc[len(df_event_all_test_result)] = [exam_name, sub_exam_path.name, + *df_temp.iloc[i].tolist()] + + df_event_test_result.to_csv(save_path / "event_test_result_metrics.csv", index=False, encoding="GBK") + df_segment_test_result.to_csv(save_path / "segment_test_result_metrics.csv", index=False, encoding="GBK") + df_event_all_test_result.to_csv(save_path / "event_all_test_result_metrics.csv", index=False, encoding="GBK") + df_segment_all_test_result.to_csv(save_path / "segment_all_test_segment_metrics.csv", index=False, + encoding="GBK") + + return + +if __name__ == '__main__': + pass
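A note on the event-level post-processing used by segment_to_event() in exam/033/test_save_result.py: the per-second binary segment predictions are first dilated so that gaps shorter than thresh_event_interval are filled, and only positive runs strictly longer than thresh_event_length are kept as predicted events. A minimal, self-contained sketch of that run-length logic (function name and example data are illustrative, not taken from the repository):

import numpy as np


def extract_events(pred, gap=2, min_len=8):
    """Reduce a 1-D array of per-second 0/1 predictions to (start, end) events."""
    pred = np.asarray(pred)
    filled = pred.copy()
    # Fill short gaps: every positive second marks the next (gap - 1) seconds positive,
    # mirroring the range(1, thresh_event_interval) expansion loop in segment_to_event().
    for i, p in enumerate(pred):
        if p == 1:
            filled[i + 1:i + gap] = 1
    # Run-length encode the filled track: change points give run starts and lengths.
    change = np.concatenate(([True], filled[:-1] != filled[1:], [True]))
    starts = np.where(change)[0][:-1]
    lengths = np.diff(np.where(change)[0])
    # Keep positive runs strictly longer than min_len seconds.
    return [(s, s + n) for s, n in zip(starts, lengths) if filled[s] == 1 and n > min_len]


# Two 5 s runs separated by a 1 s gap merge into a single 11 s event:
print(extract_events([0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1], gap=2, min_len=8))  # [(1, 12)]

With the defaults in the test script (thresh_event_interval = 2, thresh_event_length = 8), this reports events of at least 9 s and tolerates single-second dropouts in the prediction track.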