Add PPG signal processing and SpO2 calculation modules

2025-11-19 16:27:04 +08:00 · 2025-11-19 16:27:04 +08:00 · 667bdc8213
commit 667bdc8213
5 changed files with 623 additions and 0 deletions
--- a/PPG2SpO2.py
+++ b/PPG2SpO2.py
@ -0,0 +1,114 @@
 from pathlib import Path
 import numpy as np
 import pandas as pd
 from matplotlib import pyplot as plt
 import os
 os.environ["DISPLAY"] = "localhost:10.0"
 from func import calculate_hr_spo2_zhihu, ppg2spo2_pipeline, func_1
 from spo2_pipeline import spo2_pipeline
 def read_data(file_path):
    """
    Read data from a file and return it as a list of floats.
    Args:
        file_path (str or Path): The path to the file to read.
    Returns:
        list of float: The data read from the file.
    """
    df = pd.read_csv(file_path)
    # red = df['red'].values[300:]
    # ir = df['ied'].values[300:]
    red = df['red'].values[50:]
    ir = df['ied'].values[50:]
    return red, ir
 def read_labels(file_path):
    """
    Read labels from a file and return them as a list of integers.
    Args:
        file_path (str or Path): The path to the file to read.
    Returns:
        list of int: The labels read from the file.
    """
    df = pd.read_csv(file_path)
    timestamp = df['timestamp'].values
    hr_label = df['心率_值'].values
    spo2_label = df['血氧组_值'].values.astype(float)
    r_value = df['R组_均方根'].values
    np.place(spo2_label, spo2_label == 0, np.nan)
    return timestamp, hr_label, spo2_label, r_value
 def draw_compare(red_signal, ir_signal, spo2_values, hr_values, r_values, fs=25):
    """
    Draw comparison plots for red and IR signals along with SpO2 values.
    Args:
        red_signal (array-like): The red light signal data.
        ir_signal (array-like): The infrared light signal data.
        spo2_values (array-like): The computed SpO2 values.
        fs (int): Sampling frequency in Hz. Default is 25.
    """
    time_axis = [i / fs for i in range(len(red_signal))]
    plt.figure(figsize=(15, 10))
    plt.subplot(3, 1, 1)
    plt.plot(time_axis, red_signal, label='Red Signal', color='red')
    plt.plot(time_axis, ir_signal, label='IR Signal', color='blue')
    plt.title('Red and IR Signals')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.legend()
    plt.subplot(3, 1, 2)
    plt.plot(spo2_values, label='Computed SpO2', color='green')
    plt.title('Computed SpO2 Values')
    plt.xlabel('Beats')
    plt.ylabel('SpO2 (%)')
    plt.legend()
    plt.subplot(3, 1, 3)
    plt.plot(hr_values, label='Computed Heart Rate', color='orange')
    plt.title('Computed Heart Rate Values')
    plt.xlabel('Beats')
    plt.ylabel('Heart Rate (bpm)')
    plt.legend()
    plt.tight_layout()
    plt.show()
 if __name__ == '__main__':
    file_path = Path("./data/spo2hr[42-A3-C5-2F-F7-32]2025.11.17_17.12.15.csv")
    # file_path = "./data/spo2hr[42-A3-C5-2F-F7-32]2025.11.17_17.01.13.csv"
    red_signal, ir_signal = read_data(file_path)
    label_path = file_path.parent / "spo2_alg_info[42_A3_C5_2F_F7_32]2025.11.17_17.12.16.csv"
    timestamp, hr_label, spo2_label, r_value = read_labels(label_path)
    # draw_compare(red_signal, ir_signal, spo2_label, hr_label, r_value, fs=25)
    # df, summary = spo2_pipeline(
    #     red_signal,
    #     ir_signal,
    #     fs=25,
    #     calibrate_with_ref=spo2_label,
    #     A=104.0,
    #     B=17.0
    # )
    # print(summary)
    # print(df)
    # HR_result, SpO2_result = ppg2spo2_pipeline(red_signal, ir_signal, fs=25)
    func_1(red_signal, ir_signal, fs=25)
--- a/func.py
+++ b/func.py
@ -0,0 +1,243 @@
 import numpy as np
 import pandas as pd
 from scipy import signal
 from matplotlib import pyplot as plt
 def calculate_hr_spo2_zhihu(X, fs, FFT_size=512):
    """
    X          : ndarray, shape (N, 2)   # 第0列 RED，第1列 IR
    fs         : 采样率 (Hz)
    FFT_size   : FFT 点数（建议 2 的幂）
    返回值     : (Heart_Rate, SpO2_Level)  已经取整
    https://zhuanlan.zhihu.com/p/658858641
    https://github.com/thinkng/ppgprocessing/blob/master/HR_SpO2_Estimation.m
    """
    X = np.asarray(X)
    if X.ndim != 2 or X.shape[1] != 2:
        raise ValueError("X 必须是 (样本数, 2) 的数组，列0=RED，列1=IR")
    # 计算能处理的完整窗口数量（每个窗口长度 = FFT_size）
    step = fs                      # MATLAB 中是 n*fs 开始，步长为 fs（1 秒）
    n_windows = int((len(X) / (2 * fs)) - 2)   # 与原 MATLAB 一致
    HEART_RATE = np.zeros(n_windows)
    SpO2       = np.zeros(n_windows)
    for n in range(n_windows):
        start_idx = n * step
        end_idx   = start_idx + FFT_size
        y1 = X[start_idx:end_idx, 0]   # RED
        y2 = X[start_idx:end_idx, 1]   # IR
        # ---------- FFT RED ----------
        Y1 = np.fft.fft(y1, n=FFT_size)
        Y1_abs = np.abs(Y1[:FFT_size//2 + 1])
        f1 = fs / 2 * np.linspace(0, 1, FFT_size//2 + 1)
        # ---------- FFT IR ----------
        Y2 = np.fft.fft(y2, n=FFT_size)
        Y2_abs = np.abs(Y2[:FFT_size//2 + 1])
        f2 = f1.copy()   # 与 f1 完全相同
        # ---------- 在 0.5~2.5 Hz（对应心率 30~150 bpm）范围内找局部最大 ----------
        # MATLAB 中索引 6:12 对应频率大约 0.6~1.4 Hz（取决于 FFT_size/fs）
        # 这里统一取频率 0.5~2.5 Hz 对应的索引
        idx_range = np.where((f1 >= 0.5) & (f1 <= 2.5))[0]
        # RED 峰值索引
        segment_red = Y1_abs[idx_range]
        local_max_i = np.argmax(segment_red)
        pk_RED_i = idx_range[local_max_i]
        # IR 峰值索引
        segment_ir = Y2_abs[idx_range]
        local_max_i = np.argmax(segment_ir)
        pk_IR_i = idx_range[local_max_i]
        # ---------- 心率 ----------
        heart_rate_bpm = f2[pk_IR_i] * 60
        HEART_RATE[n] = heart_rate_bpm
        # ---------- SpO2 ----------
        R_RED = Y1_abs[pk_RED_i] / (Y1_abs[0] + 1e-12)   # 防止除以 0
        R_IR  = Y2_abs[pk_IR_i]  / (Y2_abs[0]  + 1e-12)
        R = R_RED / R_IR
        spo2 = 104 - 28 * R
        SpO2[n] = spo2
    # 去掉首尾（与原 MATLAB 相同）
    if len(HEART_RATE) > 2:
        HR_mean = np.mean(HEART_RATE[1:-1])
        SpO2_mean = np.mean(SpO2[1:-1])
    else:
        HR_mean = np.mean(HEART_RATE)
        SpO2_mean = np.mean(SpO2)
    Heart_Rate = round(HR_mean)
    SpO2_Level = round(SpO2_mean)
    return Heart_Rate, SpO2_Level
 def _culculate_spo2(ir_list_data, red_list_data):
    ir_dc = min(ir_list_data)
    red_dc = min(red_list_data)
    ir_ac = max(ir_list_data) - ir_dc
    red_ac = max(red_list_data) - red_dc
    temp1 = ir_ac * red_dc
    if temp1 < 1:
        temp1 = 1
    R2 = (red_ac * ir_dc) / temp1
    SPO2 = -45.060 * R2 * R2 + 30.354 * R2 + 94.845
    if SPO2 > 100 or SPO2 < 0:
        SPO2 = 0
    return SPO2
 def _culculate_HR(ir_list_data_filtered, data_list_time):
    HR_num = signal.find_peaks(ir_list_data_filtered, distance=10)[0]
    time = data_list_time[-1] -data_list_time[0]
    HR = len(HR_num) / (time / 1000) * 60
    return HR
 def process_signal(signal_segment, fs, highpass=True):
    if highpass:
        h_b, h_a = signal.butter(N=8, Wn=1/(fs/2), btype="highpass", output="ba")
        data = signal.filtfilt(h_b, h_a, signal_segment, axis=0)
    else:
        data = signal_segment
    data = signal.detrend(data,
                          axis=0,
                          type='linear',
                          bp=0,
                            overwrite_data=False)
    return data
 def ppg2spo2_pipeline(red, ir, fs=25):
    """
    采用滑窗分析法计算心率和血氧饱和度
    每秒中输出结果
    red : ndarray, 红光信号
    ir  : ndarray, 红外光信号
    fs  : 采样率 (Hz)
    """
    red = np.asarray(red).reshape(-1)
    ir  = np.asarray(ir).reshape(-1)
    if len(red) != len(ir):
        raise ValueError("红光和红外光信号长度必须相同")
    red_filtered = process_signal(red, fs, highpass=False)
    ir_filtered  = process_signal(ir, fs, highpass=True)
    bpm_list_data = []
    spo2_list_data = []
    temp_bpm_list_data = []
    temp_spo2_list_data = []
    for i in range(len(red)//fs - 1):
        red_segment = red_filtered[i*fs:(i+1)*fs]
        ir_segment  = ir_filtered[i*fs:(i+1)*fs]
        spo2 = _culculate_spo2(ir_segment, red_segment)
        bpm = _culculate_HR(ir_segment, np.arange(i*fs, (i+1)*fs) * (1000/fs))
        temp_bpm_list_data.append(bpm)
        temp_spo2_list_data.append(spo2)
    # matlab
    # python
    plt.figure(figsize=(10, 5))
    timestamp = np.linspace(0, len(red_filtered)/fs, len(red_filtered))
    ax1 = plt.subplot(211)
    plt.plot( timestamp, red, label='Red Signal', color='red', alpha=0.5)
    plt.plot(timestamp,ir, label='IR Signal', color='blue', alpha=0.5)
    plt.title('Raw PPG Signals')
    plt.xlabel("seconds")
    plt.ylabel('Amplitude')
    plt.legend()
    plt.subplot(212, sharex=ax1)
    plt.plot(timestamp,red_filtered, label='Filtered Red Signal', color='red', alpha=0.5)
    plt.plot(timestamp,ir_filtered, label='Filtered IR Signal', color='blue', alpha=0.5)
    plt.title('Filtered PPG Signals')
    plt.xlabel("seconds")
    plt.ylabel('Amplitude')
    plt.legend()
    plt.show()
 def func_1(red, ir, fs=25):
    # Processing_PPG_Signal
    # make a move window find min and max of ArrayIR
    def movmin1(A, k):
        x = A.rolling(k, min_periods=1, center=True).min().to_numpy()  #
        return x
    def movmax1(A, k):
        x = A.rolling(k, min_periods=1, center=True).max().to_numpy()
        return x
    ArrayIR = pd.DataFrame(ir)
    ArrayRed = pd.DataFrame(red)
    # calculate ac/dc ir
    max_ir = movmax1(ArrayIR, fs)
    # print(f"max_ir: {max_ir}")
    min_ir = movmin1(ArrayIR, fs)
    # print(f"min_ir: {min_ir}")
    baseline_data_ir = (max_ir + min_ir) / 2
    # print(f"baseline_data_ir: {baseline_data_ir}")
    acDivDcIr = (max_ir - min_ir) / baseline_data_ir
    # calculate ac/dc red
    max_red = movmax1(ArrayRed, fs)
    min_red = movmin1(ArrayRed, fs)
    baseline_data_red = (max_red + min_red) / 2
    acDivDcRed = (max_red - min_red) / baseline_data_red
    # Plot SPO2 = 110-25*(ac/dc_red)/(ac/dc_ir)
    SPO2 = 110 - 25 * (acDivDcRed / acDivDcIr)
    # plt.figure("SPO2")
    timestamp = np.linspace(0, len(red) / fs, len(red))
    plt.figure(figsize=(10, 5))
    ax1 = plt.subplot(311)
    plt.plot(timestamp, red, label='Red Signal', color='red', alpha=0.5)
    plt.plot(timestamp, ir, label='IR Signal', color='blue', alpha=0.5)
    plt.title('Raw PPG Signals')
    plt.xlabel("seconds")
    plt.ylabel('Amplitude')
    plt.legend()
    plt.subplot(312, sharex=ax1)
    plt.plot(timestamp, red - baseline_data_red, label='Detrended Red Signal', color='red', alpha=0.5)
    plt.plot(timestamp, ir - baseline_data_ir, label='Detrended IR Signal', color='blue', alpha=0.5)
    plt.title('Detrended PPG Signals')
    plt.xlabel("seconds")
    plt.ylabel('Amplitude')
    plt.legend()
    plt.subplot(313, sharex=ax1)
    plt.plot(timestamp,acDivDcRed, label='AC/DC Red', color='red', alpha=0.5)
    plt.plot(timestamp,acDivDcIr, label='AC/DC IR', color='blue', alpha=0.5)
    plt.title('AC/DC Ratios')
    plt.xlabel("seconds")
    plt.ylabel('Ratio')
    plt.legend()
    plt.show()
    plt.xlabel("Samples")
    plt.ylabel("SPO2")
    plt.title("SPO2")
    plt.plot(timestamp, SPO2)
    plt.show()
--- a/process_spo2_alg_info.py
+++ b/process_spo2_alg_info.py
@ -0,0 +1,100 @@
 import re
 import pandas as pd
 from pathlib import Path
 from typing import List, Dict
 def parse_line(line: str) -> Dict:
    """
    解析一行数据，返回一个字典
    例如输入:
    [300]心率:78,83,83,83,;标准差1:43.0491;标准差2:2.16506;均方根:82.2787;值:82;信任级别:1     血氧组:;标准差1:0;标准差:0;均方根:0;值:0;信任级别:0结果延迟:1     R组:;均方根:0;     仪器:60
 [325]心率:75,78,83,83,;标准差1:38.6641;标准差2:3.4187;均方根:80.3232;值:81;信任级别:1     血氧组:;标准差1:0;标准差:0;均方根:0;值:0;信任级别:0结果延迟:2     R组:;均方根:0;     仪器:60
 [350]心率:78,83,83,83,;标准差1:46.0893;标准差2:2.16506;均方根:82.2787;值:82;信任级别:1     血氧组:;标准差1:0;标准差:0;均方根:0;值:0;信任级别:0结果延迟:3     R组:;均方根:0;     仪器:60
 [375]心率:83,83,83,107,;标准差1:43.8422;标准差2:10.3923;均方根:90.1047;值:82;信任级别:0     血氧组:;标准差1:0;标准差:0;均方根:0;值:0;信任级别:0结果延迟:3     R组:;均方根:0;     仪器:60
 [400]心率:83,83,107,107,;标准差1:47.7501;标准差2:12;均方根:96.2549;值:82;信任级别:0     血氧组:;标准差1:0;标准差:0;均方根:0;值:0;信任级别:0结果延迟:3     R组:;均方根:0;     仪器:60
    """
    line = line.strip()
    if not line:
        return None
    # 提取时间戳 [300] 这样的
    time_match = re.match(r'\[(\d+)\]', line)
    if not time_match:
        return None
    timestamp = int(time_match.group(1))
    # 去掉时间戳部分，后面全部是键值对
    content = line[time_match.end():]
    content = content.split("     ")
    result = {"timestamp": timestamp}
    for block in content:
        parsed_block = parse_block(block)
        result.update(parsed_block)
    return result
 def parse_block(block: str) -> Dict:
    """
    解析一个数据块，返回一个字典
    例如输入:
    心率:78,83,83,83,;标准差1:43.0491;标准差2:2.16506;均方根:82.2787;值:82;信任级别:1
    血氧组:;标准差1:0;标准差:0;均方根:0;值:0;信任级别:0结果延迟:1
    R组:;均方根:0;
    仪器:60
    """
    data = {}
    parts = block.split(';')
    if len(parts) == 1:
        key, value = parts[0].split(':', 1)
        data[key.strip()] = value.strip()
    else:
        key_0 = parts[0].split(':', 1)[0].strip()
        for part in parts:
            if len(part.strip()) == 0:
                continue
            if "结果延迟" in part:
                part0, key1, value1 = part.partition('结果延迟:')
                data[f"{key_0}_结果延迟"] = value1.strip()
                key2, _, value2 = part0.partition(':')
                if key2 and value2:
                    data[f"{key_0}_{key2.strip()}"] = value2.strip()
            else:
                key, value = part.split(':', 1)
                data[f"{key_0}_{key.strip()}"] = value.strip()
    return data
 def parse_dat_file(file_path) -> pd.DataFrame:
    """
    读取整个 .dat 文件并返回 pandas DataFrame
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            parsed = parse_line(line)
            if parsed:
                data.append(parsed)
            else:
                if line.strip():
                    print(f"第 {line_num} 行解析失败，已跳过: {line.strip()[:80]}...")
    df = pd.DataFrame(data)
    # 按时间戳排序（一般已经是顺序，但保险起见）
    if not df.empty:
        df = df.sort_values("timestamp").reset_index(drop=True)
    return df
 if __name__ == '__main__':
    # file_path = "./data/spo2_alg_info[42-A3-C5-2F-F7-32]2025.11.17_17.01.14.dat"
    file_path = "./data/spo2_alg_info[42_A3_C5_2F_F7_32]2025.11.17_17.12.16.dat"
    file_path = Path(file_path)
    df = parse_dat_file(file_path)
    df.to_csv(file_path.with_suffix('.csv'), index=False, encoding='utf-8-sig')
--- a/process_spo2_hr.py
+++ b/process_spo2_hr.py
@ -0,0 +1,52 @@
 '''样例数据
 red:9188561;ied:10745056;accX:209;accY:-438;accZ:-169
 red:9185136;ied:10744763;accX:209;accY:-438;accZ:-171
 red:9182875;ied:10744312;accX:209;accY:-438;accZ:-171
 red:9181788;ied:10744212;accX:209;accY:-441;accZ:-163
 red:9180729;ied:10744341;accX:209;accY:-441;accZ:-163
 red:9179405;ied:10742269;accX:212;accY:-440;accZ:-172
 red:9178563;ied:10741796;accX:212;accY:-440;accZ:-172
 red:9178557;ied:10742843;accX:209;accY:-440;accZ:-166
 red:9178185;ied:10742777;accX:209;accY:-440;accZ:-166
 red:9177053;ied:10737766;accX:210;accY:-440;accZ:-166
 '''
 import numpy as np
 import pandas as pd
 from pathlib import Path
 def parse_dat_line(line: str) -> dict:
    """
    解析一行 .dat 文件内容，返回一个字典
    例如输入:
    red:9188561;ied:10745056;accX:209;accY:-438;accZ:-169
    """
    data = {}
    parts = line.strip().split(';')
    for part in parts:
        if len(part.strip()) == 0:
            continue
        key, value = part.split(':', 1)
        data[key.strip()] = int(value.strip())
    return data
 def parse_dat_file(file_path) -> pd.DataFrame:
    """
    读取整个 .dat 文件并返回 pandas DataFrame
    :param file_path:
    :return:
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parsed_line = parse_dat_line(line)
            records.append(parsed_line)
    df = pd.DataFrame(records)
    return df
 if __name__ == '__main__':
    # file_path = './data/spo2hr[42-A3-C5-2F-F7-32]2025.11.17_17.01.13.dat'
    file_path = './data/spo2hr[42-A3-C5-2F-F7-32]2025.11.17_17.12.15.dat'
    file_path = Path(file_path)
    df = parse_dat_file(file_path)
    df.to_csv(file_path.with_suffix('.csv'), index=False)
--- a/spo2_pipeline.py
+++ b/spo2_pipeline.py
@ -0,0 +1,114 @@
 import numpy as np
 import pandas as pd
 from scipy.signal import butter, filtfilt, find_peaks
 def bandpass_filter(x, fs, low=0.5, high=5.0, order=3):
    nyq = 0.5 * fs
    lown = low / nyq
    highn = high / nyq
    b, a = butter(order, [lown, highn], btype='band')
    y = filtfilt(b, a, x)
    return y
 def detrend(x):
    return x - np.mean(x)
 def detect_peaks(signal, fs, hr_min=40, hr_max=220):
    min_dist = int(np.floor(fs * 60.0 / hr_max))
    prominence = 0.3 * np.std(signal)
    peaks, props = find_peaks(signal, distance=min_dist, prominence=prominence)
    return peaks, props
 def compute_ac_dc_per_beat(signal, peaks, fs):
    n = len(signal)
    AC, DC, times, beat_windows = [], [], [], []
    for i, p in enumerate(peaks):
        if i == 0:
            start = 0
        else:
            start = (peaks[i-1] + p) // 2
        if i == len(peaks)-1:
            end = n-1
        else:
            end = (p + peaks[i+1]) // 2
        segment = signal[start:end+1]
        if segment.size < 3:
            AC.append(np.nan); DC.append(np.nan)
            times.append(p / fs)
            beat_windows.append((start, end))
            continue
        seg_max = np.max(segment)
        seg_min = np.min(segment)
        ac = (seg_max - seg_min) / 2.0
        dc = np.mean(segment)
        AC.append(ac); DC.append(dc)
        times.append(p / fs)
        beat_windows.append((start, end))
    return np.array(AC), np.array(DC), np.array(times), beat_windows
 def compute_spo2_from_R(R, A=104.0, B=17.0):
    return A - B * R
 def calibrate_linear(R_vals, spo2_ref):
    mask = ~np.isnan(R_vals) & ~np.isnan(spo2_ref)
    if mask.sum() < 2:
        return None
    coeffs = np.polyfit(R_vals[mask], spo2_ref[mask], 1)
    c1, c0 = coeffs[0], coeffs[1]
    A = c0; B = -c1
    return float(A), float(B)
 def spo2_pipeline(red, ir, fs=25, calibrate_with_ref=None, A=104.0, B=17.0):
    assert red.shape == ir.shape, "red and ir must have same shape"
    red_f = bandpass_filter(red, fs)
    ir_f = bandpass_filter(ir, fs)
    red_d = detrend(red_f)
    ir_d = detrend(ir_f)
    peaks, props = detect_peaks(ir_d, fs)
    ac_red, dc_red, times, _ = compute_ac_dc_per_beat(red_d, peaks, fs)
    ac_ir, dc_ir, _, _ = compute_ac_dc_per_beat(ir_d, peaks, fs)
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio_red = ac_red / dc_red
        ratio_ir  = ac_ir  / dc_ir
        R = ratio_red / ratio_ir
    spo2_vals = compute_spo2_from_R(R, A=A, B=B)
    used_A, used_B = A, B
    # if calibrate_with_ref is not None:
    #     if len(calibrate_with_ref) != len(R):
    #         raise ValueError("Reference SpO2 length mismatch with beats.")
    #     ref = np.array(calibrate_with_ref)
    #     fit = calibrate_linear(R, ref)
    #     if fit is not None:
    #         used_A, used_B = fit
    #         spo2_vals = compute_spo2_from_R(R, A=used_A, B=used_B)
    df = pd.DataFrame({
        "time_s": times,
        "AC_red": ac_red,
        "DC_red": dc_red,
        "AC_ir": ac_ir,
        "DC_ir": dc_ir,
        "R": R,
        "SpO2": spo2_vals
    })
    df['valid'] = (
        (~np.isnan(df['R'])) &
        (df['AC_red']>0) & (df['AC_ir']>0) &
        (df['DC_red']>0) & (df['DC_ir']>0) &
        (df['SpO2']>50) & (df['SpO2']<100)
    )
    summary = {
        "n_beats": len(df),
        "n_valid_beats": int(df['valid'].sum()),
        "mean_spo2": float(np.nanmean(df['SpO2'][df['valid']])) if df['valid'].any() else float('nan'),
        "median_spo2": float(np.nanmedian(df['SpO2'][df['valid']])) if df['valid'].any() else float('nan'),
        "used_A": used_A,
        "used_B": used_B
    }
    return df, summary