100 lines
3.8 KiB
Python
100 lines
3.8 KiB
Python
|
|
import argparse
|
|
from pathlib import Path
|
|
from lxml import etree
|
|
from tqdm import tqdm
|
|
from collections import Counter
|
|
|
|
|
|
# Directory scanned when neither the caller nor the command line supplies one.
# The sibling ".../annotations-events-nsrr/shhs1" directory can be passed
# instead to scan the first SHHS visit.
_DEFAULT_TARGET_DIR = (
    "/mnt/disk_wd/marques_dataset/shhs/polysomnography/annotations-events-nsrr/shhs2"
)

# Placeholder used when an event has no usable EventType / EventConcept text.
_MISSING_TEXT = "N/A"


def _resolve_target_dir(target_dir):
    """Return the directory to scan as a ``Path``.

    An explicit *target_dir* always wins.  When it is ``None`` the optional
    first command-line argument is used, falling back to
    ``_DEFAULT_TARGET_DIR`` when the script is run without arguments.
    """
    if target_dir is not None:
        return Path(target_dir)

    parser = argparse.ArgumentParser(
        description="Count (EventType, EventConcept) pairs in annotation XML files."
    )
    parser.add_argument(
        "target_dir",
        nargs="?",
        default=_DEFAULT_TARGET_DIR,
        help="directory containing the *.xml annotation files "
             "(default: %(default)s)",
    )
    # parse_known_args: tolerate unrelated flags when main() is invoked from
    # inside another program's process instead of aborting on them.
    args, _unknown = parser.parse_known_args()
    return Path(args.target_dir)


def _child_text(event, tag):
    """Return the stripped text of child *tag* of *event*, or ``"N/A"``.

    The placeholder is returned when the child is absent, has no text, or
    contains only whitespace.
    """
    node = event.find(tag)
    text = node.text if node is not None else None
    return (text or "").strip() or _MISSING_TEXT


def _count_events(xml_files):
    """Count (EventType, EventConcept) pairs over all ``ScoredEvent`` nodes.

    Files that cannot be parsed or processed are reported on stdout and
    skipped; they contribute nothing to the returned ``Counter``.
    """
    totals = Counter()
    for xml_file in tqdm(xml_files, desc="Processing XMLs", unit="file"):
        try:
            root = etree.parse(str(xml_file)).getroot()
            # Search the whole tree for ScoredEvent nodes rather than relying
            # on a fixed PSGAnnotation -> ScoredEvents -> ScoredEvent path.
            per_file = Counter(
                (_child_text(event, "EventType"), _child_text(event, "EventConcept"))
                for event in root.findall(".//ScoredEvent")
            )
        except etree.XMLSyntaxError:
            print(f"\n[警告] 文件格式错误,跳过: {xml_file.name}")
        except Exception as e:
            # Best-effort batch job: report the file and keep going.
            print(f"\n[错误] 处理文件 {xml_file.name} 时出错: {e}")
        else:
            # Merge only after the whole file succeeded so a failure halfway
            # through never leaves partial counts behind.
            totals.update(per_file)
    return totals


def _print_table(stats_counter):
    """Print the counts as an aligned table sorted by type, then concept."""
    # Columns are at least as wide as their header text.
    type_width = max(len("EventType"), *(len(t) for t, _ in stats_counter))
    conc_width = max(len("EventConcept"), *(len(c) for _, c in stats_counter))
    # 10 for the right-aligned count column, 6 for the two " | " separators.
    line_width = type_width + conc_width + 10 + 6

    print("\n" + "=" * line_width)
    print(f"{'EventType':<{type_width}} | {'EventConcept':<{conc_width}} | {'Count':>10}")
    print("-" * line_width)

    # Tuples sort lexicographically: EventType first, then EventConcept.
    for (e_type, e_concept), count in sorted(stats_counter.items()):
        print(f"{e_type:<{type_width}} | {e_concept:<{conc_width}} | {count:>10}")

    print("=" * line_width)


def main(target_dir=None) -> None:
    """Scan a directory of annotation XML files and print event statistics.

    Every ``*.xml`` file directly inside the directory (no recursion) is
    parsed, each ``ScoredEvent`` node's ``EventType`` / ``EventConcept`` pair
    is counted, and the totals are printed as a table on stdout.

    Args:
        target_dir: Directory to scan (``str`` or ``Path``).  When ``None``,
            the first command-line argument is used if present, otherwise
            the built-in default directory.

    Returns:
        None.  All results and diagnostics are written to stdout.
    """
    folder_path = _resolve_target_dir(target_dir)

    if not folder_path.exists():
        print(f"错误: 路径 '{folder_path}' 不存在。")
        return

    # Flat listing only; sorted so processing order is reproducible.
    xml_files = sorted(folder_path.glob("*.xml"))
    total_files = len(xml_files)

    if not xml_files:
        print(f"在 '{folder_path}' 中没有找到 XML 文件。")
        return

    print(f"找到 {total_files} 个 XML 文件,准备开始处理...")

    stats_counter = _count_events(xml_files)

    if stats_counter:
        _print_table(stats_counter)
    else:
        print("\n未提取到任何事件数据。")

    print("=" * 90)
    print(f"统计完成。共扫描 {total_files} 个文件。")
|
|
|
|
|
|
# Script entry point: run the scan only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()