2026-03-27 16:01:28 +08:00
|
|
|
|
"""Enrichment 共享:去重键与 object_json 规范化(sync/async 共用)。"""
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
2026-04-03 11:43:16 +08:00
|
|
|
|
# Common ways the narrator is referred to (first-person pronouns and generic
# labels). normalize_subject maps any of these to narrator_name or "叙述者".
_NARRATOR_ALIASES: frozenset[str] = frozenset(
    (
        "我",
        "本人",
        "人物",
        "叙述者",
        "讲述者",
        "老人",
        "自己",
        "咱们",
    )
)
|
2026-03-27 16:01:28 +08:00
|
|
|
|
|
2026-04-03 11:43:16 +08:00
|
|
|
|
|
|
|
|
|
|
def normalize_subject(subject: str | None, narrator_name: str | None = None) -> str:
|
|
|
|
|
|
"""将代词/泛称映射为统一 subject,便于去重与检索。"""
|
|
|
|
|
|
s = (subject or "").strip()
|
|
|
|
|
|
if not s:
|
|
|
|
|
|
return narrator_name or "叙述者"
|
|
|
|
|
|
if s in _NARRATOR_ALIASES:
|
|
|
|
|
|
return narrator_name or "叙述者"
|
|
|
|
|
|
return s
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def dedupe_key(f: dict, *, narrator_name: str | None = None) -> tuple:
    """Build a hashable key that identifies a fact for deduplication.

    Args:
        f: Fact mapping. The "subject", "predicate" and "object_json"
            entries are read; each may be absent.
        narrator_name: Forwarded to ``normalize_subject`` so narrator
            aliases collapse onto one subject.

    Returns:
        A ``(subject, predicate, object)`` tuple of strings. The object part
        is the JSON text of "object_json" with sorted keys, ``""`` when the
        value is ``None`` or absent, or ``str(value)`` when the value cannot
        be serialized as JSON.
    """
    raw_subject = f.get("subject")
    if raw_subject and not isinstance(raw_subject, str):
        # normalize_subject strips its input, so a truthy non-string
        # (e.g. a number from decoded JSON) must be coerced first.
        raw_subject = str(raw_subject)
    subject = normalize_subject(raw_subject, narrator_name)

    # Falsy predicates count as empty; coerce anything else to text before
    # stripping so a non-string predicate cannot raise AttributeError.
    raw_predicate = f.get("predicate") or ""
    if not isinstance(raw_predicate, str):
        raw_predicate = str(raw_predicate)
    predicate = raw_predicate.strip()

    obj = f.get("object_json")
    if obj is None:
        serialized = ""
    else:
        try:
            # sort_keys makes dicts that differ only in key order compare equal.
            serialized = json.dumps(obj, sort_keys=True, ensure_ascii=False)
        except (TypeError, ValueError):
            # Not JSON-serializable: fall back to the plain string form.
            serialized = str(obj)
    return (str(subject), str(predicate), serialized)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_object_json(obj: Any) -> dict | list | None:
|
|
|
|
|
|
if obj is None:
|
|
|
|
|
|
return None
|
|
|
|
|
|
if isinstance(obj, (dict, list)):
|
|
|
|
|
|
return obj
|
|
|
|
|
|
return {"value": obj}
|