life-echo/api/app/features/memoir/chapter_evidence_snapshot.py

"""章节证据闭包：快照行 + chapter_evidence_links 是评测唯一证据来源。"""

from __future__ import annotations

import uuid
from datetime import datetime, timezone

from sqlalchemy import delete, func, select
from sqlalchemy.orm import Session, joinedload

from app.core.logging import get_logger
from app.features.conversation.lineage_schemas import aggregate_lineage_from_segments
from app.features.conversation.models import Conversation, Segment
from app.features.memoir.models import (
    Chapter,
    ChapterEvidenceLink,
    ChapterEvidenceSnapshot,
)
from app.features.story.models import StoryEvidenceLink

# Version tag written into the "schema_version" field of every payload built in
# this module; also the fallback used when a snapshot row is created from a
# payload that carries no schema_version.
EVIDENCE_SNAPSHOT_SCHEMA_VERSION = 1

# Module-level logger; the retry wrapper uses it to report failed refreshes.
logger = get_logger(__name__)


def _normalize_segment_ids(raw: object) -> list[str]:
    """Return the segment ids in ``raw`` as stripped, de-duplicated strings.

    Args:
        raw: Candidate id list (the caller passes ``chapter.source_segments``).
            Anything that is not a non-empty ``list`` yields an empty result.

    Returns:
        The ids in first-seen order. ``None`` entries and entries that are
        blank after stripping are skipped; every other entry is converted
        with ``str()``.
    """
    if not raw or not isinstance(raw, list):
        return []
    seen: set[str] = set()
    normalized: list[str] = []
    for item in raw:
        # A null entry must not be turned into the literal id "None".
        if item is None:
            continue
        sid = str(item).strip()
        if sid and sid not in seen:
            seen.add(sid)
            normalized.append(sid)
    return normalized


def _story_ids_ordered(chapter: Chapter) -> list[str]:
    """Return the chapter's linked story ids as strings, sorted by ``order_index``.

    Args:
        chapter: Object whose ``story_links`` attribute (optional, may be
            ``None``) holds link objects with ``order_index`` and ``story_id``.

    Returns:
        ``str(story_id)`` for every link with a truthy ``story_id``, in
        ascending ``order_index`` order. Links with equal keys keep their
        original relative order because the sort is stable.
    """
    links = getattr(chapter, "story_links", None) or []
    # A missing *or* None order_index sorts as 0; comparing None with an int
    # would otherwise raise TypeError inside sorted().
    ordered = sorted(links, key=lambda link: getattr(link, "order_index", 0) or 0)
    story_ids = (getattr(link, "story_id", None) for link in ordered)
    return [str(sid) for sid in story_ids if sid]


def _dedupe_ids(raw: list[str]) -> list[str]:
    """Strip each id, drop blanks, and remove duplicates keeping first-seen order."""
    stripped = (str(item).strip() for item in raw)
    # dict keys preserve insertion order, so fromkeys() de-duplicates stably.
    return list(dict.fromkeys(sid for sid in stripped if sid))


def _story_evidence_ids_for_chapter(
    session: Session, story_ids: list[str]
) -> tuple[list[str], list[str], list[str], list[str]]:
    """Collect the evidence ids bound to the given stories, grouped by type.

    Args:
        session: Session used to query ``StoryEvidenceLink``.
        story_ids: Stories whose evidence links are read. When empty, no query
            is issued.

    Returns:
        ``(chunk_ids, fact_ids, timeline_event_ids, summary_ids)``; each list
        is de-duplicated in first-seen order. Rows with a blank evidence id or
        an evidence type other than the four above are ignored.
    """
    if not story_ids:
        return [], [], [], []

    # One bucket per recognised evidence_type value.
    buckets: dict[str, list[str]] = {
        "chunk": [],
        "fact": [],
        "timeline_event": [],
        "summary": [],
    }
    query = select(
        StoryEvidenceLink.evidence_type,
        StoryEvidenceLink.evidence_id,
    ).where(StoryEvidenceLink.story_id.in_(story_ids))
    for raw_type, raw_id in session.execute(query).all():
        type_key = str(raw_type or "").strip()
        evidence_id = str(raw_id or "").strip()
        bucket = buckets.get(type_key)
        if evidence_id and bucket is not None:
            bucket.append(evidence_id)

    return (
        _dedupe_ids(buckets["chunk"]),
        _dedupe_ids(buckets["fact"]),
        _dedupe_ids(buckets["timeline_event"]),
        _dedupe_ids(buckets["summary"]),
    )


def build_chapter_evidence_closure_payload_sync(
    session: Session, chapter: Chapter
) -> dict:
    """
    Build the evidence-closure payload for a chapter.

    This is the single entry point for computing the closure: transcript
    evidence comes from the segments bound to the chapter
    (``chapter.source_segments``), and memory evidence comes only from
    ``StoryEvidenceLink`` rows; there is no live memory-closure fallback.

    Args:
        session: Session used for the evidence-link and segment queries.
        chapter: Chapter to snapshot; ``id``, ``user_id``, ``source_segments``
            and ``story_links`` are read.

    Returns:
        A dict with ``schema_version``, ``captured_at`` (UTC ISO-8601 string),
        ``chapter_id``, ``user_id``, the id lists ``segment_ids``,
        ``conversation_ids``, ``story_ids``, ``memory_chunk_ids``,
        ``memory_fact_ids``, ``timeline_event_ids`` and ``summary_ids``,
        ``notes`` (status markers), and ``message_lineage_json`` (``None``
        when no segment row was resolved).
    """
    uid = str(chapter.user_id)
    segment_ids = _normalize_segment_ids(chapter.source_segments)
    story_ids = _story_ids_ordered(chapter)
    chunk_ids, fact_ids, tl_ids, sum_ids = _story_evidence_ids_for_chapter(
        session, story_ids
    )

    segs: list = []
    conv_ids: list[str] = []
    if not segment_ids:
        notes = [
            "no_source_segments",
            "snapshot_materialized",
        ]
    else:
        # Only segments whose conversation belongs to the chapter's user and
        # has a NULL deleted_at are resolved.
        stmt = (
            select(Segment)
            .join(Conversation, Segment.conversation_id == Conversation.id)
            .where(
                Segment.id.in_(segment_ids),
                Conversation.user_id == uid,
                Conversation.deleted_at.is_(None),
            )
        )
        segs = list(session.execute(stmt).scalars().all())
        conv_ids = sorted({str(s.conversation_id) for s in segs if s.conversation_id})
        notes = ["snapshot_materialized"]
        if len(segs) < len(segment_ids):
            # Fewer rows than requested ids: some ids did not pass the filter.
            notes.append("some_segment_ids_unresolved_or_foreign_user")

    message_lineage_json = None
    if segs:
        # Restore the chapter's own segment order before aggregating lineage.
        order_map = {sid: i for i, sid in enumerate(segment_ids)}
        # Rows whose id string is not in the requested list sort after every
        # requested id, however long the list is (a fixed sentinel such as
        # 9999 would misplace them for very large chapters).
        unmatched_rank = len(segment_ids)
        segs_ordered = sorted(
            segs, key=lambda s: order_map.get(str(s.id), unmatched_rank)
        )
        message_lineage_json = aggregate_lineage_from_segments(
            segs_ordered,
            conversation_id_fallback=conv_ids[0] if conv_ids else None,
        )

    return {
        "schema_version": EVIDENCE_SNAPSHOT_SCHEMA_VERSION,
        "captured_at": datetime.now(timezone.utc).isoformat(),
        "chapter_id": str(chapter.id),
        "user_id": uid,
        "segment_ids": segment_ids,
        "conversation_ids": conv_ids,
        "story_ids": story_ids,
        "memory_chunk_ids": chunk_ids,
        "memory_fact_ids": fact_ids,
        "timeline_event_ids": tl_ids,
        "summary_ids": sum_ids,
        "notes": notes,
        "message_lineage_json": message_lineage_json,
    }


# Legacy name kept as an alias so that existing external imports do not break.
build_chapter_evidence_snapshot_sync = build_chapter_evidence_closure_payload_sync


def _replace_chapter_evidence_links_sync(
    session: Session, *, chapter_id: str, payload: dict
) -> None:
    """Replace all ``ChapterEvidenceLink`` rows of a chapter from a payload.

    Existing links for ``chapter_id`` are deleted first; then one link is added
    per id found in the payload's memory id lists. Nothing is flushed or
    committed here.

    Args:
        session: Session receiving the delete statement and the new rows.
        chapter_id: Chapter whose links are replaced.
        payload: Closure payload; the keys ``memory_chunk_ids``,
            ``memory_fact_ids``, ``timeline_event_ids`` and ``summary_ids``
            are read (a missing or empty key adds no links).
    """
    session.execute(
        delete(ChapterEvidenceLink).where(ChapterEvidenceLink.chapter_id == chapter_id)
    )

    # (payload key, evidence_type, role), processed in this order.
    link_specs = (
        ("memory_chunk_ids", "chunk", "primary"),
        ("memory_fact_ids", "fact", "supporting"),
        ("timeline_event_ids", "timeline_event", "supporting"),
        ("summary_ids", "summary", "background"),
    )
    for payload_key, evidence_type, role in link_specs:
        for evidence_id in payload.get(payload_key) or []:
            link = ChapterEvidenceLink(
                id=str(uuid.uuid4()),
                chapter_id=chapter_id,
                evidence_type=evidence_type,
                evidence_id=str(evidence_id),
                role=role,
            )
            session.add(link)


def refresh_chapter_evidence_snapshot_sync(session: Session, chapter_id: str) -> bool:
    """Write a new snapshot version, replace evidence links, repoint the chapter.

    Steps, in order: load the chapter (with its story links), build the closure
    payload, insert a ``ChapterEvidenceSnapshot`` row numbered one above the
    chapter's current highest ``version_no``, replace the chapter's
    ``ChapterEvidenceLink`` rows, and set ``current_evidence_snapshot_id`` to
    the new row. ``source_lineage_json`` is overwritten only when the payload
    carries lineage. The session is flushed but not committed.

    Args:
        session: Session used for all reads and writes.
        chapter_id: Id of the chapter to refresh.

    Returns:
        ``False`` when no chapter with that id exists, otherwise ``True``.
    """
    chapter_query = (
        select(Chapter)
        .where(Chapter.id == chapter_id)
        .options(joinedload(Chapter.story_links))
    )
    chapter = session.execute(chapter_query).unique().scalar_one_or_none()
    if not chapter:
        return False

    payload = build_chapter_evidence_closure_payload_sync(session, chapter)

    def _listed(key: str) -> list:
        # Fresh list copy of a payload entry; missing/None/empty becomes [].
        return list(payload.get(key) or [])

    version_query = select(
        func.coalesce(func.max(ChapterEvidenceSnapshot.version_no), 0)
    ).where(ChapterEvidenceSnapshot.chapter_id == chapter_id)
    latest_version = session.execute(version_query).scalar()
    next_version = int(latest_version or 0) + 1

    captured_at = datetime.now(timezone.utc)
    lineage = payload.get("message_lineage_json")
    snapshot = ChapterEvidenceSnapshot(
        id=str(uuid.uuid4()),
        chapter_id=str(chapter.id),
        user_id=str(chapter.user_id),
        version_no=next_version,
        schema_version=int(
            payload.get("schema_version") or EVIDENCE_SNAPSHOT_SCHEMA_VERSION
        ),
        segment_ids=_listed("segment_ids"),
        conversation_ids=_listed("conversation_ids"),
        story_ids=_listed("story_ids"),
        memory_chunk_ids=_listed("memory_chunk_ids"),
        memory_fact_ids=_listed("memory_fact_ids"),
        timeline_event_ids=_listed("timeline_event_ids"),
        summary_ids=_listed("summary_ids"),
        notes=_listed("notes"),
        message_lineage_json=lineage,
        captured_at=captured_at,
    )
    session.add(snapshot)
    # Flush the snapshot row before the links are replaced and the chapter
    # pointer is updated.
    session.flush()

    _replace_chapter_evidence_links_sync(
        session, chapter_id=str(chapter.id), payload=payload
    )
    chapter.current_evidence_snapshot_id = snapshot.id
    if lineage is not None:
        chapter.source_lineage_json = lineage
    session.flush()
    return True


def refresh_chapter_evidence_snapshot_with_retry_sync(
    session: Session, chapter_id: str
) -> bool:
    """
    Run `refresh_chapter_evidence_snapshot_sync`, retrying the whole refresh
    once on failure (2 attempts in total).

    Each attempt runs inside a SAVEPOINT (`session.begin_nested()`): when an
    attempt raises, only that attempt's work is rolled back and the session
    stays usable, so the retry starts from a clean state instead of hitting a
    session that is waiting for a rollback or re-using rows flushed by the
    failed attempt.

    Log lines are prefixed with `evidence_snapshot_refresh_failed` so they are
    easy to search for.

    Args:
        session: Session the refresh runs in.
        chapter_id: Id of the chapter to refresh.

    Returns:
        The result of the first attempt that does not raise (`False` when the
        chapter does not exist); `False` when every attempt raised. Exceptions
        are logged, never propagated.
    """
    max_attempts = 2
    last_exc: Exception | None = None
    for attempt in range(max_attempts):
        try:
            # Leaving the block normally releases the savepoint; leaving it
            # through an exception rolls back to it.
            with session.begin_nested():
                return refresh_chapter_evidence_snapshot_sync(session, chapter_id)
        except Exception as e:
            last_exc = e
            logger.warning(
                "evidence_snapshot_refresh_failed attempt={} chapter_id={}: {}",
                attempt + 1,
                chapter_id,
                e,
            )
    # Reaching this point means every attempt raised.
    if last_exc:
        logger.warning(
            "evidence_snapshot_refresh_failed exhausted chapter_id={}: {}",
            chapter_id,
            last_exc,
        )
    return False