life-echo/api/app/features/evaluation/judge_manual_service.py

"""手动触发 GLM 评审（不写 eval_runs）。"""

from __future__ import annotations

import re
from collections.abc import AsyncIterator
from typing import Any

from sqlalchemy.ext.asyncio import AsyncSession

from app.core.dependencies import get_eval_judge_langchain_llm
from app.core.logging import get_logger
from app.features.conversation import repo as conversation_repo
from app.features.evaluation.errors import (
    EvaluationBadRequestError,
    EvaluationNotFoundError,
)
from app.features.evaluation.execution_service import _assistant_text_for_eval_display
from app.features.evaluation.judge_service import EvalJudgeService
from app.features.evaluation.schemas import MemoirSectionBaselineOut
from app.features.evaluation.session_catalog_service import SessionCatalogService
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
from app.features.memoir.repo import get_chapters_for_memoir_list
from app.features.story.repo import get_stories_for_user

logger = get_logger(__name__)

# Default character cap used by _clip_md_for_judge for markdown handed to the judge.
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
# Upper bound on how many chapters are taken from the memoir chapter list.
_MAX_EVAL_CHAPTERS = 30
# Upper bound on how many stories are taken from the user's story list.
_MAX_EVAL_STORIES = 40
# How many conversations (from the head of the user's conversation list) feed the evidence.
_MAX_EVIDENCE_CONVERSATIONS = 8
# Default character cap used by _trim_evidence_text for the combined evidence transcript.
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000


def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
    """Return *text* stripped, shortened to *max_chars* characters when longer.

    A falsy *text* is treated as the empty string. When the stripped text is
    longer than *max_chars*, only its first *max_chars* characters are kept
    and a truncation notice is appended after a blank line.
    """
    cleaned = text.strip() if text else ""
    if len(cleaned) > max_chars:
        return cleaned[:max_chars] + "\n\n…（已截断供评审）"
    return cleaned


def _transcript_from_export_turns(turns: list[tuple[str, str]]) -> str:
    parts: list[str] = []
    for u, ai in turns:
        u = (u or "").strip()
        ai = (ai or "").strip()
        if u:
            parts.append(f"用户: {u}")
        if ai:
            parts.append(f"AI: {_assistant_text_for_eval_display(ai)}")
    return "\n\n".join(parts)


def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    """Return *text* stripped, shortened to *max_chars* characters when longer.

    A falsy *text* is treated as the empty string. When truncation happens,
    the kept prefix is followed by a blank line and a notice saying the
    interview evidence was cut.
    """
    stripped = (text or "").strip()
    fits = len(stripped) <= max_chars
    if fits:
        return stripped
    head = stripped[:max_chars]
    return f"{head}\n\n…（访谈证据已截断）"


async def _conversation_transcript_for_manual(
    db: AsyncSession, conversation_id: str
) -> str:
    """Load one conversation's messages and render them as a labelled transcript.

    Messages are fetched through ``conversation_repo.get_conversation_messages``.
    Content is stripped and blank messages are skipped. A role equal to
    ``"human"`` (case-insensitive) is labelled as the user; every other role is
    labelled ``AI`` and its text goes through
    ``_assistant_text_for_eval_display``. Entries are joined by a blank line.
    """
    messages = await conversation_repo.get_conversation_messages(conversation_id, db)
    lines: list[str] = []
    for message in messages:
        is_human = (message.role or "").lower() == "human"
        content = (message.content or "").strip()
        if not content:
            continue
        if is_human:
            lines.append(f"用户: {content}")
        else:
            lines.append(f"AI: {_assistant_text_for_eval_display(content)}")
    return "\n\n".join(lines)


async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
    """Build a combined, length-capped transcript from a user's conversations.

    Takes the first ``_MAX_EVIDENCE_CONVERSATIONS`` entries of the list returned
    by ``conversation_repo.get_user_conversations`` and walks them in reverse
    order (presumably the repo returns newest first, which would make the
    output chronological; verify against the repo). Each non-empty transcript
    gets a ``## 会话 <id>`` heading. The joined text is passed through
    ``_trim_evidence_text``. Returns ``""`` when the user has no conversations.
    """
    conversations = await conversation_repo.get_user_conversations(user_id, db)
    if not conversations:
        return ""
    selected = conversations[:_MAX_EVIDENCE_CONVERSATIONS]
    sections: list[str] = []
    for conversation in reversed(selected):
        conv_id = str(conversation.id)
        transcript = await _conversation_transcript_for_manual(db, conv_id)
        if not transcript:
            continue
        sections.append(f"## 会话 {conv_id}\n{transcript}")
    combined = "\n\n".join(sections)
    return _trim_evidence_text(combined)


def _normalize_title_key(title: str) -> str:
    t = (title or "").strip().lower()
    t = re.sub(r"^#+\s*", "", t)
    return re.sub(r"\s+", " ", t)


def _baseline_for_chapter_title(
    baselines: list[MemoirSectionBaselineOut],
    chapter_title: str,
    index: int,
) -> MemoirSectionBaselineOut | None:
    if baselines:
        key = _normalize_title_key(chapter_title)
        for b in baselines:
            if _normalize_title_key(b.title) == key:
                return b
        if 0 <= index < len(baselines):
            return baselines[index]
    return None


class EvalJudgeManualService:
    """Manually triggered judge runs over conversations and memoir content.

    The methods read data through the session catalog / repositories, call
    ``EvalJudgeService`` and return (or stream) the outcome to the caller.
    """

    # Guidance passed verbatim to the judge with every memoir chapter and story.
    _MEMOIR_EVIDENCE_NOTES = (
        "严格按文档打分；真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。"
    )

    def __init__(self, db: AsyncSession) -> None:
        """Keep the async DB session used by all lookups in this service."""
        self._db = db

    @staticmethod
    def _replay_transcript_from_messages(messages: list[Any]) -> str:
        """Render dialogue messages as a labelled transcript for the judge.

        Each message needs ``role`` and ``content`` attributes. A role equal to
        ``"human"`` (case-insensitive) is labelled as the user; any other role
        is labelled ``AI`` and its text goes through
        ``_assistant_text_for_eval_display``.

        Messages whose content is missing or whitespace-only are skipped.
        Without that, a conversation made only of blank messages would still
        produce label-only lines, and the callers' "no messages to judge"
        guard could never trigger for it.
        """
        parts: list[str] = []
        for m in messages:
            raw = m.content or ""
            if not raw.strip():
                continue
            if (m.role or "").lower() == "human":
                parts.append(f"用户: {raw}")
            else:
                parts.append(f"AI: {_assistant_text_for_eval_display(raw)}")
        return "\n\n".join(parts)

    async def judge_conversation(
        self,
        conversation_id: str,
        fixture_filename: str | None,
    ) -> dict[str, Any]:
        """Judge a stored conversation, optionally alongside a baseline fixture.

        Args:
            conversation_id: Conversation to judge; surrounding whitespace is
                ignored.
            fixture_filename: Optional user-export fixture whose turns form the
                baseline transcript. Blank or ``None`` means no baseline.

        Returns:
            A dict with ``conversation_id``, ``fixture_filename``, both
            transcripts, ``baseline_judge`` / ``replay_judge`` (dumped judge
            models or ``None``) and ``errors``, a list of string codes for the
            judge calls that produced no result.

        Raises:
            EvaluationBadRequestError: Blank ``conversation_id``, nothing to
                judge, or the fixture reader raised ``ValueError``.
            EvaluationNotFoundError: Conversation or fixture file not found.
        """
        cid = (conversation_id or "").strip()
        if not cid:
            raise EvaluationBadRequestError("conversation_id is required")

        catalog = SessionCatalogService(self._db)
        dialogue = await catalog.get_session_dialogue(cid)
        if not dialogue:
            raise EvaluationNotFoundError("conversation not found")

        replay_transcript = self._replay_transcript_from_messages(dialogue.messages)
        if not replay_transcript.strip():
            raise EvaluationBadRequestError("no messages to judge")

        fn = (fixture_filename or "").strip() or None
        baseline_transcript = ""
        if fn:
            try:
                turns, _ = read_user_export_fixture(fn)
                baseline_transcript = _transcript_from_export_turns(turns)
            except ValueError as e:
                raise EvaluationBadRequestError(str(e)) from e
            except FileNotFoundError as e:
                raise EvaluationNotFoundError("fixture not found") from e

        errors: list[str] = []
        # NOTE(review): unlike the SSE variant, there is no explicit check here
        # that the judge LLM is configured; a missing LLM is expected to show up
        # as a failed judge call below. Confirm against EvalJudgeService.
        judge_llm = get_eval_judge_langchain_llm()
        judge = EvalJudgeService(judge_llm)
        baseline_judge_dict: dict[str, Any] | None = None
        if baseline_transcript.strip():
            bj = await judge.judge_conversation(full_transcript=baseline_transcript)
            if bj:
                baseline_judge_dict = bj.model_dump()
            else:
                errors.append("baseline_glm_failed")
        elif fn:
            # A fixture was named but yielded no usable text.
            errors.append("baseline_transcript_empty")

        rj = await judge.judge_conversation(full_transcript=replay_transcript)
        replay_judge_dict = rj.model_dump() if rj else None
        if not rj:
            errors.append("replay_glm_failed")

        return {
            "conversation_id": cid,
            "fixture_filename": fn,
            "baseline_transcript": baseline_transcript,
            "replay_transcript": replay_transcript,
            "baseline_judge": baseline_judge_dict,
            "replay_judge": replay_judge_dict,
            "errors": errors,
        }

    async def iter_conversation_judge_sse(
        self,
        conversation_id: str,
        fixture_filename: str | None,
    ) -> AsyncIterator[dict[str, Any]]:
        """Yield SSE event dicts for a conversation judge run.

        Order of events: overall baseline score, overall replay score, then the
        streamed comparison and suggestions.

        Failures are reported as ``{"event": "error", ...}`` dicts instead of
        exceptions. Validation, load, fixture and config errors end the stream
        immediately, without a ``done`` event. A failed baseline score is
        reported and the run continues. A failed replay score is reported,
        followed by ``done``. On success the stream is ``meta``, an optional
        ``warning``, ``baseline_judge``, ``replay_judge``, any number of
        ``compare_delta`` chunks, and ``done``.
        """
        cid = (conversation_id or "").strip()
        if not cid:
            yield {
                "event": "error",
                "phase": "validate",
                "message": "conversation_id is required",
            }
            return

        catalog = SessionCatalogService(self._db)
        dialogue = await catalog.get_session_dialogue(cid)
        if not dialogue:
            yield {
                "event": "error",
                "phase": "load",
                "message": "conversation not found",
            }
            return

        replay_transcript = self._replay_transcript_from_messages(dialogue.messages)
        if not replay_transcript.strip():
            yield {"event": "error", "phase": "load", "message": "no messages to judge"}
            return

        fn = (fixture_filename or "").strip() or None
        baseline_transcript = ""
        if fn:
            try:
                turns, _ = read_user_export_fixture(fn)
                baseline_transcript = _transcript_from_export_turns(turns)
            except ValueError as e:
                yield {"event": "error", "phase": "fixture", "message": str(e)}
                return
            except FileNotFoundError:
                yield {
                    "event": "error",
                    "phase": "fixture",
                    "message": "fixture not found",
                }
                return

        judge_llm = get_eval_judge_langchain_llm()
        if not judge_llm:
            yield {
                "event": "error",
                "phase": "config",
                "message": "评审 LLM 未配置（eval_judge_api_key / zhipu_api_key）",
            }
            return

        judge = EvalJudgeService(judge_llm)
        yield {"event": "meta", "conversation_id": cid, "fixture_filename": fn}

        has_baseline = bool(baseline_transcript.strip())
        if not has_baseline:
            yield {
                "event": "warning",
                "message": "未提供基准 MD 或基准无文本：仅对回放对话打分并输出单侧改进建议",
            }

        baseline_judge = None
        if has_baseline:
            baseline_judge = await judge.judge_conversation(
                full_transcript=baseline_transcript
            )
            yield {
                "event": "baseline_judge",
                "ok": baseline_judge is not None,
                "judge": baseline_judge.model_dump() if baseline_judge else None,
            }
            if not baseline_judge:
                # Reported, but the replay side is still judged afterwards.
                yield {
                    "event": "error",
                    "phase": "baseline_glm",
                    "message": "基准整体打分失败（密钥、限流或 JSON 解析失败，见服务端日志）",
                }
        else:
            yield {
                "event": "baseline_judge",
                "ok": False,
                "skipped": True,
                "judge": None,
            }

        replay_judge = await judge.judge_conversation(full_transcript=replay_transcript)
        yield {
            "event": "replay_judge",
            "ok": replay_judge is not None,
            "judge": replay_judge.model_dump() if replay_judge else None,
        }
        if not replay_judge:
            # No replay score means no comparison is attempted.
            yield {
                "event": "error",
                "phase": "replay_glm",
                "message": "回放对话整体 GLM 打分失败（空密钥、限流或 JSON 解析失败，见服务端日志）",
            }
            yield {"event": "done"}
            return

        async for piece in judge.stream_conversation_compare(
            baseline_transcript=baseline_transcript,
            replay_transcript=replay_transcript,
            baseline_judge=baseline_judge,
            replay_judge=replay_judge,
        ):
            if piece:
                yield {"event": "compare_delta", "text": piece}

        yield {"event": "done"}

    async def judge_memoir_for_user(
        self,
        user_id: str,
        baseline_sections: list[MemoirSectionBaselineOut] | None,
    ) -> dict[str, Any]:
        """Judge a user's active memoir chapters and stories one by one.

        Chapters and stories with an empty ``canonical_markdown`` are skipped.
        At most ``_MAX_EVAL_CHAPTERS`` chapters and ``_MAX_EVAL_STORIES``
        stories are considered. The user's conversation transcript evidence is
        built once and passed to every judge call.

        Args:
            user_id: Owner of the memoir content; surrounding whitespace is
                ignored.
            baseline_sections: Optional exported baseline sections, matched to
                chapters by title and then by position.

        Returns:
            A dict with ``user_id``, ``chapter_results`` and ``story_results``.
            Each result carries the item's identifying fields and ``judge``
            (dumped judge model or ``None``).

        Raises:
            EvaluationBadRequestError: Blank ``user_id``.
        """
        uid = (user_id or "").strip()
        if not uid:
            raise EvaluationBadRequestError("user_id is required")

        judge_llm = get_eval_judge_langchain_llm()
        judge = EvalJudgeService(judge_llm)
        baselines = list(baseline_sections or [])
        evidence_transcript = await _user_transcript_evidence(self._db, uid)

        chapter_results: list[dict[str, Any]] = []
        # Best effort: a failure is logged and whatever was collected so far is kept.
        try:
            chapters = await get_chapters_for_memoir_list(
                uid, self._db, active_only=True, is_new_only=None
            )
            for i, ch in enumerate(chapters[:_MAX_EVAL_CHAPTERS]):
                body = (ch.canonical_markdown or "").strip()
                if not body:
                    continue
                bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i)
                baseline_excerpt = ""
                if bl and (bl.body or "").strip():
                    baseline_excerpt = _clip_md_for_judge(bl.body, max_chars=6000)
                md = f"# 章节：{ch.title}\n\n"
                if baseline_excerpt:
                    md += f"## 导出基线（节选）\n\n{baseline_excerpt}\n\n"
                md += f"## 当前成稿\n\n{_clip_md_for_judge(body)}"
                cj = await judge.judge_memoir(
                    memoir_markdown=md,
                    source_transcript=evidence_transcript,
                    reference_memoir_markdown=baseline_excerpt,
                    evidence_notes=self._MEMOIR_EVIDENCE_NOTES,
                )
                chapter_results.append(
                    {
                        "id": ch.id,
                        "title": ch.title,
                        "order_index": ch.order_index,
                        "baseline_title": bl.title if bl else None,
                        "judge": cj.model_dump() if cj else None,
                    }
                )
        except Exception as e:
            logger.warning("manual memoir chapter judges failed: {}", e)

        story_results: list[dict[str, Any]] = []
        # Same best-effort policy as for chapters.
        try:
            stories = await get_stories_for_user(self._db, uid, status="active")
            for st in stories[:_MAX_EVAL_STORIES]:
                body = (st.canonical_markdown or "").strip()
                if not body:
                    continue
                md = f"# 故事：{st.title}\n\n{_clip_md_for_judge(body)}"
                sj = await judge.judge_memoir(
                    memoir_markdown=md,
                    source_transcript=evidence_transcript,
                    evidence_notes=self._MEMOIR_EVIDENCE_NOTES,
                )
                story_results.append(
                    {
                        "id": st.id,
                        "title": st.title,
                        "stage": st.stage,
                        "judge": sj.model_dump() if sj else None,
                    }
                )
        except Exception as e:
            logger.warning("manual memoir story judges failed: {}", e)

        return {
            "user_id": uid,
            "chapter_results": chapter_results,
            "story_results": story_results,
        }

    async def memoir_snapshot(self, user_id: str) -> dict[str, Any]:
        """Return the user's active chapters and stories without judging them.

        At most ``_MAX_EVAL_CHAPTERS`` chapters and ``_MAX_EVAL_STORIES``
        stories are included. A failure while loading either list is logged
        and the entries collected so far for that list are returned.

        Args:
            user_id: Owner of the memoir content; surrounding whitespace is
                ignored.

        Returns:
            A dict with ``user_id``, ``chapters`` and ``stories``.

        Raises:
            EvaluationBadRequestError: Blank ``user_id``.
        """
        uid = (user_id or "").strip()
        if not uid:
            raise EvaluationBadRequestError("user_id is required")

        chapters_out: list[dict[str, Any]] = []
        stories_out: list[dict[str, Any]] = []
        try:
            chapters = await get_chapters_for_memoir_list(
                uid, self._db, active_only=True, is_new_only=None
            )
            for ch in chapters[:_MAX_EVAL_CHAPTERS]:
                chapters_out.append(
                    {
                        "id": ch.id,
                        "title": ch.title,
                        "category": ch.category,
                        "order_index": ch.order_index,
                        "canonical_markdown": ch.canonical_markdown,
                    }
                )
        except Exception as e:
            logger.warning("memoir snapshot chapters failed: {}", e)
        try:
            stories = await get_stories_for_user(self._db, uid, status="active")
            for st in stories[:_MAX_EVAL_STORIES]:
                stories_out.append(
                    {
                        "id": st.id,
                        "title": st.title,
                        "stage": st.stage,
                        "canonical_markdown": st.canonical_markdown,
                    }
                )
        except Exception as e:
            logger.warning("memoir snapshot stories failed: {}", e)

        return {
            "user_id": uid,
            "chapters": chapters_out,
            "stories": stories_out,
        }