# life-echo/api/app/agents/memoir/extraction_agent.py

"""
ExtractionAgent：从用户消息中提取 5-stage 状态与 slots。
对应现有逻辑：get_state_extraction_prompt + JSON 解析
"""

from __future__ import annotations

import json
from dataclasses import dataclass
from typing import Any, Dict

from app.agents.memoir.prompts import get_state_extraction_prompt
from app.agents.stage_constants import normalize_chat_stage
from app.core.langchain_llm import invoke_json_object
from app.core.logging import get_logger
from app.core.json_utils import extract_json_payload

logger = get_logger(__name__)


@dataclass
class ExtractionResult:
    """Outcome of one state-extraction pass: a stage plus extracted slots."""

    # Stage name the extraction settled on.
    detected_stage: str
    # Extracted slot values as text, keyed by slot name.
    slots: Dict[str, str]


class ExtractionAgent:
    """Extract ``detected_stage`` and ``slots`` from a user message via an LLM."""

    @staticmethod
    def _normalize_slots(raw_slots: Any) -> Dict[str, str]:
        """Coerce the LLM's ``slots`` payload into a ``{name: text}`` mapping.

        Anything that is not a dict yields an empty mapping. Values that are
        ``None`` (JSON ``null``) or blank/whitespace-only are dropped, so they
        neither turn into the literal string ``"None"`` nor count as a
        substantive slot. Other non-string values are converted with ``str``.
        """
        if not isinstance(raw_slots, dict):
            return {}

        cleaned: Dict[str, str] = {}
        for key, value in raw_slots.items():
            if value is None:
                continue
            text = value if isinstance(value, str) else str(value)
            if not text.strip():
                continue
            cleaned[key] = text
        return cleaned

    def extract(
        self,
        user_message: str,
        current_stage: str,
        stage_slots: Dict[str, Any],
        llm: Any,
    ) -> "ExtractionResult":
        """Extract structured slots and decide the conversation stage.

        Args:
            user_message: The user's message to analyse.
            current_stage: Stage the conversation is currently in; also the
                fallback whenever detection is skipped or fails.
            stage_slots: Slots collected so far. Values exposing
                ``model_dump()`` are dumped before being put into the prompt.
            llm: LLM handle passed to ``invoke_json_object``. It must support a
                synchronous ``.invoke(prompt)`` call (used inside Celery
                tasks). A falsy value skips extraction entirely.

        Returns:
            An ``ExtractionResult``. When ``llm`` is falsy, or the LLM call or
            parsing fails, the result carries ``current_stage`` and whatever
            slots had been extracted up to that point (normally none).
        """
        detected_stage = current_stage
        extracted_slots: Dict[str, str] = {}

        if not llm:
            return ExtractionResult(
                detected_stage=detected_stage, slots=extracted_slots
            )

        try:
            prompt = get_state_extraction_prompt(
                user_message=user_message,
                current_stage=current_stage,
                stage_slots={
                    k: v.model_dump() if hasattr(v, "model_dump") else v
                    for k, v in (stage_slots or {}).items()
                },
            )
            raw = invoke_json_object(
                llm,
                prompt,
                max_tokens=1024,
                agent="ExtractionAgent.extract",
            )
            parsed = json.loads(extract_json_payload(raw))
            if not isinstance(parsed, dict):
                # Routed through the handler below so it is logged like any
                # other malformed response.
                raise TypeError(
                    f"expected a JSON object, got {type(parsed).__name__}"
                )

            extracted_slots = self._normalize_slots(parsed.get("slots"))
            if not extracted_slots:
                # Without any substantive slot, do not infer a stage: this
                # keeps meta-talk from being labelled with an arbitrary stage
                # such as childhood (consistent with the server-side guard).
                detected_stage = normalize_chat_stage(
                    current_stage, fallback=current_stage
                )
            else:
                raw_detected = parsed.get("detected_stage", current_stage)
                detected_stage = normalize_chat_stage(
                    str(raw_detected) if raw_detected is not None else None,
                    fallback=current_stage,
                )
        except Exception as e:
            # Best-effort boundary: any LLM or parsing failure falls back to
            # the defaults instead of propagating to the caller.
            # NOTE(review): the `{}` placeholder assumes a loguru-style
            # logger from get_logger — confirm.
            logger.warning("ExtractionAgent LLM 解析失败: {}", e)

        return ExtractionResult(detected_stage=detected_stage, slots=extracted_slots)