Files
life-echo/api/tests/test_eval_judge_llm_spec.py
Kevin 71fbd39e32 feat(api)!: memory single chain — async MemoryService, strict eval closure
Route all memory ingest/retrieve/enrichment/compaction through async MemoryService.
Remove legacy sync memory implementations (ingest/retrieve/compaction); Celery and
memoir Phase2 call asyncio.run into MemoryService-backed helpers.

Memoir Phase1 batch ingest uses MemoryService.ingest_transcripts_batch; drop chapters.
evidence_bundle_json mirror (Alembic 0015). Evaluation uses snapshot/link-only bundles;
raise EvidenceClosureMissing instead of partial/fallback lineage tiers.

Split memoir state into NarrativeCoverageState and InterviewControlState; delete the
_interview_meta_store adapter layer. Remove rolling-query and recent-fact fallback
settings from config and evidence assembly.

Update judges, docs, tests, and PlaygroundPage alignment.

Made-with: Cursor
2026-04-30 14:11:50 +08:00

77 lines
3.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""评测评审 LLM 装配:多供应商与上下文预算。"""
import pytest
from app.core.config import settings
from app.core.dependencies import build_eval_judge_llm_spec
from app.features.evaluation.judge_service import (
eval_judge_compare_transcript_each_max_chars_for_context,
eval_judge_conversation_transcript_max_chars_for_context,
)
def test_build_eval_judge_zhipu_uses_bigmodel_defaults(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check the zhipu judge spec built from settings alone.

    The dedicated judge key is blanked and only the zhipu key is set; no
    model is passed in the request, so the spec is expected to resolve to
    the configured judge model and the configured context window.
    """
    # Applied in this exact order, mirroring three sequential setattr calls.
    overrides = {
        "eval_judge_api_key": "",
        "zhipu_api_key": "z-test",
        "eval_judge_model": "glm-5",
    }
    for attr_name, attr_value in overrides.items():
        monkeypatch.setattr(settings, attr_name, attr_value)

    judge_spec = build_eval_judge_llm_spec("zhipu", None)

    assert judge_spec is not None
    assert judge_spec.provider == "zhipu"
    assert judge_spec.resolved_model == "glm-5"
    assert judge_spec.llm is not None
    assert (
        judge_spec.context_window_tokens
        == settings.eval_judge_context_window_tokens
    )
def test_build_eval_judge_zhipu_request_model_override(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A model named in the request wins over ``settings.eval_judge_model``."""
    configured_model = "glm-5"
    requested_model = "glm-4-plus"

    monkeypatch.setattr(settings, "eval_judge_api_key", "e-test")
    monkeypatch.setattr(settings, "eval_judge_model", configured_model)

    judge_spec = build_eval_judge_llm_spec("zhipu", requested_model)

    assert judge_spec is not None
    assert judge_spec.resolved_model == requested_model
def test_build_eval_judge_deepseek_requires_key(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """No deepseek spec is built when both candidate API keys are blank."""
    # Blank the deepseek-specific key first, then the generic LLM key.
    for key_attr in ("deepseek_api_key", "llm_api_key"):
        monkeypatch.setattr(settings, key_attr, "")

    judge_spec = build_eval_judge_llm_spec("deepseek", None)

    assert judge_spec is None
def test_build_eval_judge_deepseek_v4_flash_non_thinking_default_path(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Default deepseek-v4-flash with thinking off sends ``disabled`` explicitly.

    The explicit value is there so the API does not fall back to its own
    default of ``enabled``.
    """
    overrides = {
        "deepseek_api_key": "d-test",
        "eval_judge_deepseek_model": "deepseek-v4-flash",
        "eval_judge_deepseek_thinking_enabled": False,
    }
    for attr_name, attr_value in overrides.items():
        monkeypatch.setattr(settings, attr_name, attr_value)

    judge_spec = build_eval_judge_llm_spec("deepseek", None)

    assert judge_spec is not None
    assert judge_spec.resolved_model == "deepseek-v4-flash"

    # Thinking must be switched off in the request payload itself.
    expected_extra_body = {"thinking": {"type": "disabled"}}
    assert judge_spec.llm.extra_body == expected_extra_body
    assert judge_spec.llm.reasoning_effort is None
def test_build_eval_judge_deepseek_context_budget(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check the deepseek spec's context window and the derived char budgets.

    The spec must carry the deepseek-specific context window, and both
    transcript character budgets must be strictly smaller for that window
    than for a 200_000-token one.
    """
    small_window = 64_000
    large_window = 200_000

    monkeypatch.setattr(settings, "deepseek_api_key", "d-test")
    monkeypatch.setattr(settings, "eval_judge_deepseek_model", "deepseek-reasoner")
    monkeypatch.setattr(
        settings, "eval_judge_deepseek_context_window_tokens", small_window
    )

    judge_spec = build_eval_judge_llm_spec("deepseek", None)

    assert judge_spec is not None
    assert judge_spec.provider == "deepseek"
    # The legacy name deepseek-reasoner is normalized to v4-flash (thinking mode).
    assert judge_spec.resolved_model == "deepseek-v4-flash"
    assert judge_spec.context_window_tokens == small_window

    # Single-conversation transcript budget.
    conversation_small = eval_judge_conversation_transcript_max_chars_for_context(
        small_window
    )
    conversation_large = eval_judge_conversation_transcript_max_chars_for_context(
        large_window
    )
    assert conversation_small < conversation_large

    # Per-transcript budget when two transcripts are compared.
    compare_small = eval_judge_compare_transcript_each_max_chars_for_context(
        small_window
    )
    compare_large = eval_judge_compare_transcript_each_max_chars_for_context(
        large_window
    )
    assert compare_small < compare_large