"""评测评审 LLM 装配:多供应商与上下文预算。""" import pytest from app.core.config import settings from app.core.dependencies import build_eval_judge_llm_spec from app.features.evaluation.constants import eval_cfg from app.features.evaluation.judge_service import ( eval_judge_compare_transcript_each_max_chars_for_context, eval_judge_conversation_transcript_max_chars_for_context, ) def test_build_eval_judge_zhipu_uses_bigmodel_defaults( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setattr(settings, "zhipu_api_key", "z-test") monkeypatch.setattr(eval_cfg, "judge_model", "glm-5") spec = build_eval_judge_llm_spec("zhipu", None) assert spec is not None assert spec.provider == "zhipu" assert spec.resolved_model == "glm-5" assert spec.llm is not None assert spec.context_window_tokens == eval_cfg.judge_context_window_tokens def test_build_eval_judge_zhipu_request_model_override( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setattr(settings, "zhipu_api_key", "e-test") monkeypatch.setattr(eval_cfg, "judge_model", "glm-5") spec = build_eval_judge_llm_spec("zhipu", "glm-4-plus") assert spec is not None assert spec.resolved_model == "glm-4-plus" def test_build_eval_judge_deepseek_requires_key( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setattr(settings, "deepseek_api_key", "") assert build_eval_judge_llm_spec("deepseek", None) is None def test_build_eval_judge_deepseek_v4_flash_non_thinking_default_path( monkeypatch: pytest.MonkeyPatch, ) -> None: """默认 deepseek-v4-flash 且关闭 thinking 时显式传 disabled(避免 API 默认 enabled)。""" monkeypatch.setattr(settings, "deepseek_api_key", "d-test") monkeypatch.setattr(eval_cfg, "judge_deepseek_model", "deepseek-v4-flash") monkeypatch.setattr(eval_cfg, "judge_deepseek_thinking_enabled", False) spec = build_eval_judge_llm_spec("deepseek", None) assert spec is not None assert spec.resolved_model == "deepseek-v4-flash" assert spec.llm.extra_body == {"thinking": {"type": "disabled"}} assert spec.llm.reasoning_effort is None def test_build_eval_judge_deepseek_context_budget( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setattr(settings, "deepseek_api_key", "d-test") monkeypatch.setattr(eval_cfg, "judge_deepseek_model", "deepseek-reasoner") monkeypatch.setattr(eval_cfg, "judge_deepseek_context_window_tokens", 64_000) spec = build_eval_judge_llm_spec("deepseek", None) assert spec is not None assert spec.provider == "deepseek" # 旧名 deepseek-reasoner 规范为 v4-flash 思考模式 assert spec.resolved_model == "deepseek-v4-flash" assert spec.context_window_tokens == 64_000 n = eval_judge_conversation_transcript_max_chars_for_context(64_000) glm_n = eval_judge_conversation_transcript_max_chars_for_context(200_000) assert n < glm_n each_ds = eval_judge_compare_transcript_each_max_chars_for_context(64_000) each_glm = eval_judge_compare_transcript_each_max_chars_for_context(200_000) assert each_ds < each_glm