Files
life-echo/api/app/core/agent_logging.py

200 lines
6.5 KiB
Python
Raw Normal View History

"""
Agent / LLM 诊断日志耗时输入输出规模截断预览
- **详情**prompt 预览 / hash / 响应预览仅在 ``LOG_LEVEL`` ``TRACE`` / ``DEBUG`` 时通过 ``logger.debug`` 输出
- **摘要**单行耗时字符数operation ``LOG_AGENT_VERBOSE=1`` 时通过 ``logger.info`` 输出
便于生产环境在不把全局日志调到 DEBUG 的情况下排查 Agent 性能与路径
生产/预发建议 ``LOG_LEVEL=INFO``需看 Agent 耗时与规模时可设 ``LOG_AGENT_VERBOSE=1``无需长期 DEBUG
敏感内容DEBUG 下会记录用户相关文本``AGENT_LOG_MAX_CHARS=0`` 时预览不截断完整输出慎用
2026-04-03 13:49:24 +08:00
配置节选``AGENT_LOG_OMIT_SYSTEM_MESSAGE_BODY``默认 true省略聊天 System 正文仅打 len+sha12
``AGENT_LOG_JSON_PROMPT_PREFIX_CHARS`` + ``AGENT_LOG_JSON_PROMPT_PREFIX_ONLY_IF_LEN_GT`` DEBUG 下跳过
超长单段 ``*.prompt`` 的前缀再预览
``AGENT_LOG_PROMPT_MODE=hash_only`` ``*.prompt`` 仅输出 sha12 + 长度无正文
``AGENT_LOG_PROMPT_DEDUP=1`` 时同一 label 连续相同全文则跳过重复行
"""
from __future__ import annotations
import hashlib
import threading
import time
from contextlib import contextmanager
from typing import Any, Iterator
feat: OpenTelemetry LGTM observability, dev tooling, and memoir UX fixes (#31) (#32) * add staging ios app build script * feat(api): add OpenTelemetry LGTM stack for local observability Wire OTel traces, metrics, and logs through a collector to Tempo, Prometheus, and Loki, with custom LLM instrumentation, dev compose overlay, Grafana provisioning, env templates, and development.sh auto-start. * feat: expand observability, harden dev tooling, and fix expo staging UX Add business and LLM Prometheus metrics with Grafana dashboards, alerting, and a metrics verification script. Wire telemetry through adapters and core LLM paths, and document the local LGTM workflow. Fix development.sh for macOS bash 3.2, open Grafana and eval-web in Chrome, and repair eval-web auto-open (unbound EVAL_WEB_BROWSER_SCHEDULED). Merge internal-eval into the main dev script with improved compose handling. Require EXPO_PUBLIC_* at build time, improve iOS HTTP ATS for staging IPs, show memoir empty state instead of load errors when no chapters exist, and add jest env setup plus chapter list response normalization. * chore: enable Grafana Assistant Cursor plugin * fix: memoir empty state and repair withdrawn 0020_chapters_book_id stamp Show empty memoir UI when the chapter list succeeds with no items; treat auth/404 as non-fatal. Extend alembic revision repair so local dev DBs stamped with the removed 0020_chapters_book_id migration can roll back and upgrade to 0019. --------- Co-authored-by: Kevin <kevin@brighteng.org> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-20 15:14:13 +08:00
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
from app.core.config import settings
feat: OpenTelemetry LGTM observability, dev tooling, and memoir UX fixes (#31) (#32) * add staging ios app build script * feat(api): add OpenTelemetry LGTM stack for local observability Wire OTel traces, metrics, and logs through a collector to Tempo, Prometheus, and Loki, with custom LLM instrumentation, dev compose overlay, Grafana provisioning, env templates, and development.sh auto-start. * feat: expand observability, harden dev tooling, and fix expo staging UX Add business and LLM Prometheus metrics with Grafana dashboards, alerting, and a metrics verification script. Wire telemetry through adapters and core LLM paths, and document the local LGTM workflow. Fix development.sh for macOS bash 3.2, open Grafana and eval-web in Chrome, and repair eval-web auto-open (unbound EVAL_WEB_BROWSER_SCHEDULED). Merge internal-eval into the main dev script with improved compose handling. Require EXPO_PUBLIC_* at build time, improve iOS HTTP ATS for staging IPs, show memoir empty state instead of load errors when no chapters exist, and add jest env setup plus chapter list response normalization. * chore: enable Grafana Assistant Cursor plugin * fix: memoir empty state and repair withdrawn 0020_chapters_book_id stamp Show empty memoir UI when the chapter list succeeds with no items; treat auth/404 as non-fatal. Extend alembic revision repair so local dev DBs stamped with the removed 0020_chapters_book_id migration can roll back and upgrade to 0019. --------- Co-authored-by: Kevin <kevin@brighteng.org> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-20 15:14:13 +08:00
from app.core.telemetry import get_tracer
from app.core.runtime_constants import agent_log_defaults
_dedup_lock = threading.Lock()
_last_prompt_sha256_by_label: dict[str, str] = {}
def _payload_sha256_hex(text: str) -> str:
return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
def _payload_sha12(text: str) -> str:
return _payload_sha256_hex(text)[:12]
def agent_verbose_enabled() -> bool:
"""是否输出含 prompt/response 等 DEBUG 级详情。"""
raw = (settings.log_level or "INFO").strip().upper()
return raw in ("TRACE", "DEBUG")
def agent_summary_enabled() -> bool:
"""是否输出单行 INFO 摘要(耗时、规模等),不依赖全局 DEBUG。"""
if agent_verbose_enabled():
return True
return bool(agent_log_defaults.agent_verbose)
def truncate_for_log(text: str | None, *, max_chars: int | None = None) -> str:
"""截断过长文本避免日志爆量。max_chars / AGENT_LOG_MAX_CHARS 为 0 表示不截断。"""
if text is None:
return ""
max_c = max_chars if max_chars is not None else agent_log_defaults.max_chars
s = str(text)
if max_c <= 0 or len(s) <= max_c:
return s
return s[:max_c] + f"... [truncated total_len={len(s)}]"
def log_agent_detail(
logger: Any,
message: str,
*args: Any,
) -> None:
"""DEBUG 级详情(含预览时先判断 ``agent_verbose_enabled()``)。"""
if agent_verbose_enabled():
logger.debug(message, *args)
def log_agent_summary(
logger: Any,
message: str,
*args: Any,
) -> None:
"""INFO 级单行摘要(``LOG_AGENT_VERBOSE`` 或 DEBUG 全局时启用)。"""
if agent_summary_enabled():
logger.info(message, *args)
@contextmanager
def agent_span(
logger: Any,
operation: str,
**context: Any,
) -> Iterator[None]:
"""
记录一段 Agent/LLM 调用的起止与耗时
- DEBUG输出 context 与起止
- INFO ``LOG_AGENT_VERBOSE``仅输出结束行含 duration_ms
"""
t0 = time.perf_counter()
ctx = " ".join(f"{k}={v!r}" for k, v in context.items())
if agent_verbose_enabled():
logger.debug("agent_span_start {} {}", operation, ctx)
feat: OpenTelemetry LGTM observability, dev tooling, and memoir UX fixes (#31) (#32) * add staging ios app build script * feat(api): add OpenTelemetry LGTM stack for local observability Wire OTel traces, metrics, and logs through a collector to Tempo, Prometheus, and Loki, with custom LLM instrumentation, dev compose overlay, Grafana provisioning, env templates, and development.sh auto-start. * feat: expand observability, harden dev tooling, and fix expo staging UX Add business and LLM Prometheus metrics with Grafana dashboards, alerting, and a metrics verification script. Wire telemetry through adapters and core LLM paths, and document the local LGTM workflow. Fix development.sh for macOS bash 3.2, open Grafana and eval-web in Chrome, and repair eval-web auto-open (unbound EVAL_WEB_BROWSER_SCHEDULED). Merge internal-eval into the main dev script with improved compose handling. Require EXPO_PUBLIC_* at build time, improve iOS HTTP ATS for staging IPs, show memoir empty state instead of load errors when no chapters exist, and add jest env setup plus chapter list response normalization. * chore: enable Grafana Assistant Cursor plugin * fix: memoir empty state and repair withdrawn 0020_chapters_book_id stamp Show empty memoir UI when the chapter list succeeds with no items; treat auth/404 as non-fatal. Extend alembic revision repair so local dev DBs stamped with the removed 0020_chapters_book_id migration can roll back and upgrade to 0019. --------- Co-authored-by: Kevin <kevin@brighteng.org> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-20 15:14:13 +08:00
def _log_end(ms: float) -> None:
if agent_verbose_enabled():
logger.debug("agent_span_end {} duration_ms={:.2f} {}", operation, ms, ctx)
elif agent_log_defaults.agent_verbose:
logger.info("agent_span {} duration_ms={:.2f} {}", operation, ms, ctx)
feat: OpenTelemetry LGTM observability, dev tooling, and memoir UX fixes (#31) (#32) * add staging ios app build script * feat(api): add OpenTelemetry LGTM stack for local observability Wire OTel traces, metrics, and logs through a collector to Tempo, Prometheus, and Loki, with custom LLM instrumentation, dev compose overlay, Grafana provisioning, env templates, and development.sh auto-start. * feat: expand observability, harden dev tooling, and fix expo staging UX Add business and LLM Prometheus metrics with Grafana dashboards, alerting, and a metrics verification script. Wire telemetry through adapters and core LLM paths, and document the local LGTM workflow. Fix development.sh for macOS bash 3.2, open Grafana and eval-web in Chrome, and repair eval-web auto-open (unbound EVAL_WEB_BROWSER_SCHEDULED). Merge internal-eval into the main dev script with improved compose handling. Require EXPO_PUBLIC_* at build time, improve iOS HTTP ATS for staging IPs, show memoir empty state instead of load errors when no chapters exist, and add jest env setup plus chapter list response normalization. * chore: enable Grafana Assistant Cursor plugin * fix: memoir empty state and repair withdrawn 0020_chapters_book_id stamp Show empty memoir UI when the chapter list succeeds with no items; treat auth/404 as non-fatal. Extend alembic revision repair so local dev DBs stamped with the removed 0020_chapters_book_id migration can roll back and upgrade to 0019. --------- Co-authored-by: Kevin <kevin@brighteng.org> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-20 15:14:13 +08:00
if settings.otel_enabled:
tracer = get_tracer("app.agent")
with tracer.start_as_current_span(
"agent.operation",
attributes={"agent.operation": operation},
) as span:
failed = False
try:
yield
except Exception:
failed = True
if span.is_recording():
span.set_status(Status(StatusCode.ERROR))
raise
finally:
ms = (time.perf_counter() - t0) * 1000
if span.is_recording():
span.set_attribute("agent.duration_ms", round(ms, 2))
if not failed:
span.set_status(Status(StatusCode.OK))
_log_end(ms)
return
try:
yield
finally:
_log_end((time.perf_counter() - t0) * 1000)
def log_agent_payload(
logger: Any,
label: str,
text: str | None,
*,
max_chars: int | None = None,
) -> None:
"""在 DEBUG 下记录文本长度与截断预览(*.prompt 可 hash_only / 去重)。"""
if not agent_verbose_enabled():
return
2026-04-03 13:49:24 +08:00
raw = text or ""
total_len = len(raw)
digest = _payload_sha256_hex(raw)
sha12 = digest[:12]
is_prompt = label.endswith(".prompt")
if is_prompt and agent_log_defaults.prompt_dedup:
with _dedup_lock:
if _last_prompt_sha256_by_label.get(label) == digest:
logger.debug(
"agent_payload_skipped label={} reason=same_as_previous sha12={} total_len={}",
label,
sha12,
total_len,
)
return
_last_prompt_sha256_by_label[label] = digest
2026-04-03 13:49:24 +08:00
preview_source = raw
extra_note = ""
if (
is_prompt
and agent_log_defaults.json_prompt_prefix_chars > 0
and total_len > agent_log_defaults.json_prompt_prefix_only_if_len_gt
2026-04-03 13:49:24 +08:00
):
skip = agent_log_defaults.json_prompt_prefix_chars
2026-04-03 13:49:24 +08:00
preview_source = raw[skip:]
extra_note = f" skipped_prefix_chars={skip}"
mode = (agent_log_defaults.prompt_mode or "preview").strip().lower()
if is_prompt and mode == "hash_only":
logger.debug(
"agent_payload label={} total_len={} sha12={} mode=hash_only{}",
label,
total_len,
sha12,
extra_note,
)
return
2026-04-03 13:49:24 +08:00
preview = truncate_for_log(preview_source, max_chars=max_chars)
logger.debug(
"agent_payload {} total_len={}{} sha12={} preview={}",
label,
2026-04-03 13:49:24 +08:00
total_len,
extra_note,
sha12,
preview,
)