Merge branch 'eval/elapsed-time-memoir-batch-chunk' into development

This commit is contained in:
Kevin
2026-04-10 10:27:41 +08:00
66 changed files with 5246 additions and 705 deletions

View File

@@ -4,8 +4,9 @@ Phase1 批处理:一次 LLM 调用完成多段的抽取 + 章节分类(与
from __future__ import annotations
import math
from dataclasses import dataclass
from typing import Any, Dict, List
from typing import Any, Callable, Dict, List
from app.agents.memoir.prompts import get_batch_memoir_phase1_prep_prompt
from app.agents.memoir.schemas import BatchPhase1LLMOutput
@@ -107,3 +108,80 @@ def run_batch_phase1_prep(
logger.warning("batch phase1 id mismatch missing={} extra={}", missing, extra)
raise ValueError("batch phase1 response segment ids do not match input")
return by_id
def _run_batch_phase1_prep_chunk_with_bisect(
    segments: List[Segment],
    state: MemoirStateSchema,
    llm: Any,
) -> Dict[str, BatchPhase1SegmentRow]:
    """Run one Phase1 batch call for a chunk, bisecting and retrying on failure.

    The whole chunk is first sent through ``run_batch_phase1_prep``. If that
    call raises ``ValueError`` (for example when the LLM output was
    truncated), the chunk is split into two halves and each half is retried
    recursively, down to a single segment.

    Args:
        segments: Segments that make up this chunk.
        state: Memoir state forwarded unchanged to ``run_batch_phase1_prep``.
        llm: LLM handle forwarded unchanged to ``run_batch_phase1_prep``.

    Returns:
        Mapping from segment id (as ``str``) to its Phase1 row, covering
        every segment in ``segments``.

    Raises:
        ValueError: If a chunk of at most one segment still fails (the
            original error is re-raised), or if the merged halves do not
            cover exactly the ids of ``segments``. Exceptions of any other
            type raised by ``run_batch_phase1_prep`` are not caught here.
    """
    try:
        return run_batch_phase1_prep(segments, state, llm)
    except ValueError:
        # Nothing left to split: surface the original failure to the caller.
        if len(segments) <= 1:
            raise
        # len(segments) >= 2 at this point, so mid >= 1 and both halves are
        # non-empty; no further guard on mid is needed.
        mid = len(segments) // 2
        left = _run_batch_phase1_prep_chunk_with_bisect(
            segments[:mid], state, llm
        )
        right = _run_batch_phase1_prep_chunk_with_bisect(
            segments[mid:], state, llm
        )
        merged = {**left, **right}
        expected = {str(s.id) for s in segments}
        if merged.keys() != expected:
            # `from None` hides the already-handled batch failure so that
            # the merge mismatch is reported as the error of record.
            raise ValueError(
                "batch phase1 chunked bisect merge: segment ids do not match input"
            ) from None
        return merged
def run_batch_phase1_prep_chunked(
    segments: List[Segment],
    state: MemoirStateSchema,
    llm: Any,
    *,
    chunk_size: int,
    on_chunk: Callable[[int, int], None] | None = None,
) -> Dict[str, BatchPhase1SegmentRow]:
    """Run the Phase1 batch LLM over ``segments`` in slices and merge the rows.

    ``segments`` is cut into consecutive slices of ``chunk_size`` items
    (values below 1 are treated as 1). Each slice goes through
    ``_run_batch_phase1_prep_chunk_with_bisect``, which falls back to
    bisecting the slice, down to a single segment, when the batch call for
    that slice fails. Errors that survive that fallback propagate to the
    caller (per the original note, the orchestrator then falls back to
    per-segment processing).

    Args:
        segments: Segments to process; an empty input returns ``{}``.
        state: Memoir state forwarded to every chunk call.
        llm: LLM handle forwarded to every chunk call.
        chunk_size: Maximum number of segments per LLM call.
        on_chunk: Optional callback invoked after each chunk completes with
            ``(chunk_index, total_chunks)``; the index is 1-based.

    Returns:
        Mapping from segment id (as ``str``) to its Phase1 row.

    Raises:
        ValueError: If the merged ids do not match the ids of ``segments``,
            or if a chunk call fails with ``ValueError``.
    """
    if not segments:
        return {}

    step = 1 if chunk_size < 1 else chunk_size
    segment_total = len(segments)
    chunk_total = max(1, math.ceil(segment_total / step))

    rows_by_id: Dict[str, BatchPhase1SegmentRow] = {}
    chunk_starts = range(0, segment_total, step)
    for chunk_no, start in enumerate(chunk_starts, start=1):
        batch = segments[start : start + step]
        logger.info(
            "event=batch_phase1_chunk chunk_idx={}/{} segment_count={} batch_path=chunked "
            "msg=Phase1 批处理分块调用",
            chunk_no,
            chunk_total,
            len(batch),
        )
        rows_by_id.update(
            _run_batch_phase1_prep_chunk_with_bisect(batch, state, llm)
        )
        # Report progress only after the chunk has been merged successfully.
        if on_chunk is not None:
            on_chunk(chunk_no, chunk_total)

    wanted_ids = {str(seg.id) for seg in segments}
    if rows_by_id.keys() == wanted_ids:
        return rows_by_id

    # Final consistency check across all chunks: log both directions of the
    # mismatch before failing.
    logger.warning(
        "batch phase1 chunked id mismatch missing={} extra={}",
        wanted_ids - rows_by_id.keys(),
        rows_by_id.keys() - wanted_ids,
    )
    raise ValueError(
        "batch phase1 chunked: merged segment ids do not match input"
    )

View File

@@ -8,11 +8,11 @@ from __future__ import annotations
import time
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Set, Tuple
from typing import Any, Callable, Dict, List, Optional, Set, Tuple
from app.agents.memoir.batch_phase1_prep import (
STAGE_ALLOWED_SLOTS,
run_batch_phase1_prep,
run_batch_phase1_prep_chunked,
)
from app.agents.memoir.classification_agent import (
ClassificationAgent,
@@ -63,6 +63,7 @@ class MemoirOrchestrator:
get_or_create_state: Callable[[], MemoirStateSchema],
update_slot: Callable[[str, str, str, List[str]], MemoirStateSchema],
llm_fast: Any | None = None,
on_phase1_chunk: Optional[Callable[[int, int], None]] = None,
) -> PreparedMemoirBatches:
"""
遍历 segmentsExtraction → slot 更新 → Classification → 按 category 分桶。
@@ -89,15 +90,19 @@ class MemoirOrchestrator:
state=state,
classify_extract_llm=classify_extract_llm,
update_slot=update_slot,
on_phase1_chunk=on_phase1_chunk,
)
logger.info(
"event=phase1_batch_path_used segment_count={}",
"event=phase1_batch_path_used segment_count={} "
"msg=Phase1 批处理 LLM 路径已使用",
len(segments),
)
return result
except Exception as e:
logger.warning(
"MemoirOrchestrator.prepare_batches batch LLM 失败,回退逐段: {}",
"event=phase1_batch_path_fallback segment_count={} exc={} "
"msg=Phase1 批处理失败,回退逐段",
len(segments),
e,
)
@@ -172,12 +177,19 @@ class MemoirOrchestrator:
state: MemoirStateSchema,
classify_extract_llm: Any,
update_slot: Callable[[str, str, str, List[str]], MemoirStateSchema],
on_phase1_chunk: Optional[Callable[[int, int], None]] = None,
) -> PreparedMemoirBatches:
category_to_segments: Dict[str, List[Segment]] = {}
segment_skip_story_ids: Set[str] = set()
segment_chapter_category: Dict[str, str] = {}
by_id = run_batch_phase1_prep(segments, state, classify_extract_llm)
by_id = run_batch_phase1_prep_chunked(
segments,
state,
classify_extract_llm,
chunk_size=int(settings.memoir_phase1_batch_llm_chunk_size),
on_chunk=on_phase1_chunk,
)
for segment in segments:
text = segment.user_input_text or ""
@@ -289,6 +301,7 @@ class MemoirOrchestrator:
llm_fast=llm_fast,
get_or_create_state=get_or_create_state,
update_slot=update_slot,
on_phase1_chunk=None,
)
state = prepared.state
chapters_to_enqueue: Set[str] = set()