~cytrogen/evi-run

60ce6575f4d2cd33187309b6c2d4f61ffa44d003 — Bendy 5 months ago 65b2381
send_answer: dynamic budget, safe HTML splitting, pre/code fixes, HTML comments escaping; agents_: tool routing wording; cleanup (remove dead code)
2 files changed, 241 insertions(+), 38 deletions(-)

M bot/agents_tools/agents_.py
M bot/utils/send_answer.py
M bot/agents_tools/agents_.py => bot/agents_tools/agents_.py +4 -4
@@ 133,14 133,14 @@ async def create_main_agent(user_id: int, mcp_server_1: MCPServerStdio, knowledg
        ⚠️ When you receive a message marked <msg from Task Scheduler>, just execute the request, and do not create a new task unless it is explicitly stated in the message. Because this is a message from the Task Scheduler about the need to complete the current task, not about scheduling a new task.
        - search_knowledge_base: Use it to extract facts from uploaded reference materials; if necessary, refer to sources. 
        - search_conversation_memory: Use to recall prior conversations, user preferences, details about the user and extract information from files uploaded by the user.
        - Web Search: Use it as an Internet browser to search for current, external information and any other operational information / data that can be found on the web (weather, news, brief reviews, short facts, events, exchange rates, etc.). Use RUNTIME CONTEXT for the notion of "current time".
        - web: Use it as an Internet browser to search for current, external information and any other operational information / data that can be found on the web (weather, news, brief reviews, short facts, events, exchange rates, etc.). Use RUNTIME CONTEXT for the notion of "current time".
        - image_gen_tool: Only generate new images (no editing). Do not include base64 or links; the image is attached automatically.
        - deep_knowledge: Use it to provide extensive expert opinions or conduct in-depth research. Give the tool's report to the user as close to the original as possible: do not generalize, shorten, or change the style. Be sure to include key sources and links from the report. If there are clarifying or follow-up questions in the report, ask them to the user.
        - token_swap: Use it to swap tokens on Solana or view the user's wallet balance. Do not ask the user for the wallet address, it is already known to the tool. You may not see this tool in your list if the user has not enabled it.
        - DexPaprika (getNetworks, getNetworkDexes, getNetworkPools, getDexPools, getPoolDetails, getTokenDetails, getTokenPools, getPoolOHLCV, getPoolTransactions, search, getStats): Use it for token analytics, DeFi analytics and DEX analytics. 
        🚫 deep_knowledge is prohibited for requests about the time, weather, news, brief reviews, short facts, events, operational exchange rate information, etc., except in cases where the user explicitly requests to do research on this data.
        ✅ For operational data — only Web Search. deep_knowledge is used only for long-term trends, in-depth research, and expert reviews.
        ⚠️ If you receive a request for the latest news, summaries, events, etc., do not look for them in your training data, but use a Web Search.
        ✅ For operational data — only web. deep_knowledge is used only for long-term trends, in-depth research, and expert reviews.
        ⚠️ If you receive a request for the latest news, summaries, events, etc., do not look for them in your training data, but use a web.

        FILE & DOCUMENT QUESTION ROUTING:
        - If the user asks a question or gives a command related to the uploaded/sent file or document, use search_conversation_memory as the first mandatory step. If there is no data about the requested file or document, inform the user about it.


@@ 148,7 148,7 @@ async def create_main_agent(user_id: int, mcp_server_1: MCPServerStdio, knowledg
        EXECUTION DISCIPLINE: 
        - Validate tool outputs and handle errors gracefully. If uncertain, ask a clarifying question.
        - Be transparent about limitations and avoid hallucinations; prefer asking for missing details over guessing.
        - Before stating any concrete date/month/year as "current/today/now", first check RUNTIME CONTEXT; if RUNTIME CONTEXT is missing or insufficient, ask the user or use Web Search. Never use your training data/cutoff to infer "today".
        - Before stating any concrete date/month/year as "current/today/now", first check RUNTIME CONTEXT; if RUNTIME CONTEXT is missing or insufficient, ask the user or use web. Never use your training data/cutoff to infer "today".

        REFERENCE MATERIALS (The reference materials uploaded to search_knowledge_base are listed here):
        -

M bot/utils/send_answer.py => bot/utils/send_answer.py +237 -34
@@ 34,57 34,260 @@ async def send_answer_text(user_ques: str, message: Message, answer: AnswerText,
                await message.answer(mess)


def split_code_message(text, chunk_size=3700, type_: str = None):
def split_code_message(text, type_: str = None):
    """
    Reliably split Telegram HTML into chunks while preserving valid markup.
    - Self-closing tags are not pushed to the stack and therefore are not closed.
    - For opened tags we store the full opening form including attributes to re-open later.
    - Never split inside an HTML tag or inside an HTML entity.
    - Preserve Telegram-specific nuances such as <blockquote expandable> and <pre>/<code> blocks.
    """
    if not type_:
        text = telegram_format(text)
        text = text.replace('&lt;blockquote expandable&gt;', '<blockquote expandable>')

    # Escape HTML comments <!-- ... --> so they are treated as text,
    # not as tags that could break the open/close stack while splitting
    comment_pattern = re.compile(r"<!--.*?-->", re.DOTALL)

    def _escape_comment(m):
        c = m.group(0)
        return c.replace('<', '&lt;').replace('>', '&gt;')

    text = comment_pattern.sub(_escape_comment, text)

    chunks = []
    current_chunk = ""
    open_tags = []

    # Stack of opened tags: items are dicts {name, open}
    open_stack = []
    position = 0
    tag_pattern = re.compile(r"<(\/)?([a-zA-Z0-9\-]+)([^>]*)>")

    def close_open_tags():
        return "".join(f"</{tag}>" for tag in reversed(open_tags))
    tag_pattern = re.compile(r"<(\/)?([a-zA-Z0-9\-]+)([^>]*)>")

    def reopen_tags():
        return "".join(f"<{tag if tag != 'blockquote' else 'blockquote expandable'}>" for tag in open_tags)
    # Set of self-closing/non-closing tags in Telegram HTML context
    SELF_CLOSING = {"br"}

    def is_self_closing(tag_name: str, tag_full: str) -> bool:
        return tag_name in SELF_CLOSING or tag_full.strip().endswith('/>')

    def close_open_tags() -> str:
        # Close only normal opened tags in reverse order
        closing = []
        for item in reversed(open_stack):
            closing.append(f"</{item['name']}>")
        return "".join(closing)

    def reopen_tags() -> str:
        # Re-open saved opening tags (with attributes) in original order.
        # For blockquote expandable we keep the original form as-is.
        return "".join(item['open'] for item in open_stack)

    def escape_tag_text(tag_text: str) -> str:
        """Render a tag as plain text by escaping angle brackets."""
        return tag_text.replace('<', '&lt;').replace('>', '&gt;')

    def safe_cut_index(text_: str, start: int, tentative_end: int) -> int:
        """Shift a split position so that we never cut inside a tag or an HTML entity."""
        end = min(tentative_end, len(text_))
        if end <= start:
            return end

        segment = text_[start:end]

        # 1) Do not split inside a tag: if the last '<' is after the last '>' -> move back to that '<'
        last_lt = segment.rfind('<')
        last_gt = segment.rfind('>')
        if last_lt != -1 and (last_gt == -1 or last_lt > last_gt):
            end = start + last_lt
            if end <= start:
                return start
            segment = text_[start:end]

        # 2) Do not split inside an entity: if there's '&' after the last ';' -> move back to that '&'
        last_amp = segment.rfind('&')
        last_semi = segment.rfind(';')
        if last_amp != -1 and (last_semi == -1 or last_amp > last_semi):
            end = start + last_amp

        return end

    text_len = len(text)
    while position < text_len:
        # Dynamic budget for the current chunk
        SAFETY = 64
        BASE_LIMIT = 3900
        allowed_total = BASE_LIMIT - len(close_open_tags()) - len(reopen_tags()) - SAFETY
        # Clamp to reasonable bounds just in case
        if allowed_total < 1000:
            allowed_total = 1000
        elif allowed_total > BASE_LIMIT:
            allowed_total = BASE_LIMIT

        # If current chunk is full — close and start a new one
        if len(current_chunk) >= allowed_total:
            current_chunk += close_open_tags()
            chunks.append(current_chunk)
            current_chunk = reopen_tags()

    while position < len(text):
        if len(current_chunk) >= chunk_size:
        # Compute the boundary where we can safely write more characters
        tentative_end = position + (allowed_total - len(current_chunk))
        if tentative_end <= position:
            # No room left — force a chunk break
            current_chunk += close_open_tags()
            chunks.append(current_chunk)
            current_chunk = reopen_tags()
            continue

        next_cut = position + chunk_size - len(current_chunk)
        next_match = tag_pattern.search(text, position, next_cut)
        # Look for the next tag before the boundary
        next_match = tag_pattern.search(text, position, min(tentative_end, text_len))

        if not next_match:
            current_chunk += text[position:next_cut]
            position = next_cut
            # No tags before boundary — split at a safe position
            cut_idx = safe_cut_index(text, position, min(tentative_end, text_len))
            if cut_idx == position:
                # No safe position found in the window — extend the window to find the next tag/entity end
                extend_end = min(position + 100 + (allowed_total - len(current_chunk)), text_len)
                next_match_ext = tag_pattern.search(text, position, extend_end)
                if next_match_ext:
                    cut_idx = next_match_ext.start()
                else:
                    # No complete tag found in lookahead — split before a partial '<...'
                    extended_segment = text[position:extend_end]
                    last_lt = extended_segment.rfind('<')
                    if last_lt != -1:
                        # Check if there's '>' after that '<' in the extended window
                        gt_after = extended_segment.find('>', last_lt + 1)
                        if gt_after == -1:
                            # Tag is not completed within the window — cut before '<'
                            cut_idx = position + last_lt
                        else:
                            cut_idx = extend_end
                    else:
                        cut_idx = extend_end
            # Zero-shift guard (when cut_idx == position):
            # happens if a partial tag starts exactly at 'position'.
            if cut_idx == position:
                if current_chunk:
                    # Close current chunk and start a new one before continuing
                    current_chunk += close_open_tags()
                    chunks.append(current_chunk)
                    current_chunk = reopen_tags()
                    continue
                else:
                    # Current chunk is empty — extend search forward to the next '>' and advance at least to it
                    search_end = min(position + 300, text_len)
                    gt_global = text.find('>', position, search_end)
                    if gt_global != -1:
                        cut_idx = gt_global + 1
                    else:
                        # Last resort — move to search_end to avoid infinite loop
                        cut_idx = search_end
            current_chunk += text[position:cut_idx]
            position = cut_idx
            continue

        # There is a tag before the boundary
        start_tag, end_tag = next_match.span()
        tag_full = next_match.group(0)
        is_closing = next_match.group(1) == "/"
        tag_name = next_match.group(2)
        _ = next_match.group(3)

        # If text before the tag doesn't fit — break the chunk
        if (start_tag - position) + len(current_chunk) > allowed_total:
            current_chunk += close_open_tags()
            chunks.append(current_chunk)
            current_chunk = reopen_tags()
            continue

        # Append text up to the tag
        current_chunk += text[position:start_tag]
        position = start_tag

        # Tag handling
        if is_closing:
            # Prefer strict LIFO, but outside pre/code try to fix nesting to preserve formatting
            if open_stack and open_stack[-1]['name'] == tag_name:
                # Does the tag itself fit into the current chunk?
                if len(current_chunk) + (end_tag - start_tag) > allowed_total:
                    current_chunk += close_open_tags()
                    chunks.append(current_chunk)
                    current_chunk = reopen_tags()
                current_chunk += tag_full
                # Pop the top tag
                open_stack.pop()
            else:
                if open_stack and open_stack[-1]['name'] in {"pre", "code"}:
                    # Inside pre/code escape foreign closing tags as text
                    escaped = escape_tag_text(tag_full)
                    if len(current_chunk) + len(escaped) > allowed_total:
                        current_chunk += close_open_tags()
                        chunks.append(current_chunk)
                        current_chunk = reopen_tags()
                    current_chunk += escaped
                else:
                    # Outside pre/code: normalize nesting by auto-closing tags down to target.
                    # Find the target tag in the stack (from the end). If not found — escape as text.
                    target_idx = None
                    for idx in range(len(open_stack) - 1, -1, -1):
                        if open_stack[idx]['name'] == tag_name:
                            target_idx = idx
                            break
                    if target_idx is None:
                        escaped = escape_tag_text(tag_full)
                        if len(current_chunk) + len(escaped) > allowed_total:
                            current_chunk += close_open_tags()
                            chunks.append(current_chunk)
                            current_chunk = reopen_tags()
                        current_chunk += escaped
                    else:
                        # Close all tags above the target sequentially
                        names_above = [open_stack[i]['name'] for i in range(len(open_stack) - 1, target_idx, -1)]
                        estimated = sum(len(f"</{n}>") for n in names_above) + (end_tag - start_tag)
                        if len(current_chunk) + estimated > allowed_total:
                            # Start a new chunk before emitting the closing sequence to stay within budget
                            current_chunk += close_open_tags()
                            chunks.append(current_chunk)
                            current_chunk = reopen_tags()
                        # Emit the closing tags for the ones above the target
                        for n in names_above:
                            current_chunk += f"</{n}>"
                            open_stack.pop()
                        # Finally append the original closing tag for the target and pop it
                        current_chunk += tag_full
                        open_stack.pop()  # снимаем целевой тег
        else:
            start, end = next_match.span()
            tag_full = next_match.group(0)
            is_closing = next_match.group(1) == "/"
            tag_name = next_match.group(2)

            if start - position + len(current_chunk) >= chunk_size:
                current_chunk += close_open_tags()
                chunks.append(current_chunk)
                current_chunk = reopen_tags()

            current_chunk += text[position:start]
            position = start

            if is_closing:
                if tag_name in open_tags:
                    open_tags.remove(tag_name)
            # Opening tag
            # If we are inside pre/code and encounter a non pre/code tag — escape as text, do not push to stack
            if open_stack and open_stack[-1]['name'] in {"pre", "code"} and tag_name not in {"pre", "code"}:
                escaped_open = escape_tag_text(tag_full)
                if len(current_chunk) + len(escaped_open) > allowed_total:
                    current_chunk += close_open_tags()
                    chunks.append(current_chunk)
                    current_chunk = reopen_tags()
                current_chunk += escaped_open
            else:
                open_tags.append(tag_name)

            current_chunk += tag_full
            position = end

                if len(current_chunk) + (end_tag - start_tag) > allowed_total:
                    current_chunk += close_open_tags()
                    chunks.append(current_chunk)
                    current_chunk = reopen_tags()

                current_chunk += tag_full

                # Do not push self-closing tags to the stack
                if not is_self_closing(tag_name, tag_full):
                    # Save the original opening form with attributes.
                    # Special case blockquote expandable — keep as-is.
                    opening = tag_full
                    open_stack.append({
                        'name': tag_name,
                        'open': opening,
                    })

        position = end_tag

    # Finalization
    if current_chunk:
        current_chunk += close_open_tags()
        chunks.append(current_chunk)