~cytrogen/evi-run

ref: f9177ebbdb748fbfd9aec9173405de7691a3bf26 evi-run/bot/utils/send_answer.py -rw-r--r-- 16.9 KiB
f9177ebb — Cytrogen fork: 定制修改 + 敏感数据清理 9 days ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
import logging
import re

from agents.mcp import MCPServerStdio
from aiogram.types import Message, BufferedInputFile
from chatgpt_md_converter import telegram_format
from redis.asyncio.client import Redis

from bot.utils.calculate_tokens import calculate_tokens
from database.models import User
from database.repositories.user import UserRepository
from database.repositories.utils import UtilsRepository
from bot.utils.agent_requests import AnswerText, text_request, AnswerImage, image_request
import bot.keyboards.inline as inline_kb


async def send_answer_text(user_ques: str, message: Message, answer: AnswerText, user: User, user_repo: UserRepository, i18n):
    """
    Deliver an agent answer to the chat and store the exchange in the user's context.

    When ``answer.image_bytes`` is set, the image is sent as a photo with
    ``answer.answer`` as its caption and both turns are stored afterwards.
    Otherwise both turns are stored first, the answer is split with
    ``split_code_message`` and every chunk is sent as a separate message;
    only the last chunk carries the inline keyboard built from the row id
    returned by ``add_context`` for the assistant turn.
    """
    owner_id = user.telegram_id

    if answer.image_bytes:
        photo = BufferedInputFile(answer.image_bytes, filename=f"{user.telegram_id}.jpeg")
        await message.answer_photo(photo=photo, caption=answer.answer)

        await user_repo.add_context(user_id=owner_id, role='user', content=user_ques)
        await user_repo.add_context(user_id=owner_id, role='assistant', content=answer.answer)
        return

    await user_repo.add_context(user_id=owner_id, role='user', content=user_ques)
    row_id = await user_repo.add_context(user_id=owner_id, role='assistant', content=answer.answer)

    parts = split_code_message(answer.answer)
    total = len(parts)

    for number, part in enumerate(parts, start=1):
        if number != total:
            await message.answer(part)
            continue
        # The final chunk gets the keyboard tied to the stored assistant row.
        await message.answer(part,
                             reply_markup=inline_kb.keyboard_md(row_id=row_id, text=i18n.get('answer_md')))


def split_code_message(text, type_: str = None):
    """
    Split Telegram HTML into chunks while keeping the markup of every chunk valid.

    - Self-closing tags are not pushed to the stack and therefore are not closed.
    - For opened tags the full opening form (including attributes) is stored so
      that it can be re-opened at the start of the next chunk.
    - A split never lands inside an HTML tag or inside an HTML entity.
    - Inside <pre>/<code> any other tag is rendered as escaped text.
    - Mis-nested closing tags outside <pre>/<code> are normalized by closing the
      tags above the target first; unmatched closing tags are escaped as text.

    Args:
        text: The answer text. When ``type_`` is falsy it is first converted with
            ``telegram_format``; otherwise it is used as-is.
        type_: Any truthy value skips the ``telegram_format`` conversion.

    Returns:
        A list of chunk strings. Every chunk closes the tags that are still open
        at its end, and the following chunk re-opens them. An empty input
        yields an empty list.
    """
    if not type_:
        text = telegram_format(text)
        text = text.replace('&lt;blockquote expandable&gt;', '<blockquote expandable>')

    # Escape HTML comments <!-- ... --> so they are treated as text,
    # not as tags that could break the open/close stack while splitting
    comment_pattern = re.compile(r"<!--.*?-->", re.DOTALL)

    def _escape_comment(m):
        c = m.group(0)
        return c.replace('<', '&lt;').replace('>', '&gt;')

    text = comment_pattern.sub(_escape_comment, text)

    chunks = []
    current_chunk = ""

    # Stack of opened tags: items are dicts {name, open}
    open_stack = []
    position = 0

    tag_pattern = re.compile(r"<(\/)?([a-zA-Z0-9\-]+)([^>]*)>")

    # Set of self-closing/non-closing tags in Telegram HTML context
    SELF_CLOSING = {"br"}
    # Tags whose content is treated verbatim: foreign tags inside are escaped
    VERBATIM_TAGS = {"pre", "code"}

    # Chunk budget parameters (loop-invariant)
    BASE_LIMIT = 3900
    SAFETY = 64
    MIN_BUDGET = 1000

    def is_self_closing(tag_name: str, tag_full: str) -> bool:
        return tag_name in SELF_CLOSING or tag_full.strip().endswith('/>')

    def close_open_tags() -> str:
        # Close the currently opened tags in reverse order
        return "".join(f"</{item['name']}>" for item in reversed(open_stack))

    def reopen_tags() -> str:
        # Re-open saved opening tags (with attributes) in original order.
        # For blockquote expandable the original form is kept as-is.
        return "".join(item['open'] for item in open_stack)

    def escape_tag_text(tag_text: str) -> str:
        """Render a tag as plain text by escaping angle brackets."""
        return tag_text.replace('<', '&lt;').replace('>', '&gt;')

    def break_chunk(chunk: str) -> str:
        """Finish ``chunk`` (closing open tags) and return the prefix of the next chunk."""
        chunks.append(chunk + close_open_tags())
        return reopen_tags()

    def make_room(chunk: str, extra: int, budget: int) -> str:
        """Break the chunk first when ``extra`` more characters would exceed ``budget``."""
        if len(chunk) + extra > budget:
            return break_chunk(chunk)
        return chunk

    def safe_cut_index(text_: str, start: int, tentative_end: int) -> int:
        """Shift a split position so that we never cut inside a tag or an HTML entity."""
        end = min(tentative_end, len(text_))
        if end <= start:
            return end

        segment = text_[start:end]

        # 1) Do not split inside a tag: if the last '<' is after the last '>' -> move back to that '<'
        last_lt = segment.rfind('<')
        last_gt = segment.rfind('>')
        if last_lt != -1 and (last_gt == -1 or last_lt > last_gt):
            end = start + last_lt
            if end <= start:
                return start
            segment = text_[start:end]

        # 2) Do not split inside an entity: if there's '&' after the last ';' -> move back to that '&'
        last_amp = segment.rfind('&')
        last_semi = segment.rfind(';')
        if last_amp != -1 and (last_semi == -1 or last_amp > last_semi):
            end = start + last_amp

        return end

    text_len = len(text)
    while position < text_len:
        # Dynamic budget for the current chunk: reserve room for the tags that
        # must be closed at its end and re-opened in the next chunk.
        overhead = len(close_open_tags()) + len(reopen_tags())
        allowed_total = max(MIN_BUDGET, min(BASE_LIMIT, BASE_LIMIT - overhead - SAFETY))

        # If current chunk is full — close and start a new one
        if len(current_chunk) >= allowed_total:
            current_chunk = break_chunk(current_chunk)

        # Compute the boundary where we can safely write more characters
        tentative_end = position + (allowed_total - len(current_chunk))
        if tentative_end <= position:
            # No room left — force a chunk break
            current_chunk = break_chunk(current_chunk)
            continue

        # Look for the next tag before the boundary
        window_end = min(tentative_end, text_len)
        next_match = tag_pattern.search(text, position, window_end)

        if not next_match:
            # No tags before boundary — split at a safe position
            cut_idx = safe_cut_index(text, position, window_end)
            if cut_idx == position:
                # No safe position found in the window — extend the window to find the next tag/entity end
                extend_end = min(position + 100 + (allowed_total - len(current_chunk)), text_len)
                next_match_ext = tag_pattern.search(text, position, extend_end)
                if next_match_ext:
                    cut_idx = next_match_ext.start()
                else:
                    # No complete tag found in lookahead — split before a partial '<...'
                    extended_segment = text[position:extend_end]
                    last_lt = extended_segment.rfind('<')
                    if last_lt != -1 and extended_segment.find('>', last_lt + 1) == -1:
                        # Tag is not completed within the window — cut before '<'
                        cut_idx = position + last_lt
                    else:
                        cut_idx = extend_end
            # Zero-shift guard (when cut_idx == position):
            # happens if a partial tag starts exactly at 'position'.
            if cut_idx == position:
                # A chunk that holds nothing but the re-opened tag prefix has no
                # content of its own. Breaking it would only emit a content-free
                # chunk and restore the same state, looping forever, so it is
                # handled like an empty chunk.
                if current_chunk and current_chunk != reopen_tags():
                    # Close current chunk and start a new one before continuing
                    current_chunk = break_chunk(current_chunk)
                    continue
                # Nothing to flush — search forward to the next '>' and advance at least to it
                search_end = min(position + 300, text_len)
                gt_global = text.find('>', position, search_end)
                # Last resort when no '>' is found: move to search_end to guarantee progress
                cut_idx = gt_global + 1 if gt_global != -1 else search_end
            current_chunk += text[position:cut_idx]
            position = cut_idx
            continue

        # There is a tag before the boundary
        start_tag, end_tag = next_match.span()
        tag_full = next_match.group(0)
        is_closing = next_match.group(1) == "/"
        tag_name = next_match.group(2)
        tag_len = end_tag - start_tag

        # If text before the tag doesn't fit — break the chunk
        if (start_tag - position) + len(current_chunk) > allowed_total:
            current_chunk = break_chunk(current_chunk)
            continue

        # Append text up to the tag
        current_chunk += text[position:start_tag]
        position = start_tag

        inside_verbatim = bool(open_stack) and open_stack[-1]['name'] in VERBATIM_TAGS

        # Tag handling
        if is_closing:
            # Prefer strict LIFO, but outside pre/code try to fix nesting to preserve formatting
            if open_stack and open_stack[-1]['name'] == tag_name:
                # Make sure the tag itself fits into the current chunk, then pop the top tag
                current_chunk = make_room(current_chunk, tag_len, allowed_total)
                current_chunk += tag_full
                open_stack.pop()
            else:
                target_idx = None
                if not inside_verbatim:
                    # Outside pre/code: find the target tag in the stack (from the end)
                    for idx in range(len(open_stack) - 1, -1, -1):
                        if open_stack[idx]['name'] == tag_name:
                            target_idx = idx
                            break
                if target_idx is None:
                    # Inside pre/code, or no matching opened tag — escape the closing tag as text
                    escaped = escape_tag_text(tag_full)
                    current_chunk = make_room(current_chunk, len(escaped), allowed_total)
                    current_chunk += escaped
                else:
                    # Normalize nesting: close all tags above the target, innermost first
                    names_above = [item['name'] for item in reversed(open_stack[target_idx + 1:])]
                    closing_above = "".join(f"</{n}>" for n in names_above)
                    # Start a new chunk before emitting the closing sequence if it would exceed the budget
                    current_chunk = make_room(current_chunk, len(closing_above) + tag_len, allowed_total)
                    # Emit the closing tags above the target, then the original closing tag
                    current_chunk += closing_above + tag_full
                    # Remove the target tag and everything above it from the stack
                    del open_stack[target_idx:]
        elif inside_verbatim and tag_name not in VERBATIM_TAGS:
            # Inside pre/code a non pre/code opening tag is escaped as text and not pushed to the stack
            escaped_open = escape_tag_text(tag_full)
            current_chunk = make_room(current_chunk, len(escaped_open), allowed_total)
            current_chunk += escaped_open
        else:
            # Regular opening tag
            current_chunk = make_room(current_chunk, tag_len, allowed_total)
            current_chunk += tag_full

            # Do not push self-closing tags to the stack
            if not is_self_closing(tag_name, tag_full):
                # Save the original opening form with attributes
                # (blockquote expandable is kept as-is).
                open_stack.append({
                    'name': tag_name,
                    'open': tag_full,
                })

        position = end_tag

    # Finalization
    if current_chunk:
        chunks.append(current_chunk + close_open_tags())

    return chunks


async def process_after_text(message: Message, user: User, user_repo: UserRepository,
                             utils_repo: UtilsRepository, redis: Redis, i18n,
                             mess_to_delete: Message, mcp_server_1: MCPServerStdio, scheduler, text_from_voice: str = None,
                             constant_text: str = None):
    """
    Run a text request through the agent, send the answer and account for tokens.

    The question is taken from ``text_from_voice`` if given, otherwise from
    ``constant_text``, otherwise from ``message.text``. Any failure is logged
    with its traceback and the user receives the ``warning_text_error`` text.
    In all cases the ``request_<user id>`` key is removed from Redis and
    ``mess_to_delete`` is deleted.
    """
    try:
        # First non-empty source wins: voice transcript, fixed text, message text.
        user_ques = text_from_voice or constant_text or message.text

        answer = await text_request(text=user_ques, user=user,
                                    user_repo=user_repo, utils_repo=utils_repo, redis=redis, mcp_server_1=mcp_server_1,
                                    bot=message.bot, scheduler=scheduler)

        await send_answer_text(user_ques=user_ques,
                               message=message, answer=answer, user=user, user_repo=user_repo, i18n=i18n)

        await calculate_tokens(user=user, user_repo=user_repo, input_tokens_text=answer.input_tokens,
                               input_tokens_img=answer.input_tokens_image, output_tokens_text=answer.output_tokens,
                               output_tokens_img=answer.output_tokens_image)
    except Exception:
        # Top-level handler boundary: keep the full traceback instead of only str(e).
        logging.getLogger(__name__).exception('Failed to process text request')
        await message.answer(text=i18n.get('warning_text_error'))
    finally:
        await redis.delete(f'request_{message.from_user.id}')
        await mess_to_delete.delete()


async def send_answer_photo(message: Message, answer: AnswerImage, user: User, user_repo: UserRepository):
    """
    Store an image exchange in the user's context and send the answer text.

    The user turn is stored as ``<image_path>|<caption>``, with ``'.'`` standing
    in for a missing caption. The answer is split with ``split_code_message``
    and each chunk is sent as its own message.
    """
    caption_text = message.caption or '.'
    owner_id = user.telegram_id

    await user_repo.add_context(user_id=owner_id, role='user', content=f'{answer.image_path}|{caption_text}')
    await user_repo.add_context(user_id=owner_id, role='assistant', content=answer.answer)

    for part in split_code_message(answer.answer):
        await message.answer(part)


async def process_after_photo(message: Message, user: User, user_repo: UserRepository,
                              utils_repo: UtilsRepository, redis: Redis, i18n, mess_to_delete: Message,
                              mcp_server_1: MCPServerStdio, scheduler):
    """
    Run a photo request through the agent, send the answer and account for tokens.

    The last entry of ``message.photo`` is downloaded and passed to
    ``image_request`` together with the message caption. Any failure is logged
    with its traceback and the user receives the ``warning_text_error`` text.
    In all cases the ``request_<user id>`` key is removed from Redis and
    ``mess_to_delete`` is deleted.
    """
    try:
        file_id = message.photo[-1].file_id
        file_path = await message.bot.get_file(file_id=file_id)
        file_bytes = (await message.bot.download_file(file_path.file_path)).read()
        answer = await image_request(image_bytes=file_bytes, user=user, user_repo=user_repo,
                                     utils_repo=utils_repo, redis=redis, mcp_server_1=mcp_server_1, bot=message.bot,
                                     caption=message.caption, scheduler=scheduler)

        await send_answer_photo(message=message, answer=answer, user=user, user_repo=user_repo)

        await calculate_tokens(user=user, user_repo=user_repo, input_tokens_text=answer.input_tokens,
                               input_tokens_img=0, output_tokens_text=answer.output_tokens,
                               output_tokens_img=0)
    except Exception:
        # Top-level handler boundary: previously the error was dropped silently.
        logging.getLogger(__name__).exception('Failed to process photo request')
        await message.answer(text=i18n.get('warning_text_error'))
    finally:
        await redis.delete(f'request_{message.from_user.id}')
        await mess_to_delete.delete()