feat: интеграция ML-скоринга с использованием RAG и DeepSeek

- Обновлен Dockerfile для установки необходимых зависимостей.
- Добавлены новые переменные окружения для настройки ML-скоринга в env.example.
- Реализованы методы для получения и обновления ML-скоров в AsyncBotDB и PostRepository.
- Обновлены обработчики публикации постов для интеграции ML-скоринга.
- Добавлен новый обработчик для получения статистики ML-скоринга в админ-панели.
- Обновлены функции для форматирования сообщений с учетом ML-скоров.
This commit is contained in:
2026-01-26 18:40:38 +03:00
parent e2b1353408
commit 7f6f0f028c
25 changed files with 2833 additions and 52 deletions

View File

@@ -16,6 +16,7 @@ from helper_bot.keyboards.keyboards import (create_keyboard_for_approve_ban,
create_keyboard_for_ban_reason,
create_keyboard_with_pagination,
get_reply_keyboard_admin)
from helper_bot.utils.base_dependency_factory import get_global_instance
# Local imports - metrics
from helper_bot.utils.metrics import db_query_time, track_errors, track_time
from logs.custom_logger import logger
@@ -137,6 +138,69 @@ async def get_banned_users(
await handle_admin_error(message, e, state, "get_banned_users")
@admin_router.message(
    ChatTypeFilter(chat_type=["private"]),
    StateFilter("ADMIN"),
    F.text == '📊 ML Статистика'
)
@track_time("get_ml_stats", "admin_handlers")
@track_errors("admin_handlers", "get_ml_stats")
async def get_ml_stats(
    message: types.Message,
    state: FSMContext,
    **kwargs
):
    """Reply to the admin with the current ML scoring statistics.

    The scoring manager is taken from the global dependency instance. When it
    is missing, a short "scoring disabled" hint is sent instead. Otherwise the
    dict returned by ``scoring_manager.get_stats()`` is rendered as an HTML
    message with one section per key found in it ("rag" and/or "deepseek").

    Args:
        message: Incoming admin message; the reply is sent to it.
        state: FSM context of the admin; not used by this handler.
        **kwargs: Extra arguments passed by the dispatcher; ignored.

    Any exception is logged and reported to the admin as a plain-text message.
    """
    # Function-scope import keeps this handler self-contained.
    from html import escape

    def safe(value) -> str:
        # The reply is sent with parse_mode="HTML", so every dynamic value must
        # be escaped: a stray "<" or "&" would make Telegram reject the message.
        return escape(str(value), quote=False)

    def mark(flag) -> str:
        return '✅' if flag else '❌'

    try:
        logger.info(f"Запрос ML статистики от пользователя: {message.from_user.full_name}")

        bdf = get_global_instance()
        scoring_manager = bdf.get_scoring_manager()

        if not scoring_manager:
            await message.answer("📊 ML Scoring отключен\n\nДля включения установите RAG_ENABLED=true или DEEPSEEK_ENABLED=true в .env")
            return

        stats = scoring_manager.get_stats()

        # Build the statistics text line by line.
        lines = ["📊 <b>ML Scoring Статистика</b>\n"]

        # RAG section
        if "rag" in stats:
            rag = stats["rag"]
            lines.append("🤖 <b>RAG (ruBERT):</b>")
            lines.append(f" • Статус: {'✅ Включен' if rag.get('enabled') else '❌ Отключен'}")
            lines.append(f" • Модель: {safe(rag.get('model_name', 'N/A'))}")
            # Previously both branches were empty strings, so the flag was never shown.
            lines.append(f" • Модель загружена: {mark(rag.get('model_loaded'))}")

            vs = rag.get("vector_store", {})
            lines.append(f" • Положительных примеров: {safe(vs.get('positive_count', 0))}")
            lines.append(f" • Отрицательных примеров: {safe(vs.get('negative_count', 0))}")
            # Bullet added so this line matches its siblings.
            lines.append(f" • Всего примеров: {safe(vs.get('total_count', 0))}")
            lines.append(f" • Макс. примеров: {safe(vs.get('max_examples', 'N/A'))}")
            lines.append("")

        # DeepSeek section
        if "deepseek" in stats:
            ds = stats["deepseek"]
            lines.append("🔮 <b>DeepSeek API:</b>")
            lines.append(f" • Статус: {'✅ Включен' if ds.get('enabled') else '❌ Отключен'}")
            lines.append(f" • Модель: {safe(ds.get('model', 'N/A'))}")
            lines.append(f" • Таймаут: {safe(ds.get('timeout', 'N/A'))}с")
            lines.append("")

        # Neither service reported any statistics.
        if "rag" not in stats and "deepseek" not in stats:
            lines.append("⚠️ Ни один сервис не настроен")

        await message.answer("\n".join(lines), parse_mode="HTML")

    except Exception as e:
        logger.error(f"Ошибка получения ML статистики: {e}")
        # Sent without parse_mode, so the raw error text needs no escaping.
        await message.answer(f"❌ Ошибка получения статистики: {str(e)}")
# ============================================================================
# ХЕНДЛЕРЫ ПРОЦЕССА БАНА
# ============================================================================

View File

@@ -15,7 +15,8 @@ def get_post_publish_service() -> PostPublishService:
db = bdf.get_db()
settings = bdf.settings
s3_storage = bdf.get_s3_storage()
return PostPublishService(None, db, settings, s3_storage)
scoring_manager = bdf.get_scoring_manager()
return PostPublishService(None, db, settings, s3_storage, scoring_manager)
def get_ban_service() -> BanService:

View File

@@ -29,12 +29,13 @@ from .exceptions import (BanError, PostNotFoundError, PublishError,
class PostPublishService:
def __init__(self, bot: Bot, db, settings: Dict[str, Any], s3_storage=None):
def __init__(self, bot: Bot, db, settings: Dict[str, Any], s3_storage=None, scoring_manager=None):
# bot может быть None - в этом случае используем бота из контекста сообщения
self.bot = bot
self.db = db
self.settings = settings
self.s3_storage = s3_storage
self.scoring_manager = scoring_manager
self.group_for_posts = settings['Telegram']['group_for_posts']
self.main_public = settings['Telegram']['main_public']
self.important_logs = settings['Telegram']['important_logs']
@@ -392,6 +393,9 @@ class PostPublishService:
async def _decline_single_post(self, call: CallbackQuery) -> None:
"""Отклонение одиночного поста"""
author_id = await self._get_author_id(call.message.message_id)
# Обучаем RAG на отклоненном посте перед удалением
await self._train_on_declined(call.message.message_id)
updated_rows = await self.db.update_status_by_message_id(call.message.message_id, "declined")
if updated_rows == 0:
@@ -485,6 +489,9 @@ class PostPublishService:
@track_errors("post_publish_service", "_delete_post_and_notify_author")
async def _delete_post_and_notify_author(self, call: CallbackQuery, author_id: int) -> None:
"""Удаление поста и уведомление автора"""
# Получаем текст поста для обучения RAG перед удалением
await self._train_on_published(call.message.message_id)
await self._get_bot(call.message).delete_message(chat_id=self.group_for_posts, message_id=call.message.message_id)
try:
@@ -493,6 +500,32 @@ class PostPublishService:
if str(e) == ERROR_BOT_BLOCKED:
raise UserBlockedBotError("Пользователь заблокировал бота")
raise
async def _train_on_published(self, message_id: int) -> None:
    """Pass the text of a published post to the scoring manager.

    Does nothing when no scoring manager is configured. The text is looked up
    in the database by ``message_id`` and forwarded to
    ``scoring_manager.on_post_published`` unless it is empty, whitespace-only
    or equal to "^" (presumably a placeholder for posts without text —
    confirm). Errors are logged and never propagated to the caller.
    """
    manager = self.scoring_manager
    if not manager:
        return

    try:
        post_text = await self.db.get_post_text_by_message_id(message_id)
        # Skip posts that carry no usable text.
        if not post_text or not post_text.strip() or post_text == "^":
            return
        await manager.on_post_published(post_text)
        logger.debug(f"RAG обучен на опубликованном посте: {message_id}")
    except Exception as exc:
        logger.error(f"Ошибка обучения RAG на опубликованном посте {message_id}: {exc}")
async def _train_on_declined(self, message_id: int) -> None:
    """Pass the text of a declined post to the scoring manager.

    Does nothing when no scoring manager is configured. The text is looked up
    in the database by ``message_id`` and forwarded to
    ``scoring_manager.on_post_declined`` unless it is empty, whitespace-only
    or equal to "^" (presumably a placeholder for posts without text —
    confirm). Errors are logged and never propagated to the caller.
    """
    manager = self.scoring_manager
    if not manager:
        return

    try:
        post_text = await self.db.get_post_text_by_message_id(message_id)
        # Skip posts that carry no usable text.
        if not post_text or not post_text.strip() or post_text == "^":
            return
        await manager.on_post_declined(post_text)
        logger.debug(f"RAG обучен на отклоненном посте: {message_id}")
    except Exception as exc:
        logger.error(f"Ошибка обучения RAG на отклоненном посте {message_id}: {exc}")
@track_time("_delete_media_group_and_notify_author", "post_publish_service")
@track_errors("post_publish_service", "_delete_media_group_and_notify_author")

View File

@@ -35,11 +35,11 @@ sleep = asyncio.sleep
class PrivateHandlers:
"""Main handler class for private messages"""
def __init__(self, db: AsyncBotDB, settings: BotSettings, s3_storage=None):
def __init__(self, db: AsyncBotDB, settings: BotSettings, s3_storage=None, scoring_manager=None):
self.db = db
self.settings = settings
self.user_service = UserService(db, settings)
self.post_service = PostService(db, settings, s3_storage)
self.post_service = PostService(db, settings, s3_storage, scoring_manager)
self.sticker_service = StickerService(settings)
self.router = Router()
@@ -240,18 +240,24 @@ class PrivateHandlers:
# Factory function to create handlers with dependencies
def create_private_handlers(db: AsyncBotDB, settings: BotSettings, s3_storage=None) -> PrivateHandlers:
def create_private_handlers(db: AsyncBotDB, settings: BotSettings, s3_storage=None, scoring_manager=None) -> PrivateHandlers:
"""Create private handlers instance with dependencies"""
return PrivateHandlers(db, settings, s3_storage)
return PrivateHandlers(db, settings, s3_storage, scoring_manager)
# Legacy router for backward compatibility
private_router = Router()
# Флаг инициализации для защиты от повторного вызова
_legacy_router_initialized = False
# Initialize with global dependencies (for backward compatibility)
def init_legacy_router():
"""Initialize legacy router with global dependencies"""
global private_router
global private_router, _legacy_router_initialized
if _legacy_router_initialized:
return
from helper_bot.utils.base_dependency_factory import get_global_instance
@@ -269,11 +275,13 @@ def init_legacy_router():
db = bdf.get_db()
s3_storage = bdf.get_s3_storage()
handlers = create_private_handlers(db, settings, s3_storage)
scoring_manager = bdf.get_scoring_manager()
handlers = create_private_handlers(db, settings, s3_storage, scoring_manager)
# Instead of trying to copy handlers, we'll use the new router directly
# This maintains backward compatibility while using the new architecture
private_router = handlers.router
_legacy_router_initialized = True
# Initialize legacy router
init_legacy_router()

View File

@@ -128,10 +128,11 @@ class UserService:
class PostService:
"""Service for post-related operations"""
def __init__(self, db: DatabaseProtocol, settings: BotSettings, s3_storage=None) -> None:
def __init__(self, db: DatabaseProtocol, settings: BotSettings, s3_storage=None, scoring_manager=None) -> None:
self.db = db
self.settings = settings
self.s3_storage = s3_storage
self.scoring_manager = scoring_manager
async def _save_media_background(self, sent_message: types.Message, bot_db: Any, s3_storage) -> None:
"""Сохраняет медиа в фоне, чтобы не блокировать ответ пользователю"""
@@ -142,18 +143,65 @@ class PostService:
except Exception as e:
logger.error(f"_save_media_background: Ошибка при сохранении медиа для поста {sent_message.message_id}: {e}")
async def _get_scores(self, text: str) -> tuple:
    """Score the post text with the configured scoring manager.

    Args:
        text: Raw post text or caption.

    Returns:
        A 5-tuple ``(deepseek_score, rag_score, rag_confidence,
        rag_score_pos_only, ml_scores_json)``. Every element is ``None`` when
        no scoring manager is configured, when ``text`` is empty or
        whitespace-only, or when scoring raises. ``ml_scores_json`` is a JSON
        string built from ``scores.to_json_dict()`` and is ``None`` when the
        result reports no scores.
    """
    no_scores = (None, None, None, None, None)

    if not (self.scoring_manager and text and text.strip()):
        return no_scores

    try:
        result = await self.scoring_manager.score_post(text)

        # Serialized form that is later stored in the database.
        import json
        if result.has_any_score():
            serialized = json.dumps(result.to_json_dict())
        else:
            serialized = None

        # RAG-specific details exist only when the RAG part of the result is set.
        rag_result = result.rag
        if rag_result:
            confidence = rag_result.confidence
            pos_only = rag_result.metadata.get("score_pos_only")
        else:
            confidence = None
            pos_only = None

        return result.deepseek_score, result.rag_score, confidence, pos_only, serialized
    except Exception as exc:
        logger.error(f"PostService: Ошибка получения скоров: {exc}")
        return no_scores
async def _save_scores_background(self, message_id: int, ml_scores_json: str) -> None:
    """Store the serialized ML scores for a post; intended to run as a background task.

    Args:
        message_id: Identifier of the post message the scores belong to.
        ml_scores_json: JSON string with the scores; nothing is written when it is empty.

    Database errors are logged and not re-raised.
    """
    if not ml_scores_json:
        return

    try:
        await self.db.update_ml_scores(message_id, ml_scores_json)
    except Exception as exc:
        logger.error(f"PostService: Ошибка сохранения скоров для {message_id}: {exc}")
@track_time("handle_text_post", "post_service")
@track_errors("post_service", "handle_text_post")
@db_query_time("handle_text_post", "posts", "insert")
async def handle_text_post(self, message: types.Message, first_name: str) -> None:
"""Handle text post submission"""
post_text = get_text_message(message.text.lower(), first_name, message.from_user.username)
raw_text = message.text or ""
# Получаем скоры для текста
deepseek_score, rag_score, rag_confidence, rag_score_pos_only, ml_scores_json = await self._get_scores(raw_text)
# Формируем текст с учетом скоров
post_text = get_text_message(
message.text.lower(),
first_name,
message.from_user.username,
deepseek_score=deepseek_score,
rag_score=rag_score,
rag_confidence=rag_confidence,
rag_score_pos_only=rag_score_pos_only,
)
markup = get_reply_keyboard_for_post()
sent_message = await send_text_message(self.settings.group_for_posts, message, post_text, markup)
# Сохраняем сырой текст и определяем анонимность
raw_text = message.text or ""
# Определяем анонимность
is_anonymous = determine_anonymity(raw_text)
post = TelegramPost(
@@ -164,23 +212,39 @@ class PostService:
is_anonymous=is_anonymous
)
await self.db.add_post(post)
# Сохраняем скоры в фоне
if ml_scores_json:
asyncio.create_task(self._save_scores_background(sent_message.message_id, ml_scores_json))
@track_time("handle_photo_post", "post_service")
@track_errors("post_service", "handle_photo_post")
@db_query_time("handle_photo_post", "posts", "insert")
async def handle_photo_post(self, message: types.Message, first_name: str) -> None:
"""Handle photo post submission"""
raw_caption = message.caption or ""
# Получаем скоры для текста
deepseek_score, rag_score, rag_confidence, rag_score_pos_only, ml_scores_json = await self._get_scores(raw_caption)
post_caption = ""
if message.caption:
post_caption = get_text_message(message.caption.lower(), first_name, message.from_user.username)
post_caption = get_text_message(
message.caption.lower(),
first_name,
message.from_user.username,
deepseek_score=deepseek_score,
rag_score=rag_score,
rag_confidence=rag_confidence,
rag_score_pos_only=rag_score_pos_only,
)
markup = get_reply_keyboard_for_post()
sent_message = await send_photo_message(
self.settings.group_for_posts, message, message.photo[-1].file_id, post_caption, markup
)
# Сохраняем сырой caption и определяем анонимность
raw_caption = message.caption or ""
# Определяем анонимность
is_anonymous = determine_anonymity(raw_caption)
post = TelegramPost(
@@ -191,25 +255,40 @@ class PostService:
is_anonymous=is_anonymous
)
await self.db.add_post(post)
# Сохраняем медиа в фоне, чтобы не блокировать ответ пользователю
# Сохраняем медиа и скоры в фоне
asyncio.create_task(self._save_media_background(sent_message, self.db, self.s3_storage))
if ml_scores_json:
asyncio.create_task(self._save_scores_background(sent_message.message_id, ml_scores_json))
@track_time("handle_video_post", "post_service")
@track_errors("post_service", "handle_video_post")
@db_query_time("handle_video_post", "posts", "insert")
async def handle_video_post(self, message: types.Message, first_name: str) -> None:
"""Handle video post submission"""
raw_caption = message.caption or ""
# Получаем скоры для текста
deepseek_score, rag_score, rag_confidence, rag_score_pos_only, ml_scores_json = await self._get_scores(raw_caption)
post_caption = ""
if message.caption:
post_caption = get_text_message(message.caption.lower(), first_name, message.from_user.username)
post_caption = get_text_message(
message.caption.lower(),
first_name,
message.from_user.username,
deepseek_score=deepseek_score,
rag_score=rag_score,
rag_confidence=rag_confidence,
rag_score_pos_only=rag_score_pos_only,
)
markup = get_reply_keyboard_for_post()
sent_message = await send_video_message(
self.settings.group_for_posts, message, message.video.file_id, post_caption, markup
)
# Сохраняем сырой caption и определяем анонимность
raw_caption = message.caption or ""
# Определяем анонимность
is_anonymous = determine_anonymity(raw_caption)
post = TelegramPost(
@@ -220,8 +299,11 @@ class PostService:
is_anonymous=is_anonymous
)
await self.db.add_post(post)
# Сохраняем медиа в фоне, чтобы не блокировать ответ пользователю
# Сохраняем медиа и скоры в фоне
asyncio.create_task(self._save_media_background(sent_message, self.db, self.s3_storage))
if ml_scores_json:
asyncio.create_task(self._save_scores_background(sent_message.message_id, ml_scores_json))
@track_time("handle_video_note_post", "post_service")
@track_errors("post_service", "handle_video_note_post")
@@ -253,17 +335,29 @@ class PostService:
@db_query_time("handle_audio_post", "posts", "insert")
async def handle_audio_post(self, message: types.Message, first_name: str) -> None:
"""Handle audio post submission"""
raw_caption = message.caption or ""
# Получаем скоры для текста
deepseek_score, rag_score, rag_confidence, rag_score_pos_only, ml_scores_json = await self._get_scores(raw_caption)
post_caption = ""
if message.caption:
post_caption = get_text_message(message.caption.lower(), first_name, message.from_user.username)
post_caption = get_text_message(
message.caption.lower(),
first_name,
message.from_user.username,
deepseek_score=deepseek_score,
rag_score=rag_score,
rag_confidence=rag_confidence,
rag_score_pos_only=rag_score_pos_only,
)
markup = get_reply_keyboard_for_post()
sent_message = await send_audio_message(
self.settings.group_for_posts, message, message.audio.file_id, post_caption, markup
)
# Сохраняем сырой caption и определяем анонимность
raw_caption = message.caption or ""
# Определяем анонимность
is_anonymous = determine_anonymity(raw_caption)
post = TelegramPost(
@@ -274,8 +368,11 @@ class PostService:
is_anonymous=is_anonymous
)
await self.db.add_post(post)
# Сохраняем медиа в фоне, чтобы не блокировать ответ пользователю
# Сохраняем медиа и скоры в фоне
asyncio.create_task(self._save_media_background(sent_message, self.db, self.s3_storage))
if ml_scores_json:
asyncio.create_task(self._save_scores_background(sent_message.message_id, ml_scores_json))
@track_time("handle_voice_post", "post_service")
@track_errors("post_service", "handle_voice_post")
@@ -310,10 +407,23 @@ class PostService:
"""Handle media group post submission"""
post_caption = " "
raw_caption = ""
ml_scores_json = None
if album and album[0].caption:
raw_caption = album[0].caption or ""
post_caption = get_text_message(album[0].caption.lower(), first_name, message.from_user.username)
# Получаем скоры для текста
deepseek_score, rag_score, rag_confidence, rag_score_pos_only, ml_scores_json = await self._get_scores(raw_caption)
post_caption = get_text_message(
album[0].caption.lower(),
first_name,
message.from_user.username,
deepseek_score=deepseek_score,
rag_score=rag_score,
rag_confidence=rag_confidence,
rag_score_pos_only=rag_score_pos_only,
)
is_anonymous = determine_anonymity(raw_caption)
media_group = await prepare_media_group_from_middlewares(album, post_caption)
@@ -333,6 +443,10 @@ class PostService:
)
await self.db.add_post(main_post)
# Сохраняем скоры в фоне
if ml_scores_json:
asyncio.create_task(self._save_scores_background(main_post_id, ml_scores_json))
for msg_id in media_group_message_ids:
await self.db.add_message_link(main_post_id, msg_id)