feat: интеграция ML-скоринга с использованием RAG и DeepSeek
- Обновлен Dockerfile для установки необходимых зависимостей. - Добавлены новые переменные окружения для настройки ML-скоринга в env.example. - Реализованы методы для получения и обновления ML-скоров в AsyncBotDB и PostRepository. - Обновлены обработчики публикации постов для интеграции ML-скоринга. - Добавлен новый обработчик для получения статистики ML-скоринга в админ-панели. - Обновлены функции для форматирования сообщений с учетом ML-скоров.
This commit is contained in:
@@ -16,6 +16,7 @@ from helper_bot.keyboards.keyboards import (create_keyboard_for_approve_ban,
|
||||
create_keyboard_for_ban_reason,
|
||||
create_keyboard_with_pagination,
|
||||
get_reply_keyboard_admin)
|
||||
from helper_bot.utils.base_dependency_factory import get_global_instance
|
||||
# Local imports - metrics
|
||||
from helper_bot.utils.metrics import db_query_time, track_errors, track_time
|
||||
from logs.custom_logger import logger
|
||||
@@ -137,6 +138,69 @@ async def get_banned_users(
|
||||
await handle_admin_error(message, e, state, "get_banned_users")
|
||||
|
||||
|
||||
@admin_router.message(
|
||||
ChatTypeFilter(chat_type=["private"]),
|
||||
StateFilter("ADMIN"),
|
||||
F.text == '📊 ML Статистика'
|
||||
)
|
||||
@track_time("get_ml_stats", "admin_handlers")
|
||||
@track_errors("admin_handlers", "get_ml_stats")
|
||||
async def get_ml_stats(
|
||||
message: types.Message,
|
||||
state: FSMContext,
|
||||
**kwargs
|
||||
):
|
||||
"""Получение статистики ML-скоринга"""
|
||||
try:
|
||||
logger.info(f"Запрос ML статистики от пользователя: {message.from_user.full_name}")
|
||||
|
||||
bdf = get_global_instance()
|
||||
scoring_manager = bdf.get_scoring_manager()
|
||||
|
||||
if not scoring_manager:
|
||||
await message.answer("📊 ML Scoring отключен\n\nДля включения установите RAG_ENABLED=true или DEEPSEEK_ENABLED=true в .env")
|
||||
return
|
||||
|
||||
stats = scoring_manager.get_stats()
|
||||
|
||||
# Формируем текст статистики
|
||||
lines = ["📊 <b>ML Scoring Статистика</b>\n"]
|
||||
|
||||
# RAG статистика
|
||||
if "rag" in stats:
|
||||
rag = stats["rag"]
|
||||
lines.append("🤖 <b>RAG (ruBERT):</b>")
|
||||
lines.append(f" • Статус: {'✅ Включен' if rag.get('enabled') else '❌ Отключен'}")
|
||||
lines.append(f" • Модель: {rag.get('model_name', 'N/A')}")
|
||||
lines.append(f" • Модель загружена: {'✅' if rag.get('model_loaded') else '❌'}")
|
||||
|
||||
vs = rag.get("vector_store", {})
|
||||
lines.append(f" • Положительных примеров: {vs.get('positive_count', 0)}")
|
||||
lines.append(f" • Отрицательных примеров: {vs.get('negative_count', 0)}")
|
||||
lines.append(f" • Всего примеров: {vs.get('total_count', 0)}")
|
||||
lines.append(f" • Макс. примеров: {vs.get('max_examples', 'N/A')}")
|
||||
lines.append("")
|
||||
|
||||
# DeepSeek статистика
|
||||
if "deepseek" in stats:
|
||||
ds = stats["deepseek"]
|
||||
lines.append("🔮 <b>DeepSeek API:</b>")
|
||||
lines.append(f" • Статус: {'✅ Включен' if ds.get('enabled') else '❌ Отключен'}")
|
||||
lines.append(f" • Модель: {ds.get('model', 'N/A')}")
|
||||
lines.append(f" • Таймаут: {ds.get('timeout', 'N/A')}с")
|
||||
lines.append("")
|
||||
|
||||
# Если ничего не включено
|
||||
if "rag" not in stats and "deepseek" not in stats:
|
||||
lines.append("⚠️ Ни один сервис не настроен")
|
||||
|
||||
await message.answer("\n".join(lines), parse_mode="HTML")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Ошибка получения ML статистики: {e}")
|
||||
await message.answer(f"❌ Ошибка получения статистики: {str(e)}")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# ХЕНДЛЕРЫ ПРОЦЕССА БАНА
|
||||
# ============================================================================
|
||||
|
||||
@@ -15,7 +15,8 @@ def get_post_publish_service() -> PostPublishService:
|
||||
db = bdf.get_db()
|
||||
settings = bdf.settings
|
||||
s3_storage = bdf.get_s3_storage()
|
||||
return PostPublishService(None, db, settings, s3_storage)
|
||||
scoring_manager = bdf.get_scoring_manager()
|
||||
return PostPublishService(None, db, settings, s3_storage, scoring_manager)
|
||||
|
||||
|
||||
def get_ban_service() -> BanService:
|
||||
|
||||
@@ -29,12 +29,13 @@ from .exceptions import (BanError, PostNotFoundError, PublishError,
|
||||
|
||||
|
||||
class PostPublishService:
|
||||
def __init__(self, bot: Bot, db, settings: Dict[str, Any], s3_storage=None):
|
||||
def __init__(self, bot: Bot, db, settings: Dict[str, Any], s3_storage=None, scoring_manager=None):
|
||||
# bot может быть None - в этом случае используем бота из контекста сообщения
|
||||
self.bot = bot
|
||||
self.db = db
|
||||
self.settings = settings
|
||||
self.s3_storage = s3_storage
|
||||
self.scoring_manager = scoring_manager
|
||||
self.group_for_posts = settings['Telegram']['group_for_posts']
|
||||
self.main_public = settings['Telegram']['main_public']
|
||||
self.important_logs = settings['Telegram']['important_logs']
|
||||
@@ -392,6 +393,9 @@ class PostPublishService:
|
||||
async def _decline_single_post(self, call: CallbackQuery) -> None:
|
||||
"""Отклонение одиночного поста"""
|
||||
author_id = await self._get_author_id(call.message.message_id)
|
||||
|
||||
# Обучаем RAG на отклоненном посте перед удалением
|
||||
await self._train_on_declined(call.message.message_id)
|
||||
|
||||
updated_rows = await self.db.update_status_by_message_id(call.message.message_id, "declined")
|
||||
if updated_rows == 0:
|
||||
@@ -485,6 +489,9 @@ class PostPublishService:
|
||||
@track_errors("post_publish_service", "_delete_post_and_notify_author")
|
||||
async def _delete_post_and_notify_author(self, call: CallbackQuery, author_id: int) -> None:
|
||||
"""Удаление поста и уведомление автора"""
|
||||
# Получаем текст поста для обучения RAG перед удалением
|
||||
await self._train_on_published(call.message.message_id)
|
||||
|
||||
await self._get_bot(call.message).delete_message(chat_id=self.group_for_posts, message_id=call.message.message_id)
|
||||
|
||||
try:
|
||||
@@ -493,6 +500,32 @@ class PostPublishService:
|
||||
if str(e) == ERROR_BOT_BLOCKED:
|
||||
raise UserBlockedBotError("Пользователь заблокировал бота")
|
||||
raise
|
||||
|
||||
async def _train_on_published(self, message_id: int) -> None:
|
||||
"""Обучает RAG на опубликованном посте."""
|
||||
if not self.scoring_manager:
|
||||
return
|
||||
|
||||
try:
|
||||
text = await self.db.get_post_text_by_message_id(message_id)
|
||||
if text and text.strip() and text != "^":
|
||||
await self.scoring_manager.on_post_published(text)
|
||||
logger.debug(f"RAG обучен на опубликованном посте: {message_id}")
|
||||
except Exception as e:
|
||||
logger.error(f"Ошибка обучения RAG на опубликованном посте {message_id}: {e}")
|
||||
|
||||
async def _train_on_declined(self, message_id: int) -> None:
|
||||
"""Обучает RAG на отклоненном посте."""
|
||||
if not self.scoring_manager:
|
||||
return
|
||||
|
||||
try:
|
||||
text = await self.db.get_post_text_by_message_id(message_id)
|
||||
if text and text.strip() and text != "^":
|
||||
await self.scoring_manager.on_post_declined(text)
|
||||
logger.debug(f"RAG обучен на отклоненном посте: {message_id}")
|
||||
except Exception as e:
|
||||
logger.error(f"Ошибка обучения RAG на отклоненном посте {message_id}: {e}")
|
||||
|
||||
@track_time("_delete_media_group_and_notify_author", "post_publish_service")
|
||||
@track_errors("post_publish_service", "_delete_media_group_and_notify_author")
|
||||
|
||||
@@ -35,11 +35,11 @@ sleep = asyncio.sleep
|
||||
class PrivateHandlers:
|
||||
"""Main handler class for private messages"""
|
||||
|
||||
def __init__(self, db: AsyncBotDB, settings: BotSettings, s3_storage=None):
|
||||
def __init__(self, db: AsyncBotDB, settings: BotSettings, s3_storage=None, scoring_manager=None):
|
||||
self.db = db
|
||||
self.settings = settings
|
||||
self.user_service = UserService(db, settings)
|
||||
self.post_service = PostService(db, settings, s3_storage)
|
||||
self.post_service = PostService(db, settings, s3_storage, scoring_manager)
|
||||
self.sticker_service = StickerService(settings)
|
||||
|
||||
self.router = Router()
|
||||
@@ -240,18 +240,24 @@ class PrivateHandlers:
|
||||
|
||||
|
||||
# Factory function to create handlers with dependencies
|
||||
def create_private_handlers(db: AsyncBotDB, settings: BotSettings, s3_storage=None) -> PrivateHandlers:
|
||||
def create_private_handlers(db: AsyncBotDB, settings: BotSettings, s3_storage=None, scoring_manager=None) -> PrivateHandlers:
|
||||
"""Create private handlers instance with dependencies"""
|
||||
return PrivateHandlers(db, settings, s3_storage)
|
||||
return PrivateHandlers(db, settings, s3_storage, scoring_manager)
|
||||
|
||||
|
||||
# Legacy router for backward compatibility
|
||||
private_router = Router()
|
||||
|
||||
# Флаг инициализации для защиты от повторного вызова
|
||||
_legacy_router_initialized = False
|
||||
|
||||
# Initialize with global dependencies (for backward compatibility)
|
||||
def init_legacy_router():
|
||||
"""Initialize legacy router with global dependencies"""
|
||||
global private_router
|
||||
global private_router, _legacy_router_initialized
|
||||
|
||||
if _legacy_router_initialized:
|
||||
return
|
||||
|
||||
from helper_bot.utils.base_dependency_factory import get_global_instance
|
||||
|
||||
@@ -269,11 +275,13 @@ def init_legacy_router():
|
||||
|
||||
db = bdf.get_db()
|
||||
s3_storage = bdf.get_s3_storage()
|
||||
handlers = create_private_handlers(db, settings, s3_storage)
|
||||
scoring_manager = bdf.get_scoring_manager()
|
||||
handlers = create_private_handlers(db, settings, s3_storage, scoring_manager)
|
||||
|
||||
# Instead of trying to copy handlers, we'll use the new router directly
|
||||
# This maintains backward compatibility while using the new architecture
|
||||
private_router = handlers.router
|
||||
_legacy_router_initialized = True
|
||||
|
||||
# Initialize legacy router
|
||||
init_legacy_router()
|
||||
|
||||
@@ -128,10 +128,11 @@ class UserService:
|
||||
class PostService:
|
||||
"""Service for post-related operations"""
|
||||
|
||||
def __init__(self, db: DatabaseProtocol, settings: BotSettings, s3_storage=None) -> None:
|
||||
def __init__(self, db: DatabaseProtocol, settings: BotSettings, s3_storage=None, scoring_manager=None) -> None:
|
||||
self.db = db
|
||||
self.settings = settings
|
||||
self.s3_storage = s3_storage
|
||||
self.scoring_manager = scoring_manager
|
||||
|
||||
async def _save_media_background(self, sent_message: types.Message, bot_db: Any, s3_storage) -> None:
|
||||
"""Сохраняет медиа в фоне, чтобы не блокировать ответ пользователю"""
|
||||
@@ -142,18 +143,65 @@ class PostService:
|
||||
except Exception as e:
|
||||
logger.error(f"_save_media_background: Ошибка при сохранении медиа для поста {sent_message.message_id}: {e}")
|
||||
|
||||
async def _get_scores(self, text: str) -> tuple:
|
||||
"""
|
||||
Получает скоры для текста поста.
|
||||
|
||||
Returns:
|
||||
Tuple (deepseek_score, rag_score, rag_confidence, rag_score_pos_only, ml_scores_json)
|
||||
"""
|
||||
if not self.scoring_manager or not text or not text.strip():
|
||||
return None, None, None, None, None
|
||||
|
||||
try:
|
||||
scores = await self.scoring_manager.score_post(text)
|
||||
|
||||
# Формируем JSON для сохранения в БД
|
||||
import json
|
||||
ml_scores_json = json.dumps(scores.to_json_dict()) if scores.has_any_score() else None
|
||||
|
||||
# Получаем данные от RAG
|
||||
rag_confidence = scores.rag.confidence if scores.rag else None
|
||||
rag_score_pos_only = scores.rag.metadata.get("score_pos_only") if scores.rag else None
|
||||
|
||||
return scores.deepseek_score, scores.rag_score, rag_confidence, rag_score_pos_only, ml_scores_json
|
||||
except Exception as e:
|
||||
logger.error(f"PostService: Ошибка получения скоров: {e}")
|
||||
return None, None, None, None, None
|
||||
|
||||
async def _save_scores_background(self, message_id: int, ml_scores_json: str) -> None:
|
||||
"""Сохраняет скоры в БД в фоне."""
|
||||
if ml_scores_json:
|
||||
try:
|
||||
await self.db.update_ml_scores(message_id, ml_scores_json)
|
||||
except Exception as e:
|
||||
logger.error(f"PostService: Ошибка сохранения скоров для {message_id}: {e}")
|
||||
|
||||
@track_time("handle_text_post", "post_service")
|
||||
@track_errors("post_service", "handle_text_post")
|
||||
@db_query_time("handle_text_post", "posts", "insert")
|
||||
async def handle_text_post(self, message: types.Message, first_name: str) -> None:
|
||||
"""Handle text post submission"""
|
||||
post_text = get_text_message(message.text.lower(), first_name, message.from_user.username)
|
||||
raw_text = message.text or ""
|
||||
|
||||
# Получаем скоры для текста
|
||||
deepseek_score, rag_score, rag_confidence, rag_score_pos_only, ml_scores_json = await self._get_scores(raw_text)
|
||||
|
||||
# Формируем текст с учетом скоров
|
||||
post_text = get_text_message(
|
||||
message.text.lower(),
|
||||
first_name,
|
||||
message.from_user.username,
|
||||
deepseek_score=deepseek_score,
|
||||
rag_score=rag_score,
|
||||
rag_confidence=rag_confidence,
|
||||
rag_score_pos_only=rag_score_pos_only,
|
||||
)
|
||||
markup = get_reply_keyboard_for_post()
|
||||
|
||||
sent_message = await send_text_message(self.settings.group_for_posts, message, post_text, markup)
|
||||
|
||||
# Сохраняем сырой текст и определяем анонимность
|
||||
raw_text = message.text or ""
|
||||
# Определяем анонимность
|
||||
is_anonymous = determine_anonymity(raw_text)
|
||||
|
||||
post = TelegramPost(
|
||||
@@ -164,23 +212,39 @@ class PostService:
|
||||
is_anonymous=is_anonymous
|
||||
)
|
||||
await self.db.add_post(post)
|
||||
|
||||
# Сохраняем скоры в фоне
|
||||
if ml_scores_json:
|
||||
asyncio.create_task(self._save_scores_background(sent_message.message_id, ml_scores_json))
|
||||
|
||||
@track_time("handle_photo_post", "post_service")
|
||||
@track_errors("post_service", "handle_photo_post")
|
||||
@db_query_time("handle_photo_post", "posts", "insert")
|
||||
async def handle_photo_post(self, message: types.Message, first_name: str) -> None:
|
||||
"""Handle photo post submission"""
|
||||
raw_caption = message.caption or ""
|
||||
|
||||
# Получаем скоры для текста
|
||||
deepseek_score, rag_score, rag_confidence, rag_score_pos_only, ml_scores_json = await self._get_scores(raw_caption)
|
||||
|
||||
post_caption = ""
|
||||
if message.caption:
|
||||
post_caption = get_text_message(message.caption.lower(), first_name, message.from_user.username)
|
||||
post_caption = get_text_message(
|
||||
message.caption.lower(),
|
||||
first_name,
|
||||
message.from_user.username,
|
||||
deepseek_score=deepseek_score,
|
||||
rag_score=rag_score,
|
||||
rag_confidence=rag_confidence,
|
||||
rag_score_pos_only=rag_score_pos_only,
|
||||
)
|
||||
|
||||
markup = get_reply_keyboard_for_post()
|
||||
sent_message = await send_photo_message(
|
||||
self.settings.group_for_posts, message, message.photo[-1].file_id, post_caption, markup
|
||||
)
|
||||
|
||||
# Сохраняем сырой caption и определяем анонимность
|
||||
raw_caption = message.caption or ""
|
||||
# Определяем анонимность
|
||||
is_anonymous = determine_anonymity(raw_caption)
|
||||
|
||||
post = TelegramPost(
|
||||
@@ -191,25 +255,40 @@ class PostService:
|
||||
is_anonymous=is_anonymous
|
||||
)
|
||||
await self.db.add_post(post)
|
||||
# Сохраняем медиа в фоне, чтобы не блокировать ответ пользователю
|
||||
|
||||
# Сохраняем медиа и скоры в фоне
|
||||
asyncio.create_task(self._save_media_background(sent_message, self.db, self.s3_storage))
|
||||
if ml_scores_json:
|
||||
asyncio.create_task(self._save_scores_background(sent_message.message_id, ml_scores_json))
|
||||
|
||||
@track_time("handle_video_post", "post_service")
|
||||
@track_errors("post_service", "handle_video_post")
|
||||
@db_query_time("handle_video_post", "posts", "insert")
|
||||
async def handle_video_post(self, message: types.Message, first_name: str) -> None:
|
||||
"""Handle video post submission"""
|
||||
raw_caption = message.caption or ""
|
||||
|
||||
# Получаем скоры для текста
|
||||
deepseek_score, rag_score, rag_confidence, rag_score_pos_only, ml_scores_json = await self._get_scores(raw_caption)
|
||||
|
||||
post_caption = ""
|
||||
if message.caption:
|
||||
post_caption = get_text_message(message.caption.lower(), first_name, message.from_user.username)
|
||||
post_caption = get_text_message(
|
||||
message.caption.lower(),
|
||||
first_name,
|
||||
message.from_user.username,
|
||||
deepseek_score=deepseek_score,
|
||||
rag_score=rag_score,
|
||||
rag_confidence=rag_confidence,
|
||||
rag_score_pos_only=rag_score_pos_only,
|
||||
)
|
||||
|
||||
markup = get_reply_keyboard_for_post()
|
||||
sent_message = await send_video_message(
|
||||
self.settings.group_for_posts, message, message.video.file_id, post_caption, markup
|
||||
)
|
||||
|
||||
# Сохраняем сырой caption и определяем анонимность
|
||||
raw_caption = message.caption or ""
|
||||
# Определяем анонимность
|
||||
is_anonymous = determine_anonymity(raw_caption)
|
||||
|
||||
post = TelegramPost(
|
||||
@@ -220,8 +299,11 @@ class PostService:
|
||||
is_anonymous=is_anonymous
|
||||
)
|
||||
await self.db.add_post(post)
|
||||
# Сохраняем медиа в фоне, чтобы не блокировать ответ пользователю
|
||||
|
||||
# Сохраняем медиа и скоры в фоне
|
||||
asyncio.create_task(self._save_media_background(sent_message, self.db, self.s3_storage))
|
||||
if ml_scores_json:
|
||||
asyncio.create_task(self._save_scores_background(sent_message.message_id, ml_scores_json))
|
||||
|
||||
@track_time("handle_video_note_post", "post_service")
|
||||
@track_errors("post_service", "handle_video_note_post")
|
||||
@@ -253,17 +335,29 @@ class PostService:
|
||||
@db_query_time("handle_audio_post", "posts", "insert")
|
||||
async def handle_audio_post(self, message: types.Message, first_name: str) -> None:
|
||||
"""Handle audio post submission"""
|
||||
raw_caption = message.caption or ""
|
||||
|
||||
# Получаем скоры для текста
|
||||
deepseek_score, rag_score, rag_confidence, rag_score_pos_only, ml_scores_json = await self._get_scores(raw_caption)
|
||||
|
||||
post_caption = ""
|
||||
if message.caption:
|
||||
post_caption = get_text_message(message.caption.lower(), first_name, message.from_user.username)
|
||||
post_caption = get_text_message(
|
||||
message.caption.lower(),
|
||||
first_name,
|
||||
message.from_user.username,
|
||||
deepseek_score=deepseek_score,
|
||||
rag_score=rag_score,
|
||||
rag_confidence=rag_confidence,
|
||||
rag_score_pos_only=rag_score_pos_only,
|
||||
)
|
||||
|
||||
markup = get_reply_keyboard_for_post()
|
||||
sent_message = await send_audio_message(
|
||||
self.settings.group_for_posts, message, message.audio.file_id, post_caption, markup
|
||||
)
|
||||
|
||||
# Сохраняем сырой caption и определяем анонимность
|
||||
raw_caption = message.caption or ""
|
||||
# Определяем анонимность
|
||||
is_anonymous = determine_anonymity(raw_caption)
|
||||
|
||||
post = TelegramPost(
|
||||
@@ -274,8 +368,11 @@ class PostService:
|
||||
is_anonymous=is_anonymous
|
||||
)
|
||||
await self.db.add_post(post)
|
||||
# Сохраняем медиа в фоне, чтобы не блокировать ответ пользователю
|
||||
|
||||
# Сохраняем медиа и скоры в фоне
|
||||
asyncio.create_task(self._save_media_background(sent_message, self.db, self.s3_storage))
|
||||
if ml_scores_json:
|
||||
asyncio.create_task(self._save_scores_background(sent_message.message_id, ml_scores_json))
|
||||
|
||||
@track_time("handle_voice_post", "post_service")
|
||||
@track_errors("post_service", "handle_voice_post")
|
||||
@@ -310,10 +407,23 @@ class PostService:
|
||||
"""Handle media group post submission"""
|
||||
post_caption = " "
|
||||
raw_caption = ""
|
||||
ml_scores_json = None
|
||||
|
||||
if album and album[0].caption:
|
||||
raw_caption = album[0].caption or ""
|
||||
post_caption = get_text_message(album[0].caption.lower(), first_name, message.from_user.username)
|
||||
|
||||
# Получаем скоры для текста
|
||||
deepseek_score, rag_score, rag_confidence, rag_score_pos_only, ml_scores_json = await self._get_scores(raw_caption)
|
||||
|
||||
post_caption = get_text_message(
|
||||
album[0].caption.lower(),
|
||||
first_name,
|
||||
message.from_user.username,
|
||||
deepseek_score=deepseek_score,
|
||||
rag_score=rag_score,
|
||||
rag_confidence=rag_confidence,
|
||||
rag_score_pos_only=rag_score_pos_only,
|
||||
)
|
||||
|
||||
is_anonymous = determine_anonymity(raw_caption)
|
||||
media_group = await prepare_media_group_from_middlewares(album, post_caption)
|
||||
@@ -333,6 +443,10 @@ class PostService:
|
||||
)
|
||||
await self.db.add_post(main_post)
|
||||
|
||||
# Сохраняем скоры в фоне
|
||||
if ml_scores_json:
|
||||
asyncio.create_task(self._save_scores_background(main_post_id, ml_scores_json))
|
||||
|
||||
for msg_id in media_group_message_ids:
|
||||
await self.db.add_message_link(main_post_id, msg_id)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user