Files
rag-service/tests/test_vector_store_submitted.py

213 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Тесты для submitted-коллекции VectorStore.
"""
import numpy as np
import pytest
from app.storage.vector_store import VectorStore
@pytest.fixture
def vector_store(tmp_path):
"""VectorStore с временным путём для submitted."""
return VectorStore(
vector_dim=4,
max_examples=10,
max_submitted=5,
storage_path=None,
submitted_path=str(tmp_path / "submitted.npz"),
)
@pytest.fixture
def sample_vector():
"""Нормализованный вектор для тестов."""
v = np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float32)
return v / np.linalg.norm(v)
def test_add_submitted(vector_store, sample_vector):
"""Добавление submitted-поста."""
added = vector_store.add_submitted(
vector=sample_vector,
text_hash="abc123",
created_at=1000,
post_id=42,
text="Test post",
rag_score=0.85,
)
assert added is True
assert vector_store.submitted_count == 1
def test_add_submitted_duplicate(vector_store, sample_vector):
"""Дубликат по хешу не добавляется."""
vector_store.add_submitted(
vector=sample_vector,
text_hash="same_hash",
created_at=1000,
text="First",
)
added = vector_store.add_submitted(
vector=sample_vector,
text_hash="same_hash",
created_at=2000,
text="Second",
)
assert added is False
assert vector_store.submitted_count == 1
def test_add_submitted_fifo(vector_store, sample_vector):
"""При превышении max_submitted удаляется самый старый (FIFO)."""
for i in range(7):
v = np.array(
[float(i + 1), 0.0, 0.0, 0.0], dtype=np.float32
) # i+1 чтобы избежать нулевого вектора
v = v / np.linalg.norm(v)
vector_store.add_submitted(
vector=v,
text_hash=f"hash_{i}",
created_at=1000 + i,
post_id=i,
text=f"Post {i}",
)
assert vector_store.submitted_count == 5 # max_submitted
# Должны остаться посты 2, 3, 4, 5, 6 (удалены 0, 1)
post_ids = vector_store._submitted_post_ids
assert 0 not in post_ids
assert 1 not in post_ids
assert 2 in post_ids
def test_find_similar_submitted_empty(vector_store, sample_vector):
"""Поиск в пустой коллекции возвращает пустой список."""
result = vector_store.find_similar_submitted(
vector=sample_vector,
threshold=0.5,
hours=24,
)
assert result == []
def test_find_similar_submitted(vector_store, sample_vector):
"""Поиск похожих постов с фильтром по времени и threshold."""
import time
now = int(time.time())
# Похожий вектор
similar_v = np.array([0.99, 0.01, 0.0, 0.0], dtype=np.float32)
similar_v = similar_v / np.linalg.norm(similar_v)
# Непохожий вектор
different_v = np.array([0.0, 1.0, 0.0, 0.0], dtype=np.float32)
different_v = different_v / np.linalg.norm(different_v)
vector_store.add_submitted(
vector=similar_v,
text_hash="similar",
created_at=now - 3600, # 1 час назад
post_id=1,
text="Similar post",
rag_score=0.9,
)
vector_store.add_submitted(
vector=different_v,
text_hash="different",
created_at=now - 3600,
post_id=2,
text="Different post",
rag_score=0.5,
)
result = vector_store.find_similar_submitted(
vector=sample_vector,
threshold=0.9,
hours=24,
)
assert len(result) == 1
assert result[0]["post_id"] == 1
assert result[0]["text"] == "Similar post"
assert result[0]["similarity"] >= 0.9
def test_find_similar_submitted_time_filter(vector_store, sample_vector):
"""Фильтр по hours исключает старые посты."""
import time
now = int(time.time())
vector_store.add_submitted(
vector=sample_vector,
text_hash="old",
created_at=now - 48 * 3600, # 48 часов назад
post_id=1,
text="Old post",
)
vector_store.add_submitted(
vector=sample_vector,
text_hash="recent",
created_at=now - 3600, # 1 час назад
post_id=2,
text="Recent post",
)
result = vector_store.find_similar_submitted(
vector=sample_vector,
threshold=0.5,
hours=24,
)
assert len(result) == 1
assert result[0]["post_id"] == 2
def test_submitted_persistence(vector_store, sample_vector, tmp_path):
"""Сохранение и загрузка submitted-коллекции."""
vector_store.add_submitted(
vector=sample_vector,
text_hash="persist",
created_at=12345,
post_id=999,
text="Persisted post",
rag_score=0.77,
)
vector_store.save_submitted_to_disk()
# Новый store загружает данные
store2 = VectorStore(
vector_dim=4,
max_submitted=5,
storage_path=None,
submitted_path=str(tmp_path / "submitted.npz"),
)
assert store2.submitted_count == 1
assert store2._submitted_post_ids[0] == 999
assert store2._submitted_texts[0] == "Persisted post"
assert store2._submitted_rag_scores[0] == 0.77
def test_get_stats_includes_submitted(vector_store, sample_vector):
"""get_stats включает submitted_count и max_submitted."""
vector_store.add_submitted(
vector=sample_vector,
text_hash="stat",
created_at=1000,
text="For stats",
)
stats = vector_store.get_stats()
assert "submitted_count" in stats
assert stats["submitted_count"] == 1
assert "max_submitted" in stats
assert stats["max_submitted"] == 5
def test_clear_submitted(vector_store, sample_vector):
"""clear() очищает submitted-коллекцию."""
vector_store.add_submitted(
vector=sample_vector,
text_hash="clear",
created_at=1000,
text="To clear",
)
vector_store.clear()
assert vector_store.submitted_count == 0