feat: add submitted collection, /similar and /submitted endpoints (Stage 4)

Made-with: Cursor
This commit is contained in:
2026-02-28 19:00:22 +03:00
parent 955f518429
commit a1d6d2d860
15 changed files with 1308 additions and 400 deletions

View File

@@ -0,0 +1,212 @@
"""
Тесты для submitted-коллекции VectorStore.
"""
import numpy as np
import pytest
from app.storage.vector_store import VectorStore
@pytest.fixture
def vector_store(tmp_path):
    """Return a small VectorStore whose submitted_path points inside tmp_path."""
    # The main storage path is disabled; only the submitted collection gets a file.
    submitted_file = tmp_path / "submitted.npz"
    store = VectorStore(
        vector_dim=4,
        max_examples=10,
        max_submitted=5,
        storage_path=None,
        submitted_path=str(submitted_file),
    )
    return store
@pytest.fixture
def sample_vector():
    """Return a unit-length float32 vector pointing along the first axis."""
    raw = np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float32)
    length = np.linalg.norm(raw)
    return raw / length
def test_add_submitted(vector_store, sample_vector):
    """Adding one submitted post reports success and bumps the counter to 1."""
    post = {
        "vector": sample_vector,
        "text_hash": "abc123",
        "created_at": 1000,
        "post_id": 42,
        "text": "Test post",
        "rag_score": 0.85,
    }

    was_added = vector_store.add_submitted(**post)

    assert was_added is True
    assert vector_store.submitted_count == 1
def test_add_submitted_duplicate(vector_store, sample_vector):
    """A second post with an already-known text hash is rejected."""
    outcomes = []
    # Same vector and hash both times; only the timestamp and text differ.
    for timestamp, body in ((1000, "First"), (2000, "Second")):
        outcomes.append(
            vector_store.add_submitted(
                vector=sample_vector,
                text_hash="same_hash",
                created_at=timestamp,
                text=body,
            )
        )

    second_outcome = outcomes[-1]
    assert second_outcome is False
    assert vector_store.submitted_count == 1
def test_add_submitted_fifo(vector_store, sample_vector):
    """Exceeding max_submitted evicts the oldest posts first (FIFO).

    Seven posts are added to a store created with max_submitted=5, so the
    two oldest (post ids 0 and 1) are expected to be dropped and ids 2..6
    are expected to remain. Every expected survivor is checked, not just
    the first one.
    """
    for i in range(7):
        # i + 1 keeps the first component non-zero so normalisation never
        # divides by zero.
        v = np.array([float(i + 1), 0.0, 0.0, 0.0], dtype=np.float32)
        v = v / np.linalg.norm(v)
        vector_store.add_submitted(
            vector=v,
            text_hash=f"hash_{i}",
            created_at=1000 + i,
            post_id=i,
            text=f"Post {i}",
        )

    assert vector_store.submitted_count == 5  # max_submitted

    post_ids = vector_store._submitted_post_ids
    # The two oldest posts must have been evicted...
    for evicted_id in (0, 1):
        assert evicted_id not in post_ids
    # ...and all five newer posts must still be present.
    for kept_id in range(2, 7):
        assert kept_id in post_ids
def test_find_similar_submitted_empty(vector_store, sample_vector):
    """Searching an empty submitted collection yields an empty list."""
    query = {"vector": sample_vector, "threshold": 0.5, "hours": 24}

    matches = vector_store.find_similar_submitted(**query)

    assert matches == []
def test_find_similar_submitted(vector_store, sample_vector):
    """Only the post whose vector is close to the query is expected back."""
    import time

    def _unit(components):
        # Build a float32 vector and scale it to unit length.
        raw = np.array(components, dtype=np.float32)
        return raw / np.linalg.norm(raw)

    # Both posts are one hour old, well inside the 24-hour search window.
    one_hour_ago = int(time.time()) - 3600

    posts = (
        # Almost parallel to sample_vector: expected to pass the threshold.
        {
            "vector": _unit([0.99, 0.01, 0.0, 0.0]),
            "text_hash": "similar",
            "post_id": 1,
            "text": "Similar post",
            "rag_score": 0.9,
        },
        # Orthogonal to sample_vector: expected to be filtered out.
        {
            "vector": _unit([0.0, 1.0, 0.0, 0.0]),
            "text_hash": "different",
            "post_id": 2,
            "text": "Different post",
            "rag_score": 0.5,
        },
    )
    for post in posts:
        vector_store.add_submitted(created_at=one_hour_ago, **post)

    matches = vector_store.find_similar_submitted(
        vector=sample_vector, threshold=0.9, hours=24
    )

    assert len(matches) == 1
    best = matches[0]
    assert best["post_id"] == 1
    assert best["text"] == "Similar post"
    assert best["similarity"] >= 0.9
def test_find_similar_submitted_time_filter(vector_store, sample_vector):
    """Posts older than the requested number of hours are expected to be skipped."""
    import time

    current_ts = int(time.time())
    # (text_hash, age in hours, post_id, text); the vector is identical for both.
    fixtures = (
        ("old", 48, 1, "Old post"),
        ("recent", 1, 2, "Recent post"),
    )
    for text_hash, age_hours, post_id, body in fixtures:
        vector_store.add_submitted(
            vector=sample_vector,
            text_hash=text_hash,
            created_at=current_ts - age_hours * 3600,
            post_id=post_id,
            text=body,
        )

    matches = vector_store.find_similar_submitted(
        vector=sample_vector, threshold=0.5, hours=24
    )

    # Only the one-hour-old post falls inside the 24-hour window.
    assert len(matches) == 1
    assert matches[0]["post_id"] == 2
def test_submitted_persistence(vector_store, sample_vector, tmp_path):
    """The submitted collection survives a save followed by a fresh load.

    One post is added and written to disk, then a second VectorStore is
    created on the same submitted_path and its loaded fields are compared
    with what was stored.
    """
    import math

    vector_store.add_submitted(
        vector=sample_vector,
        text_hash="persist",
        created_at=12345,
        post_id=999,
        text="Persisted post",
        rag_score=0.77,
    )
    vector_store.save_submitted_to_disk()

    # A new store pointed at the same file should pick the data up.
    store2 = VectorStore(
        vector_dim=4,
        max_submitted=5,
        storage_path=None,
        submitted_path=str(tmp_path / "submitted.npz"),
    )

    assert store2.submitted_count == 1
    assert store2._submitted_post_ids[0] == 999
    assert store2._submitted_texts[0] == "Persisted post"
    # Compare with a tolerance: if the score is persisted in a narrower float
    # type (e.g. float32), exact equality with the literal 0.77 would fail
    # even though the round trip is correct.
    assert math.isclose(store2._submitted_rag_scores[0], 0.77, rel_tol=1e-6)
def test_get_stats_includes_submitted(vector_store, sample_vector):
    """get_stats() reports submitted_count and max_submitted."""
    vector_store.add_submitted(
        vector=sample_vector, text_hash="stat", created_at=1000, text="For stats"
    )

    stats = vector_store.get_stats()

    # One post was added; the fixture created the store with max_submitted=5.
    expected = {"submitted_count": 1, "max_submitted": 5}
    for key, value in expected.items():
        assert key in stats
        assert stats[key] == value
def test_clear_submitted(vector_store, sample_vector):
    """clear() empties the submitted collection.

    The count is checked before clearing as well, so the test cannot pass
    vacuously when the post was never stored in the first place.
    """
    vector_store.add_submitted(
        vector=sample_vector,
        text_hash="clear",
        created_at=1000,
        text="To clear",
    )
    # Precondition: there must be something to clear.
    assert vector_store.submitted_count == 1

    vector_store.clear()

    assert vector_store.submitted_count == 0