Files
income_calculator/backend/parsers/bank_y.py
2026-02-23 16:49:24 +03:00

85 lines
3.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
from datetime import datetime
from pathlib import Path
from typing import List
import pdfplumber
from backend.parsers.base import BaseBankParser, ParsedTransaction
# Я-банк использует EN DASH (U+2013) для минуса
MINUS_CHARS = "\u2013-"
def _normalize_amount(s: str) -> float:
s = s.replace("\u00a0", " ").replace(" ", "").replace(",", ".")
for c in MINUS_CHARS:
s = s.replace(c, "-")
if s.startswith(""):
s = "-" + s[1:]
return float(s)
def _parse_datetime_y(date_str: str, time_str: str = "") -> str:
try:
part = date_str.strip() + " " + (time_str or "00:00").strip()
dt = datetime.strptime(part, "%d.%m.%Y %H:%M")
return dt.strftime("%Y-%m-%dT%H:%M:%S")
except ValueError:
return date_str.strip().replace(".", "-")[:10] + "T00:00:00"
class BankYParser(BaseBankParser):
    """Parser for Yandex Bank statements. Files are named Я-MM-YY.pdf."""

    # Row layout:
    #   description ... DD.MM.YYYY DD.MM.YYYY [*XXXX] amount ₽ amount ₽
    # The minus sign may be U+2013 and the card tail is optional.
    ROW_RE = re.compile(
        r"^(.+?)\s+(\d{2}\.\d{2}\.\d{4})\s+(\d{2}\.\d{2}\.\d{4})\s+(?:\*(\d{4})\s+)?([+\u2013\-]?[\d\s,]+)\s*₽\s+([+\u2013\-]?[\d\s,]+)\s*₽\s*$",
        re.UNICODE,
    )
    # The line after a row may carry the time, e.g. "в 18:13" or
    # "клиента в 21:35".
    TIME_RE = re.compile(r"^(?:.*\s+)?в\s+(\d{1,2}:\d{2})\s*$", re.UNICODE)
    # Text meaning "transfer between accounts of one client"; it is not
    # referenced inside this class.
    PIGGY_MARKER = "Перевод между счетами одного клиента"

    def can_parse(self, filename: str) -> bool:
        """Return True when the file name starts with "Я-" and ends in .pdf.

        Only the final path component is inspected; the extension check is
        case-insensitive.
        """
        base_name = Path(filename).name
        has_prefix = base_name.startswith("Я-")
        has_pdf_suffix = base_name.lower().endswith(".pdf")
        return has_prefix and has_pdf_suffix

    def parse(self, file_path: str) -> List[ParsedTransaction]:
        """Extract transactions from the PDF statement at *file_path*.

        The text of every page is split into lines. Each line matching
        ROW_RE yields one transaction; when the following line matches
        TIME_RE, its time is used for the operation date, the line is
        appended to the description, and it is consumed.
        """
        lines: List[str] = []
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if not page_text:
                    continue
                lines += page_text.split("\n")

        transactions: List[ParsedTransaction] = []
        total = len(lines)
        pos = 0
        while pos < total:
            row = self.ROW_RE.match(lines[pos].strip())
            # From here on, pos points at the line following the current one.
            pos += 1
            if row is None:
                continue

            description, op_date, proc_date, tail, first_amt, second_amt = row.groups()
            if not tail:
                # Rows without a card tail get a placeholder value.
                tail = "0000"
            amount = _normalize_amount(first_amt)

            clock = ""
            if pos < total:
                follow_up = lines[pos].strip()
                time_match = self.TIME_RE.match(follow_up)
                if time_match:
                    clock = time_match.group(1)
                    description = (description + " " + follow_up).strip()
                    # The continuation line belongs to this row; skip it.
                    pos += 1

            operation_date = _parse_datetime_y(op_date, clock)
            debit_date = _parse_datetime_y(proc_date)
            card_amount = _normalize_amount(second_amt) if second_amt else None
            transactions.append(
                ParsedTransaction(
                    operation_date=operation_date,
                    debit_date=debit_date,
                    amount=amount,
                    amount_card_currency=card_amount,
                    description=description.strip(),
                    card_tail=tail,
                )
            )
        return transactions