diff --git a/CHANGELOG.md b/CHANGELOG.md index 858a9e8..22c08a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,28 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## [2.6.0] - 2026-06 + +Release rollup of 2.5.2–2.5.8 (reversible initials, instance tracking fixes, +live `MASK_*` flags, O(n) replacements, threat model docs, stderr passwords). + +### Changed +- Version strings synchronized across all file headers (were stuck at 2.5.1) +- Historical "extracted during vX refactoring" phrases pinned to v2.5.0 + so they no longer drift with version bumps + +## [2.5.8] - 2026-06 + +### Performance +- Replacement loops (mask engine, initials phase, both unmask passes) build + the result via segment join instead of rebuilding the whole string per + replacement — O(n) instead of O(n²) on large documents + +### Fixed +- Mask engine processes items in document order: instance numbers now match + occurrence order (was reverse — wrong original could be restored when two + different values masked to the same string) + ## [2.5.7] - 2026-06 ### Security diff --git a/config_example.py b/config_example.py index 51824ac..50116a4 100644 --- a/config_example.py +++ b/config_example.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -Configuration example for data_masking.py v2.5.1 +Configuration example for data_masking.py v2.6.0 Demonstrates all available configuration options using dataclasses. No external dependencies required — uses only Python standard library. @@ -9,7 +9,7 @@ Author: Vladyslav V. 
Prodan Contact: github.com/click0 Phone: +38(099)6053340 -Version: 2.5.1 +Version: 2.6.0 License: BSD 3-Clause "New" or "Revised" License Year: 2025-2026 """ @@ -25,7 +25,7 @@ @dataclass class SystemConfig: """Системні налаштування.""" - version: str = "v2.5.1" + version: str = "v2.6.0" hash_algorithm: str = "blake2b" hash_digest_size: int = 8 encoding: str = "utf-8" @@ -172,7 +172,7 @@ def to_dict(self) -> dict: data = config.to_dict() print("=" * 70) - print(" Data Masking Configuration Example v2.5.1") + print(" Data Masking Configuration Example v2.6.0") print("=" * 70) for section_name, section_data in data.items(): diff --git a/config_example.yaml b/config_example.yaml index 8d95e33..91025c2 100644 --- a/config_example.yaml +++ b/config_example.yaml @@ -1,5 +1,5 @@ # ========================================================================== -# Приклад конфігурації для системи маскування даних v2.5.1 +# Приклад конфігурації для системи маскування даних v2.6.0 # # Скопіюйте цей файл як config.yaml та налаштуйте під свої потреби. # Згенерувати конфігурацію за замовчуванням: @@ -8,7 +8,7 @@ # Author: Vladyslav V. Prodan # Contact: github.com/click0 # Phone: +38(099)6053340 -# Version: 2.5.1 +# Version: 2.6.0 # License: BSD 3-Clause "New" or "Revised" License # Year: 2025-2026 # ========================================================================== @@ -18,7 +18,7 @@ # -------------------------------------------------------------------------- system: # Версія конфігурації (для сумісності) - version: "v2.5.1" + version: "v2.6.0" # Алгоритм хешування для детерміністичного маскування. # Той самий вхід завжди дає той самий замаскований результат. 
diff --git a/data_masking.py b/data_masking.py index 685b243..b8d1665 100644 --- a/data_masking.py +++ b/data_masking.py @@ -2,19 +2,26 @@ # -*- coding: utf-8 -*- """ -Data Masking Script v2.5.1 +Data Masking Script v2.6.0 Локально узгоджене маскування конфіденційних даних з INSTANCE TRACKING -ОНОВЛЕНО В v2.5.1: -- Рефакторинг: розбито на пакет masking/ (constants, helpers, language, - context, mask_personal, mask_military, engine, cli) -- Додано __main__.py: запуск з кореня репо — python . mask / python . unmask -- Зворотна сумісність: всі імпорти з data_masking продовжують працювати +ОНОВЛЕНО В v2.6.0: +- Зворотні ініціали: ПІБ типу "Іванов П.А." зберігаються у mapping + (категорія initials) та повністю відновлюються при unmask +- Instance tracking у порядку документа; повторні текстові дати + відновлюються всі +- "Живі" прапорці MASK_*: data_masking.MASK_NAMES = False знову діє +- O(n) заміни замість O(n^2) на великих файлах +- Згенеровані паролі виводяться у stderr + +Архітектура (з v2.5.0): тонка обгортка над пакетом masking/ +(constants, helpers, language, context, mask_personal, mask_military, +engine, cli); запуск з кореня репо — python . mask / python . unmask Author: Vladyslav V. Prodan Contact: github.com/click0 Phone: +38(099)6053340 -Version: 2.5.1 +Version: 2.6.0 License: BSD 3-Clause "New" or "Revised" License Year: 2025-2026 """ @@ -23,7 +30,7 @@ # Re-exports from masking package for backward compatibility # ============================================================================ -__version__ = "2.5.7" +__version__ = "2.6.0" from masking.constants import ( __version__, __author__, __contact__, __phone__, __license__, __year__, diff --git a/diagnose_mapping.py b/diagnose_mapping.py index 0d3b553..9f9a1d5 100644 --- a/diagnose_mapping.py +++ b/diagnose_mapping.py @@ -6,7 +6,7 @@ Author: Vladyslav V. 
Prodan Contact: github.com/click0 Phone: +38(099)6053340 -Version: 2.5.1 +Version: 2.6.0 License: BSD 3-Clause "New" or "Revised" License Year: 2025-2026 diff --git a/masking/__init__.py b/masking/__init__.py index 589e3bd..3d4fe34 100644 --- a/masking/__init__.py +++ b/masking/__init__.py @@ -4,7 +4,7 @@ """ Masking package — data masking with instance tracking. -Refactored from monolithic data_masking.py in v2.5.1. +Refactored from monolithic data_masking.py (v2.5.0). """ from masking.constants import __version__, __author__, __contact__, __license__, __year__ diff --git a/masking/cli.py b/masking/cli.py index 555d853..4b7d42d 100644 --- a/masking/cli.py +++ b/masking/cli.py @@ -4,7 +4,7 @@ """ CLI entry point and orchestration for data masking. -Extracted from data_masking.py during v2.5.1 refactoring. +Extracted from data_masking.py during the package refactoring (v2.5.0). """ import json diff --git a/masking/constants.py b/masking/constants.py index f541b37..06af580 100644 --- a/masking/constants.py +++ b/masking/constants.py @@ -4,7 +4,7 @@ """ Masking constants, patterns, and configuration flags. -Extracted from data_masking.py during v2.5.1 refactoring. +Extracted from data_masking.py during the package refactoring (v2.5.0). """ import re @@ -26,7 +26,7 @@ # ============================================================================ # МЕТАДАНІ # ============================================================================ -__version__ = "2.5.7" +__version__ = "2.6.0" __author__ = "Vladyslav V. Prodan" __contact__ = "github.com/click0" __phone__ = "+38(099)6053340" diff --git a/masking/context.py b/masking/context.py index 25b55d3..82bbdaa 100644 --- a/masking/context.py +++ b/masking/context.py @@ -4,7 +4,7 @@ """ Context analysis and line parsing functions. -Extracted from data_masking.py during v2.5.1 refactoring. +Extracted from data_masking.py during the package refactoring (v2.5.0). 
""" import re diff --git a/masking/engine.py b/masking/engine.py index 5e27941..b7a424d 100644 --- a/masking/engine.py +++ b/masking/engine.py @@ -4,7 +4,7 @@ """ Main masking engine: context-aware text masking and JSON processing. -Extracted from data_masking.py during v2.5.1 refactoring. +Extracted from data_masking.py during the package refactoring (v2.5.0). """ import random @@ -133,10 +133,12 @@ def _mask_initials_pib(text: str, masking_dict: Dict, instance_counters: Dict) - if not any(c[0] < k[1] and c[1] > k[0] for k in kept): kept.append(c) - # Фаза 2: у порядку документа маскуємо та записуємо mapping + # Фаза 2: у порядку документа маскуємо, записуємо mapping + # і збираємо результат сегментами (O(n)) masking_dict["mappings"].setdefault("initials", {}) kept.sort(key=lambda x: x[0]) - replacements = [] + segments = [] + prev_end = 0 for start, end, surname, initials, has_space, ini_first in kept: ms = mask_surname(surname, masking_dict, instance_counters) sep = '. ' if has_space else '.' 
@@ -147,11 +149,13 @@ def _mask_initials_pib(text: str, masking_dict: Dict, instance_counters: Dict) - masked_ini = add_to_mapping(masking_dict, instance_counters, "initials", orig_ini, masked_ini) new_text = f"{masked_ini} {ms}" if ini_first else f"{ms} {masked_ini}" - replacements.append((start, end, new_text)) + segments.append(text[prev_end:start]) + segments.append(new_text) + prev_end = end - # Заміни з кінця тексту, щоб не збити позиції - for start, end, new_text in reversed(replacements): - text = text[:start] + new_text + text[end:] + if segments: + segments.append(text[prev_end:]) + text = ''.join(segments) return text @@ -265,43 +269,46 @@ def mask_text_context_aware(text: str, masking_dict: Dict, instance_counters: Di if not skip: items_to_mask.append({'type': 'date_text', 'full_text': match.group(0), 'number_part': match.group(0), 'start': match.start(), 'end': match.end()}) - items_to_mask.sort(key=lambda x: x['start'], reverse=True) + # Обхід у порядку документа: instance tracking збігається з порядком + # входжень (потрібно для unmask), а заміни збираються сегментами — + # O(n) замість квадратичного text[:i] + ... 
+ text[j:] на кожен елемент + items_to_mask.sort(key=lambda x: x['start']) + segments = [] + prev_end = 0 for item in items_to_mask: - masked = "" + if item['start'] < prev_end: continue # перекриття — пропускаємо if text[item['start']:item['end']] != item['full_text']: continue - if item['type'] == 'ipn': masked = mask_ipn(item['number_part'], masking_dict, instance_counters) - elif item['type'] == 'passport_id': masked = mask_passport_id(item['number_part'], masking_dict, instance_counters) - elif item['type'] == 'military_id': masked = mask_military_id(item['number_part'], masking_dict, instance_counters) - elif item['type'] == 'military_unit': masked = mask_military_unit(item['number_part'], masking_dict, instance_counters) + + replacement = None + if item['type'] == 'ipn': replacement = mask_ipn(item['number_part'], masking_dict, instance_counters) + elif item['type'] == 'passport_id': replacement = mask_passport_id(item['number_part'], masking_dict, instance_counters) + elif item['type'] == 'military_id': replacement = mask_military_id(item['number_part'], masking_dict, instance_counters) + elif item['type'] == 'military_unit': replacement = mask_military_unit(item['number_part'], masking_dict, instance_counters) elif item['type'] == 'brigade_number': - masked = mask_brigade_number(item['full_text'], masking_dict, instance_counters) - text = text[:item['start']] + masked + text[item['end']:] - continue + replacement = mask_brigade_number(item['full_text'], masking_dict, instance_counters) elif item['type'] == 'date': - masked = mask_date(item['full_text'], masking_dict, instance_counters) - text = text[:item['start']] + masked + text[item['end']:] - continue + replacement = mask_date(item['full_text'], masking_dict, instance_counters) elif item['type'] == 'date_text': - masked = _mask_date_text(item['full_text'], masking_dict, instance_counters) - text = text[:item['start']] + masked + text[item['end']:] - continue + replacement = 
_mask_date_text(item['full_text'], masking_dict, instance_counters) elif item['type'] == 'order_simple': masked = mask_order_number(item['number_part'], masking_dict, instance_counters) - new_full = item['full_text'].replace(item['number_part'], masked, 1) - text = text[:item['start']] + new_full + text[item['end']:] - continue + replacement = item['full_text'].replace(item['number_part'], masked, 1) elif item['type'] == 'order_with_letters': masked = mask_order_number_with_letters(item['number_part'], masking_dict, instance_counters) - new_full = item['full_text'].replace(item['number_part'], masked, 1) - text = text[:item['start']] + new_full + text[item['end']:] - continue + replacement = item['full_text'].replace(item['number_part'], masked, 1) elif item['type'] in ['br_complex', 'br_with_slashes', 'br_with_suffix', 'br_standalone']: - masked = mask_br_number(item['full_text'], masking_dict, instance_counters) - text = text[:item['start']] + masked + text[item['end']:] + replacement = mask_br_number(item['full_text'], masking_dict, instance_counters) + + if replacement is None or replacement == "": continue + segments.append(text[prev_end:item['start']]) + segments.append(replacement) + prev_end = item['end'] - if masked: text = text[:item['start']] + masked + text[item['end']:] + if segments: + segments.append(text[prev_end:]) + text = ''.join(segments) lines = text.split('\n') masked_lines = [] diff --git a/masking/helpers.py b/masking/helpers.py index 40e93fd..c749f25 100644 --- a/masking/helpers.py +++ b/masking/helpers.py @@ -4,7 +4,7 @@ """ Base helper functions for masking operations. -Extracted from data_masking.py during v2.5.1 refactoring. +Extracted from data_masking.py during the package refactoring (v2.5.0). 
""" import hashlib diff --git a/masking/language.py b/masking/language.py index 49ce211..f7d2767 100644 --- a/masking/language.py +++ b/masking/language.py @@ -4,7 +4,7 @@ """ Language analysis functions: gender detection, grammatical case, declension. -Extracted from data_masking.py during v2.5.1 refactoring. +Extracted from data_masking.py during the package refactoring (v2.5.0). """ import random diff --git a/masking/mask_military.py b/masking/mask_military.py index 01d59cc..042e7b9 100644 --- a/masking/mask_military.py +++ b/masking/mask_military.py @@ -4,7 +4,7 @@ """ Military data masking: ranks, units, orders, BR numbers, dates. -Extracted from data_masking.py during v2.5.1 refactoring. +Extracted from data_masking.py during the package refactoring (v2.5.0). """ import random diff --git a/masking/mask_personal.py b/masking/mask_personal.py index 5d7e328..4bac0f5 100644 --- a/masking/mask_personal.py +++ b/masking/mask_personal.py @@ -4,7 +4,7 @@ """ Personal data masking functions: IPN, passport, military ID, names. -Extracted from data_masking.py during v2.5.1 refactoring. +Extracted from data_masking.py during the package refactoring (v2.5.0). """ import random diff --git a/modules/__init__.py b/modules/__init__.py index 2bcf3fc..fac6c03 100644 --- a/modules/__init__.py +++ b/modules/__init__.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ -Data Masking Modules Package v2.5.1 +Data Masking Modules Package v2.6.0 Модулі системи маскування даних. 
""" diff --git a/modules/config.py b/modules/config.py index 6aeb630..36b2741 100644 --- a/modules/config.py +++ b/modules/config.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ -Configuration Module v2.5.1 for data_masking.py +Configuration Module v2.6.0 for data_masking.py Provides YAML + ENV + CLI configuration loading with priority resolution: CLI > ENV > config.yaml > config.py > Default @@ -16,7 +16,7 @@ Year: 2025-2026 """ -__version__ = "2.5.1" +__version__ = "2.6.0" import os import logging diff --git a/modules/masking_logger.py b/modules/masking_logger.py index d0d3861..08cc9e3 100644 --- a/modules/masking_logger.py +++ b/modules/masking_logger.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ -Logging Module v2.5.1 for data_masking.py +Logging Module v2.6.0 for data_masking.py Provides structured logging with JSON and colored console output for masking operations. @@ -22,7 +22,7 @@ from typing import Any, Dict, Optional -__version__ = "2.5.1" +__version__ = "2.6.0" class JsonFormatter(logging.Formatter): diff --git a/modules/password_generator.py b/modules/password_generator.py index e14ef39..2fe019c 100644 --- a/modules/password_generator.py +++ b/modules/password_generator.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -Password Generator Module v2.5.1 +Password Generator Module v2.6.0 Генератор паролів з підтримкою ASCII, кирилиці та кастомних символів. Використання як модуль: from password_generator import generate_password, PasswordConfig @@ -25,7 +25,7 @@ Author: Vladyslav V. Prodan Contact: github.com/click0 Phone: +38(099)6053340 -Version: 2.5.1 +Version: 2.6.0 License: BSD 3-Clause "New" or "Revised" License Year: 2025-2026 """ @@ -34,7 +34,7 @@ import argparse from dataclasses import dataclass from typing import Optional, List, Dict, Any, Union -__version__ = "2.5.1" +__version__ = "2.6.0" __author__ = "Vladyslav V. 
Prodan" # ============================================================================= # КОНСТАНТИ diff --git a/modules/re_mask.py b/modules/re_mask.py index 0457716..e70780b 100644 --- a/modules/re_mask.py +++ b/modules/re_mask.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ -Re-Masking Module v2.5.1 +Re-Masking Module v2.6.0 Multi-pass masking with chain tracking for data_masking.py Provides: @@ -12,7 +12,7 @@ Chain JSON format: { - "version": "2.5.1", + "version": "2.6.0", "chain_id": "", "chain_version": 2, "created_at": "ISO-8601", @@ -38,7 +38,7 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple -__version__ = "2.5.1" +__version__ = "2.6.0" # Standard mapping categories used by data_masking.py MAPPING_CATEGORIES = [ @@ -49,7 +49,7 @@ ] -def make_empty_masking_dict(version: str = "2.5.1") -> Dict: +def make_empty_masking_dict(version: str = "2.6.0") -> Dict: """Create a fresh empty masking dict with all required category keys. Args: diff --git a/modules/security.py b/modules/security.py index c7d0ef9..ce6a464 100644 --- a/modules/security.py +++ b/modules/security.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ -Security Module v2.5.1 +Security Module v2.6.0 Provides AES-256-GCM encryption/decryption of mapping files for data_masking.py. @@ -15,7 +15,7 @@ Year: 2025-2026 """ -__version__ = "2.5.1" +__version__ = "2.6.0" import json import os diff --git a/modules/selective.py b/modules/selective.py index dbea7ca..e8a6f7e 100644 --- a/modules/selective.py +++ b/modules/selective.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ -Selective Masking Module v2.5.1 +Selective Masking Module v2.6.0 Provides --only / --exclude support: mask only selected data types. 
Supports type aliases (plural forms, Ukrainian names), type groups, @@ -19,7 +19,7 @@ from dataclasses import dataclass, field from typing import Dict, List, Optional, Set -__version__ = "2.5.1" +__version__ = "2.6.0" # ============================================================================ # AVAILABLE TYPES diff --git a/modules/tools.py b/modules/tools.py index f3bac64..25e81a5 100644 --- a/modules/tools.py +++ b/modules/tools.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ -Tools API Module v2.5.1 +Tools API Module v2.6.0 Provides atomic masking functions for programmatic use (without CLI). Each function is self-contained and operates on a single value, updating @@ -52,7 +52,7 @@ # ============================================================================ # METADATA # ============================================================================ -__version__ = "2.5.1" +__version__ = "2.6.0" __author__ = "Vladyslav V. Prodan" __contact__ = "github.com/click0" __license__ = "BSD 3-Clause" diff --git a/requirements.txt b/requirements.txt index b5e133a..098a378 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -# Data Masking System v2.5.1 +# Data Masking System v2.6.0 # Python 3.9+ # Генерація реалістичних українських імен diff --git a/tests/test_case_preservation.py b/tests/test_case_preservation.py index 31a4188..afab947 100644 --- a/tests/test_case_preservation.py +++ b/tests/test_case_preservation.py @@ -5,7 +5,7 @@ Author: Vladyslav V. Prodan Contact: github.com/click0 Phone: +38(099)6053340 -Version: 2.5.1 +Version: 2.6.0 License: BSD 3-Clause "New" or "Revised" License Year: 2025-2026 diff --git a/tests/test_diagnose.py b/tests/test_diagnose.py index 2468eef..7bceadb 100644 --- a/tests/test_diagnose.py +++ b/tests/test_diagnose.py @@ -6,7 +6,7 @@ Author: Vladyslav V. 
Prodan Contact: github.com/click0 Phone: +38(099)6053340 -Version: 2.5.1 +Version: 2.6.0 License: BSD 3-Clause "New" or "Revised" License Year: 2025-2026 """ diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 3cbb88c..c31f209 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -5,7 +5,7 @@ Author: Vladyslav V. Prodan Contact: github.com/click0 Phone: +38(099)6053340 -Version: 2.5.1 +Version: 2.6.0 License: BSD 3-Clause "New" or "Revised" License Year: 2025-2026 diff --git a/tests/test_initials.py b/tests/test_initials.py index a761f95..c0fe50e 100644 --- a/tests/test_initials.py +++ b/tests/test_initials.py @@ -135,3 +135,26 @@ def test_roundtrip_repeated_person(self): version = check_mapping_version(masking_dict) restored, _ = unmask_text_v2(masked, masking_dict, version) assert restored == text + + +class TestLargeDocumentRoundtrip: + """Великий документ: instance tracking у порядку документа, + сегментна збірка замін не губить і не дублює текст.""" + + def test_many_items_roundtrip(self): + import random as rnd + rnd.seed(42) + lines = [] + for i in range(300): + ipn = ''.join(rnd.choice('0123456789') for _ in range(10)) + day = rnd.randint(1, 28) + lines.append( + f'Запис {i}: ІПН {ipn} від {day:02d}.03.2024, в/ч А{rnd.randint(1000, 9999)}' + ) + text = '\n'.join(lines) + + masked, masking_dict = mask(text) + assert masked != text + version = check_mapping_version(masking_dict) + restored, _ = unmask_text_v2(masked, masking_dict, version) + assert restored == text diff --git a/tests/test_integration.py b/tests/test_integration.py index d5dcea2..3ea060f 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -Інтеграційні тести для data_masking.py v2.5.1 +Інтеграційні тести для data_masking.py v2.6.0 Тестує нові функції: - --init-config - --encrypt / --password @@ -11,7 +11,7 @@ Author: Vladyslav V. 
Prodan Contact: github.com/click0 Phone: +38(099)6053340 -Version: 2.5.1 +Version: 2.6.0 License: BSD 3-Clause "New" or "Revised" License Year: 2025-2026 """ @@ -574,7 +574,7 @@ def test_basic_masking(self, sample_input_file, temp_dir, run_masking): text = sample_input_file.read_text(encoding="utf-8") masking_dict = { - "version": "2.5.1", + "version": "2.6.0", "mappings": {k: {} for k in [ "ipn", "passport_id", "military_id", "surname", "name", "military_unit", "order_number", "order_number_with_letters", @@ -608,7 +608,7 @@ def test_masking_with_only_flag(self, sample_input_file, temp_dir): # Симуляція --only ipn (маскуємо тільки ІПН) masking_dict = { - "version": "2.5.1", + "version": "2.6.0", "mappings": {k: {} for k in [ "ipn", "passport_id", "military_id", "surname", "name", "military_unit", "order_number", "rank", "date", "date_text", "patronymic" @@ -648,7 +648,7 @@ def test_masking_with_remask(self, sample_input_file, temp_dir, run_masking): text = sample_input_file.read_text(encoding="utf-8") masking_dict = { - "version": "2.5.1", + "version": "2.6.0", "mappings": {k: {} for k in [ "ipn", "passport_id", "military_id", "surname", "name", "military_unit", "order_number", "order_number_with_letters", @@ -701,7 +701,7 @@ def test_text_date_masking(self, temp_dir): text = 'від "06" жовтня 2025 року №292' masking_dict = { - "version": "2.5.1", + "version": "2.6.0", "mappings": {k: {} for k in [ "ipn", "passport_id", "military_id", "surname", "name", "order_number", "rank", "date", "date_text", "patronymic" @@ -724,7 +724,7 @@ def test_numeric_date_masking(self, temp_dir): text = "Дата: 15.03.2024" masking_dict = { - "version": "2.5.1", + "version": "2.6.0", "mappings": {k: {} for k in [ "ipn", "passport_id", "military_id", "surname", "name", "order_number", "rank", "date", "date_text", "patronymic" @@ -765,7 +765,7 @@ def test_no_report_flag(self, temp_dir): # Симулюємо --no-report text = "Тестовий текст" masking_dict = { - "version": "2.5.1", + "version": 
"2.6.0", "mappings": {"surname": {}, "name": {}}, "statistics": {}, "instance_tracking": {} @@ -800,7 +800,7 @@ def test_json_output(self, temp_dir): } masking_dict = { - "version": "2.5.1", + "version": "2.6.0", "mappings": {k: {} for k in [ "ipn", "passport_id", "surname", "name", "rank", "patronymic" ]}, @@ -881,7 +881,7 @@ class TestChainRoundtrip: def _make_masking_dict(self): """Create a fresh masking dict.""" from re_mask import make_empty_masking_dict - return make_empty_masking_dict("2.5.1") + return make_empty_masking_dict("2.6.0") def test_chain_roundtrip_2_passes(self): """Тест: маскування у 2 проходи, потім відновлення через ланцюг.""" diff --git a/tests/test_patronymic.py b/tests/test_patronymic.py index b936e03..96c8833 100644 --- a/tests/test_patronymic.py +++ b/tests/test_patronymic.py @@ -5,7 +5,7 @@ Author: Vladyslav V. Prodan Contact: github.com/click0 Phone: +38(099)6053340 -Version: 2.5.1 +Version: 2.6.0 License: BSD 3-Clause "New" or "Revised" License Year: 2025-2026 diff --git a/tests/test_rank_data.py b/tests/test_rank_data.py index e1163ea..c849c3a 100644 --- a/tests/test_rank_data.py +++ b/tests/test_rank_data.py @@ -7,7 +7,7 @@ Author: Vladyslav V. Prodan Contact: github.com/click0 Phone: +38(099)6053340 -Version: 2.5.1 +Version: 2.6.0 License: BSD 3-Clause "New" or "Revised" License Year: 2025-2026 diff --git a/tests/test_unmask.py b/tests/test_unmask.py index a847971..e4435c4 100644 --- a/tests/test_unmask.py +++ b/tests/test_unmask.py @@ -6,7 +6,7 @@ Author: Vladyslav V. 
Prodan Contact: github.com/click0 Phone: +38(099)6053340 -Version: 2.5.1 +Version: 2.6.0 License: BSD 3-Clause "New" or "Revised" License Year: 2025-2026 diff --git a/unmask_data.py b/unmask_data.py index bb54e4d..c07419c 100644 --- a/unmask_data.py +++ b/unmask_data.py @@ -2,18 +2,22 @@ # -*- coding: utf-8 -*- """ -Data Unmasking Script v2.5.1 +Data Unmasking Script v2.6.0 Відновлення оригінальних даних з замаскованого файлу -ОНОВЛЕНО В v2.5.1: -- Рефакторинг: розбито на пакет unmasking/ (helpers, engine, io, cli) -- Додано __main__.py: запуск з кореня репо — python . mask / python . unmask -- Зворотна сумісність: всі імпорти з unmask_data продовжують працювати +ОНОВЛЕНО В v2.6.0: +- Відновлення ініціалів (категорія initials у mapping) +- Повторні текстові дати відновлюються всі (instance tracking) +- Розпізнавання mapping-файлів усіх версій 2.x (раніше 2.3+ падали у v1-логіку) +- O(n) заміни замість O(n^2) на великих файлах + +Архітектура (з v2.5.0): тонка обгортка над пакетом unmasking/ +(helpers, engine, io, cli); запуск з кореня репо — python . unmask Author: Vladyslav V. Prodan Contact: github.com/click0 Phone: +38(099)6053340 -Version: 2.5.1 +Version: 2.6.0 License: BSD 3-Clause "New" or "Revised" License Year: 2025-2026 """ diff --git a/unmasking/__init__.py b/unmasking/__init__.py index 804e35d..99e65fd 100644 --- a/unmasking/__init__.py +++ b/unmasking/__init__.py @@ -4,7 +4,7 @@ """ Unmasking package — data restoration from masked files. -Refactored from monolithic unmask_data.py in v2.5.1. +Refactored from monolithic unmask_data.py (v2.5.0). """ from unmasking.cli import __version__, main diff --git a/unmasking/cli.py b/unmasking/cli.py index 59c514f..3c96eba 100644 --- a/unmasking/cli.py +++ b/unmasking/cli.py @@ -4,7 +4,7 @@ """ CLI entry point for data unmasking. -Extracted from unmask_data.py during v2.5.1 refactoring. +Extracted from unmask_data.py during the package refactoring (v2.5.0). 
""" import json @@ -58,7 +58,7 @@ # ============================================================================ # МЕТАДАНІ # ============================================================================ -__version__ = "2.5.7" +__version__ = "2.6.0" def main(): diff --git a/unmasking/engine.py b/unmasking/engine.py index b3e35c9..a807666 100644 --- a/unmasking/engine.py +++ b/unmasking/engine.py @@ -4,7 +4,7 @@ """ Core unmasking logic: text and JSON restoration. -Extracted from unmask_data.py during v2.5.1 refactoring. +Extracted from unmask_data.py during the package refactoring (v2.5.0). """ import re @@ -141,10 +141,21 @@ def unmask_ranks_gender_aware(masked_text: str, masking_map: Dict) -> Tuple[str, else: stats["skipped_count"] += 1 - # КРОК 3: ВИКОНАННЯ ЗАМІН (з кінця до початку) - replacements_to_do.sort(key=lambda x: x[0], reverse=True) + # КРОК 3: ВИКОНАННЯ ЗАМІН + # Сегментами в порядку документа — O(n) замість квадратичної + # перебудови рядка на кожну заміну + replacements_to_do.sort(key=lambda x: x[0]) + segments = [] + prev_end = 0 for start, end, original in replacements_to_do: - restored_text = restored_text[:start] + original + restored_text[end:] + if start < prev_end: + continue + segments.append(restored_text[prev_end:start]) + segments.append(original) + prev_end = end + if segments: + segments.append(restored_text[prev_end:]) + restored_text = ''.join(segments) return restored_text, stats @@ -175,12 +186,24 @@ def unmask_other_data(masked_text: str, masking_map: Dict) -> Tuple[str, Dict]: else: stats["skipped_count"] += 1 - replacements_to_do.sort(key=lambda x: x[0], reverse=True) + # Сегментами в порядку документа — O(n); перевірка відповідності маски + # йде по незмінному тексту, перекриття пропускаються + replacements_to_do.sort(key=lambda x: x[0]) + segments = [] + prev_end = 0 for start_pos, end_pos, original_value, masked_value in replacements_to_do: - if restored_text[start_pos:end_pos].lower() == masked_value.lower(): - masked_segment
= restored_text[start_pos:end_pos] - original_value = _apply_original_case(masked_segment, original_value) - restored_text = restored_text[:start_pos] + original_value + restored_text[end_pos:] + if start_pos < prev_end: + continue + masked_segment = restored_text[start_pos:end_pos] + if masked_segment.lower() != masked_value.lower(): + continue + original_value = _apply_original_case(masked_segment, original_value) + segments.append(restored_text[prev_end:start_pos]) + segments.append(original_value) + prev_end = end_pos + if segments: + segments.append(restored_text[prev_end:]) + restored_text = ''.join(segments) return restored_text, stats diff --git a/unmasking/helpers.py b/unmasking/helpers.py index 39aafcf..e54ab1f 100644 --- a/unmasking/helpers.py +++ b/unmasking/helpers.py @@ -4,7 +4,7 @@ """ Helper functions for unmasking operations. -Extracted from unmask_data.py during v2.5.1 refactoring. +Extracted from unmask_data.py during the package refactoring (v2.5.0). """ import re diff --git a/unmasking/io.py b/unmasking/io.py index 68e8e3e..ca1235a 100644 --- a/unmasking/io.py +++ b/unmasking/io.py @@ -4,7 +4,7 @@ """ File I/O and validation for unmasking operations. -Extracted from unmask_data.py during v2.5.1 refactoring. +Extracted from unmask_data.py during the package refactoring (v2.5.0). """ import json