From d8ca02da8aada71b266f56c41dd14b179d76fc04 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Thu, 26 Mar 2026 17:28:43 -0400 Subject: [PATCH 01/14] modify usfm for chapter-level drafting to avoid import issues; move remarks to chapters --- .../paratext_project_text_updater_base.py | 3 +- machine/corpora/update_usfm_parser_handler.py | 34 ++++++++++++++----- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 0e7bfdfd..0a80c407 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -29,6 +29,7 @@ def __init__( def update_usfm( self, book_id: str, + chapters: Optional[Sequence[int]] = None, rows: Optional[Sequence[UpdateUsfmRow]] = None, full_name: Optional[str] = None, text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING, @@ -61,7 +62,7 @@ def update_usfm( ) try: parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification) - return handler.get_usfm(self._settings.stylesheet) + return handler.get_usfm(self._settings.stylesheet, chapters) except Exception as e: error_message = ( f"An error occurred while parsing the usfm for '{book_id}'" diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 9d95850c..4c187ac7 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -334,27 +334,43 @@ def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) - if embed_outside_of_block: self._end_update_block(state, [scripture_ref]) - def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: + def get_usfm( + self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty", chapters: Optional[Sequence[int]] = None + ) -> str: if isinstance(stylesheet, str): stylesheet = UsfmStylesheet(stylesheet) tokenizer = UsfmTokenizer(stylesheet) tokens = list(self._tokens) + if chapters is not None: + tokens = self._get_incremental_draft_tokens(tokens, chapters) if len(self._remarks) > 0: remark_tokens: List[UsfmToken] = [] for remark in self._remarks: remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem")) remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark)) if len(tokens) > 0: - index = 0 - markers_to_skip = {"id", "ide", "rem"} - while tokens[index].marker in markers_to_skip: - index += 1 - if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT: - index += 1 - for remark_token in reversed(remark_tokens): - tokens.insert(index, remark_token) + for index, token in enumerate(tokens): + if token.type == UsfmTokenType.CHAPTER: + tokens[index + 1 : index + 1] = remark_tokens return tokenizer.detokenize(tokens) + def _get_incremental_draft_tokens(self, tokens: List[UsfmToken], chapters: Sequence[int]) -> List[UsfmToken]: + incremental_draft_tokens: List[UsfmToken] = [] + in_chapter: bool = False + for index, token in enumerate(tokens): + if index == 0 and token.marker == "id": + incremental_draft_tokens.append(token) + continue + elif token.type == UsfmTokenType.CHAPTER: + if token.data and int(token.data) in chapters: + in_chapter = True + incremental_draft_tokens.append(token) + else: + in_chapter = False + elif in_chapter: + incremental_draft_tokens.append(token) + return incremental_draft_tokens + def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]: row_texts: List[str] = [] row_metadata = None From aef5d5d71a892fac1b299a5dfbda9ad84768f91b Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Wed, 15 Apr 2026 14:02:28 -0400 Subject: [PATCH 02/14] move filtering before token processing --- .../paratext_project_text_updater_base.py | 36 ++++++++++++++++--- machine/corpora/update_usfm_parser_handler.py | 23 +----------- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 0a80c407..a7dc464d 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,5 +1,5 @@ from abc import ABC -from typing import Callable, Iterable, Optional, Sequence, Union +from typing import Callable, Iterable, List, Optional, Sequence, Union from .paratext_project_file_handler import ParatextProjectFileHandler from .paratext_project_settings import ParatextProjectSettings @@ -10,7 +10,9 @@ UpdateUsfmRow, UpdateUsfmTextBehavior, ) -from .usfm_parser import parse_usfm +from .usfm_parser import UsfmParser +from .usfm_token import UsfmTokenType +from .usfm_tokenizer import UsfmToken, UsfmTokenizer from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError @@ -61,8 +63,12 @@ def update_usfm( compare_segments=compare_segments, ) try: - parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification) - return handler.get_usfm(self._settings.stylesheet, chapters) + tokenizer = UsfmTokenizer(self._settings.stylesheet) + tokens = tokenizer.tokenize(usfm) + tokens = self.filter_tokens_by_chapter(tokens, chapters) + parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification) + parser.process_tokens() + return handler.get_usfm(self._settings.stylesheet) except Exception as e: error_message = ( f"An error occurred while parsing the usfm for '{book_id}'" @@ -70,3 +76,25 @@ def update_usfm( f". Error: '{e}'" ) raise RuntimeError(error_message) from e + + def filter_tokens_by_chapter( + self, tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None + ) -> Sequence[UsfmToken]: + if chapters is None: + return tokens + tokens_within_chapters: List[UsfmToken] = [] + in_chapter: bool = False + for index, token in enumerate(tokens): + if index == 0 and token.marker == "id": + tokens_within_chapters.append(token) + if 1 in chapters: + in_chapter = True + elif token.type == UsfmTokenType.CHAPTER: + if token.data and int(token.data) in chapters: + in_chapter = True + tokens_within_chapters.append(token) + else: + in_chapter = False + elif in_chapter: + tokens_within_chapters.append(token) + return tokens_within_chapters diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 4c187ac7..78fd85f4 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -334,15 +334,11 @@ def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) - if embed_outside_of_block: self._end_update_block(state, [scripture_ref]) - def get_usfm( - self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty", chapters: Optional[Sequence[int]] = None - ) -> str: + def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: if isinstance(stylesheet, str): stylesheet = UsfmStylesheet(stylesheet) tokenizer = UsfmTokenizer(stylesheet) tokens = list(self._tokens) - if chapters is not None: - tokens = self._get_incremental_draft_tokens(tokens, chapters) if len(self._remarks) > 0: remark_tokens: List[UsfmToken] = [] for remark in self._remarks: @@ -354,23 +350,6 @@ def get_usfm( tokens[index + 1 : index + 1] = remark_tokens return tokenizer.detokenize(tokens) - def _get_incremental_draft_tokens(self, tokens: List[UsfmToken], chapters: Sequence[int]) -> List[UsfmToken]: - incremental_draft_tokens: List[UsfmToken] = [] - in_chapter: bool = False - for index, token in enumerate(tokens): - if index == 0 and token.marker == "id": - incremental_draft_tokens.append(token) - continue - elif token.type == UsfmTokenType.CHAPTER: - if token.data and int(token.data) in chapters: - in_chapter = True - incremental_draft_tokens.append(token) - else: - in_chapter = False - elif in_chapter: - incremental_draft_tokens.append(token) - return incremental_draft_tokens - def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]: row_texts: List[str] = [] row_metadata = None From e42370887ef0b161edd325abbad82b795efd423d Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Wed, 15 Apr 2026 17:08:20 -0400 Subject: [PATCH 03/14] add test case for chapter filtering --- machine/corpora/__init__.py | 2 +- .../paratext_project_text_updater_base.py | 45 ++++++++++--------- .../test_update_usfm_parser_handler.py | 37 ++++++++++++++- 3 files changed, 59 insertions(+), 25 deletions(-) diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index d07e52ee..7cbc2889 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -27,7 +27,7 @@ from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase -from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase +from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase, filter_tokens_by_chapter from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase from .paratext_text_corpus import ParatextTextCorpus from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index a7dc464d..a32bdb99 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -31,8 +31,8 @@ def __init__( def update_usfm( self, book_id: str, - chapters: Optional[Sequence[int]] = None, rows: Optional[Sequence[UpdateUsfmRow]] = None, + chapters: Optional[Sequence[int]] = None, full_name: Optional[str] = None, text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING, paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, @@ -65,7 +65,7 @@ def update_usfm( try: tokenizer = UsfmTokenizer(self._settings.stylesheet) tokens = tokenizer.tokenize(usfm) - tokens = self.filter_tokens_by_chapter(tokens, chapters) + tokens = filter_tokens_by_chapter(tokens, chapters) parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification) parser.process_tokens() return handler.get_usfm(self._settings.stylesheet) @@ -77,24 +77,25 @@ def update_usfm( ) raise RuntimeError(error_message) from e - def filter_tokens_by_chapter( - self, tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None - ) -> Sequence[UsfmToken]: - if chapters is None: - return tokens - tokens_within_chapters: List[UsfmToken] = [] - in_chapter: bool = False - for index, token in enumerate(tokens): - if index == 0 and token.marker == "id": - tokens_within_chapters.append(token) - if 1 in chapters: - in_chapter = True - elif token.type == UsfmTokenType.CHAPTER: - if token.data and int(token.data) in chapters: - in_chapter = True - tokens_within_chapters.append(token) - else: - in_chapter = False - elif in_chapter: + +def filter_tokens_by_chapter( + tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None +) -> Sequence[UsfmToken]: + if chapters is None: + return tokens + tokens_within_chapters: List[UsfmToken] = [] + in_chapter: bool = False + for index, token in enumerate(tokens): + if index == 0 and token.marker == "id": + tokens_within_chapters.append(token) + if 1 in chapters: + in_chapter = True + elif token.type == UsfmTokenType.CHAPTER: + if token.data and int(token.data) in chapters: + in_chapter = True tokens_within_chapters.append(token) - return tokens_within_chapters + else: + in_chapter = False + elif in_chapter: + tokens_within_chapters.append(token) + return tokens_within_chapters diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index a9c1cdc1..940878bf 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -9,10 +9,12 @@ UpdateUsfmParserHandler, UpdateUsfmRow, UpdateUsfmTextBehavior, + UsfmParser, + UsfmTokenizer, UsfmUpdateBlock, UsfmUpdateBlockElementType, UsfmUpdateBlockHandler, - parse_usfm, + filter_tokens_by_chapter, ) @@ -1494,6 +1496,31 @@ def test_update_block_footnote_at_start_of_chapter_with_preceding_text(): ) +def test_filter_chapters() -> None: + usfm = r"""\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 2 +\v 1 Some text +\c 3 +\v 1 Some text +\c 4 +\v 1 Some text +""" + chapters = [2, 4] + target = update_usfm(chapters=chapters, source=usfm) + result = r"""\id MAT +\c 2 +\v 1 Some text +\c 4 +\v 1 Some text +""" + assert_usfm_equals(target, result) + + def scr_ref(*refs: str) -> List[ScriptureRef]: return [ScriptureRef.parse(ref) for ref in refs] @@ -1501,6 +1528,7 @@ def scr_ref(*refs: str) -> List[ScriptureRef]: def update_usfm( rows: Optional[Sequence[UpdateUsfmRow]] = None, source: Optional[str] = None, + chapters: Optional[Sequence[int]] = None, id_text: Optional[str] = None, text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW, paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, @@ -1516,6 +1544,7 @@ def update_usfm( return updater.update_usfm( "MAT", rows, + chapters, id_text, text_behavior, paragraph_behavior, @@ -1542,7 +1571,11 @@ def update_usfm( lambda _: False, compare_segments, ) - parse_usfm(source, updater) + tokenizer = UsfmTokenizer() + tokens = tokenizer.tokenize(source) + tokens = filter_tokens_by_chapter(tokens, chapters) + parser = UsfmParser(tokens, updater) + parser.process_tokens() return updater.get_usfm() From 1e2e99956e51c0620fb4b2876516d8498e902692 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Thu, 16 Apr 2026 10:02:53 -0400 Subject: [PATCH 04/14] make sure all text in \id is included --- machine/corpora/paratext_project_text_updater_base.py | 9 ++++++--- tests/corpora/test_update_usfm_parser_handler.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index a32bdb99..77e4ec3a 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -85,17 +85,20 @@ def filter_tokens_by_chapter( return tokens tokens_within_chapters: List[UsfmToken] = [] in_chapter: bool = False + in_id_marker: bool = False for index, token in enumerate(tokens): if index == 0 and token.marker == "id": - tokens_within_chapters.append(token) + in_id_marker = True if 1 in chapters: in_chapter = True + elif in_id_marker and token.marker is not None and token.marker != "id": + in_id_marker = False elif token.type == UsfmTokenType.CHAPTER: if token.data and int(token.data) in chapters: in_chapter = True - tokens_within_chapters.append(token) else: in_chapter = False - elif in_chapter: + + if in_id_marker or in_chapter: tokens_within_chapters.append(token) return tokens_within_chapters diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 940878bf..1505444b 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1512,7 +1512,7 @@ def test_filter_chapters() -> None: """ chapters = [2, 4] target = update_usfm(chapters=chapters, source=usfm) - result = r"""\id MAT + result = r"""\id MAT - Test \c 2 \v 1 Some text \c 4 From 707119c38bb2038fa7c6c19b3a4f178b3a9a0c0a Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Thu, 16 Apr 2026 11:19:18 -0400 Subject: [PATCH 05/14] update remark test and ensure remarks are added at the end of existing chapter remarks --- machine/corpora/update_usfm_parser_handler.py | 7 ++++++- tests/corpora/test_update_usfm_parser_handler.py | 13 +++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 78fd85f4..5317a85b 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -347,7 +347,12 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: if len(tokens) > 0: for index, token in enumerate(tokens): if token.type == UsfmTokenType.CHAPTER: - tokens[index + 1 : index + 1] = remark_tokens + insertion_index = index + 1 + while insertion_index < len(tokens) and tokens[insertion_index].marker == "rem": + insertion_index += 1 + if insertion_index < len(tokens) and tokens[insertion_index].type == UsfmTokenType.TEXT: + insertion_index += 1 + tokens[insertion_index:insertion_index] = remark_tokens return tokenizer.detokenize(tokens) def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]: diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 1505444b..e896e21b 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1389,17 +1389,22 @@ def test_pass_remark(): \v 1 Some text \v 2 \v 3 Other text +\c 2 +\v 1 More text """ target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark"]) result = r"""\id MAT - Test \ide UTF-8 \rem Existing remark -\rem New remark \c 1 +\rem New remark \v 1 Some text \v 2 Update 2 \v 3 Other text +\c 2 +\rem New remark +\v 1 More text """ assert_usfm_equals(target, result) @@ -1408,12 +1413,16 @@ def test_pass_remark(): result = r"""\id MAT - Test \ide UTF-8 \rem Existing remark +\c 1 \rem New remark \rem New remark 2 -\c 1 \v 1 Some text \v 2 Update 2 \v 3 Other text +\c 2 +\rem New remark +\rem New remark 2 +\v 1 More text """ assert_usfm_equals(target, result) From e1865ea7afd4405b87e78b7ef1bc78df0246511c Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Thu, 16 Apr 2026 11:57:28 -0400 Subject: [PATCH 06/14] add test case for including chapter 1 and header information --- .../test_update_usfm_parser_handler.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index e896e21b..90ba2bba 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1530,6 +1530,34 @@ def test_filter_chapters() -> None: assert_usfm_equals(target, result) +def test_filter_chapters_with_chapter_1_and_header() -> None: + usfm = r"""\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 2 +\v 1 Some text +\c 3 +\v 1 Some text +\c 4 +\v 1 Some text +""" + chapters = [1, 3] + target = update_usfm(chapters=chapters, source=usfm) + result = r"""\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 3 +\v 1 Some text +""" + assert_usfm_equals(target, result) + + def scr_ref(*refs: str) -> List[ScriptureRef]: return [ScriptureRef.parse(ref) for ref in refs] From 4a8fb5052c9e6a767ec7204c853af89e1d90b6f8 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Mon, 27 Apr 2026 15:13:25 -0400 Subject: [PATCH 07/14] support both book-level and chapter-level remarks --- .../paratext_project_text_updater_base.py | 4 +- machine/corpora/update_usfm_parser_handler.py | 49 +++++++++++----- .../test_update_usfm_parser_handler.py | 57 ++++++++++++++----- 3 files changed, 81 insertions(+), 29 deletions(-) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 77e4ec3a..526f5d50 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,5 +1,5 @@ from abc import ABC -from typing import Callable, Iterable, List, Optional, Sequence, Union +from typing import Callable, Iterable, List, Optional, Sequence, Tuple, Union from .paratext_project_file_handler import ParatextProjectFileHandler from .paratext_project_settings import ParatextProjectSettings @@ -40,7 +40,7 @@ def update_usfm( style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None, update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, - remarks: Optional[Iterable[str]] = None, + remarks: Optional[Iterable[Tuple[int, str]]] = None, error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None, compare_segments: bool = False, ) -> Optional[str]: diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 5317a85b..fe405a64 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -53,7 +53,7 @@ def __init__( style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None, update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, - remarks: Optional[Iterable[str]] = None, + remarks: Optional[Iterable[Tuple[int, str]]] = None, error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None, compare_segments: bool = False, ) -> None: @@ -340,19 +340,42 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: tokenizer = UsfmTokenizer(stylesheet) tokens = list(self._tokens) if len(self._remarks) > 0: - remark_tokens: List[UsfmToken] = [] - for remark in self._remarks: - remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem")) - remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark)) + remark_tokens_by_chapter: Dict[int, List[UsfmToken]] = {} + for chapter_num, remark in self._remarks: + chapter_tokens = remark_tokens_by_chapter.setdefault(chapter_num, []) + chapter_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem")) + chapter_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark)) if len(tokens) > 0: - for index, token in enumerate(tokens): - if token.type == UsfmTokenType.CHAPTER: - insertion_index = index + 1 - while insertion_index < len(tokens) and tokens[insertion_index].marker == "rem": - insertion_index += 1 - if insertion_index < len(tokens) and tokens[insertion_index].type == UsfmTokenType.TEXT: - insertion_index += 1 - tokens[insertion_index:insertion_index] = remark_tokens + for chapter_num, remark_tokens in remark_tokens_by_chapter.items(): + if chapter_num == 0: + index = 0 + markers_to_skip = {"id", "ide", "rem"} + else: + index = next( + ( + i + for i, token in enumerate(tokens) + if token.type == UsfmTokenType.CHAPTER + and token.data is not None + and str(token.data).isdigit() + and int(token.data) == chapter_num + ), + -1, + ) + if index == -1: + continue + index += 1 + markers_to_skip = {"rem"} + + if index >= len(tokens): + tokens.extend(remark_tokens) + else: + while index < len(tokens) and tokens[index].marker in markers_to_skip: + index += 1 + if index < len(tokens) and tokens[index].type == UsfmTokenType.TEXT: + index += 1 + + tokens[index:index] = remark_tokens return tokenizer.detokenize(tokens) def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]: diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 90ba2bba..b4bc2c2c 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Optional, Sequence, Union +from typing import Iterable, List, Optional, Sequence, Tuple, Union from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, ignore_line_endings @@ -1390,41 +1390,70 @@ def test_pass_remark(): \v 2 \v 3 Other text \c 2 +\rem Existing remark \v 1 More text +\c 3 +\v 1 Additional text """ - target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark"]) + target = update_usfm( + rows, + usfm, + text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, + remarks=[(0, "New remark 0"), (1, "New remark 1"), (2, "New remark 2")], + ) result = r"""\id MAT - Test \ide UTF-8 \rem Existing remark +\rem New remark 0 \c 1 -\rem New remark +\rem New remark 1 \v 1 Some text \v 2 Update 2 \v 3 Other text \c 2 -\rem New remark +\rem Existing remark +\rem New remark 2 \v 1 More text +\c 3 +\v 1 Additional text """ assert_usfm_equals(target, result) - target = update_usfm(rows, target, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark 2"]) + +def test_pass_remark_0_no_existing_remark(): + rows = [ + UpdateUsfmRow( + scr_ref("MAT 1:1"), + "Update 1", + ), + UpdateUsfmRow( + scr_ref("MAT 1:2"), + "Update 2", + ), + ] + usfm = r"""\id MAT - Test +\ide UTF-8 +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +""" + target = update_usfm( + rows, + usfm, + text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, + remarks=[(0, "New remark 0")], + ) result = r"""\id MAT - Test \ide UTF-8 -\rem Existing remark +\rem New remark 0 \c 1 -\rem New remark -\rem New remark 2 \v 1 Some text \v 2 Update 2 \v 3 Other text -\c 2 -\rem New remark -\rem New remark 2 -\v 1 More text """ - assert_usfm_equals(target, result) @@ -1573,7 +1602,7 @@ def update_usfm( style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, preserve_paragraph_styles: Optional[Iterable[str]] = None, update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, - remarks: Optional[Iterable[str]] = None, + remarks: Optional[Iterable[Tuple[int, str]]] = None, compare_segments: bool = False, ) -> Optional[str]: if source is None: From 74dcf8f8b80aad0933463fe5d049d3b3f6717496 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Mon, 27 Apr 2026 15:18:22 -0400 Subject: [PATCH 08/14] add test case for multiple remarks for the same chapter --- .../test_update_usfm_parser_handler.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index b4bc2c2c..9e053f46 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1457,6 +1457,48 @@ def test_pass_remark_0_no_existing_remark(): assert_usfm_equals(target, result) +def test_pass_multiple_remarks_same_chapter() -> None: + rows = [ + UpdateUsfmRow( + scr_ref("MAT 1:1"), + "Update 1", + ), + UpdateUsfmRow( + scr_ref("MAT 1:2"), + "Update 2", + ), + ] + usfm = r"""\id MAT - Test +\ide UTF-8 +\rem Existing remark +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +""" + + target = update_usfm( + rows, + usfm, + text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, + remarks=[(0, "New remark 0.1"), (0, "New remark 0.2"), (1, "New remark 1.1"), (1, "New remark 1.2")], + ) + result = r"""\id MAT - Test +\ide UTF-8 +\rem Existing remark +\rem New remark 0.1 +\rem New remark 0.2 +\c 1 +\rem New remark 1.1 +\rem New remark 1.2 +\v 1 Some text +\v 2 Update 2 +\v 3 Other text +""" + + assert_usfm_equals(target, result) + + def test_update_block_footnote_in_published_chapter_number(): rows = [UpdateUsfmRow(scr_ref("ESG 1:0/2:s"), "Update 1")] usfm = r"""\id ESG - Test From ef9041f9de643ed8b33b4d01e60624ce39fd8a86 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Mon, 27 Apr 2026 17:59:32 -0400 Subject: [PATCH 09/14] fix init.py --- machine/corpora/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 7cbc2889..4a0817a6 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -115,6 +115,7 @@ "FileParatextProjectTermsParser", "FileParatextProjectTextUpdater", "FileParatextProjectVersificationErrorDetector", + "filter_tokens_by_chapter", "flatten", "is_scripture", "KeyTerm", From d9ecd2ac29a97591e95cefdee388dca126a5d0f9 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Mon, 27 Apr 2026 18:12:03 -0400 Subject: [PATCH 10/14] cover edge case of chapter as last marker --- tests/corpora/test_update_usfm_parser_handler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 9e053f46..dcc76ea0 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1393,14 +1393,13 @@ def test_pass_remark(): \rem Existing remark \v 1 More text \c 3 -\v 1 Additional text """ target = update_usfm( rows, usfm, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, - remarks=[(0, "New remark 0"), (1, "New remark 1"), (2, "New remark 2")], + remarks=[(0, "New remark 0"), (1, "New remark 1"), (2, "New remark 2"), (3, "New remark 3")], ) result = r"""\id MAT - Test \ide UTF-8 @@ -1416,7 +1415,7 @@ def test_pass_remark(): \rem New remark 2 \v 1 More text \c 3 -\v 1 Additional text +\rem New remark 3 """ assert_usfm_equals(target, result) From e39f94c478e4de7cc0292ff047c75e1613066cc7 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Mon, 27 Apr 2026 18:39:55 -0400 Subject: [PATCH 11/14] handle malformed chapter numbers --- .../paratext_project_text_updater_base.py | 4 +++- .../test_update_usfm_parser_handler.py | 24 +++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 526f5d50..fc13d7b9 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -14,6 +14,7 @@ from .usfm_token import UsfmTokenType from .usfm_tokenizer import UsfmToken, UsfmTokenizer from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError +from ..utils.string_utils import parse_integer class ParatextProjectTextUpdaterBase(ABC): @@ -94,7 +95,8 @@ def filter_tokens_by_chapter( elif in_id_marker and token.marker is not None and token.marker != "id": in_id_marker = False elif token.type == UsfmTokenType.CHAPTER: - if token.data and int(token.data) in chapters: + chapter_num = parse_integer(token.data) if token.data else None + if chapter_num is not None and chapter_num in chapters: in_chapter = True else: in_chapter = False diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index dcc76ea0..3f08a3cf 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1389,7 +1389,7 @@ def test_pass_remark(): \v 1 Some text \v 2 \v 3 Other text -\c 2 +\c 2. \rem Existing remark \v 1 More text \c 3 @@ -1410,7 +1410,7 @@ def test_pass_remark(): \v 1 Some text \v 2 Update 2 \v 3 Other text -\c 2 +\c 2. \rem Existing remark \rem New remark 2 \v 1 More text @@ -1628,6 +1628,26 @@ def test_filter_chapters_with_chapter_1_and_header() -> None: assert_usfm_equals(target, result) +def test_filter_chapters_with_bad_chapter_reference() -> None: + usfm = r"""\id MAT - Test +\c 1. +\v 1 Some text +\c 2. +\v 1 Some text +\c 3 +\v 1 Some text with good chapter reference +\c 4 +\v 1 Some text with good chapter reference +""" + chapters = [2, 4] + target = update_usfm(chapters=chapters, source=usfm) + result = r"""\id MAT - Test +\c 4 +\v 1 Some text with good chapter reference +""" + assert_usfm_equals(target, result) + + def scr_ref(*refs: str) -> List[ScriptureRef]: return [ScriptureRef.parse(ref) for ref in refs] From 9cfd578ad3994ccded4df6b28e9a5e4130f52c72 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Mon, 27 Apr 2026 18:47:58 -0400 Subject: [PATCH 12/14] use parse_usfm; fix pass_remark test --- machine/corpora/paratext_project_text_updater_base.py | 7 +++---- machine/corpora/usfm_parser.py | 2 +- tests/corpora/test_update_usfm_parser_handler.py | 9 ++++----- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index fc13d7b9..7eee05fc 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,6 +1,7 @@ from abc import ABC from typing import Callable, Iterable, List, Optional, Sequence, Tuple, Union +from ..utils.string_utils import parse_integer from .paratext_project_file_handler import ParatextProjectFileHandler from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase @@ -10,11 +11,10 @@ UpdateUsfmRow, UpdateUsfmTextBehavior, ) -from .usfm_parser import UsfmParser +from .usfm_parser import parse_usfm from .usfm_token import UsfmTokenType from .usfm_tokenizer import UsfmToken, UsfmTokenizer from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError -from ..utils.string_utils import parse_integer class ParatextProjectTextUpdaterBase(ABC): @@ -67,8 +67,7 @@ def update_usfm( tokenizer = UsfmTokenizer(self._settings.stylesheet) tokens = tokenizer.tokenize(usfm) tokens = filter_tokens_by_chapter(tokens, chapters) - parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification) - parser.process_tokens() + parse_usfm(tokens, handler, self._settings.stylesheet, self._settings.versification) return handler.get_usfm(self._settings.stylesheet) except Exception as e: error_message = ( diff --git a/machine/corpora/usfm_parser.py b/machine/corpora/usfm_parser.py index a37d5396..28de4aee 100644 --- a/machine/corpora/usfm_parser.py +++ b/machine/corpora/usfm_parser.py @@ -17,7 +17,7 @@ def parse_usfm( - usfm: str, + usfm: Union[str, Sequence[UsfmToken]], handler: UsfmParserHandler, stylesheet: Union[StrPath, UsfmStylesheet] = "usfm.sty", versification: Optional[Versification] = None, diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 3f08a3cf..278b795d 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -9,12 +9,12 @@ UpdateUsfmParserHandler, UpdateUsfmRow, UpdateUsfmTextBehavior, - UsfmParser, UsfmTokenizer, UsfmUpdateBlock, UsfmUpdateBlockElementType, UsfmUpdateBlockHandler, filter_tokens_by_chapter, + parse_usfm, ) @@ -1389,7 +1389,7 @@ def test_pass_remark(): \v 1 Some text \v 2 \v 3 Other text -\c 2. +\c 2 \rem Existing remark \v 1 More text \c 3 @@ -1410,7 +1410,7 @@ def test_pass_remark(): \v 1 Some text \v 2 Update 2 \v 3 Other text -\c 2. +\c 2 \rem Existing remark \rem New remark 2 \v 1 More text @@ -1701,8 +1701,7 @@ def update_usfm( tokenizer = UsfmTokenizer() tokens = tokenizer.tokenize(source) tokens = filter_tokens_by_chapter(tokens, chapters) - parser = UsfmParser(tokens, updater) - parser.process_tokens() + parse_usfm(tokens, updater) return updater.get_usfm() From 911df510990aa05aca2d536cec6b58c4c663b3c7 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Tue, 28 Apr 2026 16:54:49 -0400 Subject: [PATCH 13/14] don't filter if chapters is empty list --- machine/corpora/paratext_project_text_updater_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 7eee05fc..1ef9a1e4 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -81,7 +81,7 @@ def update_usfm( def filter_tokens_by_chapter( tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None ) -> Sequence[UsfmToken]: - if chapters is None: + if not chapters: return tokens tokens_within_chapters: List[UsfmToken] = [] in_chapter: bool = False From cf0ea05cded61ad7c4088215680c4af4a693e447 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Tue, 28 Apr 2026 17:01:43 -0400 Subject: [PATCH 14/14] revert change --- machine/corpora/paratext_project_text_updater_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 1ef9a1e4..7eee05fc 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -81,7 +81,7 @@ def update_usfm( def filter_tokens_by_chapter( tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None ) -> Sequence[UsfmToken]: - if not chapters: + if chapters is None: return tokens tokens_within_chapters: List[UsfmToken] = [] in_chapter: bool = False