sillsdev · mshannon-sil · Mar 26, 2026 · Apr 15, 2026 · Apr 15, 2026 · Apr 16, 2026
diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py
@@ -27,7 +27,7 @@
 from .paratext_project_settings import ParatextProjectSettings
 from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
 from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
-from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
+from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase, filter_tokens_by_chapter
 from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase
 from .paratext_text_corpus import ParatextTextCorpus
 from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
@@ -115,6 +115,7 @@
     "FileParatextProjectTermsParser",
     "FileParatextProjectTextUpdater",
     "FileParatextProjectVersificationErrorDetector",
+    "filter_tokens_by_chapter",
     "flatten",
     "is_scripture",
     "KeyTerm",

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
@@ -1,6 +1,7 @@
 from abc import ABC
-from typing import Callable, Iterable, Optional, Sequence, Union
+from typing import Callable, Iterable, List, Optional, Sequence, Tuple, Union
 
+from ..utils.string_utils import parse_integer
 from .paratext_project_file_handler import ParatextProjectFileHandler
 from .paratext_project_settings import ParatextProjectSettings
 from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
@@ -11,6 +12,8 @@
     UpdateUsfmTextBehavior,
 )
 from .usfm_parser import parse_usfm
+from .usfm_token import UsfmTokenType
+from .usfm_tokenizer import UsfmToken, UsfmTokenizer
 from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError
 
 
@@ -30,14 +33,15 @@ def update_usfm(
         self,
         book_id: str,
         rows: Optional[Sequence[UpdateUsfmRow]] = None,
+        chapters: Optional[Sequence[int]] = None,
         full_name: Optional[str] = None,
         text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
         paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
         embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
         style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
         preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
         update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
-        remarks: Optional[Iterable[str]] = None,
+        remarks: Optional[Iterable[Tuple[int, str]]] = None,
         error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
         compare_segments: bool = False,
     ) -> Optional[str]:
@@ -60,7 +64,10 @@ def update_usfm(
             compare_segments=compare_segments,
         )
         try:
-            parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
+            tokenizer = UsfmTokenizer(self._settings.stylesheet)
+            tokens = tokenizer.tokenize(usfm)
+            tokens = filter_tokens_by_chapter(tokens, chapters)
+            parse_usfm(tokens, handler, self._settings.stylesheet, self._settings.versification)
             return handler.get_usfm(self._settings.stylesheet)
         except Exception as e:
             error_message = (
@@ -69,3 +76,30 @@ def update_usfm(
                 f". Error: '{e}'"
             )
             raise RuntimeError(error_message) from e
+
+
+def filter_tokens_by_chapter(
+    tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
+) -> Sequence[UsfmToken]:
+    """Restrict a USFM token sequence to the requested chapters.
+
+    A leading ``id`` marker and the non-marker tokens that directly follow it are always kept.
+    When the sequence starts with ``id`` and chapter 1 is requested, the tokens between the
+    ``id`` line and the first chapter marker are kept as well.
+
+    Args:
+        tokens: The tokens to filter.
+        chapters: The chapter numbers to keep. ``None`` disables filtering.
+
+    Returns:
+        ``tokens`` itself when ``chapters`` is ``None``; otherwise a new list holding the kept
+        tokens in their original order.
+    """
+    if chapters is None:
+        return tokens
+    # Build the lookup once so membership tests inside the loop are constant time.
+    requested = frozenset(chapters)
+    tokens_within_chapters: List[UsfmToken] = []
+    in_chapter = False
+    in_id_marker = False
+    for index, token in enumerate(tokens):
+        if index == 0 and token.marker == "id":
+            in_id_marker = True
+            # Material ahead of the first chapter marker is treated as part of chapter 1.
+            in_chapter = 1 in requested
+        elif in_id_marker and token.marker is not None and token.marker != "id":
+            # The first marker token after the id line ends it.
+            in_id_marker = False
+
+        # Evaluated independently of the branches above: a chapter marker that directly ends
+        # the id line must still update the chapter state. As an ``elif`` it was skipped, so a
+        # requested chapter could be dropped or an unrequested one kept.
+        if token.type == UsfmTokenType.CHAPTER:
+            chapter_num = parse_integer(token.data) if token.data else None
+            # A chapter marker with missing data, or a None result from parse_integer, is excluded.
+            in_chapter = chapter_num is not None and chapter_num in requested
+
+        if in_id_marker or in_chapter:
+            tokens_within_chapters.append(token)
+    return tokens_within_chapters
diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py
@@ -53,7 +53,7 @@ def __init__(
         style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
         preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
         update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
-        remarks: Optional[Iterable[str]] = None,
+        remarks: Optional[Iterable[Tuple[int, str]]] = None,
         error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
         compare_segments: bool = False,
     ) -> None:
@@ -340,19 +340,42 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
         tokenizer = UsfmTokenizer(stylesheet)
         tokens = list(self._tokens)
         if len(self._remarks) > 0:
-            remark_tokens: List[UsfmToken] = []
-            for remark in self._remarks:
-                remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
-                remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
+            remark_tokens_by_chapter: Dict[int, List[UsfmToken]] = {}
+            for chapter_num, remark in self._remarks:
+                chapter_tokens = remark_tokens_by_chapter.setdefault(chapter_num, [])
+                chapter_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
+                chapter_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
             if len(tokens) > 0:
-                index = 0
-                markers_to_skip = {"id", "ide", "rem"}
-                while tokens[index].marker in markers_to_skip:
-                    index += 1
-                    if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT:
+                for chapter_num, remark_tokens in remark_tokens_by_chapter.items():
+                    if chapter_num == 0:
+                        index = 0
+                        markers_to_skip = {"id", "ide", "rem"}
+                    else:
+                        index = next(
+                            (
+                                i
+                                for i, token in enumerate(tokens)
+                                if token.type == UsfmTokenType.CHAPTER
+                                and token.data is not None
+                                and str(token.data).isdigit()
+                                and int(token.data) == chapter_num
+                            ),
+                            -1,
+                        )
+                        if index == -1:
+                            continue
                         index += 1
-                for remark_token in reversed(remark_tokens):
-                    tokens.insert(index, remark_token)
+                        markers_to_skip = {"rem"}
+
+                    if index >= len(tokens):
+                        tokens.extend(remark_tokens)
+                    else:
+                        while index < len(tokens) and tokens[index].marker in markers_to_skip:
+                            index += 1
+                            if index < len(tokens) and tokens[index].type == UsfmTokenType.TEXT:
+                                index += 1
+
+                        tokens[index:index] = remark_tokens
         return tokenizer.detokenize(tokens)
 
     def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:

diff --git a/machine/corpora/usfm_parser.py b/machine/corpora/usfm_parser.py
@@ -17,7 +17,7 @@
 
 
 def parse_usfm(
-    usfm: str,
+    usfm: Union[str, Sequence[UsfmToken]],
     handler: UsfmParserHandler,
     stylesheet: Union[StrPath, UsfmStylesheet] = "usfm.sty",
     versification: Optional[Versification] = None,