From d8ca02da8aada71b266f56c41dd14b179d76fc04 Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Thu, 26 Mar 2026 17:28:43 -0400
Subject: [PATCH 01/14] modify usfm for chapter-level drafting to avoid import
 issues; move remarks to chapters

---
 .../paratext_project_text_updater_base.py     |  3 +-
 machine/corpora/update_usfm_parser_handler.py | 34 ++++++++++++++-----
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index 0e7bfdfd..0a80c407 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -29,6 +29,7 @@ def __init__(
     def update_usfm(
         self,
         book_id: str,
+        chapters: Optional[Sequence[int]] = None,
         rows: Optional[Sequence[UpdateUsfmRow]] = None,
         full_name: Optional[str] = None,
         text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
@@ -61,7 +62,7 @@ def update_usfm(
         )
         try:
             parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
-            return handler.get_usfm(self._settings.stylesheet)
+            return handler.get_usfm(self._settings.stylesheet, chapters)
         except Exception as e:
             error_message = (
                 f"An error occurred while parsing the usfm for '{book_id}'"
diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py
index 9d95850c..4c187ac7 100644
--- a/machine/corpora/update_usfm_parser_handler.py
+++ b/machine/corpora/update_usfm_parser_handler.py
@@ -334,27 +334,43 @@ def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -
         if embed_outside_of_block:
             self._end_update_block(state, [scripture_ref])
 
-    def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
+    def get_usfm(
+        self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty", chapters: Optional[Sequence[int]] = None
+    ) -> str:
         if isinstance(stylesheet, str):
             stylesheet = UsfmStylesheet(stylesheet)
         tokenizer = UsfmTokenizer(stylesheet)
         tokens = list(self._tokens)
+        if chapters is not None:
+            tokens = self._get_incremental_draft_tokens(tokens, chapters)
         if len(self._remarks) > 0:
             remark_tokens: List[UsfmToken] = []
             for remark in self._remarks:
                 remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
                 remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
             if len(tokens) > 0:
-                index = 0
-                markers_to_skip = {"id", "ide", "rem"}
-                while tokens[index].marker in markers_to_skip:
-                    index += 1
-                    if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT:
-                        index += 1
-                for remark_token in reversed(remark_tokens):
-                    tokens.insert(index, remark_token)
+                for index, token in enumerate(tokens):
+                    if token.type == UsfmTokenType.CHAPTER:
+                        tokens[index + 1 : index + 1] = remark_tokens
         return tokenizer.detokenize(tokens)
 
+    def _get_incremental_draft_tokens(self, tokens: List[UsfmToken], chapters: Sequence[int]) -> List[UsfmToken]:
+        incremental_draft_tokens: List[UsfmToken] = []
+        in_chapter: bool = False
+        for index, token in enumerate(tokens):
+            if index == 0 and token.marker == "id":
+                incremental_draft_tokens.append(token)
+                continue
+            elif token.type == UsfmTokenType.CHAPTER:
+                if token.data and int(token.data) in chapters:
+                    in_chapter = True
+                    incremental_draft_tokens.append(token)
+                else:
+                    in_chapter = False
+            elif in_chapter:
+                incremental_draft_tokens.append(token)
+        return incremental_draft_tokens
+
     def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
         row_texts: List[str] = []
         row_metadata = None

From aef5d5d71a892fac1b299a5dfbda9ad84768f91b Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Wed, 15 Apr 2026 14:02:28 -0400
Subject: [PATCH 02/14] move filtering before token processing

---
 .../paratext_project_text_updater_base.py     | 36 ++++++++++++++++---
 machine/corpora/update_usfm_parser_handler.py | 23 +-----------
 2 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index 0a80c407..a7dc464d 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -1,5 +1,5 @@
 from abc import ABC
-from typing import Callable, Iterable, Optional, Sequence, Union
+from typing import Callable, Iterable, List, Optional, Sequence, Union
 
 from .paratext_project_file_handler import ParatextProjectFileHandler
 from .paratext_project_settings import ParatextProjectSettings
@@ -10,7 +10,9 @@
     UpdateUsfmRow,
     UpdateUsfmTextBehavior,
 )
-from .usfm_parser import parse_usfm
+from .usfm_parser import UsfmParser
+from .usfm_token import UsfmTokenType
+from .usfm_tokenizer import UsfmToken, UsfmTokenizer
 from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError
 
 
@@ -61,8 +63,12 @@ def update_usfm(
             compare_segments=compare_segments,
         )
         try:
-            parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
-            return handler.get_usfm(self._settings.stylesheet, chapters)
+            tokenizer = UsfmTokenizer(self._settings.stylesheet)
+            tokens = tokenizer.tokenize(usfm)
+            tokens = self.filter_tokens_by_chapter(tokens, chapters)
+            parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification)
+            parser.process_tokens()
+            return handler.get_usfm(self._settings.stylesheet)
         except Exception as e:
             error_message = (
                 f"An error occurred while parsing the usfm for '{book_id}'"
@@ -70,3 +76,25 @@ def update_usfm(
                 f". Error: '{e}'"
             )
             raise RuntimeError(error_message) from e
+
+    def filter_tokens_by_chapter(
+        self, tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
+    ) -> Sequence[UsfmToken]:
+        if chapters is None:
+            return tokens
+        tokens_within_chapters: List[UsfmToken] = []
+        in_chapter: bool = False
+        for index, token in enumerate(tokens):
+            if index == 0 and token.marker == "id":
+                tokens_within_chapters.append(token)
+                if 1 in chapters:
+                    in_chapter = True
+            elif token.type == UsfmTokenType.CHAPTER:
+                if token.data and int(token.data) in chapters:
+                    in_chapter = True
+                    tokens_within_chapters.append(token)
+                else:
+                    in_chapter = False
+            elif in_chapter:
+                tokens_within_chapters.append(token)
+        return tokens_within_chapters
diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py
index 4c187ac7..78fd85f4 100644
--- a/machine/corpora/update_usfm_parser_handler.py
+++ b/machine/corpora/update_usfm_parser_handler.py
@@ -334,15 +334,11 @@ def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -
         if embed_outside_of_block:
             self._end_update_block(state, [scripture_ref])
 
-    def get_usfm(
-        self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty", chapters: Optional[Sequence[int]] = None
-    ) -> str:
+    def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
         if isinstance(stylesheet, str):
             stylesheet = UsfmStylesheet(stylesheet)
         tokenizer = UsfmTokenizer(stylesheet)
         tokens = list(self._tokens)
-        if chapters is not None:
-            tokens = self._get_incremental_draft_tokens(tokens, chapters)
         if len(self._remarks) > 0:
             remark_tokens: List[UsfmToken] = []
             for remark in self._remarks:
@@ -354,23 +350,6 @@ def get_usfm(
                         tokens[index + 1 : index + 1] = remark_tokens
         return tokenizer.detokenize(tokens)
 
-    def _get_incremental_draft_tokens(self, tokens: List[UsfmToken], chapters: Sequence[int]) -> List[UsfmToken]:
-        incremental_draft_tokens: List[UsfmToken] = []
-        in_chapter: bool = False
-        for index, token in enumerate(tokens):
-            if index == 0 and token.marker == "id":
-                incremental_draft_tokens.append(token)
-                continue
-            elif token.type == UsfmTokenType.CHAPTER:
-                if token.data and int(token.data) in chapters:
-                    in_chapter = True
-                    incremental_draft_tokens.append(token)
-                else:
-                    in_chapter = False
-            elif in_chapter:
-                incremental_draft_tokens.append(token)
-        return incremental_draft_tokens
-
     def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
         row_texts: List[str] = []
         row_metadata = None

From e42370887ef0b161edd325abbad82b795efd423d Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Wed, 15 Apr 2026 17:08:20 -0400
Subject: [PATCH 03/14] add test case for chapter filtering

---
 machine/corpora/__init__.py                   |  2 +-
 .../paratext_project_text_updater_base.py     | 45 ++++++++++---------
 .../test_update_usfm_parser_handler.py        | 37 ++++++++++++++-
 3 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py
index d07e52ee..7cbc2889 100644
--- a/machine/corpora/__init__.py
+++ b/machine/corpora/__init__.py
@@ -27,7 +27,7 @@
 from .paratext_project_settings import ParatextProjectSettings
 from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
 from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
-from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
+from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase, filter_tokens_by_chapter
 from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase
 from .paratext_text_corpus import ParatextTextCorpus
 from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index a7dc464d..a32bdb99 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -31,8 +31,8 @@ def __init__(
     def update_usfm(
         self,
         book_id: str,
-        chapters: Optional[Sequence[int]] = None,
         rows: Optional[Sequence[UpdateUsfmRow]] = None,
+        chapters: Optional[Sequence[int]] = None,
         full_name: Optional[str] = None,
         text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
         paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
@@ -65,7 +65,7 @@ def update_usfm(
         try:
             tokenizer = UsfmTokenizer(self._settings.stylesheet)
             tokens = tokenizer.tokenize(usfm)
-            tokens = self.filter_tokens_by_chapter(tokens, chapters)
+            tokens = filter_tokens_by_chapter(tokens, chapters)
             parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification)
             parser.process_tokens()
             return handler.get_usfm(self._settings.stylesheet)
@@ -77,24 +77,25 @@ def update_usfm(
             )
             raise RuntimeError(error_message) from e
 
-    def filter_tokens_by_chapter(
-        self, tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
-    ) -> Sequence[UsfmToken]:
-        if chapters is None:
-            return tokens
-        tokens_within_chapters: List[UsfmToken] = []
-        in_chapter: bool = False
-        for index, token in enumerate(tokens):
-            if index == 0 and token.marker == "id":
-                tokens_within_chapters.append(token)
-                if 1 in chapters:
-                    in_chapter = True
-            elif token.type == UsfmTokenType.CHAPTER:
-                if token.data and int(token.data) in chapters:
-                    in_chapter = True
-                    tokens_within_chapters.append(token)
-                else:
-                    in_chapter = False
-            elif in_chapter:
+
+def filter_tokens_by_chapter(
+    tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
+) -> Sequence[UsfmToken]:
+    if chapters is None:
+        return tokens
+    tokens_within_chapters: List[UsfmToken] = []
+    in_chapter: bool = False
+    for index, token in enumerate(tokens):
+        if index == 0 and token.marker == "id":
+            tokens_within_chapters.append(token)
+            if 1 in chapters:
+                in_chapter = True
+        elif token.type == UsfmTokenType.CHAPTER:
+            if token.data and int(token.data) in chapters:
+                in_chapter = True
                 tokens_within_chapters.append(token)
-        return tokens_within_chapters
+            else:
+                in_chapter = False
+        elif in_chapter:
+            tokens_within_chapters.append(token)
+    return tokens_within_chapters
diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py
index a9c1cdc1..940878bf 100644
--- a/tests/corpora/test_update_usfm_parser_handler.py
+++ b/tests/corpora/test_update_usfm_parser_handler.py
@@ -9,10 +9,12 @@
     UpdateUsfmParserHandler,
     UpdateUsfmRow,
     UpdateUsfmTextBehavior,
+    UsfmParser,
+    UsfmTokenizer,
     UsfmUpdateBlock,
     UsfmUpdateBlockElementType,
     UsfmUpdateBlockHandler,
-    parse_usfm,
+    filter_tokens_by_chapter,
 )
 
 
@@ -1494,6 +1496,31 @@ def test_update_block_footnote_at_start_of_chapter_with_preceding_text():
     )
 
 
+def test_filter_chapters() -> None:
+    usfm = r"""\id MAT - Test
+\h Matthew
+\c 1
+\v 1 Some text
+\v 2
+\v 3 Other text
+\c 2
+\v 1 Some text
+\c 3
+\v 1 Some text
+\c 4
+\v 1 Some text
+"""
+    chapters = [2, 4]
+    target = update_usfm(chapters=chapters, source=usfm)
+    result = r"""\id MAT
+\c 2
+\v 1 Some text
+\c 4
+\v 1 Some text
+"""
+    assert_usfm_equals(target, result)
+
+
 def scr_ref(*refs: str) -> List[ScriptureRef]:
     return [ScriptureRef.parse(ref) for ref in refs]
 
@@ -1501,6 +1528,7 @@ def scr_ref(*refs: str) -> List[ScriptureRef]:
 def update_usfm(
     rows: Optional[Sequence[UpdateUsfmRow]] = None,
     source: Optional[str] = None,
+    chapters: Optional[Sequence[int]] = None,
     id_text: Optional[str] = None,
     text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW,
     paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
@@ -1516,6 +1544,7 @@ def update_usfm(
         return updater.update_usfm(
             "MAT",
             rows,
+            chapters,
             id_text,
             text_behavior,
             paragraph_behavior,
@@ -1542,7 +1571,11 @@ def update_usfm(
             lambda _: False,
             compare_segments,
         )
-        parse_usfm(source, updater)
+        tokenizer = UsfmTokenizer()
+        tokens = tokenizer.tokenize(source)
+        tokens = filter_tokens_by_chapter(tokens, chapters)
+        parser = UsfmParser(tokens, updater)
+        parser.process_tokens()
         return updater.get_usfm()
 
 

From 1e2e99956e51c0620fb4b2876516d8498e902692 Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Thu, 16 Apr 2026 10:02:53 -0400
Subject: [PATCH 04/14] make sure all text in \id is included

---
 machine/corpora/paratext_project_text_updater_base.py | 9 ++++++---
 tests/corpora/test_update_usfm_parser_handler.py      | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index a32bdb99..77e4ec3a 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -85,17 +85,20 @@ def filter_tokens_by_chapter(
         return tokens
     tokens_within_chapters: List[UsfmToken] = []
     in_chapter: bool = False
+    in_id_marker: bool = False
     for index, token in enumerate(tokens):
         if index == 0 and token.marker == "id":
-            tokens_within_chapters.append(token)
+            in_id_marker = True
             if 1 in chapters:
                 in_chapter = True
+        elif in_id_marker and token.marker is not None and token.marker != "id":
+            in_id_marker = False
         elif token.type == UsfmTokenType.CHAPTER:
             if token.data and int(token.data) in chapters:
                 in_chapter = True
-                tokens_within_chapters.append(token)
             else:
                 in_chapter = False
-        elif in_chapter:
+
+        if in_id_marker or in_chapter:
             tokens_within_chapters.append(token)
     return tokens_within_chapters
diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py
index 940878bf..1505444b 100644
--- a/tests/corpora/test_update_usfm_parser_handler.py
+++ b/tests/corpora/test_update_usfm_parser_handler.py
@@ -1512,7 +1512,7 @@ def test_filter_chapters() -> None:
 """
     chapters = [2, 4]
     target = update_usfm(chapters=chapters, source=usfm)
-    result = r"""\id MAT
+    result = r"""\id MAT - Test
 \c 2
 \v 1 Some text
 \c 4

From 707119c38bb2038fa7c6c19b3a4f178b3a9a0c0a Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Thu, 16 Apr 2026 11:19:18 -0400
Subject: [PATCH 05/14] update remark test and ensure remarks are added at the
 end of existing chapter remarks

---
 machine/corpora/update_usfm_parser_handler.py    |  7 ++++++-
 tests/corpora/test_update_usfm_parser_handler.py | 13 +++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py
index 78fd85f4..5317a85b 100644
--- a/machine/corpora/update_usfm_parser_handler.py
+++ b/machine/corpora/update_usfm_parser_handler.py
@@ -347,7 +347,12 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
             if len(tokens) > 0:
                 for index, token in enumerate(tokens):
                     if token.type == UsfmTokenType.CHAPTER:
-                        tokens[index + 1 : index + 1] = remark_tokens
+                        insertion_index = index + 1
+                        while insertion_index < len(tokens) and tokens[insertion_index].marker == "rem":
+                            insertion_index += 1
+                            if insertion_index < len(tokens) and tokens[insertion_index].type == UsfmTokenType.TEXT:
+                                insertion_index += 1
+                        tokens[insertion_index:insertion_index] = remark_tokens
         return tokenizer.detokenize(tokens)
 
     def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py
index 1505444b..e896e21b 100644
--- a/tests/corpora/test_update_usfm_parser_handler.py
+++ b/tests/corpora/test_update_usfm_parser_handler.py
@@ -1389,17 +1389,22 @@ def test_pass_remark():
 \v 1 Some text
 \v 2
 \v 3 Other text
+\c 2
+\v 1 More text
 """
 
     target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark"])
     result = r"""\id MAT - Test
 \ide UTF-8
 \rem Existing remark
-\rem New remark
 \c 1
+\rem New remark
 \v 1 Some text
 \v 2 Update 2
 \v 3 Other text
+\c 2
+\rem New remark
+\v 1 More text
 """
 
     assert_usfm_equals(target, result)
@@ -1408,12 +1413,16 @@ def test_pass_remark():
     result = r"""\id MAT - Test
 \ide UTF-8
 \rem Existing remark
+\c 1
 \rem New remark
 \rem New remark 2
-\c 1
 \v 1 Some text
 \v 2 Update 2
 \v 3 Other text
+\c 2
+\rem New remark
+\rem New remark 2
+\v 1 More text
 """
 
     assert_usfm_equals(target, result)

From e1865ea7afd4405b87e78b7ef1bc78df0246511c Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Thu, 16 Apr 2026 11:57:28 -0400
Subject: [PATCH 06/14] add test case for including chapter 1 and header
 information

---
 .../test_update_usfm_parser_handler.py        | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py
index e896e21b..90ba2bba 100644
--- a/tests/corpora/test_update_usfm_parser_handler.py
+++ b/tests/corpora/test_update_usfm_parser_handler.py
@@ -1530,6 +1530,34 @@ def test_filter_chapters() -> None:
     assert_usfm_equals(target, result)
 
 
+def test_filter_chapters_with_chapter_1_and_header() -> None:
+    usfm = r"""\id MAT - Test
+\h Matthew
+\c 1
+\v 1 Some text
+\v 2
+\v 3 Other text
+\c 2
+\v 1 Some text
+\c 3
+\v 1 Some text
+\c 4
+\v 1 Some text
+"""
+    chapters = [1, 3]
+    target = update_usfm(chapters=chapters, source=usfm)
+    result = r"""\id MAT - Test
+\h Matthew
+\c 1
+\v 1 Some text
+\v 2
+\v 3 Other text
+\c 3
+\v 1 Some text
+"""
+    assert_usfm_equals(target, result)
+
+
 def scr_ref(*refs: str) -> List[ScriptureRef]:
     return [ScriptureRef.parse(ref) for ref in refs]
 

From 4a8fb5052c9e6a767ec7204c853af89e1d90b6f8 Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Mon, 27 Apr 2026 15:13:25 -0400
Subject: [PATCH 07/14] support both book-level and chapter-level remarks

---
 .../paratext_project_text_updater_base.py     |  4 +-
 machine/corpora/update_usfm_parser_handler.py | 49 +++++++++++-----
 .../test_update_usfm_parser_handler.py        | 57 ++++++++++++++-----
 3 files changed, 81 insertions(+), 29 deletions(-)

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index 77e4ec3a..526f5d50 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -1,5 +1,5 @@
 from abc import ABC
-from typing import Callable, Iterable, List, Optional, Sequence, Union
+from typing import Callable, Iterable, List, Optional, Sequence, Tuple, Union
 
 from .paratext_project_file_handler import ParatextProjectFileHandler
 from .paratext_project_settings import ParatextProjectSettings
@@ -40,7 +40,7 @@ def update_usfm(
         style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
         preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
         update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
-        remarks: Optional[Iterable[str]] = None,
+        remarks: Optional[Iterable[Tuple[int, str]]] = None,
         error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
         compare_segments: bool = False,
     ) -> Optional[str]:
diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py
index 5317a85b..fe405a64 100644
--- a/machine/corpora/update_usfm_parser_handler.py
+++ b/machine/corpora/update_usfm_parser_handler.py
@@ -53,7 +53,7 @@ def __init__(
         style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
         preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
         update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
-        remarks: Optional[Iterable[str]] = None,
+        remarks: Optional[Iterable[Tuple[int, str]]] = None,
         error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
         compare_segments: bool = False,
     ) -> None:
@@ -340,19 +340,42 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
         tokenizer = UsfmTokenizer(stylesheet)
         tokens = list(self._tokens)
         if len(self._remarks) > 0:
-            remark_tokens: List[UsfmToken] = []
-            for remark in self._remarks:
-                remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
-                remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
+            remark_tokens_by_chapter: Dict[int, List[UsfmToken]] = {}
+            for chapter_num, remark in self._remarks:
+                chapter_tokens = remark_tokens_by_chapter.setdefault(chapter_num, [])
+                chapter_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
+                chapter_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
             if len(tokens) > 0:
-                for index, token in enumerate(tokens):
-                    if token.type == UsfmTokenType.CHAPTER:
-                        insertion_index = index + 1
-                        while insertion_index < len(tokens) and tokens[insertion_index].marker == "rem":
-                            insertion_index += 1
-                            if insertion_index < len(tokens) and tokens[insertion_index].type == UsfmTokenType.TEXT:
-                                insertion_index += 1
-                        tokens[insertion_index:insertion_index] = remark_tokens
+                for chapter_num, remark_tokens in remark_tokens_by_chapter.items():
+                    if chapter_num == 0:
+                        index = 0
+                        markers_to_skip = {"id", "ide", "rem"}
+                    else:
+                        index = next(
+                            (
+                                i
+                                for i, token in enumerate(tokens)
+                                if token.type == UsfmTokenType.CHAPTER
+                                and token.data is not None
+                                and str(token.data).isdigit()
+                                and int(token.data) == chapter_num
+                            ),
+                            -1,
+                        )
+                        if index == -1:
+                            continue
+                        index += 1
+                        markers_to_skip = {"rem"}
+
+                    if index >= len(tokens):
+                        tokens.extend(remark_tokens)
+                    else:
+                        while index < len(tokens) and tokens[index].marker in markers_to_skip:
+                            index += 1
+                            if index < len(tokens) and tokens[index].type == UsfmTokenType.TEXT:
+                                index += 1
+
+                        tokens[index:index] = remark_tokens
         return tokenizer.detokenize(tokens)
 
     def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py
index 90ba2bba..b4bc2c2c 100644
--- a/tests/corpora/test_update_usfm_parser_handler.py
+++ b/tests/corpora/test_update_usfm_parser_handler.py
@@ -1,4 +1,4 @@
-from typing import Iterable, List, Optional, Sequence, Union
+from typing import Iterable, List, Optional, Sequence, Tuple, Union
 
 from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, ignore_line_endings
 
@@ -1390,41 +1390,70 @@ def test_pass_remark():
 \v 2
 \v 3 Other text
 \c 2
+\rem Existing remark
 \v 1 More text
+\c 3
+\v 1 Additional text
 """
 
-    target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark"])
+    target = update_usfm(
+        rows,
+        usfm,
+        text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING,
+        remarks=[(0, "New remark 0"), (1, "New remark 1"), (2, "New remark 2")],
+    )
     result = r"""\id MAT - Test
 \ide UTF-8
 \rem Existing remark
+\rem New remark 0
 \c 1
-\rem New remark
+\rem New remark 1
 \v 1 Some text
 \v 2 Update 2
 \v 3 Other text
 \c 2
-\rem New remark
+\rem Existing remark
+\rem New remark 2
 \v 1 More text
+\c 3
+\v 1 Additional text
 """
 
     assert_usfm_equals(target, result)
 
-    target = update_usfm(rows, target, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark 2"])
+
+def test_pass_remark_0_no_existing_remark():
+    rows = [
+        UpdateUsfmRow(
+            scr_ref("MAT 1:1"),
+            "Update 1",
+        ),
+        UpdateUsfmRow(
+            scr_ref("MAT 1:2"),
+            "Update 2",
+        ),
+    ]
+    usfm = r"""\id MAT - Test
+\ide UTF-8
+\c 1
+\v 1 Some text
+\v 2
+\v 3 Other text
+"""
+    target = update_usfm(
+        rows,
+        usfm,
+        text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING,
+        remarks=[(0, "New remark 0")],
+    )
     result = r"""\id MAT - Test
 \ide UTF-8
-\rem Existing remark
+\rem New remark 0
 \c 1
-\rem New remark
-\rem New remark 2
 \v 1 Some text
 \v 2 Update 2
 \v 3 Other text
-\c 2
-\rem New remark
-\rem New remark 2
-\v 1 More text
 """
-
     assert_usfm_equals(target, result)
 
 
@@ -1573,7 +1602,7 @@ def update_usfm(
     style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
     preserve_paragraph_styles: Optional[Iterable[str]] = None,
     update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
-    remarks: Optional[Iterable[str]] = None,
+    remarks: Optional[Iterable[Tuple[int, str]]] = None,
     compare_segments: bool = False,
 ) -> Optional[str]:
     if source is None:

From 74dcf8f8b80aad0933463fe5d049d3b3f6717496 Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Mon, 27 Apr 2026 15:18:22 -0400
Subject: [PATCH 08/14] add test case for multiple remarks for the same chapter

---
 .../test_update_usfm_parser_handler.py        | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py
index b4bc2c2c..9e053f46 100644
--- a/tests/corpora/test_update_usfm_parser_handler.py
+++ b/tests/corpora/test_update_usfm_parser_handler.py
@@ -1457,6 +1457,48 @@ def test_pass_remark_0_no_existing_remark():
     assert_usfm_equals(target, result)
 
 
+def test_pass_multiple_remarks_same_chapter() -> None:
+    rows = [
+        UpdateUsfmRow(
+            scr_ref("MAT 1:1"),
+            "Update 1",
+        ),
+        UpdateUsfmRow(
+            scr_ref("MAT 1:2"),
+            "Update 2",
+        ),
+    ]
+    usfm = r"""\id MAT - Test
+\ide UTF-8
+\rem Existing remark
+\c 1
+\v 1 Some text
+\v 2
+\v 3 Other text
+"""
+
+    target = update_usfm(
+        rows,
+        usfm,
+        text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING,
+        remarks=[(0, "New remark 0.1"), (0, "New remark 0.2"), (1, "New remark 1.1"), (1, "New remark 1.2")],
+    )
+    result = r"""\id MAT - Test
+\ide UTF-8
+\rem Existing remark
+\rem New remark 0.1
+\rem New remark 0.2
+\c 1
+\rem New remark 1.1
+\rem New remark 1.2
+\v 1 Some text
+\v 2 Update 2
+\v 3 Other text
+"""
+
+    assert_usfm_equals(target, result)
+
+
 def test_update_block_footnote_in_published_chapter_number():
     rows = [UpdateUsfmRow(scr_ref("ESG 1:0/2:s"), "Update 1")]
     usfm = r"""\id ESG - Test

From ef9041f9de643ed8b33b4d01e60624ce39fd8a86 Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Mon, 27 Apr 2026 17:59:32 -0400
Subject: [PATCH 09/14] add filter_tokens_by_chapter to corpora/__init__.py

---
 machine/corpora/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py
index 7cbc2889..4a0817a6 100644
--- a/machine/corpora/__init__.py
+++ b/machine/corpora/__init__.py
@@ -115,6 +115,7 @@
     "FileParatextProjectTermsParser",
     "FileParatextProjectTextUpdater",
     "FileParatextProjectVersificationErrorDetector",
+    "filter_tokens_by_chapter",
     "flatten",
     "is_scripture",
     "KeyTerm",

From d9ecd2ac29a97591e95cefdee388dca126a5d0f9 Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Mon, 27 Apr 2026 18:12:03 -0400
Subject: [PATCH 10/14] cover edge case of chapter as last marker

---
 tests/corpora/test_update_usfm_parser_handler.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py
index 9e053f46..dcc76ea0 100644
--- a/tests/corpora/test_update_usfm_parser_handler.py
+++ b/tests/corpora/test_update_usfm_parser_handler.py
@@ -1393,14 +1393,13 @@ def test_pass_remark():
 \rem Existing remark
 \v 1 More text
 \c 3
-\v 1 Additional text
 """
 
     target = update_usfm(
         rows,
         usfm,
         text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING,
-        remarks=[(0, "New remark 0"), (1, "New remark 1"), (2, "New remark 2")],
+        remarks=[(0, "New remark 0"), (1, "New remark 1"), (2, "New remark 2"), (3, "New remark 3")],
     )
     result = r"""\id MAT - Test
 \ide UTF-8
@@ -1416,7 +1415,7 @@ def test_pass_remark():
 \rem New remark 2
 \v 1 More text
 \c 3
-\v 1 Additional text
+\rem New remark 3
 """
 
     assert_usfm_equals(target, result)

From e39f94c478e4de7cc0292ff047c75e1613066cc7 Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Mon, 27 Apr 2026 18:39:55 -0400
Subject: [PATCH 11/14] handle malformed chapter numbers

---
 .../paratext_project_text_updater_base.py     |  4 +++-
 .../test_update_usfm_parser_handler.py        | 24 +++++++++++++++++--
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index 526f5d50..fc13d7b9 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -14,6 +14,7 @@
 from .usfm_token import UsfmTokenType
 from .usfm_tokenizer import UsfmToken, UsfmTokenizer
 from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError
+from ..utils.string_utils import parse_integer
 
 
 class ParatextProjectTextUpdaterBase(ABC):
@@ -94,7 +95,8 @@ def filter_tokens_by_chapter(
         elif in_id_marker and token.marker is not None and token.marker != "id":
             in_id_marker = False
         elif token.type == UsfmTokenType.CHAPTER:
-            if token.data and int(token.data) in chapters:
+            chapter_num = parse_integer(token.data) if token.data else None
+            if chapter_num is not None and chapter_num in chapters:
                 in_chapter = True
             else:
                 in_chapter = False
diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py
index dcc76ea0..3f08a3cf 100644
--- a/tests/corpora/test_update_usfm_parser_handler.py
+++ b/tests/corpora/test_update_usfm_parser_handler.py
@@ -1389,7 +1389,7 @@ def test_pass_remark():
 \v 1 Some text
 \v 2
 \v 3 Other text
-\c 2
+\c 2.
 \rem Existing remark
 \v 1 More text
 \c 3
@@ -1410,7 +1410,7 @@ def test_pass_remark():
 \v 1 Some text
 \v 2 Update 2
 \v 3 Other text
-\c 2
+\c 2.
 \rem Existing remark
 \rem New remark 2
 \v 1 More text
@@ -1628,6 +1628,26 @@ def test_filter_chapters_with_chapter_1_and_header() -> None:
     assert_usfm_equals(target, result)
 
 
+def test_filter_chapters_with_bad_chapter_reference() -> None:
+    usfm = r"""\id MAT - Test
+\c 1.
+\v 1 Some text
+\c 2.
+\v 1 Some text
+\c 3
+\v 1 Some text with good chapter reference
+\c 4
+\v 1 Some text with good chapter reference
+"""
+    chapters = [2, 4]
+    target = update_usfm(chapters=chapters, source=usfm)
+    result = r"""\id MAT - Test
+\c 4
+\v 1 Some text with good chapter reference
+"""
+    assert_usfm_equals(target, result)
+
+
 def scr_ref(*refs: str) -> List[ScriptureRef]:
     return [ScriptureRef.parse(ref) for ref in refs]
 

From 9cfd578ad3994ccded4df6b28e9a5e4130f52c72 Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Mon, 27 Apr 2026 18:47:58 -0400
Subject: [PATCH 12/14] use parse_usfm; fix pass_remark test

---
 machine/corpora/paratext_project_text_updater_base.py | 7 +++----
 machine/corpora/usfm_parser.py                        | 2 +-
 tests/corpora/test_update_usfm_parser_handler.py      | 9 ++++-----
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index fc13d7b9..7eee05fc 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -1,6 +1,7 @@
 from abc import ABC
 from typing import Callable, Iterable, List, Optional, Sequence, Tuple, Union
 
+from ..utils.string_utils import parse_integer
 from .paratext_project_file_handler import ParatextProjectFileHandler
 from .paratext_project_settings import ParatextProjectSettings
 from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
@@ -10,11 +11,10 @@
     UpdateUsfmRow,
     UpdateUsfmTextBehavior,
 )
-from .usfm_parser import UsfmParser
+from .usfm_parser import parse_usfm
 from .usfm_token import UsfmTokenType
 from .usfm_tokenizer import UsfmToken, UsfmTokenizer
 from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError
-from ..utils.string_utils import parse_integer
 
 
 class ParatextProjectTextUpdaterBase(ABC):
@@ -67,8 +67,7 @@ def update_usfm(
             tokenizer = UsfmTokenizer(self._settings.stylesheet)
             tokens = tokenizer.tokenize(usfm)
             tokens = filter_tokens_by_chapter(tokens, chapters)
-            parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification)
-            parser.process_tokens()
+            parse_usfm(tokens, handler, self._settings.stylesheet, self._settings.versification)
             return handler.get_usfm(self._settings.stylesheet)
         except Exception as e:
             error_message = (
diff --git a/machine/corpora/usfm_parser.py b/machine/corpora/usfm_parser.py
index a37d5396..28de4aee 100644
--- a/machine/corpora/usfm_parser.py
+++ b/machine/corpora/usfm_parser.py
@@ -17,7 +17,7 @@
 
 
 def parse_usfm(
-    usfm: str,
+    usfm: Union[str, Sequence[UsfmToken]],
     handler: UsfmParserHandler,
     stylesheet: Union[StrPath, UsfmStylesheet] = "usfm.sty",
     versification: Optional[Versification] = None,
diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py
index 3f08a3cf..278b795d 100644
--- a/tests/corpora/test_update_usfm_parser_handler.py
+++ b/tests/corpora/test_update_usfm_parser_handler.py
@@ -9,12 +9,12 @@
     UpdateUsfmParserHandler,
     UpdateUsfmRow,
     UpdateUsfmTextBehavior,
-    UsfmParser,
     UsfmTokenizer,
     UsfmUpdateBlock,
     UsfmUpdateBlockElementType,
     UsfmUpdateBlockHandler,
     filter_tokens_by_chapter,
+    parse_usfm,
 )
 
 
@@ -1389,7 +1389,7 @@ def test_pass_remark():
 \v 1 Some text
 \v 2
 \v 3 Other text
-\c 2.
+\c 2
 \rem Existing remark
 \v 1 More text
 \c 3
@@ -1410,7 +1410,7 @@ def test_pass_remark():
 \v 1 Some text
 \v 2 Update 2
 \v 3 Other text
-\c 2.
+\c 2
 \rem Existing remark
 \rem New remark 2
 \v 1 More text
@@ -1701,8 +1701,7 @@ def update_usfm(
         tokenizer = UsfmTokenizer()
         tokens = tokenizer.tokenize(source)
         tokens = filter_tokens_by_chapter(tokens, chapters)
-        parser = UsfmParser(tokens, updater)
-        parser.process_tokens()
+        parse_usfm(tokens, updater)
         return updater.get_usfm()
 
 

From 911df510990aa05aca2d536cec6b58c4c663b3c7 Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Tue, 28 Apr 2026 16:54:49 -0400
Subject: [PATCH 13/14] don't filter if chapters is empty list

---
 machine/corpora/paratext_project_text_updater_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index 7eee05fc..1ef9a1e4 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -81,7 +81,7 @@ def update_usfm(
 def filter_tokens_by_chapter(
     tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
 ) -> Sequence[UsfmToken]:
-    if chapters is None:
+    if not chapters:
         return tokens
     tokens_within_chapters: List[UsfmToken] = []
     in_chapter: bool = False

From cf0ea05cded61ad7c4088215680c4af4a693e447 Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Tue, 28 Apr 2026 17:01:43 -0400
Subject: [PATCH 14/14] revert "don't filter if chapters is empty list"

---
 machine/corpora/paratext_project_text_updater_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index 1ef9a1e4..7eee05fc 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -81,7 +81,7 @@ def update_usfm(
 def filter_tokens_by_chapter(
     tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
 ) -> Sequence[UsfmToken]:
-    if not chapters:
+    if chapters is None:
         return tokens
     tokens_within_chapters: List[UsfmToken] = []
     in_chapter: bool = False