Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase, filter_tokens_by_chapter
from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase
from .paratext_text_corpus import ParatextTextCorpus
from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
Expand Down Expand Up @@ -115,6 +115,7 @@
"FileParatextProjectTermsParser",
"FileParatextProjectTextUpdater",
"FileParatextProjectVersificationErrorDetector",
"filter_tokens_by_chapter",
"flatten",
"is_scripture",
"KeyTerm",
Expand Down
40 changes: 37 additions & 3 deletions machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from abc import ABC
from typing import Callable, Iterable, Optional, Sequence, Union
from typing import Callable, Iterable, List, Optional, Sequence, Tuple, Union

from ..utils.string_utils import parse_integer
from .paratext_project_file_handler import ParatextProjectFileHandler
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
Expand All @@ -11,6 +12,8 @@
UpdateUsfmTextBehavior,
)
from .usfm_parser import parse_usfm
from .usfm_token import UsfmTokenType
from .usfm_tokenizer import UsfmToken, UsfmTokenizer
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError


Expand All @@ -30,14 +33,15 @@ def update_usfm(
self,
book_id: str,
rows: Optional[Sequence[UpdateUsfmRow]] = None,
chapters: Optional[Sequence[int]] = None,
full_name: Optional[str] = None,
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
remarks: Optional[Iterable[str]] = None,
remarks: Optional[Iterable[Tuple[int, str]]] = None,
error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
compare_segments: bool = False,
) -> Optional[str]:
Expand All @@ -60,7 +64,10 @@ def update_usfm(
compare_segments=compare_segments,
)
try:
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
tokenizer = UsfmTokenizer(self._settings.stylesheet)
tokens = tokenizer.tokenize(usfm)
tokens = filter_tokens_by_chapter(tokens, chapters)
parse_usfm(tokens, handler, self._settings.stylesheet, self._settings.versification)
return handler.get_usfm(self._settings.stylesheet)
except Exception as e:
error_message = (
Expand All @@ -69,3 +76,30 @@ def update_usfm(
f". Error: '{e}'"
)
raise RuntimeError(error_message) from e


def filter_tokens_by_chapter(
    tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
) -> Sequence[UsfmToken]:
    """Return only the tokens that belong to the requested chapters.

    Args:
        tokens: The tokens of a single book, in document order.
        chapters: Chapter numbers to keep. ``None`` disables filtering.

    Returns:
        ``tokens`` itself when ``chapters`` is ``None``; otherwise a new list
        containing, in their original order:

        * the leading ``id`` token and every token that follows it up to (but
          not including) the next token that carries a different marker;
        * when ``1`` is in ``chapters`` and the book starts with an ``id``
          token, everything from the start up to the first chapter token;
        * every chapter token whose number is in ``chapters`` together with
          all tokens up to the next chapter token.

        A chapter token whose data is empty, or for which ``parse_integer``
        yields ``None``, is treated as not requested.
    """
    if chapters is None:
        return tokens

    # Build the lookup once instead of scanning the sequence per chapter token.
    wanted_chapters = frozenset(chapters)
    kept_tokens: List[UsfmToken] = []
    in_chapter = False
    in_id_marker = False
    for index, token in enumerate(tokens):
        if index == 0 and token.marker == "id":
            in_id_marker = True
            # When chapter 1 is requested, keep everything from the start of
            # the book up to the first chapter token.
            in_chapter = 1 in wanted_chapters
        elif in_id_marker and token.marker is not None and token.marker != "id":
            # The first token carrying another marker ends the id content.
            in_id_marker = False

        # Deliberately NOT part of the if/elif chain above: a chapter token may
        # be the very token that ends the id content, and its chapter number
        # must still decide whether the following tokens are kept.
        if token.type == UsfmTokenType.CHAPTER:
            chapter_num = parse_integer(token.data) if token.data else None
            in_chapter = chapter_num is not None and chapter_num in wanted_chapters

        if in_id_marker or in_chapter:
            kept_tokens.append(token)
    return kept_tokens
47 changes: 35 additions & 12 deletions machine/corpora/update_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __init__(
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
remarks: Optional[Iterable[str]] = None,
remarks: Optional[Iterable[Tuple[int, str]]] = None,
error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
compare_segments: bool = False,
) -> None:
Expand Down Expand Up @@ -340,19 +340,42 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
tokenizer = UsfmTokenizer(stylesheet)
tokens = list(self._tokens)
if len(self._remarks) > 0:
remark_tokens: List[UsfmToken] = []
for remark in self._remarks:
remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
remark_tokens_by_chapter: Dict[int, List[UsfmToken]] = {}
for chapter_num, remark in self._remarks:
chapter_tokens = remark_tokens_by_chapter.setdefault(chapter_num, [])
chapter_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
chapter_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
if len(tokens) > 0:
index = 0
markers_to_skip = {"id", "ide", "rem"}
while tokens[index].marker in markers_to_skip:
index += 1
if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT:
for chapter_num, remark_tokens in remark_tokens_by_chapter.items():
if chapter_num == 0:
index = 0
markers_to_skip = {"id", "ide", "rem"}
else:
index = next(
(
i
for i, token in enumerate(tokens)
if token.type == UsfmTokenType.CHAPTER
and token.data is not None
and str(token.data).isdigit()
and int(token.data) == chapter_num
),
-1,
)
if index == -1:
continue
index += 1
for remark_token in reversed(remark_tokens):
tokens.insert(index, remark_token)
markers_to_skip = {"rem"}

if index >= len(tokens):
tokens.extend(remark_tokens)
else:
while index < len(tokens) and tokens[index].marker in markers_to_skip:
index += 1
if index < len(tokens) and tokens[index].type == UsfmTokenType.TEXT:
index += 1

tokens[index:index] = remark_tokens
return tokenizer.detokenize(tokens)

def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
Expand Down
2 changes: 1 addition & 1 deletion machine/corpora/usfm_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@


def parse_usfm(
usfm: str,
usfm: Union[str, Sequence[UsfmToken]],
handler: UsfmParserHandler,
stylesheet: Union[StrPath, UsfmStylesheet] = "usfm.sty",
versification: Optional[Versification] = None,
Expand Down
Loading
Loading