From eb175a5aa7d0de5fd6ca82ae357398144acd7292 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Jul 2026 20:21:08 -0500 Subject: [PATCH 1/7] Speed up docs deployment --- ci/customization/customize_doc.py | 240 +++++++++++++++++------------- ci/download_from_s3.sh | 41 ++++- 2 files changed, 178 insertions(+), 103 deletions(-) diff --git a/ci/customization/customize_doc.py b/ci/customization/customize_doc.py index 2236f2a24e6..3558b0ee9f4 100644 --- a/ci/customization/customize_doc.py +++ b/ci/customization/customize_doc.py @@ -10,7 +10,7 @@ import os import re import sys -from copy import deepcopy +from concurrent.futures import ProcessPoolExecutor import yaml @@ -23,6 +23,44 @@ NVIDIA_STYLE_TAG_ID = "nvidia-selector-css" FA_TAG_ID = "rapids-fa-tag" +LIB_PATH_DICT = None +PROJECT_TO_VERSIONS_DICT = None +SELECTOR_PROJECT_NAMES = None + + +def initialize_worker( + lib_path_dict: dict, + project_to_versions_dict: dict, + selector_project_names: set[str], +) -> None: + """Initializes read-only configuration used by each worker process.""" + global LIB_PATH_DICT, PROJECT_TO_VERSIONS_DICT, SELECTOR_PROJECT_NAMES + LIB_PATH_DICT = lib_path_dict + PROJECT_TO_VERSIONS_DICT = project_to_versions_dict + SELECTOR_PROJECT_NAMES = selector_project_names + + +def customize_manifest_file(filepath: str) -> None: + """Customizes one HTML file from the generated manifest.""" + project_name = get_lib_from_fp( + lib_path_dict=LIB_PATH_DICT, + filepath=filepath, + ) + main( + filepath=filepath, + lib_path_dict=LIB_PATH_DICT, + project_name=project_name, + versions_dict=PROJECT_TO_VERSIONS_DICT[project_name], + selector_project_names=SELECTOR_PROJECT_NAMES, + ) + + +def process_count() -> int: + """Returns the number of CPUs available to this process, like nproc.""" + if hasattr(os, "sched_getaffinity"): + return max(1, len(os.sched_getaffinity(0))) + return max(1, os.cpu_count() or 1) + def get_version_from_fp(*, filepath: str, versions_dict: dict): """ @@ -225,21 +263,11 @@ def create_pixel_tags(soup): return [head_tag, body_tag] -def uses_nvidia_sphinx_theme(soup) -> bool: - """ - Returns whether the document already uses the NVIDIA Sphinx Theme. - """ - return any( - "nvidia-sphinx-theme" in link.get("href", "") - for link in soup.find_all("link", href=True) - ) - - -def create_css_link_tag(soup): +def create_css_link_tag(soup, *, is_nvidia_theme: bool): """ Creates and returns the stylesheet tag for the injected selectors. """ - if uses_nvidia_sphinx_theme(soup): + if is_nvidia_theme: return soup.new_tag( "link", id=NVIDIA_STYLE_TAG_ID, @@ -253,51 +281,86 @@ def create_css_link_tag(soup): return script_tag -def delete_element(soup, selector): - """ - Deletes element from soup object if it already exists - """ - try: - soup.select(f"{selector}")[0].extract() - except Exception: - pass - - -def delete_rapids_custom_css_links(soup): +def delete_rapids_custom_css_links(links): """ Deletes global RAPIDS custom CSS links from NVIDIA-themed pages. """ - for link in soup.find_all("link", href=True): - if link["href"].endswith("/assets/css/custom.css"): - link.extract() + for link in links: + link.extract() -def delete_existing_elements(soup): +def delete_existing_elements(elements, *, doc_type: str, reference_el): """ Deletes any existing page elements to prevent duplicates on the page """ - doxygen_title_area = "#titlearea > table" - sphinx_home_btn = ".wy-side-nav-search .icon.icon-home" - sphinx_doc_version = ".wy-side-nav-search .version" - existing_jtd_container = "#rapids-jtd-container" - existing_pydata_container = "#rapids-pydata-container" - existing_doxygen_container = "#rapids-doxygen-container" - - for element in [ - existing_jtd_container, - existing_pydata_container, - existing_doxygen_container, - sphinx_doc_version, - sphinx_home_btn, - doxygen_title_area, - f"#{SCRIPT_TAG_ID}", - f"#{STYLE_TAG_ID}", - f"#{NVIDIA_STYLE_TAG_ID}", - f"#{FA_TAG_ID}", - f"#{PIXEL_SRC_TAG_ID}", - f"#{PIXEL_INVOCATION_TAG_ID}", - ]: - delete_element(soup, element) + for element in elements: + element.extract() + + if doc_type == "doxygen": + if table := reference_el.find("table", recursive=False): + table.extract() + + if doc_type == "jtd": + if version := reference_el.find(class_="version"): + version.extract() + if home_button := reference_el.find( + lambda tag: {"icon", "icon-home"}.issubset(tag.get("class", [])) + ): + home_button.extract() + + +def inspect_document(soup, *, filepath: str): + """Collects theme and customization state in one document traversal.""" + removable_ids = { + "rapids-jtd-container", + "rapids-pydata-container", + "rapids-doxygen-container", + SCRIPT_TAG_ID, + STYLE_TAG_ID, + NVIDIA_STYLE_TAG_ID, + FA_TAG_ID, + PIXEL_SRC_TAG_ID, + PIXEL_INVOCATION_TAG_ID, + } + existing_elements = [] + rapids_css_links = [] + references = {} + is_nvidia_theme = False + + for element in soup.find_all(True): + element_id = element.get("id") + if element_id in removable_ids: + existing_elements.append(element) + + classes = element.get("class", []) + if "wy-side-nav-search" in classes and "jtd" not in references: + references["jtd"] = element + elif element_id == "titlearea" and "doxygen" not in references: + references["doxygen"] = element + elif "bd-sidebar" in classes and "pydata" not in references: + references["pydata"] = element + + if element.name == "link" and (href := element.get("href")): + if "nvidia-sphinx-theme" in href: + is_nvidia_theme = True + if element_id not in removable_ids and href.endswith( + "/assets/css/custom.css" + ): + rapids_css_links.append(element) + + for doc_type in ("jtd", "doxygen", "pydata"): + if doc_type in references: + return ( + doc_type, + references[doc_type], + is_nvidia_theme, + existing_elements, + rapids_css_links, + ) + + raise UnsupportedThemeError( + f"Couldn't identify {filepath} as a supported theme type. Skipping file." + ) class UnsupportedThemeError(ValueError): @@ -309,33 +372,6 @@ class UnsupportedThemeError(ValueError): pass -def get_theme_info(soup, *, filepath: str): - """ - Determines what theme a given HTML file is using or exits if it's - not able to be determined. Returns a string identifier and reference element - that is used for inserting the library/version selectors to the doc. - """ - # Sphinx Themes - jtd_identifier = ".wy-side-nav-search" # Just-the-docs theme - pydata_identifier = ".bd-sidebar" # Pydata theme - - # Doxygen - doxygen_identifier = "#titlearea" - - if soup.select(jtd_identifier): - return "jtd", soup.select(jtd_identifier)[0] - - if soup.select(doxygen_identifier): - return "doxygen", soup.select(doxygen_identifier)[0] - - if soup.select(pydata_identifier): - return "pydata", soup.select(pydata_identifier)[0] - - raise UnsupportedThemeError( - f"Couldn't identify {filepath} as a supported theme type. Skipping file." - ) - - def main( *, filepath: str, @@ -355,15 +391,25 @@ def main( soup = BeautifulSoup(fp, "html5lib") try: - doc_type, reference_el = get_theme_info(soup, filepath=filepath) + ( + doc_type, + reference_el, + is_nvidia_theme, + existing_elements, + rapids_css_links, + ) = inspect_document(soup, filepath=filepath) except UnsupportedThemeError as err: print(f"{str(err)}", file=sys.stderr) return # Delete any existing added/unnecessary elements - delete_existing_elements(soup) - if uses_nvidia_sphinx_theme(soup): - delete_rapids_custom_css_links(soup) + delete_existing_elements( + existing_elements, + doc_type=doc_type, + reference_el=reference_el, + ) + if is_nvidia_theme: + delete_rapids_custom_css_links(rapids_css_links) # Add Font Awesome to Doxygen for icons if doc_type == "doxygen": @@ -392,7 +438,7 @@ def main( container = soup.new_tag("div", id=f"rapids-{doc_type}-container") script_tag = create_script_tag(soup) [pix_head_tag, pix_body_tag] = create_pixel_tags(soup) - style_tab = create_css_link_tag(soup) + style_tab = create_css_link_tag(soup, is_nvidia_theme=is_nvidia_theme) # Append elements to container container.append(home_btn_container) @@ -428,21 +474,15 @@ def main( SELECTOR_PROJECT_NAMES = get_selector_project_names(docs_yml_path=DOCS_YML_PATH) with open(MANIFEST_FILEPATH) as manifest_file: - for line in manifest_file: - filepath = line.strip() - - lib_path_dict = deepcopy(LIB_PATH_DICT) - - # determine project name (e.g. 'cudf') - project_name = get_lib_from_fp( - lib_path_dict=lib_path_dict, - filepath=filepath, - ) - - main( - filepath=filepath, - lib_path_dict=lib_path_dict, - project_name=project_name, - versions_dict=deepcopy(PROJECT_TO_VERSIONS_DICT[project_name]), - selector_project_names=SELECTOR_PROJECT_NAMES, - ) + filepaths = [line.strip() for line in manifest_file if line.strip()] + + with ProcessPoolExecutor( + max_workers=process_count(), + initializer=initialize_worker, + initargs=( + LIB_PATH_DICT, + PROJECT_TO_VERSIONS_DICT, + SELECTOR_PROJECT_NAMES, + ), + ) as executor: + list(executor.map(customize_manifest_file, filepaths, chunksize=8)) diff --git a/ci/download_from_s3.sh b/ci/download_from_s3.sh index 37144b54ab9..060b29e2373 100755 --- a/ci/download_from_s3.sh +++ b/ci/download_from_s3.sh @@ -1,5 +1,5 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 # @@ -15,6 +15,9 @@ deployment: ${JEKYLL_DIR}/deployment " export DOCS_BUCKET="rapidsai-docs" +MAX_CONCURRENT_DOWNLOADS=4 +DOWNLOAD_PIDS=() + # Checks that the "_site" directory exists from a Jekyll build. Also ensures # that the directories that are pulled from S3 aren't already present in the # "_site" directory since that could cause problems. @@ -64,6 +67,37 @@ aws_cp() { "${DST}" } +# Starts an S3 copy and waits when the concurrency limit is reached. +start_aws_cp() { + local DST SRC + + SRC=$1 + DST=$2 + + aws_cp "${SRC}" "${DST}" & + DOWNLOAD_PIDS+=("$!") + + if [ "${#DOWNLOAD_PIDS[@]}" -ge "${MAX_CONCURRENT_DOWNLOADS}" ]; then + wait_for_aws_cp + fi +} + +# Waits for the oldest running S3 copy and propagates its failure. +wait_for_aws_cp() { + local PID + + PID=${DOWNLOAD_PIDS[0]} + wait "${PID}" + DOWNLOAD_PIDS=("${DOWNLOAD_PIDS[@]:1}") +} + +# Waits for all remaining S3 copies. +wait_for_all_aws_cp() { + while [ "${#DOWNLOAD_PIDS[@]}" -gt 0 ]; do + wait_for_aws_cp + done +} + # Downloads the RAPIDS libraries' documentation files from S3 and places them # into the "_site/api" folder. download_lib_docs() { @@ -99,7 +133,7 @@ download_lib_docs() { # copy the relevant files from S3 to the local directory SRC="s3://${DOCS_BUCKET}/${PROJECT}/html/${VERSION_NUMBER}/" DST="$(yq -n 'env(GENERATED_DIRS)|.libs')/${PROJECT}/${VERSION_NUMBER}/" - aws_cp "${SRC}" "${DST}" + start_aws_cp "${SRC}" "${DST}" done # for VERSION_NAME done # for PROJECT @@ -115,10 +149,11 @@ download_deployment_docs() { SRC="s3://${DOCS_BUCKET}/deployment/html/${VERSION}/" DST="$(yq -n 'env(GENERATED_DIRS)|.deployment')/${VERSION}/" - aws_cp "${SRC}" "${DST}" + start_aws_cp "${SRC}" "${DST}" done } check_dirs download_lib_docs download_deployment_docs +wait_for_all_aws_cp From d372b503f05273381082639c81687fb34aa1b871 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Jul 2026 20:26:31 -0500 Subject: [PATCH 2/7] Add temporary deployment benchmark job --- .github/workflows/pr.yaml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 6cde6c075a7..656df50bed9 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -20,3 +20,31 @@ jobs: with: persist-credentials: false - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + + benchmark-deploy: + name: Benchmark deployment pipeline (no deploy) + if: github.ref_name == 'pull-request/804' + runs-on: ubuntu-latest + timeout-minutes: 60 + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + persist-credentials: false + - uses: ruby/setup-ruby@6aaa311d81eba98ae12eaffbcb63296ace0efcde # v1.307.0 + - name: Build Jekyll Site + run: | + bundle install + bundle exec jekyll build + - uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + role-to-assume: ${{ vars.AWS_ROLE_ARN }} + aws-region: ${{ vars.AWS_REGION }} + role-duration-seconds: 3600 + - name: Fetch doc files from S3 + run: ci/download_from_s3.sh + - name: Post-process docs + run: ci/post-process.sh From f79e773990830bf2d43e5e85c3b3741b8fa406c8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Jul 2026 20:35:57 -0500 Subject: [PATCH 3/7] Revert "Add temporary deployment benchmark job" This reverts commit d372b503f05273381082639c81687fb34aa1b871. --- .github/workflows/pr.yaml | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 656df50bed9..6cde6c075a7 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -20,31 +20,3 @@ jobs: with: persist-credentials: false - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 - - benchmark-deploy: - name: Benchmark deployment pipeline (no deploy) - if: github.ref_name == 'pull-request/804' - runs-on: ubuntu-latest - timeout-minutes: 60 - permissions: - id-token: write - contents: read - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - fetch-depth: 0 - persist-credentials: false - - uses: ruby/setup-ruby@6aaa311d81eba98ae12eaffbcb63296ace0efcde # v1.307.0 - - name: Build Jekyll Site - run: | - bundle install - bundle exec jekyll build - - uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 - with: - role-to-assume: ${{ vars.AWS_ROLE_ARN }} - aws-region: ${{ vars.AWS_REGION }} - role-duration-seconds: 3600 - - name: Fetch doc files from S3 - run: ci/download_from_s3.sh - - name: Post-process docs - run: ci/post-process.sh From f743fb05ac018141ea63bb447f035bebbaa4548c Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Jul 2026 20:41:35 -0500 Subject: [PATCH 4/7] Reduce deployment logging and increase S3 concurrency --- ci/customization/customize_doc.py | 12 +++++++----- ci/download_from_s3.sh | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/ci/customization/customize_doc.py b/ci/customization/customize_doc.py index 3558b0ee9f4..d073bef9608 100644 --- a/ci/customization/customize_doc.py +++ b/ci/customization/customize_doc.py @@ -168,7 +168,6 @@ def create_version_options( option_href = version_path version_text = f"{version_name} ({version_number_str})" if version_name == doc_version["name"]: - print(f"default version: {version_name}") is_selected = True options.append( {"selected": is_selected, "href": option_href, "text": version_text} @@ -197,7 +196,6 @@ def create_library_options( continue is_selected = False if lib == project_name: - print(f"default lib: {lib}") is_selected = True options.append({"selected": is_selected, "href": option_href, "text": lib}) @@ -385,8 +383,6 @@ def main( parse the file and add library/version selectors and a Home button """ - print(f"--- {filepath} ---") - with open(filepath) as fp: soup = BeautifulSoup(fp, "html5lib") @@ -485,4 +481,10 @@ def main( SELECTOR_PROJECT_NAMES, ), ) as executor: - list(executor.map(customize_manifest_file, filepaths, chunksize=8)) + results = executor.map(customize_manifest_file, filepaths, chunksize=8) + for completed, _ in enumerate(results, start=1): + if completed % 1000 == 0 or completed == len(filepaths): + print( + f"Customized {completed}/{len(filepaths)} HTML files", + flush=True, + ) diff --git a/ci/download_from_s3.sh b/ci/download_from_s3.sh index 060b29e2373..9d260586ffe 100755 --- a/ci/download_from_s3.sh +++ b/ci/download_from_s3.sh @@ -15,7 +15,7 @@ deployment: ${JEKYLL_DIR}/deployment " export DOCS_BUCKET="rapidsai-docs" -MAX_CONCURRENT_DOWNLOADS=4 +MAX_CONCURRENT_DOWNLOADS=8 DOWNLOAD_PIDS=() # Checks that the "_site" directory exists from a Jekyll build. Also ensures From 7c3cf2d38a9f0263e13652fe1003a7cc995ce00b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Jul 2026 20:42:00 -0500 Subject: [PATCH 5/7] Add temporary eight-fetch benchmark job --- .github/workflows/pr.yaml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 6cde6c075a7..656df50bed9 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -20,3 +20,31 @@ jobs: with: persist-credentials: false - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + + benchmark-deploy: + name: Benchmark deployment pipeline (no deploy) + if: github.ref_name == 'pull-request/804' + runs-on: ubuntu-latest + timeout-minutes: 60 + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + persist-credentials: false + - uses: ruby/setup-ruby@6aaa311d81eba98ae12eaffbcb63296ace0efcde # v1.307.0 + - name: Build Jekyll Site + run: | + bundle install + bundle exec jekyll build + - uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + role-to-assume: ${{ vars.AWS_ROLE_ARN }} + aws-region: ${{ vars.AWS_REGION }} + role-duration-seconds: 3600 + - name: Fetch doc files from S3 + run: ci/download_from_s3.sh + - name: Post-process docs + run: ci/post-process.sh From 586b1a9b00425f21e0595ef81669efe85ace0c8b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Jul 2026 20:52:12 -0500 Subject: [PATCH 6/7] Revert "Add temporary eight-fetch benchmark job" This reverts commit 7c3cf2d38a9f0263e13652fe1003a7cc995ce00b. --- .github/workflows/pr.yaml | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 656df50bed9..6cde6c075a7 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -20,31 +20,3 @@ jobs: with: persist-credentials: false - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 - - benchmark-deploy: - name: Benchmark deployment pipeline (no deploy) - if: github.ref_name == 'pull-request/804' - runs-on: ubuntu-latest - timeout-minutes: 60 - permissions: - id-token: write - contents: read - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - fetch-depth: 0 - persist-credentials: false - - uses: ruby/setup-ruby@6aaa311d81eba98ae12eaffbcb63296ace0efcde # v1.307.0 - - name: Build Jekyll Site - run: | - bundle install - bundle exec jekyll build - - uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 - with: - role-to-assume: ${{ vars.AWS_ROLE_ARN }} - aws-region: ${{ vars.AWS_REGION }} - role-duration-seconds: 3600 - - name: Fetch doc files from S3 - run: ci/download_from_s3.sh - - name: Post-process docs - run: ci/post-process.sh From d0be178ff30b8d105c6ec85e7e995633efdb93c2 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Jul 2026 20:57:08 -0500 Subject: [PATCH 7/7] Limit concurrent S3 fetches to four --- ci/download_from_s3.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/download_from_s3.sh b/ci/download_from_s3.sh index 9d260586ffe..060b29e2373 100755 --- a/ci/download_from_s3.sh +++ b/ci/download_from_s3.sh @@ -15,7 +15,7 @@ deployment: ${JEKYLL_DIR}/deployment " export DOCS_BUCKET="rapidsai-docs" -MAX_CONCURRENT_DOWNLOADS=8 +MAX_CONCURRENT_DOWNLOADS=4 DOWNLOAD_PIDS=() # Checks that the "_site" directory exists from a Jekyll build. Also ensures