diff --git a/ci/customization/customize_doc.py b/ci/customization/customize_doc.py index 2236f2a24e6..d073bef9608 100644 --- a/ci/customization/customize_doc.py +++ b/ci/customization/customize_doc.py @@ -10,7 +10,7 @@ import os import re import sys -from copy import deepcopy +from concurrent.futures import ProcessPoolExecutor import yaml @@ -23,6 +23,44 @@ NVIDIA_STYLE_TAG_ID = "nvidia-selector-css" FA_TAG_ID = "rapids-fa-tag" +LIB_PATH_DICT = None +PROJECT_TO_VERSIONS_DICT = None +SELECTOR_PROJECT_NAMES = None + + +def initialize_worker( + lib_path_dict: dict, + project_to_versions_dict: dict, + selector_project_names: set[str], +) -> None: + """Initializes read-only configuration used by each worker process.""" + global LIB_PATH_DICT, PROJECT_TO_VERSIONS_DICT, SELECTOR_PROJECT_NAMES + LIB_PATH_DICT = lib_path_dict + PROJECT_TO_VERSIONS_DICT = project_to_versions_dict + SELECTOR_PROJECT_NAMES = selector_project_names + + +def customize_manifest_file(filepath: str) -> None: + """Customizes one HTML file from the generated manifest.""" + project_name = get_lib_from_fp( + lib_path_dict=LIB_PATH_DICT, + filepath=filepath, + ) + main( + filepath=filepath, + lib_path_dict=LIB_PATH_DICT, + project_name=project_name, + versions_dict=PROJECT_TO_VERSIONS_DICT[project_name], + selector_project_names=SELECTOR_PROJECT_NAMES, + ) + + +def process_count() -> int: + """Returns the number of CPUs available to this process, like nproc.""" + if hasattr(os, "sched_getaffinity"): + return max(1, len(os.sched_getaffinity(0))) + return max(1, os.cpu_count() or 1) + def get_version_from_fp(*, filepath: str, versions_dict: dict): """ @@ -130,7 +168,6 @@ def create_version_options( option_href = version_path version_text = f"{version_name} ({version_number_str})" if version_name == doc_version["name"]: - print(f"default version: {version_name}") is_selected = True options.append( {"selected": is_selected, "href": option_href, "text": version_text} @@ -159,7 +196,6 @@ def create_library_options( continue is_selected = False if lib == project_name: - print(f"default lib: {lib}") is_selected = True options.append({"selected": is_selected, "href": option_href, "text": lib}) @@ -225,21 +261,11 @@ def create_pixel_tags(soup): return [head_tag, body_tag] -def uses_nvidia_sphinx_theme(soup) -> bool: - """ - Returns whether the document already uses the NVIDIA Sphinx Theme. - """ - return any( - "nvidia-sphinx-theme" in link.get("href", "") - for link in soup.find_all("link", href=True) - ) - - -def create_css_link_tag(soup): +def create_css_link_tag(soup, *, is_nvidia_theme: bool): """ Creates and returns the stylesheet tag for the injected selectors. """ - if uses_nvidia_sphinx_theme(soup): + if is_nvidia_theme: return soup.new_tag( "link", id=NVIDIA_STYLE_TAG_ID, @@ -253,51 +279,86 @@ def create_css_link_tag(soup): return script_tag -def delete_element(soup, selector): - """ - Deletes element from soup object if it already exists - """ - try: - soup.select(f"{selector}")[0].extract() - except Exception: - pass - - -def delete_rapids_custom_css_links(soup): +def delete_rapids_custom_css_links(links): """ Deletes global RAPIDS custom CSS links from NVIDIA-themed pages. """ - for link in soup.find_all("link", href=True): - if link["href"].endswith("/assets/css/custom.css"): - link.extract() + for link in links: + link.extract() -def delete_existing_elements(soup): +def delete_existing_elements(elements, *, doc_type: str, reference_el): """ Deletes any existing page elements to prevent duplicates on the page """ - doxygen_title_area = "#titlearea > table" - sphinx_home_btn = ".wy-side-nav-search .icon.icon-home" - sphinx_doc_version = ".wy-side-nav-search .version" - existing_jtd_container = "#rapids-jtd-container" - existing_pydata_container = "#rapids-pydata-container" - existing_doxygen_container = "#rapids-doxygen-container" - - for element in [ - existing_jtd_container, - existing_pydata_container, - existing_doxygen_container, - sphinx_doc_version, - sphinx_home_btn, - doxygen_title_area, - f"#{SCRIPT_TAG_ID}", - f"#{STYLE_TAG_ID}", - f"#{NVIDIA_STYLE_TAG_ID}", - f"#{FA_TAG_ID}", - f"#{PIXEL_SRC_TAG_ID}", - f"#{PIXEL_INVOCATION_TAG_ID}", - ]: - delete_element(soup, element) + for element in elements: + element.extract() + + if doc_type == "doxygen": + if table := reference_el.find("table", recursive=False): + table.extract() + + if doc_type == "jtd": + if version := reference_el.find(class_="version"): + version.extract() + if home_button := reference_el.find( + lambda tag: {"icon", "icon-home"}.issubset(tag.get("class", [])) + ): + home_button.extract() + + +def inspect_document(soup, *, filepath: str): + """Collects theme and customization state in one document traversal.""" + removable_ids = { + "rapids-jtd-container", + "rapids-pydata-container", + "rapids-doxygen-container", + SCRIPT_TAG_ID, + STYLE_TAG_ID, + NVIDIA_STYLE_TAG_ID, + FA_TAG_ID, + PIXEL_SRC_TAG_ID, + PIXEL_INVOCATION_TAG_ID, + } + existing_elements = [] + rapids_css_links = [] + references = {} + is_nvidia_theme = False + + for element in soup.find_all(True): + element_id = element.get("id") + if element_id in removable_ids: + existing_elements.append(element) + + classes = element.get("class", []) + if "wy-side-nav-search" in classes and "jtd" not in references: + references["jtd"] = element + elif element_id == "titlearea" and "doxygen" not in references: + references["doxygen"] = element + elif "bd-sidebar" in classes and "pydata" not in references: + references["pydata"] = element + + if element.name == "link" and (href := element.get("href")): + if "nvidia-sphinx-theme" in href: + is_nvidia_theme = True + if element_id not in removable_ids and href.endswith( + "/assets/css/custom.css" + ): + rapids_css_links.append(element) + + for doc_type in ("jtd", "doxygen", "pydata"): + if doc_type in references: + return ( + doc_type, + references[doc_type], + is_nvidia_theme, + existing_elements, + rapids_css_links, + ) + + raise UnsupportedThemeError( + f"Couldn't identify {filepath} as a supported theme type. Skipping file." + ) class UnsupportedThemeError(ValueError): @@ -309,33 +370,6 @@ class UnsupportedThemeError(ValueError): pass -def get_theme_info(soup, *, filepath: str): - """ - Determines what theme a given HTML file is using or exits if it's - not able to be determined. Returns a string identifier and reference element - that is used for inserting the library/version selectors to the doc. - """ - # Sphinx Themes - jtd_identifier = ".wy-side-nav-search" # Just-the-docs theme - pydata_identifier = ".bd-sidebar" # Pydata theme - - # Doxygen - doxygen_identifier = "#titlearea" - - if soup.select(jtd_identifier): - return "jtd", soup.select(jtd_identifier)[0] - - if soup.select(doxygen_identifier): - return "doxygen", soup.select(doxygen_identifier)[0] - - if soup.select(pydata_identifier): - return "pydata", soup.select(pydata_identifier)[0] - - raise UnsupportedThemeError( - f"Couldn't identify {filepath} as a supported theme type. Skipping file." - ) - - def main( *, filepath: str, @@ -349,21 +383,29 @@ def main( parse the file and add library/version selectors and a Home button """ - print(f"--- {filepath} ---") - with open(filepath) as fp: soup = BeautifulSoup(fp, "html5lib") try: - doc_type, reference_el = get_theme_info(soup, filepath=filepath) + ( + doc_type, + reference_el, + is_nvidia_theme, + existing_elements, + rapids_css_links, + ) = inspect_document(soup, filepath=filepath) except UnsupportedThemeError as err: print(f"{str(err)}", file=sys.stderr) return # Delete any existing added/unnecessary elements - delete_existing_elements(soup) - if uses_nvidia_sphinx_theme(soup): - delete_rapids_custom_css_links(soup) + delete_existing_elements( + existing_elements, + doc_type=doc_type, + reference_el=reference_el, + ) + if is_nvidia_theme: + delete_rapids_custom_css_links(rapids_css_links) # Add Font Awesome to Doxygen for icons if doc_type == "doxygen": @@ -392,7 +434,7 @@ def main( container = soup.new_tag("div", id=f"rapids-{doc_type}-container") script_tag = create_script_tag(soup) [pix_head_tag, pix_body_tag] = create_pixel_tags(soup) - style_tab = create_css_link_tag(soup) + style_tab = create_css_link_tag(soup, is_nvidia_theme=is_nvidia_theme) # Append elements to container container.append(home_btn_container) @@ -428,21 +470,21 @@ def main( SELECTOR_PROJECT_NAMES = get_selector_project_names(docs_yml_path=DOCS_YML_PATH) with open(MANIFEST_FILEPATH) as manifest_file: - for line in manifest_file: - filepath = line.strip() - - lib_path_dict = deepcopy(LIB_PATH_DICT) - - # determine project name (e.g. 'cudf') - project_name = get_lib_from_fp( - lib_path_dict=lib_path_dict, - filepath=filepath, - ) - - main( - filepath=filepath, - lib_path_dict=lib_path_dict, - project_name=project_name, - versions_dict=deepcopy(PROJECT_TO_VERSIONS_DICT[project_name]), - selector_project_names=SELECTOR_PROJECT_NAMES, - ) + filepaths = [line.strip() for line in manifest_file if line.strip()] + + with ProcessPoolExecutor( + max_workers=process_count(), + initializer=initialize_worker, + initargs=( + LIB_PATH_DICT, + PROJECT_TO_VERSIONS_DICT, + SELECTOR_PROJECT_NAMES, + ), + ) as executor: + results = executor.map(customize_manifest_file, filepaths, chunksize=8) + for completed, _ in enumerate(results, start=1): + if completed % 1000 == 0 or completed == len(filepaths): + print( + f"Customized {completed}/{len(filepaths)} HTML files", + flush=True, + ) diff --git a/ci/download_from_s3.sh b/ci/download_from_s3.sh index 37144b54ab9..060b29e2373 100755 --- a/ci/download_from_s3.sh +++ b/ci/download_from_s3.sh @@ -1,5 +1,5 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 # @@ -15,6 +15,9 @@ deployment: ${JEKYLL_DIR}/deployment " export DOCS_BUCKET="rapidsai-docs" +MAX_CONCURRENT_DOWNLOADS=4 +DOWNLOAD_PIDS=() + # Checks that the "_site" directory exists from a Jekyll build. Also ensures # that the directories that are pulled from S3 aren't already present in the # "_site" directory since that could cause problems. @@ -64,6 +67,37 @@ aws_cp() { "${DST}" } +# Starts an S3 copy and waits when the concurrency limit is reached. +start_aws_cp() { + local DST SRC + + SRC=$1 + DST=$2 + + aws_cp "${SRC}" "${DST}" & + DOWNLOAD_PIDS+=("$!") + + if [ "${#DOWNLOAD_PIDS[@]}" -ge "${MAX_CONCURRENT_DOWNLOADS}" ]; then + wait_for_aws_cp + fi +} + +# Waits for the oldest running S3 copy and propagates its failure. +wait_for_aws_cp() { + local PID + + PID=${DOWNLOAD_PIDS[0]} + wait "${PID}" + DOWNLOAD_PIDS=("${DOWNLOAD_PIDS[@]:1}") +} + +# Waits for all remaining S3 copies. +wait_for_all_aws_cp() { + while [ "${#DOWNLOAD_PIDS[@]}" -gt 0 ]; do + wait_for_aws_cp + done +} + # Downloads the RAPIDS libraries' documentation files from S3 and places them # into the "_site/api" folder. download_lib_docs() { @@ -99,7 +133,7 @@ download_lib_docs() { # copy the relevant files from S3 to the local directory SRC="s3://${DOCS_BUCKET}/${PROJECT}/html/${VERSION_NUMBER}/" DST="$(yq -n 'env(GENERATED_DIRS)|.libs')/${PROJECT}/${VERSION_NUMBER}/" - aws_cp "${SRC}" "${DST}" + start_aws_cp "${SRC}" "${DST}" done # for VERSION_NAME done # for PROJECT @@ -115,10 +149,11 @@ download_deployment_docs() { SRC="s3://${DOCS_BUCKET}/deployment/html/${VERSION}/" DST="$(yq -n 'env(GENERATED_DIRS)|.deployment')/${VERSION}/" - aws_cp "${SRC}" "${DST}" + start_aws_cp "${SRC}" "${DST}" done } check_dirs download_lib_docs download_deployment_docs +wait_for_all_aws_cp