diff --git a/.github/scripts/extract_guest_metadata.py b/.github/scripts/extract_guest_metadata.py index ee308d2..9515b11 100644 --- a/.github/scripts/extract_guest_metadata.py +++ b/.github/scripts/extract_guest_metadata.py @@ -4,13 +4,13 @@ import json import os -import re import sys import urllib.request -from datetime import datetime import sentry_sdk +from osf_issue_parser import parse_issue, to_guest_promo_metadata + sentry_sdk.init( dsn=os.environ.get("SENTRY_DSN", ""), traces_sample_rate=0, @@ -20,12 +20,6 @@ GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "") REPO = "githubevents/open-source-friday" -HOST_NAMES = { - "AndreaGriffiths11": "Andrea Griffiths", - "KevinCrosby": "Kevin Crosby", - "marlenezw": "Marlene Mhangami", - "madebygps": "Gwyneth Peña-Siguenza", -} def fetch_issue(number: int) -> dict: @@ -39,24 +33,6 @@ def fetch_issue(number: int) -> dict: return json.loads(resp.read()) -def parse_field(body: str, field: str) -> str: - m = re.search(rf"### {re.escape(field)}\s*\n+(.+?)(?:\n###|\Z)", body, re.DOTALL) - if m: - val = m.group(1).strip() - if val.upper() not in ("_NO RESPONSE_", "TBD", "NOT YET", ""): - return val - return "" - - -def parse_date(raw: str) -> str: - for fmt in ("%m-%d-%Y", "%m-%d-%y", "%m/%d/%Y", "%B %d, %Y", "%B %-d, %Y"): - try: - return datetime.strptime(raw.strip(), fmt).strftime("%B %-d, %Y") - except ValueError: - continue - return raw - - def main() -> None: if len(sys.argv) < 2: print("Usage: extract_guest_metadata.py ") @@ -64,66 +40,16 @@ def main() -> None: number = int(sys.argv[1]) issue = fetch_issue(number) - body = issue.get("body") or "" - title = issue.get("title") or "" - - guest_name = parse_field(body, "Name") - github_handle = parse_field(body, "GitHub Handle").lstrip("@") - bio = parse_field(body, "Tell us about yourself") - project_name = parse_field(body, "Project Name") - project_url = parse_field(body, "Project Repo Link") - raw_date = parse_field(body, "Dates") - - # Calendly-style body: "Name: Angela Wen @handle" - if not guest_name: - m = re.search(r"Name:\s+(.+?)(?:\s*@\S+)?\s*$", body, re.MULTILINE) - if m: - guest_name = m.group(1).strip() - - # Title-based date fallback - if not raw_date or raw_date.upper() in ("TBD", "_NO RESPONSE_", "NOT YET", ""): - m = re.search(r"(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})", title) - if m: - raw_date = m.group(1) - - stream_date = parse_date(raw_date) if raw_date else "Date TBD" - - assignees = issue.get("assignees") or [] - # Every new issue auto-assigns all four hosts via the issue template. - # The actual host is decided in a team meeting and only known once the - # other assignees have been removed. Treat host as TBD until exactly one - # assignee remains — the video template hides the "with {host}" line when - # host_name == "TBD". - if len(assignees) == 1: - login = assignees[0]["login"] - host_name = HOST_NAMES.get(login, login) - else: - host_name = "TBD" - - # Truncate bio for video overlay - if bio and len(bio) > 280: - bio = bio[:277] + "..." - - metadata = { - "guest_name": guest_name or "Guest", - "github_handle": github_handle, - "project_name": project_name or "Open Source", - "project_url": project_url, - "bio": bio, - "stream_date": stream_date, - "stream_time": "1 PM ET", - "host_name": host_name, - "issue_number": number, - "issue_url": issue["html_url"], - "has_audio": False, # set to True by workflow after TTS succeeds - } + + parsed = parse_issue(issue) + metadata = to_guest_promo_metadata(parsed) output_path = os.environ.get("METADATA_OUTPUT", "video/public/guest-promo.json") os.makedirs(os.path.dirname(output_path), exist_ok=True) with open(output_path, "w") as f: json.dump(metadata, f, indent=2) - print(f"✅ Wrote metadata for '{guest_name}' → {output_path}") + print(f"✅ Wrote metadata for '{metadata['guest_name']}' → {output_path}") print(json.dumps(metadata, indent=2)) diff --git a/.github/scripts/osf_issue_parser.py b/.github/scripts/osf_issue_parser.py new file mode 100644 index 0000000..e23e8cc --- /dev/null +++ b/.github/scripts/osf_issue_parser.py @@ -0,0 +1,192 @@ +from __future__ import annotations + +import re +from dataclasses import dataclass +from datetime import datetime + +# Map GitHub logins → display names for the hosting team +HOST_NAMES = { + "AndreaGriffiths11": "Andrea Griffiths", + "KevinCrosby": "Kevin Crosby", + "marlenezw": "Marlene Mhangami", + "madebygps": "Gwyneth Peña-Siguenza", +} + +EMPTY_VALUES = {"TBD", "_NO RESPONSE_", "NOT YET", ""} + +_MONTHS = ( + "January|February|March|April|May|June|" + "July|August|September|October|November|December" +) + +@dataclass +class ParsedIssue: + guest_name: str + github_handle: str + bio: str + project_name: str + project_url: str + raw_date: str + date_obj: datetime | None + date_str: str + host_name: str # Single host for promo video (TBD if multiple assignees) + all_hosts: str # All assignees joined by commas for the schedule table + url: str + number: int + + +def parse_field(body: str, label: str) -> str: + """Extract the value under a ###